feat: split Grafana dashboards into per-folder ConfigMaps
Replace monolithic dashboards-configmap.yaml with 10 dedicated files, one per Grafana folder: Ingress, Observability, Infrastructure, Storage, Identity, DevTools, Search, Media, La Suite, Communications. New dashboards for Longhorn, PostgreSQL/CNPG, Cert-Manager, SeaweedFS, Hydra, Kratos, Gitea, OpenSearch, LiveKit, La Suite golden signals (Linkerd metrics), Matrix, and Email Pipeline.
This commit is contained in:
140
base/monitoring/dashboards-comms.yaml
Normal file
@@ -0,0 +1,140 @@
|
||||
# Grafana dashboard ConfigMaps — Communications
|
||||
---
# Grafana dashboard "Matrix / Tuwunel", filed under the "Communications" folder
# via the grafana_folder annotation. The grafana_dashboard: "1" label marks the
# ConfigMap as a dashboard source.
#
# Panels (all scoped to namespace="matrix", direction="inbound", grouped by
# deployment): request rate, success ratio, p95 latency (ms) and failure ratio.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-matrix
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Communications"
data:
  matrix.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Request Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(request_total{namespace=\"matrix\", direction=\"inbound\"}[5m])) by (deployment)",
              "legendFormat": "{{deployment}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "reqps" } }
        },
        {
          "title": "Success Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(response_total{namespace=\"matrix\", direction=\"inbound\", classification=\"success\"}[5m])) by (deployment) / sum(rate(response_total{namespace=\"matrix\", direction=\"inbound\"}[5m])) by (deployment)",
              "legendFormat": "{{deployment}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "percentunit", "max": 1 } }
        },
        {
          "title": "Latency p95",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{namespace=\"matrix\", direction=\"inbound\"}[5m])) by (le, deployment))",
              "legendFormat": "{{deployment}} p95"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "ms" } }
        },
        {
          "title": "Error Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(response_total{namespace=\"matrix\", direction=\"inbound\", classification=\"failure\"}[5m])) by (deployment) / sum(rate(response_total{namespace=\"matrix\", direction=\"inbound\"}[5m])) by (deployment)",
              "legendFormat": "{{deployment}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "percentunit", "max": 1 }
          }
        }
      ],
      "schemaVersion": 39,
      "tags": ["matrix", "tuwunel", "communications"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "Matrix / Tuwunel",
      "uid": "matrix"
    }
|
||||
---
# Grafana dashboard "Email Pipeline", filed under the "Communications" folder
# via the grafana_folder annotation. The grafana_dashboard: "1" label marks the
# ConfigMap as a dashboard source.
#
# Every panel filters namespace="lasuite", direction="inbound" and the
# deployments postfix, messages-mta-in, messages-mta-out, messages-mpa and
# messages-worker; it charts request rate, failure ratio and p95 latency (ms)
# per deployment.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-email
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Communications"
data:
  email.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Request Rate by Service",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(request_total{namespace=\"lasuite\", deployment=~\"postfix|messages-mta-in|messages-mta-out|messages-mpa|messages-worker\", direction=\"inbound\"}[5m])) by (deployment)",
              "legendFormat": "{{deployment}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "reqps" } }
        },
        {
          "title": "Error Rate by Service",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(response_total{namespace=\"lasuite\", deployment=~\"postfix|messages-mta-in|messages-mta-out|messages-mpa|messages-worker\", direction=\"inbound\", classification=\"failure\"}[5m])) by (deployment) / sum(rate(response_total{namespace=\"lasuite\", deployment=~\"postfix|messages-mta-in|messages-mta-out|messages-mpa|messages-worker\", direction=\"inbound\"}[5m])) by (deployment)",
              "legendFormat": "{{deployment}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "percentunit", "max": 1 } }
        },
        {
          "title": "Latency p95 by Service",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{namespace=\"lasuite\", deployment=~\"postfix|messages-mta-in|messages-mta-out|messages-mpa|messages-worker\", direction=\"inbound\"}[5m])) by (le, deployment))",
              "legendFormat": "{{deployment}} p95"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "ms" } }
        }
      ],
      "schemaVersion": 39,
      "tags": ["email", "postfix", "communications"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "Email Pipeline",
      "uid": "email-pipeline"
    }
|
||||
89
base/monitoring/dashboards-devtools.yaml
Normal file
@@ -0,0 +1,89 @@
|
||||
# Grafana dashboard ConfigMaps — DevTools
|
||||
---
# Grafana dashboard "Gitea", filed under the "DevTools" folder via the
# grafana_folder annotation. The grafana_dashboard: "1" label marks the
# ConfigMap as a dashboard source.
#
# Top row: stat panels for gitea_repositories, gitea_users and gitea_issues.
# Second row: goroutines, resident memory and CPU-seconds rate for every
# scrape job whose name matches ".*gitea.*", one series per instance.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-gitea
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "DevTools"
data:
  gitea.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Repositories",
          "type": "stat",
          "gridPos": { "h": 4, "w": 8, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "gitea_repositories", "legendFormat": "repos" }
          ]
        },
        {
          "title": "Users",
          "type": "stat",
          "gridPos": { "h": 4, "w": 8, "x": 8, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "gitea_users", "legendFormat": "users" }
          ]
        },
        {
          "title": "Issues",
          "type": "stat",
          "gridPos": { "h": 4, "w": 8, "x": 16, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "gitea_issues", "legendFormat": "issues" }
          ]
        },
        {
          "title": "Go Goroutines",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 8, "x": 0, "y": 4 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "go_goroutines{job=~\".*gitea.*\"}",
              "legendFormat": "{{instance}}"
            }
          ]
        },
        {
          "title": "Memory Usage",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 8, "x": 8, "y": 4 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "process_resident_memory_bytes{job=~\".*gitea.*\"}",
              "legendFormat": "{{instance}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "bytes" } }
        },
        {
          "title": "CPU Usage",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 8, "x": 16, "y": 4 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "rate(process_cpu_seconds_total{job=~\".*gitea.*\"}[5m])",
              "legendFormat": "{{instance}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "short" } }
        }
      ],
      "schemaVersion": 39,
      "tags": ["gitea", "devtools"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "Gitea",
      "uid": "gitea"
    }
|
||||
179
base/monitoring/dashboards-identity.yaml
Normal file
@@ -0,0 +1,179 @@
|
||||
# Grafana dashboard ConfigMaps — Identity
|
||||
---
# Grafana dashboard "Hydra OAuth2", filed under the "Identity" folder via the
# grafana_folder annotation. The grafana_dashboard: "1" label marks the
# ConfigMap as a dashboard source.
#
# All queries select scrape jobs matching ".*hydra.*". Panels: request rate
# by handler, 5xx ratio (yellow at 1%, red at 5%), p95 request latency by
# handler (seconds), goroutines and Go heap allocation per instance.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-hydra
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Identity"
data:
  hydra.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Request Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(http_requests_total{job=~\".*hydra.*\"}[5m])) by (handler)",
              "legendFormat": "{{handler}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "reqps" } }
        },
        {
          "title": "Error Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(http_requests_total{job=~\".*hydra.*\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=~\".*hydra.*\"}[5m]))",
              "legendFormat": "5xx ratio"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } }
          }
        },
        {
          "title": "Request Latency p95",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=~\".*hydra.*\"}[5m])) by (le, handler))",
              "legendFormat": "{{handler}} p95"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "s" } }
        },
        {
          "title": "Go Goroutines",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "go_goroutines{job=~\".*hydra.*\"}",
              "legendFormat": "{{instance}}"
            }
          ]
        },
        {
          "title": "Memory Usage",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "go_memstats_alloc_bytes{job=~\".*hydra.*\"}",
              "legendFormat": "{{instance}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "bytes" } }
        }
      ],
      "schemaVersion": 39,
      "tags": ["hydra", "oauth2", "identity"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "Hydra OAuth2",
      "uid": "hydra"
    }
|
||||
---
# Grafana dashboard "Kratos Identity", filed under the "Identity" folder via
# the grafana_folder annotation. The grafana_dashboard: "1" label marks the
# ConfigMap as a dashboard source.
#
# All queries select scrape jobs matching ".*kratos.*". Panels: request rate
# by handler, 5xx ratio (yellow at 1%, red at 5%), p95 request latency by
# handler (seconds), goroutines and Go heap allocation per instance.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-kratos
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Identity"
data:
  kratos.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Request Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(http_requests_total{job=~\".*kratos.*\"}[5m])) by (handler)",
              "legendFormat": "{{handler}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "reqps" } }
        },
        {
          "title": "Error Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(http_requests_total{job=~\".*kratos.*\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=~\".*kratos.*\"}[5m]))",
              "legendFormat": "5xx ratio"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } }
          }
        },
        {
          "title": "Request Latency p95",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=~\".*kratos.*\"}[5m])) by (le, handler))",
              "legendFormat": "{{handler}} p95"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "s" } }
        },
        {
          "title": "Go Goroutines",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "go_goroutines{job=~\".*kratos.*\"}",
              "legendFormat": "{{instance}}"
            }
          ]
        },
        {
          "title": "Memory Usage",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "go_memstats_alloc_bytes{job=~\".*kratos.*\"}",
              "legendFormat": "{{instance}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "bytes" } }
        }
      ],
      "schemaVersion": 39,
      "tags": ["kratos", "identity"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "Kratos Identity",
      "uid": "kratos"
    }
|
||||
@@ -1,249 +1,4 @@
|
||||
# Grafana dashboard ConfigMaps — picked up by the Grafana sidecar.
|
||||
#
|
||||
# Each ConfigMap holds one or more dashboard JSON files. The sidecar
|
||||
# watches for the label grafana_dashboard=1 across all namespaces and
|
||||
# hot-loads them into Grafana (no restart required).
|
||||
#
|
||||
# The grafana_folder annotation groups dashboards into Grafana folders.
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboard-pingora
|
||||
namespace: monitoring
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
annotations:
|
||||
grafana_folder: "Ingress"
|
||||
data:
|
||||
pingora.json: |
|
||||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"title": "Requests / sec",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
|
||||
"datasource": { "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(pingora_http_requests_total[5m]))",
|
||||
"legendFormat": "total"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(pingora_http_requests_total[5m])) by (status_code)",
|
||||
"legendFormat": "{{status_code}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "reqps" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Error Rate (5xx)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
|
||||
"datasource": { "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(pingora_http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(pingora_http_requests_total[5m]))",
|
||||
"legendFormat": "5xx ratio"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Request Latency (p50 / p95 / p99)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
|
||||
"datasource": { "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p50"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p95"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
|
||||
"legendFormat": "p99"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "s" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Active Connections",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
|
||||
"datasource": { "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "pingora_active_connections",
|
||||
"legendFormat": "active"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Upstream Latency by Backend",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
|
||||
"datasource": { "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(pingora_upstream_duration_seconds_bucket[5m])) by (le, backend))",
|
||||
"legendFormat": "{{backend}} p95"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "s" }
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "DDoS / Scanner Detections",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
|
||||
"datasource": { "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(pingora_ddos_detections_total[5m]))",
|
||||
"legendFormat": "DDoS"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(pingora_scanner_detections_total[5m]))",
|
||||
"legendFormat": "Scanner"
|
||||
},
|
||||
{
|
||||
"expr": "sum(rate(pingora_rate_limit_rejected_total[5m]))",
|
||||
"legendFormat": "Rate-limited"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "unit": "reqps" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["ingress", "pingora"],
|
||||
"templating": { "list": [] },
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"title": "Pingora Proxy",
|
||||
"uid": "pingora-proxy"
|
||||
}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboard-loki
|
||||
namespace: monitoring
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
annotations:
|
||||
grafana_folder: "Observability"
|
||||
data:
|
||||
loki-overview.json: |
|
||||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
"panels": [
|
||||
{
|
||||
"title": "Log Volume by Namespace",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
|
||||
"datasource": { "uid": "loki" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(count_over_time({namespace=~\".+\"}[5m])) by (namespace)",
|
||||
"legendFormat": "{{namespace}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Error Logs",
|
||||
"type": "logs",
|
||||
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 8 },
|
||||
"datasource": { "uid": "loki" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|fatal|exception)\"",
|
||||
"legendFormat": ""
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["loki", "logs"],
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"title": "Loki — Log Overview",
|
||||
"uid": "loki-overview"
|
||||
}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboard-tempo
|
||||
namespace: monitoring
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
annotations:
|
||||
grafana_folder: "Observability"
|
||||
data:
|
||||
tempo-overview.json: |
|
||||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
"panels": [
|
||||
{
|
||||
"title": "Trace Ingestion Rate",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
|
||||
"datasource": { "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(tempo_distributor_spans_received_total[5m]))",
|
||||
"legendFormat": "spans/s"
|
||||
}
|
||||
],
|
||||
"fieldConfig": { "defaults": { "unit": "ops" } }
|
||||
},
|
||||
{
|
||||
"title": "Service Map (RED)",
|
||||
"type": "nodeGraph",
|
||||
"gridPos": { "h": 16, "w": 12, "x": 12, "y": 0 },
|
||||
"datasource": { "uid": "tempo" },
|
||||
"targets": [
|
||||
{ "queryType": "serviceMap" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"title": "Span Duration by Service (p95)",
|
||||
"type": "timeseries",
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
|
||||
"datasource": { "uid": "prometheus" },
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket[5m])) by (le, service))",
|
||||
"legendFormat": "{{service}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": { "defaults": { "unit": "s" } }
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"tags": ["tempo", "tracing"],
|
||||
"time": { "from": "now-1h", "to": "now" },
|
||||
"title": "Tempo — Trace Overview",
|
||||
"uid": "tempo-overview"
|
||||
}
|
||||
# Grafana dashboard ConfigMaps — Infrastructure
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
@@ -308,3 +63,259 @@ data:
|
||||
"title": "OpenBao / Vault",
|
||||
"uid": "openbao"
|
||||
}
|
||||
---
# Grafana dashboard "Longhorn Storage", filed under the "Infrastructure"
# folder via the grafana_folder annotation. The grafana_dashboard: "1" label
# marks the ConfigMap as a dashboard source.
#
# Panels: per-volume actual size, summed volume capacity, disk usage as a
# percentage of disk capacity (yellow at 70, red at 85), node count, and an
# instant table of longhorn_volume_state.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-longhorn
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Infrastructure"
data:
  longhorn.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Volume Usage",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "longhorn_volume_actual_size_bytes",
              "legendFormat": "{{volume}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "bytes" } }
        },
        {
          "title": "Volume Capacity",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(longhorn_volume_capacity_bytes)",
              "legendFormat": "total"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "bytes" } }
        },
        {
          "title": "Disk Usage %",
          "type": "gauge",
          "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "(longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100",
              "legendFormat": "{{node}}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percent",
              "max": 100,
              "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":70},{"color":"red","value":85}] }
            }
          }
        },
        {
          "title": "Node Status",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 12, "y": 4 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "longhorn_node_count_total",
              "legendFormat": "nodes"
            }
          ]
        },
        {
          "title": "Volume State",
          "type": "table",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "longhorn_volume_state",
              "legendFormat": "{{volume}} — {{state}}",
              "format": "table",
              "instant": true
            }
          ]
        }
      ],
      "schemaVersion": 39,
      "tags": ["longhorn", "storage"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "Longhorn Storage",
      "uid": "longhorn"
    }
|
||||
---
# Grafana dashboard "PostgreSQL / CNPG", filed under the "Infrastructure"
# folder via the grafana_folder annotation. The grafana_dashboard: "1" label
# marks the ConfigMap as a dashboard source.
#
# Panels: database size per datname, activity count per state, collector
# status (0 is shown as DOWN, 1 as UP), commit + rollback rate per datname,
# and block cache hit ratio computed as blks_hit / (blks_hit + blks_read).
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-postgres
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Infrastructure"
data:
  postgres.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Database Size",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "cnpg_pg_database_size_bytes",
              "legendFormat": "{{datname}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "bytes" } }
        },
        {
          "title": "Active Connections",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "cnpg_pg_stat_activity_count",
              "legendFormat": "{{state}}"
            }
          ]
        },
        {
          "title": "Collector Status",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "cnpg_collector_up",
              "legendFormat": "{{instance}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "mappings": [{"type":"value","options":{"0":{"text":"DOWN","color":"red"},"1":{"text":"UP","color":"green"}}}] }
          }
        },
        {
          "title": "Transactions/sec",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 6, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "rate(cnpg_pg_stat_database_xact_commit[5m]) + rate(cnpg_pg_stat_database_xact_rollback[5m])",
              "legendFormat": "{{datname}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "ops" } }
        },
        {
          "title": "Cache Hit Ratio",
          "type": "gauge",
          "gridPos": { "h": 4, "w": 6, "x": 18, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "cnpg_pg_stat_database_blks_hit / (cnpg_pg_stat_database_blks_hit + cnpg_pg_stat_database_blks_read)",
              "legendFormat": "{{datname}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "percentunit", "max": 1 }
          }
        }
      ],
      "schemaVersion": 39,
      "tags": ["postgres", "cnpg"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "PostgreSQL / CNPG",
      "uid": "postgres-cnpg"
    }
|
||||
---
# Grafana dashboard "Cert-Manager", filed under the "Infrastructure" folder
# via the grafana_folder annotation. The grafana_dashboard: "1" label marks
# the ConfigMap as a dashboard source.
#
# Panels: a table of seconds remaining until each certificate expires
# (expiration timestamp minus time(), sorted ascending so the soonest expiry
# is first), ready status per certificate (0 is shown as NotReady, 1 as
# Ready), and ACME client request rate by status.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-certmanager
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Infrastructure"
data:
  certmanager.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Certificates Expiring",
          "type": "table",
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "certmanager_certificate_expiration_timestamp_seconds - time()",
              "legendFormat": "{{name}} ({{namespace}})",
              "format": "table",
              "instant": true
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          },
          "transformations": [
            { "id": "sortBy", "options": { "fields": {}, "sort": [{ "field": "Value", "desc": false }] } }
          ]
        },
        {
          "title": "Certificate Readiness",
          "type": "stat",
          "gridPos": { "h": 4, "w": 12, "x": 0, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "certmanager_certificate_ready_status{condition=\"True\"}",
              "legendFormat": "{{name}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "mappings": [{"type":"value","options":{"0":{"text":"NotReady","color":"red"},"1":{"text":"Ready","color":"green"}}}] }
          }
        },
        {
          "title": "ACME Request Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "rate(certmanager_http_acme_client_request_count[5m])",
              "legendFormat": "{{status}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "reqps" } }
        }
      ],
      "schemaVersion": 39,
      "tags": ["cert-manager", "tls"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "Cert-Manager",
      "uid": "cert-manager"
    }
|
||||
135
base/monitoring/dashboards-ingress.yaml
Normal file
@@ -0,0 +1,135 @@
|
||||
# Grafana dashboard ConfigMaps — Ingress
|
||||
---
# Grafana dashboard "Pingora Proxy", filed under the "Ingress" folder via the
# grafana_folder annotation. The grafana_dashboard: "1" label marks the
# ConfigMap as a dashboard source.
#
# Panels: request rate (total and by status_code), 5xx ratio (yellow at 1%,
# red at 5%), request latency p50/p95/p99 (seconds), active connections,
# upstream p95 latency by backend, and DDoS / scanner / rate-limit rejection
# rates.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-pingora
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Ingress"
data:
  pingora.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 1,
      "links": [],
      "panels": [
        {
          "title": "Requests / sec",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(pingora_http_requests_total[5m]))",
              "legendFormat": "total"
            },
            {
              "expr": "sum(rate(pingora_http_requests_total[5m])) by (status_code)",
              "legendFormat": "{{status_code}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        },
        {
          "title": "Error Rate (5xx)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(pingora_http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(pingora_http_requests_total[5m]))",
              "legendFormat": "5xx ratio"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } }
          }
        },
        {
          "title": "Request Latency (p50 / p95 / p99)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.50, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p50"
            },
            {
              "expr": "histogram_quantile(0.95, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p95"
            },
            {
              "expr": "histogram_quantile(0.99, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p99"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        },
        {
          "title": "Active Connections",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "pingora_active_connections",
              "legendFormat": "active"
            }
          ]
        },
        {
          "title": "Upstream Latency by Backend",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(pingora_upstream_duration_seconds_bucket[5m])) by (le, backend))",
              "legendFormat": "{{backend}} p95"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        },
        {
          "title": "DDoS / Scanner Detections",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(pingora_ddos_detections_total[5m]))",
              "legendFormat": "DDoS"
            },
            {
              "expr": "sum(rate(pingora_scanner_detections_total[5m]))",
              "legendFormat": "Scanner"
            },
            {
              "expr": "sum(rate(pingora_rate_limit_rejected_total[5m]))",
              "legendFormat": "Rate-limited"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        }
      ],
      "schemaVersion": 39,
      "tags": ["ingress", "pingora"],
      "templating": { "list": [] },
      "time": { "from": "now-1h", "to": "now" },
      "title": "Pingora Proxy",
      "uid": "pingora-proxy"
    }
|
||||
135
base/monitoring/dashboards-lasuite.yaml
Normal file
@@ -0,0 +1,135 @@
|
||||
# Grafana dashboard ConfigMaps — La Suite (Golden Signals via Linkerd)
#
# A single dashboard built from four metrics, always filtered on
# direction="inbound": request_total, response_total,
# response_latency_ms_bucket and tcp_open_connections.
#
# The $namespace and $deployment template variables are filled from the label
# values of request_total, so only workloads reporting that metric show up in
# the pickers. Both are multi-select with an "All" option that expands to the
# regex ".*", which is why every query matches them with =~.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-lasuite
  namespace: monitoring
  labels:
    # NOTE(review): presumably the label the Grafana dashboard sidecar selects
    # on — confirm against the Grafana Helm values.
    grafana_dashboard: "1"
  annotations:
    # NOTE(review): presumably read by the sidecar to choose the target
    # Grafana folder — confirm.
    grafana_folder: "La Suite"
data:
  # Every panel aggregates by deployment so that each legend entry
  # ({{deployment}}) maps to exactly one series. The TCP Connections panel
  # is summed by deployment for the same reason: a raw selector would draw
  # one line per underlying label set, all sharing the same legend text.
  lasuite.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Request Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(request_total{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}[5m])) by (deployment)",
              "legendFormat": "{{deployment}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "reqps" } }
        },
        {
          "title": "Success Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(response_total{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\", classification=\"success\"}[5m])) by (deployment) / sum(rate(response_total{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}[5m])) by (deployment)",
              "legendFormat": "{{deployment}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "percentunit", "max": 1 } }
        },
        {
          "title": "Error Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(response_total{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\", classification=\"failure\"}[5m])) by (deployment) / sum(rate(response_total{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}[5m])) by (deployment)",
              "legendFormat": "{{deployment}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } }
          }
        },
        {
          "title": "Latency p50 / p95 / p99",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.50, sum(rate(response_latency_ms_bucket{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}[5m])) by (le, deployment))",
              "legendFormat": "{{deployment}} p50"
            },
            {
              "expr": "histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}[5m])) by (le, deployment))",
              "legendFormat": "{{deployment}} p95"
            },
            {
              "expr": "histogram_quantile(0.99, sum(rate(response_latency_ms_bucket{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}[5m])) by (le, deployment))",
              "legendFormat": "{{deployment}} p99"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "ms" } }
        },
        {
          "title": "Request Rate by Status Code",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(response_total{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}[5m])) by (deployment, status_code)",
              "legendFormat": "{{deployment}} {{status_code}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "reqps" } }
        },
        {
          "title": "TCP Connections",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(tcp_open_connections{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}) by (deployment)",
              "legendFormat": "{{deployment}}"
            }
          ]
        }
      ],
      "schemaVersion": 39,
      "tags": ["lasuite", "linkerd", "golden-signals"],
      "templating": {
        "list": [
          {
            "name": "namespace",
            "type": "query",
            "datasource": { "uid": "prometheus" },
            "query": "label_values(request_total{direction=\"inbound\"}, namespace)",
            "refresh": 2,
            "multi": true,
            "includeAll": true,
            "allValue": ".*"
          },
          {
            "name": "deployment",
            "type": "query",
            "datasource": { "uid": "prometheus" },
            "query": "label_values(request_total{direction=\"inbound\", namespace=~\"$namespace\"}, deployment)",
            "refresh": 2,
            "multi": true,
            "includeAll": true,
            "allValue": ".*"
          }
        ]
      },
      "time": { "from": "now-1h", "to": "now" },
      "title": "La Suite — Golden Signals",
      "uid": "lasuite-golden"
    }
|
||||
90
base/monitoring/dashboards-media.yaml
Normal file
90
base/monitoring/dashboards-media.yaml
Normal file
@@ -0,0 +1,90 @@
|
||||
# Grafana dashboard ConfigMaps — Media
#
# LiveKit overview: three stat tiles (rooms, participants, tracks) above
# three rate graphs (packets, bytes, NACKs) computed over a 5m window.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-livekit
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Media"
data:
  # The stat tiles wrap their gauges in sum() so each tile shows one total
  # even when more than one instance is scraped; a bare gauge selector would
  # render one stat per series, all carrying the same static legend.
  #
  # NOTE(review): the metric names below are kept exactly as committed;
  # confirm they match what the deployed LiveKit version actually exports.
  livekit.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Active Rooms",
          "type": "stat",
          "gridPos": { "h": 4, "w": 8, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "sum(livekit_room_count)", "legendFormat": "rooms" }
          ]
        },
        {
          "title": "Total Participants",
          "type": "stat",
          "gridPos": { "h": 4, "w": 8, "x": 8, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "sum(livekit_participant_count)", "legendFormat": "participants" }
          ]
        },
        {
          "title": "Total Tracks",
          "type": "stat",
          "gridPos": { "h": 4, "w": 8, "x": 16, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "sum(livekit_track_count)", "legendFormat": "tracks" }
          ]
        },
        {
          "title": "Packet Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 8, "x": 0, "y": 4 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(livekit_packet_total[5m])) by (direction)",
              "legendFormat": "{{direction}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "ops" } }
        },
        {
          "title": "Bandwidth",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 8, "x": 8, "y": 4 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(livekit_bytes_total[5m])) by (direction)",
              "legendFormat": "{{direction}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "Bps" } }
        },
        {
          "title": "NACK Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 8, "x": 16, "y": 4 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(livekit_nack_total[5m]))",
              "legendFormat": "NACKs"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "ops" } }
        }
      ],
      "schemaVersion": 39,
      "tags": ["livekit", "media", "webrtc"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "LiveKit",
      "uid": "livekit"
    }
|
||||
106
base/monitoring/dashboards-observability.yaml
Normal file
106
base/monitoring/dashboards-observability.yaml
Normal file
@@ -0,0 +1,106 @@
|
||||
# Grafana dashboard ConfigMaps — Observability
#
# Loki log overview. Both panels query the datasource with uid "loki" and
# select every stream whose namespace label is non-empty (namespace=~".+"):
#   - Log Volume by Namespace: line counts per 5m window, grouped by namespace.
#   - Error Logs: raw lines matching error|panic|fatal|exception,
#     case-insensitively.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-loki
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Observability"
data:
  loki-overview.json: |
    {
      "title": "Loki — Log Overview",
      "uid": "loki-overview",
      "tags": ["loki", "logs"],
      "schemaVersion": 39,
      "editable": true,
      "time": {
        "from": "now-1h",
        "to": "now"
      },
      "annotations": {
        "list": []
      },
      "panels": [
        {
          "type": "timeseries",
          "title": "Log Volume by Namespace",
          "datasource": {
            "uid": "loki"
          },
          "gridPos": {
            "x": 0,
            "y": 0,
            "w": 24,
            "h": 8
          },
          "targets": [
            {
              "legendFormat": "{{namespace}}",
              "expr": "sum(count_over_time({namespace=~\".+\"}[5m])) by (namespace)"
            }
          ]
        },
        {
          "type": "logs",
          "title": "Error Logs",
          "datasource": {
            "uid": "loki"
          },
          "gridPos": {
            "x": 0,
            "y": 8,
            "w": 24,
            "h": 12
          },
          "targets": [
            {
              "legendFormat": "",
              "expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|fatal|exception)\""
            }
          ]
        }
      ]
    }
|
||||
---
# Tempo trace overview. Two Prometheus-backed graphs (span ingestion rate and
# p95 span latency per service) sit in the left column; a node graph fed by
# the "tempo" datasource's serviceMap query fills the right column.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-tempo
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Observability"
data:
  tempo-overview.json: |
    {
      "title": "Tempo — Trace Overview",
      "uid": "tempo-overview",
      "tags": ["tempo", "tracing"],
      "schemaVersion": 39,
      "editable": true,
      "time": {
        "from": "now-1h",
        "to": "now"
      },
      "annotations": {
        "list": []
      },
      "panels": [
        {
          "type": "timeseries",
          "title": "Trace Ingestion Rate",
          "datasource": {
            "uid": "prometheus"
          },
          "gridPos": {
            "x": 0,
            "y": 0,
            "w": 12,
            "h": 8
          },
          "fieldConfig": {
            "defaults": {
              "unit": "ops"
            }
          },
          "targets": [
            {
              "legendFormat": "spans/s",
              "expr": "sum(rate(tempo_distributor_spans_received_total[5m]))"
            }
          ]
        },
        {
          "type": "nodeGraph",
          "title": "Service Map (RED)",
          "datasource": {
            "uid": "tempo"
          },
          "gridPos": {
            "x": 12,
            "y": 0,
            "w": 12,
            "h": 16
          },
          "targets": [
            {
              "queryType": "serviceMap"
            }
          ]
        },
        {
          "type": "timeseries",
          "title": "Span Duration by Service (p95)",
          "datasource": {
            "uid": "prometheus"
          },
          "gridPos": {
            "x": 0,
            "y": 8,
            "w": 12,
            "h": 8
          },
          "fieldConfig": {
            "defaults": {
              "unit": "s"
            }
          },
          "targets": [
            {
              "legendFormat": "{{service}}",
              "expr": "histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket[5m])) by (le, service))"
            }
          ]
        }
      ]
    }
|
||||
108
base/monitoring/dashboards-search.yaml
Normal file
108
base/monitoring/dashboards-search.yaml
Normal file
@@ -0,0 +1,108 @@
|
||||
# Grafana dashboard ConfigMaps — Search
#
# OpenSearch overview: three stat tiles (cluster health, active shards, node
# count), then index size, JVM heap usage, search query rate and GC time.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-opensearch
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Search"
data:
  # Cluster Health maps the numeric status to text: 0 → GREEN, 1 → YELLOW,
  # 2 → RED.
  #
  # JVM Heap Usage is a used/max ratio, so the gauge pins both ends of its
  # scale ("min": 0, "max": 1). Without an explicit min, Grafana derives the
  # lower bound from the data and the fill no longer represents 0–100 %.
  opensearch.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Cluster Health",
          "type": "stat",
          "gridPos": { "h": 4, "w": 8, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "opensearch_cluster_health_status", "legendFormat": "health" }
          ],
          "fieldConfig": {
            "defaults": { "mappings": [{"type":"value","options":{"0":{"text":"GREEN","color":"green"},"1":{"text":"YELLOW","color":"yellow"},"2":{"text":"RED","color":"red"}}}] }
          }
        },
        {
          "title": "Active Shards",
          "type": "stat",
          "gridPos": { "h": 4, "w": 8, "x": 8, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "opensearch_cluster_health_active_shards", "legendFormat": "shards" }
          ]
        },
        {
          "title": "Node Count",
          "type": "stat",
          "gridPos": { "h": 4, "w": 8, "x": 16, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "opensearch_cluster_health_number_of_nodes", "legendFormat": "nodes" }
          ]
        },
        {
          "title": "Index Size",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "opensearch_index_store_size_bytes",
              "legendFormat": "{{index}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "bytes" } }
        },
        {
          "title": "JVM Heap Usage",
          "type": "gauge",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "opensearch_jvm_mem_heap_used_bytes / opensearch_jvm_mem_heap_max_bytes",
              "legendFormat": "{{node}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.7},{"color":"red","value":0.85}] } }
          }
        },
        {
          "title": "Search Query Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "rate(opensearch_indices_search_query_total[5m])",
              "legendFormat": "{{node}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "ops" } }
        },
        {
          "title": "GC Collection Time",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "rate(opensearch_jvm_gc_collection_time_seconds[5m])",
              "legendFormat": "{{gc}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "s" } }
        }
      ],
      "schemaVersion": 39,
      "tags": ["opensearch", "search"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "OpenSearch",
      "uid": "opensearch"
    }
|
||||
163
base/monitoring/dashboards-storage.yaml
Normal file
163
base/monitoring/dashboards-storage.yaml
Normal file
@@ -0,0 +1,163 @@
|
||||
# Grafana dashboard ConfigMaps — Storage
#
# SeaweedFS dashboard laid out in four rows: Cluster Overview, Volume Server,
# Filer and S3 API.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-seaweedfs
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Storage"
data:
  # "Data Nodes" and "Total Volume Count" use `A or B`: the master-reported
  # metric is shown when it exists, otherwise the right-hand fallback.
  #
  # The Data Nodes fallback is sum(up{...}) rather than count(up{...}).
  # `up` is 1 for a healthy scrape target and 0 for a failed one, so count()
  # would still include volume servers that are down, while sum() reports
  # only the reachable ones (and 0 when none are).
  #
  # NOTE(review): Prometheus metric names are case-sensitive. Confirm that
  # the lowercase seaweedfs_* names below match what the deployed SeaweedFS
  # components export (upstream is believed to use a "SeaweedFS_" prefix).
  seaweedfs.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Cluster Overview",
          "type": "row",
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
          "collapsed": false
        },
        {
          "title": "Data Nodes",
          "type": "stat",
          "gridPos": { "h": 4, "w": 8, "x": 0, "y": 1 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "seaweedfs_master_data_nodes or sum(up{job=~\".*seaweedfs-volume.*\"})",
              "legendFormat": "nodes"
            }
          ]
        },
        {
          "title": "Total Volume Count",
          "type": "stat",
          "gridPos": { "h": 4, "w": 8, "x": 8, "y": 1 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "seaweedfs_master_volumes_count or sum(seaweedfs_volume_count)",
              "legendFormat": "volumes"
            }
          ]
        },
        {
          "title": "Total Disk Free",
          "type": "stat",
          "gridPos": { "h": 4, "w": 8, "x": 16, "y": 1 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(seaweedfs_disk_free_bytes)",
              "legendFormat": "free"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "bytes" } }
        },
        {
          "title": "Volume Server",
          "type": "row",
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
          "collapsed": false
        },
        {
          "title": "Read Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 8, "x": 0, "y": 6 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(seaweedfs_volume_read_total[5m]))",
              "legendFormat": "reads"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "ops" } }
        },
        {
          "title": "Write Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 8, "x": 8, "y": 6 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(seaweedfs_volume_write_total[5m]))",
              "legendFormat": "writes"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "ops" } }
        },
        {
          "title": "Disk Usage",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 8, "x": 16, "y": 6 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "seaweedfs_disk_used_bytes",
              "legendFormat": "{{instance}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "bytes" } }
        },
        {
          "title": "Filer",
          "type": "row",
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
          "collapsed": false
        },
        {
          "title": "Filer Request Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(seaweedfs_filer_request_total[5m])) by (type)",
              "legendFormat": "{{type}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "reqps" } }
        },
        {
          "title": "Filer Latency p95",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(seaweedfs_filer_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p95"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "s" } }
        },
        {
          "title": "S3 API",
          "type": "row",
          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 },
          "collapsed": false
        },
        {
          "title": "S3 Requests",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 24 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(seaweedfs_s3_request_total[5m])) by (bucket, method)",
              "legendFormat": "{{bucket}} {{method}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "reqps" } }
        }
      ],
      "schemaVersion": 39,
      "tags": ["seaweedfs", "storage", "s3"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "SeaweedFS",
      "uid": "seaweedfs"
    }
|
||||
@@ -7,7 +7,22 @@ resources:
|
||||
- namespace.yaml
|
||||
- vault-secrets.yaml
|
||||
- grafana-oauth2client.yaml
|
||||
- dashboards-configmap.yaml
|
||||
# Dashboards (one ConfigMap per Grafana folder)
|
||||
- dashboards-ingress.yaml
|
||||
- dashboards-observability.yaml
|
||||
- dashboards-infrastructure.yaml
|
||||
- dashboards-storage.yaml
|
||||
- dashboards-identity.yaml
|
||||
- dashboards-devtools.yaml
|
||||
- dashboards-search.yaml
|
||||
- dashboards-media.yaml
|
||||
- dashboards-lasuite.yaml
|
||||
- dashboards-comms.yaml
|
||||
# AlertManager → Matrix bridge
|
||||
- matrix-alertmanager-receiver-deployment.yaml
|
||||
- matrix-bot-secret.yaml
|
||||
# Alert rules
|
||||
- alertrules-infrastructure.yaml
|
||||
|
||||
helmCharts:
|
||||
# helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
|
||||
|
||||
Reference in New Issue
Block a user