diff --git a/base/monitoring/dashboards-comms.yaml b/base/monitoring/dashboards-comms.yaml new file mode 100644 index 0000000..07c3c58 --- /dev/null +++ b/base/monitoring/dashboards-comms.yaml @@ -0,0 +1,140 @@ +# Grafana dashboard ConfigMaps — Communications +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-matrix + namespace: monitoring + labels: + grafana_dashboard: "1" + annotations: + grafana_folder: "Communications" +data: + matrix.json: | + { + "annotations": { "list": [] }, + "editable": true, + "panels": [ + { + "title": "Request Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(request_total{namespace=\"matrix\", direction=\"inbound\"}[5m])) by (deployment)", + "legendFormat": "{{deployment}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Success Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(response_total{namespace=\"matrix\", direction=\"inbound\", classification=\"success\"}[5m])) by (deployment) / sum(rate(response_total{namespace=\"matrix\", direction=\"inbound\"}[5m])) by (deployment)", + "legendFormat": "{{deployment}}" + } + ], + "fieldConfig": { "defaults": { "unit": "percentunit", "max": 1 } } + }, + { + "title": "Latency p95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{namespace=\"matrix\", direction=\"inbound\"}[5m])) by (le, deployment))", + "legendFormat": "{{deployment}} p95" + } + ], + "fieldConfig": { "defaults": { "unit": "ms" } } + }, + { + "title": "Error Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + 
{ + "expr": "sum(rate(response_total{namespace=\"matrix\", direction=\"inbound\", classification=\"failure\"}[5m])) by (deployment) / sum(rate(response_total{namespace=\"matrix\", direction=\"inbound\"}[5m])) by (deployment)", + "legendFormat": "{{deployment}}" + } + ], + "fieldConfig": { + "defaults": { "unit": "percentunit", "max": 1 } + } + } + ], + "schemaVersion": 39, + "tags": ["matrix", "tuwunel", "communications"], + "time": { "from": "now-1h", "to": "now" }, + "title": "Matrix / Tuwunel", + "uid": "matrix" + } +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-email + namespace: monitoring + labels: + grafana_dashboard: "1" + annotations: + grafana_folder: "Communications" +data: + email.json: | + { + "annotations": { "list": [] }, + "editable": true, + "panels": [ + { + "title": "Request Rate by Service", + "type": "timeseries", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(request_total{namespace=\"lasuite\", deployment=~\"postfix|messages-mta-in|messages-mta-out|messages-mpa|messages-worker\", direction=\"inbound\"}[5m])) by (deployment)", + "legendFormat": "{{deployment}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Error Rate by Service", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(response_total{namespace=\"lasuite\", deployment=~\"postfix|messages-mta-in|messages-mta-out|messages-mpa|messages-worker\", direction=\"inbound\", classification=\"failure\"}[5m])) by (deployment) / sum(rate(response_total{namespace=\"lasuite\", deployment=~\"postfix|messages-mta-in|messages-mta-out|messages-mpa|messages-worker\", direction=\"inbound\"}[5m])) by (deployment)", + "legendFormat": "{{deployment}}" + } + ], + "fieldConfig": { "defaults": { "unit": "percentunit", "max": 1 } } + }, + { + "title": "Latency p95 
by Service", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{namespace=\"lasuite\", deployment=~\"postfix|messages-mta-in|messages-mta-out|messages-mpa|messages-worker\", direction=\"inbound\"}[5m])) by (le, deployment))", + "legendFormat": "{{deployment}} p95" + } + ], + "fieldConfig": { "defaults": { "unit": "ms" } } + } + ], + "schemaVersion": 39, + "tags": ["email", "postfix", "communications"], + "time": { "from": "now-1h", "to": "now" }, + "title": "Email Pipeline", + "uid": "email-pipeline" + } diff --git a/base/monitoring/dashboards-devtools.yaml b/base/monitoring/dashboards-devtools.yaml new file mode 100644 index 0000000..d3cd022 --- /dev/null +++ b/base/monitoring/dashboards-devtools.yaml @@ -0,0 +1,89 @@ +# Grafana dashboard ConfigMaps — DevTools +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-gitea + namespace: monitoring + labels: + grafana_dashboard: "1" + annotations: + grafana_folder: "DevTools" +data: + gitea.json: | + { + "annotations": { "list": [] }, + "editable": true, + "panels": [ + { + "title": "Repositories", + "type": "stat", + "gridPos": { "h": 4, "w": 8, "x": 0, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_repositories", "legendFormat": "repos" } + ] + }, + { + "title": "Users", + "type": "stat", + "gridPos": { "h": 4, "w": 8, "x": 8, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_users", "legendFormat": "users" } + ] + }, + { + "title": "Issues", + "type": "stat", + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_issues", "legendFormat": "issues" } + ] + }, + { + "title": "Go Goroutines", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 4 }, + "datasource": { "uid": "prometheus" }, + 
"targets": [ + { + "expr": "go_goroutines{job=~\".*gitea.*\"}", + "legendFormat": "{{instance}}" + } + ] + }, + { + "title": "Memory Usage", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 4 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "process_resident_memory_bytes{job=~\".*gitea.*\"}", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "CPU Usage", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 4 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{job=~\".*gitea.*\"}[5m])", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { "defaults": { "unit": "short" } } + } + ], + "schemaVersion": 39, + "tags": ["gitea", "devtools"], + "time": { "from": "now-1h", "to": "now" }, + "title": "Gitea", + "uid": "gitea" + } diff --git a/base/monitoring/dashboards-identity.yaml b/base/monitoring/dashboards-identity.yaml new file mode 100644 index 0000000..2aa57ea --- /dev/null +++ b/base/monitoring/dashboards-identity.yaml @@ -0,0 +1,179 @@ +# Grafana dashboard ConfigMaps — Identity +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-hydra + namespace: monitoring + labels: + grafana_dashboard: "1" + annotations: + grafana_folder: "Identity" +data: + hydra.json: | + { + "annotations": { "list": [] }, + "editable": true, + "panels": [ + { + "title": "Request Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{job=~\".*hydra.*\"}[5m])) by (handler)", + "legendFormat": "{{handler}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Error Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": 
"sum(rate(http_requests_total{job=~\".*hydra.*\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=~\".*hydra.*\"}[5m]))", + "legendFormat": "5xx ratio" + } + ], + "fieldConfig": { + "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } } + } + }, + { + "title": "Request Latency p95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=~\".*hydra.*\"}[5m])) by (le, handler))", + "legendFormat": "{{handler}} p95" + } + ], + "fieldConfig": { "defaults": { "unit": "s" } } + }, + { + "title": "Go Goroutines", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "go_goroutines{job=~\".*hydra.*\"}", + "legendFormat": "{{instance}}" + } + ] + }, + { + "title": "Memory Usage", + "type": "timeseries", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "go_memstats_alloc_bytes{job=~\".*hydra.*\"}", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + } + ], + "schemaVersion": 39, + "tags": ["hydra", "oauth2", "identity"], + "time": { "from": "now-1h", "to": "now" }, + "title": "Hydra OAuth2", + "uid": "hydra" + } +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-kratos + namespace: monitoring + labels: + grafana_dashboard: "1" + annotations: + grafana_folder: "Identity" +data: + kratos.json: | + { + "annotations": { "list": [] }, + "editable": true, + "panels": [ + { + "title": "Request Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": 
"sum(rate(http_requests_total{job=~\".*kratos.*\"}[5m])) by (handler)", + "legendFormat": "{{handler}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Error Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{job=~\".*kratos.*\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=~\".*kratos.*\"}[5m]))", + "legendFormat": "5xx ratio" + } + ], + "fieldConfig": { + "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } } + } + }, + { + "title": "Request Latency p95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=~\".*kratos.*\"}[5m])) by (le, handler))", + "legendFormat": "{{handler}} p95" + } + ], + "fieldConfig": { "defaults": { "unit": "s" } } + }, + { + "title": "Go Goroutines", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "go_goroutines{job=~\".*kratos.*\"}", + "legendFormat": "{{instance}}" + } + ] + }, + { + "title": "Memory Usage", + "type": "timeseries", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "go_memstats_alloc_bytes{job=~\".*kratos.*\"}", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + } + ], + "schemaVersion": 39, + "tags": ["kratos", "identity"], + "time": { "from": "now-1h", "to": "now" }, + "title": "Kratos Identity", + "uid": "kratos" + } diff --git a/base/monitoring/dashboards-configmap.yaml b/base/monitoring/dashboards-infrastructure.yaml similarity index 52% rename from 
base/monitoring/dashboards-configmap.yaml rename to base/monitoring/dashboards-infrastructure.yaml index 977a677..7c8c8e6 100644 --- a/base/monitoring/dashboards-configmap.yaml +++ b/base/monitoring/dashboards-infrastructure.yaml @@ -1,249 +1,4 @@ -# Grafana dashboard ConfigMaps — picked up by the Grafana sidecar. -# -# Each ConfigMap holds one or more dashboard JSON files. The sidecar -# watches for the label grafana_dashboard=1 across all namespaces and -# hot-loads them into Grafana (no restart required). -# -# The grafana_folder annotation groups dashboards into Grafana folders. ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-dashboard-pingora - namespace: monitoring - labels: - grafana_dashboard: "1" - annotations: - grafana_folder: "Ingress" -data: - pingora.json: | - { - "annotations": { "list": [] }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 1, - "links": [], - "panels": [ - { - "title": "Requests / sec", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "sum(rate(pingora_http_requests_total[5m]))", - "legendFormat": "total" - }, - { - "expr": "sum(rate(pingora_http_requests_total[5m])) by (status_code)", - "legendFormat": "{{status_code}}" - } - ], - "fieldConfig": { - "defaults": { "unit": "reqps" } - } - }, - { - "title": "Error Rate (5xx)", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "sum(rate(pingora_http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(pingora_http_requests_total[5m]))", - "legendFormat": "5xx ratio" - } - ], - "fieldConfig": { - "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } } - } - }, - { - "title": "Request Latency (p50 / p95 / p99)", - "type": "timeseries", - "gridPos": { 
"h": 8, "w": 12, "x": 0, "y": 8 }, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "histogram_quantile(0.50, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))", - "legendFormat": "p50" - }, - { - "expr": "histogram_quantile(0.95, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))", - "legendFormat": "p95" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))", - "legendFormat": "p99" - } - ], - "fieldConfig": { - "defaults": { "unit": "s" } - } - }, - { - "title": "Active Connections", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "pingora_active_connections", - "legendFormat": "active" - } - ] - }, - { - "title": "Upstream Latency by Backend", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "histogram_quantile(0.95, sum(rate(pingora_upstream_duration_seconds_bucket[5m])) by (le, backend))", - "legendFormat": "{{backend}} p95" - } - ], - "fieldConfig": { - "defaults": { "unit": "s" } - } - }, - { - "title": "DDoS / Scanner Detections", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "sum(rate(pingora_ddos_detections_total[5m]))", - "legendFormat": "DDoS" - }, - { - "expr": "sum(rate(pingora_scanner_detections_total[5m]))", - "legendFormat": "Scanner" - }, - { - "expr": "sum(rate(pingora_rate_limit_rejected_total[5m]))", - "legendFormat": "Rate-limited" - } - ], - "fieldConfig": { - "defaults": { "unit": "reqps" } - } - } - ], - "schemaVersion": 39, - "tags": ["ingress", "pingora"], - "templating": { "list": [] }, - "time": { "from": "now-1h", "to": "now" }, - "title": "Pingora Proxy", - "uid": "pingora-proxy" - } ---- -apiVersion: v1 -kind: ConfigMap 
-metadata: - name: grafana-dashboard-loki - namespace: monitoring - labels: - grafana_dashboard: "1" - annotations: - grafana_folder: "Observability" -data: - loki-overview.json: | - { - "annotations": { "list": [] }, - "editable": true, - "panels": [ - { - "title": "Log Volume by Namespace", - "type": "timeseries", - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 }, - "datasource": { "uid": "loki" }, - "targets": [ - { - "expr": "sum(count_over_time({namespace=~\".+\"}[5m])) by (namespace)", - "legendFormat": "{{namespace}}" - } - ] - }, - { - "title": "Error Logs", - "type": "logs", - "gridPos": { "h": 12, "w": 24, "x": 0, "y": 8 }, - "datasource": { "uid": "loki" }, - "targets": [ - { - "expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|fatal|exception)\"", - "legendFormat": "" - } - ] - } - ], - "schemaVersion": 39, - "tags": ["loki", "logs"], - "time": { "from": "now-1h", "to": "now" }, - "title": "Loki — Log Overview", - "uid": "loki-overview" - } ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-dashboard-tempo - namespace: monitoring - labels: - grafana_dashboard: "1" - annotations: - grafana_folder: "Observability" -data: - tempo-overview.json: | - { - "annotations": { "list": [] }, - "editable": true, - "panels": [ - { - "title": "Trace Ingestion Rate", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "sum(rate(tempo_distributor_spans_received_total[5m]))", - "legendFormat": "spans/s" - } - ], - "fieldConfig": { "defaults": { "unit": "ops" } } - }, - { - "title": "Service Map (RED)", - "type": "nodeGraph", - "gridPos": { "h": 16, "w": 12, "x": 12, "y": 0 }, - "datasource": { "uid": "tempo" }, - "targets": [ - { "queryType": "serviceMap" } - ] - }, - { - "title": "Span Duration by Service (p95)", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": 
"histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket[5m])) by (le, service))", - "legendFormat": "{{service}}" - } - ], - "fieldConfig": { "defaults": { "unit": "s" } } - } - ], - "schemaVersion": 39, - "tags": ["tempo", "tracing"], - "time": { "from": "now-1h", "to": "now" }, - "title": "Tempo — Trace Overview", - "uid": "tempo-overview" - } +# Grafana dashboard ConfigMaps — Infrastructure --- apiVersion: v1 kind: ConfigMap @@ -308,3 +63,259 @@ data: "title": "OpenBao / Vault", "uid": "openbao" } +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-longhorn + namespace: monitoring + labels: + grafana_dashboard: "1" + annotations: + grafana_folder: "Infrastructure" +data: + longhorn.json: | + { + "annotations": { "list": [] }, + "editable": true, + "panels": [ + { + "title": "Volume Usage", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "longhorn_volume_actual_size_bytes", + "legendFormat": "{{volume}}" + } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "Volume Capacity", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(longhorn_volume_capacity_bytes)", + "legendFormat": "total" + } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "Disk Usage %", + "type": "gauge", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "(longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100", + "legendFormat": "{{node}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "max": 100, + "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":70},{"color":"red","value":85}] } + } + } + }, + { + "title": "Node Status", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 4 }, 
+ "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "longhorn_node_count_total", + "legendFormat": "nodes" + } + ] + }, + { + "title": "Volume State", + "type": "table", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "longhorn_volume_state", + "legendFormat": "{{volume}} — {{state}}", + "format": "table", + "instant": true + } + ] + } + ], + "schemaVersion": 39, + "tags": ["longhorn", "storage"], + "time": { "from": "now-1h", "to": "now" }, + "title": "Longhorn Storage", + "uid": "longhorn" + } +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-postgres + namespace: monitoring + labels: + grafana_dashboard: "1" + annotations: + grafana_folder: "Infrastructure" +data: + postgres.json: | + { + "annotations": { "list": [] }, + "editable": true, + "panels": [ + { + "title": "Database Size", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "cnpg_pg_database_size_bytes", + "legendFormat": "{{datname}}" + } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "Active Connections", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "cnpg_pg_stat_activity_count", + "legendFormat": "{{state}}" + } + ] + }, + { + "title": "Collector Status", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "cnpg_collector_up", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": { + "defaults": { "mappings": [{"type":"value","options":{"0":{"text":"DOWN","color":"red"},"1":{"text":"UP","color":"green"}}}] } + } + }, + { + "title": "Transactions/sec", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 6, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ 
+ { + "expr": "rate(cnpg_pg_stat_database_xact_commit[5m]) + rate(cnpg_pg_stat_database_xact_rollback[5m])", + "legendFormat": "{{datname}}" + } + ], + "fieldConfig": { "defaults": { "unit": "ops" } } + }, + { + "title": "Cache Hit Ratio", + "type": "gauge", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "cnpg_pg_stat_database_blks_hit / (cnpg_pg_stat_database_blks_hit + cnpg_pg_stat_database_blks_read)", + "legendFormat": "{{datname}}" + } + ], + "fieldConfig": { + "defaults": { "unit": "percentunit", "max": 1 } + } + } + ], + "schemaVersion": 39, + "tags": ["postgres", "cnpg"], + "time": { "from": "now-1h", "to": "now" }, + "title": "PostgreSQL / CNPG", + "uid": "postgres-cnpg" + } +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-certmanager + namespace: monitoring + labels: + grafana_dashboard: "1" + annotations: + grafana_folder: "Infrastructure" +data: + certmanager.json: | + { + "annotations": { "list": [] }, + "editable": true, + "panels": [ + { + "title": "Certificates Expiring", + "type": "table", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "certmanager_certificate_expiration_timestamp_seconds - time()", + "legendFormat": "{{name}} ({{namespace}})", + "format": "table", + "instant": true + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + }, + "transformations": [ + { "id": "sortBy", "options": { "fields": {}, "sort": [{ "field": "Value", "desc": false }] } } + ] + }, + { + "title": "Certificate Readiness", + "type": "stat", + "gridPos": { "h": 4, "w": 12, "x": 0, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "certmanager_certificate_ready_status{condition=\"True\"}", + "legendFormat": "{{name}}" + } + ], + "fieldConfig": { + "defaults": { "mappings": 
[{"type":"value","options":{"0":{"text":"NotReady","color":"red"},"1":{"text":"Ready","color":"green"}}}] } + } + }, + { + "title": "ACME Request Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "rate(certmanager_http_acme_client_request_count[5m])", + "legendFormat": "{{status}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + } + ], + "schemaVersion": 39, + "tags": ["cert-manager", "tls"], + "time": { "from": "now-1h", "to": "now" }, + "title": "Cert-Manager", + "uid": "cert-manager" + } diff --git a/base/monitoring/dashboards-ingress.yaml b/base/monitoring/dashboards-ingress.yaml new file mode 100644 index 0000000..20711ae --- /dev/null +++ b/base/monitoring/dashboards-ingress.yaml @@ -0,0 +1,135 @@ +# Grafana dashboard ConfigMaps — Ingress +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-pingora + namespace: monitoring + labels: + grafana_dashboard: "1" + annotations: + grafana_folder: "Ingress" +data: + pingora.json: | + { + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "title": "Requests / sec", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(pingora_http_requests_total[5m]))", + "legendFormat": "total" + }, + { + "expr": "sum(rate(pingora_http_requests_total[5m])) by (status_code)", + "legendFormat": "{{status_code}}" + } + ], + "fieldConfig": { + "defaults": { "unit": "reqps" } + } + }, + { + "title": "Error Rate (5xx)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(pingora_http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(pingora_http_requests_total[5m]))", + "legendFormat": "5xx ratio" + } + ], + 
"fieldConfig": { + "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } } + } + }, + { + "title": "Request Latency (p50 / p95 / p99)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p95" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "title": "Active Connections", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "pingora_active_connections", + "legendFormat": "active" + } + ] + }, + { + "title": "Upstream Latency by Backend", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(pingora_upstream_duration_seconds_bucket[5m])) by (le, backend))", + "legendFormat": "{{backend}} p95" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "title": "DDoS / Scanner Detections", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(pingora_ddos_detections_total[5m]))", + "legendFormat": "DDoS" + }, + { + "expr": "sum(rate(pingora_scanner_detections_total[5m]))", + "legendFormat": "Scanner" + }, + { + "expr": "sum(rate(pingora_rate_limit_rejected_total[5m]))", + "legendFormat": "Rate-limited" + } + ], + 
"fieldConfig": { + "defaults": { "unit": "reqps" } + } + } + ], + "schemaVersion": 39, + "tags": ["ingress", "pingora"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "title": "Pingora Proxy", + "uid": "pingora-proxy" + } diff --git a/base/monitoring/dashboards-lasuite.yaml b/base/monitoring/dashboards-lasuite.yaml new file mode 100644 index 0000000..6b9c108 --- /dev/null +++ b/base/monitoring/dashboards-lasuite.yaml @@ -0,0 +1,135 @@ +# Grafana dashboard ConfigMaps — La Suite (Golden Signals via Linkerd) +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-lasuite + namespace: monitoring + labels: + grafana_dashboard: "1" + annotations: + grafana_folder: "La Suite" +data: + lasuite.json: | + { + "annotations": { "list": [] }, + "editable": true, + "panels": [ + { + "title": "Request Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(request_total{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}[5m])) by (deployment)", + "legendFormat": "{{deployment}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Success Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(response_total{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\", classification=\"success\"}[5m])) by (deployment) / sum(rate(response_total{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}[5m])) by (deployment)", + "legendFormat": "{{deployment}}" + } + ], + "fieldConfig": { "defaults": { "unit": "percentunit", "max": 1 } } + }, + { + "title": "Error Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": 
"sum(rate(response_total{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\", classification=\"failure\"}[5m])) by (deployment) / sum(rate(response_total{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}[5m])) by (deployment)", + "legendFormat": "{{deployment}}" + } + ], + "fieldConfig": { + "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } } + } + }, + { + "title": "Latency p50 / p95 / p99", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(response_latency_ms_bucket{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}[5m])) by (le, deployment))", + "legendFormat": "{{deployment}} p50" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}[5m])) by (le, deployment))", + "legendFormat": "{{deployment}} p95" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(response_latency_ms_bucket{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}[5m])) by (le, deployment))", + "legendFormat": "{{deployment}} p99" + } + ], + "fieldConfig": { "defaults": { "unit": "ms" } } + }, + { + "title": "Request Rate by Status Code", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(response_total{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}[5m])) by (deployment, status_code)", + "legendFormat": "{{deployment}} {{status_code}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "TCP Connections", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 
},
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "tcp_open_connections{namespace=~\"$namespace\", deployment=~\"$deployment\", direction=\"inbound\"}",
+              "legendFormat": "{{deployment}}"
+            }
+          ]
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["lasuite", "linkerd", "golden-signals"],
+      "templating": {
+        "list": [
+          {
+            "name": "namespace",
+            "type": "query",
+            "datasource": { "uid": "prometheus" },
+            "query": "label_values(request_total{direction=\"inbound\"}, namespace)",
+            "refresh": 2,
+            "multi": true,
+            "includeAll": true,
+            "allValue": ".*"
+          },
+          {
+            "name": "deployment",
+            "type": "query",
+            "datasource": { "uid": "prometheus" },
+            "query": "label_values(request_total{direction=\"inbound\", namespace=~\"$namespace\"}, deployment)",
+            "refresh": 2,
+            "multi": true,
+            "includeAll": true,
+            "allValue": ".*"
+          }
+        ]
+      },
+      "time": { "from": "now-1h", "to": "now" },
+      "title": "La Suite — Golden Signals",
+      "uid": "lasuite-golden"
+    }
diff --git a/base/monitoring/dashboards-media.yaml b/base/monitoring/dashboards-media.yaml
new file mode 100644
index 0000000..684df13
--- /dev/null
+++ b/base/monitoring/dashboards-media.yaml
@@ -0,0 +1,90 @@
+# Grafana dashboard ConfigMaps — Media
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-livekit
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+  annotations:
+    grafana_folder: "Media"
+data:
+  livekit.json: |
+    {
+      "annotations": { "list": [] },
+      "editable": true,
+      "panels": [
+        {
+          "title": "Active Rooms",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 8, "x": 0, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            { "expr": "sum(livekit_room_count)", "legendFormat": "rooms" }
+          ]
+        },
+        {
+          "title": "Total Participants",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 8, "x": 8, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            { "expr": "sum(livekit_participant_count)", "legendFormat": "participants" }
+          ]
+        },
+        {
+          "title": "Total Tracks",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 8, "x": 16, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            { "expr": "sum(livekit_track_count)", "legendFormat": "tracks" }
+          ]
+        },
+        {
+          "title": "Packet Rate",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 8, "x": 0, "y": 4 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(livekit_packet_total[5m])) by (direction)",
+              "legendFormat": "{{direction}}"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "ops" } }
+        },
+        {
+          "title": "Bandwidth",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 8, "x": 8, "y": 4 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(livekit_bytes_total[5m])) by (direction)",
+              "legendFormat": "{{direction}}"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "Bps" } }
+        },
+        {
+          "title": "NACK Rate",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 8, "x": 16, "y": 4 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(livekit_nack_total[5m]))",
+              "legendFormat": "NACKs"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "ops" } }
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["livekit", "media", "webrtc"],
+      "time": { "from": "now-1h", "to": "now" },
+      "title": "LiveKit",
+      "uid": "livekit"
+    }
diff --git a/base/monitoring/dashboards-observability.yaml b/base/monitoring/dashboards-observability.yaml
new file mode 100644
index 0000000..d6193db
--- /dev/null
+++ b/base/monitoring/dashboards-observability.yaml
@@ -0,0 +1,106 @@
+# Grafana dashboard ConfigMaps — Observability
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-loki
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+  annotations:
+    grafana_folder: "Observability"
+data:
+  loki-overview.json: |
+    {
+      "annotations": { "list": [] },
+      "editable": true,
+      "panels": [
+        {
+          "title": "Log Volume by Namespace",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
+          "datasource": { "uid": "loki" },
+          "targets": [
+            {
+              "expr": "sum(count_over_time({namespace=~\".+\"}[5m])) by (namespace)",
+              "legendFormat": "{{namespace}}"
+            }
+          ]
+        },
+        {
+          "title": "Error Logs",
+          "type": "logs",
+          "gridPos": { "h": 12, "w": 24, "x": 0, "y": 8 },
+          "datasource": { "uid": "loki" },
+          "targets": [
+            {
+              "expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|fatal|exception)\"",
+              "legendFormat": ""
+            }
+          ]
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["loki", "logs"],
+      "time": { "from": "now-1h", "to": "now" },
+      "title": "Loki — Log Overview",
+      "uid": "loki-overview"
+    }
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-tempo
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+  annotations:
+    grafana_folder: "Observability"
+data:
+  tempo-overview.json: |
+    {
+      "annotations": { "list": [] },
+      "editable": true,
+      "panels": [
+        {
+          "title": "Trace Ingestion Rate",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(tempo_distributor_spans_received_total[5m]))",
+              "legendFormat": "spans/s"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "ops" } }
+        },
+        {
+          "title": "Service Map (RED)",
+          "type": "nodeGraph",
+          "gridPos": { "h": 16, "w": 12, "x": 12, "y": 0 },
+          "datasource": { "uid": "tempo" },
+          "targets": [
+            { "queryType": "serviceMap" }
+          ]
+        },
+        {
+          "title": "Span Duration by Service (p95)",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket[5m])) by (le, service))",
+              "legendFormat": "{{service}}"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "s" } }
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["tempo", "tracing"],
+      "time": { "from": "now-1h", "to": "now" },
+      "title": "Tempo — Trace Overview",
+      "uid": "tempo-overview"
+    }
diff --git
a/base/monitoring/dashboards-search.yaml b/base/monitoring/dashboards-search.yaml
new file mode 100644
index 0000000..9696963
--- /dev/null
+++ b/base/monitoring/dashboards-search.yaml
@@ -0,0 +1,108 @@
+# Grafana dashboard ConfigMaps — Search
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-opensearch
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+  annotations:
+    grafana_folder: "Search"
+data:
+  opensearch.json: |
+    {
+      "annotations": { "list": [] },
+      "editable": true,
+      "panels": [
+        {
+          "title": "Cluster Health",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 8, "x": 0, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            { "expr": "opensearch_cluster_health_status", "legendFormat": "health" }
+          ],
+          "fieldConfig": {
+            "defaults": { "mappings": [{"type":"value","options":{"0":{"text":"GREEN","color":"green"},"1":{"text":"YELLOW","color":"yellow"},"2":{"text":"RED","color":"red"}}}] }
+          }
+        },
+        {
+          "title": "Active Shards",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 8, "x": 8, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            { "expr": "opensearch_cluster_health_active_shards", "legendFormat": "shards" }
+          ]
+        },
+        {
+          "title": "Node Count",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 8, "x": 16, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            { "expr": "opensearch_cluster_health_number_of_nodes", "legendFormat": "nodes" }
+          ]
+        },
+        {
+          "title": "Index Size",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "opensearch_index_store_size_bytes",
+              "legendFormat": "{{index}}"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "bytes" } }
+        },
+        {
+          "title": "JVM Heap Usage",
+          "type": "gauge",
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "opensearch_jvm_mem_heap_used_bytes / opensearch_jvm_mem_heap_max_bytes",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.7},{"color":"red","value":0.85}] } }
+          }
+        },
+        {
+          "title": "Search Query Rate",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "rate(opensearch_indices_search_query_total[5m])",
+              "legendFormat": "{{node}}"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "ops" } }
+        },
+        {
+          "title": "GC Collection Time",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "rate(opensearch_jvm_gc_collection_time_seconds[5m])",
+              "legendFormat": "{{gc}}"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "s" } }
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["opensearch", "search"],
+      "time": { "from": "now-1h", "to": "now" },
+      "title": "OpenSearch",
+      "uid": "opensearch"
+    }
diff --git a/base/monitoring/dashboards-storage.yaml b/base/monitoring/dashboards-storage.yaml
new file mode 100644
index 0000000..90f9c53
--- /dev/null
+++ b/base/monitoring/dashboards-storage.yaml
@@ -0,0 +1,163 @@
+# Grafana dashboard ConfigMaps — Storage
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-seaweedfs
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+  annotations:
+    grafana_folder: "Storage"
+data:
+  seaweedfs.json: |
+    {
+      "annotations": { "list": [] },
+      "editable": true,
+      "panels": [
+        {
+          "title": "Cluster Overview",
+          "type": "row",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+          "collapsed": false
+        },
+        {
+          "title": "Data Nodes",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 8, "x": 0, "y": 1 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "seaweedfs_master_data_nodes or count(up{job=~\".*seaweedfs-volume.*\"})",
+              "legendFormat": "nodes"
+            }
+          ]
+        },
+        {
+          "title": "Total Volume Count",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 8, "x": 8, "y": 1 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "seaweedfs_master_volumes_count or sum(seaweedfs_volume_count)",
+              "legendFormat": "volumes"
+            }
+          ]
+        },
+        {
+          "title": "Total Disk Free",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 8, "x": 16, "y": 1 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(seaweedfs_disk_free_bytes)",
+              "legendFormat": "free"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "bytes" } }
+        },
+        {
+          "title": "Volume Server",
+          "type": "row",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
+          "collapsed": false
+        },
+        {
+          "title": "Read Rate",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 8, "x": 0, "y": 6 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(seaweedfs_volume_read_total[5m]))",
+              "legendFormat": "reads"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "ops" } }
+        },
+        {
+          "title": "Write Rate",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 8, "x": 8, "y": 6 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(seaweedfs_volume_write_total[5m]))",
+              "legendFormat": "writes"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "ops" } }
+        },
+        {
+          "title": "Disk Usage",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 8, "x": 16, "y": 6 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "seaweedfs_disk_used_bytes",
+              "legendFormat": "{{instance}}"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "bytes" } }
+        },
+        {
+          "title": "Filer",
+          "type": "row",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
+          "collapsed": false
+        },
+        {
+          "title": "Filer Request Rate",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(seaweedfs_filer_request_total[5m])) by (type)",
+              "legendFormat": "{{type}}"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "reqps" } }
+        },
+        {
+          "title": "Filer Latency p95",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "histogram_quantile(0.95, sum(rate(seaweedfs_filer_request_duration_seconds_bucket[5m])) by (le))",
+              "legendFormat": "p95"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "s" } }
+        },
+        {
+          "title": "S3 API",
+          "type": "row",
+          "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 },
+          "collapsed": false
+        },
+        {
+          "title": "S3 Requests",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 24 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(seaweedfs_s3_request_total[5m])) by (bucket, method)",
+              "legendFormat": "{{bucket}} {{method}}"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "reqps" } }
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["seaweedfs", "storage", "s3"],
+      "time": { "from": "now-1h", "to": "now" },
+      "title": "SeaweedFS",
+      "uid": "seaweedfs"
+    }
diff --git a/base/monitoring/kustomization.yaml b/base/monitoring/kustomization.yaml
index b1307f8..88b3803 100644
--- a/base/monitoring/kustomization.yaml
+++ b/base/monitoring/kustomization.yaml
@@ -7,7 +7,22 @@ resources:
   - namespace.yaml
   - vault-secrets.yaml
   - grafana-oauth2client.yaml
-  - dashboards-configmap.yaml
+  # Dashboards (one file per Grafana folder)
+  - dashboards-ingress.yaml
+  - dashboards-observability.yaml
+  - dashboards-infrastructure.yaml
+  - dashboards-storage.yaml
+  - dashboards-identity.yaml
+  - dashboards-devtools.yaml
+  - dashboards-search.yaml
+  - dashboards-media.yaml
+  - dashboards-lasuite.yaml
+  - dashboards-comms.yaml
+  # AlertManager → Matrix bridge
+  - matrix-alertmanager-receiver-deployment.yaml
+  - matrix-bot-secret.yaml
+  # Alert rules
+  - alertrules-infrastructure.yaml
 helmCharts:
   # helm repo
add prometheus-community https://prometheus-community.github.io/helm-charts