- Prometheus: discover ServiceMonitors/PodMonitors in all namespaces, enable remote write receiver for Tempo metrics generator
- Tempo: enable metrics generator (service-graphs + span-metrics) with remote write to Prometheus
- Loki: add Grafana Alloy DaemonSet to ship container logs
- Grafana: enable dashboard sidecar, add Pingora/Loki/Tempo/OpenBao dashboards, add stable UIDs and cross-linking between datasources (Loki↔Tempo derived fields, traces→logs, traces→metrics, service map)
- Linkerd: enable proxy tracing to Alloy OTLP collector, point linkerd-viz at existing Prometheus instead of deploying its own
- Pingora: add OTLP rollout plan (endpoint commented out until proxy telemetry panic fix is deployed and Alloy is verified healthy)
311 lines
9.6 KiB
YAML
311 lines
9.6 KiB
YAML
# Grafana dashboard ConfigMaps — picked up by the Grafana sidecar.
#
# Each ConfigMap holds one or more dashboard JSON files. The sidecar
# watches for the label grafana_dashboard=1 across all namespaces and
# hot-loads them into Grafana (no restart required).
#
# The grafana_folder annotation groups dashboards into Grafana folders.
---
# Pingora ingress proxy dashboard, placed in the "Ingress" Grafana folder.
#
# Panels: request rate (total and per status code), 5xx error ratio, request
# latency quantiles, active connections, per-backend upstream latency, and
# DDoS / scanner / rate-limit detection rates. Every panel queries the
# datasource with UID "prometheus".
#
# Query notes:
#   - Error Rate (5xx): the numerator falls back to vector(0) so the panel
#     reads 0 instead of "No data" while no 5xx series exists yet.
#   - Active Connections: the gauge is summed so that multiple series collapse
#     into the single "active" legend entry rather than several lines that all
#     carry the same name.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-pingora
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Ingress"
data:
  pingora.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 1,
      "links": [],
      "panels": [
        {
          "title": "Requests / sec",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(pingora_http_requests_total[5m]))",
              "legendFormat": "total"
            },
            {
              "expr": "sum(rate(pingora_http_requests_total[5m])) by (status_code)",
              "legendFormat": "{{status_code}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        },
        {
          "title": "Error Rate (5xx)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "(sum(rate(pingora_http_requests_total{status_code=~\"5..\"}[5m])) or vector(0)) / sum(rate(pingora_http_requests_total[5m]))",
              "legendFormat": "5xx ratio"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "percentunit",
              "min": 0,
              "max": 1,
              "thresholds": {
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 0.01 },
                  { "color": "red", "value": 0.05 }
                ]
              }
            }
          }
        },
        {
          "title": "Request Latency (p50 / p95 / p99)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.50, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p50"
            },
            {
              "expr": "histogram_quantile(0.95, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p95"
            },
            {
              "expr": "histogram_quantile(0.99, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p99"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        },
        {
          "title": "Active Connections",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(pingora_active_connections)",
              "legendFormat": "active"
            }
          ]
        },
        {
          "title": "Upstream Latency by Backend",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(pingora_upstream_duration_seconds_bucket[5m])) by (le, backend))",
              "legendFormat": "{{backend}} p95"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        },
        {
          "title": "DDoS / Scanner Detections",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(pingora_ddos_detections_total[5m]))",
              "legendFormat": "DDoS"
            },
            {
              "expr": "sum(rate(pingora_scanner_detections_total[5m]))",
              "legendFormat": "Scanner"
            },
            {
              "expr": "sum(rate(pingora_rate_limit_rejected_total[5m]))",
              "legendFormat": "Rate-limited"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        }
      ],
      "schemaVersion": 39,
      "tags": ["ingress", "pingora"],
      "templating": { "list": [] },
      "time": { "from": "now-1h", "to": "now" },
      "title": "Pingora Proxy",
      "uid": "pingora-proxy"
    }
---
# Loki log overview dashboard, placed in the "Observability" Grafana folder.
#
# Two full-width panels, both querying the datasource with UID "loki":
#   1. Log Volume by Namespace: 5-minute line counts grouped by namespace.
#   2. Error Logs: raw log lines matching error/panic/fatal/exception
#      (case-insensitive) across every namespace.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-loki
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Observability"
data:
  loki-overview.json: |
    {
      "uid": "loki-overview",
      "title": "Loki — Log Overview",
      "tags": [
        "loki",
        "logs"
      ],
      "schemaVersion": 39,
      "editable": true,
      "time": {
        "from": "now-1h",
        "to": "now"
      },
      "annotations": {
        "list": []
      },
      "panels": [
        {
          "title": "Log Volume by Namespace",
          "type": "timeseries",
          "datasource": {
            "uid": "loki"
          },
          "gridPos": {
            "x": 0,
            "y": 0,
            "w": 24,
            "h": 8
          },
          "targets": [
            {
              "expr": "sum(count_over_time({namespace=~\".+\"}[5m])) by (namespace)",
              "legendFormat": "{{namespace}}"
            }
          ]
        },
        {
          "title": "Error Logs",
          "type": "logs",
          "datasource": {
            "uid": "loki"
          },
          "gridPos": {
            "x": 0,
            "y": 8,
            "w": 24,
            "h": 12
          },
          "targets": [
            {
              "expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|fatal|exception)\"",
              "legendFormat": ""
            }
          ]
        }
      ]
    }
---
# Tempo trace overview dashboard, placed in the "Observability" Grafana folder.
#
# Panels:
#   - Trace Ingestion Rate: spans/s received by the distributor (datasource
#     UID "prometheus").
#   - Service Map (RED): node graph driven by a serviceMap query against the
#     datasource with UID "tempo".
#   - Span Duration by Service (p95): 95th percentile of the span-metrics
#     latency histogram, grouped by service (datasource UID "prometheus").
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-tempo
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Observability"
data:
  tempo-overview.json: |
    {
      "uid": "tempo-overview",
      "title": "Tempo — Trace Overview",
      "tags": [
        "tempo",
        "tracing"
      ],
      "schemaVersion": 39,
      "editable": true,
      "time": {
        "from": "now-1h",
        "to": "now"
      },
      "annotations": {
        "list": []
      },
      "panels": [
        {
          "title": "Trace Ingestion Rate",
          "type": "timeseries",
          "datasource": {
            "uid": "prometheus"
          },
          "gridPos": {
            "x": 0,
            "y": 0,
            "w": 12,
            "h": 8
          },
          "targets": [
            {
              "expr": "sum(rate(tempo_distributor_spans_received_total[5m]))",
              "legendFormat": "spans/s"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "ops"
            }
          }
        },
        {
          "title": "Service Map (RED)",
          "type": "nodeGraph",
          "datasource": {
            "uid": "tempo"
          },
          "gridPos": {
            "x": 12,
            "y": 0,
            "w": 12,
            "h": 16
          },
          "targets": [
            {
              "queryType": "serviceMap"
            }
          ]
        },
        {
          "title": "Span Duration by Service (p95)",
          "type": "timeseries",
          "datasource": {
            "uid": "prometheus"
          },
          "gridPos": {
            "x": 0,
            "y": 8,
            "w": 12,
            "h": 8
          },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket[5m])) by (le, service))",
              "legendFormat": "{{service}}"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "unit": "s"
            }
          }
        }
      ]
    }
---
# OpenBao / Vault dashboard, placed in the "Infrastructure" Grafana folder.
#
# Panels: seal status (0 -> SEALED, 1 -> UNSEALED), token count, request rate
# and request latency. Every panel queries the datasource with UID
# "prometheus".
#
# Latency panel: the server exports vault.core.handle_request as a Prometheus
# *summary* (quantile label 0.5 / 0.9 / 0.99, values in milliseconds), not a
# histogram, so there is no vault_core_handle_request_bucket series for
# histogram_quantile() to read. The panel therefore plots the pre-computed
# quantiles directly, with unit "ms". Summary quantiles cannot be meaningfully
# averaged across instances, so max() is used to show the worst instance.
# NOTE(review): confirm the quantile set against /v1/sys/metrics?format=prometheus
# on the deployed OpenBao version.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-openbao
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Infrastructure"
data:
  openbao.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Vault/OpenBao Sealed Status",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "vault_core_unsealed", "legendFormat": "unsealed" }
          ],
          "fieldConfig": {
            "defaults": {
              "mappings": [
                {
                  "type": "value",
                  "options": {
                    "0": { "text": "SEALED", "color": "red" },
                    "1": { "text": "UNSEALED", "color": "green" }
                  }
                }
              ]
            }
          }
        },
        {
          "title": "Token Count",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "vault_token_count", "legendFormat": "tokens" }
          ]
        },
        {
          "title": "Request Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "sum(rate(vault_core_handle_request_count[5m]))", "legendFormat": "req/s" }
          ],
          "fieldConfig": { "defaults": { "unit": "reqps" } }
        },
        {
          "title": "Request Latency (p50 / p90 / p99)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "max(vault_core_handle_request{quantile=\"0.5\"})", "legendFormat": "p50" },
            { "expr": "max(vault_core_handle_request{quantile=\"0.9\"})", "legendFormat": "p90" },
            { "expr": "max(vault_core_handle_request{quantile=\"0.99\"})", "legendFormat": "p99" }
          ],
          "fieldConfig": { "defaults": { "unit": "ms" } }
        }
      ],
      "schemaVersion": 39,
      "tags": ["vault", "openbao"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "OpenBao / Vault",
      "uid": "openbao"
    }