From d3943c9a84ac9316f997de2cd2a96883a46fc247 Mon Sep 17 00:00:00 2001
From: Sienna Meridian Satterwhite
Date: Sat, 21 Mar 2026 17:36:54 +0000
Subject: [PATCH] feat(monitoring): wire up full LGTM observability stack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Prometheus: discover ServiceMonitors/PodMonitors in all namespaces,
  enable remote write receiver for Tempo metrics generator
- Tempo: enable metrics generator (service-graphs + span-metrics) with
  remote write to Prometheus
- Loki: add Grafana Alloy DaemonSet to ship container logs
- Grafana: enable dashboard sidecar, add Pingora/Loki/Tempo/OpenBao
  dashboards, add stable UIDs and cross-linking between datasources
  (Loki↔Tempo derived fields, traces→logs, traces→metrics, service map)
- Linkerd: enable proxy tracing to Alloy OTLP collector, point
  linkerd-viz at existing Prometheus instead of deploying its own
- Pingora: add OTLP rollout plan (endpoint commented out until proxy
  telemetry panic fix is deployed and Alloy is verified healthy)
---
 base/ingress/pingora-config.yaml            |   7 +
 base/mesh/kustomization.yaml                |   2 +
 base/mesh/linkerd-control-plane-values.yaml |  19 ++
 base/mesh/linkerd-viz-values.yaml           |   9 +
 base/monitoring/alloy-values.yaml           | 108 +++++++
 base/monitoring/dashboards-configmap.yaml   | 310 ++++++++++++++++++++
 base/monitoring/kustomization.yaml          |  10 +
 base/monitoring/prometheus-values.yaml      |  46 +++
 base/monitoring/tempo-values.yaml           |  12 +
 9 files changed, 523 insertions(+)
 create mode 100644 base/mesh/linkerd-control-plane-values.yaml
 create mode 100644 base/mesh/linkerd-viz-values.yaml
 create mode 100644 base/monitoring/alloy-values.yaml
 create mode 100644 base/monitoring/dashboards-configmap.yaml

diff --git a/base/ingress/pingora-config.yaml b/base/ingress/pingora-config.yaml
index 544f099..8f0b441 100644
--- a/base/ingress/pingora-config.yaml
+++ b/base/ingress/pingora-config.yaml
@@ -21,6 +21,13 @@ data:
     key_path = "/etc/tls/tls.key"
 
     [telemetry]
+    # Rollout plan for OTLP tracing:
+    # 1. Deploy proxy build that includes the graceful telemetry init
+    #    (proxy/src/telemetry.rs — no longer panics on exporter failure)
+    # 2. Verify Alloy is running:
+    #    kubectl -n monitoring get pods -l app.kubernetes.io/name=alloy
+    # 3. Uncomment the line below:
+    # otlp_endpoint = "http://alloy.monitoring.svc.cluster.local:4318"
     otlp_endpoint = ""
     metrics_port = 9090
 
diff --git a/base/mesh/kustomization.yaml b/base/mesh/kustomization.yaml
index dada26a..13ab90c 100644
--- a/base/mesh/kustomization.yaml
+++ b/base/mesh/kustomization.yaml
@@ -29,9 +29,11 @@ helmCharts:
     version: "2025.12.3"
     releaseName: linkerd-control-plane
     namespace: mesh
+    valuesFile: linkerd-control-plane-values.yaml
 
   - name: linkerd-viz
     repo: https://helm.linkerd.io/edge
     version: "2026.1.4"
     releaseName: linkerd-viz
     namespace: mesh
+    valuesFile: linkerd-viz-values.yaml
diff --git a/base/mesh/linkerd-control-plane-values.yaml b/base/mesh/linkerd-control-plane-values.yaml
new file mode 100644
index 0000000..5d100fe
--- /dev/null
+++ b/base/mesh/linkerd-control-plane-values.yaml
@@ -0,0 +1,19 @@
+# Linkerd control-plane overrides — enable proxy tracing to Tempo.
+#
+# Every meshed pod's Linkerd sidecar will export OTLP traces to the
+# Alloy collector in the monitoring namespace, which forwards to Tempo.
+
+# Controller-level tracing (identity, destination controllers)
+controller:
+  tracing:
+    enabled: true
+    collector:
+      endpoint: "alloy.monitoring.svc.cluster.local:4317"
+
+# Proxy-level tracing (every meshed sidecar)
+proxy:
+  tracing:
+    enabled: true
+    traceServiceName: linkerd-proxy
+    collector:
+      endpoint: "alloy.monitoring.svc.cluster.local:4317"
diff --git a/base/mesh/linkerd-viz-values.yaml b/base/mesh/linkerd-viz-values.yaml
new file mode 100644
index 0000000..cfb14ac
--- /dev/null
+++ b/base/mesh/linkerd-viz-values.yaml
@@ -0,0 +1,9 @@
+# Linkerd-viz overrides — use existing Prometheus instead of deploying a second one.
+#
+# By default linkerd-viz ships its own Prometheus, which wastes resources
+# and creates a second scrape loop. Point it at kube-prometheus-stack instead.
+
+prometheus:
+  enabled: false
+
+prometheusUrl: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
diff --git a/base/monitoring/alloy-values.yaml b/base/monitoring/alloy-values.yaml
new file mode 100644
index 0000000..1d64ee3
--- /dev/null
+++ b/base/monitoring/alloy-values.yaml
@@ -0,0 +1,108 @@
+# Grafana Alloy — lightweight agent that ships container logs to Loki
+# and forwards OTLP traces to Tempo.
+#
+# Runs as a DaemonSet so every node's /var/log/pods is tailed.
+
+alloy:
+  configMap:
+    content: |
+      // ── Kubernetes log discovery ──────────────────────────────────
+      discovery.kubernetes "pods" {
+        role = "pod"
+      }
+
+      discovery.relabel "pod_logs" {
+        targets = discovery.kubernetes.pods.targets
+
+        // Keep only running pods
+        rule {
+          source_labels = ["__meta_kubernetes_pod_phase"]
+          regex = "Pending|Succeeded|Failed|Unknown"
+          action = "drop"
+        }
+
+        // Standard labels
+        rule {
+          source_labels = ["__meta_kubernetes_namespace"]
+          target_label = "namespace"
+        }
+        rule {
+          source_labels = ["__meta_kubernetes_pod_name"]
+          target_label = "pod"
+        }
+        rule {
+          source_labels = ["__meta_kubernetes_pod_container_name"]
+          target_label = "container"
+        }
+        rule {
+          source_labels = ["__meta_kubernetes_pod_node_name"]
+          target_label = "node"
+        }
+        // Carry app label for easier Grafana filtering
+        rule {
+          source_labels = ["__meta_kubernetes_pod_label_app"]
+          target_label = "app"
+        }
+        rule {
+          source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
+          target_label = "app"
+          action = "replace"
+          regex = "(.+)"
+        }
+      }
+
+      loki.source.kubernetes "pods" {
+        targets = discovery.relabel.pod_logs.output
+        forward_to = [loki.process.pipeline.receiver]
+      }
+
+      // ── Log processing pipeline ──────────────────────────────────
+      loki.process "pipeline" {
+        // Detect and parse JSON log lines (common in Go / Python services)
+        stage.json {
+          expressions = {
+            level = "level",
+            msg = "msg",
+            traceID = "traceID",
+          }
+        }
+
+        // Promote log level to a label for easier filtering
+        stage.labels {
+          values = { level = "" }
+        }
+
+        forward_to = [loki.write.default.receiver]
+      }
+
+      loki.write "default" {
+        endpoint {
+          url = "http://loki-gateway.monitoring.svc.cluster.local:80/loki/api/v1/push"
+        }
+      }
+
+      // ── OTLP receiver (services can push traces here) ────────────
+      otelcol.receiver.otlp "default" {
+        grpc { endpoint = "0.0.0.0:4317" }
+        http { endpoint = "0.0.0.0:4318" }
+        output { traces = [otelcol.exporter.otlp.tempo.input] }
+      }
+
+      otelcol.exporter.otlp "tempo" {
+        client {
+          endpoint = "tempo.monitoring.svc.cluster.local:4317"
+          tls { insecure = true }
+        }
+      }
+
+  # Mount node log directories for kubernetes log tailing
+  mounts:
+    varlog: true
+
+controller:
+  type: daemonset
+
+# Expose OTLP ports so in-cluster services can send traces to the local agent
+service:
+  enabled: true
+  type: ClusterIP
diff --git a/base/monitoring/dashboards-configmap.yaml b/base/monitoring/dashboards-configmap.yaml
new file mode 100644
index 0000000..977a677
--- /dev/null
+++ b/base/monitoring/dashboards-configmap.yaml
@@ -0,0 +1,310 @@
+# Grafana dashboard ConfigMaps — picked up by the Grafana sidecar.
+#
+# Each ConfigMap holds one or more dashboard JSON files. The sidecar
+# watches for the label grafana_dashboard=1 across all namespaces and
+# hot-loads them into Grafana (no restart required).
+#
+# The grafana_folder annotation groups dashboards into Grafana folders.
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-pingora
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+  annotations:
+    grafana_folder: "Ingress"
+data:
+  pingora.json: |
+    {
+      "annotations": { "list": [] },
+      "editable": true,
+      "fiscalYearStartMonth": 0,
+      "graphTooltip": 1,
+      "links": [],
+      "panels": [
+        {
+          "title": "Requests / sec",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(pingora_http_requests_total[5m]))",
+              "legendFormat": "total"
+            },
+            {
+              "expr": "sum(rate(pingora_http_requests_total[5m])) by (status_code)",
+              "legendFormat": "{{status_code}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": { "unit": "reqps" }
+          }
+        },
+        {
+          "title": "Error Rate (5xx)",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(pingora_http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(pingora_http_requests_total[5m]))",
+              "legendFormat": "5xx ratio"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } }
+          }
+        },
+        {
+          "title": "Request Latency (p50 / p95 / p99)",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "histogram_quantile(0.50, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
+              "legendFormat": "p50"
+            },
+            {
+              "expr": "histogram_quantile(0.95, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
+              "legendFormat": "p95"
+            },
+            {
+              "expr": "histogram_quantile(0.99, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
+              "legendFormat": "p99"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": { "unit": "s" }
+          }
+        },
+        {
+          "title": "Active Connections",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "pingora_active_connections",
+              "legendFormat": "active"
+            }
+          ]
+        },
+        {
+          "title": "Upstream Latency by Backend",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "histogram_quantile(0.95, sum(rate(pingora_upstream_duration_seconds_bucket[5m])) by (le, backend))",
+              "legendFormat": "{{backend}} p95"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": { "unit": "s" }
+          }
+        },
+        {
+          "title": "DDoS / Scanner Detections",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(pingora_ddos_detections_total[5m]))",
+              "legendFormat": "DDoS"
+            },
+            {
+              "expr": "sum(rate(pingora_scanner_detections_total[5m]))",
+              "legendFormat": "Scanner"
+            },
+            {
+              "expr": "sum(rate(pingora_rate_limit_rejected_total[5m]))",
+              "legendFormat": "Rate-limited"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": { "unit": "reqps" }
+          }
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["ingress", "pingora"],
+      "templating": { "list": [] },
+      "time": { "from": "now-1h", "to": "now" },
+      "title": "Pingora Proxy",
+      "uid": "pingora-proxy"
+    }
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-loki
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+  annotations:
+    grafana_folder: "Observability"
+data:
+  loki-overview.json: |
+    {
+      "annotations": { "list": [] },
+      "editable": true,
+      "panels": [
+        {
+          "title": "Log Volume by Namespace",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
+          "datasource": { "uid": "loki" },
+          "targets": [
+            {
+              "expr": "sum(count_over_time({namespace=~\".+\"}[5m])) by (namespace)",
+              "legendFormat": "{{namespace}}"
+            }
+          ]
+        },
+        {
+          "title": "Error Logs",
+          "type": "logs",
+          "gridPos": { "h": 12, "w": 24, "x": 0, "y": 8 },
+          "datasource": { "uid": "loki" },
+          "targets": [
+            {
+              "expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|fatal|exception)\"",
+              "legendFormat": ""
+            }
+          ]
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["loki", "logs"],
+      "time": { "from": "now-1h", "to": "now" },
+      "title": "Loki — Log Overview",
+      "uid": "loki-overview"
+    }
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-tempo
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+  annotations:
+    grafana_folder: "Observability"
+data:
+  tempo-overview.json: |
+    {
+      "annotations": { "list": [] },
+      "editable": true,
+      "panels": [
+        {
+          "title": "Trace Ingestion Rate",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(tempo_distributor_spans_received_total[5m]))",
+              "legendFormat": "spans/s"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "ops" } }
+        },
+        {
+          "title": "Service Map (RED)",
+          "type": "nodeGraph",
+          "gridPos": { "h": 16, "w": 12, "x": 12, "y": 0 },
+          "datasource": { "uid": "tempo" },
+          "targets": [
+            { "queryType": "serviceMap" }
+          ]
+        },
+        {
+          "title": "Span Duration by Service (p95)",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket[5m])) by (le, service))",
+              "legendFormat": "{{service}}"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "s" } }
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["tempo", "tracing"],
+      "time": { "from": "now-1h", "to": "now" },
+      "title": "Tempo — Trace Overview",
+      "uid": "tempo-overview"
+    }
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-openbao
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+  annotations:
+    grafana_folder: "Infrastructure"
+data:
+  openbao.json: |
+    {
+      "annotations": { "list": [] },
+      "editable": true,
+      "panels": [
+        {
+          "title": "Vault/OpenBao Sealed Status",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            { "expr": "vault_core_unsealed", "legendFormat": "unsealed" }
+          ],
+          "fieldConfig": {
+            "defaults": { "mappings": [{"type":"value","options":{"0":{"text":"SEALED","color":"red"},"1":{"text":"UNSEALED","color":"green"}}}] }
+          }
+        },
+        {
+          "title": "Token Count",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            { "expr": "vault_token_count", "legendFormat": "tokens" }
+          ]
+        },
+        {
+          "title": "Request Rate",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            { "expr": "sum(rate(vault_core_handle_request_count[5m]))", "legendFormat": "req/s" }
+          ],
+          "fieldConfig": { "defaults": { "unit": "reqps" } }
+        },
+        {
+          "title": "Request Latency (p95)",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            { "expr": "histogram_quantile(0.95, sum(rate(vault_core_handle_request_bucket[5m])) by (le))", "legendFormat": "p95" }
+          ],
+          "fieldConfig": { "defaults": { "unit": "s" } }
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["vault", "openbao"],
+      "time": { "from": "now-1h", "to": "now" },
+      "title": "OpenBao / Vault",
+      "uid": "openbao"
+    }
diff --git a/base/monitoring/kustomization.yaml b/base/monitoring/kustomization.yaml
index d5df99a..b1307f8 100644
--- a/base/monitoring/kustomization.yaml
+++ b/base/monitoring/kustomization.yaml
@@ -7,6 +7,7 @@ resources:
   - namespace.yaml
   - vault-secrets.yaml
   - grafana-oauth2client.yaml
+  - dashboards-configmap.yaml
 
 helmCharts:
   # helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
@@ -32,3 +33,12 @@
     releaseName: tempo
     namespace: monitoring
     valuesFile: tempo-values.yaml
+
+  # Grafana Alloy — DaemonSet that ships container logs → Loki
+  # and provides an in-cluster OTLP receiver → Tempo.
+  - name: alloy
+    repo: https://grafana.github.io/helm-charts
+    version: "0.12.0"
+    releaseName: alloy
+    namespace: monitoring
+    valuesFile: alloy-values.yaml
diff --git a/base/monitoring/prometheus-values.yaml b/base/monitoring/prometheus-values.yaml
index cab5bc3..63aa6cf 100644
--- a/base/monitoring/prometheus-values.yaml
+++ b/base/monitoring/prometheus-values.yaml
@@ -39,10 +39,20 @@ grafana:
   sidecar:
     datasources:
       defaultDatasourceEnabled: false
+    dashboards:
+      enabled: true
+      # Pick up ConfigMaps with this label in any namespace
+      label: grafana_dashboard
+      labelValue: "1"
+      searchNamespace: ALL
+      folderAnnotation: grafana_folder
+      provider:
+        foldersFromFilesStructure: false
 
   additionalDataSources:
     - name: Prometheus
       type: prometheus
+      uid: prometheus
       url: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
       access: proxy
       isDefault: true
@@ -50,17 +60,53 @@ grafana:
       timeInterval: 30s
     - name: Loki
      type: loki
+      uid: loki
       url: "http://loki-gateway.monitoring.svc.cluster.local:80"
       access: proxy
       isDefault: false
+      jsonData:
+        derivedFields:
+          # Click a traceID in a log line → jump straight to Tempo
+          - datasourceUid: tempo
+            matcherRegex: '"traceID":"(\w+)"'
+            name: TraceID
+            url: "$${__value.raw}"
     - name: Tempo
       type: tempo
+      uid: tempo
       url: "http://tempo.monitoring.svc.cluster.local:3200"
       access: proxy
       isDefault: false
+      jsonData:
+        tracesToLogsV2:
+          datasourceUid: loki
+          filterByTraceID: true
+          filterBySpanID: false
+          tags:
+            - key: namespace
+            - key: pod
+        tracesToMetrics:
+          datasourceUid: prometheus
+          tags:
+            - key: service.name
+              value: service
+        lokiSearch:
+          datasourceUid: loki
+        serviceMap:
+          datasourceUid: prometheus
 
 prometheus:
   prometheusSpec:
+    # Discover ServiceMonitors / PodMonitors / PrometheusRules in ALL namespaces,
+    # not just "monitoring". Without this, monitors in ingress, mesh,
+    # cert-manager, devtools, etc. are invisible to Prometheus.
+    serviceMonitorNamespaceSelector: {}
+    podMonitorNamespaceSelector: {}
+    ruleNamespaceSelector: {}
+    serviceMonitorSelector: {}
+    podMonitorSelector: {}
+    # Accept remote-write from Tempo metrics generator
+    enableRemoteWriteReceiver: true
     retention: 90d
     additionalArgs:
       # Allow browser-direct queries from the Grafana UI origin.
diff --git a/base/monitoring/tempo-values.yaml b/base/monitoring/tempo-values.yaml
index 5ed8604..3dd774b 100644
--- a/base/monitoring/tempo-values.yaml
+++ b/base/monitoring/tempo-values.yaml
@@ -16,6 +16,18 @@ tempo:
       path: /var/tempo/traces
     wal:
       path: /var/tempo/wal
+  # Generate span-derived RED metrics (rate / errors / duration) and push
+  # them into Prometheus so Grafana can show service-level indicators
+  # even without application-level metrics exporters.
+  metricsGenerator:
+    enabled: true
+    remoteWriteUrl: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090/api/v1/write"
+  overrides:
+    defaults:
+      metrics_generator:
+        processors:
+          - service-graphs
+          - span-metrics
 
 persistence:
   enabled: true