From d3943c9a84ac9316f997de2cd2a96883a46fc247 Mon Sep 17 00:00:00 2001
From: Sienna Meridian Satterwhite
Date: Sat, 21 Mar 2026 17:36:54 +0000
Subject: [PATCH] feat(monitoring): wire up full LGTM observability stack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Prometheus: discover ServiceMonitors/PodMonitors in all namespaces,
  enable remote write receiver for Tempo metrics generator
- Tempo: enable metrics generator (service-graphs + span-metrics) with
  remote write to Prometheus
- Loki: add Grafana Alloy DaemonSet to ship container logs
- Grafana: enable dashboard sidecar, add Pingora/Loki/Tempo/OpenBao
  dashboards, add stable UIDs and cross-linking between datasources
  (Loki↔Tempo derived fields, traces→logs, traces→metrics, service map)
- Linkerd: enable proxy tracing to Alloy OTLP collector, point
  linkerd-viz at existing Prometheus instead of deploying its own
- Pingora: add OTLP rollout plan (endpoint commented out until proxy
  telemetry panic fix is deployed and Alloy is verified healthy)
---
 base/ingress/pingora-config.yaml            |   7 +
 base/mesh/kustomization.yaml                |   2 +
 base/mesh/linkerd-control-plane-values.yaml |  19 ++
 base/mesh/linkerd-viz-values.yaml           |   9 +
 base/monitoring/alloy-values.yaml           | 108 +++++++
 base/monitoring/dashboards-configmap.yaml   | 310 ++++++++++++++++++++
 base/monitoring/kustomization.yaml          |  10 +
 base/monitoring/prometheus-values.yaml      |  46 +++
 base/monitoring/tempo-values.yaml           |  12 +
 9 files changed, 523 insertions(+)
 create mode 100644 base/mesh/linkerd-control-plane-values.yaml
 create mode 100644 base/mesh/linkerd-viz-values.yaml
 create mode 100644 base/monitoring/alloy-values.yaml
 create mode 100644 base/monitoring/dashboards-configmap.yaml

diff --git a/base/ingress/pingora-config.yaml b/base/ingress/pingora-config.yaml
index 544f099..8f0b441 100644
--- a/base/ingress/pingora-config.yaml
+++ b/base/ingress/pingora-config.yaml
@@ -21,6 +21,13 @@ data:
     key_path = "/etc/tls/tls.key"
 
     [telemetry]
+    # Rollout plan for OTLP tracing:
+    # 1. Deploy proxy build that includes the graceful telemetry init
+    #    (proxy/src/telemetry.rs — no longer panics on exporter failure)
+    # 2. Verify Alloy is running:
+    #    kubectl -n monitoring get pods -l app.kubernetes.io/name=alloy
+    # 3. Uncomment the line below:
+    # otlp_endpoint = "http://alloy.monitoring.svc.cluster.local:4318"
     otlp_endpoint = ""
     metrics_port = 9090
 
diff --git a/base/mesh/kustomization.yaml b/base/mesh/kustomization.yaml
index dada26a..13ab90c 100644
--- a/base/mesh/kustomization.yaml
+++ b/base/mesh/kustomization.yaml
@@ -29,9 +29,11 @@ helmCharts:
     version: "2025.12.3"
     releaseName: linkerd-control-plane
     namespace: mesh
+    valuesFile: linkerd-control-plane-values.yaml
 
   - name: linkerd-viz
     repo: https://helm.linkerd.io/edge
     version: "2026.1.4"
     releaseName: linkerd-viz
     namespace: mesh
+    valuesFile: linkerd-viz-values.yaml
diff --git a/base/mesh/linkerd-control-plane-values.yaml b/base/mesh/linkerd-control-plane-values.yaml
new file mode 100644
index 0000000..5d100fe
--- /dev/null
+++ b/base/mesh/linkerd-control-plane-values.yaml
@@ -0,0 +1,19 @@
+# Linkerd control-plane overrides — enable proxy tracing to Tempo.
+#
+# Every meshed pod's Linkerd sidecar will export OTLP traces to the
+# Alloy collector in the monitoring namespace, which forwards to Tempo.
+
+# Controller-level tracing (identity, destination controllers)
+controller:
+  tracing:
+    enabled: true
+    collector:
+      endpoint: "alloy.monitoring.svc.cluster.local:4317"
+
+# Proxy-level tracing (every meshed sidecar)
+proxy:
+  tracing:
+    enabled: true
+    traceServiceName: linkerd-proxy
+    collector:
+      endpoint: "alloy.monitoring.svc.cluster.local:4317"
diff --git a/base/mesh/linkerd-viz-values.yaml b/base/mesh/linkerd-viz-values.yaml
new file mode 100644
index 0000000..cfb14ac
--- /dev/null
+++ b/base/mesh/linkerd-viz-values.yaml
@@ -0,0 +1,9 @@
+# Linkerd-viz overrides — use existing Prometheus instead of deploying a second one.
+#
+# By default linkerd-viz ships its own Prometheus, which wastes resources
+# and creates a second scrape loop. Point it at kube-prometheus-stack instead.
+
+prometheus:
+  enabled: false
+
+prometheusUrl: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
diff --git a/base/monitoring/alloy-values.yaml b/base/monitoring/alloy-values.yaml
new file mode 100644
index 0000000..1d64ee3
--- /dev/null
+++ b/base/monitoring/alloy-values.yaml
@@ -0,0 +1,108 @@
+# Grafana Alloy — lightweight agent that ships container logs to Loki
+# and forwards OTLP traces to Tempo.
+#
+# Runs as a DaemonSet so every node's /var/log/pods is tailed.
+
+alloy:
+  configMap:
+    content: |
+      // ── Kubernetes log discovery ──────────────────────────────────
+      discovery.kubernetes "pods" {
+        role = "pod"
+      }
+
+      discovery.relabel "pod_logs" {
+        targets = discovery.kubernetes.pods.targets
+
+        // Keep only running pods
+        rule {
+          source_labels = ["__meta_kubernetes_pod_phase"]
+          regex = "Pending|Succeeded|Failed|Unknown"
+          action = "drop"
+        }
+
+        // Standard labels
+        rule {
+          source_labels = ["__meta_kubernetes_namespace"]
+          target_label = "namespace"
+        }
+        rule {
+          source_labels = ["__meta_kubernetes_pod_name"]
+          target_label = "pod"
+        }
+        rule {
+          source_labels = ["__meta_kubernetes_pod_container_name"]
+          target_label = "container"
+        }
+        rule {
+          source_labels = ["__meta_kubernetes_pod_node_name"]
+          target_label = "node"
+        }
+        // Carry app label for easier Grafana filtering
+        rule {
+          source_labels = ["__meta_kubernetes_pod_label_app"]
+          target_label = "app"
+        }
+        rule {
+          source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
+          target_label = "app"
+          action = "replace"
+          regex = "(.+)"
+        }
+      }
+
+      loki.source.kubernetes "pods" {
+        targets = discovery.relabel.pod_logs.output
+        forward_to = [loki.process.pipeline.receiver]
+      }
+
+      // ── Log processing pipeline ──────────────────────────────────
+      loki.process "pipeline" {
+        // Detect and parse JSON log lines (common in Go / Python services)
+        stage.json {
+          expressions = {
+            level = "level",
+            msg = "msg",
+            traceID = "traceID",
+          }
+        }
+
+        // Promote log level to a label for easier filtering
+        stage.labels {
+          values = { level = "" }
+        }
+
+        forward_to = [loki.write.default.receiver]
+      }
+
+      loki.write "default" {
+        endpoint {
+          url = "http://loki-gateway.monitoring.svc.cluster.local:80/loki/api/v1/push"
+        }
+      }
+
+      // ── OTLP receiver (services can push traces here) ────────────
+      otelcol.receiver.otlp "default" {
+        grpc { endpoint = "0.0.0.0:4317" }
+        http { endpoint = "0.0.0.0:4318" }
+        output { traces = [otelcol.exporter.otlp.tempo.input] }
+      }
+
+      otelcol.exporter.otlp "tempo" {
+        client {
+          endpoint = "tempo.monitoring.svc.cluster.local:4317"
+          tls { insecure = true }
+        }
+      }
+
+  # Mount node log directories for kubernetes log tailing
+  mounts:
+    varlog: true
+
+controller:
+  type: daemonset
+
+# Expose OTLP ports so in-cluster services can send traces to the local agent
+service:
+  enabled: true
+  type: ClusterIP
diff --git a/base/monitoring/dashboards-configmap.yaml b/base/monitoring/dashboards-configmap.yaml
new file mode 100644
index 0000000..977a677
--- /dev/null
+++ b/base/monitoring/dashboards-configmap.yaml
@@ -0,0 +1,310 @@
+# Grafana dashboard ConfigMaps — picked up by the Grafana sidecar.
+#
+# Each ConfigMap holds one or more dashboard JSON files. The sidecar
+# watches for the label grafana_dashboard=1 across all namespaces and
+# hot-loads them into Grafana (no restart required).
+#
+# The grafana_folder annotation groups dashboards into Grafana folders.
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-pingora
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+  annotations:
+    grafana_folder: "Ingress"
+data:
+  pingora.json: |
+    {
+      "annotations": { "list": [] },
+      "editable": true,
+      "fiscalYearStartMonth": 0,
+      "graphTooltip": 1,
+      "links": [],
+      "panels": [
+        {
+          "title": "Requests / sec",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(pingora_http_requests_total[5m]))",
+              "legendFormat": "total"
+            },
+            {
+              "expr": "sum(rate(pingora_http_requests_total[5m])) by (status_code)",
+              "legendFormat": "{{status_code}}"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": { "unit": "reqps" }
+          }
+        },
+        {
+          "title": "Error Rate (5xx)",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(pingora_http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(pingora_http_requests_total[5m]))",
+              "legendFormat": "5xx ratio"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } }
+          }
+        },
+        {
+          "title": "Request Latency (p50 / p95 / p99)",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "histogram_quantile(0.50, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
+              "legendFormat": "p50"
+            },
+            {
+              "expr": "histogram_quantile(0.95, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
+              "legendFormat": "p95"
+            },
+            {
+              "expr": "histogram_quantile(0.99, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
+              "legendFormat": "p99"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": { "unit": "s" }
+          }
+        },
+        {
+          "title": "Active Connections",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "pingora_active_connections",
+              "legendFormat": "active"
+            }
+          ]
+        },
+        {
+          "title": "Upstream Latency by Backend",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "histogram_quantile(0.95, sum(rate(pingora_upstream_duration_seconds_bucket[5m])) by (le, backend))",
+              "legendFormat": "{{backend}} p95"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": { "unit": "s" }
+          }
+        },
+        {
+          "title": "DDoS / Scanner Detections",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(pingora_ddos_detections_total[5m]))",
+              "legendFormat": "DDoS"
+            },
+            {
+              "expr": "sum(rate(pingora_scanner_detections_total[5m]))",
+              "legendFormat": "Scanner"
+            },
+            {
+              "expr": "sum(rate(pingora_rate_limit_rejected_total[5m]))",
+              "legendFormat": "Rate-limited"
+            }
+          ],
+          "fieldConfig": {
+            "defaults": { "unit": "reqps" }
+          }
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["ingress", "pingora"],
+      "templating": { "list": [] },
+      "time": { "from": "now-1h", "to": "now" },
+      "title": "Pingora Proxy",
+      "uid": "pingora-proxy"
+    }
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-loki
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+  annotations:
+    grafana_folder: "Observability"
+data:
+  loki-overview.json: |
+    {
+      "annotations": { "list": [] },
+      "editable": true,
+      "panels": [
+        {
+          "title": "Log Volume by Namespace",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
+          "datasource": { "uid": "loki" },
+          "targets": [
+            {
+              "expr": "sum(count_over_time({namespace=~\".+\"}[5m])) by (namespace)",
+              "legendFormat": "{{namespace}}"
+            }
+          ]
+        },
+        {
+          "title": "Error Logs",
+          "type": "logs",
+          "gridPos": { "h": 12, "w": 24, "x": 0, "y": 8 },
+          "datasource": { "uid": "loki" },
+          "targets": [
+            {
+              "expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|fatal|exception)\"",
+              "legendFormat": ""
+            }
+          ]
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["loki", "logs"],
+      "time": { "from": "now-1h", "to": "now" },
+      "title": "Loki — Log Overview",
+      "uid": "loki-overview"
+    }
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-tempo
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+  annotations:
+    grafana_folder: "Observability"
+data:
+  tempo-overview.json: |
+    {
+      "annotations": { "list": [] },
+      "editable": true,
+      "panels": [
+        {
+          "title": "Trace Ingestion Rate",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "sum(rate(tempo_distributor_spans_received_total[5m]))",
+              "legendFormat": "spans/s"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "ops" } }
+        },
+        {
+          "title": "Service Map (RED)",
+          "type": "nodeGraph",
+          "gridPos": { "h": 16, "w": 12, "x": 12, "y": 0 },
+          "datasource": { "uid": "tempo" },
+          "targets": [
+            { "queryType": "serviceMap" }
+          ]
+        },
+        {
+          "title": "Span Duration by Service (p95)",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            {
+              "expr": "histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket[5m])) by (le, service))",
+              "legendFormat": "{{service}}"
+            }
+          ],
+          "fieldConfig": { "defaults": { "unit": "s" } }
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["tempo", "tracing"],
+      "time": { "from": "now-1h", "to": "now" },
+      "title": "Tempo — Trace Overview",
+      "uid": "tempo-overview"
+    }
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-openbao
+  namespace: monitoring
+  labels:
+    grafana_dashboard: "1"
+  annotations:
+    grafana_folder: "Infrastructure"
+data:
+  openbao.json: |
+    {
+      "annotations": { "list": [] },
+      "editable": true,
+      "panels": [
+        {
+          "title": "Vault/OpenBao Sealed Status",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            { "expr": "vault_core_unsealed", "legendFormat": "unsealed" }
+          ],
+          "fieldConfig": {
+            "defaults": { "mappings": [{"type":"value","options":{"0":{"text":"SEALED","color":"red"},"1":{"text":"UNSEALED","color":"green"}}}] }
+          }
+        },
+        {
+          "title": "Token Count",
+          "type": "stat",
+          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            { "expr": "vault_token_count", "legendFormat": "tokens" }
+          ]
+        },
+        {
+          "title": "Request Rate",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            { "expr": "sum(rate(vault_core_handle_request_count[5m]))", "legendFormat": "req/s" }
+          ],
+          "fieldConfig": { "defaults": { "unit": "reqps" } }
+        },
+        {
+          "title": "Request Latency (p95)",
+          "type": "timeseries",
+          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
+          "datasource": { "uid": "prometheus" },
+          "targets": [
+            { "expr": "histogram_quantile(0.95, sum(rate(vault_core_handle_request_bucket[5m])) by (le))", "legendFormat": "p95" }
+          ],
+          "fieldConfig": { "defaults": { "unit": "s" } }
+        }
+      ],
+      "schemaVersion": 39,
+      "tags": ["vault", "openbao"],
+      "time": { "from": "now-1h", "to": "now" },
+      "title": "OpenBao / Vault",
+      "uid": "openbao"
+    }
diff --git a/base/monitoring/kustomization.yaml b/base/monitoring/kustomization.yaml
index d5df99a..b1307f8 100644
--- a/base/monitoring/kustomization.yaml
+++ b/base/monitoring/kustomization.yaml
@@ -7,6 +7,7 @@ resources:
   - namespace.yaml
   - vault-secrets.yaml
   - grafana-oauth2client.yaml
+  - dashboards-configmap.yaml
 
 helmCharts:
   # helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
@@ -32,3 +33,12 @@
     releaseName: tempo
     namespace: monitoring
     valuesFile: tempo-values.yaml
+
+  # Grafana Alloy — DaemonSet that ships container logs → Loki
+  # and provides an in-cluster OTLP receiver → Tempo.
+  - name: alloy
+    repo: https://grafana.github.io/helm-charts
+    version: "0.12.0"
+    releaseName: alloy
+    namespace: monitoring
+    valuesFile: alloy-values.yaml
diff --git a/base/monitoring/prometheus-values.yaml b/base/monitoring/prometheus-values.yaml
index cab5bc3..63aa6cf 100644
--- a/base/monitoring/prometheus-values.yaml
+++ b/base/monitoring/prometheus-values.yaml
@@ -39,10 +39,20 @@ grafana:
   sidecar:
     datasources:
       defaultDatasourceEnabled: false
+    dashboards:
+      enabled: true
+      # Pick up ConfigMaps with this label in any namespace
+      label: grafana_dashboard
+      labelValue: "1"
+      searchNamespace: ALL
+      folderAnnotation: grafana_folder
+      provider:
+        foldersFromFilesStructure: false
 
   additionalDataSources:
     - name: Prometheus
       type: prometheus
+      uid: prometheus
       url: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
       access: proxy
       isDefault: true
@@ -50,17 +60,53 @@ grafana:
       timeInterval: 30s
     - name: Loki
      type: loki
+      uid: loki
       url: "http://loki-gateway.monitoring.svc.cluster.local:80"
       access: proxy
       isDefault: false
+      jsonData:
+        derivedFields:
+          # Click a traceID in a log line → jump straight to Tempo
+          - datasourceUid: tempo
+            matcherRegex: '"traceID":"(\w+)"'
+            name: TraceID
+            url: "$${__value.raw}"
     - name: Tempo
       type: tempo
+      uid: tempo
       url: "http://tempo.monitoring.svc.cluster.local:3200"
       access: proxy
       isDefault: false
+      jsonData:
+        tracesToLogsV2:
+          datasourceUid: loki
+          filterByTraceID: true
+          filterBySpanID: false
+          tags:
+            - key: namespace
+            - key: pod
+        tracesToMetrics:
+          datasourceUid: prometheus
+          tags:
+            - key: service.name
+              value: service
+        lokiSearch:
+          datasourceUid: loki
+        serviceMap:
+          datasourceUid: prometheus
 
 prometheus:
   prometheusSpec:
+    # Discover ServiceMonitors / PodMonitors / PrometheusRules in ALL namespaces,
+    # not just "monitoring". Without this, monitors in ingress, mesh,
+    # cert-manager, devtools, etc. are invisible to Prometheus.
+    serviceMonitorNamespaceSelector: {}
+    podMonitorNamespaceSelector: {}
+    ruleNamespaceSelector: {}
+    serviceMonitorSelector: {}
+    podMonitorSelector: {}
+    # Accept remote-write from Tempo metrics generator
+    enableRemoteWriteReceiver: true
     retention: 90d
     additionalArgs:
       # Allow browser-direct queries from the Grafana UI origin.
diff --git a/base/monitoring/tempo-values.yaml b/base/monitoring/tempo-values.yaml
index 5ed8604..3dd774b 100644
--- a/base/monitoring/tempo-values.yaml
+++ b/base/monitoring/tempo-values.yaml
@@ -16,6 +16,18 @@ tempo:
       path: /var/tempo/traces
     wal:
       path: /var/tempo/wal
+  # Generate span-derived RED metrics (rate / errors / duration) and push
+  # them into Prometheus so Grafana can show service-level indicators
+  # even without application-level metrics exporters.
+  metricsGenerator:
+    enabled: true
+    remoteWriteUrl: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090/api/v1/write"
+  overrides:
+    defaults:
+      metrics_generator:
+        processors:
+          - service-graphs
+          - span-metrics
 
 persistence:
   enabled: true