feat(monitoring): wire up full LGTM observability stack

- Prometheus: discover ServiceMonitors/PodMonitors in all namespaces,
  enable remote write receiver for Tempo metrics generator
- Tempo: enable metrics generator (service-graphs + span-metrics)
  with remote write to Prometheus
- Loki: add Grafana Alloy DaemonSet to ship container logs
- Grafana: enable dashboard sidecar, add Pingora/Loki/Tempo/OpenBao
  dashboards, add stable UIDs and cross-linking between datasources
  (Loki↔Tempo derived fields, traces→logs, traces→metrics, service map)
- Linkerd: enable proxy tracing to Alloy OTLP collector, point
  linkerd-viz at existing Prometheus instead of deploying its own
- Pingora: add OTLP rollout plan (endpoint commented out until proxy
  telemetry panic fix is deployed and Alloy is verified healthy)

commit d3943c9a84 (parent 5f923d14f9)
Date: 2026-03-21 17:36:54 +00:00
9 changed files with 523 additions and 0 deletions


@@ -21,6 +21,13 @@ data:
key_path = "/etc/tls/tls.key"
[telemetry]
# Rollout plan for OTLP tracing:
# 1. Deploy proxy build that includes the graceful telemetry init
# (proxy/src/telemetry.rs — no longer panics on exporter failure)
# 2. Verify Alloy is running:
# kubectl -n monitoring get pods -l app.kubernetes.io/name=alloy
# 3. Uncomment the line below:
# otlp_endpoint = "http://alloy.monitoring.svc.cluster.local:4318"
otlp_endpoint = ""
metrics_port = 9090
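
A note on steps 2–3 above: beyond checking that the Alloy pods are Running, the OTLP/HTTP
listener itself can be smoke-tested from inside the cluster before uncommenting the
endpoint. A minimal sketch, assuming curl via a throwaway debug pod; any HTTP response
(an empty export should come back 200) confirms the listener is reachable, whereas a
connection error means the endpoint should stay commented out:

  kubectl -n monitoring run otlp-check --rm -it --restart=Never \
    --image=curlimages/curl --command -- \
    curl -s -o /dev/null -w "%{http_code}\n" -X POST \
    -H "Content-Type: application/json" -d '{}' \
    http://alloy.monitoring.svc.cluster.local:4318/v1/traces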


@@ -29,9 +29,11 @@ helmCharts:
version: "2025.12.3"
releaseName: linkerd-control-plane
namespace: mesh
valuesFile: linkerd-control-plane-values.yaml
- name: linkerd-viz
repo: https://helm.linkerd.io/edge
version: "2026.1.4"
releaseName: linkerd-viz
namespace: mesh
valuesFile: linkerd-viz-values.yaml


@@ -0,0 +1,19 @@
# Linkerd control-plane overrides — enable proxy tracing to Tempo.
#
# Every meshed pod's Linkerd sidecar will export OTLP traces to the
# Alloy collector in the monitoring namespace, which forwards to Tempo.
# Controller-level tracing (identity, destination controllers)
controller:
tracing:
enabled: true
collector:
endpoint: "alloy.monitoring.svc.cluster.local:4317"
# Proxy-level tracing (every meshed sidecar)
proxy:
tracing:
enabled: true
traceServiceName: linkerd-proxy
collector:
endpoint: "alloy.monitoring.svc.cluster.local:4317"


@@ -0,0 +1,9 @@
# Linkerd-viz overrides — use existing Prometheus instead of deploying a second one.
#
# By default linkerd-viz ships its own Prometheus, which wastes resources
# and creates a second scrape loop. Point it at kube-prometheus-stack instead.
prometheus:
enabled: false
prometheusUrl: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
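
With the bundled Prometheus disabled, kube-prometheus-stack has to scrape the proxy
sidecars itself or the viz dashboards will have nothing to read. A minimal sketch of one
way to do that with a PodMonitor — the label selector and port name are assumptions taken
from typical injected pod specs, and the full scrape/relabel requirements in the linkerd
"bring your own Prometheus" docs should be checked before relying on it. The empty
podMonitorSelector set on Prometheus further down means no extra release label is needed
for it to be discovered:

  apiVersion: monitoring.coreos.com/v1
  kind: PodMonitor
  metadata:
    name: linkerd-proxy
    namespace: monitoring
  spec:
    namespaceSelector:
      any: true
    selector:
      matchLabels:
        linkerd.io/control-plane-ns: mesh
    podMetricsEndpoints:
      - port: linkerd-admin
        path: /metrics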


@@ -0,0 +1,108 @@
# Grafana Alloy — lightweight agent that ships container logs to Loki
# and forwards OTLP traces to Tempo.
#
# Runs as a DaemonSet so every node's /var/log/pods is tailed.
alloy:
configMap:
content: |
// ── Kubernetes log discovery ──────────────────────────────────
discovery.kubernetes "pods" {
role = "pod"
}
discovery.relabel "pod_logs" {
targets = discovery.kubernetes.pods.targets
// Keep only running pods
rule {
source_labels = ["__meta_kubernetes_pod_phase"]
regex = "Pending|Succeeded|Failed|Unknown"
action = "drop"
}
// Standard labels
rule {
source_labels = ["__meta_kubernetes_namespace"]
target_label = "namespace"
}
rule {
source_labels = ["__meta_kubernetes_pod_name"]
target_label = "pod"
}
rule {
source_labels = ["__meta_kubernetes_pod_container_name"]
target_label = "container"
}
rule {
source_labels = ["__meta_kubernetes_pod_node_name"]
target_label = "node"
}
// Carry app label for easier Grafana filtering
rule {
source_labels = ["__meta_kubernetes_pod_label_app"]
target_label = "app"
}
rule {
source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
target_label = "app"
action = "replace"
regex = "(.+)"
}
}
loki.source.kubernetes "pods" {
targets = discovery.relabel.pod_logs.output
forward_to = [loki.process.pipeline.receiver]
}
// ── Log processing pipeline ──────────────────────────────────
loki.process "pipeline" {
// Detect and parse JSON log lines (common in Go / Python services)
stage.json {
expressions = {
level = "level",
msg = "msg",
traceID = "traceID",
}
}
// Promote log level to a label for easier filtering
stage.labels {
values = { level = "" }
}
forward_to = [loki.write.default.receiver]
}
loki.write "default" {
endpoint {
url = "http://loki-gateway.monitoring.svc.cluster.local:80/loki/api/v1/push"
}
}
// ── OTLP receiver (services can push traces here) ────────────
otelcol.receiver.otlp "default" {
grpc { endpoint = "0.0.0.0:4317" }
http { endpoint = "0.0.0.0:4318" }
output { traces = [otelcol.exporter.otlp.tempo.input] }
}
otelcol.exporter.otlp "tempo" {
client {
endpoint = "tempo.monitoring.svc.cluster.local:4317"
tls { insecure = true }
}
}
controller:
type: daemonset
# Mount node log directories for kubernetes log tailing
mounts:
varlog: true
# Expose OTLP ports so in-cluster services can send traces to the local agent
service:
enabled: true
type: ClusterIP
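
With the receiver in place, any in-cluster service that speaks OTLP can point its SDK at
the Alloy Service using the standard OpenTelemetry environment variables. A sketch (the
service name is hypothetical; SDKs defaulting to gRPC would use port 4317 instead of the
HTTP port 4318):

  env:
    - name: OTEL_EXPORTER_OTLP_ENDPOINT
      value: "http://alloy.monitoring.svc.cluster.local:4318"
    - name: OTEL_SERVICE_NAME
      value: "example-service"

And once the DaemonSet is rolling, the relabeled log stream can be exercised directly in
Grafana Explore against Loki, e.g. error volume per namespace via the promoted level
label:

  sum by (namespace) (count_over_time({level="error"}[5m]))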


@@ -0,0 +1,310 @@
# Grafana dashboard ConfigMaps — picked up by the Grafana sidecar.
#
# Each ConfigMap holds one or more dashboard JSON files. The sidecar
# watches for the label grafana_dashboard=1 across all namespaces and
# hot-loads them into Grafana (no restart required).
#
# The grafana_folder annotation groups dashboards into Grafana folders.
---
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-pingora
namespace: monitoring
labels:
grafana_dashboard: "1"
annotations:
grafana_folder: "Ingress"
data:
pingora.json: |
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"links": [],
"panels": [
{
"title": "Requests / sec",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
"datasource": { "uid": "prometheus" },
"targets": [
{
"expr": "sum(rate(pingora_http_requests_total[5m]))",
"legendFormat": "total"
},
{
"expr": "sum(rate(pingora_http_requests_total[5m])) by (status_code)",
"legendFormat": "{{status_code}}"
}
],
"fieldConfig": {
"defaults": { "unit": "reqps" }
}
},
{
"title": "Error Rate (5xx)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
"datasource": { "uid": "prometheus" },
"targets": [
{
"expr": "sum(rate(pingora_http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(pingora_http_requests_total[5m]))",
"legendFormat": "5xx ratio"
}
],
"fieldConfig": {
"defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } }
}
},
{
"title": "Request Latency (p50 / p95 / p99)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
"datasource": { "uid": "prometheus" },
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "p50"
},
{
"expr": "histogram_quantile(0.95, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "p95"
},
{
"expr": "histogram_quantile(0.99, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "p99"
}
],
"fieldConfig": {
"defaults": { "unit": "s" }
}
},
{
"title": "Active Connections",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
"datasource": { "uid": "prometheus" },
"targets": [
{
"expr": "pingora_active_connections",
"legendFormat": "active"
}
]
},
{
"title": "Upstream Latency by Backend",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
"datasource": { "uid": "prometheus" },
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(pingora_upstream_duration_seconds_bucket[5m])) by (le, backend))",
"legendFormat": "{{backend}} p95"
}
],
"fieldConfig": {
"defaults": { "unit": "s" }
}
},
{
"title": "DDoS / Scanner Detections",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
"datasource": { "uid": "prometheus" },
"targets": [
{
"expr": "sum(rate(pingora_ddos_detections_total[5m]))",
"legendFormat": "DDoS"
},
{
"expr": "sum(rate(pingora_scanner_detections_total[5m]))",
"legendFormat": "Scanner"
},
{
"expr": "sum(rate(pingora_rate_limit_rejected_total[5m]))",
"legendFormat": "Rate-limited"
}
],
"fieldConfig": {
"defaults": { "unit": "reqps" }
}
}
],
"schemaVersion": 39,
"tags": ["ingress", "pingora"],
"templating": { "list": [] },
"time": { "from": "now-1h", "to": "now" },
"title": "Pingora Proxy",
"uid": "pingora-proxy"
}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-loki
namespace: monitoring
labels:
grafana_dashboard: "1"
annotations:
grafana_folder: "Observability"
data:
loki-overview.json: |
{
"annotations": { "list": [] },
"editable": true,
"panels": [
{
"title": "Log Volume by Namespace",
"type": "timeseries",
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
"datasource": { "uid": "loki" },
"targets": [
{
"expr": "sum(count_over_time({namespace=~\".+\"}[5m])) by (namespace)",
"legendFormat": "{{namespace}}"
}
]
},
{
"title": "Error Logs",
"type": "logs",
"gridPos": { "h": 12, "w": 24, "x": 0, "y": 8 },
"datasource": { "uid": "loki" },
"targets": [
{
"expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|fatal|exception)\"",
"legendFormat": ""
}
]
}
],
"schemaVersion": 39,
"tags": ["loki", "logs"],
"time": { "from": "now-1h", "to": "now" },
"title": "Loki — Log Overview",
"uid": "loki-overview"
}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-tempo
namespace: monitoring
labels:
grafana_dashboard: "1"
annotations:
grafana_folder: "Observability"
data:
tempo-overview.json: |
{
"annotations": { "list": [] },
"editable": true,
"panels": [
{
"title": "Trace Ingestion Rate",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
"datasource": { "uid": "prometheus" },
"targets": [
{
"expr": "sum(rate(tempo_distributor_spans_received_total[5m]))",
"legendFormat": "spans/s"
}
],
"fieldConfig": { "defaults": { "unit": "ops" } }
},
{
"title": "Service Map (RED)",
"type": "nodeGraph",
"gridPos": { "h": 16, "w": 12, "x": 12, "y": 0 },
"datasource": { "uid": "tempo" },
"targets": [
{ "queryType": "serviceMap" }
]
},
{
"title": "Span Duration by Service (p95)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
"datasource": { "uid": "prometheus" },
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket[5m])) by (le, service))",
"legendFormat": "{{service}}"
}
],
"fieldConfig": { "defaults": { "unit": "s" } }
}
],
"schemaVersion": 39,
"tags": ["tempo", "tracing"],
"time": { "from": "now-1h", "to": "now" },
"title": "Tempo — Trace Overview",
"uid": "tempo-overview"
}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-openbao
namespace: monitoring
labels:
grafana_dashboard: "1"
annotations:
grafana_folder: "Infrastructure"
data:
openbao.json: |
{
"annotations": { "list": [] },
"editable": true,
"panels": [
{
"title": "Vault/OpenBao Sealed Status",
"type": "stat",
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
"datasource": { "uid": "prometheus" },
"targets": [
{ "expr": "vault_core_unsealed", "legendFormat": "unsealed" }
],
"fieldConfig": {
"defaults": { "mappings": [{"type":"value","options":{"0":{"text":"SEALED","color":"red"},"1":{"text":"UNSEALED","color":"green"}}}] }
}
},
{
"title": "Token Count",
"type": "stat",
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
"datasource": { "uid": "prometheus" },
"targets": [
{ "expr": "vault_token_count", "legendFormat": "tokens" }
]
},
{
"title": "Request Rate",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
"datasource": { "uid": "prometheus" },
"targets": [
{ "expr": "sum(rate(vault_core_handle_request_count[5m]))", "legendFormat": "req/s" }
],
"fieldConfig": { "defaults": { "unit": "reqps" } }
},
{
"title": "Request Latency (p95)",
"type": "timeseries",
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"datasource": { "uid": "prometheus" },
"targets": [
{ "expr": "histogram_quantile(0.95, sum(rate(vault_core_handle_request_bucket[5m])) by (le))", "legendFormat": "p95" }
],
"fieldConfig": { "defaults": { "unit": "s" } }
}
],
"schemaVersion": 39,
"tags": ["vault", "openbao"],
"time": { "from": "now-1h", "to": "now" },
"title": "OpenBao / Vault",
"uid": "openbao"
}
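
After an apply, a quick way to confirm the sidecar hot-loaded these is to list the
labeled ConfigMaps and tail the sidecar container. The deployment and container names
below follow the kube-prometheus-stack chart defaults, so adjust them if the release
differs:

  kubectl -n monitoring get configmaps -l grafana_dashboard=1
  kubectl -n monitoring logs deploy/kube-prometheus-stack-grafana -c grafana-sc-dashboard --tail=20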


@@ -7,6 +7,7 @@ resources:
- namespace.yaml
- vault-secrets.yaml
- grafana-oauth2client.yaml
- dashboards-configmap.yaml
helmCharts:
# helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
@@ -32,3 +33,12 @@ helmCharts:
releaseName: tempo
namespace: monitoring
valuesFile: tempo-values.yaml
# Grafana Alloy — DaemonSet that ships container logs → Loki
# and provides an in-cluster OTLP receiver → Tempo.
- name: alloy
repo: https://grafana.github.io/helm-charts
version: "0.12.0"
releaseName: alloy
namespace: monitoring
valuesFile: alloy-values.yaml
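
Because the charts are inflated at build time, the wiring can be sanity-checked locally
before a sync. A sketch, with the overlay path left as a placeholder (the helmCharts
field requires Helm support to be enabled):

  kustomize build --enable-helm <path-to-this-overlay> | kubectl apply --dry-run=server -f -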


@@ -39,10 +39,20 @@ grafana:
sidecar:
datasources:
defaultDatasourceEnabled: false
dashboards:
enabled: true
# Pick up ConfigMaps with this label in any namespace
label: grafana_dashboard
labelValue: "1"
searchNamespace: ALL
folderAnnotation: grafana_folder
provider:
foldersFromFilesStructure: false
additionalDataSources:
- name: Prometheus
type: prometheus
uid: prometheus
url: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
access: proxy
isDefault: true
@@ -50,17 +60,53 @@ grafana:
timeInterval: 30s
- name: Loki
type: loki
uid: loki
url: "http://loki-gateway.monitoring.svc.cluster.local:80"
access: proxy
isDefault: false
jsonData:
derivedFields:
# Click a traceID in a log line → jump straight to Tempo
- datasourceUid: tempo
matcherRegex: '"traceID":"(\w+)"'
name: TraceID
url: "$${__value.raw}"
- name: Tempo
type: tempo
uid: tempo
url: "http://tempo.monitoring.svc.cluster.local:3200"
access: proxy
isDefault: false
jsonData:
tracesToLogsV2:
datasourceUid: loki
filterByTraceID: true
filterBySpanID: false
tags:
- key: namespace
- key: pod
tracesToMetrics:
datasourceUid: prometheus
tags:
- key: service.name
value: service
lokiSearch:
datasourceUid: loki
serviceMap:
datasourceUid: prometheus
prometheus:
prometheusSpec:
# Discover ServiceMonitors / PodMonitors / PrometheusRules in ALL namespaces,
# not just "monitoring". Without this, monitors in ingress, mesh,
# cert-manager, devtools, etc. are invisible to Prometheus.
serviceMonitorNamespaceSelector: {}
podMonitorNamespaceSelector: {}
ruleNamespaceSelector: {}
serviceMonitorSelector: {}
podMonitorSelector: {}
# Accept remote-write from Tempo metrics generator
enableRemoteWriteReceiver: true
retention: 90d
additionalArgs:
# Allow browser-direct queries from the Grafana UI origin.
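
The Loki derived field, the Alloy stage.json stage earlier in this change, and the
tracesToLogsV2 block all hinge on services emitting structured logs that carry the trace
ID under the same key. A sketch of a log line that round-trips cleanly (the trace ID
value is illustrative):

  {"level":"info","msg":"handled request","traceID":"4bf92f3577b34da6a3ce929d0e0e4736"}

The matcherRegex extracts the hex ID so clicking it in a Loki panel opens the matching
trace in Tempo, and tracesToLogsV2 links back the other way using the namespace and pod
tags.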


@@ -16,6 +16,18 @@ tempo:
path: /var/tempo/traces
wal:
path: /var/tempo/wal
# Generate span-derived RED metrics (rate / errors / duration) and push
# them into Prometheus so Grafana can show service-level indicators
# even without application-level metrics exporters.
metricsGenerator:
enabled: true
remoteWriteUrl: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090/api/v1/write"
overrides:
defaults:
metrics_generator:
processors:
- service-graphs
- span-metrics
persistence:
enabled: true
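
Once the generator is running, its output can be verified straight from Prometheus. The
metric names below follow the Tempo metrics-generator defaults (the Tempo dashboard
above already queries traces_spanmetrics_latency_bucket):

  sum by (service) (rate(traces_spanmetrics_calls_total[5m]))
  sum by (client, server) (rate(traces_service_graph_request_total[5m]))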