feat(monitoring): wire up full LGTM observability stack
- Prometheus: discover ServiceMonitors/PodMonitors in all namespaces, enable remote write receiver for Tempo metrics generator
- Tempo: enable metrics generator (service-graphs + span-metrics) with remote write to Prometheus
- Loki: add Grafana Alloy DaemonSet to ship container logs
- Grafana: enable dashboard sidecar, add Pingora/Loki/Tempo/OpenBao dashboards, add stable UIDs and cross-linking between datasources (Loki↔Tempo derived fields, traces→logs, traces→metrics, service map)
- Linkerd: enable proxy tracing to Alloy OTLP collector, point linkerd-viz at existing Prometheus instead of deploying its own
- Pingora: add OTLP rollout plan (endpoint commented out until proxy telemetry panic fix is deployed and Alloy is verified healthy)
@@ -21,6 +21,13 @@ data:
     key_path = "/etc/tls/tls.key"
 
     [telemetry]
+    # Rollout plan for OTLP tracing:
+    # 1. Deploy proxy build that includes the graceful telemetry init
+    #    (proxy/src/telemetry.rs — no longer panics on exporter failure)
+    # 2. Verify Alloy is running:
+    #    kubectl -n monitoring get pods -l app.kubernetes.io/name=alloy
+    # 3. Uncomment the line below:
+    # otlp_endpoint = "http://alloy.monitoring.svc.cluster.local:4318"
     otlp_endpoint = ""
     metrics_port = 9090
 
@@ -29,9 +29,11 @@ helmCharts:
     version: "2025.12.3"
     releaseName: linkerd-control-plane
     namespace: mesh
+    valuesFile: linkerd-control-plane-values.yaml
 
   - name: linkerd-viz
     repo: https://helm.linkerd.io/edge
     version: "2026.1.4"
     releaseName: linkerd-viz
     namespace: mesh
+    valuesFile: linkerd-viz-values.yaml
19  base/mesh/linkerd-control-plane-values.yaml  Normal file
@@ -0,0 +1,19 @@
# Linkerd control-plane overrides — enable proxy tracing to Tempo.
#
# Every meshed pod's Linkerd sidecar will export OTLP traces to the
# Alloy collector in the monitoring namespace, which forwards to Tempo.

# Controller-level tracing (identity, destination controllers)
controller:
  tracing:
    enabled: true
    collector:
      endpoint: "alloy.monitoring.svc.cluster.local:4317"

# Proxy-level tracing (every meshed sidecar)
proxy:
  tracing:
    enabled: true
    traceServiceName: linkerd-proxy
    collector:
      endpoint: "alloy.monitoring.svc.cluster.local:4317"
9  base/mesh/linkerd-viz-values.yaml  Normal file
@@ -0,0 +1,9 @@
# Linkerd-viz overrides — use existing Prometheus instead of deploying a second one.
#
# By default linkerd-viz ships its own Prometheus, which wastes resources
# and creates a second scrape loop. Point it at kube-prometheus-stack instead.

prometheus:
  enabled: false

prometheusUrl: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
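
Note: with the bundled Prometheus disabled, the external Prometheus must scrape
the Linkerd proxies itself, or the viz dashboards stay empty. A minimal sketch of
what that could look like as a kube-prometheus-stack PodMonitor; the resource
name is illustrative (not part of this commit), and the label/port follow
Linkerd's standard proxy conventions (injected pods carry the
linkerd.io/control-plane-ns label, and the proxy admin port 4191 serves /metrics):

apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: linkerd-proxy            # illustrative, not part of this commit
  namespace: mesh
spec:
  namespaceSelector:
    any: true                    # meshed pods exist in many namespaces
  selector:
    matchExpressions:
      - key: linkerd.io/control-plane-ns   # present on every injected pod
        operator: Exists
  podMetricsEndpoints:
    - port: linkerd-admin        # proxy admin port (4191) exposes /metrics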
108  base/monitoring/alloy-values.yaml  Normal file
@@ -0,0 +1,108 @@
# Grafana Alloy — lightweight agent that ships container logs to Loki
# and forwards OTLP traces to Tempo.
#
# Runs as a DaemonSet so every node's /var/log/pods is tailed.

alloy:
  configMap:
    content: |
      // ── Kubernetes log discovery ──────────────────────────────────
      discovery.kubernetes "pods" {
        role = "pod"
      }

      discovery.relabel "pod_logs" {
        targets = discovery.kubernetes.pods.targets

        // Keep only running pods
        rule {
          source_labels = ["__meta_kubernetes_pod_phase"]
          regex = "Pending|Succeeded|Failed|Unknown"
          action = "drop"
        }

        // Standard labels
        rule {
          source_labels = ["__meta_kubernetes_namespace"]
          target_label = "namespace"
        }
        rule {
          source_labels = ["__meta_kubernetes_pod_name"]
          target_label = "pod"
        }
        rule {
          source_labels = ["__meta_kubernetes_pod_container_name"]
          target_label = "container"
        }
        rule {
          source_labels = ["__meta_kubernetes_pod_node_name"]
          target_label = "node"
        }
        // Carry app label for easier Grafana filtering
        rule {
          source_labels = ["__meta_kubernetes_pod_label_app"]
          target_label = "app"
        }
        rule {
          source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
          target_label = "app"
          action = "replace"
          regex = "(.+)"
        }
      }

      loki.source.kubernetes "pods" {
        targets = discovery.relabel.pod_logs.output
        forward_to = [loki.process.pipeline.receiver]
      }

      // ── Log processing pipeline ──────────────────────────────────
      loki.process "pipeline" {
        // Detect and parse JSON log lines (common in Go / Python services)
        stage.json {
          expressions = {
            level = "level",
            msg = "msg",
            traceID = "traceID",
          }
        }

        // Promote log level to a label for easier filtering
        stage.labels {
          values = { level = "" }
        }

        forward_to = [loki.write.default.receiver]
      }

      loki.write "default" {
        endpoint {
          url = "http://loki-gateway.monitoring.svc.cluster.local:80/loki/api/v1/push"
        }
      }

      // ── OTLP receiver (services can push traces here) ────────────
      otelcol.receiver.otlp "default" {
        grpc { endpoint = "0.0.0.0:4317" }
        http { endpoint = "0.0.0.0:4318" }
        output { traces = [otelcol.exporter.otlp.tempo.input] }
      }

      otelcol.exporter.otlp "tempo" {
        client {
          endpoint = "tempo.monitoring.svc.cluster.local:4317"
          tls { insecure = true }
        }
      }

controller:
  type: daemonset

# Mount node log directories for kubernetes log tailing
mounts:
  varlog: true

# Expose OTLP ports so in-cluster services can send traces to the local agent
service:
  enabled: true
  type: ClusterIP
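
For reference, an in-cluster service can opt into the OTLP receiver above by
pointing its OpenTelemetry SDK at the Alloy Service via the standard
OTEL_EXPORTER_OTLP_ENDPOINT variable; a hypothetical Deployment snippet (app
name and image are placeholders, not part of this commit):

apiVersion: apps/v1
kind: Deployment
metadata:
  name: example-app                        # placeholder
spec:
  selector:
    matchLabels: { app: example-app }
  template:
    metadata:
      labels: { app: example-app }
    spec:
      containers:
        - name: app
          image: example/app:latest        # placeholder
          env:
            # Standard OTel SDK env var; HTTP/protobuf traffic goes to 4318
            - name: OTEL_EXPORTER_OTLP_ENDPOINT
              value: "http://alloy.monitoring.svc.cluster.local:4318"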
310  base/monitoring/dashboards-configmap.yaml  Normal file
@@ -0,0 +1,310 @@
# Grafana dashboard ConfigMaps — picked up by the Grafana sidecar.
#
# Each ConfigMap holds one or more dashboard JSON files. The sidecar
# watches for the label grafana_dashboard=1 across all namespaces and
# hot-loads them into Grafana (no restart required).
#
# The grafana_folder annotation groups dashboards into Grafana folders.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-pingora
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Ingress"
data:
  pingora.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 1,
      "links": [],
      "panels": [
        {
          "title": "Requests / sec",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(pingora_http_requests_total[5m]))",
              "legendFormat": "total"
            },
            {
              "expr": "sum(rate(pingora_http_requests_total[5m])) by (status_code)",
              "legendFormat": "{{status_code}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        },
        {
          "title": "Error Rate (5xx)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(pingora_http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(pingora_http_requests_total[5m]))",
              "legendFormat": "5xx ratio"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } }
          }
        },
        {
          "title": "Request Latency (p50 / p95 / p99)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.50, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p50"
            },
            {
              "expr": "histogram_quantile(0.95, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p95"
            },
            {
              "expr": "histogram_quantile(0.99, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p99"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        },
        {
          "title": "Active Connections",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "pingora_active_connections",
              "legendFormat": "active"
            }
          ]
        },
        {
          "title": "Upstream Latency by Backend",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(pingora_upstream_duration_seconds_bucket[5m])) by (le, backend))",
              "legendFormat": "{{backend}} p95"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        },
        {
          "title": "DDoS / Scanner Detections",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(pingora_ddos_detections_total[5m]))",
              "legendFormat": "DDoS"
            },
            {
              "expr": "sum(rate(pingora_scanner_detections_total[5m]))",
              "legendFormat": "Scanner"
            },
            {
              "expr": "sum(rate(pingora_rate_limit_rejected_total[5m]))",
              "legendFormat": "Rate-limited"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        }
      ],
      "schemaVersion": 39,
      "tags": ["ingress", "pingora"],
      "templating": { "list": [] },
      "time": { "from": "now-1h", "to": "now" },
      "title": "Pingora Proxy",
      "uid": "pingora-proxy"
    }
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-loki
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Observability"
data:
  loki-overview.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Log Volume by Namespace",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
          "datasource": { "uid": "loki" },
          "targets": [
            {
              "expr": "sum(count_over_time({namespace=~\".+\"}[5m])) by (namespace)",
              "legendFormat": "{{namespace}}"
            }
          ]
        },
        {
          "title": "Error Logs",
          "type": "logs",
          "gridPos": { "h": 12, "w": 24, "x": 0, "y": 8 },
          "datasource": { "uid": "loki" },
          "targets": [
            {
              "expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|fatal|exception)\"",
              "legendFormat": ""
            }
          ]
        }
      ],
      "schemaVersion": 39,
      "tags": ["loki", "logs"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "Loki — Log Overview",
      "uid": "loki-overview"
    }
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-tempo
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Observability"
data:
  tempo-overview.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Trace Ingestion Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(tempo_distributor_spans_received_total[5m]))",
              "legendFormat": "spans/s"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "ops" } }
        },
        {
          "title": "Service Map (RED)",
          "type": "nodeGraph",
          "gridPos": { "h": 16, "w": 12, "x": 12, "y": 0 },
          "datasource": { "uid": "tempo" },
          "targets": [
            { "queryType": "serviceMap" }
          ]
        },
        {
          "title": "Span Duration by Service (p95)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket[5m])) by (le, service))",
              "legendFormat": "{{service}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "s" } }
        }
      ],
      "schemaVersion": 39,
      "tags": ["tempo", "tracing"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "Tempo — Trace Overview",
      "uid": "tempo-overview"
    }
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-openbao
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Infrastructure"
data:
  openbao.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Vault/OpenBao Sealed Status",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "vault_core_unsealed", "legendFormat": "unsealed" }
          ],
          "fieldConfig": {
            "defaults": { "mappings": [{"type":"value","options":{"0":{"text":"SEALED","color":"red"},"1":{"text":"UNSEALED","color":"green"}}}] }
          }
        },
        {
          "title": "Token Count",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "vault_token_count", "legendFormat": "tokens" }
          ]
        },
        {
          "title": "Request Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "sum(rate(vault_core_handle_request_count[5m]))", "legendFormat": "req/s" }
          ],
          "fieldConfig": { "defaults": { "unit": "reqps" } }
        },
        {
          "title": "Request Latency (p95)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "histogram_quantile(0.95, sum(rate(vault_core_handle_request_bucket[5m])) by (le))", "legendFormat": "p95" }
          ],
          "fieldConfig": { "defaults": { "unit": "s" } }
        }
      ],
      "schemaVersion": 39,
      "tags": ["vault", "openbao"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "OpenBao / Vault",
      "uid": "openbao"
    }
@@ -7,6 +7,7 @@ resources:
   - namespace.yaml
   - vault-secrets.yaml
   - grafana-oauth2client.yaml
+  - dashboards-configmap.yaml
 
 helmCharts:
   # helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
@@ -32,3 +33,12 @@ helmCharts:
     releaseName: tempo
     namespace: monitoring
     valuesFile: tempo-values.yaml
+
+  # Grafana Alloy — DaemonSet that ships container logs → Loki
+  # and provides an in-cluster OTLP receiver → Tempo.
+  - name: alloy
+    repo: https://grafana.github.io/helm-charts
+    version: "0.12.0"
+    releaseName: alloy
+    namespace: monitoring
+    valuesFile: alloy-values.yaml
@@ -39,10 +39,20 @@ grafana:
   sidecar:
     datasources:
       defaultDatasourceEnabled: false
+    dashboards:
+      enabled: true
+      # Pick up ConfigMaps with this label in any namespace
+      label: grafana_dashboard
+      labelValue: "1"
+      searchNamespace: ALL
+      folderAnnotation: grafana_folder
+      provider:
+        foldersFromFilesStructure: false
 
   additionalDataSources:
     - name: Prometheus
       type: prometheus
+      uid: prometheus
       url: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
       access: proxy
       isDefault: true
@@ -50,17 +60,53 @@ grafana:
         timeInterval: 30s
     - name: Loki
       type: loki
+      uid: loki
       url: "http://loki-gateway.monitoring.svc.cluster.local:80"
       access: proxy
       isDefault: false
+      jsonData:
+        derivedFields:
+          # Click a traceID in a log line → jump straight to Tempo
+          - datasourceUid: tempo
+            matcherRegex: '"traceID":"(\w+)"'
+            name: TraceID
+            url: "$${__value.raw}"
     - name: Tempo
       type: tempo
+      uid: tempo
       url: "http://tempo.monitoring.svc.cluster.local:3200"
       access: proxy
       isDefault: false
+      jsonData:
+        tracesToLogsV2:
+          datasourceUid: loki
+          filterByTraceID: true
+          filterBySpanID: false
+          tags:
+            - key: namespace
+            - key: pod
+        tracesToMetrics:
+          datasourceUid: prometheus
+          tags:
+            - key: service.name
+              value: service
+        lokiSearch:
+          datasourceUid: loki
+        serviceMap:
+          datasourceUid: prometheus
 
 prometheus:
   prometheusSpec:
+    # Discover ServiceMonitors / PodMonitors / PrometheusRules in ALL namespaces,
+    # not just "monitoring". Without this, monitors in ingress, mesh,
+    # cert-manager, devtools, etc. are invisible to Prometheus.
+    serviceMonitorNamespaceSelector: {}
+    podMonitorNamespaceSelector: {}
+    ruleNamespaceSelector: {}
+    serviceMonitorSelector: {}
+    podMonitorSelector: {}
+    # Accept remote-write from Tempo metrics generator
+    enableRemoteWriteReceiver: true
     retention: 90d
     additionalArgs:
       # Allow browser-direct queries from the Grafana UI origin.
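
With the empty selectors above, a ServiceMonitor in any namespace is discovered
without extra wiring. A sketch of what one might look like for the Pingora
metrics port; the name, namespace, labels, and port name are illustrative
assumptions, not part of this commit:

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: pingora                  # illustrative
  namespace: ingress             # outside "monitoring", now discoverable
spec:
  selector:
    matchLabels:
      app: pingora               # must match the Service's labels
  endpoints:
    - port: metrics              # Service port name fronting :9090
      interval: 30s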
@@ -16,6 +16,18 @@ tempo:
       path: /var/tempo/traces
     wal:
       path: /var/tempo/wal
+  # Generate span-derived RED metrics (rate / errors / duration) and push
+  # them into Prometheus so Grafana can show service-level indicators
+  # even without application-level metrics exporters.
+  metricsGenerator:
+    enabled: true
+    remoteWriteUrl: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090/api/v1/write"
+  overrides:
+    defaults:
+      metrics_generator:
+        processors:
+          - service-graphs
+          - span-metrics
 
 persistence:
   enabled: true
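
The generator remote-writes series such as traces_spanmetrics_calls_total and
traces_service_graph_request_total into Prometheus. One way to catch a silently
broken pipeline is an absence alert; a sketch assuming the kube-prometheus-stack
PrometheusRule CRD (rule name and timings are illustrative, and depending on
ruleSelector defaults the chart's release label may also be required):

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: tempo-span-metrics       # illustrative
  namespace: monitoring
spec:
  groups:
    - name: tempo-metrics-generator
      rules:
        - alert: SpanMetricsAbsent
          expr: absent(traces_spanmetrics_calls_total)
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "Tempo metrics generator has stopped remote-writing span metrics"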