feat(monitoring): wire up full LGTM observability stack

- Prometheus: discover ServiceMonitors/PodMonitors in all namespaces,
  enable remote write receiver for Tempo metrics generator
- Tempo: enable metrics generator (service-graphs + span-metrics)
  with remote write to Prometheus
- Loki: add Grafana Alloy DaemonSet to ship container logs
- Grafana: enable dashboard sidecar, add Pingora/Loki/Tempo/OpenBao
  dashboards, add stable UIDs and cross-linking between datasources
  (Loki↔Tempo derived fields, traces→logs, traces→metrics, service map)
- Linkerd: enable proxy tracing to Alloy OTLP collector, point
  linkerd-viz at existing Prometheus instead of deploying its own
- Pingora: add OTLP rollout plan (endpoint commented out until proxy
  telemetry panic fix is deployed and Alloy is verified healthy)
This commit is contained in:
2026-03-21 17:36:54 +00:00
parent 5f923d14f9
commit d3943c9a84
9 changed files with 523 additions and 0 deletions

View File

@@ -39,10 +39,20 @@ grafana:
# Grafana sidecar configuration: controls which datasources and dashboards
# the sidecar loads into Grafana.
sidecar:
datasources:
# Turn off the sidecar-provided default datasource; the datasources used here
# are the ones listed explicitly under additionalDataSources.
defaultDatasourceEnabled: false
dashboards:
enabled: true
# Pick up ConfigMaps with this label in any namespace
label: grafana_dashboard
labelValue: "1"
searchNamespace: ALL
# ConfigMaps may carry a "grafana_folder" annotation naming the folder their
# dashboards should be placed in.
folderAnnotation: grafana_folder
provider:
# Must be true for folderAnnotation to take effect in the UI: the sidecar
# writes annotated dashboards into per-folder sub-directories, and Grafana
# only maps those directories to folders when this flag is enabled.
foldersFromFilesStructure: true
additionalDataSources:
- name: Prometheus
type: prometheus
uid: prometheus
url: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
access: proxy
isDefault: true
@@ -50,17 +60,53 @@ grafana:
timeInterval: 30s
# Loki datasource: log queries, reached through the loki-gateway service in
# the monitoring namespace. The fixed uid lets other datasources link to it.
- name: Loki
type: loki
uid: loki
url: "http://loki-gateway.monitoring.svc.cluster.local:80"
access: proxy
isDefault: false
jsonData:
derivedFields:
# Click a traceID in a log line → jump straight to Tempo
# datasourceUid refers to the datasource provisioned with uid "tempo".
- datasourceUid: tempo
# Captures the run of word characters following "traceID": in a log line;
# log lines that spell the field differently will not produce a link.
matcherRegex: '"traceID":"(\w+)"'
name: TraceID
# NOTE(review): the doubled "$$" presumably escapes "$" so that provisioning
# does not expand ${__value.raw} as an environment variable - confirm.
url: "$${__value.raw}"
# Tempo datasource: trace queries, with links from traces out to the Loki
# (uid "loki") and Prometheus (uid "prometheus") datasources.
- name: Tempo
type: tempo
uid: tempo
url: "http://tempo.monitoring.svc.cluster.local:3200"
access: proxy
isDefault: false
jsonData:
# Trace -> logs: open a Loki query for the selected span.
tracesToLogsV2:
datasourceUid: loki
# Narrow the log query by trace ID but not by span ID.
filterByTraceID: true
filterBySpanID: false
# Span attributes carried over into the log query.
# NOTE(review): assumes logs in Loki carry "namespace" and "pod" labels - confirm.
tags:
- key: namespace
- key: pod
# Trace -> metrics: open a Prometheus query for the selected span.
tracesToMetrics:
datasourceUid: prometheus
# Maps the span attribute "service.name" to the metric label "service".
tags:
- key: service.name
value: service
lokiSearch:
datasourceUid: loki
# Service map is read from the Prometheus datasource; per the commit message,
# the Tempo metrics generator remote-writes service-graph metrics there.
serviceMap:
datasourceUid: prometheus
prometheus:
prometheusSpec:
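# Discover ServiceMonitors / PodMonitors / PrometheusRules in ALL namespaces,
# not just "monitoring". Without this, monitors in ingress, mesh,
# cert-manager, devtools, etc. are invisible to Prometheus.
# NOTE(review): in kube-prometheus-stack an empty selector ({}) is only
# rendered as "match everything" when the corresponding
# *SelectorNilUsesHelmValues flag is false; otherwise the chart falls back to
# selecting on its own release label. Confirm those flags are set elsewhere
# in this values file.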
serviceMonitorNamespaceSelector: {}
podMonitorNamespaceSelector: {}
ruleNamespaceSelector: {}
serviceMonitorSelector: {}
podMonitorSelector: {}
# Accept remote-write from Tempo metrics generator
enableRemoteWriteReceiver: true
retention: 90d
additionalArgs:
# Allow browser-direct queries from the Grafana UI origin.