feat(monitoring): wire up full LGTM observability stack
- Prometheus: discover ServiceMonitors/PodMonitors in all namespaces,
  enable remote write receiver for Tempo metrics generator
- Tempo: enable metrics generator (service-graphs + span-metrics) with
  remote write to Prometheus
- Loki: add Grafana Alloy DaemonSet to ship container logs
- Grafana: enable dashboard sidecar, add Pingora/Loki/Tempo/OpenBao
  dashboards, add stable UIDs and cross-linking between datasources
  (Loki↔Tempo derived fields, traces→logs, traces→metrics, service map)
- Linkerd: enable proxy tracing to Alloy OTLP collector, point
  linkerd-viz at existing Prometheus instead of deploying its own
- Pingora: add OTLP rollout plan (endpoint commented out until proxy
  telemetry panic fix is deployed and Alloy is verified healthy)
This commit is contained in:
@@ -39,10 +39,20 @@ grafana:
|
||||
sidecar:
|
||||
datasources:
|
||||
defaultDatasourceEnabled: false
|
||||
dashboards:
|
||||
enabled: true
|
||||
# Pick up ConfigMaps with this label in any namespace
|
||||
label: grafana_dashboard
|
||||
labelValue: "1"
|
||||
searchNamespace: ALL
|
||||
folderAnnotation: grafana_folder
|
||||
provider:
|
||||
foldersFromFilesStructure: false
|
||||
|
||||
additionalDataSources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
uid: prometheus
|
||||
url: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
|
||||
access: proxy
|
||||
isDefault: true
|
||||
@@ -50,17 +60,53 @@ grafana:
|
||||
timeInterval: 30s
|
||||
- name: Loki
|
||||
type: loki
|
||||
uid: loki
|
||||
url: "http://loki-gateway.monitoring.svc.cluster.local:80"
|
||||
access: proxy
|
||||
isDefault: false
|
||||
jsonData:
|
||||
derivedFields:
|
||||
# Click a traceID in a log line → jump straight to Tempo
|
||||
- datasourceUid: tempo
|
||||
matcherRegex: '"traceID":"(\w+)"'
|
||||
name: TraceID
|
||||
url: "$${__value.raw}"
|
||||
- name: Tempo
|
||||
type: tempo
|
||||
uid: tempo
|
||||
url: "http://tempo.monitoring.svc.cluster.local:3200"
|
||||
access: proxy
|
||||
isDefault: false
|
||||
jsonData:
|
||||
tracesToLogsV2:
|
||||
datasourceUid: loki
|
||||
filterByTraceID: true
|
||||
filterBySpanID: false
|
||||
tags:
|
||||
- key: namespace
|
||||
- key: pod
|
||||
tracesToMetrics:
|
||||
datasourceUid: prometheus
|
||||
tags:
|
||||
- key: service.name
|
||||
value: service
|
||||
lokiSearch:
|
||||
datasourceUid: loki
|
||||
serviceMap:
|
||||
datasourceUid: prometheus
|
||||
|
||||
prometheus:
|
||||
prometheusSpec:
|
||||
# Discover ServiceMonitors / PodMonitors / PrometheusRules in ALL namespaces,
|
||||
# not just "monitoring". Without this, monitors in ingress, mesh,
|
||||
# cert-manager, devtools, etc. are invisible to Prometheus.
|
||||
serviceMonitorNamespaceSelector: {}
|
||||
podMonitorNamespaceSelector: {}
|
||||
ruleNamespaceSelector: {}
|
||||
serviceMonitorSelector: {}
|
||||
podMonitorSelector: {}
|
||||
# Accept remote-write from Tempo metrics generator
|
||||
enableRemoteWriteReceiver: true
|
||||
retention: 90d
|
||||
additionalArgs:
|
||||
# Allow browser-direct queries from the Grafana UI origin.
|
||||
|
||||
Reference in New Issue
Block a user