feat(monitoring): wire up full LGTM observability stack
- Prometheus: discover ServiceMonitors/PodMonitors in all namespaces; enable the remote write receiver for Tempo's metrics generator
- Tempo: enable the metrics generator (service-graphs + span-metrics) with remote write to Prometheus
- Loki: add a Grafana Alloy DaemonSet to ship container logs
- Grafana: enable the dashboard sidecar; add Pingora/Loki/Tempo/OpenBao dashboards; add stable UIDs and cross-linking between datasources (Loki↔Tempo derived fields, traces→logs, traces→metrics, service map)
- Linkerd: enable proxy tracing to the Alloy OTLP collector; point linkerd-viz at the existing Prometheus instead of deploying its own
- Pingora: add an OTLP rollout plan (endpoint stays commented out until the proxy telemetry panic fix is deployed and Alloy is verified healthy)

Only the Alloy values file appears in this diff; hedged sketches of what the other components' values might look like follow below.
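For the Prometheus bullet, assuming kube-prometheus-stack (the chart choice and service names are assumptions; that values file is not in this diff), cluster-wide monitor discovery plus the remote write receiver maps to roughly:

# Sketch only: kube-prometheus-stack values, not part of this diff.
prometheus:
  prometheusSpec:
    # Nil selectors with *NilUsesHelmValues=false match ServiceMonitors and
    # PodMonitors in every namespace, regardless of Helm release labels.
    serviceMonitorSelectorNilUsesHelmValues: false
    podMonitorSelectorNilUsesHelmValues: false
    # Accept remote write from Tempo's metrics generator on /api/v1/write.
    enableRemoteWriteReceiver: true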
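The Tempo side might look like the following, assuming the grafana/tempo single-binary chart (the key names and the Prometheus URL are assumptions):

# Sketch only: grafana/tempo values, not part of this diff.
tempo:
  metricsGenerator:
    enabled: true
    # Requires enableRemoteWriteReceiver: true on the Prometheus side.
    remoteWriteUrl: "http://prometheus-operated.monitoring.svc.cluster.local:9090/api/v1/write"
  global_overrides:
    # Turn on both generator processors named in the commit message.
    metrics_generator_processors:
      - service-graphs
      - span-metrics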
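The Grafana sidecar and cross-linking bullets correspond to datasource provisioning along these lines, assuming the grafana chart (UIDs and service URLs are assumptions):

# Sketch only: grafana chart values, not part of this diff.
sidecar:
  dashboards:
    enabled: true            # load dashboard ConfigMaps via the sidecar
    searchNamespace: ALL

datasources:
  datasources.yaml:
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        uid: loki                      # stable UID so links survive re-provisioning
        url: http://loki-gateway.monitoring.svc.cluster.local
        jsonData:
          derivedFields:
            # Loki → Tempo: turn the traceID field parsed by Alloy into a trace link
            - name: TraceID
              datasourceUid: tempo
              matcherRegex: '"traceID":"(\w+)"'
              url: '$${__value.raw}'   # $$ escapes Grafana's env-var interpolation
      - name: Tempo
        type: tempo
        uid: tempo
        url: http://tempo.monitoring.svc.cluster.local:3100
        jsonData:
          tracesToLogsV2:
            datasourceUid: loki        # traces → logs
          tracesToMetrics:
            datasourceUid: prometheus  # traces → metrics
          serviceMap:
            datasourceUid: prometheus  # service map built from span-metrics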
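For Linkerd, "bring your own Prometheus" is a documented linkerd-viz pattern, and proxy tracing goes through the linkerd-jaeger extension. A sketch assuming those two charts (service addresses are assumptions, and the proxy's trace protocol depends on the Linkerd version):

# Sketch only: linkerd-viz values, reusing the monitoring stack's Prometheus.
prometheus:
  enabled: false             # don't deploy a second Prometheus
prometheusUrl: http://prometheus-operated.monitoring.svc.cluster.local:9090

# Sketch only: linkerd-jaeger values, reusing Alloy as the span collector.
collector:
  enabled: false
jaeger:
  enabled: false
webhook:
  # Older proxies emit OpenCensus (port 55678); newer releases can speak OTLP (4317).
  collectorSvcAddr: alloy.monitoring:4317
  collectorSvcAccount: alloy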
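The Pingora bullet is a rollout plan rather than a config change; in deployment terms it can be as small as a commented-out standard OpenTelemetry env var (hypothetical snippet, names assumed):

# Hypothetical Pingora Deployment excerpt; uncomment once the telemetry panic
# fix ships and Alloy is verified healthy.
# env:
#   - name: OTEL_EXPORTER_OTLP_ENDPOINT
#     value: "http://alloy.monitoring.svc.cluster.local:4317"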
base/monitoring/alloy-values.yaml (new file, 108 lines)
@@ -0,0 +1,108 @@
# Grafana Alloy — lightweight agent that ships container logs to Loki
# and forwards OTLP traces to Tempo.
#
# Runs as a DaemonSet so every node's /var/log/pods is tailed.

alloy:
  configMap:
    content: |
      // ── Kubernetes log discovery ──────────────────────────────────
      discovery.kubernetes "pods" {
        role = "pod"
      }

      discovery.relabel "pod_logs" {
        targets = discovery.kubernetes.pods.targets

        // Keep only running pods
        rule {
          source_labels = ["__meta_kubernetes_pod_phase"]
          regex         = "Pending|Succeeded|Failed|Unknown"
          action        = "drop"
        }

        // Standard labels
        rule {
          source_labels = ["__meta_kubernetes_namespace"]
          target_label  = "namespace"
        }
        rule {
          source_labels = ["__meta_kubernetes_pod_name"]
          target_label  = "pod"
        }
        rule {
          source_labels = ["__meta_kubernetes_pod_container_name"]
          target_label  = "container"
        }
        rule {
          source_labels = ["__meta_kubernetes_pod_node_name"]
          target_label  = "node"
        }
        // Carry app label for easier Grafana filtering
        rule {
          source_labels = ["__meta_kubernetes_pod_label_app"]
          target_label  = "app"
        }
        rule {
          source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
          target_label  = "app"
          action        = "replace"
          regex         = "(.+)"
        }
      }

      loki.source.kubernetes "pods" {
        targets    = discovery.relabel.pod_logs.output
        forward_to = [loki.process.pipeline.receiver]
      }

      // ── Log processing pipeline ──────────────────────────────────
      loki.process "pipeline" {
        // Detect and parse JSON log lines (common in Go / Python services)
        stage.json {
          expressions = {
            level   = "level",
            msg     = "msg",
            traceID = "traceID",
          }
        }

        // Promote log level to a label for easier filtering
        stage.labels {
          values = { level = "" }
        }

        forward_to = [loki.write.default.receiver]
      }

      loki.write "default" {
        endpoint {
          url = "http://loki-gateway.monitoring.svc.cluster.local:80/loki/api/v1/push"
        }
      }

      // ── OTLP receiver (services can push traces here) ────────────
      otelcol.receiver.otlp "default" {
        grpc { endpoint = "0.0.0.0:4317" }
        http { endpoint = "0.0.0.0:4318" }
        output { traces = [otelcol.exporter.otlp.tempo.input] }
      }

      otelcol.exporter.otlp "tempo" {
        client {
          endpoint = "tempo.monitoring.svc.cluster.local:4317"
          tls { insecure = true }
        }
      }

  # Mount node log directories for kubernetes log tailing
  mounts:
    varlog: true

controller:
  type: daemonset

# Expose OTLP ports so in-cluster services can send traces to the local agent
service:
  enabled: true
  type: ClusterIP
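One caveat on the service block above: in the grafana/alloy chart, the generated Service only publishes ports declared under alloy.extraPorts (the default is just the agent's own HTTP port). If 4317/4318 are not reachable through the Service, a block like this is the likely fix (an assumption about the chart schema; verify against the chart version in use):

# Sketch only: port declarations the chart would add to both pod and Service.
alloy:
  extraPorts:
    - name: otlp-grpc
      port: 4317
      targetPort: 4317
      protocol: TCP
    - name: otlp-http
      port: 4318
      targetPort: 4318
      protocol: TCP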