# Observability stack changes:
#   - Prometheus: discover ServiceMonitors/PodMonitors in all namespaces; enable the
#     remote-write receiver so Tempo's metrics generator can push metrics.
#   - Tempo: enable the metrics generator (service-graphs + span-metrics) with remote
#     write to Prometheus.
#   - Loki: add a Grafana Alloy DaemonSet to ship container logs.
#   - Grafana: enable the dashboard sidecar; add Pingora/Loki/Tempo/OpenBao dashboards;
#     add stable UIDs and cross-linking between datasources (Loki↔Tempo derived fields,
#     traces→logs, traces→metrics, service map).
#   - Linkerd: enable proxy tracing to the Alloy OTLP collector; point linkerd-viz at
#     the existing Prometheus instead of deploying its own.
#   - Pingora: add an OTLP rollout plan (endpoint stays commented out until the proxy
#     telemetry panic fix is deployed and Alloy is verified healthy).
109 lines
3.2 KiB
YAML
109 lines
3.2 KiB
YAML
# Grafana Alloy — lightweight agent that ships container logs to Loki
|
|
# and forwards OTLP traces to Tempo.
|
|
#
|
|
# Runs as a DaemonSet so every node's /var/log/pods is tailed.
|
|
|
|
alloy:
|
|
configMap:
|
|
content: |
|
|
// ── Kubernetes log discovery ──────────────────────────────────
|
|
discovery.kubernetes "pods" {
|
|
role = "pod"
|
|
}
|
|
|
|
discovery.relabel "pod_logs" {
|
|
targets = discovery.kubernetes.pods.targets
|
|
|
|
// Keep only running pods
|
|
rule {
|
|
source_labels = ["__meta_kubernetes_pod_phase"]
|
|
regex = "Pending|Succeeded|Failed|Unknown"
|
|
action = "drop"
|
|
}
|
|
|
|
// Standard labels
|
|
rule {
|
|
source_labels = ["__meta_kubernetes_namespace"]
|
|
target_label = "namespace"
|
|
}
|
|
rule {
|
|
source_labels = ["__meta_kubernetes_pod_name"]
|
|
target_label = "pod"
|
|
}
|
|
rule {
|
|
source_labels = ["__meta_kubernetes_pod_container_name"]
|
|
target_label = "container"
|
|
}
|
|
rule {
|
|
source_labels = ["__meta_kubernetes_pod_node_name"]
|
|
target_label = "node"
|
|
}
|
|
// Carry app label for easier Grafana filtering
|
|
rule {
|
|
source_labels = ["__meta_kubernetes_pod_label_app"]
|
|
target_label = "app"
|
|
}
|
|
rule {
|
|
source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
|
|
target_label = "app"
|
|
action = "replace"
|
|
regex = "(.+)"
|
|
}
|
|
}
|
|
|
|
loki.source.kubernetes "pods" {
|
|
targets = discovery.relabel.pod_logs.output
|
|
forward_to = [loki.process.pipeline.receiver]
|
|
}
|
|
|
|
// ── Log processing pipeline ──────────────────────────────────
|
|
loki.process "pipeline" {
|
|
// Detect and parse JSON log lines (common in Go / Python services)
|
|
stage.json {
|
|
expressions = {
|
|
level = "level",
|
|
msg = "msg",
|
|
traceID = "traceID",
|
|
}
|
|
}
|
|
|
|
// Promote log level to a label for easier filtering
|
|
stage.labels {
|
|
values = { level = "" }
|
|
}
|
|
|
|
forward_to = [loki.write.default.receiver]
|
|
}
|
|
|
|
loki.write "default" {
|
|
endpoint {
|
|
url = "http://loki-gateway.monitoring.svc.cluster.local:80/loki/api/v1/push"
|
|
}
|
|
}
|
|
|
|
// ── OTLP receiver (services can push traces here) ────────────
|
|
otelcol.receiver.otlp "default" {
|
|
grpc { endpoint = "0.0.0.0:4317" }
|
|
http { endpoint = "0.0.0.0:4318" }
|
|
output { traces = [otelcol.exporter.otlp.tempo.input] }
|
|
}
|
|
|
|
otelcol.exporter.otlp "tempo" {
|
|
client {
|
|
endpoint = "tempo.monitoring.svc.cluster.local:4317"
|
|
tls { insecure = true }
|
|
}
|
|
}
|
|
|
|
controller:
|
|
type: daemonset
|
|
|
|
# Mount node log directories for kubernetes log tailing
|
|
mounts:
|
|
varlog: true
|
|
|
|
# Expose OTLP ports so in-cluster services can send traces to the local agent
|
|
service:
|
|
enabled: true
|
|
type: ClusterIP
|