# Grafana Alloy — lightweight agent that ships container logs to Loki
# and forwards OTLP traces to Tempo.
#
# Runs as a DaemonSet so a local agent on every node tails its pods' logs.

alloy:
  configMap:
    content: |
      // ── Kubernetes log discovery ──────────────────────────────────
      discovery.kubernetes "pods" {
        role = "pod"
      }
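
      // role = "pod" discovers every pod in the cluster via the Kubernetes
      // API; the relabel rules below consume its __meta_kubernetes_* labels.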

      discovery.relabel "pod_logs" {
        targets = discovery.kubernetes.pods.targets

        // Keep only running pods
        rule {
          source_labels = ["__meta_kubernetes_pod_phase"]
          regex         = "Pending|Succeeded|Failed|Unknown"
          action        = "drop"
        }

        // Standard labels
        rule {
          source_labels = ["__meta_kubernetes_namespace"]
          target_label  = "namespace"
        }

        rule {
          source_labels = ["__meta_kubernetes_pod_name"]
          target_label  = "pod"
        }

        rule {
          source_labels = ["__meta_kubernetes_pod_container_name"]
          target_label  = "container"
        }

        rule {
          source_labels = ["__meta_kubernetes_pod_node_name"]
          target_label  = "node"
        }

        // Carry app label for easier Grafana filtering
        rule {
          source_labels = ["__meta_kubernetes_pod_label_app"]
          target_label  = "app"
        }

        // Prefer app.kubernetes.io/name when set (the regex skips empty values,
        // so it does not clobber the plain `app` label above)
        rule {
          source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
          target_label  = "app"
          action        = "replace"
          regex         = "(.+)"
        }
      }
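
      // With the labels above in place, a typical Grafana query looks like
      // this (illustrative LogQL; substitute your own namespace/app):
      //   {namespace="payments", app="checkout"} |= "timeout"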

      loki.source.kubernetes "pods" {
        targets    = discovery.relabel.pod_logs.output
        forward_to = [loki.process.pipeline.receiver]
      }
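
      // Note: as written, every DaemonSet replica tails every pod. To limit
      // each agent to its own node, add a relabel rule keeping only targets
      // whose __meta_kubernetes_pod_node_name matches the agent's NODE_NAME
      // environment variable (exposed via the downward API).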

      // ── Log processing pipeline ──────────────────────────────────
      loki.process "pipeline" {
        // Detect and parse JSON log lines (common in Go / Python services)
        stage.json {
          expressions = {
            level   = "level",
            msg     = "msg",
            traceID = "traceID",
          }
        }
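
        // Example of a line this stage parses (hypothetical logger output):
        //   {"level":"warn","msg":"retrying request","traceID":"abc123"}
        // Lines that are not valid JSON pass through with nothing extracted.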

        // Promote log level to a label for easier filtering
        stage.labels {
          values = { level = "" }
        }
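
        // `level` has a small, fixed set of values, so it is cheap as a Loki
        // label and queries can select on it directly, e.g.
        //   {app="checkout", level="error"}
        // Avoid promoting traceID the same way: unbounded values blow up
        // Loki's index.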

        forward_to = [loki.write.default.receiver]
      }

      loki.write "default" {
        endpoint {
          url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"
        }
      }

      // ── OTLP receiver (services can push traces here) ────────────
      otelcol.receiver.otlp "default" {
        grpc { endpoint = "0.0.0.0:4317" }
        http { endpoint = "0.0.0.0:4318" }
        output { traces = [otelcol.exporter.otlp.tempo.input] }
      }
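
      // Clients reach this receiver through the chart's ClusterIP service
      // (see `service:` below). A typical client setting, assuming the
      // release is installed as "alloy" in the "monitoring" namespace:
      //   OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy.monitoring.svc.cluster.local:4318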

      otelcol.exporter.otlp "tempo" {
        client {
          endpoint = "tempo.monitoring.svc.cluster.local:4317"
          tls { insecure = true }
        }
      }
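
      // tls.insecure sends spans in plaintext, which is acceptable only
      // because this traffic never leaves the cluster network.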

  # Mount node log directories for kubernetes log tailing
  mounts:
    varlog: true

controller:
  type: daemonset

# Expose OTLP ports so in-cluster services can send traces to the local agent
# (with the grafana/alloy chart this usually also requires alloy.extraPorts
# entries for 4317/4318; verify against your chart version)
service:
  enabled: true
  type: ClusterIP