Files
sbbb/base/monitoring/prometheus-values.yaml
Sienna Meridian Satterwhite e4987b4c58 feat(monitoring): comprehensive alerting overhaul, 66 rules across 14 PrometheusRules
The Longhorn memory leak went undetected for 14 days because alerting
was broken (email receiver, missing label selector, no node alerts).
This overhaul brings alerting to production grade.

Fixes:
- Alloy Loki URL pointed to deleted loki-gateway, now loki:3100
- seaweedfs-bucket-init crash on unsupported `mc versioning` command
- All PrometheusRules now have `release: kube-prometheus-stack` label
- Removed broken email receiver, Matrix-only alerting

New alert coverage:
- Node: memory, CPU, swap, filesystem, inodes, network, clock skew, OOM
- Kubernetes: deployment down, CronJob failed, pod crash-looping, PVC full
- Backups: Postgres barman stale/failed, WAL archiving, SeaweedFS mirror
- Observability: Prometheus WAL/storage/rules, Loki/Tempo/AlertManager down
- Services: Stalwart, Bulwark, Tuwunel, Sol, Valkey, OpenSearch (smart)
- SLOs: auth stack 99.9% burn rate, Matrix 99.5%, latency p95 < 2s
- Recording rules for Linkerd RED metrics and node aggregates
- Watchdog heartbeat → Matrix every 12h (dead pipeline detection)
- Inhibition: critical suppresses warning for same alert+namespace
- OpenSearchClusterYellow only fires with >1 data node (single-node aware)
2026-04-06 15:52:06 +01:00

172 lines
5.1 KiB
YAML

# kube-prometheus-stack — Prometheus + AlertManager + Grafana + node-exporter + kube-state-metrics
#
# k3s quirks: kube-proxy is replaced by Cilium; etcd/scheduler/controller-manager
# don't expose metrics on standard ports. Disable their monitors to avoid noise.
grafana:
  adminUser: admin
  admin:
    existingSecret: grafana-admin
    passwordKey: admin-password
  persistence:
    enabled: true
    size: 2Gi
  # Inject Hydra OIDC client credentials (created by Hydra Maester from the OAuth2Client CRD)
  envFromSecret: grafana-oidc
  grafana.ini:
    server:
      root_url: "https://metrics.DOMAIN_SUFFIX"
    auth:
      # Keep local login as fallback (admin password from grafana-admin secret)
      disable_login_form: false
      signout_redirect_url: "https://auth.DOMAIN_SUFFIX/oauth2/sessions/logout"
    auth.generic_oauth:
      enabled: true
      name: Sunbeam
      icon: signin
      # CLIENT_ID / CLIENT_SECRET injected from grafana-oidc K8s Secret via envFromSecret
      client_id: "${CLIENT_ID}"
      client_secret: "${CLIENT_SECRET}"
      scopes: "openid email profile"
      auth_url: "https://auth.DOMAIN_SUFFIX/oauth2/auth"
      token_url: "https://auth.DOMAIN_SUFFIX/oauth2/token"
      api_url: "https://auth.DOMAIN_SUFFIX/userinfo"
      allow_sign_up: true
      # Small studio — anyone with a valid La Suite account is an admin.
      # To restrict to specific users, set role_attribute_path instead.
      auto_assign_org_role: Admin
      skip_org_role_sync: true
  sidecar:
    datasources:
      defaultDatasourceEnabled: false
    dashboards:
      enabled: true
      # Pick up ConfigMaps with this label in any namespace
      label: grafana_dashboard
      labelValue: "1"
      searchNamespace: ALL
      folderAnnotation: grafana_folder
      provider:
        foldersFromFilesStructure: false
  additionalDataSources:
    - name: Prometheus
      type: prometheus
      uid: prometheus
      url: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
      access: proxy
      isDefault: true
      jsonData:
        timeInterval: 30s
    - name: Loki
      type: loki
      uid: loki
      url: "http://loki.monitoring.svc.cluster.local:3100"
      access: proxy
      isDefault: false
      jsonData:
        derivedFields:
          # Click a traceID in a log line → jump straight to Tempo
          - datasourceUid: tempo
            matcherRegex: '"traceID":"(\w+)"'
            name: TraceID
            # $$ escapes Grafana env-var interpolation so ${__value.raw}
            # reaches the datasource provisioner literally.
            url: "$${__value.raw}"
    - name: Tempo
      type: tempo
      uid: tempo
      url: "http://tempo.monitoring.svc.cluster.local:3200"
      access: proxy
      isDefault: false
      jsonData:
        tracesToLogsV2:
          datasourceUid: loki
          filterByTraceID: true
          filterBySpanID: false
          tags:
            - key: namespace
            - key: pod
        tracesToMetrics:
          datasourceUid: prometheus
          tags:
            - key: service.name
              value: service
        lokiSearch:
          datasourceUid: loki
        serviceMap:
          datasourceUid: prometheus
prometheus:
  prometheusSpec:
    # Discover ServiceMonitors / PodMonitors / PrometheusRules in ALL namespaces,
    # not just "monitoring". Without this, monitors in ingress, mesh,
    # cert-manager, devtools, etc. are invisible to Prometheus.
    serviceMonitorNamespaceSelector: {}
    podMonitorNamespaceSelector: {}
    ruleNamespaceSelector: {}
    serviceMonitorSelector: {}
    podMonitorSelector: {}
    # NOTE(review): ruleSelector is intentionally left at the chart default
    # (matches `release: kube-prometheus-stack`); all PrometheusRules are
    # expected to carry that label — confirm against the rule manifests.
    # Accept remote-write from Tempo metrics generator
    enableRemoteWriteReceiver: true
    retention: 90d
    additionalArgs:
      # Allow browser-direct queries from the Grafana UI origin.
      - name: web.cors.origin
        value: "https://metrics.DOMAIN_SUFFIX"
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes: [ReadWriteOnce]
          resources:
            requests:
              storage: 30Gi
alertmanager:
  alertmanagerSpec:
    storage:
      volumeClaimTemplate:
        spec:
          accessModes: [ReadWriteOnce]
          resources:
            requests:
              storage: 2Gi
  # Rendered verbatim into the Alertmanager config secret — native
  # Alertmanager config syntax (snake_case keys) applies below.
  config:
    route:
      group_by: [alertname, namespace]
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      receiver: matrix
      routes:
        # Watchdog always fires; re-notifying every 12h gives a heartbeat
        # that proves the alerting pipeline itself is alive.
        - matchers:
            - 'alertname = Watchdog'
          receiver: matrix
          repeat_interval: 12h
        # NOTE(review): the two routes below duplicate the default receiver;
        # kept explicit as hooks for future per-severity tuning.
        - matchers:
            - 'severity = critical'
          receiver: matrix
        - matchers:
            - 'severity = warning'
          receiver: matrix
    receivers:
      - name: matrix
        webhook_configs:
          - url: "http://matrix-alertmanager-receiver.monitoring.svc.cluster.local:3000/alerts/alerts"
            send_resolved: true
    # Was `inhibitRules` (camelCase) — native Alertmanager config requires
    # `inhibit_rules`; the strict config parser rejects unknown keys, which
    # would prevent Alertmanager from loading the config at all.
    inhibit_rules:
      # Critical alerts suppress warnings for the same alertname+namespace
      - source_matchers:
          - 'severity = critical'
        target_matchers:
          - 'severity = warning'
        equal: [alertname, namespace]
# Disable monitors for components k3s doesn't expose
# (etcd/scheduler/controller-manager don't serve metrics on standard ports;
# kube-proxy is replaced by Cilium).
kubeEtcd:
  enabled: false
kubeControllerManager:
  enabled: false
kubeScheduler:
  enabled: false
kubeProxy:
  enabled: false