feat(monitoring): comprehensive alerting overhaul, 66 rules across 14 PrometheusRules

The Longhorn memory leak went undetected for 14 days because alerting was broken (email receiver, missing label selector, no node alerts). This overhaul brings alerting to production grade.

Fixes:
- Alloy Loki URL pointed to deleted loki-gateway, now loki:3100
- seaweedfs-bucket-init crash on unsupported `mc versioning` command
- All PrometheusRules now have `release: kube-prometheus-stack` label
- Removed broken email receiver, Matrix-only alerting

New alert coverage:
- Node: memory, CPU, swap, filesystem, inodes, network, clock skew, OOM
- Kubernetes: deployment down, CronJob failed, pod crash-looping, PVC full
- Backups: Postgres barman stale/failed, WAL archiving, SeaweedFS mirror
- Observability: Prometheus WAL/storage/rules, Loki/Tempo/AlertManager down
- Services: Stalwart, Bulwark, Tuwunel, Sol, Valkey, OpenSearch (smart)
- SLOs: auth stack 99.9% burn rate, Matrix 99.5%, latency p95 < 2s
- Recording rules for Linkerd RED metrics and node aggregates
- Watchdog heartbeat → Matrix every 12h (dead pipeline detection)
- Inhibition: critical suppresses warning for same alert+namespace
- OpenSearchClusterYellow only fires with >1 data node (single-node aware)
2026-04-06 15:52:06 +01:00
parent f07b3353aa
commit e4987b4c58
22 changed files with 515 additions and 24 deletions
--- a/base/monitoring/alertrules-infrastructure.yaml
+++ b/base/monitoring/alertrules-infrastructure.yaml
@@ -5,6 +5,7 @@ metadata:
  namespace: monitoring
  labels:
    role: alert-rules
+    release: kube-prometheus-stack
 spec:
  groups:
    - name: infrastructure
@@ -53,3 +54,154 @@ spec:
          annotations:
            summary: "Certificate not ready"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} is not in a ready state."
+
+    - name: node
+      rules:
+        # Warning tier: less than 15% of a node's memory is still available
+        # (MemAvailable relative to MemTotal), sustained for 5 minutes.
+        - alert: NodeMemoryHigh
+          expr: >-
+            (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
+            > 0.85
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'Node memory usage above 85%'
+            description: >-
+              {{ $labels.instance }} memory usage is
+              {{ $value | humanizePercentage }}.
+
+        # Critical tier: less than 5% of a node's memory is still available
+        # (MemAvailable relative to MemTotal), sustained for 2 minutes.
+        - alert: NodeMemoryCritical
+          expr: >-
+            (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
+            > 0.95
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: 'Node memory usage above 95%'
+            description: >-
+              {{ $labels.instance }} memory usage is
+              {{ $value | humanizePercentage }}. OOM kills imminent.
+
+        # Fires when the free share of swap (SwapFree / SwapTotal) stays below
+        # 50% for 10 minutes; $value in the description is the free fraction.
+        - alert: NodeSwapActive
+          expr: >-
+            node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes
+            < 0.50
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'Node swap usage above 50%'
+            description: >-
+              {{ $labels.instance }} swap is
+              {{ $value | humanizePercentage }} free.
+              System is under memory pressure.
+
+        # Non-idle CPU share per instance: one minus the average 5m rate of
+        # the idle-mode CPU counter. Warns once it exceeds 90% for 15 minutes.
+        - alert: NodeCPUHigh
+          expr: >-
+            1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))
+            > 0.90
+          for: 15m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'Node CPU usage above 90% for 15 minutes'
+            description: >-
+              {{ $labels.instance }} CPU usage is
+              {{ $value | humanizePercentage }}.
+
+        # Warning tier: used fraction (1 - avail / size) of any filesystem
+        # other than tmpfs/overlay is above 85% for 5 minutes.
+        - alert: NodeFilesystemFull
+          expr: >-
+            (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
+            / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})
+            > 0.85
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'Filesystem usage above 85%'
+            description: >-
+              {{ $labels.mountpoint }} on {{ $labels.instance }} is
+              {{ $value | humanizePercentage }} full.
+
+        # Critical tier: used fraction (1 - avail / size) of any filesystem
+        # other than tmpfs/overlay is above 95% for 2 minutes.
+        - alert: NodeFilesystemCritical
+          expr: >-
+            (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
+            / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})
+            > 0.95
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: 'Filesystem usage above 95%'
+            description: >-
+              {{ $labels.mountpoint }} on {{ $labels.instance }} is
+              {{ $value | humanizePercentage }} full.
+
+        # Inode exhaustion: free-to-total file (inode) ratio on a filesystem
+        # other than tmpfs/overlay stays under 5% for 5 minutes.
+        - alert: NodeFilesystemFilesRunningOut
+          expr: >-
+            node_filesystem_files_free{fstype!~"tmpfs|overlay"}
+            / node_filesystem_files{fstype!~"tmpfs|overlay"}
+            < 0.05
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'Filesystem inodes running low'
+            description: >-
+              {{ $labels.mountpoint }} on {{ $labels.instance }}
+              has less than 5% inodes free.
+
+        # More than 10 receive errors or more than 10 transmit errors counted
+        # within a 5m window, with the condition holding for 5 minutes.
+        - alert: NodeNetworkErrors
+          expr: >-
+            increase(node_network_receive_errs_total[5m]) > 10
+            or increase(node_network_transmit_errs_total[5m]) > 10
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'Network interface errors detected'
+            description: >-
+              {{ $labels.device }} on {{ $labels.instance }}
+              is seeing network errors.
+
+        # Absolute value of the timex offset above 0.05 (50 ms in either
+        # direction) for 10 minutes.
+        - alert: NodeClockSkew
+          expr: 'abs(node_timex_offset_seconds) > 0.05'
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'Node clock skew detected'
+            description: >-
+              {{ $labels.instance }} clock is offset by {{ $value }}s.
+              TLS and Kerberos may fail.
+
+        # Any growth of the OOM-kill counter within the last 5 minutes;
+        # `for: 0m` means the alert fires on the first matching evaluation.
+        - alert: NodeOOMKills
+          expr: 'increase(node_vmstat_oom_kill[5m]) > 0'
+          for: 0m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'OOM kill detected'
+            description: >-
+              {{ $labels.instance }} had an OOM kill in the last 5 minutes.
+
+    - name: kubernetes
+      rules:
+        # Working-set memory of a container above 90% of its configured memory
+        # limit for 10 minutes.
+        #
+        # Both operands are collapsed to exactly one series per
+        # (namespace, pod, container) before dividing. A bare one-to-one
+        # `/ on(...)` match makes the whole rule evaluation error out as soon
+        # as either side carries duplicate series for the same container, which
+        # would silence this alert for every pod. `max` keeps the highest
+        # sample, so the result is unchanged whenever the series are already
+        # unique. The output keeps the namespace, pod and container labels
+        # used by the description template.
+        - alert: PodMemoryNearLimit
+          expr: |
+            max by (namespace, pod, container) (
+              container_memory_working_set_bytes{container!=""}
+            )
+            / on (namespace, pod, container)
+            max by (namespace, pod, container) (
+              kube_pod_container_resource_limits{resource="memory"}
+            )
+            > 0.90
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Pod memory near limit"
+            description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is at {{ $value | humanizePercentage }} of its memory limit."
+
+        # Warning tier: used-to-capacity byte ratio reported by the kubelet
+        # volume stats is above 85% for 5 minutes.
+        - alert: PersistentVolumeUsageHigh
+          expr: >-
+            kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes
+            > 0.85
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'PVC usage above 85%'
+            description: >-
+              {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is
+              {{ $value | humanizePercentage }} full.
+
+        # Critical tier: used-to-capacity byte ratio reported by the kubelet
+        # volume stats is above 95% for 2 minutes.
+        - alert: PersistentVolumeUsageCritical
+          expr: >-
+            kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes
+            > 0.95
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: 'PVC usage above 95%'
+            description: >-
+              {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is
+              {{ $value | humanizePercentage }} full.
+
+        # Critical when a Deployment whose spec asks for at least one replica
+        # reports zero available replicas for 5 minutes. Deployments scaled to
+        # zero are excluded by the second operand.
+        - alert: DeploymentNoReadyPods
+          expr: |
+            kube_deployment_status_replicas_available == 0
+            and kube_deployment_spec_replicas > 0
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: 'Deployment has no ready pods'
+            description: >-
+              {{ $labels.namespace }}/{{ $labels.deployment }}
+              has 0 available replicas.
+
+        # Matches every Job outside kube-system whose failed count stays above
+        # zero for 10 minutes. The expression carries no owner filter, so it
+        # is not limited to Jobs spawned by a CronJob despite the alert name.
+        - alert: CronJobLastRunFailed
+          expr: |
+            kube_job_status_failed{namespace!="kube-system"} > 0
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'Job failed'
+            description: >-
+              Job {{ $labels.namespace }}/{{ $labels.job_name }} failed.
+
+        # A container's restart counter grew by more than 5 over the trailing
+        # hour, with the condition holding for 10 minutes.
+        - alert: PodRestartingFrequently
+          expr: 'increase(kube_pod_container_status_restarts_total[1h]) > 5'
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: 'Pod is restarting frequently'
+            description: >-
+              {{ $labels.namespace }}/{{ $labels.pod }}
+              ({{ $labels.container }}) has restarted
+              {{ $value | humanize }} times in the last hour.