Files
sbbb/base/monitoring/alertrules-infrastructure.yaml
Sienna Meridian Satterwhite e4987b4c58 feat(monitoring): comprehensive alerting overhaul, 66 rules across 14 PrometheusRules
The Longhorn memory leak went undetected for 14 days because alerting
was broken (email receiver, missing label selector, no node alerts).
This overhaul brings alerting to production grade.

Fixes:
- Alloy Loki URL pointed to the deleted loki-gateway; it now points to loki:3100
- seaweedfs-bucket-init crash on unsupported `mc versioning` command
- All PrometheusRules now have `release: kube-prometheus-stack` label
- Removed the broken email receiver; alerting is now Matrix-only

New alert coverage:
- Node: memory, CPU, swap, filesystem, inodes, network, clock skew, OOM
- Kubernetes: deployment down, CronJob failed, pod crash-looping, PVC full
- Backups: Postgres barman stale/failed, WAL archiving, SeaweedFS mirror
- Observability: Prometheus WAL/storage/rules, Loki/Tempo/AlertManager down
- Services: Stalwart, Bulwark, Tuwunel, Sol, Valkey, OpenSearch (smart)
- SLOs: auth stack 99.9% burn rate, Matrix 99.5%, latency p95 < 2s
- Recording rules for Linkerd RED metrics and node aggregates
- Watchdog heartbeat → Matrix every 12h (dead pipeline detection)
- Inhibition: critical suppresses warning for same alert+namespace
- OpenSearchClusterYellow only fires with >1 data node (single-node aware)
2026-04-06 15:52:06 +01:00

208 lines
8.3 KiB
YAML

# PrometheusRule holding the infrastructure, node and kubernetes alert groups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: infrastructure-alerts
namespace: monitoring
labels:
role: alert-rules
# NOTE(review): presumably the label the Prometheus rule selector matches on — confirm against the Prometheus resource.
release: kube-prometheus-stack
spec:
groups:
# Group "infrastructure": Longhorn storage and cert-manager certificate alerts.
- name: infrastructure
rules:
# Critical when a Longhorn disk has been more than 90% used for 5 minutes.
- alert: LonghornDiskSpaceLow
  annotations:
    summary: 'Longhorn disk space critically low'
    # Identify the disk as well as the node so that alerts for several disks
    # on one node can be told apart.
    # NOTE(review): relies on the `disk` label that Longhorn documents for its
    # disk metrics — confirm against the deployed Longhorn version.
    description: 'Longhorn disk {{ $labels.disk }} on {{ $labels.node }} is over 90% full.'
  expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) > 0.90
  for: 5m
  labels:
    severity: critical
# Warns when a Longhorn volume's actual size has exceeded 85% of its capacity
# for 5 minutes.
- alert: LonghornVolumeSpaceLow
  annotations:
    summary: 'Longhorn volume space low'
    description: 'Longhorn volume {{ $labels.volume }} is over 85% full.'
  expr: >-
    (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes)
    > 0.85
  for: 5m
  labels:
    severity: warning
# Critical when a cert-manager certificate's expiry timestamp is less than
# 86400 seconds (24 hours) away, sustained for 10 minutes.
- alert: CertExpiringCritical
  annotations:
    summary: 'Certificate expiring in less than 24 hours'
    description: 'Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 24 hours'
  expr: >-
    (certmanager_certificate_expiration_timestamp_seconds - time())
    < 86400
  for: 10m
  labels:
    severity: critical
# Warns when a cert-manager certificate's expiry timestamp is less than
# 604800 seconds (7 days) away, sustained for 30 minutes.
- alert: CertExpiringSoon
  annotations:
    summary: 'Certificate expiring in less than 7 days'
    description: 'Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 7 days'
  expr: >-
    (certmanager_certificate_expiration_timestamp_seconds - time())
    < 604800
  for: 30m
  labels:
    severity: warning
# Warns when the condition="True" ready-status series of a certificate has
# been anything other than 1 for 10 minutes.
- alert: CertNotReady
  annotations:
    summary: 'Certificate not ready'
    description: 'Certificate {{ $labels.name }} in {{ $labels.namespace }} is not in a ready state.'
  expr: 'certmanager_certificate_ready_status{condition="True"} != 1'
  for: 10m
  labels:
    severity: warning
# Group "node": per-node memory, swap, CPU, filesystem, network, clock and OOM alerts.
- name: node
rules:
# Warns when used memory (1 - MemAvailable / MemTotal) has been above 85%
# for 5 minutes.
- alert: NodeMemoryHigh
  annotations:
    summary: 'Node memory usage above 85%'
    description: '{{ $labels.instance }} memory usage is {{ $value | humanizePercentage }}.'
  expr: >-
    (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
    > 0.85
  for: 5m
  labels:
    severity: warning
# Critical when used memory (1 - MemAvailable / MemTotal) has been above 95%
# for 2 minutes.
- alert: NodeMemoryCritical
  annotations:
    summary: 'Node memory usage above 95%'
    description: '{{ $labels.instance }} memory usage is {{ $value | humanizePercentage }}. OOM kills imminent.'
  expr: >-
    (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
    > 0.95
  for: 2m
  labels:
    severity: critical
# Warns when less than 50% of swap has been free for 10 minutes.
# The alert value is the free fraction, which the description reports.
- alert: NodeSwapActive
  annotations:
    summary: 'Node swap usage above 50%'
    description: '{{ $labels.instance }} swap is {{ $value | humanizePercentage }} free. System is under memory pressure.'
  expr: >-
    node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes
    < 0.50
  for: 10m
  labels:
    severity: warning
# Warns when the non-idle CPU fraction, averaged per instance over a 5m rate,
# has been above 90% for 15 minutes.
- alert: NodeCPUHigh
  annotations:
    summary: 'Node CPU usage above 90% for 15 minutes'
    description: '{{ $labels.instance }} CPU usage is {{ $value | humanizePercentage }}.'
  expr: >-
    1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))
    > 0.90
  for: 15m
  labels:
    severity: warning
# Warns when a filesystem (tmpfs and overlay excluded) has been more than 85%
# used for 5 minutes.
- alert: NodeFilesystemFull
  annotations:
    summary: 'Filesystem usage above 85%'
    description: '{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value | humanizePercentage }} full.'
  expr: >-
    (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
    / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})
    > 0.85
  for: 5m
  labels:
    severity: warning
# Critical when a filesystem (tmpfs and overlay excluded) has been more than
# 95% used for 2 minutes.
- alert: NodeFilesystemCritical
  annotations:
    summary: 'Filesystem usage above 95%'
    description: '{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value | humanizePercentage }} full.'
  expr: >-
    (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
    / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})
    > 0.95
  for: 2m
  labels:
    severity: critical
# Warns when fewer than 5% of a filesystem's inodes (tmpfs and overlay
# excluded) have been free for 5 minutes.
- alert: NodeFilesystemFilesRunningOut
  annotations:
    summary: 'Filesystem inodes running low'
    description: '{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% inodes free.'
  expr: >-
    node_filesystem_files_free{fstype!~"tmpfs|overlay"}
    / node_filesystem_files{fstype!~"tmpfs|overlay"}
    < 0.05
  for: 5m
  labels:
    severity: warning
# Warns when a network device's receive or transmit error counter has grown
# by more than 10 per 5m window, sustained for 5 minutes.
- alert: NodeNetworkErrors
  annotations:
    summary: 'Network interface errors detected'
    description: '{{ $labels.device }} on {{ $labels.instance }} is seeing network errors.'
  expr: >-
    increase(node_network_receive_errs_total[5m]) > 10
    or increase(node_network_transmit_errs_total[5m]) > 10
  for: 5m
  labels:
    severity: warning
# Warns when the absolute value of node_timex_offset_seconds has been above
# 0.05 for 10 minutes.
- alert: NodeClockSkew
  annotations:
    summary: 'Node clock skew detected'
    description: '{{ $labels.instance }} clock is offset by {{ $value }}s. TLS and Kerberos may fail.'
  expr: 'abs(node_timex_offset_seconds) > 0.05'
  for: 10m
  labels:
    severity: warning
# Warns as soon as (for: 0m) node_vmstat_oom_kill has increased within the
# last 5 minutes.
- alert: NodeOOMKills
  annotations:
    summary: 'OOM kill detected'
    description: '{{ $labels.instance }} had an OOM kill in the last 5 minutes.'
  expr: 'increase(node_vmstat_oom_kill[5m]) > 0'
  for: 0m
  labels:
    severity: warning
# Group "kubernetes": container memory, PVC usage, Deployment, Job and restart alerts.
- name: kubernetes
rules:
# Warns when a container's working-set memory has been above 90% of its
# memory limit for 10 minutes.
- alert: PodMemoryNearLimit
  annotations:
    summary: 'Pod memory near limit'
    description: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is at {{ $value | humanizePercentage }} of its memory limit.'
  # Both operands are reduced to exactly one series per
  # (namespace, pod, container) before dividing. A bare one-to-one
  # `on(container, pod, namespace)` join makes the whole rule evaluation fail
  # when either side has duplicate series for a container. `max` is used
  # rather than `sum` so duplicates are not double-counted. The resulting
  # alert labels (namespace, pod, container) are unchanged.
  expr: >-
    max by (namespace, pod, container) (container_memory_working_set_bytes{container!=""})
    /
    max by (namespace, pod, container) (kube_pod_container_resource_limits{resource="memory"})
    > 0.90
  for: 10m
  labels:
    severity: warning
# Warns when a volume's used bytes have exceeded 85% of its capacity, as
# reported by the kubelet volume stats, for 5 minutes.
- alert: PersistentVolumeUsageHigh
  annotations:
    summary: 'PVC usage above 85%'
    description: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value | humanizePercentage }} full.'
  expr: >-
    kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes
    > 0.85
  for: 5m
  labels:
    severity: warning
# Critical when a volume's used bytes have exceeded 95% of its capacity, as
# reported by the kubelet volume stats, for 2 minutes.
- alert: PersistentVolumeUsageCritical
  annotations:
    summary: 'PVC usage above 95%'
    description: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value | humanizePercentage }} full.'
  expr: >-
    kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes
    > 0.95
  for: 2m
  labels:
    severity: critical
# Critical when a Deployment that requests at least one replica has had zero
# available replicas for 5 minutes.
- alert: DeploymentNoReadyPods
  annotations:
    summary: 'Deployment has no ready pods'
    description: '{{ $labels.namespace }}/{{ $labels.deployment }} has 0 available replicas.'
  expr: |
    kube_deployment_status_replicas_available == 0
    and kube_deployment_spec_replicas > 0
  for: 5m
  labels:
    severity: critical
# Warns when a Job outside kube-system has been in the Failed condition for
# 10 minutes.
# NOTE(review): despite the alert name, the selector is not restricted to
# CronJob-owned Jobs or to the most recent run; the name is kept so existing
# routing and inhibition keep matching.
- alert: CronJobLastRunFailed
  annotations:
    summary: 'Job failed'
    description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed.'
  # Uses the Job-level failure condition rather than kube_job_status_failed.
  # The latter counts failed pods, so a Job that retried and then completed
  # successfully would still be reported as failed.
  expr: 'kube_job_failed{namespace!="kube-system", condition="true"} > 0'
  for: 10m
  labels:
    severity: warning
# Warns when a container's restart counter has increased by more than 5 over
# the last hour, sustained for 10 minutes.
- alert: PodRestartingFrequently
  annotations:
    summary: 'Pod is restarting frequently'
    description: '{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) has restarted {{ $value | humanize }} times in the last hour.'
  expr: 'increase(kube_pod_container_status_restarts_total[1h]) > 5'
  for: 10m
  labels:
    severity: warning