The Longhorn memory leak went undetected for 14 days because alerting was broken (email receiver, missing label selector, no node alerts). This overhaul brings alerting to production grade.

Fixes:

- Alloy Loki URL pointed to deleted loki-gateway, now loki:3100
- seaweedfs-bucket-init crash on unsupported `mc versioning` command
- All PrometheusRules now carry the `release: kube-prometheus-stack` label
- Removed broken email receiver, Matrix-only alerting (receiver wiring sketched below)

New alert coverage:

- Node: memory, CPU, swap, filesystem, inodes, network, clock skew, OOM
- Kubernetes: deployment down, CronJob failed, pod crash-looping, PVC full
- Backups: Postgres barman stale/failed, WAL archiving, SeaweedFS mirror
- Observability: Prometheus WAL/storage/rules, Loki/Tempo/AlertManager down
- Services: Stalwart, Bulwark, Tuwunel, Sol, Valkey, OpenSearch (smart)
- SLOs: auth stack 99.9% burn rate, Matrix 99.5%, latency p95 < 2s (burn-rate sketch below)
- Recording rules for Linkerd RED metrics and node aggregates
- Watchdog heartbeat → Matrix every 12h (dead pipeline detection)
- Inhibition: critical suppresses warning for same alert+namespace
- OpenSearchClusterYellow only fires with >1 data node (single-node aware, sketch below)
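A minimal sketch of the Matrix-only Alertmanager wiring described above, assuming a webhook-based Matrix bridge. The receiver name, bridge URL, and route details are illustrative placeholders, not the exact values from this change:

```yaml
# Sketch only -- receiver name and bridge URL are placeholders.
route:
  receiver: matrix
  routes:
    # Watchdog always fires; re-notifying every 12h turns it into a
    # heartbeat. If the heartbeat stops, the alerting pipeline is dead.
    - matchers:
        - alertname = Watchdog
      receiver: matrix
      repeat_interval: 12h

receivers:
  - name: matrix
    webhook_configs:
      - url: http://matrix-alertmanager:8080/alerts  # hypothetical bridge service

inhibit_rules:
  # A firing critical alert suppresses the warning-level variant of the
  # same alert in the same namespace, so one incident pages only once.
  - source_matchers:
      - severity = critical
    target_matchers:
      - severity = warning
    equal: ["alertname", "namespace"]
```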
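The auth-stack SLO alert presumably follows the multiwindow burn-rate pattern from the SRE workbook. A sketch assuming Linkerd's `response_total` metric and an `auth` namespace (both assumptions; the real rule and windows may differ):

```yaml
# Sketch: fast-burn alert for a 99.9% SLO (0.1% error budget). A sustained
# 14.4x burn rate exhausts a 30-day budget in ~2 days, so it pages.
- alert: AuthErrorBudgetBurnFast
  expr: |
    (
      sum(rate(response_total{namespace="auth", classification="failure"}[5m]))
        / sum(rate(response_total{namespace="auth"}[5m]))
    ) > (14.4 * 0.001)
    and
    (
      sum(rate(response_total{namespace="auth", classification="failure"}[1h]))
        / sum(rate(response_total{namespace="auth"}[1h]))
    ) > (14.4 * 0.001)
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: "Auth stack burning error budget at >14x rate"
```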
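And the single-node-aware OpenSearch check might look like the following, assuming the Aiven prometheus-exporter plugin's metric names (`opensearch_cluster_status`, `opensearch_cluster_datanodes_number`, with green=0 / yellow=1 / red=2); verify against the exporter actually deployed:

```yaml
# Sketch: yellow usually just means unassigned replica shards, which is
# expected on a single data node -- only alert when replicas could be placed.
- alert: OpenSearchClusterYellow
  expr: |
    opensearch_cluster_status == 1
    and on() opensearch_cluster_datanodes_number > 1
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "OpenSearch cluster is yellow with more than one data node"
```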
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: infrastructure-alerts
  namespace: monitoring
  labels:
    role: alert-rules
    release: kube-prometheus-stack
spec:
  groups:
    - name: infrastructure
      rules:
        - alert: LonghornDiskSpaceLow
          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) > 0.90
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Longhorn disk space critically low"
            description: "Longhorn disk on {{ $labels.node }} is over 90% full."

        - alert: LonghornVolumeSpaceLow
          expr: (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Longhorn volume space low"
            description: "Longhorn volume {{ $labels.volume }} is over 85% full."

        - alert: CertExpiringCritical
          expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 86400
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "Certificate expiring in less than 24 hours"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 24 hours."

        - alert: CertExpiringSoon
          expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 604800
          for: 30m
          labels:
            severity: warning
          annotations:
            summary: "Certificate expiring in less than 7 days"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 7 days."

        - alert: CertNotReady
          expr: certmanager_certificate_ready_status{condition="True"} != 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Certificate not ready"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} is not in a ready state."

    - name: node
      rules:
        - alert: NodeMemoryHigh
          expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Node memory usage above 85%"
            description: "{{ $labels.instance }} memory usage is {{ $value | humanizePercentage }}."

        - alert: NodeMemoryCritical
          expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Node memory usage above 95%"
            description: "{{ $labels.instance }} memory usage is {{ $value | humanizePercentage }}. OOM kills imminent."

        - alert: NodeSwapActive
          expr: node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes < 0.50
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Node swap usage above 50%"
            description: "{{ $labels.instance }} swap is {{ $value | humanizePercentage }} free. System is under memory pressure."

        - alert: NodeCPUHigh
          expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.90
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "Node CPU usage above 90% for 15 minutes"
            description: "{{ $labels.instance }} CPU usage is {{ $value | humanizePercentage }}."

        - alert: NodeFilesystemFull
          expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Filesystem usage above 85%"
            description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value | humanizePercentage }} full."

        - alert: NodeFilesystemCritical
          expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Filesystem usage above 95%"
            description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value | humanizePercentage }} full."

        - alert: NodeFilesystemFilesRunningOut
          expr: node_filesystem_files_free{fstype!~"tmpfs|overlay"} / node_filesystem_files{fstype!~"tmpfs|overlay"} < 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Filesystem inodes running low"
            description: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% of inodes free."

        - alert: NodeNetworkErrors
          expr: increase(node_network_receive_errs_total[5m]) > 10 or increase(node_network_transmit_errs_total[5m]) > 10
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Network interface errors detected"
            description: "{{ $labels.device }} on {{ $labels.instance }} is seeing network errors."

        - alert: NodeClockSkew
          expr: abs(node_timex_offset_seconds) > 0.05
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Node clock skew detected"
            description: "{{ $labels.instance }} clock is offset by {{ $value }}s. TLS and Kerberos may fail."

        - alert: NodeOOMKills
          expr: increase(node_vmstat_oom_kill[5m]) > 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "OOM kill detected"
            description: "{{ $labels.instance }} had an OOM kill in the last 5 minutes."

    - name: kubernetes
      rules:
        - alert: PodMemoryNearLimit
          expr: container_memory_working_set_bytes{container!=""} / on(container, pod, namespace) kube_pod_container_resource_limits{resource="memory"} > 0.90
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod memory near limit"
            description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is at {{ $value | humanizePercentage }} of its memory limit."

        - alert: PersistentVolumeUsageHigh
          expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PVC usage above 85%"
            description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value | humanizePercentage }} full."

        - alert: PersistentVolumeUsageCritical
          expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "PVC usage above 95%"
            description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value | humanizePercentage }} full."

        - alert: DeploymentNoReadyPods
          expr: |
            kube_deployment_status_replicas_available == 0
            and kube_deployment_spec_replicas > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Deployment has no ready pods"
            description: "{{ $labels.namespace }}/{{ $labels.deployment }} has 0 available replicas."

        - alert: CronJobLastRunFailed
          expr: kube_job_status_failed{namespace!="kube-system"} > 0
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Job failed"
            description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed."

        - alert: PodRestartingFrequently
          expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod is restarting frequently"
            description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) has restarted {{ $value | humanize }} times in the last hour."