feat(monitoring): comprehensive alerting overhaul, 66 rules across 14 PrometheusRules

The Longhorn memory leak went undetected for 14 days because alerting
was broken: a broken email receiver, PrometheusRules missing the label
selector the operator requires, and no node-level alerts at all. This
overhaul brings alerting to production grade.

Fixes:
- Alloy Loki URL pointed to deleted loki-gateway, now loki:3100
- seaweedfs-bucket-init crash on unsupported `mc versioning` command
- All PrometheusRules now carry the `release: kube-prometheus-stack` label,
  which the operator's ruleSelector requires (sketch at the end of this message)
- Removed broken email receiver, Matrix-only alerting

New alert coverage:
- Node: memory, CPU, swap, filesystem, inodes, network, clock skew, OOM
- Kubernetes: deployment down, CronJob failed, pod crash-looping, PVC full
- Backups: Postgres barman stale/failed, WAL archiving, SeaweedFS mirror
- Observability: Prometheus WAL/storage/rules, Loki/Tempo/AlertManager down
- Services: Stalwart, Bulwark, Tuwunel, Sol, Valkey, OpenSearch (single-node aware)
- SLOs: auth stack 99.9% availability (burn-rate alerts), Matrix 99.5%, p95 latency < 2s
- Recording rules for Linkerd RED metrics and node aggregates
- Watchdog heartbeat → Matrix every 12h (dead pipeline detection)
- Inhibition: critical suppresses warning for same alert+namespace
- OpenSearchClusterYellow only fires with >1 data node (single-node aware)
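
Why the `release` label matters: the Prometheus operator only loads
PrometheusRule objects matched by its ruleSelector. A minimal sketch of the
selector kube-prometheus-stack generates by default — the actual values live
in the chart, not in this commit:

    prometheus:
      prometheusSpec:
        ruleSelector:
          matchLabels:
            release: kube-prometheus-stack   # rules missing this label are silently skipped
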
commit e4987b4c58
parent f07b3353aa
2026-04-06 15:52:06 +01:00
22 changed files with 515 additions and 24 deletions

View File

@@ -18,6 +18,7 @@ resources:
- openbao-servicemonitor.yaml
- postgres-alertrules.yaml
- openbao-alertrules.yaml
- valkey-alertrules.yaml
- searxng-deployment.yaml
helmCharts:

View File

@@ -5,6 +5,7 @@ metadata:
namespace: data
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
- name: openbao

View File

@@ -5,6 +5,7 @@ metadata:
namespace: data
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
- name: opensearch
@@ -19,13 +20,16 @@ spec:
description: "OpenSearch cluster {{ $labels.cluster }} health status is red."
- alert: OpenSearchClusterYellow
expr: elasticsearch_cluster_health_status{color="yellow"} == 1
expr: |
elasticsearch_cluster_health_status{color="yellow"} == 1
and on(cluster)
elasticsearch_cluster_health_number_of_data_nodes > 1
for: 10m
labels:
severity: warning
annotations:
summary: "OpenSearch cluster health is YELLOW"
description: "OpenSearch cluster {{ $labels.cluster }} health status is yellow."
description: "OpenSearch cluster {{ $labels.cluster }} health status is yellow (multi-node, so unassigned shards indicate a real problem)."
- alert: OpenSearchHeapHigh
expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) > 0.85

View File

@@ -5,6 +5,7 @@ metadata:
namespace: data
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
- name: postgres
@@ -35,3 +36,41 @@ spec:
annotations:
summary: "PostgreSQL connection count is high"
description: "Pod {{ $labels.pod }} has {{ $value }} active connections."
- alert: PostgresBackupStale
expr: |
time() - cnpg_collector_last_available_backup_timestamp > 90000
for: 10m
labels:
severity: critical
annotations:
summary: "PostgreSQL backup is stale"
description: "No successful backup in over 25 hours (daily schedule expected)."
- alert: PostgresBackupFailed
expr: |
cnpg_collector_last_failed_backup_timestamp > cnpg_collector_last_available_backup_timestamp
for: 5m
labels:
severity: critical
annotations:
summary: "PostgreSQL backup failed"
description: "Last backup failed more recently than last success. Check barman/S3."
- alert: PostgresWALArchivingStale
expr: cnpg_pg_stat_archiver_seconds_since_last_archival > 300
for: 5m
labels:
severity: critical
annotations:
summary: "PostgreSQL WAL archiving stale"
description: "No WAL archived in {{ $value | humanizeDuration }}. Point-in-time recovery may be impossible."
- alert: PostgresDeadlocks
expr: rate(cnpg_pg_stat_database_deadlocks[5m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "PostgreSQL deadlocks detected"
description: "Database {{ $labels.datname }} is experiencing deadlocks."

View File

@@ -0,0 +1,21 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: valkey-alerts
namespace: data
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
- name: valkey
rules:
- alert: ValkeyDown
expr: |
kube_deployment_status_replicas_available{namespace="data", deployment="valkey"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Valkey (Redis) is down"
description: "Valkey cache server is down. All apps using Redis/Celery are affected."

View File

@@ -5,6 +5,7 @@ metadata:
namespace: devtools
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
- name: gitea

View File

@@ -38,8 +38,8 @@ spec:
# Enable object versioning on buckets that require it.
# Drive's WOPI GetFile response includes X-WOPI-ItemVersion from S3 VersionId.
mc versioning enable weed/sunbeam-drive
echo "Versioning enabled: sunbeam-drive"
# SeaweedFS's S3 gateway doesn't support `mc versioning` — tolerate the failure instead of crashing the init Job (the filer handles versioning natively).
mc versioning enable weed/sunbeam-drive || echo "Versioning not supported by SeaweedFS mc, skipping (filer handles versioning natively)"
envFrom:
- secretRef:
name: seaweedfs-s3-credentials

View File

@@ -13,3 +13,4 @@ resources:
- hydra-oauth2client.yaml
- sol-deployment.yaml
- sol-config.yaml
- tuwunel-alertrules.yaml

View File

@@ -0,0 +1,31 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: tuwunel-alerts
namespace: matrix
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
- name: tuwunel
rules:
- alert: TuwunelDown
expr: |
kube_deployment_status_replicas_available{namespace="matrix", deployment="tuwunel"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Matrix homeserver (Tuwunel) is down"
description: "Tuwunel is down — Matrix messaging, alertbot delivery, and Sol are all affected."
- alert: SolDown
expr: |
kube_deployment_status_replicas_available{namespace="matrix", deployment="sol"} == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Sol is down"
description: "Sol virtual librarian is not running."

View File

@@ -5,6 +5,7 @@ metadata:
namespace: media
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
- name: livekit

View File

@@ -5,6 +5,7 @@ metadata:
namespace: mesh
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
- name: linkerd-mesh

View File

@@ -5,6 +5,7 @@ metadata:
namespace: monitoring
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
- name: infrastructure
@@ -53,3 +54,154 @@ spec:
annotations:
summary: "Certificate not ready"
description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} is not in a ready state."
- name: node
rules:
- alert: NodeMemoryHigh
expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.85
for: 5m
labels:
severity: warning
annotations:
summary: "Node memory usage above 85%"
description: "{{ $labels.instance }} memory usage is {{ $value | humanizePercentage }}."
- alert: NodeMemoryCritical
expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.95
for: 2m
labels:
severity: critical
annotations:
summary: "Node memory usage above 95%"
description: "{{ $labels.instance }} memory usage is {{ $value | humanizePercentage }}. OOM kills imminent."
- alert: NodeSwapActive
expr: node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes < 0.50
for: 10m
labels:
severity: warning
annotations:
summary: "Node swap usage above 50%"
description: "{{ $labels.instance }} swap is {{ $value | humanizePercentage }} free. System is under memory pressure."
- alert: NodeCPUHigh
expr: 1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.90
for: 15m
labels:
severity: warning
annotations:
summary: "Node CPU usage above 90% for 15 minutes"
description: "{{ $labels.instance }} CPU usage is {{ $value | humanizePercentage }}."
- alert: NodeFilesystemFull
expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 0.85
for: 5m
labels:
severity: warning
annotations:
summary: "Filesystem usage above 85%"
description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value | humanizePercentage }} full."
- alert: NodeFilesystemCritical
expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 0.95
for: 2m
labels:
severity: critical
annotations:
summary: "Filesystem usage above 95%"
description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value | humanizePercentage }} full."
- alert: NodeFilesystemFilesRunningOut
expr: node_filesystem_files_free{fstype!~"tmpfs|overlay"} / node_filesystem_files{fstype!~"tmpfs|overlay"} < 0.05
for: 5m
labels:
severity: warning
annotations:
summary: "Filesystem inodes running low"
description: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% inodes free."
- alert: NodeNetworkErrors
expr: increase(node_network_receive_errs_total[5m]) > 10 or increase(node_network_transmit_errs_total[5m]) > 10
for: 5m
labels:
severity: warning
annotations:
summary: "Network interface errors detected"
description: "{{ $labels.device }} on {{ $labels.instance }} is seeing network errors."
- alert: NodeClockSkew
expr: abs(node_timex_offset_seconds) > 0.05
for: 10m
labels:
severity: warning
annotations:
summary: "Node clock skew detected"
description: "{{ $labels.instance }} clock is offset by {{ $value }}s. TLS and Kerberos may fail."
- alert: NodeOOMKills
expr: increase(node_vmstat_oom_kill[5m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: "OOM kill detected"
description: "{{ $labels.instance }} had an OOM kill in the last 5 minutes."
- name: kubernetes
rules:
- alert: PodMemoryNearLimit
expr: container_memory_working_set_bytes{container!=""} / on(container, pod, namespace) kube_pod_container_resource_limits{resource="memory"} > 0.90
for: 10m
labels:
severity: warning
annotations:
summary: "Pod memory near limit"
description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is at {{ $value | humanizePercentage }} of its memory limit."
- alert: PersistentVolumeUsageHigh
expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.85
for: 5m
labels:
severity: warning
annotations:
summary: "PVC usage above 85%"
description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value | humanizePercentage }} full."
- alert: PersistentVolumeUsageCritical
expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.95
for: 2m
labels:
severity: critical
annotations:
summary: "PVC usage above 95%"
description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value | humanizePercentage }} full."
- alert: DeploymentNoReadyPods
expr: |
kube_deployment_status_replicas_available == 0
and kube_deployment_spec_replicas > 0
for: 5m
labels:
severity: critical
annotations:
summary: "Deployment has no ready pods"
description: "{{ $labels.namespace }}/{{ $labels.deployment }} has 0 available replicas."
- alert: CronJobLastRunFailed
expr: |
kube_job_status_failed{namespace!="kube-system"} > 0
for: 10m
labels:
severity: warning
annotations:
summary: "Job failed"
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed."
- alert: PodRestartingFrequently
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
for: 10m
labels:
severity: warning
annotations:
summary: "Pod is restarting frequently"
description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) has restarted {{ $value | humanize }} times in the last hour."

View File

@@ -0,0 +1,71 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: observability-alerts
namespace: monitoring
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
- name: prometheus
rules:
- alert: PrometheusWALCorruption
expr: increase(prometheus_tsdb_wal_corruptions_total[5m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: "Prometheus WAL corruption detected"
description: "Prometheus detected WAL corruption — data loss may be occurring."
- alert: PrometheusRuleFailures
expr: increase(prometheus_rule_evaluation_failures_total[5m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Prometheus rule evaluation failures"
description: "Some Prometheus rules are failing to evaluate — alerts may not fire."
- alert: PrometheusStorageFull
expr: prometheus_tsdb_storage_blocks_bytes > 25.5e9
for: 10m
labels:
severity: warning
annotations:
summary: "Prometheus storage over 85% of 30Gi PVC"
description: "Prometheus TSDB is using {{ $value | humanize1024 }}B of its 30Gi PVC."
- name: loki
rules:
- alert: LokiDown
expr: up{job=~".*loki.*", container="loki"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Loki is down"
description: "Loki log aggregation is offline — logs are being dropped."
- name: tempo
rules:
- alert: TempoDown
expr: up{job=~".*tempo.*"} == 0
for: 2m
labels:
severity: warning
annotations:
summary: "Tempo is down"
description: "Tempo trace backend is offline — traces are being dropped."
- name: alertmanager
rules:
- alert: AlertManagerWebhookFailures
expr: increase(alertmanager_notifications_failed_total{integration="webhook"}[15m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "AlertManager webhook delivery failing"
description: "AlertManager cannot deliver alerts to Matrix webhook receiver."

View File

@@ -0,0 +1,62 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: slo-alerts
namespace: monitoring
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
# SLO: Kratos/Hydra auth stack — 99.9% availability (43 min/month budget)
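# Burn-rate thresholds follow the multiwindow pattern: threshold = burn rate x error budget.
#   fast: 14.4 x 0.001 = 0.0144 (a sustained burn exhausts the budget in 30d/14.4 ≈ 2 days)
#   slow:  3.0 x 0.001 = 0.003  (a sustained burn exhausts the budget in ~10 days)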
- name: slo-auth
rules:
- alert: AuthErrorBudgetFastBurn
expr: |
service:error_rate:5m{deployment=~"kratos|hydra"} > 0.0144
for: 2m
labels:
severity: critical
slo: auth-availability
annotations:
summary: "Auth stack burning error budget at 14.4x rate"
description: "{{ $labels.deployment }} error rate is {{ $value | humanizePercentage }} (14.4x burn rate for 99.9% SLO)."
- alert: AuthErrorBudgetSlowBurn
expr: |
service:error_rate:5m{deployment=~"kratos|hydra"} > 0.003
for: 1h
labels:
severity: warning
slo: auth-availability
annotations:
summary: "Auth stack slowly burning error budget"
description: "{{ $labels.deployment }} error rate is {{ $value | humanizePercentage }} (3x burn rate for 99.9% SLO)."
# SLO: Tuwunel Matrix homeserver — 99.5% availability (3.6 hr/month budget)
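# Fast-burn threshold: 14.4 x 0.005 = 0.072 (0.5% error budget for 99.5%).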
- name: slo-matrix
rules:
- alert: MatrixErrorBudgetFastBurn
expr: |
service:error_rate:5m{deployment="tuwunel"} > 0.072
for: 2m
labels:
severity: critical
slo: matrix-availability
annotations:
summary: "Matrix homeserver burning error budget at 14.4x rate"
description: "Tuwunel error rate is {{ $value | humanizePercentage }}."
# SLO: All services — latency p95 under 2s
- name: slo-latency
rules:
- alert: ServiceLatencyBudgetBurn
expr: |
service:latency_p95:5m > 2000
for: 10m
labels:
severity: warning
slo: latency
annotations:
summary: "Service p95 latency exceeds 2s SLO"
description: "{{ $labels.deployment }} in {{ $labels.namespace }} p95 latency is {{ $value }}ms."

View File

@@ -77,7 +77,7 @@ alloy:
loki.write "default" {
endpoint {
url = "http://loki-gateway.monitoring.svc.cluster.local:80/loki/api/v1/push"
url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"
}
}

View File

@@ -23,6 +23,9 @@ resources:
- matrix-bot-secret.yaml
# Alert rules
- alertrules-infrastructure.yaml
- alertrules-observability.yaml
- alertrules-slo.yaml
- recording-rules.yaml
helmCharts:
# helm repo add prometheus-community https://prometheus-community.github.io/helm-charts

View File

@@ -61,7 +61,7 @@ grafana:
- name: Loki
type: loki
uid: loki
url: "http://loki-gateway.monitoring.svc.cluster.local:80"
url: "http://loki.monitoring.svc.cluster.local:3100"
access: proxy
isDefault: false
jsonData:
@@ -130,10 +130,6 @@ alertmanager:
requests:
storage: 2Gi
config:
global:
smtp_from: "alerts@DOMAIN_SUFFIX"
smtp_smarthost: "postfix.lasuite.svc.cluster.local:25"
smtp_require_tls: false
route:
group_by: [alertname, namespace]
group_wait: 30s
@@ -143,30 +139,26 @@ alertmanager:
routes:
- matchers:
- alertname = Watchdog
receiver: "null"
receiver: matrix
repeat_interval: 12h
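# Watchdog always fires by design; delivering it to Matrix every 12h turns it into
# a heartbeat — if the message stops arriving, the alerting pipeline itself is broken.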
- matchers:
- severity = critical
receiver: critical
receiver: matrix
- matchers:
- severity = warning
receiver: matrix
receivers:
- name: "null"
- name: email
email_configs:
- to: "ops@DOMAIN_SUFFIX"
send_resolved: true
- name: matrix
webhook_configs:
- url: "http://matrix-alertmanager-receiver.monitoring.svc.cluster.local:3000/alerts/alerts"
send_resolved: true
- name: critical
webhook_configs:
- url: "http://matrix-alertmanager-receiver.monitoring.svc.cluster.local:3000/alerts/alerts"
send_resolved: true
email_configs:
- to: "ops@DOMAIN_SUFFIX"
send_resolved: true
inhibitRules:
# Critical alerts suppress warnings for the same alertname+namespace
- source_matchers:
- severity = critical
target_matchers:
- severity = warning
equal: [alertname, namespace]
# Disable monitors for components k3s doesn't expose
kubeEtcd:

View File

@@ -0,0 +1,40 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: recording-rules
namespace: monitoring
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
- name: linkerd-service-sli
interval: 30s
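# RED metrics from the Linkerd proxy: Rate (service:request_rate), Errors
# (service:error_rate), Duration (service:latency p95/p99), aggregated per
# deployment and namespace. The slo-alerts rules consume these series.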
rules:
- record: service:request_rate:5m
expr: sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
- record: service:error_rate:5m
expr: |
sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
/ sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
- record: service:latency_p95:5m
expr: |
histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{direction="inbound"}[5m])) by (le, deployment, namespace))
- record: service:latency_p99:5m
expr: |
histogram_quantile(0.99, sum(rate(response_latency_ms_bucket{direction="inbound"}[5m])) by (le, deployment, namespace))
- name: node-aggregates
interval: 30s
rules:
- record: node:memory_usage_ratio
expr: 1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes
- record: node:cpu_usage_ratio
expr: 1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))
- record: node:swap_usage_ratio
expr: 1 - node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes

View File

@@ -5,6 +5,7 @@ metadata:
namespace: ory
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
- name: ory

View File

@@ -0,0 +1,17 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: stalwart
resources:
- namespace.yaml
- vault-secrets.yaml
- oidc-client.yaml
- oidc-client-bulwark.yaml
- stalwart-config.yaml
- stalwart-deployment.yaml
- stalwart-service.yaml
- certificate.yaml
- bulwark-deployment.yaml
- bulwark-service.yaml
- stalwart-alertrules.yaml

View File

@@ -0,0 +1,31 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: stalwart-alerts
namespace: stalwart
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
- name: stalwart
rules:
- alert: StalwartDown
expr: |
kube_deployment_status_replicas_available{namespace="stalwart", deployment="stalwart"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Stalwart mail server is down"
description: "Stalwart is down — IMAP, SMTP submission, and webmail are all affected."
- alert: BulwarkDown
expr: |
kube_deployment_status_replicas_available{namespace="stalwart", deployment="bulwark"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Bulwark spam filter is down"
description: "Bulwark is down — inbound email filtering is not running."

View File

@@ -5,6 +5,7 @@ metadata:
namespace: storage
labels:
role: alert-rules
release: kube-prometheus-stack
spec:
groups:
- name: seaweedfs
@@ -44,3 +45,23 @@ spec:
annotations:
summary: "SeaweedFS disk space low"
description: "SeaweedFS disk on {{ $labels.instance }} has less than 15% free space."
- alert: SeaweedFSMirrorJobFailing
expr: |
kube_job_status_failed{namespace="storage", job_name=~"seaweedfs-s3-mirror.*"} > 0
for: 5m
labels:
severity: critical
annotations:
summary: "SeaweedFS S3 mirror job failed"
description: "Job {{ $labels.job_name }} failed. S3 backups to Scaleway are not running."
- alert: SeaweedFSMirrorStale
expr: |
time() - kube_cronjob_status_last_successful_time{namespace="storage", cronjob="seaweedfs-s3-mirror"} > 7200
for: 10m
labels:
severity: warning
annotations:
summary: "SeaweedFS S3 mirror hasn't succeeded in 2+ hours"
description: "CronJob seaweedfs-s3-mirror last succeeded {{ $value | humanizeDuration }} ago."