The Longhorn memory leak went undetected for 14 days because alerting was broken (email receiver, missing label selector, no node alerts). This overhaul brings alerting to production grade.

Fixes:
- Alloy Loki URL pointed to deleted loki-gateway, now loki:3100
- seaweedfs-bucket-init crash on unsupported `mc versioning` command
- All PrometheusRules now have `release: kube-prometheus-stack` label
- Removed broken email receiver, Matrix-only alerting

New alert coverage:
- Node: memory, CPU, swap, filesystem, inodes, network, clock skew, OOM
- Kubernetes: deployment down, CronJob failed, pod crash-looping, PVC full
- Backups: Postgres barman stale/failed, WAL archiving, SeaweedFS mirror
- Observability: Prometheus WAL/storage/rules, Loki/Tempo/AlertManager down
- Services: Stalwart, Bulwark, Tuwunel, Sol, Valkey, OpenSearch (smart)
- SLOs: auth stack 99.9% burn rate, Matrix 99.5%, latency p95 < 2s
- Recording rules for Linkerd RED metrics and node aggregates
- Watchdog heartbeat → Matrix every 12h (dead pipeline detection)
- Inhibition: critical suppresses warning for same alert+namespace (see the Alertmanager sketch below)
- OpenSearchClusterYellow only fires with >1 data node (single-node aware)
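To make the Matrix-only routing, the 12h watchdog heartbeat, and the inhibition rule concrete, here is a minimal Alertmanager configuration sketch. The receiver name and webhook URL are assumptions (any Matrix bridge such as hookshot or matrix-alertmanager would sit behind it), not the exact values used in this repo.

```yaml
# Hedged sketch: Matrix-only routing, 12h watchdog heartbeat, and the
# critical-over-warning inhibition described in the commit message.
route:
  receiver: matrix
  group_by: ["alertname", "namespace"]
  routes:
    # The kube-prometheus-stack Watchdog alert always fires; re-notifying it
    # every 12h turns it into a heartbeat, so a dead pipeline becomes visible.
    - matchers:
        - alertname = "Watchdog"
      receiver: matrix
      repeat_interval: 12h

receivers:
  - name: matrix
    webhook_configs:
      # Hypothetical in-cluster Matrix bridge endpoint (e.g. hookshot or
      # matrix-alertmanager); replace with the real service URL.
      - url: http://matrix-alertmanager.comms.svc.cluster.local:3000/alerts

inhibit_rules:
  # A firing critical alert suppresses the warning of the same alert
  # in the same namespace, so one incident produces one notification.
  - source_matchers:
      - severity = "critical"
    target_matchers:
      - severity = "warning"
    equal: ["alertname", "namespace"]
```

The postgres-alerts PrometheusRule below is one of the rule files the stack picks up via the `release: kube-prometheus-stack` label; anything it fires flows through this route.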
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: postgres-alerts
  namespace: data
  labels:
    role: alert-rules
    release: kube-prometheus-stack
spec:
  groups:
    - name: postgres
      rules:
        - alert: PostgresDown
          expr: cnpg_collector_up == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL instance is down"
            description: "CNPG collector reports {{ $labels.namespace }}/{{ $labels.pod }} is down."

        - alert: PostgresDatabaseSizeLarge
          expr: cnpg_pg_database_size_bytes > 7e9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL database size is large"
            description: "Database {{ $labels.datname }} is {{ $value | humanize1024 }} (PVC limit 10Gi)"

        - alert: PostgresHighConnections
          expr: sum by (pod) (cnpg_backends_total) > 80
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL connection count is high"
            description: "Pod {{ $labels.pod }} has {{ $value }} active connections."

        - alert: PostgresBackupStale
          expr: |
            time() - cnpg_collector_last_available_backup_timestamp > 90000
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL backup is stale"
            description: "No successful backup in over 25 hours (daily schedule expected)."

        - alert: PostgresBackupFailed
          expr: |
            cnpg_collector_last_failed_backup_timestamp > cnpg_collector_last_available_backup_timestamp
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL backup failed"
            description: "Last backup failed more recently than last success. Check barman/S3."

        - alert: PostgresWALArchivingStale
          expr: cnpg_pg_stat_archiver_seconds_since_last_archival > 300
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL WAL archiving stale"
            description: "No WAL archived in {{ $value | humanizeDuration }}. Point-in-time recovery may be impossible."

        - alert: PostgresDeadlocks
          expr: rate(cnpg_pg_stat_database_deadlocks[5m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL deadlocks detected"
            description: "Database {{ $labels.datname }} is experiencing deadlocks."
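A quick way to sanity-check rules like these before they ship is promtool's rule unit tests. The sketch below assumes the `spec.groups` section has been extracted into a plain Prometheus rule file named postgres-rules.yaml (promtool reads rule files, not the PrometheusRule CR); file names and series labels are illustrative.

```yaml
# postgres-alerts_test.yaml — run with: promtool test rules postgres-alerts_test.yaml
rule_files:
  - postgres-rules.yaml   # assumed extract of spec.groups from the CR above

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # Collector reports the instance down for six consecutive samples.
      - series: 'cnpg_collector_up{namespace="data", pod="postgres-cluster-1"}'
        values: "0x5"
    alert_rule_test:
      # With for: 2m, the alert should be firing by the 3-minute mark.
      - eval_time: 3m
        alertname: PostgresDown
        exp_alerts:
          - exp_labels:
              severity: critical
              namespace: data
              pod: postgres-cluster-1
            exp_annotations:
              summary: "PostgreSQL instance is down"
              description: "CNPG collector reports data/postgres-cluster-1 is down."
```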