---
# PrometheusRule defining alerts for PostgreSQL instances, evaluated over the
# cnpg_* metrics referenced in the expressions below.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: postgres-alerts
  namespace: data
  labels:
    # NOTE(review): presumably these are the labels the Prometheus ruleSelector
    # matches on; confirm against the Prometheus custom resource.
    role: alert-rules
    release: kube-prometheus-stack
spec:
  groups:
    - name: postgres
      rules:
        # Fires when the collector has reported itself as down for 2 minutes.
        # NOTE(review): `== 0` cannot match if the series disappears entirely;
        # confirm a target-down/absent alert covers that case elsewhere.
        - alert: PostgresDown
          expr: cnpg_collector_up == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL instance is down"
            description: "CNPG collector reports {{ $labels.namespace }}/{{ $labels.pod }} is down."

        # 7e9 bytes is roughly 6.5 GiB, i.e. about 65% of the 10Gi PVC size
        # quoted in the description.
        - alert: PostgresDatabaseSizeLarge
          expr: cnpg_pg_database_size_bytes > 7e9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL database size is large"
            description: "Database {{ $labels.datname }} is {{ $value | humanize1024 }} (PVC limit 10Gi)"

        # Aggregate per (namespace, pod) so that identically named pods in
        # different namespaces are not summed together and the resulting alert
        # keeps its namespace label.
        # NOTE(review): 80 is a fixed threshold; confirm it against the
        # configured max_connections.
        - alert: PostgresHighConnections
          expr: sum by (namespace, pod) (cnpg_backends_total) > 80
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL connection count is high"
            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} active connections."

        # 90000 seconds = 25 hours: one daily backup interval plus one hour of
        # slack, as stated in the description.
        - alert: PostgresBackupStale
          expr: |
            time() - cnpg_collector_last_available_backup_timestamp > 90000
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL backup is stale"
            description: "No successful backup in over 25 hours (daily schedule expected)."

        # Fires while the most recent failure timestamp is newer than the most
        # recent available-backup timestamp.
        - alert: PostgresBackupFailed
          expr: |
            cnpg_collector_last_failed_backup_timestamp > cnpg_collector_last_available_backup_timestamp
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL backup failed"
            description: "Last backup failed more recently than last success. Check barman/S3."

        # 300 seconds since the last archival, sustained for a further 5 minutes.
        - alert: PostgresWALArchivingStale
          expr: cnpg_pg_stat_archiver_seconds_since_last_archival > 300
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL WAL archiving stale"
            description: "No WAL archived in {{ $value | humanizeDuration }}. Point-in-time recovery may be impossible."

        # Requires a non-zero 5-minute deadlock rate at every evaluation for
        # 5 minutes, so this targets sustained deadlocking rather than a single
        # isolated event.
        - alert: PostgresDeadlocks
          expr: rate(cnpg_pg_stat_database_deadlocks[5m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL deadlocks detected"
            description: "Database {{ $labels.datname }} is experiencing deadlocks."