The Longhorn memory leak went undetected for 14 days because alerting was broken (email receiver, missing label selector, no node alerts). This overhaul brings alerting to production grade.

Fixes:
- Alloy Loki URL pointed to deleted loki-gateway, now loki:3100
- seaweedfs-bucket-init crash on unsupported `mc versioning` command
- All PrometheusRules now have `release: kube-prometheus-stack` label
- Removed broken email receiver, Matrix-only alerting

New alert coverage:
- Node: memory, CPU, swap, filesystem, inodes, network, clock skew, OOM
- Kubernetes: deployment down, CronJob failed, pod crash-looping, PVC full
- Backups: Postgres barman stale/failed, WAL archiving, SeaweedFS mirror
- Observability: Prometheus WAL/storage/rules, Loki/Tempo/AlertManager down
- Services: Stalwart, Bulwark, Tuwunel, Sol, Valkey, OpenSearch (smart)
- SLOs: auth stack 99.9% burn rate, Matrix 99.5%, latency p95 < 2s
- Recording rules for Linkerd RED metrics and node aggregates
- Watchdog heartbeat → Matrix every 12h (dead pipeline detection)
- Inhibition: critical suppresses warning for same alert+namespace (see the Alertmanager sketch below)
- OpenSearchClusterYellow only fires with >1 data node (single-node aware)
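To make the Matrix-only routing, the 12h watchdog heartbeat, and the inhibition rule concrete, here is a minimal Alertmanager configuration sketch. The receiver name and webhook URL are assumptions (any Matrix bridge such as hookshot or matrix-alertmanager would sit behind it), not the exact values used in this repo.

```yaml
# Hedged sketch: Matrix-only routing, 12h watchdog heartbeat, and the
# critical-over-warning inhibition described in the commit message.
route:
  receiver: matrix
  group_by: ["alertname", "namespace"]
  routes:
    # The kube-prometheus-stack Watchdog alert always fires; re-notifying it
    # every 12h turns it into a heartbeat, so a dead pipeline becomes visible.
    - matchers:
        - alertname = "Watchdog"
      receiver: matrix
      repeat_interval: 12h

receivers:
  - name: matrix
    webhook_configs:
      # Hypothetical in-cluster Matrix bridge endpoint (e.g. hookshot or
      # matrix-alertmanager); replace with the real service URL.
      - url: http://matrix-alertmanager.comms.svc.cluster.local:3000/alerts

inhibit_rules:
  # A firing critical alert suppresses the warning of the same alert
  # in the same namespace, so one incident produces one notification.
  - source_matchers:
      - severity = "critical"
    target_matchers:
      - severity = "warning"
    equal: ["alertname", "namespace"]
```

The postgres-alerts PrometheusRule below is one of the rule files the stack picks up via the `release: kube-prometheus-stack` label; anything it fires flows through this route.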
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: postgres-alerts
  namespace: data
  labels:
    role: alert-rules
    release: kube-prometheus-stack
spec:
  groups:
    - name: postgres
      rules:
        - alert: PostgresDown
          expr: cnpg_collector_up == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL instance is down"
            description: "CNPG collector reports {{ $labels.namespace }}/{{ $labels.pod }} is down."

        - alert: PostgresDatabaseSizeLarge
          expr: cnpg_pg_database_size_bytes > 7e9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL database size is large"
            description: "Database {{ $labels.datname }} is {{ $value | humanize1024 }} (PVC limit 10Gi)"

        - alert: PostgresHighConnections
          expr: sum by (pod) (cnpg_backends_total) > 80
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL connection count is high"
            description: "Pod {{ $labels.pod }} has {{ $value }} active connections."

        - alert: PostgresBackupStale
          expr: |
            time() - cnpg_collector_last_available_backup_timestamp > 90000
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL backup is stale"
            description: "No successful backup in over 25 hours (daily schedule expected)."

        - alert: PostgresBackupFailed
          expr: |
            cnpg_collector_last_failed_backup_timestamp > cnpg_collector_last_available_backup_timestamp
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL backup failed"
            description: "Last backup failed more recently than last success. Check barman/S3."

        - alert: PostgresWALArchivingStale
          expr: cnpg_pg_stat_archiver_seconds_since_last_archival > 300
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL WAL archiving stale"
            description: "No WAL archived in {{ $value | humanizeDuration }}. Point-in-time recovery may be impossible."

        - alert: PostgresDeadlocks
          expr: rate(cnpg_pg_stat_database_deadlocks[5m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL deadlocks detected"
            description: "Database {{ $labels.datname }} is experiencing deadlocks."
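A quick way to sanity-check rules like these before they ship is promtool's rule unit tests. The sketch below assumes the `spec.groups` section has been extracted into a plain Prometheus rule file named postgres-rules.yaml (promtool reads rule files, not the PrometheusRule CR); file names and series labels are illustrative.

```yaml
# postgres-alerts_test.yaml — run with: promtool test rules postgres-alerts_test.yaml
rule_files:
  - postgres-rules.yaml   # assumed extract of spec.groups from the CR above

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # Collector reports the instance down for six consecutive samples.
      - series: 'cnpg_collector_up{namespace="data", pod="postgres-cluster-1"}'
        values: "0x5"
    alert_rule_test:
      # With for: 2m, the alert should be firing by the 3-minute mark.
      - eval_time: 3m
        alertname: PostgresDown
        exp_alerts:
          - exp_labels:
              severity: critical
              namespace: data
              pod: postgres-cluster-1
            exp_annotations:
              summary: "PostgreSQL instance is down"
              description: "CNPG collector reports data/postgres-cluster-1 is down."
```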