# sbbb/base/data/postgres-alertrules.yaml

---
# PrometheusRule declaring the "postgres" alert group. Every expression
# below reads cnpg_* metrics.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: postgres-alerts
  namespace: data
  labels:
    # NOTE(review): presumably these labels are what the Prometheus
    # ruleSelector matches on -- confirm against the monitoring stack config.
    role: alert-rules
    release: kube-prometheus-stack
spec:
  groups:
    - name: postgres
      rules:
        # Critical once cnpg_collector_up has been 0 for 2 minutes.
        # NOTE(review): a `== 0` comparison returns nothing when the series
        # is missing altogether -- confirm whether a vanished pod needs its
        # own coverage.
        - alert: PostgresDown
          expr: 'cnpg_collector_up == 0'
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: 'PostgreSQL instance is down'
            description: >-
              CNPG collector reports {{ $labels.namespace }}/{{ $labels.pod }}
              is down.

        # Warning when a database stays above 7e9 bytes for 5 minutes; the
        # description text cites a 10Gi PVC limit.
        - alert: PostgresDatabaseSizeLarge
          expr: 'cnpg_pg_database_size_bytes > 7e9'
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'PostgreSQL database size is large'
            description: >-
              Database {{ $labels.datname }} is {{ $value | humanize1024 }}
              (PVC limit 10Gi)

        # Warning when cnpg_backends_total, summed per pod, stays above 80
        # for 5 minutes. `sum by (pod)` keeps only the pod label, so that is
        # the only label the annotations can reference.
        # NOTE(review): the namespace label is dropped by the aggregation --
        # confirm pod names are unique enough for routing.
        - alert: PostgresHighConnections
          expr: 'sum by (pod) (cnpg_backends_total) > 80'
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'PostgreSQL connection count is high'
            description: 'Pod {{ $labels.pod }} has {{ $value }} active connections.'

        # Critical when the last available backup timestamp is more than
        # 90000 s (25 h) behind the current time for 10 minutes.
        # The expression stays a literal block scalar so its value keeps the
        # trailing newline.
        - alert: PostgresBackupStale
          expr: |
            time() - cnpg_collector_last_available_backup_timestamp > 90000
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: 'PostgreSQL backup is stale'
            description: >-
              No successful backup in over 25 hours
              (daily schedule expected).

        # Critical when the last failed backup timestamp is newer than the
        # last available backup timestamp for 5 minutes.
        # Literal block scalar kept for the same trailing-newline reason.
        - alert: PostgresBackupFailed
          expr: |
            cnpg_collector_last_failed_backup_timestamp > cnpg_collector_last_available_backup_timestamp
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: 'PostgreSQL backup failed'
            description: >-
              Last backup failed more recently than last success.
              Check barman/S3.

        # Critical when the seconds-since-last-archival gauge stays above
        # 300 for 5 minutes.
        - alert: PostgresWALArchivingStale
          expr: 'cnpg_pg_stat_archiver_seconds_since_last_archival > 300'
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: 'PostgreSQL WAL archiving stale'
            description: >-
              No WAL archived in {{ $value | humanizeDuration }}.
              Point-in-time recovery may be impossible.

        # Warning when the 5-minute rate of the deadlock counter stays above
        # zero for 5 minutes.
        # NOTE(review): the rate window and the `for` duration are both 5m --
        # confirm whether an isolated deadlock is meant to trigger this, as
        # the condition may not hold long enough to fire.
        - alert: PostgresDeadlocks
          expr: 'rate(cnpg_pg_stat_database_deadlocks[5m]) > 0'
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'PostgreSQL deadlocks detected'
            description: 'Database {{ $labels.datname }} is experiencing deadlocks.'