---
# PrometheusRule consumed by the prometheus-operator (kube-prometheus-stack).
# The `release` label must match the Helm release's ruleSelector or the
# operator will ignore this resource.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: observability-alerts
  namespace: monitoring
  labels:
    role: alert-rules
    release: kube-prometheus-stack
spec:
  groups:
    # --- Prometheus self-monitoring -----------------------------------------
    - name: prometheus
      rules:
        - alert: PrometheusWALCorruption
          expr: increase(prometheus_tsdb_wal_corruptions_total[5m]) > 0
          # for: 0m — fire immediately; any WAL corruption implies data loss.
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Prometheus WAL corruption detected"
            description: "Prometheus detected WAL corruption — data loss may be occurring."
        - alert: PrometheusRuleFailures
          expr: increase(prometheus_rule_evaluation_failures_total[5m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Prometheus rule evaluation failures"
            description: "Some Prometheus rules are failing to evaluate — alerts may not fire."
        - alert: PrometheusStorageFull
          # 85% of a 30Gi PVC: 0.85 * 30 * 2^30 B ≈ 27.4e9 B.
          # (The previous threshold 25.5e9 was 85% of 30 *GB* decimal, ~79% of 30Gi.)
          expr: prometheus_tsdb_storage_blocks_bytes > 27.4e9
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Prometheus storage over 85% of 30Gi PVC"
            # NOTE(review): {{ $value }} is Prometheus alert templating. If this
            # file is ever rendered through Helm, it must be escaped — confirm
            # how the manifest is deployed.
            description: "Prometheus TSDB is using {{ $value | humanize1024 }}B of its 30Gi PVC."
    # --- Loki (logs) --------------------------------------------------------
    - name: loki
      rules:
        - alert: LokiDown
          expr: up{job=~".*loki.*", container="loki"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Loki is down"
            description: "Loki log aggregation is offline — logs are being dropped."
    # --- Tempo (traces) -----------------------------------------------------
    - name: tempo
      rules:
        - alert: TempoDown
          expr: up{job=~".*tempo.*"} == 0
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "Tempo is down"
            description: "Tempo trace backend is offline — traces are being dropped."
    # --- Alertmanager delivery path ------------------------------------------
    - name: alertmanager
      rules:
        - alert: AlertManagerWebhookFailures
          expr: increase(alertmanager_notifications_failed_total{integration="webhook"}[15m]) > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "AlertManager webhook delivery failing"
            description: "AlertManager cannot deliver alerts to Matrix webhook receiver."