The Longhorn memory leak went undetected for 14 days because alerting was broken (email receiver, missing label selector, no node alerts). This overhaul brings alerting to production grade.

Fixes:
- Alloy Loki URL pointed to the deleted loki-gateway, now loki:3100
- seaweedfs-bucket-init crash on the unsupported `mc versioning` command
- All PrometheusRules now have the `release: kube-prometheus-stack` label
- Removed the broken email receiver; Matrix-only alerting (see the Alertmanager sketch below)

New alert coverage:
- Node: memory, CPU, swap, filesystem, inodes, network, clock skew, OOM
- Kubernetes: deployment down, CronJob failed, pod crash-looping, PVC full
- Backups: Postgres barman stale/failed, WAL archiving, SeaweedFS mirror
- Observability: Prometheus WAL/storage/rules, Loki/Tempo/AlertManager down
- Services: Stalwart, Bulwark, Tuwunel, Sol, Valkey, OpenSearch (smart)
- SLOs: auth stack 99.9% burn rate, Matrix 99.5%, latency p95 < 2s (burn-rate sketch after the rule file)
- Recording rules for Linkerd RED metrics and node aggregates
- Watchdog heartbeat → Matrix every 12h (dead pipeline detection)
- Inhibition: critical suppresses warning for the same alert+namespace
- OpenSearchClusterYellow only fires with >1 data node (single-node aware)
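The Matrix-only routing, the 12h Watchdog heartbeat, and the critical-over-warning inhibition are configured on Alertmanager itself rather than in the rule file below. A minimal sketch, assuming a webhook receiver named `matrix`; the receiver name and URL are placeholders, not the values used in the cluster:

```yaml
route:
  receiver: matrix                      # single receiver; the email receiver is gone
  group_by: ["alertname", "namespace"]
  routes:
    - receiver: matrix
      matchers:
        - alertname = "Watchdog"
      repeat_interval: 12h              # the always-firing Watchdog re-notifies Matrix every 12h,
                                        # so a dead alerting pipeline is noticed within half a day

receivers:
  - name: matrix
    webhook_configs:
      - url: http://matrix-alertmanager-receiver.monitoring.svc:8080/alerts  # placeholder URL
        send_resolved: true

inhibit_rules:
  - source_matchers:
      - severity = "critical"
    target_matchers:
      - severity = "warning"
    equal: ["alertname", "namespace"]   # critical suppresses warning for the same alert+namespace
```

The new observability-alerts PrometheusRule follows.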
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: observability-alerts
  namespace: monitoring
  labels:
    role: alert-rules
    release: kube-prometheus-stack
spec:
  groups:
    - name: prometheus
      rules:
        - alert: PrometheusWALCorruption
          expr: increase(prometheus_tsdb_wal_corruptions_total[5m]) > 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Prometheus WAL corruption detected"
            description: "Prometheus detected WAL corruption — data loss may be occurring."

        - alert: PrometheusRuleFailures
          expr: increase(prometheus_rule_evaluation_failures_total[5m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Prometheus rule evaluation failures"
            description: "Some Prometheus rules are failing to evaluate — alerts may not fire."

        - alert: PrometheusStorageFull
          expr: prometheus_tsdb_storage_blocks_bytes > 25.5e9
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Prometheus storage over 85% of 30Gi PVC"
            description: "Prometheus TSDB is using {{ $value | humanize1024 }}B of its 30Gi PVC."

    - name: loki
      rules:
        - alert: LokiDown
          expr: up{job=~".*loki.*", container="loki"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Loki is down"
            description: "Loki log aggregation is offline — logs are being dropped."

    - name: tempo
      rules:
        - alert: TempoDown
          expr: up{job=~".*tempo.*"} == 0
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "Tempo is down"
            description: "Tempo trace backend is offline — traces are being dropped."

    - name: alertmanager
      rules:
        - alert: AlertManagerWebhookFailures
          expr: increase(alertmanager_notifications_failed_total{integration="webhook"}[15m]) > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "AlertManager webhook delivery failing"
            description: "AlertManager cannot deliver alerts to Matrix webhook receiver."
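The SLO burn-rate rules mentioned above live in separate rule files. As a sketch of the fast-burn alert for the 99.9% auth-stack target, built on Linkerd's `response_total` success classification; the recording-rule names, the `namespace="auth"` selector, and the window choices are illustrative assumptions, not the exact rules shipped here:

```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: slo-auth-burn-rate            # illustrative name
  namespace: monitoring
  labels:
    role: alert-rules
    release: kube-prometheus-stack
spec:
  groups:
    - name: slo-auth
      rules:
        # Error ratio from Linkerd's response classification, long and short windows.
        # The "auth" namespace selector and rule names are assumptions.
        - record: namespace:linkerd_response_error_ratio:rate1h
          expr: |
            1 - (
              sum(rate(response_total{namespace="auth", classification="success"}[1h]))
              /
              sum(rate(response_total{namespace="auth"}[1h]))
            )
        - record: namespace:linkerd_response_error_ratio:rate5m
          expr: |
            1 - (
              sum(rate(response_total{namespace="auth", classification="success"}[5m]))
              /
              sum(rate(response_total{namespace="auth"}[5m]))
            )
        # Fast burn: error ratio above 14.4x the 0.1% budget on both windows.
        - alert: AuthStackErrorBudgetFastBurn
          expr: |
            namespace:linkerd_response_error_ratio:rate1h > (14.4 * 0.001)
            and
            namespace:linkerd_response_error_ratio:rate5m > (14.4 * 0.001)
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Auth stack is burning its 99.9% error budget too fast"
            description: "Error ratio is above 14.4x the budget on both the 1h and 5m windows."
```

The 14.4x factor is the standard multiwindow burn-rate threshold for a 99.9% target: at that rate a 30-day error budget is gone in roughly two days, and requiring both the 1h and 5m windows to exceed it keeps the alert from flapping on short error spikes.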