# sbbb/base/monitoring/alertrules-observability.yaml
---
# PrometheusRule consumed by the prometheus-operator (selected via the
# kube-prometheus-stack release label). Defines availability and health
# alerts for the observability stack itself: Prometheus, Loki, Tempo,
# and Alertmanager notification delivery.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: observability-alerts
  namespace: monitoring
  labels:
    role: alert-rules
    # Must match the ruleSelector of the kube-prometheus-stack Prometheus CR.
    release: kube-prometheus-stack
spec:
  groups:
    # Prometheus self-monitoring: WAL integrity, rule evaluation, TSDB growth.
    - name: prometheus
      rules:
        - alert: PrometheusWALCorruption
          expr: increase(prometheus_tsdb_wal_corruptions_total[5m]) > 0
          # Fire immediately — any WAL corruption implies possible data loss.
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Prometheus WAL corruption detected"
            description: "Prometheus detected WAL corruption — data loss may be occurring."
        - alert: PrometheusRuleFailures
          expr: increase(prometheus_rule_evaluation_failures_total[5m]) > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Prometheus rule evaluation failures"
            description: "Some Prometheus rules are failing to evaluate — alerts may not fire."
        - alert: PrometheusStorageFull
          # 85% of the 30Gi PVC: 0.85 * 30 * 2^30 bytes ≈ 27.4e9.
          # (Previous threshold 25.5e9 was 85% of 30 decimal GB, ~79% of 30Gi.)
          expr: prometheus_tsdb_storage_blocks_bytes > 27.4e9
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Prometheus storage over 85% of 30Gi PVC"
            description: "Prometheus TSDB is using {{ $value | humanize1024 }}B of its 30Gi PVC."
    # Loki log aggregation availability.
    - name: loki
      rules:
        - alert: LokiDown
          expr: up{job=~".*loki.*", container="loki"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Loki is down"
            description: "Loki log aggregation is offline — logs are being dropped."
    # Tempo trace backend availability.
    - name: tempo
      rules:
        - alert: TempoDown
          expr: up{job=~".*tempo.*"} == 0
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "Tempo is down"
            description: "Tempo trace backend is offline — traces are being dropped."
    # Alertmanager delivery health: failures here mean alerts never reach humans.
    - name: alertmanager
      rules:
        - alert: AlertManagerWebhookFailures
          expr: increase(alertmanager_notifications_failed_total{integration="webhook"}[15m]) > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "AlertManager webhook delivery failing"
            description: "AlertManager cannot deliver alerts to Matrix webhook receiver."