# File: sbbb/base/monitoring/alertrules-slo.yaml
# As captured from the repository viewer: 63 lines, 2.2 KiB, YAML.
# (Viewer links: Raw | Normal View | History)

---
# PrometheusRule "slo-alerts": SLO-based alerts for the auth stack, the
# Matrix homeserver, and service latency.
#
# The expressions read two series that are not defined in this manifest:
# service:error_rate:5m and service:latency_p95:5m.
# NOTE(review): presumably recording rules maintained elsewhere. The
# thresholds below assume the error rate is a 0-1 ratio and the latency is
# in milliseconds - confirm against those definitions.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: slo-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably what the Prometheus ruleSelector matches on
    # to load this rule file - confirm against the Prometheus resource.
    role: alert-rules
    release: kube-prometheus-stack
spec:
  groups:
    # SLO: Kratos/Hydra auth stack — 99.9% availability (43 min/month budget)
    - name: slo-auth
      rules:
        # Fast burn: 0.0144 is 14.4 x the 0.1% error budget; must hold for
        # 2m before firing. One alert per matching deployment.
        - alert: AuthErrorBudgetFastBurn
          expr: |
            service:error_rate:5m{deployment=~"kratos|hydra"} > 0.0144
          for: 2m
          labels:
            severity: critical
            slo: auth-availability
          annotations:
            summary: "Auth stack burning error budget at 14.4x rate"
            description: "{{ $labels.deployment }} error rate is {{ $value | humanizePercentage }} (14.4x burn rate for 99.9% SLO)."
        # Slow burn: 0.003 is 3 x the 0.1% error budget; must hold for 1h
        # before firing, so only sustained degradation raises a warning.
        - alert: AuthErrorBudgetSlowBurn
          expr: |
            service:error_rate:5m{deployment=~"kratos|hydra"} > 0.003
          for: 1h
          labels:
            severity: warning
            slo: auth-availability
          annotations:
            summary: "Auth stack slowly burning error budget"
            description: "{{ $labels.deployment }} error rate is {{ $value | humanizePercentage }} (3x burn rate for 99.9% SLO)."
    # SLO: Tuwunel Matrix homeserver — 99.5% availability (3.6 hr/month budget)
    - name: slo-matrix
      rules:
        # Fast burn: 0.072 is 14.4 x the 0.5% error budget; must hold for
        # 2m before firing. This group defines no slow-burn counterpart.
        - alert: MatrixErrorBudgetFastBurn
          expr: |
            service:error_rate:5m{deployment="tuwunel"} > 0.072
          for: 2m
          labels:
            severity: critical
            slo: matrix-availability
          annotations:
            summary: "Matrix homeserver burning error budget at 14.4x rate"
            description: "Tuwunel error rate is {{ $value | humanizePercentage }}."
    # SLO: All services — latency p95 under 2s
    - name: slo-latency
      rules:
        # No label selector, so every series of service:latency_p95:5m is
        # evaluated. The 2000 threshold matches the 2s SLO only if the
        # series is in milliseconds (the description appends "ms").
        - alert: ServiceLatencyBudgetBurn
          expr: |
            service:latency_p95:5m > 2000
          for: 10m
          labels:
            severity: warning
            slo: latency
          annotations:
            summary: "Service p95 latency exceeds 2s SLO"
            description: "{{ $labels.deployment }} in {{ $labels.namespace }} p95 latency is {{ $value }}ms."