---
# PrometheusRule defining SLO burn-rate and latency alerts.
#
# All expressions read pre-aggregated series (service:error_rate:5m and
# service:latency_p95:5m). Those recording rules are not defined in this
# manifest -- NOTE(review): confirm they exist and are loaded before these
# alerts, otherwise the expressions silently return no data.
#
# Burn-rate thresholds are "error budget x burn multiplier", where the error
# budget is (1 - SLO target):
#   99.9% SLO -> budget 0.001 -> 14.4x = 0.0144, 3x = 0.003
#   99.5% SLO -> budget 0.005 -> 14.4x = 0.072
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: slo-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably matched by the Prometheus ruleSelector of the
    # kube-prometheus-stack release -- confirm against the Prometheus CR.
    role: alert-rules
    release: kube-prometheus-stack
spec:
  groups:
    # SLO: Kratos/Hydra auth stack — 99.9% availability (43 min/month budget)
    - name: slo-auth
      rules:
        # Fast burn: error rate above 14.4x the 0.1% budget, sustained 2m.
        - alert: AuthErrorBudgetFastBurn
          expr: |
            service:error_rate:5m{deployment=~"kratos|hydra"} > 0.0144
          for: 2m
          labels:
            severity: critical
            slo: auth-availability
          annotations:
            summary: "Auth stack burning error budget at 14.4x rate"
            description: "{{ $labels.deployment }} error rate is {{ $value | humanizePercentage }} (14.4x burn rate for 99.9% SLO)."

        # Slow burn: error rate above 3x the 0.1% budget, sustained 1h.
        - alert: AuthErrorBudgetSlowBurn
          expr: |
            service:error_rate:5m{deployment=~"kratos|hydra"} > 0.003
          for: 1h
          labels:
            severity: warning
            slo: auth-availability
          annotations:
            summary: "Auth stack slowly burning error budget"
            description: "{{ $labels.deployment }} error rate is {{ $value | humanizePercentage }} (3x burn rate for 99.9% SLO)."

    # SLO: Tuwunel Matrix homeserver — 99.5% availability (3.6 hr/month budget)
    # NOTE(review): only a fast-burn alert is defined for this SLO; there is
    # no slow-burn counterpart like the auth group has -- confirm intentional.
    - name: slo-matrix
      rules:
        # Fast burn: error rate above 14.4x the 0.5% budget, sustained 2m.
        - alert: MatrixErrorBudgetFastBurn
          expr: |
            service:error_rate:5m{deployment="tuwunel"} > 0.072
          for: 2m
          labels:
            severity: critical
            slo: matrix-availability
          annotations:
            summary: "Matrix homeserver burning error budget at 14.4x rate"
            description: "Tuwunel error rate is {{ $value | humanizePercentage }}."

    # SLO: All services — latency p95 under 2s
    - name: slo-latency
      rules:
        # No label matcher: fires per series for every service exposing the
        # metric. The threshold 2000 is compared against the raw series value;
        # NOTE(review): presumably milliseconds (the description appends
        # "ms") -- confirm the recording rule's unit.
        - alert: ServiceLatencyBudgetBurn
          expr: |
            service:latency_p95:5m > 2000
          for: 10m
          labels:
            severity: warning
            slo: latency
          annotations:
            summary: "Service p95 latency exceeds 2s SLO"
            description: "{{ $labels.deployment }} in {{ $labels.namespace }} p95 latency is {{ $value }}ms."