---
# PrometheusRule "slo-alerts": SLO alerts for the Kratos/Hydra auth stack,
# the Tuwunel Matrix homeserver, and per-service p95 latency.
#
# The service:error_rate:5m and service:latency_p95:5m series are not defined
# in this manifest; presumably they are recording rules defined elsewhere —
# confirm they exist before relying on these alerts.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: slo-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably these labels are what the Prometheus instance's
    # rule selector matches on — confirm against the Prometheus resource.
    role: alert-rules
    release: kube-prometheus-stack
spec:
  groups:
    # SLO: Kratos/Hydra auth stack — 99.9% availability (43 min/month budget)
    - name: slo-auth
      rules:
        # Fast burn: 0.0144 = 14.4 x the 0.1% error budget of a 99.9% SLO.
        - alert: AuthErrorBudgetFastBurn
          expr: |
            service:error_rate:5m{deployment=~"kratos|hydra"} > 0.0144
          for: 2m
          labels:
            severity: critical
            slo: auth-availability
          annotations:
            summary: "Auth stack burning error budget at 14.4x rate"
            description: "{{ $labels.deployment }} error rate is {{ $value | humanizePercentage }} (14.4x burn rate for 99.9% SLO)."

        # Slow burn: 0.003 = 3 x the 0.1% error budget, sustained for 1h.
        - alert: AuthErrorBudgetSlowBurn
          expr: |
            service:error_rate:5m{deployment=~"kratos|hydra"} > 0.003
          for: 1h
          labels:
            severity: warning
            slo: auth-availability
          annotations:
            summary: "Auth stack slowly burning error budget"
            description: "{{ $labels.deployment }} error rate is {{ $value | humanizePercentage }} (3x burn rate for 99.9% SLO)."

    # SLO: Tuwunel Matrix homeserver — 99.5% availability (3.6 hr/month budget)
    - name: slo-matrix
      rules:
        # Fast burn: 0.072 = 14.4 x the 0.5% error budget of a 99.5% SLO.
        - alert: MatrixErrorBudgetFastBurn
          expr: |
            service:error_rate:5m{deployment="tuwunel"} > 0.072
          for: 2m
          labels:
            severity: critical
            slo: matrix-availability
          annotations:
            summary: "Matrix homeserver burning error budget at 14.4x rate"
            description: "Tuwunel error rate is {{ $value | humanizePercentage }}."

    # SLO: All services — latency p95 under 2s
    - name: slo-latency
      rules:
        # No label matcher: evaluated for every series of the metric.
        # NOTE(review): the 2000 threshold assumes service:latency_p95:5m is
        # recorded in milliseconds (the description prints the value as "ms")
        # — confirm against the recording rule.
        - alert: ServiceLatencyBudgetBurn
          expr: |
            service:latency_p95:5m > 2000
          for: 10m
          labels:
            severity: warning
            slo: latency
          annotations:
            summary: "Service p95 latency exceeds 2s SLO"
            description: "{{ $labels.deployment }} in {{ $labels.namespace }} p95 latency is {{ $value }}ms."