feat: add PrometheusRule alerts for all services
28 alert rules across 9 PrometheusRule files covering infrastructure (Longhorn, cert-manager), data (PostgreSQL, OpenBao, OpenSearch), storage (SeaweedFS), devtools (Gitea), identity (Hydra, Kratos), media (LiveKit), and mesh (Linkerd golden signals for all services). Severity routing: critical alerts fire to Matrix + email; warnings go to Matrix only (Alertmanager config updated in a separate commit).
This commit is contained in:
@@ -13,6 +13,9 @@ resources:
|
||||
# Hydra chart CRDs are not rendered by helm template; apply manually.
|
||||
- hydra-oauth2client-crd.yaml
|
||||
- vault-secrets.yaml
|
||||
- ory-alertrules.yaml
|
||||
- hydra-servicemonitor.yaml
|
||||
- kratos-servicemonitor.yaml
|
||||
|
||||
patches:
|
||||
# Set Kratos selfservice UI URLs (DOMAIN_SUFFIX substituted at apply time).
|
||||
|
||||
46
base/ory/ory-alertrules.yaml
Normal file
46
base/ory/ory-alertrules.yaml
Normal file
@@ -0,0 +1,46 @@
|
||||
# Alert rules for the Ory identity stack (Hydra and Kratos).
# Each service gets two rules: an availability alert on the `up` series and a
# warning when the share of 5xx responses exceeds 5%.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: ory-alerts
  namespace: ory
  labels:
    # NOTE(review): presumably matched by the Prometheus ruleSelector; confirm
    # against the Prometheus resource, which is not visible here.
    role: alert-rules
spec:
  groups:
    - name: ory
      rules:
        # --- Availability -------------------------------------------------
        # Fires once any target whose job label contains "hydra" has
        # reported up == 0 continuously for 2 minutes.
        - alert: HydraDown
          expr: up{job=~".*hydra.*"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Hydra is down"
            description: "Hydra instance {{ $labels.namespace }}/{{ $labels.pod }} is down."

        # Same check for targets whose job label contains "kratos".
        - alert: KratosDown
          expr: up{job=~".*kratos.*"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Kratos is down"
            description: "Kratos instance {{ $labels.namespace }}/{{ $labels.pod }} is down."

        # --- Error rate ---------------------------------------------------
        # Ratio of 5xx request rate to total request rate over a 5m window,
        # summed across all matching series; must stay above 0.05 for 5m.
        # The folded scalar below joins its lines with single spaces, so the
        # resulting expression is one line.
        - alert: HydraHighErrorRate
          expr: >-
            sum(rate(http_requests_total{job=~".*hydra.*",code=~"5.."}[5m]))
            / sum(rate(http_requests_total{job=~".*hydra.*"}[5m]))
            > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Hydra has a high HTTP error rate"
            description: "Hydra 5xx error rate is {{ $value | humanizePercentage }}."

        # Same ratio and threshold for Kratos.
        - alert: KratosHighErrorRate
          expr: >-
            sum(rate(http_requests_total{job=~".*kratos.*",code=~"5.."}[5m]))
            / sum(rate(http_requests_total{job=~".*kratos.*"}[5m]))
            > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Kratos has a high HTTP error rate"
            description: "Kratos 5xx error rate is {{ $value | humanizePercentage }}."
|
||||
Reference in New Issue
Block a user