feat: add PrometheusRule alerts for all services

28 alert rules across 9 PrometheusRule files covering infrastructure
(Longhorn, cert-manager), data (PostgreSQL, OpenBao, OpenSearch),
storage (SeaweedFS), devtools (Gitea), identity (Hydra, Kratos),
media (LiveKit), and mesh (Linkerd golden signals for all services).

Severity routing: critical alerts are routed to Matrix and email, warnings
to Matrix only (the Alertmanager config is updated in a separate commit).
This commit is contained in:
2026-03-24 12:20:55 +00:00
parent 74bb59cfdc
commit 3fc54c8851
15 changed files with 363 additions and 2 deletions

View File

@@ -3,6 +3,7 @@ kind: Kustomization
resources:
- namespace.yaml
- linkerd-alertrules.yaml
# NOTE: Linkerd stable releases moved behind a commercial paywall in Feb 2024.
# As of 2.15, stable artifacts are Buoyant Enterprise for Linkerd (BEL) only.

View File

@@ -0,0 +1,44 @@
# Linkerd mesh golden-signal alerts: inbound error rate (warning and critical
# tiers) and inbound p95 latency, each evaluated per (deployment, namespace).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: linkerd-mesh-alerts
  namespace: mesh
  labels:
    # NOTE(review): presumably the label the Prometheus ruleSelector matches
    # on - confirm against the Prometheus custom resource.
    role: alert-rules
spec:
  groups:
    - name: linkerd-mesh
      rules:
        # Warning tier: more than 5% of inbound responses classified as
        # failures, sustained for 5 minutes.
        # NOTE(review): series that carry no `deployment` label would be
        # grouped under an empty value per namespace - confirm that the scrape
        # relabeling sets `deployment` for every meshed workload.
        - alert: ServiceHighErrorRate
          expr: |
            sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
            / sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
            > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Service has high error rate"
            description: "{{ $labels.deployment }} in {{ $labels.namespace }} has {{ $value | humanizePercentage }} error rate"
        # Critical tier: same ratio as above with a 25% threshold and a
        # shorter 2-minute hold. The warning rule above also matches while
        # this one fires, because its expression has no upper bound.
        - alert: ServiceHighErrorRateCritical
          expr: |
            sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
            / sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
            > 0.25
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Service has critically high error rate"
            description: "{{ $labels.deployment }} in {{ $labels.namespace }} has {{ $value | humanizePercentage }} error rate"
        # Latency: p95 of inbound response latency, estimated from the
        # response_latency_ms histogram buckets, above 2000 ms for 5 minutes.
        - alert: ServiceHighLatency
          expr: |
            histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{direction="inbound"}[5m])) by (le, deployment, namespace)) > 2000
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Service has high p95 latency"
            # Round to whole milliseconds; the raw $value is an unformatted
            # float. Single-quoted so the printf format string needs no
            # YAML escaping.
            description: '{{ $labels.deployment }} in {{ $labels.namespace }} p95 latency is {{ printf "%.0f" $value }}ms'