Adds 28 alert rules across 9 PrometheusRule files covering infrastructure (Longhorn, cert-manager), data (PostgreSQL, OpenBao, OpenSearch), storage (SeaweedFS), devtools (Gitea), identity (Hydra, Kratos), media (LiveKit), and mesh (Linkerd golden signals for all services). Severity routing: critical alerts are sent to Matrix and email; warnings are sent to Matrix only (the Alertmanager config is updated in a separate commit).
45 lines
1.7 KiB
YAML
45 lines
1.7 KiB
YAML
# PrometheusRule defining error-rate and latency alerts computed from the
# `response_total` and `response_latency_ms_bucket` series, aggregated per
# (deployment, namespace).
# NOTE(review): these look like Linkerd proxy metrics, and the `severity` label
# is presumably what Alertmanager routes on -- confirm against the scrape and
# Alertmanager configuration, neither of which is visible in this file.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: linkerd-mesh-alerts
  namespace: mesh
  labels:
    # NOTE(review): presumably the label the Prometheus ruleSelector matches
    # on -- verify against the Prometheus resource.
    role: alert-rules
spec:
  groups:
    - name: linkerd-mesh
      rules:
        # Warning: more than 5% of inbound responses classified as failure,
        # sustained for 5 minutes.
        # NOTE(review): there is no minimum-traffic guard, so a few failures on
        # a low-traffic deployment can exceed the ratio. When the denominator
        # is zero the division yields NaN, which does not satisfy `>`.
        - alert: ServiceHighErrorRate
          expr: |
            sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
            / sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
            > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Service has high error rate"
            description: "{{ $labels.deployment }} in {{ $labels.namespace }} has {{ $value | humanizePercentage }} error rate"

        # Critical: same failure ratio as above, but over 25% and sustained
        # for only 2 minutes.
        # NOTE(review): above 25% the warning rule's expression matches as well,
        # so both alerts can be active for the same deployment unless
        # Alertmanager inhibits the warning -- confirm an inhibit rule exists.
        - alert: ServiceHighErrorRateCritical
          expr: |
            sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
            / sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
            > 0.25
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Service has critically high error rate"
            description: "{{ $labels.deployment }} in {{ $labels.namespace }} has {{ $value | humanizePercentage }} error rate"

        # Warning: 95th-percentile inbound response latency above 2000,
        # sustained for 5 minutes. The metric name and the "ms" suffix in the
        # description indicate the unit is milliseconds (i.e. 2 s).
        - alert: ServiceHighLatency
          expr: |
            histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{direction="inbound"}[5m])) by (le, deployment, namespace)) > 2000
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Service has high p95 latency"
            description: "{{ $labels.deployment }} in {{ $labels.namespace }} p95 latency is {{ $value }}ms"