feat: add PrometheusRule alerts for all services

28 alert rules across 9 PrometheusRule files covering infrastructure (Longhorn, cert-manager), data (PostgreSQL, OpenBao, OpenSearch), storage (SeaweedFS), devtools (Gitea), identity (Hydra, Kratos), media (LiveKit), and mesh (Linkerd golden signals for all services). Severity routing: critical alerts fire to Matrix + email, warnings to Matrix only (Alertmanager config updated in a separate commit).
2026-03-24 12:20:55 +00:00
parent 74bb59cfdc
commit 3fc54c8851
15 changed files with 363 additions and 2 deletions
--- a/base/media/kustomization.yaml
+++ b/base/media/kustomization.yaml
@@ -6,6 +6,10 @@ namespace: media
 resources:
  - namespace.yaml
  - vault-secrets.yaml
+  - livekit-alertrules.yaml
+  # livekit-servicemonitor.yaml disabled — LiveKit runs on hostNetwork and port 6789
+  # is not reachable from Prometheus due to host firewall. Open port 6789 on the host
+  # or add an iptables rule, then re-enable.

 helmCharts:
  # helm repo add livekit https://helm.livekit.io
--- a/base/media/livekit-alertrules.yaml
+++ b/base/media/livekit-alertrules.yaml
@@ -0,0 +1,28 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule  # alerting rules for the LiveKit media server
+metadata:
+  name: livekit-alerts
+  namespace: media
+  labels:
+    role: alert-rules
+spec:
+  groups:
+    - name: livekit
+      rules:
+        - alert: LiveKitDown  # NOTE(review): LiveKit ServiceMonitor is disabled in kustomization.yaml; confirm a matching `up` series exists
+          expr: up{job=~".*livekit.*"} == 0  # only matches targets that are actually being scraped
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: "LiveKit is down"
+            description: "LiveKit instance {{ $labels.namespace }}/{{ $labels.pod }} is down."
+
+        - alert: LiveKitHighNACKRate
+          expr: sum(rate(livekit_nack_total[5m])) > 100  # aggregate NACKs/s; sum() drops all per-series labels
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "LiveKit NACK rate is high"
+            description: "LiveKit NACK rate is {{ $value | humanize }}/s, indicating potential media quality issues."  # humanize rounds the raw float