feat: add PrometheusRule alerts for all services
28 alert rules across 9 PrometheusRule files covering infrastructure (Longhorn, cert-manager), data (PostgreSQL, OpenBao, OpenSearch), storage (SeaweedFS), devtools (Gitea), identity (Hydra, Kratos), media (LiveKit), and mesh (Linkerd golden signals for all services). Severity routing: critical alerts fire to Matrix + email, warnings to Matrix only (Alertmanager config updated in a separate commit).
This commit is contained in:
55
base/monitoring/alertrules-infrastructure.yaml
Normal file
55
base/monitoring/alertrules-infrastructure.yaml
Normal file
@@ -0,0 +1,55 @@
|
||||
---
# PrometheusRule with infrastructure alerts for Longhorn storage and
# cert-manager certificates. Every rule carries a `severity` label
# (critical or warning).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: infrastructure-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably matched by the Prometheus ruleSelector - confirm.
    role: alert-rules
spec:
  groups:
    - name: infrastructure
      rules:
        # Fires when a Longhorn disk's used/capacity ratio stays above 0.90
        # for 5 minutes.
        - alert: LonghornDiskSpaceLow
          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) > 0.90
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Longhorn disk space critically low"
            # The alert is evaluated per disk series, so name the disk as
            # well as the node. $value is the ratio matched by the expression.
            description: >-
              Longhorn disk {{ $labels.disk }} on {{ $labels.node }} is over
              90% full (currently {{ $value | humanizePercentage }}).

        # Fires when a volume's actual-size/capacity ratio stays above 0.85
        # for 5 minutes.
        - alert: LonghornVolumeSpaceLow
          expr: (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Longhorn volume space low"
            description: >-
              Longhorn volume {{ $labels.volume }} is over 85% full
              (currently {{ $value | humanizePercentage }}).

        # 86400 s = 24 h. $value is the number of seconds until expiry.
        - alert: CertExpiringCritical
          expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 86400
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "Certificate expiring in less than 24 hours"
            description: >-
              Certificate {{ $labels.name }} in {{ $labels.namespace }}
              expires in less than 24 hours
              ({{ $value | humanizeDuration }} remaining).

        # 604800 s = 7 d.
        # NOTE(review): this expression also matches certificates already
        # below the 24 h threshold of CertExpiringCritical, so both alerts
        # can be active at once - consider an Alertmanager inhibit rule.
        - alert: CertExpiringSoon
          expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 604800
          for: 30m
          labels:
            severity: warning
          annotations:
            summary: "Certificate expiring in less than 7 days"
            description: >-
              Certificate {{ $labels.name }} in {{ $labels.namespace }}
              expires in less than 7 days
              ({{ $value | humanizeDuration }} remaining).

        # Fires when the condition="True" ready-status series is not 1 for
        # 10 minutes.
        - alert: CertNotReady
          expr: certmanager_certificate_ready_status{condition="True"} != 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Certificate not ready"
            description: >-
              Certificate {{ $labels.name }} in {{ $labels.namespace }} is
              not in a ready state.
|
||||
Reference in New Issue
Block a user