# File: sbbb/base/monitoring/alertrules-infrastructure.yaml
# Viewer metadata at capture time: 56 lines, 2.0 KiB, YAML.

---
# PrometheusRule (Prometheus Operator CRD) holding the "infrastructure" alert
# group: Longhorn storage capacity alerts and cert-manager certificate alerts.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: infrastructure-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably the label the Prometheus resource's rule
    # selector matches on -- confirm against the Prometheus CR.
    role: alert-rules
spec:
  groups:
    - name: infrastructure
      rules:
        # --- Longhorn storage ---

        # Critical when a disk's usage/capacity ratio stays above 0.90 for 5m.
        - alert: LonghornDiskSpaceLow
          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) > 0.90
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Longhorn disk space critically low"
            description: "Longhorn disk on {{ $labels.node }} is over 90% full."

        # Warning when a volume's actual-size/capacity ratio stays above 0.85
        # for 5m.
        - alert: LonghornVolumeSpaceLow
          expr: (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Longhorn volume space low"
            description: "Longhorn volume {{ $labels.volume }} is over 85% full."

        # --- cert-manager certificates ---

        # Critical when the expiration timestamp is less than 86400 seconds
        # (24 hours) ahead of the current evaluation time, held for 10m.
        - alert: CertExpiringCritical
          expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 86400
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "Certificate expiring in less than 24 hours"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 24 hours"

        # Warning when the expiration timestamp is less than 604800 seconds
        # (7 days) ahead of the current evaluation time, held for 30m.
        # NOTE(review): this condition is also true for every certificate that
        # matches CertExpiringCritical, so both alerts fire for the same
        # certificate unless the warning is inhibited elsewhere -- confirm the
        # Alertmanager configuration.
        - alert: CertExpiringSoon
          expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 604800
          for: 30m
          labels:
            severity: warning
          annotations:
            summary: "Certificate expiring in less than 7 days"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 7 days"

        # Warning when the ready-status series labelled condition="True" has a
        # value other than 1 for 10m.
        - alert: CertNotReady
          expr: certmanager_certificate_ready_status{condition="True"} != 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Certificate not ready"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} is not in a ready state."