# base/monitoring/alertrules-infrastructure.yaml

# PrometheusRule custom resource holding the alert groups defined below.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: infrastructure-alerts
  namespace: monitoring
  labels:
    role: alert-rules
    # NOTE(review): presumably the label the Prometheus rule selector of the
    # kube-prometheus-stack release matches on — confirm against the
    # Prometheus CR before renaming or removing it.
    release: kube-prometheus-stack
spec:
  groups:
    # Storage (longhorn_* metrics) and certificate (certmanager_* metrics)
    # alerts.
    - name: infrastructure
      rules:
        # Critical when a Longhorn disk's usage/capacity ratio stays above
        # 0.90 for 5 minutes.
        # The description names the disk as well as the node and reports the
        # measured ratio, so the alert is unambiguous on nodes with more than
        # one disk.
        # NOTE(review): assumes the longhorn_disk_* series carry a `disk`
        # label — confirm against the deployed Longhorn version.
        - alert: LonghornDiskSpaceLow
          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) > 0.90
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Longhorn disk space critically low"
            description: "Longhorn disk {{ $labels.disk }} on {{ $labels.node }} is over 90% full ({{ $value | humanizePercentage }} used)."

        # Warns when a Longhorn volume's actual-size/capacity ratio stays
        # above 0.85 for 5 minutes.
        - alert: LonghornVolumeSpaceLow
          expr: >-
            (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes)
            > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'Longhorn volume space low'
            description: 'Longhorn volume {{ $labels.volume }} is over 85% full.'

        # Critical when a certificate's expiration timestamp is less than
        # 86400 seconds (24 hours) ahead of the evaluation time for 10 minutes.
        - alert: CertExpiringCritical
          expr: >-
            (certmanager_certificate_expiration_timestamp_seconds - time())
            < 86400
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: 'Certificate expiring in less than 24 hours'
            description: >-
              Certificate {{ $labels.name }} in {{ $labels.namespace }}
              expires in less than 24 hours

        # Warns when a certificate's expiration timestamp is less than
        # 604800 seconds (7 days) ahead of the evaluation time for 30 minutes.
        # The condition has no lower bound, so it also holds inside the final
        # 24 hours covered by CertExpiringCritical.
        - alert: CertExpiringSoon
          expr: >-
            (certmanager_certificate_expiration_timestamp_seconds - time())
            < 604800
          for: 30m
          labels:
            severity: warning
          annotations:
            summary: 'Certificate expiring in less than 7 days'
            description: >-
              Certificate {{ $labels.name }} in {{ $labels.namespace }}
              expires in less than 7 days

        # Warns when the ready-status series labelled condition="True" holds
        # a value other than 1 for 10 minutes.
        - alert: CertNotReady
          expr: 'certmanager_certificate_ready_status{condition="True"} != 1'
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: 'Certificate not ready'
            description: >-
              Certificate {{ $labels.name }} in {{ $labels.namespace }}
              is not in a ready state.

    # Host-level alerts built on node_* metrics: memory, swap, CPU,
    # filesystem space and inodes, network errors, clock offset, OOM kills.
    - name: node
      rules:
        # Warns when 1 - MemAvailable/MemTotal stays above 0.85 for 5 minutes.
        - alert: NodeMemoryHigh
          expr: >-
            (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
            > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'Node memory usage above 85%'
            description: >-
              {{ $labels.instance }} memory usage is
              {{ $value | humanizePercentage }}.

        # Critical tier of the node memory check: same ratio as the warning
        # rule, with a 0.95 threshold and a shorter 2-minute hold.
        - alert: NodeMemoryCritical
          expr: >-
            (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
            > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: 'Node memory usage above 95%'
            description: >-
              {{ $labels.instance }} memory usage is
              {{ $value | humanizePercentage }}.
              OOM kills imminent.

        # Warns when SwapFree/SwapTotal stays below 0.50 for 10 minutes.
        # $value is the free fraction, which is why the description reports
        # it as "free" while the summary speaks of usage.
        - alert: NodeSwapActive
          expr: >-
            node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes
            < 0.50
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: 'Node swap usage above 50%'
            description: >-
              {{ $labels.instance }} swap is
              {{ $value | humanizePercentage }} free.
              System is under memory pressure.

        # Warns when 1 minus the per-instance average idle-mode CPU rate
        # (5-minute window) stays above 0.90 for 15 minutes.
        - alert: NodeCPUHigh
          expr: >-
            1 - avg by (instance)
            (rate(node_cpu_seconds_total{mode="idle"}[5m]))
            > 0.90
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: 'Node CPU usage above 90% for 15 minutes'
            description: >-
              {{ $labels.instance }} CPU usage is
              {{ $value | humanizePercentage }}.

        # Warns when 1 - avail/size stays above 0.85 for 5 minutes on any
        # filesystem whose fstype does not match tmpfs or overlay.
        - alert: NodeFilesystemFull
          expr: >-
            (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
            / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})
            > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'Filesystem usage above 85%'
            description: >-
              {{ $labels.mountpoint }} on {{ $labels.instance }} is
              {{ $value | humanizePercentage }} full.

        # Critical tier of the filesystem check: same ratio and fstype filter
        # as the warning rule, with a 0.95 threshold and a 2-minute hold.
        - alert: NodeFilesystemCritical
          expr: >-
            (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
            / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})
            > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: 'Filesystem usage above 95%'
            description: >-
              {{ $labels.mountpoint }} on {{ $labels.instance }} is
              {{ $value | humanizePercentage }} full.

        # Warns when files_free/files (the inode headroom, per the
        # annotations) stays below 0.05 for 5 minutes on any filesystem whose
        # fstype does not match tmpfs or overlay.
        - alert: NodeFilesystemFilesRunningOut
          expr: >-
            node_filesystem_files_free{fstype!~"tmpfs|overlay"}
            / node_filesystem_files{fstype!~"tmpfs|overlay"}
            < 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'Filesystem inodes running low'
            description: >-
              {{ $labels.mountpoint }} on {{ $labels.instance }}
              has less than 5% inodes free.

        # Warns when either the receive or the transmit error counter grew by
        # more than 10 over a 5-minute window, sustained for 5 minutes.
        - alert: NodeNetworkErrors
          expr: >-
            increase(node_network_receive_errs_total[5m]) > 10
            or increase(node_network_transmit_errs_total[5m]) > 10
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'Network interface errors detected'
            description: >-
              {{ $labels.device }} on {{ $labels.instance }}
              is seeing network errors.

        # Warns when the absolute value of node_timex_offset_seconds stays
        # above 0.05 for 10 minutes. Because of abs(), $value is never
        # negative.
        - alert: NodeClockSkew
          expr: 'abs(node_timex_offset_seconds) > 0.05'
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: 'Node clock skew detected'
            description: >-
              {{ $labels.instance }} clock is offset by {{ $value }}s.
              TLS and Kerberos may fail.

        # Warns when node_vmstat_oom_kill increased at all over the last
        # 5 minutes. `for: 0m` means there is no hold period before firing.
        - alert: NodeOOMKills
          expr: 'increase(node_vmstat_oom_kill[5m]) > 0'
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: 'OOM kill detected'
            description: >-
              {{ $labels.instance }} had an OOM kill in the last 5 minutes.

    # Workload-level alerts: container memory versus limit, PVC usage,
    # Deployment availability, failed Jobs and container restarts.
    - name: kubernetes
      rules:
        # Warns when a container's working-set memory stays above 90% of its
        # memory limit for 10 minutes.
        #
        # Both operands are first reduced to one series per
        # (namespace, pod, container). A bare one-to-one `on(...)` division
        # makes the whole rule evaluation fail with a duplicate-series
        # matching error as soon as either side has more than one series for
        # the same container, which would silence this alert for every pod.
        # NOTE(review): duplicates are presumably possible around container
        # restarts or with more than one scrape source — verify in this
        # cluster.
        # The result carries exactly the namespace/pod/container labels used
        # by the annotations, the same as the original `on(...)` match.
        - alert: PodMemoryNearLimit
          expr: |-
            max by (namespace, pod, container) (
              container_memory_working_set_bytes{container!=""}
            )
            /
            max by (namespace, pod, container) (
              kube_pod_container_resource_limits{resource="memory"}
            )
            > 0.90
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod memory near limit"
            description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is at {{ $value | humanizePercentage }} of its memory limit."

        # Warns when used/capacity bytes reported by the kubelet volume stats
        # stays above 0.85 for 5 minutes.
        - alert: PersistentVolumeUsageHigh
          expr: >-
            kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes
            > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'PVC usage above 85%'
            description: >-
              {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is
              {{ $value | humanizePercentage }} full.

        # Critical tier of the PVC usage check: same ratio as the warning
        # rule, with a 0.95 threshold and a 2-minute hold.
        - alert: PersistentVolumeUsageCritical
          expr: >-
            kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes
            > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: 'PVC usage above 95%'
            description: >-
              {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is
              {{ $value | humanizePercentage }} full.

        # Critical when a Deployment reports 0 available replicas for
        # 5 minutes while its spec requests more than 0. The `and` clause
        # keeps Deployments that are scaled to zero from alerting.
        - alert: DeploymentNoReadyPods
          expr: |
            kube_deployment_status_replicas_available == 0
            and kube_deployment_spec_replicas > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: 'Deployment has no ready pods'
            description: >-
              {{ $labels.namespace }}/{{ $labels.deployment }}
              has 0 available replicas.

        # Warns when kube_job_status_failed stays above 0 for 10 minutes for
        # any Job outside the kube-system namespace.
        # NOTE(review): despite the alert name, the selector is not limited
        # to CronJob-owned Jobs and does not look at the most recent run —
        # confirm this is intended.
        - alert: CronJobLastRunFailed
          expr: |
            kube_job_status_failed{namespace!="kube-system"} > 0
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: 'Job failed'
            description: 'Job {{ $labels.namespace }}/{{ $labels.job_name }} failed.'

        # Warns when a container's restart counter grew by more than 5 over
        # the trailing hour, sustained for 10 minutes.
        - alert: PodRestartingFrequently
          expr: 'increase(kube_pod_container_status_restarts_total[1h]) > 5'
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: 'Pod is restarting frequently'
            description: >-
              {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }})
              has restarted {{ $value | humanize }} times in the last hour.
feat: add PrometheusRule alerts for all services 28 alert rules across 9 PrometheusRule files covering infrastructure (Longhorn, cert-manager), data (PostgreSQL, OpenBao, OpenSearch), storage (SeaweedFS), devtools (Gitea), identity (Hydra, Kratos), media (LiveKit), and mesh (Linkerd golden signals for all services). Severity routing: critical alerts fire to Matrix + email, warnings to Matrix only (AlertManager config updated in separate commit). 2026-03-24 12:20:55 +00:00			`apiVersion: monitoring.coreos.com/v1`
			`kind: PrometheusRule`
			`metadata:`
			`name: infrastructure-alerts`
			`namespace: monitoring`
			`labels:`
			`role: alert-rules`
feat(monitoring): comprehensive alerting overhaul, 66 rules across 14 PrometheusRules The Longhorn memory leak went undetected for 14 days because alerting was broken (email receiver, missing label selector, no node alerts). This overhaul brings alerting to production grade. Fixes: - Alloy Loki URL pointed to deleted loki-gateway, now loki:3100 - seaweedfs-bucket-init crash on unsupported `mc versioning` command - All PrometheusRules now have `release: kube-prometheus-stack` label - Removed broken email receiver, Matrix-only alerting New alert coverage: - Node: memory, CPU, swap, filesystem, inodes, network, clock skew, OOM - Kubernetes: deployment down, CronJob failed, pod crash-looping, PVC full - Backups: Postgres barman stale/failed, WAL archiving, SeaweedFS mirror - Observability: Prometheus WAL/storage/rules, Loki/Tempo/AlertManager down - Services: Stalwart, Bulwark, Tuwunel, Sol, Valkey, OpenSearch (smart) - SLOs: auth stack 99.9% burn rate, Matrix 99.5%, latency p95 < 2s - Recording rules for Linkerd RED metrics and node aggregates - Watchdog heartbeat → Matrix every 12h (dead pipeline detection) - Inhibition: critical suppresses warning for same alert+namespace - OpenSearchClusterYellow only fires with >1 data node (single-node aware) 2026-04-06 15:52:06 +01:00			`release: kube-prometheus-stack`
feat: add PrometheusRule alerts for all services 28 alert rules across 9 PrometheusRule files covering infrastructure (Longhorn, cert-manager), data (PostgreSQL, OpenBao, OpenSearch), storage (SeaweedFS), devtools (Gitea), identity (Hydra, Kratos), media (LiveKit), and mesh (Linkerd golden signals for all services). Severity routing: critical alerts fire to Matrix + email, warnings to Matrix only (AlertManager config updated in separate commit). 2026-03-24 12:20:55 +00:00			`spec:`
			`groups:`
			`- name: infrastructure`
			`rules:`
			`- alert: LonghornDiskSpaceLow`
			`expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) > 0.90`
			`for: 5m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: "Longhorn disk space critically low"`
			`description: "Longhorn disk on {{ $labels.node }} is over 90% full."`

			`- alert: LonghornVolumeSpaceLow`
			`expr: (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) > 0.85`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Longhorn volume space low"`
			`description: "Longhorn volume {{ $labels.volume }} is over 85% full."`

			`- alert: CertExpiringCritical`
			`expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 86400`
			`for: 10m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: "Certificate expiring in less than 24 hours"`
			`description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 24 hours"`

			`- alert: CertExpiringSoon`
			`expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 604800`
			`for: 30m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Certificate expiring in less than 7 days"`
			`description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 7 days"`

			`- alert: CertNotReady`
			`expr: certmanager_certificate_ready_status{condition="True"} != 1`
			`for: 10m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Certificate not ready"`
			`description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} is not in a ready state."`
feat(monitoring): comprehensive alerting overhaul, 66 rules across 14 PrometheusRules The Longhorn memory leak went undetected for 14 days because alerting was broken (email receiver, missing label selector, no node alerts). This overhaul brings alerting to production grade. Fixes: - Alloy Loki URL pointed to deleted loki-gateway, now loki:3100 - seaweedfs-bucket-init crash on unsupported `mc versioning` command - All PrometheusRules now have `release: kube-prometheus-stack` label - Removed broken email receiver, Matrix-only alerting New alert coverage: - Node: memory, CPU, swap, filesystem, inodes, network, clock skew, OOM - Kubernetes: deployment down, CronJob failed, pod crash-looping, PVC full - Backups: Postgres barman stale/failed, WAL archiving, SeaweedFS mirror - Observability: Prometheus WAL/storage/rules, Loki/Tempo/AlertManager down - Services: Stalwart, Bulwark, Tuwunel, Sol, Valkey, OpenSearch (smart) - SLOs: auth stack 99.9% burn rate, Matrix 99.5%, latency p95 < 2s - Recording rules for Linkerd RED metrics and node aggregates - Watchdog heartbeat → Matrix every 12h (dead pipeline detection) - Inhibition: critical suppresses warning for same alert+namespace - OpenSearchClusterYellow only fires with >1 data node (single-node aware) 2026-04-06 15:52:06 +01:00
			`- name: node`
			`rules:`
			`- alert: NodeMemoryHigh`
			`expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.85`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Node memory usage above 85%"`
			`description: "{{ $labels.instance }} memory usage is {{ $value \| humanizePercentage }}."`

			`- alert: NodeMemoryCritical`
			`expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.95`
			`for: 2m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: "Node memory usage above 95%"`
			`description: "{{ $labels.instance }} memory usage is {{ $value \| humanizePercentage }}. OOM kills imminent."`

			`- alert: NodeSwapActive`
			`expr: node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes < 0.50`
			`for: 10m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Node swap usage above 50%"`
			`description: "{{ $labels.instance }} swap is {{ $value \| humanizePercentage }} free. System is under memory pressure."`

			`- alert: NodeCPUHigh`
			`expr: 1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.90`
			`for: 15m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Node CPU usage above 90% for 15 minutes"`
			`description: "{{ $labels.instance }} CPU usage is {{ $value \| humanizePercentage }}."`

			`- alert: NodeFilesystemFull`
			`expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs\|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs\|overlay"}) > 0.85`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Filesystem usage above 85%"`
			`description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value \| humanizePercentage }} full."`

			`- alert: NodeFilesystemCritical`
			`expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs\|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs\|overlay"}) > 0.95`
			`for: 2m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: "Filesystem usage above 95%"`
			`description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value \| humanizePercentage }} full."`

			`- alert: NodeFilesystemFilesRunningOut`
			`expr: node_filesystem_files_free{fstype!~"tmpfs\|overlay"} / node_filesystem_files{fstype!~"tmpfs\|overlay"} < 0.05`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Filesystem inodes running low"`
			`description: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% inodes free."`

			`- alert: NodeNetworkErrors`
			`expr: increase(node_network_receive_errs_total[5m]) > 10 or increase(node_network_transmit_errs_total[5m]) > 10`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Network interface errors detected"`
			`description: "{{ $labels.device }} on {{ $labels.instance }} is seeing network errors."`

			`- alert: NodeClockSkew`
			`expr: abs(node_timex_offset_seconds) > 0.05`
			`for: 10m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Node clock skew detected"`
			`description: "{{ $labels.instance }} clock is offset by {{ $value }}s. TLS and Kerberos may fail."`

			`- alert: NodeOOMKills`
			`expr: increase(node_vmstat_oom_kill[5m]) > 0`
			`for: 0m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "OOM kill detected"`
			`description: "{{ $labels.instance }} had an OOM kill in the last 5 minutes."`

			`- name: kubernetes`
			`rules:`
			`- alert: PodMemoryNearLimit`
			`expr: container_memory_working_set_bytes{container!=""} / on(container, pod, namespace) kube_pod_container_resource_limits{resource="memory"} > 0.90`
			`for: 10m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Pod memory near limit"`
			`description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is at {{ $value \| humanizePercentage }} of its memory limit."`

			`- alert: PersistentVolumeUsageHigh`
			`expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.85`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "PVC usage above 85%"`
			`description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value \| humanizePercentage }} full."`

			`- alert: PersistentVolumeUsageCritical`
			`expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.95`
			`for: 2m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: "PVC usage above 95%"`
			`description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value \| humanizePercentage }} full."`

			`- alert: DeploymentNoReadyPods`
			`expr: \|`
			`kube_deployment_status_replicas_available == 0`
			`and kube_deployment_spec_replicas > 0`
			`for: 5m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: "Deployment has no ready pods"`
			`description: "{{ $labels.namespace }}/{{ $labels.deployment }} has 0 available replicas."`

			`- alert: CronJobLastRunFailed`
			`expr: \|`
			`kube_job_status_failed{namespace!="kube-system"} > 0`
			`for: 10m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Job failed"`
			`description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed."`

			`- alert: PodRestartingFrequently`
			`expr: increase(kube_pod_container_status_restarts_total[1h]) > 5`
			`for: 10m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Pod is restarting frequently"`
			`description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) has restarted {{ $value \| humanize }} times in the last hour."`