---
# PrometheusRule (Prometheus Operator CRD) defining three alerting groups:
#   - infrastructure: Longhorn storage metrics and cert-manager certificates
#   - node:           host memory, swap, CPU, filesystem, network, clock, OOM
#   - kubernetes:     container memory, PVC usage, deployments, jobs, restarts
#
# Conventions used throughout this file:
#   - `severity` is either `warning` or `critical`.
#   - Several warning/critical pairs share one expression with different
#     thresholds, so the warning expression is also true whenever the critical
#     one is (nothing in this file suppresses the overlap).
#   - `{{ $value | humanizePercentage }}` renders the 0..1 ratio produced by the
#     expression as a percentage.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: infrastructure-alerts
  namespace: monitoring
  labels:
    # NOTE(review): presumably these labels are what the Prometheus
    # ruleSelector of the kube-prometheus-stack release matches on - confirm
    # against the Prometheus custom resource before renaming them.
    role: alert-rules
    release: kube-prometheus-stack
spec:
  groups:
    # ------------------------------------------------------------------
    # Storage (Longhorn) and certificates (cert-manager)
    # ------------------------------------------------------------------
    - name: infrastructure
      rules:
        # Ratio of used to total bytes per Longhorn disk.
        - alert: LonghornDiskSpaceLow
          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) > 0.90
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Longhorn disk space critically low"
            description: "Longhorn disk on {{ $labels.node }} is over 90% full."

        # Ratio of actual size to capacity per Longhorn volume.
        - alert: LonghornVolumeSpaceLow
          expr: (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Longhorn volume space low"
            description: "Longhorn volume {{ $labels.volume }} is over 85% full."

        # Remaining lifetime in seconds; 86400 s = 24 hours.
        - alert: CertExpiringCritical
          expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 86400
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "Certificate expiring in less than 24 hours"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 24 hours"

        # Remaining lifetime in seconds; 604800 s = 7 days. This expression is
        # also true for certificates matched by CertExpiringCritical.
        - alert: CertExpiringSoon
          expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 604800
          for: 30m
          labels:
            severity: warning
          annotations:
            summary: "Certificate expiring in less than 7 days"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 7 days"

        # Fires when the series carrying condition="True" is anything but 1.
        - alert: CertNotReady
          expr: certmanager_certificate_ready_status{condition="True"} != 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Certificate not ready"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} is not in a ready state."

    # ------------------------------------------------------------------
    # Host-level alerts (node_* metrics)
    # ------------------------------------------------------------------
    - name: node
      rules:
        # Used fraction = 1 - MemAvailable / MemTotal.
        - alert: NodeMemoryHigh
          expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Node memory usage above 85%"
            description: "{{ $labels.instance }} memory usage is {{ $value | humanizePercentage }}."

        # Same expression as NodeMemoryHigh with a 95% threshold and shorter hold.
        - alert: NodeMemoryCritical
          expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Node memory usage above 95%"
            description: "{{ $labels.instance }} memory usage is {{ $value | humanizePercentage }}. OOM kills imminent."

        # Fires when less than half of swap is free; $value is the FREE
        # fraction, which is why the description reports it as "free".
        - alert: NodeSwapActive
          expr: node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes < 0.50
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Node swap usage above 50%"
            description: "{{ $labels.instance }} swap is {{ $value | humanizePercentage }} free. System is under memory pressure."

        # Busy fraction = 1 - mean idle rate across the instance's CPU series.
        - alert: NodeCPUHigh
          expr: 1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.90
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "Node CPU usage above 90% for 15 minutes"
            description: "{{ $labels.instance }} CPU usage is {{ $value | humanizePercentage }}."

        # Used fraction per filesystem, skipping tmpfs and overlay mounts.
        - alert: NodeFilesystemFull
          expr: >-
            (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
            / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Filesystem usage above 85%"
            description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value | humanizePercentage }} full."

        # Same expression as NodeFilesystemFull with a 95% threshold.
        - alert: NodeFilesystemCritical
          expr: >-
            (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
            / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Filesystem usage above 95%"
            description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value | humanizePercentage }} full."

        # Free-inode fraction per filesystem, skipping tmpfs and overlay mounts.
        - alert: NodeFilesystemFilesRunningOut
          expr: >-
            node_filesystem_files_free{fstype!~"tmpfs|overlay"}
            / node_filesystem_files{fstype!~"tmpfs|overlay"} < 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Filesystem inodes running low"
            description: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% inodes free."

        # More than 10 receive OR transmit errors within a 5-minute window.
        - alert: NodeNetworkErrors
          expr: >-
            increase(node_network_receive_errs_total[5m]) > 10
            or increase(node_network_transmit_errs_total[5m]) > 10
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Network interface errors detected"
            description: "{{ $labels.device }} on {{ $labels.instance }} is seeing network errors."

        # Absolute offset above 0.05 s (50 ms), in either direction.
        - alert: NodeClockSkew
          expr: abs(node_timex_offset_seconds) > 0.05
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Node clock skew detected"
            description: "{{ $labels.instance }} clock is offset by {{ $value }}s. TLS and Kerberos may fail."

        # `for: 0m` means no hold period: the alert fires on the first
        # evaluation in which the counter increased.
        - alert: NodeOOMKills
          expr: increase(node_vmstat_oom_kill[5m]) > 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "OOM kill detected"
            description: "{{ $labels.instance }} had an OOM kill in the last 5 minutes."

    # ------------------------------------------------------------------
    # Workload-level alerts (cAdvisor, kubelet and kube-state-metrics)
    # ------------------------------------------------------------------
    - name: kubernetes
      rules:
        # Working-set memory as a fraction of the container's memory limit.
        # Both operands are collapsed to one series per
        # (namespace, pod, container) before dividing: a one-to-one vector
        # match fails the whole rule evaluation if either side has several
        # series for the same container. The result carries exactly the
        # namespace/pod/container labels used by the description.
        # NOTE(review): duplicates are presumed possible briefly after a
        # container restart or with replicated exporters - confirm against
        # your scrape setup.
        - alert: PodMemoryNearLimit
          expr: >-
            max by (namespace, pod, container) (container_memory_working_set_bytes{container!=""})
            / on (namespace, pod, container)
            max by (namespace, pod, container) (kube_pod_container_resource_limits{resource="memory"})
            > 0.90
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod memory near limit"
            description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is at {{ $value | humanizePercentage }} of its memory limit."

        # Used fraction of each PersistentVolumeClaim as reported by the kubelet.
        - alert: PersistentVolumeUsageHigh
          expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PVC usage above 85%"
            description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value | humanizePercentage }} full."

        # Same expression as PersistentVolumeUsageHigh with a 95% threshold.
        - alert: PersistentVolumeUsageCritical
          expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "PVC usage above 95%"
            description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value | humanizePercentage }} full."

        # Zero available replicas while at least one is requested; the
        # `and` clause keeps deployments scaled to 0 from alerting.
        - alert: DeploymentNoReadyPods
          expr: |
            kube_deployment_status_replicas_available == 0
            and
            kube_deployment_spec_replicas > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Deployment has no ready pods"
            description: "{{ $labels.namespace }}/{{ $labels.deployment }} has 0 available replicas."

        # Despite the alert name, the selector has no CronJob filter: it
        # matches every Job outside kube-system that reports failures.
        - alert: CronJobLastRunFailed
          expr: |
            kube_job_status_failed{namespace!="kube-system"} > 0
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Job failed"
            description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed."

        # More than 5 container restarts within the trailing hour.
        - alert: PodRestartingFrequently
          expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod is restarting frequently"
            description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) has restarted {{ $value | humanize }} times in the last hour."