The Longhorn memory leak went undetected for 14 days because alerting was broken (email receiver, missing label selector, no node alerts). This overhaul brings alerting to production grade.

Fixes:

- Alloy Loki URL pointed to deleted loki-gateway, now loki:3100
- seaweedfs-bucket-init crash on unsupported `mc versioning` command
- All PrometheusRules now carry the `release: kube-prometheus-stack` label
- Removed broken email receiver, Matrix-only alerting (receiver wiring sketched below)

New alert coverage:

- Node: memory, CPU, swap, filesystem, inodes, network, clock skew, OOM
- Kubernetes: deployment down, CronJob failed, pod crash-looping, PVC full
- Backups: Postgres barman stale/failed, WAL archiving, SeaweedFS mirror
- Observability: Prometheus WAL/storage/rules, Loki/Tempo/AlertManager down
- Services: Stalwart, Bulwark, Tuwunel, Sol, Valkey, OpenSearch (smart)
- SLOs: auth stack 99.9% burn rate, Matrix 99.5%, latency p95 < 2s (burn-rate sketch below)
- Recording rules for Linkerd RED metrics and node aggregates
- Watchdog heartbeat → Matrix every 12h (dead pipeline detection)
- Inhibition: critical suppresses warning for same alert+namespace
- OpenSearchClusterYellow only fires with >1 data node (single-node aware, sketch below)
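A minimal sketch of the Matrix-only Alertmanager wiring described above, assuming a webhook-based Matrix bridge. The receiver name, bridge URL, and route details are illustrative placeholders, not the exact values from this change:

```yaml
# Sketch only -- receiver name and bridge URL are placeholders.
route:
  receiver: matrix
  routes:
    # Watchdog always fires; re-notifying every 12h turns it into a
    # heartbeat. If the heartbeat stops, the alerting pipeline is dead.
    - matchers:
        - alertname = Watchdog
      receiver: matrix
      repeat_interval: 12h

receivers:
  - name: matrix
    webhook_configs:
      - url: http://matrix-alertmanager:8080/alerts  # hypothetical bridge service

inhibit_rules:
  # A firing critical alert suppresses the warning-level variant of the
  # same alert in the same namespace, so one incident pages only once.
  - source_matchers:
      - severity = critical
    target_matchers:
      - severity = warning
    equal: ["alertname", "namespace"]
```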
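The auth-stack SLO alert presumably follows the multiwindow burn-rate pattern from the SRE workbook. A sketch assuming Linkerd's `response_total` metric and an `auth` namespace (both assumptions; the real rule and windows may differ):

```yaml
# Sketch: fast-burn alert for a 99.9% SLO (0.1% error budget). A sustained
# 14.4x burn rate exhausts a 30-day budget in ~2 days, so it pages.
- alert: AuthErrorBudgetBurnFast
  expr: |
    (
      sum(rate(response_total{namespace="auth", classification="failure"}[5m]))
        / sum(rate(response_total{namespace="auth"}[5m]))
    ) > (14.4 * 0.001)
    and
    (
      sum(rate(response_total{namespace="auth", classification="failure"}[1h]))
        / sum(rate(response_total{namespace="auth"}[1h]))
    ) > (14.4 * 0.001)
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: "Auth stack burning error budget at >14x rate"
```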
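And the single-node-aware OpenSearch check might look like the following, assuming the Aiven prometheus-exporter plugin's metric names (`opensearch_cluster_status`, `opensearch_cluster_datanodes_number`, with green=0 / yellow=1 / red=2); verify against the exporter actually deployed:

```yaml
# Sketch: yellow usually just means unassigned replica shards, which is
# expected on a single data node -- only alert when replicas could be placed.
- alert: OpenSearchClusterYellow
  expr: |
    opensearch_cluster_status == 1
    and on() opensearch_cluster_datanodes_number > 1
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "OpenSearch cluster is yellow with more than one data node"
```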
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: infrastructure-alerts
  namespace: monitoring
  labels:
    role: alert-rules
    release: kube-prometheus-stack
spec:
  groups:
    - name: infrastructure
      rules:
        - alert: LonghornDiskSpaceLow
          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) > 0.90
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Longhorn disk space critically low"
            description: "Longhorn disk on {{ $labels.node }} is over 90% full."

        - alert: LonghornVolumeSpaceLow
          expr: (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Longhorn volume space low"
            description: "Longhorn volume {{ $labels.volume }} is over 85% full."

        - alert: CertExpiringCritical
          expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 86400
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "Certificate expiring in less than 24 hours"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 24 hours."

        - alert: CertExpiringSoon
          expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 604800
          for: 30m
          labels:
            severity: warning
          annotations:
            summary: "Certificate expiring in less than 7 days"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 7 days."

        - alert: CertNotReady
          expr: certmanager_certificate_ready_status{condition="True"} != 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Certificate not ready"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} is not in a ready state."

    - name: node
      rules:
        - alert: NodeMemoryHigh
          expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Node memory usage above 85%"
            description: "{{ $labels.instance }} memory usage is {{ $value | humanizePercentage }}."

        - alert: NodeMemoryCritical
          expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Node memory usage above 95%"
            description: "{{ $labels.instance }} memory usage is {{ $value | humanizePercentage }}. OOM kills imminent."

        - alert: NodeSwapActive
          expr: node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes < 0.50
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Node swap usage above 50%"
            description: "{{ $labels.instance }} swap is {{ $value | humanizePercentage }} free. System is under memory pressure."

        - alert: NodeCPUHigh
          expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.90
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "Node CPU usage above 90% for 15 minutes"
            description: "{{ $labels.instance }} CPU usage is {{ $value | humanizePercentage }}."

        - alert: NodeFilesystemFull
          expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Filesystem usage above 85%"
            description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value | humanizePercentage }} full."

        - alert: NodeFilesystemCritical
          expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Filesystem usage above 95%"
            description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value | humanizePercentage }} full."

        - alert: NodeFilesystemFilesRunningOut
          expr: node_filesystem_files_free{fstype!~"tmpfs|overlay"} / node_filesystem_files{fstype!~"tmpfs|overlay"} < 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Filesystem inodes running low"
            description: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% of inodes free."

        - alert: NodeNetworkErrors
          expr: increase(node_network_receive_errs_total[5m]) > 10 or increase(node_network_transmit_errs_total[5m]) > 10
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Network interface errors detected"
            description: "{{ $labels.device }} on {{ $labels.instance }} is seeing network errors."

        - alert: NodeClockSkew
          expr: abs(node_timex_offset_seconds) > 0.05
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Node clock skew detected"
            description: "{{ $labels.instance }} clock is offset by {{ $value }}s. TLS and Kerberos may fail."

        - alert: NodeOOMKills
          expr: increase(node_vmstat_oom_kill[5m]) > 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "OOM kill detected"
            description: "{{ $labels.instance }} had an OOM kill in the last 5 minutes."

    - name: kubernetes
      rules:
        - alert: PodMemoryNearLimit
          expr: container_memory_working_set_bytes{container!=""} / on(container, pod, namespace) kube_pod_container_resource_limits{resource="memory"} > 0.90
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod memory near limit"
            description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is at {{ $value | humanizePercentage }} of its memory limit."

        - alert: PersistentVolumeUsageHigh
          expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PVC usage above 85%"
            description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value | humanizePercentage }} full."

        - alert: PersistentVolumeUsageCritical
          expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.95
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "PVC usage above 95%"
            description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value | humanizePercentage }} full."

        - alert: DeploymentNoReadyPods
          expr: |
            kube_deployment_status_replicas_available == 0
            and kube_deployment_spec_replicas > 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Deployment has no ready pods"
            description: "{{ $labels.namespace }}/{{ $labels.deployment }} has 0 available replicas."

        - alert: CronJobLastRunFailed
          expr: kube_job_status_failed{namespace!="kube-system"} > 0
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Job failed"
            description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed."

        - alert: PodRestartingFrequently
          expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod is restarting frequently"
            description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) has restarted {{ $value | humanize }} times in the last hour."