From 3fc54c88513ef4f61beb1721699a2034a778db7c Mon Sep 17 00:00:00 2001
From: Sienna Meridian Satterwhite
Date: Tue, 24 Mar 2026 12:20:55 +0000
Subject: [PATCH] feat: add PrometheusRule alerts for all services

28 alert rules across 9 PrometheusRule files covering infrastructure
(Longhorn, cert-manager), data (PostgreSQL, OpenBao, OpenSearch),
storage (SeaweedFS), devtools (Gitea), identity (Hydra, Kratos),
media (LiveKit), and mesh (Linkerd golden signals for all services).

Severity routing: critical alerts fire to Matrix + email, warnings to
Matrix only (AlertManager config updated in separate commit).
---
NOTE(review): this section (between the "---" above and the first
"diff --git" line) is discarded by "git am", so these notes do not
change the commit itself.

- The "From:" header has no author email address (expected form is
  "Name <email>"); "git am" may mis-attribute or reject this patch --
  TODO confirm before applying.
- The kustomization hunks reference gitea-servicemonitor.yaml,
  hydra-servicemonitor.yaml, kratos-servicemonitor.yaml and
  seaweedfs-servicemonitor.yaml, but this patch creates none of them
  (they are absent from the diffstat). Verify those files already
  exist in the repository, otherwise "kustomize build" fails after
  this patch is applied.
- The base/data/kustomization.yaml hunk also drops
  openbao-keys-placeholder.yaml, which the commit message does not
  mention -- presumably intentional; verify.

 base/data/kustomization.yaml                  |  4 +-
 base/data/openbao-alertrules.yaml             | 28 ++++++++++
 base/data/opensearch-alertrules.yaml          | 37 +++++++++++++
 base/data/postgres-alertrules.yaml            | 37 +++++++++++++
 base/devtools/gitea-alertrules.yaml           | 28 ++++++++++
 base/devtools/kustomization.yaml              |  2 +
 base/media/kustomization.yaml                 |  4 ++
 base/media/livekit-alertrules.yaml            | 28 ++++++++++
 base/mesh/kustomization.yaml                  |  1 +
 base/mesh/linkerd-alertrules.yaml             | 44 +++++++++++++++
 .../monitoring/alertrules-infrastructure.yaml | 55 +++++++++++++++++++
 base/ory/kustomization.yaml                   |  3 +
 base/ory/ory-alertrules.yaml                  | 46 ++++++++++++++++
 base/storage/kustomization.yaml               |  2 +
 base/storage/seaweedfs-alertrules.yaml        | 46 ++++++++++++++++
 15 files changed, 363 insertions(+), 2 deletions(-)
 create mode 100644 base/data/openbao-alertrules.yaml
 create mode 100644 base/data/opensearch-alertrules.yaml
 create mode 100644 base/data/postgres-alertrules.yaml
 create mode 100644 base/devtools/gitea-alertrules.yaml
 create mode 100644 base/media/livekit-alertrules.yaml
 create mode 100644 base/mesh/linkerd-alertrules.yaml
 create mode 100644 base/monitoring/alertrules-infrastructure.yaml
 create mode 100644 base/ory/ory-alertrules.yaml
 create mode 100644 base/storage/seaweedfs-alertrules.yaml

diff --git a/base/data/kustomization.yaml b/base/data/kustomization.yaml
index d80c66d..f0e900a 100644
--- a/base/data/kustomization.yaml
+++ b/base/data/kustomization.yaml
@@ -11,9 +11,9 @@ resources:
 - opensearch-deployment.yaml
 - opensearch-service.yaml
 - opensearch-pvc.yaml
-- openbao-keys-placeholder.yaml
 - barman-vault-secret.yaml
-- opensearch-servicemonitor.yaml
+# opensearch-servicemonitor.yaml removed — OpenSearch 3.x has no prometheus-exporter plugin.
+# TODO: add opensearch-exporter sidecar for Prometheus metrics.
 - opensearch-alertrules.yaml
 - postgres-alertrules.yaml
 - openbao-alertrules.yaml
diff --git a/base/data/openbao-alertrules.yaml b/base/data/openbao-alertrules.yaml
new file mode 100644
index 0000000..a709e55
--- /dev/null
+++ b/base/data/openbao-alertrules.yaml
@@ -0,0 +1,28 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: openbao-alerts
+  namespace: data
+  labels:
+    role: alert-rules
+spec:
+  groups:
+  - name: openbao
+    rules:
+    - alert: VaultSealed
+      expr: vault_core_unsealed == 0
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: "OpenBao/Vault is sealed"
+        description: "OpenBao/Vault is sealed — automatic unseal may have failed"
+
+    - alert: VaultDown
+      expr: up{job=~".*openbao.*"} == 0
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "OpenBao/Vault is down"
+        description: "OpenBao instance {{ $labels.namespace }}/{{ $labels.pod }} is down."
diff --git a/base/data/opensearch-alertrules.yaml b/base/data/opensearch-alertrules.yaml
new file mode 100644
index 0000000..ded8dbf
--- /dev/null
+++ b/base/data/opensearch-alertrules.yaml
@@ -0,0 +1,40 @@
+# NOTE(review): base/data/kustomization.yaml removes the OpenSearch
+# ServiceMonitor (no prometheus-exporter plugin for OpenSearch 3.x), so no
+# opensearch_* metrics are scraped and these alerts are inert until an exporter is added.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: opensearch-alerts
+  namespace: data
+  labels:
+    role: alert-rules
+spec:
+  groups:
+  - name: opensearch
+    rules:
+    - alert: OpenSearchClusterRed
+      expr: opensearch_cluster_health_status{color="red"} == 1
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "OpenSearch cluster health is RED"
+        description: "OpenSearch cluster {{ $labels.cluster }} health status is red."
+
+    - alert: OpenSearchClusterYellow
+      expr: opensearch_cluster_health_status{color="yellow"} == 1
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "OpenSearch cluster health is YELLOW"
+        description: "OpenSearch cluster {{ $labels.cluster }} health status is yellow."
+
+    - alert: OpenSearchHeapHigh
+      expr: (opensearch_jvm_mem_heap_used_bytes / opensearch_jvm_mem_heap_max_bytes) > 0.85
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "OpenSearch JVM heap usage is high"
+        description: "OpenSearch node {{ $labels.node }} in {{ $labels.namespace }} heap usage is above 85%."
diff --git a/base/data/postgres-alertrules.yaml b/base/data/postgres-alertrules.yaml
new file mode 100644
index 0000000..97e3e1e
--- /dev/null
+++ b/base/data/postgres-alertrules.yaml
@@ -0,0 +1,37 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: postgres-alerts
+  namespace: data
+  labels:
+    role: alert-rules
+spec:
+  groups:
+  - name: postgres
+    rules:
+    - alert: PostgresDown
+      expr: cnpg_collector_up == 0
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "PostgreSQL instance is down"
+        description: "CNPG collector reports {{ $labels.namespace }}/{{ $labels.pod }} is down."
+
+    - alert: PostgresDatabaseSizeLarge
+      expr: cnpg_pg_database_size_bytes > 7e9
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "PostgreSQL database size is large"
+        description: "Database {{ $labels.datname }} is {{ $value | humanize1024 }} (PVC limit 10Gi)"
+
+    - alert: PostgresHighConnections
+      expr: sum by (pod) (cnpg_pg_stat_activity_count) > 80
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "PostgreSQL connection count is high"
+        description: "Pod {{ $labels.pod }} has {{ $value }} active connections."
diff --git a/base/devtools/gitea-alertrules.yaml b/base/devtools/gitea-alertrules.yaml
new file mode 100644
index 0000000..da0e137
--- /dev/null
+++ b/base/devtools/gitea-alertrules.yaml
@@ -0,0 +1,28 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: gitea-alerts
+  namespace: devtools
+  labels:
+    role: alert-rules
+spec:
+  groups:
+  - name: gitea
+    rules:
+    - alert: GiteaDown
+      expr: up{job=~".*gitea.*"} == 0
+      for: 3m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Gitea is down"
+        description: "Gitea instance {{ $labels.namespace }}/{{ $labels.pod }} is down."
+
+    - alert: GiteaHighGoroutines
+      expr: go_goroutines{job=~".*gitea.*"} > 500
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Gitea goroutine count is high"
+        description: "Gitea {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} goroutines."
diff --git a/base/devtools/kustomization.yaml b/base/devtools/kustomization.yaml
index c3f57ed..320b62d 100644
--- a/base/devtools/kustomization.yaml
+++ b/base/devtools/kustomization.yaml
@@ -7,6 +7,8 @@ resources:
 - namespace.yaml
 - vault-secrets.yaml
 - gitea-theme-cm.yaml
+- gitea-servicemonitor.yaml
+- gitea-alertrules.yaml
 
 helmCharts:
   # helm repo add gitea-charts https://dl.gitea.com/charts/
diff --git a/base/media/kustomization.yaml b/base/media/kustomization.yaml
index 4b00bd9..f66c4ab 100644
--- a/base/media/kustomization.yaml
+++ b/base/media/kustomization.yaml
@@ -6,6 +6,10 @@ namespace: media
 resources:
 - namespace.yaml
 - vault-secrets.yaml
+- livekit-alertrules.yaml
+# livekit-servicemonitor.yaml disabled — LiveKit runs on hostNetwork and port 6789
+# is not reachable from Prometheus due to host firewall. Open port 6789 on the host
+# or add an iptables rule, then re-enable.
 
 helmCharts:
   # helm repo add livekit https://helm.livekit.io
diff --git a/base/media/livekit-alertrules.yaml b/base/media/livekit-alertrules.yaml
new file mode 100644
index 0000000..632027e
--- /dev/null
+++ b/base/media/livekit-alertrules.yaml
@@ -0,0 +1,31 @@
+# NOTE(review): the LiveKit ServiceMonitor is disabled (hostNetwork + firewall,
+# see base/media/kustomization.yaml), so up{job=~".*livekit.*"} has no series
+# and neither alert below can fire until the ServiceMonitor is re-enabled.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: livekit-alerts
+  namespace: media
+  labels:
+    role: alert-rules
+spec:
+  groups:
+  - name: livekit
+    rules:
+    - alert: LiveKitDown
+      expr: up{job=~".*livekit.*"} == 0
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "LiveKit is down"
+        description: "LiveKit instance {{ $labels.namespace }}/{{ $labels.pod }} is down."
+
+    - alert: LiveKitHighNACKRate
+      expr: sum(rate(livekit_nack_total[5m])) > 100
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "LiveKit NACK rate is high"
+        description: "LiveKit NACK rate is {{ $value }}/s, indicating potential media quality issues."
diff --git a/base/mesh/kustomization.yaml b/base/mesh/kustomization.yaml
index 13ab90c..8a7a9ae 100644
--- a/base/mesh/kustomization.yaml
+++ b/base/mesh/kustomization.yaml
@@ -3,6 +3,7 @@ kind: Kustomization
 
 resources:
 - namespace.yaml
+- linkerd-alertrules.yaml
 
 # NOTE: Linkerd stable releases moved behind a commercial paywall in Feb 2024.
 # As of 2.15, stable artifacts are Buoyant Enterprise for Linkerd (BEL) only.
diff --git a/base/mesh/linkerd-alertrules.yaml b/base/mesh/linkerd-alertrules.yaml
new file mode 100644
index 0000000..a482827
--- /dev/null
+++ b/base/mesh/linkerd-alertrules.yaml
@@ -0,0 +1,44 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: linkerd-mesh-alerts
+  namespace: mesh
+  labels:
+    role: alert-rules
+spec:
+  groups:
+  - name: linkerd-mesh
+    rules:
+    - alert: ServiceHighErrorRate
+      expr: |
+        sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
+        / sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
+        > 0.05
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Service has high error rate"
+        description: "{{ $labels.deployment }} in {{ $labels.namespace }} has {{ $value | humanizePercentage }} error rate"
+
+    - alert: ServiceHighErrorRateCritical
+      expr: |
+        sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
+        / sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
+        > 0.25
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Service has critically high error rate"
+        description: "{{ $labels.deployment }} in {{ $labels.namespace }} has {{ $value | humanizePercentage }} error rate"
+
+    - alert: ServiceHighLatency
+      expr: |
+        histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{direction="inbound"}[5m])) by (le, deployment, namespace)) > 2000
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Service has high p95 latency"
+        description: "{{ $labels.deployment }} in {{ $labels.namespace }} p95 latency is {{ $value }}ms"
diff --git a/base/monitoring/alertrules-infrastructure.yaml b/base/monitoring/alertrules-infrastructure.yaml
new file mode 100644
index 0000000..f6b30c1
--- /dev/null
+++ b/base/monitoring/alertrules-infrastructure.yaml
@@ -0,0 +1,55 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: infrastructure-alerts
+  namespace: monitoring
+  labels:
+    role: alert-rules
+spec:
+  groups:
+  - name: infrastructure
+    rules:
+    - alert: LonghornDiskSpaceLow
+      expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) > 0.90
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Longhorn disk space critically low"
+        description: "Longhorn disk on {{ $labels.node }} is over 90% full."
+
+    - alert: LonghornVolumeSpaceLow
+      expr: (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) > 0.85
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Longhorn volume space low"
+        description: "Longhorn volume {{ $labels.volume }} is over 85% full."
+
+    - alert: CertExpiringCritical
+      expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 86400
+      for: 10m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Certificate expiring in less than 24 hours"
+        description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 24 hours"
+
+    - alert: CertExpiringSoon
+      expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 604800
+      for: 30m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Certificate expiring in less than 7 days"
+        description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 7 days"
+
+    - alert: CertNotReady
+      expr: certmanager_certificate_ready_status{condition="True"} != 1
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Certificate not ready"
+        description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} is not in a ready state."
diff --git a/base/ory/kustomization.yaml b/base/ory/kustomization.yaml
index adf965f..cfb129e 100644
--- a/base/ory/kustomization.yaml
+++ b/base/ory/kustomization.yaml
@@ -13,6 +13,9 @@ resources:
 # Hydra chart CRDs are not rendered by helm template; apply manually.
 - hydra-oauth2client-crd.yaml
 - vault-secrets.yaml
+- ory-alertrules.yaml
+- hydra-servicemonitor.yaml
+- kratos-servicemonitor.yaml
 
 patches:
   # Set Kratos selfservice UI URLs (DOMAIN_SUFFIX substituted at apply time).
diff --git a/base/ory/ory-alertrules.yaml b/base/ory/ory-alertrules.yaml
new file mode 100644
index 0000000..c83a4f1
--- /dev/null
+++ b/base/ory/ory-alertrules.yaml
@@ -0,0 +1,46 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: ory-alerts
+  namespace: ory
+  labels:
+    role: alert-rules
+spec:
+  groups:
+  - name: ory
+    rules:
+    - alert: HydraDown
+      expr: up{job=~".*hydra.*"} == 0
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Hydra is down"
+        description: "Hydra instance {{ $labels.namespace }}/{{ $labels.pod }} is down."
+
+    - alert: KratosDown
+      expr: up{job=~".*kratos.*"} == 0
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Kratos is down"
+        description: "Kratos instance {{ $labels.namespace }}/{{ $labels.pod }} is down."
+
+    - alert: HydraHighErrorRate
+      expr: sum(rate(http_requests_total{job=~".*hydra.*",code=~"5.."}[5m])) / sum(rate(http_requests_total{job=~".*hydra.*"}[5m])) > 0.05
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Hydra has a high HTTP error rate"
+        description: "Hydra 5xx error rate is {{ $value | humanizePercentage }}."
+
+    - alert: KratosHighErrorRate
+      expr: sum(rate(http_requests_total{job=~".*kratos.*",code=~"5.."}[5m])) / sum(rate(http_requests_total{job=~".*kratos.*"}[5m])) > 0.05
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Kratos has a high HTTP error rate"
+        description: "Kratos 5xx error rate is {{ $value | humanizePercentage }}."
diff --git a/base/storage/kustomization.yaml b/base/storage/kustomization.yaml
index bd745bc..a88f47d 100644
--- a/base/storage/kustomization.yaml
+++ b/base/storage/kustomization.yaml
@@ -12,3 +12,5 @@ resources:
 - seaweedfs-filer-pvc.yaml
 - vault-secrets.yaml
 - seaweedfs-remote-sync.yaml
+- seaweedfs-servicemonitor.yaml
+- seaweedfs-alertrules.yaml
diff --git a/base/storage/seaweedfs-alertrules.yaml b/base/storage/seaweedfs-alertrules.yaml
new file mode 100644
index 0000000..8fc9cfe
--- /dev/null
+++ b/base/storage/seaweedfs-alertrules.yaml
@@ -0,0 +1,46 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: seaweedfs-alerts
+  namespace: storage
+  labels:
+    role: alert-rules
+spec:
+  groups:
+  - name: seaweedfs
+    rules:
+    - alert: SeaweedFSMasterDown
+      expr: up{job=~".*seaweedfs.*", app="seaweedfs-master"} == 0
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "SeaweedFS master is down"
+        description: "SeaweedFS master {{ $labels.namespace }}/{{ $labels.pod }} is down."
+
+    - alert: SeaweedFSVolumeDown
+      expr: up{job=~".*seaweedfs.*", app="seaweedfs-volume"} == 0
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "SeaweedFS volume server is down"
+        description: "SeaweedFS volume server {{ $labels.namespace }}/{{ $labels.pod }} is down."
+
+    - alert: SeaweedFSFilerDown
+      expr: up{job=~".*seaweedfs.*", app="seaweedfs-filer"} == 0
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "SeaweedFS filer is down"
+        description: "SeaweedFS filer {{ $labels.namespace }}/{{ $labels.pod }} is down."
+
+    - alert: SeaweedFSDiskLow
+      expr: (seaweedfs_disk_free_bytes / (seaweedfs_disk_free_bytes + seaweedfs_disk_used_bytes)) < 0.15
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "SeaweedFS disk space low"
+        description: "SeaweedFS disk on {{ $labels.instance }} has less than 15% free space."