feat: add PrometheusRule alerts for all services
28 alert rules across 9 PrometheusRule files covering infrastructure (Longhorn, cert-manager), data (PostgreSQL, OpenBao, OpenSearch), storage (SeaweedFS), devtools (Gitea), identity (Hydra, Kratos), media (LiveKit), and mesh (Linkerd golden signals for all services). Severity routing: critical alerts fire to Matrix + email, warnings to Matrix only (AlertManager config updated in separate commit).
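For reference, a minimal sketch of that severity routing; the receiver names are assumptions, not the contents of the separate AlertManager commit:

route:
  receiver: matrix                # default: warnings and everything else reach Matrix
  routes:
    - matchers:
        - severity = critical
      receiver: matrix-and-email  # criticals additionally go to email
receivers:
  - name: matrix                  # hypothetical receiver names
  - name: matrix-and-email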
@@ -11,9 +11,9 @@ resources:
 - opensearch-deployment.yaml
 - opensearch-service.yaml
 - opensearch-pvc.yaml
 - openbao-keys-placeholder.yaml
 - barman-vault-secret.yaml
-- opensearch-servicemonitor.yaml
+# opensearch-servicemonitor.yaml removed — OpenSearch 3.x has no prometheus-exporter plugin.
+# TODO: add opensearch-exporter sidecar for Prometheus metrics.
+- opensearch-alertrules.yaml
+- postgres-alertrules.yaml
+- openbao-alertrules.yaml
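A possible shape for the exporter sidecar named in the TODO above; the image and its compatibility with OpenSearch's Elasticsearch-style API are assumptions, not a tested configuration:

# hypothetical sidecar container for the OpenSearch deployment
- name: metrics-exporter
  image: quay.io/prometheuscommunity/elasticsearch-exporter:v1.7.0  # assumed to work against OpenSearch
  args:
    - --es.uri=http://localhost:9200
  ports:
    - name: metrics
      containerPort: 9114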
base/data/openbao-alertrules.yaml (new file, 28 lines)
@@ -0,0 +1,28 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: openbao-alerts
  namespace: data
  labels:
    role: alert-rules
spec:
  groups:
    - name: openbao
      rules:
        - alert: VaultSealed
          expr: vault_core_unsealed == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "OpenBao/Vault is sealed"
            description: "OpenBao/Vault is sealed — automatic unseal may have failed."

        - alert: VaultDown
          expr: up{job=~".*openbao.*"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "OpenBao/Vault is down"
            description: "OpenBao instance {{ $labels.namespace }}/{{ $labels.pod }} is down."
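All nine files carry the role: alert-rules label; the Prometheus custom resource must select that label for any of these rules to load. A minimal sketch, assuming a Prometheus Operator setup (the CR name is hypothetical):

apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus           # assumed CR name
  namespace: monitoring
spec:
  ruleSelector:
    matchLabels:
      role: alert-rules      # matches every PrometheusRule in this commit
  ruleNamespaceSelector: {}  # pick up rules from all namespaces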
base/data/opensearch-alertrules.yaml (new file, 37 lines)
@@ -0,0 +1,37 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: opensearch-alerts
  namespace: data
  labels:
    role: alert-rules
spec:
  groups:
    - name: opensearch
      rules:
        - alert: OpenSearchClusterRed
          expr: opensearch_cluster_health_status{color="red"} == 1
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "OpenSearch cluster health is RED"
            description: "OpenSearch cluster {{ $labels.cluster }} health status is red."

        - alert: OpenSearchClusterYellow
          expr: opensearch_cluster_health_status{color="yellow"} == 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "OpenSearch cluster health is YELLOW"
            description: "OpenSearch cluster {{ $labels.cluster }} health status is yellow."

        - alert: OpenSearchHeapHigh
          expr: (opensearch_jvm_mem_heap_used_bytes / opensearch_jvm_mem_heap_max_bytes) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "OpenSearch JVM heap usage is high"
            description: "OpenSearch node {{ $labels.node }} in {{ $labels.namespace }} heap usage is above 85%."
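Note that per the kustomization comment above, OpenSearch currently exposes no Prometheus metrics, so these opensearch_* rules cannot fire until the exporter sidecar lands. A hedged companion rule to surface that gap (not part of this commit):

- alert: OpenSearchMetricsAbsent
  expr: absent(opensearch_cluster_health_status)
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: "OpenSearch metrics are absent"
    description: "No opensearch_cluster_health_status series found; the exporter sidecar may be missing or broken."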
base/data/postgres-alertrules.yaml (new file, 37 lines)
@@ -0,0 +1,37 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: postgres-alerts
  namespace: data
  labels:
    role: alert-rules
spec:
  groups:
    - name: postgres
      rules:
        - alert: PostgresDown
          expr: cnpg_collector_up == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL instance is down"
            description: "CNPG collector reports {{ $labels.namespace }}/{{ $labels.pod }} is down."

        - alert: PostgresDatabaseSizeLarge
          expr: cnpg_pg_database_size_bytes > 7e9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL database size is large"
            description: "Database {{ $labels.datname }} is {{ $value | humanize1024 }} (PVC limit 10Gi)."

        - alert: PostgresHighConnections
          expr: sum by (pod) (cnpg_pg_stat_activity_count) > 80
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL connection count is high"
            description: "Pod {{ $labels.pod }} has {{ $value }} active connections."
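The templated descriptions are easy to get wrong; a promtool unit-test sketch for PostgresDown, assuming the rule group is extracted into a plain rules file (file names are hypothetical):

# promtool test rules postgres-rules-test.yaml
rule_files:
  - postgres-rules.yaml            # extracted spec.groups, not the PrometheusRule wrapper
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      - series: 'cnpg_collector_up{namespace="data",pod="postgres-1"}'
        values: '0x5'              # down for five consecutive scrapes
    alert_rule_test:
      - eval_time: 3m              # past the 2m "for" window
        alertname: PostgresDown
        exp_alerts:
          - exp_labels:
              severity: critical
              namespace: data
              pod: postgres-1
            exp_annotations:
              summary: "PostgreSQL instance is down"
              description: "CNPG collector reports data/postgres-1 is down."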
base/devtools/gitea-alertrules.yaml (new file, 28 lines)
@@ -0,0 +1,28 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: gitea-alerts
  namespace: devtools
  labels:
    role: alert-rules
spec:
  groups:
    - name: gitea
      rules:
        - alert: GiteaDown
          expr: up{job=~".*gitea.*"} == 0
          for: 3m
          labels:
            severity: critical
          annotations:
            summary: "Gitea is down"
            description: "Gitea instance {{ $labels.namespace }}/{{ $labels.pod }} is down."

        - alert: GiteaHighGoroutines
          expr: go_goroutines{job=~".*gitea.*"} > 500
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Gitea goroutine count is high"
            description: "Gitea {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} goroutines."
@@ -7,6 +7,8 @@ resources:
 - namespace.yaml
 - vault-secrets.yaml
 - gitea-theme-cm.yaml
+- gitea-servicemonitor.yaml
+- gitea-alertrules.yaml
 
 helmCharts:
 # helm repo add gitea-charts https://dl.gitea.com/charts/
@@ -6,6 +6,10 @@ namespace: media
 resources:
 - namespace.yaml
 - vault-secrets.yaml
+- livekit-alertrules.yaml
+# livekit-servicemonitor.yaml disabled — LiveKit runs on hostNetwork and port 6789
+# is not reachable from Prometheus due to host firewall. Open port 6789 on the host
+# or add an iptables rule, then re-enable.
 
 helmCharts:
 # helm repo add livekit https://helm.livekit.io
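For when the port is opened, a sketch of what re-enabling could look like; the selector labels and port name are assumptions, since livekit-servicemonitor.yaml itself is not part of this diff:

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: livekit
  namespace: media
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: livekit-server  # assumed service label
  endpoints:
    - port: metrics                           # assumed name for port 6789
      interval: 30s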
base/media/livekit-alertrules.yaml (new file, 28 lines)
@@ -0,0 +1,28 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: livekit-alerts
  namespace: media
  labels:
    role: alert-rules
spec:
  groups:
    - name: livekit
      rules:
        - alert: LiveKitDown
          expr: up{job=~".*livekit.*"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "LiveKit is down"
            description: "LiveKit instance {{ $labels.namespace }}/{{ $labels.pod }} is down."

        - alert: LiveKitHighNACKRate
          expr: sum(rate(livekit_nack_total[5m])) > 100
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "LiveKit NACK rate is high"
            description: "LiveKit NACK rate is {{ $value }}/s, indicating potential media quality issues."
@@ -3,6 +3,7 @@ kind: Kustomization
 
 resources:
 - namespace.yaml
+- linkerd-alertrules.yaml
 
 # NOTE: Linkerd stable releases moved behind a commercial paywall in Feb 2024.
 # As of 2.15, stable artifacts are Buoyant Enterprise for Linkerd (BEL) only.
base/mesh/linkerd-alertrules.yaml (new file, 44 lines)
@@ -0,0 +1,44 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: linkerd-mesh-alerts
  namespace: mesh
  labels:
    role: alert-rules
spec:
  groups:
    - name: linkerd-mesh
      rules:
        - alert: ServiceHighErrorRate
          expr: |
            sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
              / sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
            > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Service has high error rate"
            description: "{{ $labels.deployment }} in {{ $labels.namespace }} has {{ $value | humanizePercentage }} error rate."

        - alert: ServiceHighErrorRateCritical
          expr: |
            sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
              / sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
            > 0.25
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Service has critically high error rate"
            description: "{{ $labels.deployment }} in {{ $labels.namespace }} has {{ $value | humanizePercentage }} error rate."

        - alert: ServiceHighLatency
          expr: |
            histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{direction="inbound"}[5m])) by (le, deployment, namespace)) > 2000
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Service has high p95 latency"
            description: "{{ $labels.deployment }} in {{ $labels.namespace }} p95 latency is {{ $value }}ms."
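The two error-rate alerts evaluate the same ratio at different thresholds; a recording rule could compute it once and let both alerts reference the result. A sketch, not part of this commit (the record name is hypothetical):

- name: linkerd-mesh-records
  rules:
    - record: deployment:response_errors:ratio_rate5m
      expr: |
        sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
          / sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)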
base/monitoring/alertrules-infrastructure.yaml (new file, 55 lines)
@@ -0,0 +1,55 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: infrastructure-alerts
  namespace: monitoring
  labels:
    role: alert-rules
spec:
  groups:
    - name: infrastructure
      rules:
        - alert: LonghornDiskSpaceLow
          expr: (longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) > 0.90
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Longhorn disk space critically low"
            description: "Longhorn disk on {{ $labels.node }} is over 90% full."

        - alert: LonghornVolumeSpaceLow
          expr: (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) > 0.85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Longhorn volume space low"
            description: "Longhorn volume {{ $labels.volume }} is over 85% full."

        - alert: CertExpiringCritical
          expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 86400
          for: 10m
          labels:
            severity: critical
          annotations:
            summary: "Certificate expiring in less than 24 hours"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 24 hours."

        - alert: CertExpiringSoon
          expr: (certmanager_certificate_expiration_timestamp_seconds - time()) < 604800
          for: 30m
          labels:
            severity: warning
          annotations:
            summary: "Certificate expiring in less than 7 days"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} expires in less than 7 days."

        - alert: CertNotReady
          expr: certmanager_certificate_ready_status{condition="True"} != 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Certificate not ready"
            description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} is not in a ready state."
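Within 24 hours of expiry both certificate alerts match the same cert; an AlertManager inhibit rule can mute the warning while the critical fires. A sketch (assumed config, not part of this commit):

inhibit_rules:
  - source_matchers:
      - alertname = CertExpiringCritical
    target_matchers:
      - alertname = CertExpiringSoon
    equal: ["name", "namespace"]   # only inhibit for the same certificate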
@@ -13,6 +13,9 @@ resources:
 # Hydra chart CRDs are not rendered by helm template; apply manually.
 - hydra-oauth2client-crd.yaml
 - vault-secrets.yaml
+- ory-alertrules.yaml
+- hydra-servicemonitor.yaml
+- kratos-servicemonitor.yaml
 
 patches:
 # Set Kratos selfservice UI URLs (DOMAIN_SUFFIX substituted at apply time).
base/ory/ory-alertrules.yaml (new file, 46 lines)
@@ -0,0 +1,46 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: ory-alerts
  namespace: ory
  labels:
    role: alert-rules
spec:
  groups:
    - name: ory
      rules:
        - alert: HydraDown
          expr: up{job=~".*hydra.*"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Hydra is down"
            description: "Hydra instance {{ $labels.namespace }}/{{ $labels.pod }} is down."

        - alert: KratosDown
          expr: up{job=~".*kratos.*"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Kratos is down"
            description: "Kratos instance {{ $labels.namespace }}/{{ $labels.pod }} is down."

        - alert: HydraHighErrorRate
          expr: sum(rate(http_requests_total{job=~".*hydra.*",code=~"5.."}[5m])) / sum(rate(http_requests_total{job=~".*hydra.*"}[5m])) > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Hydra has a high HTTP error rate"
            description: "Hydra 5xx error rate is {{ $value | humanizePercentage }}."

        - alert: KratosHighErrorRate
          expr: sum(rate(http_requests_total{job=~".*kratos.*",code=~"5.."}[5m])) / sum(rate(http_requests_total{job=~".*kratos.*"}[5m])) > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Kratos has a high HTTP error rate"
            description: "Kratos 5xx error rate is {{ $value | humanizePercentage }}."
@@ -12,3 +12,5 @@ resources:
 - seaweedfs-filer-pvc.yaml
 - vault-secrets.yaml
 - seaweedfs-remote-sync.yaml
+- seaweedfs-servicemonitor.yaml
+- seaweedfs-alertrules.yaml
base/storage/seaweedfs-alertrules.yaml (new file, 46 lines)
@@ -0,0 +1,46 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: seaweedfs-alerts
  namespace: storage
  labels:
    role: alert-rules
spec:
  groups:
    - name: seaweedfs
      rules:
        - alert: SeaweedFSMasterDown
          expr: up{job=~".*seaweedfs.*", app="seaweedfs-master"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "SeaweedFS master is down"
            description: "SeaweedFS master {{ $labels.namespace }}/{{ $labels.pod }} is down."

        - alert: SeaweedFSVolumeDown
          expr: up{job=~".*seaweedfs.*", app="seaweedfs-volume"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "SeaweedFS volume server is down"
            description: "SeaweedFS volume server {{ $labels.namespace }}/{{ $labels.pod }} is down."

        - alert: SeaweedFSFilerDown
          expr: up{job=~".*seaweedfs.*", app="seaweedfs-filer"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "SeaweedFS filer is down"
            description: "SeaweedFS filer {{ $labels.namespace }}/{{ $labels.pod }} is down."

        - alert: SeaweedFSDiskLow
          expr: (seaweedfs_disk_free_bytes / (seaweedfs_disk_free_bytes + seaweedfs_disk_used_bytes)) < 0.15
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "SeaweedFS disk space low"
            description: "SeaweedFS disk on {{ $labels.instance }} has less than 15% free space."