feat: add PrometheusRule alerts for all services
28 alert rules across 9 PrometheusRule files covering infrastructure (Longhorn, cert-manager), data (PostgreSQL, OpenBao, OpenSearch), storage (SeaweedFS), devtools (Gitea), identity (Hydra, Kratos), media (LiveKit), and mesh (Linkerd golden signals for all services). Severity routing: critical alerts fire to Matrix + email, warnings to Matrix only (AlertManager config updated in separate commit).
This commit is contained in:
@@ -11,9 +11,9 @@ resources:
|
||||
- opensearch-deployment.yaml
|
||||
- opensearch-service.yaml
|
||||
- opensearch-pvc.yaml
|
||||
- openbao-keys-placeholder.yaml
|
||||
- barman-vault-secret.yaml
|
||||
- opensearch-servicemonitor.yaml
|
||||
# opensearch-servicemonitor.yaml removed — OpenSearch 3.x has no prometheus-exporter plugin.
|
||||
# TODO: add opensearch-exporter sidecar for Prometheus metrics.
|
||||
- opensearch-alertrules.yaml
|
||||
- postgres-alertrules.yaml
|
||||
- openbao-alertrules.yaml
|
||||
|
||||
28
base/data/openbao-alertrules.yaml
Normal file
28
base/data/openbao-alertrules.yaml
Normal file
@@ -0,0 +1,28 @@
|
||||
---
# PrometheusRule holding the OpenBao alerts: one rule for a sealed instance
# and one for a scrape target reporting down. Both are severity "critical".
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: openbao-alerts
  namespace: data
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches
    # on — confirm against the Prometheus custom resource.
    role: alert-rules
spec:
  groups:
    - name: openbao
      rules:
        # Fires once vault_core_unsealed has read 0 for a full minute.
        - alert: VaultSealed
          expr: 'vault_core_unsealed == 0'
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: 'OpenBao/Vault is sealed'
            description: 'OpenBao/Vault is sealed — automatic unseal may have failed'

        # Fires once any scrape job whose name contains "openbao" has
        # reported up == 0 for two minutes.
        - alert: VaultDown
          expr: 'up{job=~".*openbao.*"} == 0'
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: 'OpenBao/Vault is down'
            description: 'OpenBao instance {{ $labels.namespace }}/{{ $labels.pod }} is down.'
|
||||
37
base/data/opensearch-alertrules.yaml
Normal file
37
base/data/opensearch-alertrules.yaml
Normal file
@@ -0,0 +1,37 @@
|
||||
---
# PrometheusRule holding the OpenSearch alerts: cluster health red (critical),
# cluster health yellow (warning) and JVM heap above 85% (warning).
#
# NOTE(review): the kustomization change in the same commit states that
# OpenSearch 3.x has no prometheus-exporter plugin and drops the
# ServiceMonitor. Confirm that something actually exports the opensearch_*
# series used below; until then these rules cannot fire.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: opensearch-alerts
  namespace: data
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches
    # on — confirm against the Prometheus custom resource.
    role: alert-rules
spec:
  groups:
    - name: opensearch
      rules:
        # The color="red" health series has been 1 for two minutes.
        - alert: OpenSearchClusterRed
          expr: 'opensearch_cluster_health_status{color="red"} == 1'
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: 'OpenSearch cluster health is RED'
            description: 'OpenSearch cluster {{ $labels.cluster }} health status is red.'

        # The color="yellow" health series has been 1 for ten minutes.
        - alert: OpenSearchClusterYellow
          expr: 'opensearch_cluster_health_status{color="yellow"} == 1'
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: 'OpenSearch cluster health is YELLOW'
            description: 'OpenSearch cluster {{ $labels.cluster }} health status is yellow.'

        # Used heap divided by max heap has stayed above 0.85 for five minutes.
        - alert: OpenSearchHeapHigh
          expr: '(opensearch_jvm_mem_heap_used_bytes / opensearch_jvm_mem_heap_max_bytes) > 0.85'
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'OpenSearch JVM heap usage is high'
            description: 'OpenSearch node {{ $labels.node }} in {{ $labels.namespace }} heap usage is above 85%.'
|
||||
37
base/data/postgres-alertrules.yaml
Normal file
37
base/data/postgres-alertrules.yaml
Normal file
@@ -0,0 +1,37 @@
|
||||
---
# PrometheusRule holding the PostgreSQL (CNPG) alerts: collector reporting
# down (critical), database size above 7e9 bytes (warning) and more than 80
# connections on a pod (warning).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: postgres-alerts
  namespace: data
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches
    # on — confirm against the Prometheus custom resource.
    role: alert-rules
spec:
  groups:
    - name: postgres
      rules:
        # cnpg_collector_up has read 0 for two minutes.
        - alert: PostgresDown
          expr: cnpg_collector_up == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "PostgreSQL instance is down"
            description: "CNPG collector reports {{ $labels.namespace }}/{{ $labels.pod }} is down."

        # A database has been larger than 7e9 bytes for five minutes; the
        # description quotes a 10Gi PVC limit as the reference point.
        - alert: PostgresDatabaseSizeLarge
          expr: cnpg_pg_database_size_bytes > 7e9
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL database size is large"
            description: "Database {{ $labels.datname }} is {{ $value | humanize1024 }} (PVC limit 10Gi)"

        # Aggregate per (namespace, pod) rather than per pod alone: grouping
        # by pod only drops the namespace label, so identically named pods in
        # different namespaces would be summed into one series and the alert
        # could not say which namespace is affected.
        - alert: PostgresHighConnections
          expr: sum by (namespace, pod) (cnpg_pg_stat_activity_count) > 80
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PostgreSQL connection count is high"
            description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has {{ $value }} active connections."
|
||||
Reference in New Issue
Block a user