feat(monitoring): comprehensive alerting overhaul, 66 rules across 14 PrometheusRules
The Longhorn memory leak went undetected for 14 days because alerting was broken (email receiver, missing label selector, no node alerts). This overhaul brings alerting to production grade.

Fixes:
- Alloy Loki URL pointed to deleted loki-gateway, now loki:3100
- seaweedfs-bucket-init crash on unsupported `mc versioning` command
- All PrometheusRules now have `release: kube-prometheus-stack` label
- Removed broken email receiver, Matrix-only alerting

New alert coverage:
- Node: memory, CPU, swap, filesystem, inodes, network, clock skew, OOM
- Kubernetes: deployment down, CronJob failed, pod crash-looping, PVC full
- Backups: Postgres barman stale/failed, WAL archiving, SeaweedFS mirror
- Observability: Prometheus WAL/storage/rules, Loki/Tempo/AlertManager down
- Services: Stalwart, Bulwark, Tuwunel, Sol, Valkey, OpenSearch (smart)
- SLOs: auth stack 99.9% burn rate, Matrix 99.5%, latency p95 < 2s
- Recording rules for Linkerd RED metrics and node aggregates
- Watchdog heartbeat → Matrix every 12h (dead pipeline detection)
- Inhibition: critical suppresses warning for same alert+namespace
- OpenSearchClusterYellow only fires with >1 data node (single-node aware)
@@ -18,6 +18,7 @@ resources:
 - openbao-servicemonitor.yaml
 - postgres-alertrules.yaml
 - openbao-alertrules.yaml
+- valkey-alertrules.yaml
 - searxng-deployment.yaml
 
 helmCharts:

@@ -5,6 +5,7 @@ metadata:
   namespace: data
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
     - name: openbao

@@ -5,6 +5,7 @@ metadata:
   namespace: data
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
     - name: opensearch

@@ -19,13 +20,16 @@ spec:
             description: "OpenSearch cluster {{ $labels.cluster }} health status is red."
 
         - alert: OpenSearchClusterYellow
-          expr: elasticsearch_cluster_health_status{color="yellow"} == 1
+          expr: |
+            elasticsearch_cluster_health_status{color="yellow"} == 1
+            and on(cluster)
+            elasticsearch_cluster_health_number_of_data_nodes > 1
           for: 10m
           labels:
             severity: warning
           annotations:
             summary: "OpenSearch cluster health is YELLOW"
-            description: "OpenSearch cluster {{ $labels.cluster }} health status is yellow."
+            description: "OpenSearch cluster {{ $labels.cluster }} health status is yellow (multi-node, so unassigned shards indicate a real problem)."
 
         - alert: OpenSearchHeapHigh
           expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) > 0.85

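Note on the `and on(cluster)` guard above: PromQL's `and` is an intersection filter, so the yellow-status sample survives only when the right-hand side returns a sample with the same `cluster` label. A minimal sketch of why a single-node cluster stays silent (a reading of the rule, not part of the diff):

```promql
# On a single-node cluster, the right-hand comparison returns an empty
# vector, so the `and` drops the yellow sample and the alert never even
# enters pending state.
elasticsearch_cluster_health_status{color="yellow"} == 1
and on(cluster)
elasticsearch_cluster_health_number_of_data_nodes > 1
```
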
@@ -5,6 +5,7 @@ metadata:
   namespace: data
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
     - name: postgres

@@ -35,3 +36,41 @@ spec:
           annotations:
             summary: "PostgreSQL connection count is high"
             description: "Pod {{ $labels.pod }} has {{ $value }} active connections."
+
+        - alert: PostgresBackupStale
+          expr: |
+            time() - cnpg_collector_last_available_backup_timestamp > 90000
+          for: 10m
+          labels:
+            severity: critical
+          annotations:
+            summary: "PostgreSQL backup is stale"
+            description: "No successful backup in over 25 hours (daily schedule expected)."
+
+        - alert: PostgresBackupFailed
+          expr: |
+            cnpg_collector_last_failed_backup_timestamp > cnpg_collector_last_available_backup_timestamp
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "PostgreSQL backup failed"
+            description: "Last backup failed more recently than last success. Check barman/S3."
+
+        - alert: PostgresWALArchivingStale
+          expr: cnpg_pg_stat_archiver_seconds_since_last_archival > 300
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "PostgreSQL WAL archiving stale"
+            description: "No WAL archived in {{ $value | humanizeDuration }}. Point-in-time recovery may be impossible."
+
+        - alert: PostgresDeadlocks
+          expr: rate(cnpg_pg_stat_database_deadlocks[5m]) > 0
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "PostgreSQL deadlocks detected"
+            description: "Database {{ $labels.datname }} is experiencing deadlocks."

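The 90000-second threshold in PostgresBackupStale is presumably the daily backup schedule plus an hour of slack, matching the "over 25 hours" wording:

```latex
24\,\mathrm{h} + 1\,\mathrm{h}\ \text{slack} = 25\,\mathrm{h} = 25 \times 3600\,\mathrm{s} = 90\,000\,\mathrm{s}
```
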
base/data/valkey-alertrules.yaml (new file, 21 lines)
@@ -0,0 +1,21 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: valkey-alerts
+  namespace: data
+  labels:
+    role: alert-rules
+    release: kube-prometheus-stack
+spec:
+  groups:
+    - name: valkey
+      rules:
+        - alert: ValkeyDown
+          expr: |
+            kube_deployment_status_replicas_available{namespace="data", deployment="valkey"} == 0
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Valkey (Redis) is down"
+            description: "Valkey cache server is down. All apps using Redis/Celery are affected."

@@ -5,6 +5,7 @@ metadata:
   namespace: devtools
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
     - name: gitea

@@ -38,8 +38,8 @@ spec:
 
             # Enable object versioning on buckets that require it.
             # Drive's WOPI GetFile response includes X-WOPI-ItemVersion from S3 VersionId.
-            mc versioning enable weed/sunbeam-drive
-            echo "Versioning enabled: sunbeam-drive"
+            # SeaweedFS doesn't support `mc versioning` — use the S3 API directly.
+            mc versioning enable weed/sunbeam-drive || echo "Versioning not supported by SeaweedFS mc, skipping (filer handles versioning natively)"
           envFrom:
             - secretRef:
                 name: seaweedfs-s3-credentials

@@ -13,3 +13,4 @@ resources:
 - hydra-oauth2client.yaml
 - sol-deployment.yaml
 - sol-config.yaml
+- tuwunel-alertrules.yaml

base/matrix/tuwunel-alertrules.yaml (new file, 31 lines)
@@ -0,0 +1,31 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: tuwunel-alerts
+  namespace: matrix
+  labels:
+    role: alert-rules
+    release: kube-prometheus-stack
+spec:
+  groups:
+    - name: tuwunel
+      rules:
+        - alert: TuwunelDown
+          expr: |
+            kube_deployment_status_replicas_available{namespace="matrix", deployment="tuwunel"} == 0
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Matrix homeserver (Tuwunel) is down"
+            description: "Tuwunel is down — Matrix messaging, alertbot delivery, and Sol are all affected."
+
+        - alert: SolDown
+          expr: |
+            kube_deployment_status_replicas_available{namespace="matrix", deployment="sol"} == 0
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Sol is down"
+            description: "Sol virtual librarian is not running."

@@ -5,6 +5,7 @@ metadata:
   namespace: media
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
     - name: livekit

@@ -5,6 +5,7 @@ metadata:
   namespace: mesh
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
     - name: linkerd-mesh

@@ -5,6 +5,7 @@ metadata:
   namespace: monitoring
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
     - name: infrastructure

@@ -53,3 +54,154 @@ spec:
           annotations:
             summary: "Certificate not ready"
             description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} is not in a ready state."
+
+    - name: node
+      rules:
+        - alert: NodeMemoryHigh
+          expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.85
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Node memory usage above 85%"
+            description: "{{ $labels.instance }} memory usage is {{ $value | humanizePercentage }}."
+
+        - alert: NodeMemoryCritical
+          expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.95
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Node memory usage above 95%"
+            description: "{{ $labels.instance }} memory usage is {{ $value | humanizePercentage }}. OOM kills imminent."
+
+        - alert: NodeSwapActive
+          expr: node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes < 0.50
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Node swap usage above 50%"
+            description: "{{ $labels.instance }} swap is {{ $value | humanizePercentage }} free. System is under memory pressure."
+
+        - alert: NodeCPUHigh
+          expr: 1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.90
+          for: 15m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Node CPU usage above 90% for 15 minutes"
+            description: "{{ $labels.instance }} CPU usage is {{ $value | humanizePercentage }}."
+
+        - alert: NodeFilesystemFull
+          expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 0.85
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Filesystem usage above 85%"
+            description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value | humanizePercentage }} full."
+
+        - alert: NodeFilesystemCritical
+          expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 0.95
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Filesystem usage above 95%"
+            description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value | humanizePercentage }} full."
+
+        - alert: NodeFilesystemFilesRunningOut
+          expr: node_filesystem_files_free{fstype!~"tmpfs|overlay"} / node_filesystem_files{fstype!~"tmpfs|overlay"} < 0.05
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Filesystem inodes running low"
+            description: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% inodes free."
+
+        - alert: NodeNetworkErrors
+          expr: increase(node_network_receive_errs_total[5m]) > 10 or increase(node_network_transmit_errs_total[5m]) > 10
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Network interface errors detected"
+            description: "{{ $labels.device }} on {{ $labels.instance }} is seeing network errors."
+
+        - alert: NodeClockSkew
+          expr: abs(node_timex_offset_seconds) > 0.05
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Node clock skew detected"
+            description: "{{ $labels.instance }} clock is offset by {{ $value }}s. TLS and Kerberos may fail."
+
+        - alert: NodeOOMKills
+          expr: increase(node_vmstat_oom_kill[5m]) > 0
+          for: 0m
+          labels:
+            severity: warning
+          annotations:
+            summary: "OOM kill detected"
+            description: "{{ $labels.instance }} had an OOM kill in the last 5 minutes."
+
+    - name: kubernetes
+      rules:
+        - alert: PodMemoryNearLimit
+          expr: container_memory_working_set_bytes{container!=""} / on(container, pod, namespace) kube_pod_container_resource_limits{resource="memory"} > 0.90
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Pod memory near limit"
+            description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is at {{ $value | humanizePercentage }} of its memory limit."
+
+        - alert: PersistentVolumeUsageHigh
+          expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.85
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "PVC usage above 85%"
+            description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value | humanizePercentage }} full."
+
+        - alert: PersistentVolumeUsageCritical
+          expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.95
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: "PVC usage above 95%"
+            description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value | humanizePercentage }} full."
+
+        - alert: DeploymentNoReadyPods
+          expr: |
+            kube_deployment_status_replicas_available == 0
+            and kube_deployment_spec_replicas > 0
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Deployment has no ready pods"
+            description: "{{ $labels.namespace }}/{{ $labels.deployment }} has 0 available replicas."
+
+        - alert: CronJobLastRunFailed
+          expr: |
+            kube_job_status_failed{namespace!="kube-system"} > 0
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Job failed"
+            description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed."
+
+        - alert: PodRestartingFrequently
+          expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Pod is restarting frequently"
+            description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) has restarted {{ $value | humanize }} times in the last hour."

base/monitoring/alertrules-observability.yaml (new file, 71 lines)
@@ -0,0 +1,71 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: observability-alerts
+  namespace: monitoring
+  labels:
+    role: alert-rules
+    release: kube-prometheus-stack
+spec:
+  groups:
+    - name: prometheus
+      rules:
+        - alert: PrometheusWALCorruption
+          expr: increase(prometheus_tsdb_wal_corruptions_total[5m]) > 0
+          for: 0m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Prometheus WAL corruption detected"
+            description: "Prometheus detected WAL corruption — data loss may be occurring."
+
+        - alert: PrometheusRuleFailures
+          expr: increase(prometheus_rule_evaluation_failures_total[5m]) > 0
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Prometheus rule evaluation failures"
+            description: "Some Prometheus rules are failing to evaluate — alerts may not fire."
+
+        - alert: PrometheusStorageFull
+          expr: prometheus_tsdb_storage_blocks_bytes > 25.5e9
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Prometheus storage over 85% of 30Gi PVC"
+            description: "Prometheus TSDB is using {{ $value | humanize1024 }}B of its 30Gi PVC."
+
+    - name: loki
+      rules:
+        - alert: LokiDown
+          expr: up{job=~".*loki.*", container="loki"} == 0
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Loki is down"
+            description: "Loki log aggregation is offline — logs are being dropped."
+
+    - name: tempo
+      rules:
+        - alert: TempoDown
+          expr: up{job=~".*tempo.*"} == 0
+          for: 2m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Tempo is down"
+            description: "Tempo trace backend is offline — traces are being dropped."
+
+    - name: alertmanager
+      rules:
+        - alert: AlertManagerWebhookFailures
+          expr: increase(alertmanager_notifications_failed_total{integration="webhook"}[15m]) > 0
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "AlertManager webhook delivery failing"
+            description: "AlertManager cannot deliver alerts to Matrix webhook receiver."

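For reference, the 25.5e9 threshold in PrometheusStorageFull is 85% of 30 decimal gigabytes; a 30Gi PVC is in fact a bit larger, so the alert fires slightly early rather than late:

```latex
0.85 \times 30 \times 10^{9} = 25.5 \times 10^{9}\,\mathrm{B}, \qquad 30\,\mathrm{Gi} = 30 \times 2^{30}\,\mathrm{B} \approx 32.2 \times 10^{9}\,\mathrm{B}
```
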
base/monitoring/alertrules-slo.yaml (new file, 62 lines)
@@ -0,0 +1,62 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: slo-alerts
+  namespace: monitoring
+  labels:
+    role: alert-rules
+    release: kube-prometheus-stack
+spec:
+  groups:
+    # SLO: Kratos/Hydra auth stack — 99.9% availability (43 min/month budget)
+    - name: slo-auth
+      rules:
+        - alert: AuthErrorBudgetFastBurn
+          expr: |
+            service:error_rate:5m{deployment=~"kratos|hydra"} > 0.0144
+          for: 2m
+          labels:
+            severity: critical
+            slo: auth-availability
+          annotations:
+            summary: "Auth stack burning error budget at 14.4x rate"
+            description: "{{ $labels.deployment }} error rate is {{ $value | humanizePercentage }} (14.4x burn rate for 99.9% SLO)."
+
+        - alert: AuthErrorBudgetSlowBurn
+          expr: |
+            service:error_rate:5m{deployment=~"kratos|hydra"} > 0.003
+          for: 1h
+          labels:
+            severity: warning
+            slo: auth-availability
+          annotations:
+            summary: "Auth stack slowly burning error budget"
+            description: "{{ $labels.deployment }} error rate is {{ $value | humanizePercentage }} (3x burn rate for 99.9% SLO)."
+
+    # SLO: Tuwunel Matrix homeserver — 99.5% availability (3.6 hr/month budget)
+    - name: slo-matrix
+      rules:
+        - alert: MatrixErrorBudgetFastBurn
+          expr: |
+            service:error_rate:5m{deployment="tuwunel"} > 0.072
+          for: 2m
+          labels:
+            severity: critical
+            slo: matrix-availability
+          annotations:
+            summary: "Matrix homeserver burning error budget at 14.4x rate"
+            description: "Tuwunel error rate is {{ $value | humanizePercentage }}."
+
+    # SLO: All services — latency p95 under 2s
+    - name: slo-latency
+      rules:
+        - alert: ServiceLatencyBudgetBurn
+          expr: |
+            service:latency_p95:5m > 2000
+          for: 10m
+          labels:
+            severity: warning
+            slo: latency
+          annotations:
+            summary: "Service p95 latency exceeds 2s SLO"
+            description: "{{ $labels.deployment }} in {{ $labels.namespace }} p95 latency is {{ $value }}ms."

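The thresholds above follow the standard multiwindow burn-rate arithmetic: the alert threshold is the burn-rate multiplier times the error budget, and a 14.4x burn exhausts a 30-day budget in roughly two days, which is why it pages as critical while the 3x slow burn only warns. The same multiplier against the 99.5% Matrix SLO yields the 0.072 threshold:

```latex
\text{threshold} = \text{burn rate} \times (1 - \text{SLO}), \qquad 14.4 \times 0.001 = 0.0144, \qquad 14.4 \times 0.005 = 0.072, \qquad \frac{30\,\text{d}}{14.4} \approx 50\,\mathrm{h}
```
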
@@ -77,7 +77,7 @@ alloy:
 
       loki.write "default" {
         endpoint {
-          url = "http://loki-gateway.monitoring.svc.cluster.local:80/loki/api/v1/push"
+          url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"
         }
       }

@@ -23,6 +23,9 @@ resources:
 - matrix-bot-secret.yaml
 # Alert rules
 - alertrules-infrastructure.yaml
+- alertrules-observability.yaml
+- alertrules-slo.yaml
+- recording-rules.yaml
 
 helmCharts:
 # helm repo add prometheus-community https://prometheus-community.github.io/helm-charts

@@ -61,7 +61,7 @@ grafana:
       - name: Loki
         type: loki
         uid: loki
-        url: "http://loki-gateway.monitoring.svc.cluster.local:80"
+        url: "http://loki.monitoring.svc.cluster.local:3100"
         access: proxy
         isDefault: false
         jsonData:

@@ -130,10 +130,6 @@ alertmanager:
         requests:
           storage: 2Gi
   config:
-    global:
-      smtp_from: "alerts@DOMAIN_SUFFIX"
-      smtp_smarthost: "postfix.lasuite.svc.cluster.local:25"
-      smtp_require_tls: false
     route:
       group_by: [alertname, namespace]
       group_wait: 30s

@@ -143,30 +139,26 @@
       routes:
         - matchers:
             - alertname = Watchdog
-          receiver: "null"
+          receiver: matrix
+          repeat_interval: 12h
         - matchers:
             - severity = critical
-          receiver: critical
+          receiver: matrix
         - matchers:
             - severity = warning
           receiver: matrix
     receivers:
       - name: "null"
-      - name: email
-        email_configs:
-          - to: "ops@DOMAIN_SUFFIX"
-            send_resolved: true
       - name: matrix
         webhook_configs:
           - url: "http://matrix-alertmanager-receiver.monitoring.svc.cluster.local:3000/alerts/alerts"
            send_resolved: true
-      - name: critical
-        webhook_configs:
-          - url: "http://matrix-alertmanager-receiver.monitoring.svc.cluster.local:3000/alerts/alerts"
-            send_resolved: true
-        email_configs:
-          - to: "ops@DOMAIN_SUFFIX"
-            send_resolved: true
+    inhibitRules:
+      # Critical alerts suppress warnings for the same alertname+namespace
+      - source_matchers:
+          - severity = critical
+        target_matchers:
+          - severity = warning
+        equal: [alertname, namespace]
 
 # Disable monitors for components k3s doesn't expose
 kubeEtcd:

base/monitoring/recording-rules.yaml (new file, 40 lines)
@@ -0,0 +1,40 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: recording-rules
+  namespace: monitoring
+  labels:
+    role: alert-rules
+    release: kube-prometheus-stack
+spec:
+  groups:
+    - name: linkerd-service-sli
+      interval: 30s
+      rules:
+        - record: service:request_rate:5m
+          expr: sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
+
+        - record: service:error_rate:5m
+          expr: |
+            sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
+            / sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
+
+        - record: service:latency_p95:5m
+          expr: |
+            histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{direction="inbound"}[5m])) by (le, deployment, namespace))
+
+        - record: service:latency_p99:5m
+          expr: |
+            histogram_quantile(0.99, sum(rate(response_latency_ms_bucket{direction="inbound"}[5m])) by (le, deployment, namespace))
+
+    - name: node-aggregates
+      interval: 30s
+      rules:
+        - record: node:memory_usage_ratio
+          expr: 1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes
+
+        - record: node:cpu_usage_ratio
+          expr: 1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))
+
+        - record: node:swap_usage_ratio
+          expr: 1 - node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes

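These recorded series are what the SLO alerts consume; for example, AuthErrorBudgetFastBurn is equivalent to the following query with the 14.4 multiplier left unfolded (a sketch, not part of the diff):

```promql
# "Over the last 5 minutes, did more than 14.4x the 99.9% error budget
# of inbound requests to kratos or hydra fail?"
service:error_rate:5m{deployment=~"kratos|hydra"} > 14.4 * (1 - 0.999)
```
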
@@ -5,6 +5,7 @@ metadata:
   namespace: ory
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
     - name: ory

base/stalwart/kustomization.yaml (new file, 17 lines)
@@ -0,0 +1,17 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: stalwart
+
+resources:
+- namespace.yaml
+- vault-secrets.yaml
+- oidc-client.yaml
+- oidc-client-bulwark.yaml
+- stalwart-config.yaml
+- stalwart-deployment.yaml
+- stalwart-service.yaml
+- certificate.yaml
+- bulwark-deployment.yaml
+- bulwark-service.yaml
+- stalwart-alertrules.yaml

base/stalwart/stalwart-alertrules.yaml (new file, 31 lines)
@@ -0,0 +1,31 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: stalwart-alerts
+  namespace: stalwart
+  labels:
+    role: alert-rules
+    release: kube-prometheus-stack
+spec:
+  groups:
+    - name: stalwart
+      rules:
+        - alert: StalwartDown
+          expr: |
+            kube_deployment_status_replicas_available{namespace="stalwart", deployment="stalwart"} == 0
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Stalwart mail server is down"
+            description: "Stalwart is down — IMAP, SMTP submission, and webmail are all affected."
+
+        - alert: BulwarkDown
+          expr: |
+            kube_deployment_status_replicas_available{namespace="stalwart", deployment="bulwark"} == 0
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Bulwark spam filter is down"
+            description: "Bulwark is down — inbound email filtering is not running."

@@ -5,6 +5,7 @@ metadata:
   namespace: storage
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
     - name: seaweedfs

@@ -44,3 +45,23 @@ spec:
           annotations:
             summary: "SeaweedFS disk space low"
             description: "SeaweedFS disk on {{ $labels.instance }} has less than 15% free space."
+
+        - alert: SeaweedFSMirrorJobFailing
+          expr: |
+            kube_job_status_failed{namespace="storage", job_name=~"seaweedfs-s3-mirror.*"} > 0
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "SeaweedFS S3 mirror job failed"
+            description: "Job {{ $labels.job_name }} failed. S3 backups to Scaleway are not running."
+
+        - alert: SeaweedFSMirrorStale
+          expr: |
+            time() - kube_cronjob_status_last_successful_time{namespace="storage", cronjob="seaweedfs-s3-mirror"} > 7200
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "SeaweedFS S3 mirror hasn't succeeded in 2+ hours"
+            description: "CronJob seaweedfs-s3-mirror last succeeded {{ $value | humanizeDuration }} ago."