From e4987b4c582e03a9a7954c80563604de085b3566 Mon Sep 17 00:00:00 2001
From: Sienna Meridian Satterwhite
Date: Mon, 6 Apr 2026 15:52:06 +0100
Subject: [PATCH] feat(monitoring): comprehensive alerting overhaul, 66 rules
 across 14 PrometheusRules
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Longhorn memory leak went undetected for 14 days because alerting
was broken (email receiver, missing label selector, no node alerts).
This overhaul brings alerting to production grade.

Fixes:
- Alloy Loki URL pointed to deleted loki-gateway, now loki:3100
- seaweedfs-bucket-init crash on unsupported `mc versioning` command
- All PrometheusRules now have `release: kube-prometheus-stack` label
- Removed broken email receiver, Matrix-only alerting

New alert coverage:
- Node: memory, CPU, swap, filesystem, inodes, network, clock skew, OOM
- Kubernetes: deployment down, CronJob failed, pod crash-looping, PVC full
- Backups: Postgres barman stale/failed, WAL archiving, SeaweedFS mirror
- Observability: Prometheus WAL/storage/rules, Loki/Tempo/AlertManager down
- Services: Stalwart, Bulwark, Tuwunel, Sol, Valkey, OpenSearch (smart)
- SLOs: auth stack 99.9% burn rate, Matrix 99.5%, latency p95 < 2s
- Recording rules for Linkerd RED metrics and node aggregates
- Watchdog heartbeat → Matrix every 12h (dead pipeline detection)
- Inhibition: critical suppresses warning for same alert+namespace
- OpenSearchClusterYellow only fires with >1 data node (single-node aware)
---
 base/data/kustomization.yaml                  |   1 +
 base/data/openbao-alertrules.yaml             |   1 +
 base/data/opensearch-alertrules.yaml          |   8 +-
 base/data/postgres-alertrules.yaml            |  39 +++++
 base/data/valkey-alertrules.yaml              |  21 +++
 base/devtools/gitea-alertrules.yaml           |   1 +
 base/lasuite/seaweedfs-buckets.yaml           |   4 +-
 base/matrix/kustomization.yaml                |   1 +
 base/matrix/tuwunel-alertrules.yaml           |  31 ++++
 base/media/livekit-alertrules.yaml            |   1 +
 base/mesh/linkerd-alertrules.yaml             |   1 +
 .../monitoring/alertrules-infrastructure.yaml | 152 ++++++++++++++++++
 base/monitoring/alertrules-observability.yaml |  71 ++++++++
 base/monitoring/alertrules-slo.yaml           |  62 +++++++
 base/monitoring/alloy-values.yaml             |   2 +-
 base/monitoring/kustomization.yaml            |   3 +
 base/monitoring/prometheus-values.yaml        |  30 ++--
 base/monitoring/recording-rules.yaml          |  40 +++++
 base/ory/ory-alertrules.yaml                  |   1 +
 base/stalwart/kustomization.yaml              |  17 ++
 base/stalwart/stalwart-alertrules.yaml        |  31 ++++
 base/storage/seaweedfs-alertrules.yaml        |  21 +++
 22 files changed, 515 insertions(+), 24 deletions(-)
 create mode 100644 base/data/valkey-alertrules.yaml
 create mode 100644 base/matrix/tuwunel-alertrules.yaml
 create mode 100644 base/monitoring/alertrules-observability.yaml
 create mode 100644 base/monitoring/alertrules-slo.yaml
 create mode 100644 base/monitoring/recording-rules.yaml
 create mode 100644 base/stalwart/kustomization.yaml
 create mode 100644 base/stalwart/stalwart-alertrules.yaml

diff --git a/base/data/kustomization.yaml b/base/data/kustomization.yaml
index 5da44ce..8eaa6cd 100644
--- a/base/data/kustomization.yaml
+++ b/base/data/kustomization.yaml
@@ -18,6 +18,7 @@ resources:
   - openbao-servicemonitor.yaml
   - postgres-alertrules.yaml
   - openbao-alertrules.yaml
+  - valkey-alertrules.yaml
   - searxng-deployment.yaml
 
 helmCharts:
diff --git a/base/data/openbao-alertrules.yaml b/base/data/openbao-alertrules.yaml
index a709e55..f0589a4 100644
--- a/base/data/openbao-alertrules.yaml
+++ b/base/data/openbao-alertrules.yaml
@@ -5,6 +5,7 @@ metadata:
   namespace: data
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
   - name: openbao
diff --git a/base/data/opensearch-alertrules.yaml b/base/data/opensearch-alertrules.yaml
index 455ee86..7a78a36 100644
--- a/base/data/opensearch-alertrules.yaml
+++ b/base/data/opensearch-alertrules.yaml
@@ -5,6 +5,7 @@ metadata:
   namespace: data
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
   - name: opensearch
@@ -19,13 +20,16 @@ spec:
         description: "OpenSearch cluster {{ $labels.cluster }} health status is red."
 
     - alert: OpenSearchClusterYellow
-      expr: elasticsearch_cluster_health_status{color="yellow"} == 1
+      expr: |
+        elasticsearch_cluster_health_status{color="yellow"} == 1
+        and on(cluster)
+        elasticsearch_cluster_health_number_of_data_nodes > 1
       for: 10m
       labels:
        severity: warning
      annotations:
        summary: "OpenSearch cluster health is YELLOW"
-        description: "OpenSearch cluster {{ $labels.cluster }} health status is yellow."
+        description: "OpenSearch cluster {{ $labels.cluster }} health status is yellow (multi-node, so unassigned shards indicate a real problem)."
 
    - alert: OpenSearchHeapHigh
      expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) > 0.85
diff --git a/base/data/postgres-alertrules.yaml b/base/data/postgres-alertrules.yaml
index dd20281..7bc751b 100644
--- a/base/data/postgres-alertrules.yaml
+++ b/base/data/postgres-alertrules.yaml
@@ -5,6 +5,7 @@ metadata:
   namespace: data
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
   - name: postgres
@@ -35,3 +36,41 @@ spec:
       annotations:
         summary: "PostgreSQL connection count is high"
         description: "Pod {{ $labels.pod }} has {{ $value }} active connections."
+
+    - alert: PostgresBackupStale
+      expr: |
+        time() - cnpg_collector_last_available_backup_timestamp > 90000
+      for: 10m
+      labels:
+        severity: critical
+      annotations:
+        summary: "PostgreSQL backup is stale"
+        description: "No successful backup in over 25 hours (daily schedule expected)."
+
+    - alert: PostgresBackupFailed
+      expr: |
+        cnpg_collector_last_failed_backup_timestamp > cnpg_collector_last_available_backup_timestamp
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: "PostgreSQL backup failed"
+        description: "Last backup failed more recently than last success. Check barman/S3."
+
+    - alert: PostgresWALArchivingStale
+      expr: cnpg_pg_stat_archiver_seconds_since_last_archival > 300
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: "PostgreSQL WAL archiving stale"
+        description: "No WAL archived in {{ $value | humanizeDuration }}. Point-in-time recovery may be impossible."
+
+    - alert: PostgresDeadlocks
+      expr: rate(cnpg_pg_stat_database_deadlocks[5m]) > 0
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "PostgreSQL deadlocks detected"
+        description: "Database {{ $labels.datname }} is experiencing deadlocks."
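
The 90000-second threshold in PostgresBackupStale is 25 hours: one daily backup
interval plus an hour of slack. Rules like this are cheap to sanity-check with
`promtool test rules`. promtool reads plain Prometheus rule files rather than
PrometheusRule objects, so the sketch below assumes spec.groups has been
extracted by hand into a hypothetical postgres.rules.yaml:

    # backup-stale-test.yaml (hypothetical); run: promtool test rules backup-stale-test.yaml
    rule_files:
      - postgres.rules.yaml
    evaluation_interval: 1m
    tests:
      - interval: 1m
        input_series:
          # Last successful backup at t=0, then silence for 26 hours.
          - series: 'cnpg_collector_last_available_backup_timestamp{pod="pg-1"}'
            values: '0+0x1560'
        alert_rule_test:
          - eval_time: 26h   # past the 25h threshold plus the 10m "for"
            alertname: PostgresBackupStale
            exp_alerts:
              - exp_labels:
                  severity: critical
                  pod: pg-1
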
diff --git a/base/data/valkey-alertrules.yaml b/base/data/valkey-alertrules.yaml
new file mode 100644
index 0000000..3e99c44
--- /dev/null
+++ b/base/data/valkey-alertrules.yaml
@@ -0,0 +1,21 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: valkey-alerts
+  namespace: data
+  labels:
+    role: alert-rules
+    release: kube-prometheus-stack
+spec:
+  groups:
+  - name: valkey
+    rules:
+    - alert: ValkeyDown
+      expr: |
+        kube_deployment_status_replicas_available{namespace="data", deployment="valkey"} == 0
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Valkey (Redis) is down"
+        description: "Valkey cache server is down. All apps using Redis/Celery are affected."
diff --git a/base/devtools/gitea-alertrules.yaml b/base/devtools/gitea-alertrules.yaml
index da0e137..3757bed 100644
--- a/base/devtools/gitea-alertrules.yaml
+++ b/base/devtools/gitea-alertrules.yaml
@@ -5,6 +5,7 @@ metadata:
   namespace: devtools
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
   - name: gitea
diff --git a/base/lasuite/seaweedfs-buckets.yaml b/base/lasuite/seaweedfs-buckets.yaml
index 623b6d3..44cca69 100644
--- a/base/lasuite/seaweedfs-buckets.yaml
+++ b/base/lasuite/seaweedfs-buckets.yaml
@@ -38,8 +38,8 @@ spec:
 
             # Enable object versioning on buckets that require it.
            # Drive's WOPI GetFile response includes X-WOPI-ItemVersion from S3 VersionId.
-            mc versioning enable weed/sunbeam-drive
-            echo "Versioning enabled: sunbeam-drive"
+            # SeaweedFS's S3 gateway rejects `mc versioning`; tolerate the failure so the init Job doesn't crash.
+            mc versioning enable weed/sunbeam-drive || echo "Versioning not supported by SeaweedFS mc, skipping (filer handles versioning natively)"
          envFrom:
            - secretRef:
                name: seaweedfs-s3-credentials
diff --git a/base/matrix/kustomization.yaml b/base/matrix/kustomization.yaml
index 1460db9..0eaf59c 100644
--- a/base/matrix/kustomization.yaml
+++ b/base/matrix/kustomization.yaml
@@ -13,3 +13,4 @@ resources:
   - hydra-oauth2client.yaml
   - sol-deployment.yaml
   - sol-config.yaml
+  - tuwunel-alertrules.yaml
diff --git a/base/matrix/tuwunel-alertrules.yaml b/base/matrix/tuwunel-alertrules.yaml
new file mode 100644
index 0000000..1429b10
--- /dev/null
+++ b/base/matrix/tuwunel-alertrules.yaml
@@ -0,0 +1,31 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: tuwunel-alerts
+  namespace: matrix
+  labels:
+    role: alert-rules
+    release: kube-prometheus-stack
+spec:
+  groups:
+  - name: tuwunel
+    rules:
+    - alert: TuwunelDown
+      expr: |
+        kube_deployment_status_replicas_available{namespace="matrix", deployment="tuwunel"} == 0
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Matrix homeserver (Tuwunel) is down"
+        description: "Tuwunel is down — Matrix messaging, alertbot delivery, and Sol are all affected."
+
+    - alert: SolDown
+      expr: |
+        kube_deployment_status_replicas_available{namespace="matrix", deployment="sol"} == 0
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Sol is down"
+        description: "Sol virtual librarian is not running."
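
The `release: kube-prometheus-stack` label repeated across these files is
load-bearing: the Prometheus operator only loads PrometheusRule objects matched
by the Prometheus CR's ruleSelector, and with the chart default
`ruleSelectorNilUsesHelmValues: true` that selector is derived from the Helm
release name. Assuming the release really is named kube-prometheus-stack, the
rendered selector looks roughly like this, which is why unlabelled rules were
silently ignored:

    # Sketch of the selector rendered into the Prometheus custom resource.
    ruleSelector:
      matchLabels:
        release: kube-prometheus-stack
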
diff --git a/base/media/livekit-alertrules.yaml b/base/media/livekit-alertrules.yaml
index 632027e..dfa45b9 100644
--- a/base/media/livekit-alertrules.yaml
+++ b/base/media/livekit-alertrules.yaml
@@ -5,6 +5,7 @@ metadata:
   namespace: media
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
   - name: livekit
diff --git a/base/mesh/linkerd-alertrules.yaml b/base/mesh/linkerd-alertrules.yaml
index a482827..9046d7c 100644
--- a/base/mesh/linkerd-alertrules.yaml
+++ b/base/mesh/linkerd-alertrules.yaml
@@ -5,6 +5,7 @@ metadata:
   namespace: mesh
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
   - name: linkerd-mesh
diff --git a/base/monitoring/alertrules-infrastructure.yaml b/base/monitoring/alertrules-infrastructure.yaml
index f6b30c1..6cf94d5 100644
--- a/base/monitoring/alertrules-infrastructure.yaml
+++ b/base/monitoring/alertrules-infrastructure.yaml
@@ -5,6 +5,7 @@ metadata:
   namespace: monitoring
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
   - name: infrastructure
@@ -53,3 +54,154 @@ spec:
       annotations:
         summary: "Certificate not ready"
         description: "Certificate {{ $labels.name }} in {{ $labels.namespace }} is not in a ready state."
+
+  - name: node
+    rules:
+    - alert: NodeMemoryHigh
+      expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.85
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Node memory usage above 85%"
+        description: "{{ $labels.instance }} memory usage is {{ $value | humanizePercentage }}."
+
+    - alert: NodeMemoryCritical
+      expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.95
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Node memory usage above 95%"
+        description: "{{ $labels.instance }} memory usage is {{ $value | humanizePercentage }}. OOM kills imminent."
+
+    - alert: NodeSwapActive
+      expr: node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes < 0.50
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Node swap usage above 50%"
+        description: "{{ $labels.instance }} swap is {{ $value | humanizePercentage }} free. System is under memory pressure."
+
+    - alert: NodeCPUHigh
+      expr: 1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) > 0.90
+      for: 15m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Node CPU usage above 90% for 15 minutes"
+        description: "{{ $labels.instance }} CPU usage is {{ $value | humanizePercentage }}."
+
+    - alert: NodeFilesystemFull
+      expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 0.85
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Filesystem usage above 85%"
+        description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value | humanizePercentage }} full."
+
+    - alert: NodeFilesystemCritical
+      expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 0.95
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Filesystem usage above 95%"
+        description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is {{ $value | humanizePercentage }} full."
+
+    - alert: NodeFilesystemFilesRunningOut
+      expr: node_filesystem_files_free{fstype!~"tmpfs|overlay"} / node_filesystem_files{fstype!~"tmpfs|overlay"} < 0.05
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Filesystem inodes running low"
+        description: "{{ $labels.mountpoint }} on {{ $labels.instance }} has less than 5% of its inodes free."
+
+    - alert: NodeNetworkErrors
+      expr: increase(node_network_receive_errs_total[5m]) > 10 or increase(node_network_transmit_errs_total[5m]) > 10
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Network interface errors detected"
+        description: "{{ $labels.device }} on {{ $labels.instance }} is seeing network errors."
+
+    - alert: NodeClockSkew
+      expr: abs(node_timex_offset_seconds) > 0.05
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Node clock skew detected"
+        description: "{{ $labels.instance }} clock is offset by {{ $value }}s. TLS and Kerberos may fail."
+
+    - alert: NodeOOMKills
+      expr: increase(node_vmstat_oom_kill[5m]) > 0
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "OOM kill detected"
+        description: "{{ $labels.instance }} had an OOM kill in the last 5 minutes."
+
+  - name: kubernetes
+    rules:
+    - alert: PodMemoryNearLimit
+      expr: container_memory_working_set_bytes{container!=""} / on(container, pod, namespace) kube_pod_container_resource_limits{resource="memory"} > 0.90
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Pod memory near limit"
+        description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is at {{ $value | humanizePercentage }} of its memory limit."
+
+    - alert: PersistentVolumeUsageHigh
+      expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.85
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "PVC usage above 85%"
+        description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value | humanizePercentage }} full."
+
+    - alert: PersistentVolumeUsageCritical
+      expr: kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.95
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "PVC usage above 95%"
+        description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is {{ $value | humanizePercentage }} full."
+
+    - alert: DeploymentNoReadyPods
+      expr: |
+        kube_deployment_status_replicas_available == 0
+        and kube_deployment_spec_replicas > 0
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Deployment has no ready pods"
+        description: "{{ $labels.namespace }}/{{ $labels.deployment }} has 0 available replicas."
+
+    - alert: CronJobLastRunFailed
+      expr: |
+        kube_job_status_failed{namespace!="kube-system"} > 0
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Job failed"
+        description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed."
+
+    - alert: PodRestartingFrequently
+      expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Pod is restarting frequently"
+        description: "{{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) has restarted {{ $value | humanize }} times in the last hour."
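
The node alert expressions above inline the same ratios that
recording-rules.yaml (added later in this patch) records as
node:memory_usage_ratio, node:cpu_usage_ratio, and node:swap_usage_ratio. A
possible follow-up is to alert on the recorded series instead, so thresholds,
dashboards, and alerts share one definition; a sketch, assuming both rule files
are loaded into the same Prometheus:

    - alert: NodeMemoryHigh
      expr: node:memory_usage_ratio > 0.85
      for: 5m
      labels:
        severity: warning
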
diff --git a/base/monitoring/alertrules-observability.yaml b/base/monitoring/alertrules-observability.yaml
new file mode 100644
index 0000000..9ae467c
--- /dev/null
+++ b/base/monitoring/alertrules-observability.yaml
@@ -0,0 +1,71 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: observability-alerts
+  namespace: monitoring
+  labels:
+    role: alert-rules
+    release: kube-prometheus-stack
+spec:
+  groups:
+  - name: prometheus
+    rules:
+    - alert: PrometheusWALCorruption
+      expr: increase(prometheus_tsdb_wal_corruptions_total[5m]) > 0
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Prometheus WAL corruption detected"
+        description: "Prometheus detected WAL corruption — data loss may be occurring."
+
+    - alert: PrometheusRuleFailures
+      expr: increase(prometheus_rule_evaluation_failures_total[5m]) > 0
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Prometheus rule evaluation failures"
+        description: "Some Prometheus rules are failing to evaluate — alerts may not fire."
+
+    - alert: PrometheusStorageFull
+      expr: prometheus_tsdb_storage_blocks_bytes > 27.4e9
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Prometheus storage over 85% of 30Gi PVC"
+        description: "Prometheus TSDB is using {{ $value | humanize1024 }}B of its 30Gi PVC."
+
+  - name: loki
+    rules:
+    - alert: LokiDown
+      expr: up{job=~".*loki.*", container="loki"} == 0
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Loki is down"
+        description: "Loki log aggregation is offline — logs are being dropped."
+
+  - name: tempo
+    rules:
+    - alert: TempoDown
+      expr: up{job=~".*tempo.*"} == 0
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Tempo is down"
+        description: "Tempo trace backend is offline — traces are being dropped."
+
+  - name: alertmanager
+    rules:
+    - alert: AlertManagerWebhookFailures
+      expr: increase(alertmanager_notifications_failed_total{integration="webhook"}[15m]) > 0
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: "AlertManager webhook delivery failing"
+        description: "AlertManager cannot deliver alerts to Matrix webhook receiver."
diff --git a/base/monitoring/alertrules-slo.yaml b/base/monitoring/alertrules-slo.yaml
new file mode 100644
index 0000000..46be0e8
--- /dev/null
+++ b/base/monitoring/alertrules-slo.yaml
@@ -0,0 +1,62 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: slo-alerts
+  namespace: monitoring
+  labels:
+    role: alert-rules
+    release: kube-prometheus-stack
+spec:
+  groups:
+  # SLO: Kratos/Hydra auth stack — 99.9% availability (43 min/month budget)
+  - name: slo-auth
+    rules:
+    - alert: AuthErrorBudgetFastBurn
+      expr: |
+        service:error_rate:5m{deployment=~"kratos|hydra"} > 0.0144
+      for: 2m
+      labels:
+        severity: critical
+        slo: auth-availability
+      annotations:
+        summary: "Auth stack burning error budget at 14.4x rate"
+        description: "{{ $labels.deployment }} error rate is {{ $value | humanizePercentage }} (14.4x burn rate for 99.9% SLO)."
+
+    - alert: AuthErrorBudgetSlowBurn
+      expr: |
+        service:error_rate:5m{deployment=~"kratos|hydra"} > 0.003
+      for: 1h
+      labels:
+        severity: warning
+        slo: auth-availability
+      annotations:
+        summary: "Auth stack slowly burning error budget"
+        description: "{{ $labels.deployment }} error rate is {{ $value | humanizePercentage }} (3x burn rate for 99.9% SLO)."
+
+  # SLO: Tuwunel Matrix homeserver — 99.5% availability (3.6 hr/month budget)
+  - name: slo-matrix
+    rules:
+    - alert: MatrixErrorBudgetFastBurn
+      expr: |
+        service:error_rate:5m{deployment="tuwunel"} > 0.072
+      for: 2m
+      labels:
+        severity: critical
+        slo: matrix-availability
+      annotations:
+        summary: "Matrix homeserver burning error budget at 14.4x rate"
+        description: "Tuwunel error rate is {{ $value | humanizePercentage }}."
+
+  # SLO: All services — latency p95 under 2s
+  - name: slo-latency
+    rules:
+    - alert: ServiceLatencyBudgetBurn
+      expr: |
+        service:latency_p95:5m > 2000
+      for: 10m
+      labels:
+        severity: warning
+        slo: latency
+      annotations:
+        summary: "Service p95 latency exceeds 2s SLO"
+        description: "{{ $labels.deployment }} in {{ $labels.namespace }} p95 latency is {{ $value }}ms."
diff --git a/base/monitoring/alloy-values.yaml b/base/monitoring/alloy-values.yaml
index 1d64ee3..bb57cf2 100644
--- a/base/monitoring/alloy-values.yaml
+++ b/base/monitoring/alloy-values.yaml
@@ -77,7 +77,7 @@ alloy:
 
     loki.write "default" {
       endpoint {
-        url = "http://loki-gateway.monitoring.svc.cluster.local:80/loki/api/v1/push"
+        url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"
       }
     }
 
diff --git a/base/monitoring/kustomization.yaml b/base/monitoring/kustomization.yaml
index 88b3803..9e7788d 100644
--- a/base/monitoring/kustomization.yaml
+++ b/base/monitoring/kustomization.yaml
@@ -23,6 +23,9 @@ resources:
   - matrix-bot-secret.yaml
   # Alert rules
   - alertrules-infrastructure.yaml
+  - alertrules-observability.yaml
+  - alertrules-slo.yaml
+  - recording-rules.yaml
 
 helmCharts:
   # helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
diff --git a/base/monitoring/prometheus-values.yaml b/base/monitoring/prometheus-values.yaml
index fbcd038..ddf2623 100644
--- a/base/monitoring/prometheus-values.yaml
+++ b/base/monitoring/prometheus-values.yaml
@@ -61,7 +61,7 @@ grafana:
       - name: Loki
         type: loki
         uid: loki
-        url: "http://loki-gateway.monitoring.svc.cluster.local:80"
+        url: "http://loki.monitoring.svc.cluster.local:3100"
         access: proxy
         isDefault: false
         jsonData:
@@ -130,10 +130,6 @@ alertmanager:
       requests:
         storage: 2Gi
   config:
-    global:
-      smtp_from: "alerts@DOMAIN_SUFFIX"
-      smtp_smarthost: "postfix.lasuite.svc.cluster.local:25"
-      smtp_require_tls: false
     route:
       group_by: [alertname, namespace]
       group_wait: 30s
@@ -143,30 +139,26 @@ alertmanager:
       routes:
       - matchers:
        - alertname = Watchdog
-        receiver: "null"
+        receiver: matrix
+        repeat_interval: 12h
      - matchers:
        - severity = critical
-        receiver: critical
+        receiver: matrix
      - matchers:
        - severity = warning
        receiver: matrix
    receivers:
-    - name: "null"
-    - name: email
-      email_configs:
-      - to: "ops@DOMAIN_SUFFIX"
-        send_resolved: true
    - name: matrix
      webhook_configs:
      - url: "http://matrix-alertmanager-receiver.monitoring.svc.cluster.local:3000/alerts/alerts"
        send_resolved: true
-    - name: critical
-      webhook_configs:
-      - url: "http://matrix-alertmanager-receiver.monitoring.svc.cluster.local:3000/alerts/alerts"
-        send_resolved: true
-      email_configs:
-      - to: "ops@DOMAIN_SUFFIX"
-        send_resolved: true
+    inhibit_rules:
+    # Critical alerts suppress warnings for the same alertname+namespace
+    - source_matchers:
+      - severity = critical
+      target_matchers:
+      - severity = warning
+      equal: [alertname, namespace]
 
 # Disable monitors for components k3s doesn't expose
 kubeEtcd:
diff --git a/base/monitoring/recording-rules.yaml b/base/monitoring/recording-rules.yaml
new file mode 100644
index 0000000..09e7f1e
--- /dev/null
+++ b/base/monitoring/recording-rules.yaml
@@ -0,0 +1,40 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: recording-rules
+  namespace: monitoring
+  labels:
+    role: alert-rules
+    release: kube-prometheus-stack
+spec:
+  groups:
+  - name: linkerd-service-sli
+    interval: 30s
+    rules:
+    - record: service:request_rate:5m
+      expr: sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
+
+    - record: service:error_rate:5m
+      expr: |
+        sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
+        / sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
+
+    - record: service:latency_p95:5m
+      expr: |
+        histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{direction="inbound"}[5m])) by (le, deployment, namespace))
+
+    - record: service:latency_p99:5m
+      expr: |
+        histogram_quantile(0.99, sum(rate(response_latency_ms_bucket{direction="inbound"}[5m])) by (le, deployment, namespace))
+
+  - name: node-aggregates
+    interval: 30s
+    rules:
+    - record: node:memory_usage_ratio
+      expr: 1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes
+
+    - record: node:cpu_usage_ratio
+      expr: 1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))
+
+    - record: node:swap_usage_ratio
+      expr: 1 - node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes
diff --git a/base/ory/ory-alertrules.yaml b/base/ory/ory-alertrules.yaml
index c83a4f1..8d2ab4b 100644
--- a/base/ory/ory-alertrules.yaml
+++ b/base/ory/ory-alertrules.yaml
@@ -5,6 +5,7 @@ metadata:
   namespace: ory
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
   - name: ory
diff --git a/base/stalwart/kustomization.yaml b/base/stalwart/kustomization.yaml
new file mode 100644
index 0000000..377e1f5
--- /dev/null
+++ b/base/stalwart/kustomization.yaml
@@ -0,0 +1,17 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: stalwart
+
+resources:
+  - namespace.yaml
+  - vault-secrets.yaml
+  - oidc-client.yaml
+  - oidc-client-bulwark.yaml
+  - stalwart-config.yaml
+  - stalwart-deployment.yaml
+  - stalwart-service.yaml
+  - certificate.yaml
+  - bulwark-deployment.yaml
+  - bulwark-service.yaml
+  - stalwart-alertrules.yaml
diff --git a/base/stalwart/stalwart-alertrules.yaml b/base/stalwart/stalwart-alertrules.yaml
new file mode 100644
index 0000000..d4247ac
--- /dev/null
+++ b/base/stalwart/stalwart-alertrules.yaml
@@ -0,0 +1,31 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: stalwart-alerts
+  namespace: stalwart
+  labels:
+    role: alert-rules
+    release: kube-prometheus-stack
+spec:
+  groups:
+  - name: stalwart
+    rules:
+    - alert: StalwartDown
+      expr: |
+        kube_deployment_status_replicas_available{namespace="stalwart", deployment="stalwart"} == 0
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Stalwart mail server is down"
+        description: "Stalwart is down — IMAP, SMTP submission, and webmail are all affected."
+
+    - alert: BulwarkDown
+      expr: |
+        kube_deployment_status_replicas_available{namespace="stalwart", deployment="bulwark"} == 0
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Bulwark spam filter is down"
+        description: "Bulwark is down — inbound email filtering is not running."
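
The SLO thresholds in alertrules-slo.yaml are standard multiwindow burn rates
in the style of the Google SRE Workbook: a 14.4x burn consumes 2% of a 30-day
error budget per hour, and a 3x burn consumes the whole budget in about 10
days. The magic numbers fall straight out of the arithmetic:

    # 99.9% SLO -> error budget 0.001
    #   fast burn: 14.4 * 0.001 = 0.0144   (AuthErrorBudgetFastBurn)
    #   slow burn:  3.0 * 0.001 = 0.003    (AuthErrorBudgetSlowBurn)
    # 99.5% SLO -> error budget 0.005
    #   fast burn: 14.4 * 0.005 = 0.072    (MatrixErrorBudgetFastBurn)
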
diff --git a/base/storage/seaweedfs-alertrules.yaml b/base/storage/seaweedfs-alertrules.yaml
index 8fc9cfe..00ffa59 100644
--- a/base/storage/seaweedfs-alertrules.yaml
+++ b/base/storage/seaweedfs-alertrules.yaml
@@ -5,6 +5,7 @@ metadata:
   namespace: storage
   labels:
     role: alert-rules
+    release: kube-prometheus-stack
 spec:
   groups:
   - name: seaweedfs
@@ -44,3 +45,23 @@ spec:
       annotations:
         summary: "SeaweedFS disk space low"
         description: "SeaweedFS disk on {{ $labels.instance }} has less than 15% free space."
+
+    - alert: SeaweedFSMirrorJobFailing
+      expr: |
+        kube_job_status_failed{namespace="storage", job_name=~"seaweedfs-s3-mirror.*"} > 0
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: "SeaweedFS S3 mirror job failed"
+        description: "Job {{ $labels.job_name }} failed. S3 backups to Scaleway are not running."
+
+    - alert: SeaweedFSMirrorStale
+      expr: |
+        time() - kube_cronjob_status_last_successful_time{namespace="storage", cronjob="seaweedfs-s3-mirror"} > 7200
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: "SeaweedFS S3 mirror hasn't succeeded in 2+ hours"
+        description: "CronJob seaweedfs-s3-mirror last succeeded {{ $value | humanizeDuration }} ago."
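
The single-node guard on OpenSearchClusterYellow can be verified the same way
as the backup rule: with one data node, a permanently yellow cluster must not
page. A promtool sketch, again assuming spec.groups has been extracted into a
hypothetical opensearch.rules.yaml:

    rule_files:
      - opensearch.rules.yaml
    evaluation_interval: 1m
    tests:
      - interval: 1m
        input_series:
          - series: 'elasticsearch_cluster_health_status{cluster="main",color="yellow"}'
            values: '1+0x30'
          - series: 'elasticsearch_cluster_health_number_of_data_nodes{cluster="main"}'
            values: '1+0x30'
        alert_rule_test:
          - eval_time: 15m
            alertname: OpenSearchClusterYellow
            exp_alerts: []   # the "and on(cluster) ... > 1" guard keeps this silent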