From 5e622ce316c4521f900a44a7c6f4f6a95aa709d0 Mon Sep 17 00:00:00 2001
From: Sienna Meridian Satterwhite
Date: Tue, 24 Mar 2026 12:21:29 +0000
Subject: [PATCH] feat: AlertManager Matrix integration with severity routing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Deploy matrix-alertmanager-receiver bridge (pending bot credentials in
OpenBao). Update AlertManager routing: critical → Matrix + email,
warning → Matrix only, Watchdog → null. Reduce repeat interval to 4h.
---
 ...trix-alertmanager-receiver-deployment.yaml | 72 +++++++++++++++++++
 base/monitoring/matrix-bot-secret.yaml        | 27 ++++++++
 base/monitoring/prometheus-values.yaml        | 26 +++++++-
 3 files changed, 123 insertions(+), 2 deletions(-)
 create mode 100644 base/monitoring/matrix-alertmanager-receiver-deployment.yaml
 create mode 100644 base/monitoring/matrix-bot-secret.yaml

diff --git a/base/monitoring/matrix-alertmanager-receiver-deployment.yaml b/base/monitoring/matrix-alertmanager-receiver-deployment.yaml
new file mode 100644
index 0000000..196e950
--- /dev/null
+++ b/base/monitoring/matrix-alertmanager-receiver-deployment.yaml
@@ -0,0 +1,72 @@
+---
+# Bridge that receives AlertManager webhooks on port 3000 and is configured
+# with the Matrix homeserver, bot user, access token and room mapping below.
+# NOTE(review): confirm this image release reads its settings from MAR_*
+# environment variables; if it only accepts a config file they are ignored.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: matrix-alertmanager-receiver
+  namespace: monitoring
+  labels:
+    app: matrix-alertmanager-receiver
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: matrix-alertmanager-receiver
+  template:
+    metadata:
+      labels:
+        app: matrix-alertmanager-receiver
+    spec:
+      containers:
+        - name: receiver
+          image: ghcr.io/metio/matrix-alertmanager-receiver:2024.11.27
+          ports:
+            - containerPort: 3000
+              protocol: TCP
+          env:
+            - name: MAR_HOMESERVER_URL
+              value: "http://tuwunel.matrix.svc.cluster.local:6167"
+            - name: MAR_USER_ID
+              value: "@alertbot:sunbeam.pt"
+            - name: MAR_ACCESS_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: matrix-bot-creds
+                  key: access_token
+            # ROOM_ID must come before MAR_ROOM_MAPPING: Kubernetes only
+            # expands $(VAR) from variables defined earlier in this list,
+            # otherwise the literal text "$(ROOM_ID)" is passed through.
+            - name: ROOM_ID
+              valueFrom:
+                secretKeyRef:
+                  name: matrix-bot-creds
+                  key: room_id
+            - name: MAR_ROOM_MAPPING
+              value: "ops=$(ROOM_ID)"
+            - name: MAR_PORT
+              value: "3000"
+          resources:
+            requests:
+              cpu: 10m
+              memory: 32Mi
+            limits:
+              memory: 64Mi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: matrix-alertmanager-receiver
+  namespace: monitoring
+  labels:
+    app: matrix-alertmanager-receiver
+spec:
+  type: ClusterIP
+  ports:
+    - port: 3000
+      targetPort: 3000
+      protocol: TCP
+  selector:
+    app: matrix-alertmanager-receiver
diff --git a/base/monitoring/matrix-bot-secret.yaml b/base/monitoring/matrix-bot-secret.yaml
new file mode 100644
index 0000000..80d2666
--- /dev/null
+++ b/base/monitoring/matrix-bot-secret.yaml
@@ -0,0 +1,27 @@
+---
+# Matrix alertbot credentials from OpenBao KV at secret/alertbot.
+apiVersion: secrets.hashicorp.com/v1beta1
+kind: VaultStaticSecret
+metadata:
+  name: matrix-bot-creds
+  namespace: monitoring
+spec:
+  vaultAuthRef: vso-auth
+  mount: secret
+  type: kv-v2
+  path: alertbot
+  refreshAfter: 30s
+  rolloutRestartTargets:
+    - kind: Deployment
+      name: matrix-alertmanager-receiver
+  destination:
+    name: matrix-bot-creds
+    create: true
+    overwrite: true
+    transformation:
+      excludeRaw: true
+      templates:
+        access_token:
+          text: "{{ index .Secrets \"access_token\" }}"
+        room_id:
+          text: "{{ index .Secrets \"room_id\" }}"
diff --git a/base/monitoring/prometheus-values.yaml b/base/monitoring/prometheus-values.yaml
index 63aa6cf..96fbe6b 100644
--- a/base/monitoring/prometheus-values.yaml
+++ b/base/monitoring/prometheus-values.yaml
@@ -138,13 +138,35 @@ alertmanager:
       group_by: [alertname, namespace]
       group_wait: 30s
       group_interval: 5m
-      repeat_interval: 12h
-      receiver: email
+      repeat_interval: 4h
+      receiver: matrix
+      routes:
+        - matchers:
+            - alertname = Watchdog
+          receiver: "null"
+        - matchers:
+            - severity = critical
+          receiver: critical
+        - matchers:
+            - severity = warning
+          receiver: matrix
     receivers:
+      - name: "null"
       - name: email
         email_configs:
           - to: "ops@DOMAIN_SUFFIX"
             send_resolved: true
+      - name: matrix
+        webhook_configs:
+          - url: "http://matrix-alertmanager-receiver.monitoring.svc.cluster.local:3000/alerts"
+            send_resolved: true
+      - name: critical
+        webhook_configs:
+          - url: "http://matrix-alertmanager-receiver.monitoring.svc.cluster.local:3000/alerts"
+            send_resolved: true
+        email_configs:
+          - to: "ops@DOMAIN_SUFFIX"
+            send_resolved: true
 
 # Disable monitors for components k3s doesn't expose
 kubeEtcd: