The Prometheus operator expects snake_case (`inhibit_rules`), not camelCase (`inhibitRules`), in the Alertmanager config; the camelCase key caused Alertmanager reconciliation to fail. Also route InfoInhibitor alerts to the `null` receiver to stop them from flooding the Matrix alerts room.
176 lines
5.2 KiB
YAML
176 lines
5.2 KiB
YAML
# kube-prometheus-stack — Prometheus + AlertManager + Grafana + node-exporter + kube-state-metrics
#
# k3s quirks: kube-proxy is replaced by Cilium; etcd/scheduler/controller-manager
# don't expose metrics on standard ports. Disable their monitors to avoid noise.
|
|
|
# Grafana: local admin taken from an existing Secret, persistent storage,
# OIDC login via Hydra, sidecar-loaded dashboards, and explicitly provisioned
# Prometheus / Loki / Tempo datasources.
grafana:
  adminUser: admin
  admin:
    existingSecret: grafana-admin
    passwordKey: admin-password
  persistence:
    enabled: true
    size: 2Gi
  # Inject Hydra OIDC client credentials (created by Hydra Maester from the OAuth2Client CRD)
  envFromSecret: grafana-oidc
  grafana.ini:
    server:
      root_url: "https://metrics.DOMAIN_SUFFIX"
    auth:
      # Keep local login as fallback (admin password from grafana-admin secret)
      disable_login_form: false
      signout_redirect_url: "https://auth.DOMAIN_SUFFIX/oauth2/sessions/logout"
    users:
      # Small studio — anyone with a valid La Suite account is an admin.
      # To restrict to specific users, set role_attribute_path under
      # auth.generic_oauth instead (and drop skip_org_role_sync there).
      # This option belongs to the [users] section; Grafana does not read it
      # from [auth.generic_oauth].
      auto_assign_org_role: Admin
    auth.generic_oauth:
      enabled: true
      name: Sunbeam
      icon: signin
      # CLIENT_ID / CLIENT_SECRET injected from grafana-oidc K8s Secret via envFromSecret
      client_id: "${CLIENT_ID}"
      client_secret: "${CLIENT_SECRET}"
      scopes: "openid email profile"
      auth_url: "https://auth.DOMAIN_SUFFIX/oauth2/auth"
      token_url: "https://auth.DOMAIN_SUFFIX/oauth2/token"
      api_url: "https://auth.DOMAIN_SUFFIX/userinfo"
      allow_sign_up: true
      # Roles are not synced from the identity provider; new users get the
      # role configured in the users section above.
      skip_org_role_sync: true
  sidecar:
    datasources:
      # Datasources are declared explicitly in additionalDataSources below.
      defaultDatasourceEnabled: false
    dashboards:
      enabled: true
      # Pick up ConfigMaps with this label in any namespace
      label: grafana_dashboard
      labelValue: "1"
      searchNamespace: ALL
      folderAnnotation: grafana_folder
      provider:
        foldersFromFilesStructure: false

  additionalDataSources:
    - name: Prometheus
      type: prometheus
      uid: prometheus
      url: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
      access: proxy
      isDefault: true
      jsonData:
        timeInterval: 30s
    - name: Loki
      type: loki
      uid: loki
      url: "http://loki.monitoring.svc.cluster.local:3100"
      access: proxy
      isDefault: false
      jsonData:
        derivedFields:
          # Click a traceID in a log line → jump straight to Tempo
          - datasourceUid: tempo
            matcherRegex: '"traceID":"(\w+)"'
            name: TraceID
            url: "$${__value.raw}"
    - name: Tempo
      type: tempo
      uid: tempo
      url: "http://tempo.monitoring.svc.cluster.local:3200"
      access: proxy
      isDefault: false
      jsonData:
        tracesToLogsV2:
          datasourceUid: loki
          filterByTraceID: true
          filterBySpanID: false
          tags:
            - key: namespace
            - key: pod
        tracesToMetrics:
          datasourceUid: prometheus
          tags:
            - key: service.name
              value: service
        lokiSearch:
          datasourceUid: loki
        serviceMap:
          datasourceUid: prometheus
|
|
|
# Prometheus server: cluster-wide monitor discovery, remote-write receiver,
# 90-day retention on a 30Gi volume.
prometheus:
  prometheusSpec:
    # Discover ServiceMonitors / PodMonitors / PrometheusRules in ALL namespaces,
    # not just "monitoring". Without this, monitors in ingress, mesh,
    # cert-manager, devtools, etc. are invisible to Prometheus.
    serviceMonitorNamespaceSelector: {}
    podMonitorNamespaceSelector: {}
    ruleNamespaceSelector: {}
    serviceMonitorSelector: {}
    podMonitorSelector: {}
    # An empty selector alone is not enough: unless these flags are false the
    # chart replaces an empty selector with a `release: <helm release>` label
    # match, so monitors without that label would still be skipped.
    serviceMonitorSelectorNilUsesHelmValues: false
    podMonitorSelectorNilUsesHelmValues: false
    # NOTE(review): PrometheusRule label selection is left at the chart default;
    # set ruleSelectorNilUsesHelmValues: false as well if rules without the
    # release label should be loaded — confirm.
    # Accept remote-write from Tempo metrics generator
    enableRemoteWriteReceiver: true
    retention: 90d
    additionalArgs:
      # Allow browser-direct queries from the Grafana UI origin.
      - name: web.cors.origin
        value: "https://metrics.DOMAIN_SUFFIX"
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes: [ReadWriteOnce]
          resources:
            requests:
              storage: 30Gi
|
|
|
# Alertmanager: 2Gi persistent volume, plus routing of alerts to the Matrix
# webhook receiver.
alertmanager:
  alertmanagerSpec:
    storage:
      volumeClaimTemplate:
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 2Gi
  config:
    route:
      # Default receiver and grouping for anything not matched below.
      receiver: matrix
      group_by:
        - alertname
        - namespace
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      routes:
        # Watchdog is delivered to Matrix with a 12h repeat instead of 4h.
        - receiver: matrix
          repeat_interval: 12h
          matchers:
            - "alertname = Watchdog"
        # InfoInhibitor goes to the receiver that has no integrations.
        - receiver: "null"
          matchers:
            - "alertname = InfoInhibitor"
        - receiver: matrix
          matchers:
            - "severity = critical"
        - receiver: matrix
          matchers:
            - "severity = warning"
    receivers:
      # No integrations configured, so nothing is sent for alerts routed here.
      - name: "null"
      - name: matrix
        webhook_configs:
          - send_resolved: true
            url: "http://matrix-alertmanager-receiver.monitoring.svc.cluster.local:3000/alerts/alerts"
    inhibit_rules:
      # A critical alert mutes warnings sharing its alertname and namespace.
      - equal:
          - alertname
          - namespace
        source_matchers:
          - "severity = critical"
        target_matchers:
          - "severity = warning"
|
|
|
# Monitors switched off because k3s does not expose these components:
# etcd, controller-manager and scheduler have no metrics on the standard
# ports, and kube-proxy is replaced by Cilium.
kubeEtcd:
  enabled: false

kubeControllerManager:
  enabled: false

kubeScheduler:
  enabled: false

kubeProxy:
  enabled: false