Files
sbbb/base/monitoring/prometheus-values.yaml

180 lines
5.4 KiB
YAML
Raw Permalink Normal View History

# kube-prometheus-stack — Prometheus + AlertManager + Grafana + node-exporter + kube-state-metrics
#
# k3s quirks: kube-proxy is replaced by Cilium; etcd/scheduler/controller-manager
# don't expose metrics on standard ports. Disable their monitors to avoid noise.
# Grafana sub-chart: admin credentials, persistence, OIDC single sign-on,
# dashboard sidecar, and the Prometheus / Loki / Tempo datasources.
grafana:
  adminUser: admin
  # NOTE(review): with admin.existingSecret set, the chart presumably reads the
  # admin username from the same secret (default key "admin-user") rather than
  # from adminUser above — confirm the grafana-admin secret carries that key.
  admin:
    existingSecret: grafana-admin
    passwordKey: admin-password
  persistence:
    enabled: true
    size: 2Gi
  # Inject Hydra OIDC client credentials (created by Hydra Maester from the
  # OAuth2Client CRD) as environment variables.
  envFromSecret: grafana-oidc
  grafana.ini:
    server:
      root_url: "https://metrics.DOMAIN_SUFFIX"
    auth:
      # Keep local login as fallback (admin password from grafana-admin secret)
      disable_login_form: false
      signout_redirect_url: "https://auth.DOMAIN_SUFFIX/oauth2/sessions/logout"
    auth.generic_oauth:
      enabled: true
      name: Sunbeam
      icon: signin
      # CLIENT_ID / CLIENT_SECRET injected from grafana-oidc K8s Secret via
      # envFromSecret
      client_id: "${CLIENT_ID}"
      client_secret: "${CLIENT_SECRET}"
      scopes: "openid email profile"
      auth_url: "https://auth.DOMAIN_SUFFIX/oauth2/auth"
      token_url: "https://auth.DOMAIN_SUFFIX/oauth2/token"
      api_url: "https://auth.DOMAIN_SUFFIX/userinfo"
      allow_sign_up: true
      # Do not derive roles from the OIDC provider; the role assigned at
      # sign-up (see the users section below) is kept as-is.
      skip_org_role_sync: true
    users:
      # Small studio — anyone with a valid La Suite account is an admin.
      # auto_assign_org_role is a [users] setting; Grafana does not read it
      # from [auth.generic_oauth], so it has to live in this section.
      # To restrict to specific users, set role_attribute_path under
      # auth.generic_oauth instead (and drop skip_org_role_sync).
      auto_assign_org_role: Admin
  sidecar:
    datasources:
      # Datasources are declared explicitly in additionalDataSources below.
      defaultDatasourceEnabled: false
    dashboards:
      enabled: true
      # Pick up ConfigMaps with this label in any namespace
      label: grafana_dashboard
      labelValue: "1"
      searchNamespace: ALL
      folderAnnotation: grafana_folder
      # NOTE(review): folderAnnotation is usually paired with
      # foldersFromFilesStructure: true so annotated dashboards land in
      # Grafana folders — confirm that "false" is intentional here.
      provider:
        foldersFromFilesStructure: false
  additionalDataSources:
    - name: Prometheus
      type: prometheus
      uid: prometheus
      url: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
      access: proxy
      isDefault: true
      jsonData:
        timeInterval: 30s
    - name: Loki
      type: loki
      uid: loki
      url: "http://loki-gateway.monitoring.svc.cluster.local:80"
      access: proxy
      isDefault: false
      jsonData:
        derivedFields:
          # Click a traceID in a log line → jump straight to Tempo
          - datasourceUid: tempo
            matcherRegex: '"traceID":"(\w+)"'
            name: TraceID
            # "$$" presumably escapes "$" so the literal ${__value.raw}
            # survives provisioning-time variable expansion — verify.
            url: "$${__value.raw}"
    - name: Tempo
      type: tempo
      uid: tempo
      url: "http://tempo.monitoring.svc.cluster.local:3200"
      access: proxy
      isDefault: false
      jsonData:
        # Cross-links from a trace to logs (Loki) and metrics (Prometheus),
        # referenced by the datasource uids declared above.
        tracesToLogsV2:
          datasourceUid: loki
          filterByTraceID: true
          filterBySpanID: false
          tags:
            - key: namespace
            - key: pod
        tracesToMetrics:
          datasourceUid: prometheus
          tags:
            - key: service.name
              value: service
        lokiSearch:
          datasourceUid: loki
        serviceMap:
          datasourceUid: prometheus
# Prometheus server: cluster-wide monitor/rule discovery, remote-write
# receiver, retention, CORS origin and persistent storage.
prometheus:
  prometheusSpec:
    # Discover ServiceMonitors / PodMonitors / PrometheusRules in ALL namespaces,
    # not just "monitoring". Without this, monitors in ingress, mesh,
    # cert-manager, devtools, etc. are invisible to Prometheus.
    #
    # The empty selectors below are not enough on their own: while the
    # *SelectorNilUsesHelmValues flags keep their default (true), the chart
    # replaces an empty selector with a "release: <helm release>" label match,
    # which silently drops every monitor/rule not created by this release.
    serviceMonitorSelectorNilUsesHelmValues: false
    podMonitorSelectorNilUsesHelmValues: false
    ruleSelectorNilUsesHelmValues: false
    serviceMonitorNamespaceSelector: {}
    podMonitorNamespaceSelector: {}
    ruleNamespaceSelector: {}
    serviceMonitorSelector: {}
    podMonitorSelector: {}
    ruleSelector: {}
    # Accept remote-write from Tempo metrics generator
    enableRemoteWriteReceiver: true
    # NOTE(review): retention is time-based only; confirm 90d of samples fits
    # in the 30Gi volume requested below, or add a retentionSize cap.
    retention: 90d
    additionalArgs:
      # Allow browser-direct queries from the Grafana UI origin.
      - name: web.cors.origin
        value: "https://metrics.DOMAIN_SUFFIX"
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 30Gi
# Alertmanager: persistent storage plus routing of alerts to Matrix (webhook)
# and e-mail.
alertmanager:
  alertmanagerSpec:
    storage:
      volumeClaimTemplate:
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 2Gi
  config:
    global:
      # Outgoing mail goes through the in-cluster Postfix relay without TLS.
      smtp_smarthost: "postfix.lasuite.svc.cluster.local:25"
      smtp_from: "alerts@DOMAIN_SUFFIX"
      smtp_require_tls: false
    route:
      # Anything not matched by a child route falls through to "matrix".
      receiver: matrix
      group_by:
        - alertname
        - namespace
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      routes:
        # Watchdog alerts are sent to the sink receiver.
        - receiver: "null"
          matchers:
            - alertname = Watchdog
        # Critical alerts go to both Matrix and e-mail.
        - receiver: critical
          matchers:
            - severity = critical
        # Warnings go to Matrix only.
        - receiver: matrix
          matchers:
            - severity = warning
    receivers:
      # Sink receiver with no integrations.
      - name: "null"
      # E-mail only; not referenced by any route in this config.
      - name: email
        email_configs:
          - to: "ops@DOMAIN_SUFFIX"
            send_resolved: true
      - name: matrix
        webhook_configs:
          - url: "http://matrix-alertmanager-receiver.monitoring.svc.cluster.local:3000/alerts/alerts"
            send_resolved: true
      # Same Matrix webhook as above, plus an e-mail to ops.
      - name: critical
        webhook_configs:
          - url: "http://matrix-alertmanager-receiver.monitoring.svc.cluster.local:3000/alerts/alerts"
            send_resolved: true
        email_configs:
          - to: "ops@DOMAIN_SUFFIX"
            send_resolved: true
# Control-plane monitors switched off: per the k3s note at the top of this
# file, etcd, controller-manager and scheduler do not expose metrics on the
# standard ports, and kube-proxy is replaced by Cilium.
kubeEtcd:
  enabled: false

kubeControllerManager:
  enabled: false

kubeScheduler:
  enabled: false

kubeProxy:
  enabled: false