feat(monitoring): wire up full LGTM observability stack
- Prometheus: discover ServiceMonitors/PodMonitors in all namespaces, enable remote write receiver for Tempo metrics generator
- Tempo: enable metrics generator (service-graphs + span-metrics) with remote write to Prometheus
- Loki: add Grafana Alloy DaemonSet to ship container logs
- Grafana: enable dashboard sidecar, add Pingora/Loki/Tempo/OpenBao dashboards, add stable UIDs and cross-linking between datasources (Loki↔Tempo derived fields, traces→logs, traces→metrics, service map)
- Linkerd: enable proxy tracing to Alloy OTLP collector, point linkerd-viz at existing Prometheus instead of deploying its own
- Pingora: add OTLP rollout plan (endpoint commented out until proxy telemetry panic fix is deployed and Alloy is verified healthy)
@@ -21,6 +21,13 @@ data:
     key_path = "/etc/tls/tls.key"
 
     [telemetry]
+    # Rollout plan for OTLP tracing:
+    # 1. Deploy proxy build that includes the graceful telemetry init
+    #    (proxy/src/telemetry.rs — no longer panics on exporter failure)
+    # 2. Verify Alloy is running:
+    #    kubectl -n monitoring get pods -l app.kubernetes.io/name=alloy
+    # 3. Uncomment the line below:
+    # otlp_endpoint = "http://alloy.monitoring.svc.cluster.local:4318"
     otlp_endpoint = ""
     metrics_port = 9090
 
@@ -29,9 +29,11 @@ helmCharts:
     version: "2025.12.3"
     releaseName: linkerd-control-plane
     namespace: mesh
+    valuesFile: linkerd-control-plane-values.yaml
 
   - name: linkerd-viz
     repo: https://helm.linkerd.io/edge
     version: "2026.1.4"
     releaseName: linkerd-viz
     namespace: mesh
+    valuesFile: linkerd-viz-values.yaml
19  base/mesh/linkerd-control-plane-values.yaml  Normal file
@@ -0,0 +1,19 @@
# Linkerd control-plane overrides — enable proxy tracing to Tempo.
#
# Every meshed pod's Linkerd sidecar will export OTLP traces to the
# Alloy collector in the monitoring namespace, which forwards to Tempo.

# Controller-level tracing (identity, destination controllers)
controller:
  tracing:
    enabled: true
    collector:
      endpoint: "alloy.monitoring.svc.cluster.local:4317"

# Proxy-level tracing (every meshed sidecar)
proxy:
  tracing:
    enabled: true
    traceServiceName: linkerd-proxy
    collector:
      endpoint: "alloy.monitoring.svc.cluster.local:4317"
9  base/mesh/linkerd-viz-values.yaml  Normal file
@@ -0,0 +1,9 @@
# Linkerd-viz overrides — use existing Prometheus instead of deploying a second one.
#
# By default linkerd-viz ships its own Prometheus, which wastes resources
# and creates a second scrape loop. Point it at kube-prometheus-stack instead.

prometheus:
  enabled: false

prometheusUrl: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
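
Note: with the bundled Prometheus disabled, the external Prometheus must scrape
the Linkerd proxies itself, or the viz dashboards stay empty. A minimal sketch of
what that could look like as a kube-prometheus-stack PodMonitor; the resource
name is illustrative (not part of this commit), and the label/port follow
Linkerd's standard proxy conventions (injected pods carry the
linkerd.io/control-plane-ns label, and the proxy admin port 4191 serves /metrics):

apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: linkerd-proxy            # illustrative, not part of this commit
  namespace: mesh
spec:
  namespaceSelector:
    any: true                    # meshed pods exist in many namespaces
  selector:
    matchExpressions:
      - key: linkerd.io/control-plane-ns   # present on every injected pod
        operator: Exists
  podMetricsEndpoints:
    - port: linkerd-admin        # proxy admin port (4191) exposes /metrics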
108  base/monitoring/alloy-values.yaml  Normal file
@@ -0,0 +1,108 @@
# Grafana Alloy — lightweight agent that ships container logs to Loki
# and forwards OTLP traces to Tempo.
#
# Runs as a DaemonSet so every node's /var/log/pods is tailed.

alloy:
  configMap:
    content: |
      // ── Kubernetes log discovery ──────────────────────────────────
      discovery.kubernetes "pods" {
        role = "pod"
      }

      discovery.relabel "pod_logs" {
        targets = discovery.kubernetes.pods.targets

        // Keep only running pods
        rule {
          source_labels = ["__meta_kubernetes_pod_phase"]
          regex = "Pending|Succeeded|Failed|Unknown"
          action = "drop"
        }

        // Standard labels
        rule {
          source_labels = ["__meta_kubernetes_namespace"]
          target_label = "namespace"
        }
        rule {
          source_labels = ["__meta_kubernetes_pod_name"]
          target_label = "pod"
        }
        rule {
          source_labels = ["__meta_kubernetes_pod_container_name"]
          target_label = "container"
        }
        rule {
          source_labels = ["__meta_kubernetes_pod_node_name"]
          target_label = "node"
        }
        // Carry app label for easier Grafana filtering
        rule {
          source_labels = ["__meta_kubernetes_pod_label_app"]
          target_label = "app"
        }
        rule {
          source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
          target_label = "app"
          action = "replace"
          regex = "(.+)"
        }
      }

      loki.source.kubernetes "pods" {
        targets = discovery.relabel.pod_logs.output
        forward_to = [loki.process.pipeline.receiver]
      }

      // ── Log processing pipeline ──────────────────────────────────
      loki.process "pipeline" {
        // Detect and parse JSON log lines (common in Go / Python services)
        stage.json {
          expressions = {
            level = "level",
            msg = "msg",
            traceID = "traceID",
          }
        }

        // Promote log level to a label for easier filtering
        stage.labels {
          values = { level = "" }
        }

        forward_to = [loki.write.default.receiver]
      }

      loki.write "default" {
        endpoint {
          url = "http://loki-gateway.monitoring.svc.cluster.local:80/loki/api/v1/push"
        }
      }

      // ── OTLP receiver (services can push traces here) ────────────
      otelcol.receiver.otlp "default" {
        grpc { endpoint = "0.0.0.0:4317" }
        http { endpoint = "0.0.0.0:4318" }
        output { traces = [otelcol.exporter.otlp.tempo.input] }
      }

      otelcol.exporter.otlp "tempo" {
        client {
          endpoint = "tempo.monitoring.svc.cluster.local:4317"
          tls { insecure = true }
        }
      }

controller:
  type: daemonset

# Mount node log directories for kubernetes log tailing
mounts:
  varlog: true

# Expose OTLP ports so in-cluster services can send traces to the local agent
service:
  enabled: true
  type: ClusterIP
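
For reference, an in-cluster service can opt into the OTLP receiver above by
pointing its OpenTelemetry SDK at the Alloy Service via the standard
OTEL_EXPORTER_OTLP_ENDPOINT variable; a hypothetical Deployment snippet (app
name and image are placeholders, not part of this commit):

apiVersion: apps/v1
kind: Deployment
metadata:
  name: example-app                        # placeholder
spec:
  selector:
    matchLabels: { app: example-app }
  template:
    metadata:
      labels: { app: example-app }
    spec:
      containers:
        - name: app
          image: example/app:latest        # placeholder
          env:
            # Standard OTel SDK env var; HTTP/protobuf traffic goes to 4318
            - name: OTEL_EXPORTER_OTLP_ENDPOINT
              value: "http://alloy.monitoring.svc.cluster.local:4318"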
310  base/monitoring/dashboards-configmap.yaml  Normal file
@@ -0,0 +1,310 @@
# Grafana dashboard ConfigMaps — picked up by the Grafana sidecar.
#
# Each ConfigMap holds one or more dashboard JSON files. The sidecar
# watches for the label grafana_dashboard=1 across all namespaces and
# hot-loads them into Grafana (no restart required).
#
# The grafana_folder annotation groups dashboards into Grafana folders.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-pingora
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Ingress"
data:
  pingora.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 1,
      "links": [],
      "panels": [
        {
          "title": "Requests / sec",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(pingora_http_requests_total[5m]))",
              "legendFormat": "total"
            },
            {
              "expr": "sum(rate(pingora_http_requests_total[5m])) by (status_code)",
              "legendFormat": "{{status_code}}"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        },
        {
          "title": "Error Rate (5xx)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(pingora_http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(pingora_http_requests_total[5m]))",
              "legendFormat": "5xx ratio"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } }
          }
        },
        {
          "title": "Request Latency (p50 / p95 / p99)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.50, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p50"
            },
            {
              "expr": "histogram_quantile(0.95, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p95"
            },
            {
              "expr": "histogram_quantile(0.99, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))",
              "legendFormat": "p99"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        },
        {
          "title": "Active Connections",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "pingora_active_connections",
              "legendFormat": "active"
            }
          ]
        },
        {
          "title": "Upstream Latency by Backend",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(pingora_upstream_duration_seconds_bucket[5m])) by (le, backend))",
              "legendFormat": "{{backend}} p95"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "s" }
          }
        },
        {
          "title": "DDoS / Scanner Detections",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(pingora_ddos_detections_total[5m]))",
              "legendFormat": "DDoS"
            },
            {
              "expr": "sum(rate(pingora_scanner_detections_total[5m]))",
              "legendFormat": "Scanner"
            },
            {
              "expr": "sum(rate(pingora_rate_limit_rejected_total[5m]))",
              "legendFormat": "Rate-limited"
            }
          ],
          "fieldConfig": {
            "defaults": { "unit": "reqps" }
          }
        }
      ],
      "schemaVersion": 39,
      "tags": ["ingress", "pingora"],
      "templating": { "list": [] },
      "time": { "from": "now-1h", "to": "now" },
      "title": "Pingora Proxy",
      "uid": "pingora-proxy"
    }
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-loki
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Observability"
data:
  loki-overview.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Log Volume by Namespace",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 24, "x": 0, "y": 0 },
          "datasource": { "uid": "loki" },
          "targets": [
            {
              "expr": "sum(count_over_time({namespace=~\".+\"}[5m])) by (namespace)",
              "legendFormat": "{{namespace}}"
            }
          ]
        },
        {
          "title": "Error Logs",
          "type": "logs",
          "gridPos": { "h": 12, "w": 24, "x": 0, "y": 8 },
          "datasource": { "uid": "loki" },
          "targets": [
            {
              "expr": "{namespace=~\".+\"} |~ \"(?i)(error|panic|fatal|exception)\"",
              "legendFormat": ""
            }
          ]
        }
      ],
      "schemaVersion": 39,
      "tags": ["loki", "logs"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "Loki — Log Overview",
      "uid": "loki-overview"
    }
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-tempo
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Observability"
data:
  tempo-overview.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Trace Ingestion Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "sum(rate(tempo_distributor_spans_received_total[5m]))",
              "legendFormat": "spans/s"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "ops" } }
        },
        {
          "title": "Service Map (RED)",
          "type": "nodeGraph",
          "gridPos": { "h": 16, "w": 12, "x": 12, "y": 0 },
          "datasource": { "uid": "tempo" },
          "targets": [
            { "queryType": "serviceMap" }
          ]
        },
        {
          "title": "Span Duration by Service (p95)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            {
              "expr": "histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket[5m])) by (le, service))",
              "legendFormat": "{{service}}"
            }
          ],
          "fieldConfig": { "defaults": { "unit": "s" } }
        }
      ],
      "schemaVersion": 39,
      "tags": ["tempo", "tracing"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "Tempo — Trace Overview",
      "uid": "tempo-overview"
    }
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-openbao
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
  annotations:
    grafana_folder: "Infrastructure"
data:
  openbao.json: |
    {
      "annotations": { "list": [] },
      "editable": true,
      "panels": [
        {
          "title": "Vault/OpenBao Sealed Status",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "vault_core_unsealed", "legendFormat": "unsealed" }
          ],
          "fieldConfig": {
            "defaults": { "mappings": [{"type":"value","options":{"0":{"text":"SEALED","color":"red"},"1":{"text":"UNSEALED","color":"green"}}}] }
          }
        },
        {
          "title": "Token Count",
          "type": "stat",
          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "vault_token_count", "legendFormat": "tokens" }
          ]
        },
        {
          "title": "Request Rate",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "sum(rate(vault_core_handle_request_count[5m]))", "legendFormat": "req/s" }
          ],
          "fieldConfig": { "defaults": { "unit": "reqps" } }
        },
        {
          "title": "Request Latency (p95)",
          "type": "timeseries",
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
          "datasource": { "uid": "prometheus" },
          "targets": [
            { "expr": "histogram_quantile(0.95, sum(rate(vault_core_handle_request_bucket[5m])) by (le))", "legendFormat": "p95" }
          ],
          "fieldConfig": { "defaults": { "unit": "s" } }
        }
      ],
      "schemaVersion": 39,
      "tags": ["vault", "openbao"],
      "time": { "from": "now-1h", "to": "now" },
      "title": "OpenBao / Vault",
      "uid": "openbao"
    }
@@ -7,6 +7,7 @@ resources:
   - namespace.yaml
   - vault-secrets.yaml
   - grafana-oauth2client.yaml
+  - dashboards-configmap.yaml
 
 helmCharts:
   # helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
@@ -32,3 +33,12 @@ helmCharts:
     releaseName: tempo
     namespace: monitoring
     valuesFile: tempo-values.yaml
+
+  # Grafana Alloy — DaemonSet that ships container logs → Loki
+  # and provides an in-cluster OTLP receiver → Tempo.
+  - name: alloy
+    repo: https://grafana.github.io/helm-charts
+    version: "0.12.0"
+    releaseName: alloy
+    namespace: monitoring
+    valuesFile: alloy-values.yaml
@@ -39,10 +39,20 @@ grafana:
   sidecar:
     datasources:
       defaultDatasourceEnabled: false
+    dashboards:
+      enabled: true
+      # Pick up ConfigMaps with this label in any namespace
+      label: grafana_dashboard
+      labelValue: "1"
+      searchNamespace: ALL
+      folderAnnotation: grafana_folder
+      provider:
+        foldersFromFilesStructure: false
 
   additionalDataSources:
     - name: Prometheus
       type: prometheus
+      uid: prometheus
       url: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090"
       access: proxy
       isDefault: true
@@ -50,17 +60,53 @@ grafana:
         timeInterval: 30s
     - name: Loki
       type: loki
+      uid: loki
       url: "http://loki-gateway.monitoring.svc.cluster.local:80"
       access: proxy
       isDefault: false
+      jsonData:
+        derivedFields:
+          # Click a traceID in a log line → jump straight to Tempo
+          - datasourceUid: tempo
+            matcherRegex: '"traceID":"(\w+)"'
+            name: TraceID
+            url: "$${__value.raw}"
     - name: Tempo
       type: tempo
+      uid: tempo
       url: "http://tempo.monitoring.svc.cluster.local:3200"
       access: proxy
       isDefault: false
+      jsonData:
+        tracesToLogsV2:
+          datasourceUid: loki
+          filterByTraceID: true
+          filterBySpanID: false
+          tags:
+            - key: namespace
+            - key: pod
+        tracesToMetrics:
+          datasourceUid: prometheus
+          tags:
+            - key: service.name
+              value: service
+        lokiSearch:
+          datasourceUid: loki
+        serviceMap:
+          datasourceUid: prometheus
 
 prometheus:
   prometheusSpec:
+    # Discover ServiceMonitors / PodMonitors / PrometheusRules in ALL namespaces,
+    # not just "monitoring". Without this, monitors in ingress, mesh,
+    # cert-manager, devtools, etc. are invisible to Prometheus.
+    serviceMonitorNamespaceSelector: {}
+    podMonitorNamespaceSelector: {}
+    ruleNamespaceSelector: {}
+    serviceMonitorSelector: {}
+    podMonitorSelector: {}
+    # Accept remote-write from Tempo metrics generator
+    enableRemoteWriteReceiver: true
     retention: 90d
     additionalArgs:
       # Allow browser-direct queries from the Grafana UI origin.
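
With the empty selectors above, a ServiceMonitor in any namespace is discovered
without extra wiring. A sketch of what one might look like for the Pingora
metrics port; the name, namespace, labels, and port name are illustrative
assumptions, not part of this commit:

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: pingora                  # illustrative
  namespace: ingress             # outside "monitoring", now discoverable
spec:
  selector:
    matchLabels:
      app: pingora               # must match the Service's labels
  endpoints:
    - port: metrics              # Service port name fronting :9090
      interval: 30s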
@@ -16,6 +16,18 @@ tempo:
       path: /var/tempo/traces
     wal:
       path: /var/tempo/wal
+  # Generate span-derived RED metrics (rate / errors / duration) and push
+  # them into Prometheus so Grafana can show service-level indicators
+  # even without application-level metrics exporters.
+  metricsGenerator:
+    enabled: true
+    remoteWriteUrl: "http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090/api/v1/write"
+  overrides:
+    defaults:
+      metrics_generator:
+        processors:
+          - service-graphs
+          - span-metrics
 
 persistence:
   enabled: true
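
The generator remote-writes series such as traces_spanmetrics_calls_total and
traces_service_graph_request_total into Prometheus. One way to catch a silently
broken pipeline is an absence alert; a sketch assuming the kube-prometheus-stack
PrometheusRule CRD (rule name and timings are illustrative, and depending on
ruleSelector defaults the chart's release label may also be required):

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: tempo-span-metrics       # illustrative
  namespace: monitoring
spec:
  groups:
    - name: tempo-metrics-generator
      rules:
        - alert: SpanMetricsAbsent
          expr: absent(traces_spanmetrics_calls_total)
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "Tempo metrics generator has stopped remote-writing span metrics"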