feat(observability): enable OTLP tracing, fix Prometheus scraping, add proxy ServiceMonitor

- Set otlp_endpoint to Tempo HTTP receiver (port 4318) for request tracing
- Add hostNetwork to prometheusSpec so it can reach kubelet/node-exporter on node public IP
- Add ServiceMonitor for proxy metrics scrape on port 9090
- Add CORS origin and Grafana datasource config for monitoring stack
This commit is contained in:
2026-03-09 08:20:42 +00:00
parent caefb071a8
commit 91983ddf29
4 changed files with 43 additions and 5 deletions

View File

@@ -9,6 +9,7 @@ resources:
- pingora-deployment.yaml - pingora-deployment.yaml
- pingora-service.yaml - pingora-service.yaml
- pingora-config.yaml - pingora-config.yaml
- pingora-servicemonitor.yaml
images: images:
- name: sunbeam-proxy - name: sunbeam-proxy

View File

@@ -21,8 +21,7 @@ data:
key_path = "/etc/tls/tls.key" key_path = "/etc/tls/tls.key"
[telemetry] [telemetry]
# Empty = OTEL disabled. Set to http://otel-collector.data.svc:4318 when ready. otlp_endpoint = "http://tempo.monitoring.svc.cluster.local:4318"
otlp_endpoint = ""
metrics_port = 9090 metrics_port = 9090
# Kubernetes resource names for cert/config watchers. # Kubernetes resource names for cert/config watchers.

View File

@@ -0,0 +1,15 @@
# ServiceMonitor: tells the Prometheus Operator to scrape the Pingora proxy's
# metrics endpoint (metrics_port = 9090 per the proxy config in this commit).
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: pingora
  namespace: ingress
  labels:
    app: pingora
    # NOTE(review): kube-prometheus-stack's Prometheus defaults to selecting
    # ServiceMonitors by a `release: <helm-release>` label. Confirm that
    # serviceMonitorSelectorNilUsesHelmValues is false (or add the release
    # label here), otherwise this monitor is silently ignored.
spec:
  selector:
    matchLabels:
      # Must match the labels on the pingora Service (not the Pods).
      app: pingora
  endpoints:
    # `port` is the *named* port on the Service; verify the pingora Service
    # names its 9090 port "metrics".
    - port: metrics
      interval: 15s
      path: /metrics

View File

@@ -15,7 +15,7 @@ grafana:
envFromSecret: grafana-oidc envFromSecret: grafana-oidc
grafana.ini: grafana.ini:
server: server:
root_url: "https://grafana.DOMAIN_SUFFIX" root_url: "https://metrics.DOMAIN_SUFFIX"
auth: auth:
# Keep local login as fallback (admin password from grafana-admin secret) # Keep local login as fallback (admin password from grafana-admin secret)
disable_login_form: false disable_login_form: false
@@ -36,21 +36,44 @@ grafana:
# To restrict to specific users, set role_attribute_path instead. # To restrict to specific users, set role_attribute_path instead.
auto_assign_org_role: Admin auto_assign_org_role: Admin
skip_org_role_sync: true skip_org_role_sync: true
sidecar:
datasources:
# Disable the auto-provisioned ClusterIP datasource; we define it
# explicitly below using the external URL so Grafana's backend reaches
# Prometheus via Pingora (https://systemmetrics.DOMAIN_SUFFIX) rather
# than the cluster-internal ClusterIP which is blocked by network policy.
defaultDatasourceEnabled: false
additionalDataSources: additionalDataSources:
- name: Prometheus
type: prometheus
url: "https://systemmetrics.DOMAIN_SUFFIX"
access: proxy
isDefault: true
jsonData:
timeInterval: 30s
- name: Loki - name: Loki
type: loki type: loki
url: http://loki.monitoring.svc.cluster.local:3100 url: "https://systemlogs.DOMAIN_SUFFIX"
access: proxy access: proxy
isDefault: false isDefault: false
- name: Tempo - name: Tempo
type: tempo type: tempo
url: http://tempo.monitoring.svc.cluster.local:3100 url: "https://systemtracing.DOMAIN_SUFFIX"
access: proxy access: proxy
isDefault: false isDefault: false
prometheus: prometheus:
prometheusSpec: prometheusSpec:
retention: 90d retention: 90d
# hostNetwork allows Prometheus to reach kubelet (10250) and node-exporter
# (9100) on the node's public InternalIP. On a single-node bare-metal
# server, pod-to-node-public-IP traffic doesn't route without this.
hostNetwork: true
additionalArgs:
# Allow browser-direct queries from the Grafana UI origin.
- name: web.cors.origin
value: "https://metrics.DOMAIN_SUFFIX"
storageSpec: storageSpec:
volumeClaimTemplate: volumeClaimTemplate:
spec: spec: