---
# PrometheusRule holding recording rules only (no alerts).
# Group 1 pre-aggregates inbound `response_total` / `response_latency_ms_bucket`
# series per (deployment, namespace); group 2 derives per-node usage ratios
# from node_* metrics.
# NOTE(review): the metadata labels below are presumably what the Prometheus
# `ruleSelector` matches on -- confirm before renaming any of them.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: recording-rules
  namespace: monitoring
  labels:
    role: alert-rules
    release: kube-prometheus-stack
spec:
  groups:
    # Per-deployment service-level indicators over a 5m rate window,
    # restricted to direction="inbound" traffic.
    - name: linkerd-service-sli
      interval: 30s
      rules:
        # Inbound requests per second.
        - record: service:request_rate:5m
          expr: sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)

        # Fraction of inbound responses classified as "failure".
        # NOTE(review): the result is NaN while the denominator is 0 and
        # absent while no classification="failure" series exists for a
        # deployment -- consumers should not assume a literal 0.
        - record: service:error_rate:5m
          expr: |
            sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
            /
            sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)

        # 95th percentile inbound latency; `le` must stay in the grouping so
        # histogram_quantile can see the bucket boundaries.
        - record: service:latency_p95:5m
          expr: |
            histogram_quantile(
              0.95,
              sum(rate(response_latency_ms_bucket{direction="inbound"}[5m])) by (le, deployment, namespace)
            )

        # 99th percentile inbound latency (same aggregation as p95).
        - record: service:latency_p99:5m
          expr: |
            histogram_quantile(
              0.99,
              sum(rate(response_latency_ms_bucket{direction="inbound"}[5m])) by (le, deployment, namespace)
            )

    # Per-node resource usage expressed as ratios (1 - free/total).
    - name: node-aggregates
      interval: 30s
      rules:
        # Share of memory that is not reported as available.
        - record: node:memory_usage_ratio
          expr: 1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes

        # 1 minus the idle rate averaged over each instance's
        # node_cpu_seconds_total{mode="idle"} series.
        - record: node:cpu_usage_ratio
          expr: 1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))

        # Share of swap in use. The `> 0` filter drops nodes whose
        # SwapTotal is 0 (swap disabled): without it the division is 0/0
        # and a NaN sample would be recorded for every such node.
        - record: node:swap_usage_ratio
          expr: 1 - node_memory_SwapFree_bytes / (node_memory_SwapTotal_bytes > 0)