---
# PrometheusRule defining alerting rules over inbound proxy metrics
# (response_total, response_latency_ms_bucket), aggregated per
# deployment and namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: linkerd-mesh-alerts
  namespace: mesh
  labels:
    # NOTE(review): presumably matched by the Prometheus ruleSelector;
    # confirm against the Prometheus custom resource.
    role: alert-rules
spec:
  groups:
    - name: linkerd-mesh
      rules:
        # Warning: more than 5% of inbound responses are classified as
        # failures (5m rate window), sustained for 5 minutes.
        - alert: ServiceHighErrorRate
          expr: |
            sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
            /
            sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
            > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Service has high error rate"
            description: "{{ $labels.deployment }} in {{ $labels.namespace }} has {{ $value | humanizePercentage }} error rate"

        # Critical: same failure ratio as the warning rule, but with a 25%
        # threshold and a shorter 2-minute hold. Both rules match while the
        # ratio is above 25%.
        - alert: ServiceHighErrorRateCritical
          expr: |
            sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
            /
            sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
            > 0.25
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Service has critically high error rate"
            description: "{{ $labels.deployment }} in {{ $labels.namespace }} has {{ $value | humanizePercentage }} error rate"

        # Warning: inbound p95 latency, estimated from the latency histogram
        # buckets, stays above 2000 for 5 minutes. The unit is taken to be
        # milliseconds from the metric name and the annotation text.
        - alert: ServiceHighLatency
          expr: |
            histogram_quantile(
              0.95,
              sum(rate(response_latency_ms_bucket{direction="inbound"}[5m])) by (le, deployment, namespace)
            ) > 2000
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Service has high p95 latency"
            description: "{{ $labels.deployment }} in {{ $labels.namespace }} p95 latency is {{ $value }}ms"