# sbbb/base/mesh/linkerd-alertrules.yaml

---
# PrometheusRule custom resource (monitoring.coreos.com/v1) that carries the
# alerting rules listed under spec.groups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: linkerd-mesh-alerts
  namespace: mesh
  labels:
    # NOTE(review): presumably the label a Prometheus ruleSelector matches on
    # to load this resource — confirm against the Prometheus spec in use.
    role: alert-rules
spec:
  groups:
    # Rule group containing the mesh alerts defined below.
    - name: linkerd-mesh
      rules:
        # Warning: per (deployment, namespace), the 5m rate of inbound responses
        # with classification="failure" divided by the 5m rate of all inbound
        # responses is above 0.05 (5%), and has stayed above it for 5 minutes.
        - alert: ServiceHighErrorRate
          expr: |
            sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
            / sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
            > 0.05
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'Service has high error rate'
            # Folded scalar: the lines below are joined with single spaces into
            # one string, with no trailing newline.
            description: >-
              {{ $labels.deployment }} in {{ $labels.namespace }} has
              {{ $value | humanizePercentage }} error rate

        # Critical: same failure-ratio expression as the warning-level error-rate
        # alert, but with a 0.25 (25%) threshold and a shorter 2 minute hold
        # period before firing.
        - alert: ServiceHighErrorRateCritical
          expr: |
            sum(rate(response_total{classification="failure",direction="inbound"}[5m])) by (deployment, namespace)
            / sum(rate(response_total{direction="inbound"}[5m])) by (deployment, namespace)
            > 0.25
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: 'Service has critically high error rate'
            # Folded scalar: the lines below are joined with single spaces into
            # one string, with no trailing newline.
            description: >-
              {{ $labels.deployment }} in {{ $labels.namespace }} has
              {{ $value | humanizePercentage }} error rate

        # Warning: per (deployment, namespace), the 0.95 quantile estimated from
        # the 5m rate of inbound response_latency_ms_bucket series is above 2000
        # (milliseconds, going by the metric name), sustained for 5 minutes.
        - alert: ServiceHighLatency
          expr: |
            histogram_quantile(0.95, sum(rate(response_latency_ms_bucket{direction="inbound"}[5m])) by (le, deployment, namespace)) > 2000
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Service has high p95 latency"
            # $value is a float interpolated from histogram buckets; round it to
            # whole milliseconds so the notification does not show a long
            # fractional tail (e.g. 2437.4999999999995ms).
            description: >-
              {{ $labels.deployment }} in {{ $labels.namespace }} p95 latency is
              {{ printf "%.0f" $value }}ms