From eab91eb85d1715dbf0da8070dc18cf9a43ef019e Mon Sep 17 00:00:00 2001 From: Sienna Meridian Satterwhite Date: Wed, 25 Mar 2026 17:58:51 +0000 Subject: [PATCH] feat(monitoring): expanded dashboards for all services Enriched dashboards for DevTools (Gitea), Identity (Hydra/Kratos), Infrastructure (Longhorn, PostgreSQL, cert-manager, OpenBao), Ingress (Pingora), and Storage (SeaweedFS). --- base/monitoring/dashboards-devtools.yaml | 242 +++++++++- base/monitoring/dashboards-identity.yaml | 358 ++++++++++++--- .../monitoring/dashboards-infrastructure.yaml | 427 +++++++++++++++--- base/monitoring/dashboards-ingress.yaml | 219 ++++++--- base/monitoring/dashboards-storage.yaml | 315 ++++++++++--- 5 files changed, 1278 insertions(+), 283 deletions(-) diff --git a/base/monitoring/dashboards-devtools.yaml b/base/monitoring/dashboards-devtools.yaml index d3cd022..c6df7db 100644 --- a/base/monitoring/dashboards-devtools.yaml +++ b/base/monitoring/dashboards-devtools.yaml @@ -14,55 +14,221 @@ data: { "annotations": { "list": [] }, "editable": true, + "graphTooltip": 1, "panels": [ + { + "title": "Overview", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "collapsed": false + }, { "title": "Repositories", "type": "stat", - "gridPos": { "h": 4, "w": 8, "x": 0, "y": 0 }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, "datasource": { "uid": "prometheus" }, "targets": [ - { "expr": "gitea_repositories", "legendFormat": "repos" } + { "expr": "gitea_repositories{job=\"gitea-http\"}", "legendFormat": "", "instant": true } ] }, { "title": "Users", "type": "stat", - "gridPos": { "h": 4, "w": 8, "x": 8, "y": 0 }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, "datasource": { "uid": "prometheus" }, "targets": [ - { "expr": "gitea_users", "legendFormat": "users" } + { "expr": "gitea_users{job=\"gitea-http\"}", "legendFormat": "", "instant": true } ] }, { - "title": "Issues", + "title": "Organizations", "type": "stat", - "gridPos": { "h": 4, "w": 8, "x": 16, "y": 0 
}, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, "datasource": { "uid": "prometheus" }, "targets": [ - { "expr": "gitea_issues", "legendFormat": "issues" } + { "expr": "gitea_organizations{job=\"gitea-http\"}", "legendFormat": "", "instant": true } ] }, { - "title": "Go Goroutines", + "title": "Teams", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_teams{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Stars", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_stars{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Webhooks", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_webhooks{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Issues & Activity", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "collapsed": false + }, + { + "title": "Open Issues", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 6 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_issues_open{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":50},{"color":"red","value":200}] } } } + }, + { + "title": "Closed Issues", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 6 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_issues_closed{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Total Issues", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 6 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": 
"gitea_issues{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Releases", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 6 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_releases{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Milestones", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 6 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_milestones{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Comments", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 6 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_comments{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Collaboration", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 10 }, + "collapsed": false + }, + { + "title": "Follows", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 11 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_follows{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Watches", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 11 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_watches{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Labels", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 11 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_labels{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Public Keys", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 11 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_publickeys{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "OAuth Sources", + "type": "stat", + 
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 11 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_oauths{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Mirrors", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 11 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_mirrors{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Runtime", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }, + "collapsed": false + }, + { + "title": "Goroutines", "type": "timeseries", - "gridPos": { "h": 8, "w": 8, "x": 0, "y": 4 }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 16 }, "datasource": { "uid": "prometheus" }, "targets": [ { - "expr": "go_goroutines{job=~\".*gitea.*\"}", + "expr": "go_goroutines{job=\"gitea-http\"}", "legendFormat": "{{instance}}" } - ] + ], + "fieldConfig": { "defaults": { "unit": "short" } } }, { "title": "Memory Usage", "type": "timeseries", - "gridPos": { "h": 8, "w": 8, "x": 8, "y": 4 }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 16 }, "datasource": { "uid": "prometheus" }, "targets": [ { - "expr": "process_resident_memory_bytes{job=~\".*gitea.*\"}", - "legendFormat": "{{instance}}" + "expr": "process_resident_memory_bytes{job=\"gitea-http\"}", + "legendFormat": "RSS {{instance}}" + }, + { + "expr": "go_memstats_alloc_bytes{job=\"gitea-http\"}", + "legendFormat": "Go alloc {{instance}}" } ], "fieldConfig": { "defaults": { "unit": "bytes" } } @@ -70,19 +236,63 @@ data: { "title": "CPU Usage", "type": "timeseries", - "gridPos": { "h": 8, "w": 8, "x": 16, "y": 4 }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 16 }, "datasource": { "uid": "prometheus" }, "targets": [ { - "expr": "rate(process_cpu_seconds_total{job=~\".*gitea.*\"}[5m])", + "expr": "rate(process_cpu_seconds_total{job=\"gitea-http\"}[5m])", "legendFormat": "{{instance}}" } ], "fieldConfig": { "defaults": { "unit": "short" } } + }, + { + "title": 
"Background Tasks", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }, + "collapsed": false + }, + { + "title": "Pending Hook Tasks", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 25 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_hooktasks{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":10},{"color":"red","value":50}] } } } + }, + { + "title": "Update Tasks", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 25 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_updatetasks{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Attachments", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 25 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_attachments{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Login Sources", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 25 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "gitea_loginsources{job=\"gitea-http\"}", "legendFormat": "", "instant": true } + ] } ], "schemaVersion": 39, "tags": ["gitea", "devtools"], + "templating": { "list": [] }, "time": { "from": "now-1h", "to": "now" }, "title": "Gitea", "uid": "gitea" diff --git a/base/monitoring/dashboards-identity.yaml b/base/monitoring/dashboards-identity.yaml index 2aa57ea..9453843 100644 --- a/base/monitoring/dashboards-identity.yaml +++ b/base/monitoring/dashboards-identity.yaml @@ -14,76 +14,189 @@ data: { "annotations": { "list": [] }, "editable": true, + "graphTooltip": 1, "panels": [ + { + "title": "Overview", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "collapsed": false + }, { "title": "Request Rate", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, 
"y": 0 }, + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "sum(rate(http_requests_total{job=~\".*hydra.*\"}[5m])) by (handler)", - "legendFormat": "{{handler}}" - } + { "expr": "sum(rate(http_requests_total{job=\"hydra-admin\"}[5m]))", "legendFormat": "", "instant": true } ], "fieldConfig": { "defaults": { "unit": "reqps" } } }, { - "title": "Error Rate", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "title": "Error Rate (5xx)", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "sum(rate(http_requests_total{job=~\".*hydra.*\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=~\".*hydra.*\"}[5m]))", - "legendFormat": "5xx ratio" - } + { "expr": "sum(rate(http_requests_statuses_total{job=\"hydra-admin\",code=~\"5..\"}[5m])) / sum(rate(http_requests_statuses_total{job=\"hydra-admin\"}[5m]))", "legendFormat": "", "instant": true } ], "fieldConfig": { "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } } } }, { - "title": "Request Latency p95", + "title": "Goroutines", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "go_goroutines{job=\"hydra-admin\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Memory (RSS)", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "process_resident_memory_bytes{job=\"hydra-admin\"}", "legendFormat": "", "instant": true } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "Request Traffic", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "collapsed": false + }, + { + "title": "Requests / 
sec by Handler", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, "datasource": { "uid": "prometheus" }, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=~\".*hydra.*\"}[5m])) by (le, handler))", - "legendFormat": "{{handler}} p95" + "expr": "sum(rate(http_requests_total{job=\"hydra-admin\"}[5m])) by (handler)", + "legendFormat": "{{handler}}" } ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Requests / sec by Method", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{job=\"hydra-admin\"}[5m])) by (method)", + "legendFormat": "{{method}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Response Status Codes", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_requests_statuses_total{job=\"hydra-admin\"}[5m])) by (code)", + "legendFormat": "{{code}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Request Latency (p50 / p95 / p99)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum(rate(http_requests_duration_seconds_bucket{job=\"hydra-admin\"}[5m])) by (le))", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum(rate(http_requests_duration_seconds_bucket{job=\"hydra-admin\"}[5m])) by (le))", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum(rate(http_requests_duration_seconds_bucket{job=\"hydra-admin\"}[5m])) by (le))", "legendFormat": "p99" } + ], "fieldConfig": { "defaults": { "unit": "s" } } }, { - "title": "Go Goroutines", - "type": "timeseries", - "gridPos": 
{ "h": 8, "w": 12, "x": 12, "y": 8 }, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "go_goroutines{job=~\".*hydra.*\"}", - "legendFormat": "{{instance}}" - } - ] + "title": "Size & Timing", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, + "collapsed": false }, { - "title": "Memory Usage", + "title": "Request Size (p95)", "type": "timeseries", - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 23 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "go_memstats_alloc_bytes{job=~\".*hydra.*\"}", - "legendFormat": "{{instance}}" - } + { "expr": "histogram_quantile(0.95, sum(rate(http_requests_size_bytes_bucket{job=\"hydra-admin\"}[5m])) by (le))", "legendFormat": "p95" } ], "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "Response Size (p95)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 23 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "histogram_quantile(0.95, sum(rate(http_response_size_bytes_bucket{job=\"hydra-admin\"}[5m])) by (le))", "legendFormat": "p95" } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "Response Time (p95)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "histogram_quantile(0.95, sum(rate(http_response_time_seconds_bucket{job=\"hydra-admin\"}[5m])) by (le))", "legendFormat": "p95" } + ], + "fieldConfig": { "defaults": { "unit": "s" } } + }, + { + "title": "Runtime", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 31 }, + "collapsed": false + }, + { + "title": "Goroutines", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 32 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "go_goroutines{job=\"hydra-admin\"}", "legendFormat": "{{instance}}" } + ], + "fieldConfig": { "defaults": { "unit": "short" 
} } + }, + { + "title": "Memory", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 32 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "process_resident_memory_bytes{job=\"hydra-admin\"}", "legendFormat": "RSS" }, + { "expr": "go_memstats_alloc_bytes{job=\"hydra-admin\"}", "legendFormat": "Go alloc" } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "CPU Usage", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 32 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "rate(process_cpu_seconds_total{job=\"hydra-admin\"}[5m])", "legendFormat": "{{instance}}" } + ], + "fieldConfig": { "defaults": { "unit": "short" } } } ], "schemaVersion": 39, "tags": ["hydra", "oauth2", "identity"], + "templating": { "list": [] }, "time": { "from": "now-1h", "to": "now" }, "title": "Hydra OAuth2", "uid": "hydra" @@ -103,76 +216,189 @@ data: { "annotations": { "list": [] }, "editable": true, + "graphTooltip": 1, "panels": [ + { + "title": "Overview", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "collapsed": false + }, { "title": "Request Rate", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "sum(rate(http_requests_total{job=~\".*kratos.*\"}[5m])) by (handler)", - "legendFormat": "{{handler}}" - } + { "expr": "sum(rate(http_requests_total{job=\"kratos-admin\"}[5m]))", "legendFormat": "", "instant": true } ], "fieldConfig": { "defaults": { "unit": "reqps" } } }, { - "title": "Error Rate", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "title": "Error Rate (5xx)", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "sum(rate(http_requests_total{job=~\".*kratos.*\",code=~\"5..\"}[5m])) / 
sum(rate(http_requests_total{job=~\".*kratos.*\"}[5m]))", - "legendFormat": "5xx ratio" - } + { "expr": "sum(rate(http_requests_statuses_total{job=\"kratos-admin\",code=~\"5..\"}[5m])) / sum(rate(http_requests_statuses_total{job=\"kratos-admin\"}[5m]))", "legendFormat": "", "instant": true } ], "fieldConfig": { "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } } } }, { - "title": "Request Latency p95", + "title": "Goroutines", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "go_goroutines{job=\"kratos-admin\"}", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Memory (RSS)", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "process_resident_memory_bytes{job=\"kratos-admin\"}", "legendFormat": "", "instant": true } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "Request Traffic", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "collapsed": false + }, + { + "title": "Requests / sec by Handler", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, "datasource": { "uid": "prometheus" }, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=~\".*kratos.*\"}[5m])) by (le, handler))", - "legendFormat": "{{handler}} p95" + "expr": "sum(rate(http_requests_total{job=\"kratos-admin\"}[5m])) by (handler)", + "legendFormat": "{{handler}}" } ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Requests / sec by Method", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": 
"sum(rate(http_requests_total{job=\"kratos-admin\"}[5m])) by (method)", + "legendFormat": "{{method}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Response Status Codes", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_requests_statuses_total{job=\"kratos-admin\"}[5m])) by (code)", + "legendFormat": "{{code}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Request Latency (p50 / p95 / p99)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "histogram_quantile(0.50, sum(rate(http_requests_duration_seconds_bucket{job=\"kratos-admin\"}[5m])) by (le))", "legendFormat": "p50" }, + { "expr": "histogram_quantile(0.95, sum(rate(http_requests_duration_seconds_bucket{job=\"kratos-admin\"}[5m])) by (le))", "legendFormat": "p95" }, + { "expr": "histogram_quantile(0.99, sum(rate(http_requests_duration_seconds_bucket{job=\"kratos-admin\"}[5m])) by (le))", "legendFormat": "p99" } + ], "fieldConfig": { "defaults": { "unit": "s" } } }, { - "title": "Go Goroutines", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "go_goroutines{job=~\".*kratos.*\"}", - "legendFormat": "{{instance}}" - } - ] + "title": "Size & Timing", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, + "collapsed": false }, { - "title": "Memory Usage", + "title": "Request Size (p95)", "type": "timeseries", - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 23 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "go_memstats_alloc_bytes{job=~\".*kratos.*\"}", - "legendFormat": "{{instance}}" - } + { "expr": "histogram_quantile(0.95, 
sum(rate(http_requests_size_bytes_bucket{job=\"kratos-admin\"}[5m])) by (le))", "legendFormat": "p95" } ], "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "Response Size (p95)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 23 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "histogram_quantile(0.95, sum(rate(http_response_size_bytes_bucket{job=\"kratos-admin\"}[5m])) by (le))", "legendFormat": "p95" } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "Response Time (p95)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "histogram_quantile(0.95, sum(rate(http_response_time_seconds_bucket{job=\"kratos-admin\"}[5m])) by (le))", "legendFormat": "p95" } + ], + "fieldConfig": { "defaults": { "unit": "s" } } + }, + { + "title": "Runtime", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 31 }, + "collapsed": false + }, + { + "title": "Goroutines", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 32 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "go_goroutines{job=\"kratos-admin\"}", "legendFormat": "{{instance}}" } + ], + "fieldConfig": { "defaults": { "unit": "short" } } + }, + { + "title": "Memory", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 32 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "process_resident_memory_bytes{job=\"kratos-admin\"}", "legendFormat": "RSS" }, + { "expr": "go_memstats_alloc_bytes{job=\"kratos-admin\"}", "legendFormat": "Go alloc" } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "CPU Usage", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 32 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "rate(process_cpu_seconds_total{job=\"kratos-admin\"}[5m])", "legendFormat": "{{instance}}" } 
+ ], + "fieldConfig": { "defaults": { "unit": "short" } } } ], "schemaVersion": 39, "tags": ["kratos", "identity"], + "templating": { "list": [] }, "time": { "from": "now-1h", "to": "now" }, "title": "Kratos Identity", "uid": "kratos" diff --git a/base/monitoring/dashboards-infrastructure.yaml b/base/monitoring/dashboards-infrastructure.yaml index 7c8c8e6..797614c 100644 --- a/base/monitoring/dashboards-infrastructure.yaml +++ b/base/monitoring/dashboards-infrastructure.yaml @@ -14,32 +14,86 @@ data: { "annotations": { "list": [] }, "editable": true, + "graphTooltip": 1, "panels": [ { - "title": "Vault/OpenBao Sealed Status", + "title": "Health", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "collapsed": false + }, + { + "title": "Seal Status", "type": "stat", - "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, "datasource": { "uid": "prometheus" }, "targets": [ - { "expr": "vault_core_unsealed", "legendFormat": "unsealed" } + { "expr": "vault_core_unsealed", "legendFormat": "", "instant": true } ], "fieldConfig": { "defaults": { "mappings": [{"type":"value","options":{"0":{"text":"SEALED","color":"red"},"1":{"text":"UNSEALED","color":"green"}}}] } } }, { - "title": "Token Count", + "title": "Active Node", "type": "stat", - "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, "datasource": { "uid": "prometheus" }, "targets": [ - { "expr": "vault_token_count", "legendFormat": "tokens" } + { "expr": "vault_core_active", "legendFormat": "", "instant": true } + ], + "fieldConfig": { + "defaults": { "mappings": [{"type":"value","options":{"0":{"text":"Standby","color":"yellow"},"1":{"text":"Active","color":"green"}}}] } + } + }, + { + "title": "In-Flight Requests", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "vault_core_in_flight_requests", "legendFormat": "", "instant": 
true } + ], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":50},{"color":"red","value":200}] } } } + }, + { + "title": "Active Leases", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "vault_expire_num_leases", "legendFormat": "", "instant": true } ] }, + { + "title": "Irrevocable Leases", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "vault_expire_num_irrevocable_leases", "legendFormat": "", "instant": true } + ], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":1},{"color":"red","value":10}] } } } + }, + { + "title": "Mount Table Entries", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "vault_core_mount_table_num_entries", "legendFormat": "", "instant": true } + ] + }, + { + "title": "Request Performance", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "collapsed": false + }, { "title": "Request Rate", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 6 }, "datasource": { "uid": "prometheus" }, "targets": [ { "expr": "sum(rate(vault_core_handle_request_count[5m]))", "legendFormat": "req/s" } @@ -47,18 +101,126 @@ data: "fieldConfig": { "defaults": { "unit": "reqps" } } }, { - "title": "Request Latency (p95)", + "title": "Request Latency (avg)", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 6 }, "datasource": { "uid": "prometheus" }, "targets": [ - { "expr": "histogram_quantile(0.95, sum(rate(vault_core_handle_request_bucket[5m])) by (le))", "legendFormat": "p95" } + { "expr": 
"rate(vault_core_handle_request_sum[5m]) / rate(vault_core_handle_request_count[5m])", "legendFormat": "avg" } ], "fieldConfig": { "defaults": { "unit": "s" } } + }, + { + "title": "Token Lookups / sec", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 6 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "rate(vault_token_lookup_count[5m])", "legendFormat": "lookups/s" } + ], + "fieldConfig": { "defaults": { "unit": "ops" } } + }, + { + "title": "Barrier & Cache", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "collapsed": false + }, + { + "title": "Barrier Ops / sec", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "rate(vault_barrier_get_count[5m])", "legendFormat": "get" }, + { "expr": "rate(vault_barrier_put_count[5m])", "legendFormat": "put" }, + { "expr": "rate(vault_barrier_list_count[5m])", "legendFormat": "list" } + ], + "fieldConfig": { "defaults": { "unit": "ops" } } + }, + { + "title": "Cache Hit Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "rate(vault_cache_hit[5m])", "legendFormat": "cache hits/s" } + ], + "fieldConfig": { "defaults": { "unit": "ops" } } + }, + { + "title": "Audit", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, + "collapsed": false + }, + { + "title": "Audit Log Throughput", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "rate(vault_audit_log_request_count[5m])", "legendFormat": "request logs/s" }, + { "expr": "rate(vault_audit_log_response_count[5m])", "legendFormat": "response logs/s" } + ], + "fieldConfig": { "defaults": { "unit": "ops" } } + }, + { + "title": "Audit Log Failures", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, 
+ "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "rate(vault_audit_log_request_failure[5m])", "legendFormat": "request failures/s" }, + { "expr": "rate(vault_audit_log_response_failure[5m])", "legendFormat": "response failures/s" } + ], + "fieldConfig": { + "defaults": { "unit": "ops", "thresholds": { "steps": [{"color":"green","value":null},{"color":"red","value":0.01}] } } + } + }, + { + "title": "Runtime", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 }, + "collapsed": false + }, + { + "title": "Memory", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 33 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "vault_runtime_alloc_bytes", "legendFormat": "alloc" }, + { "expr": "vault_runtime_sys_bytes", "legendFormat": "sys" } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "Goroutines & Heap Objects", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 33 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "vault_runtime_num_goroutines", "legendFormat": "goroutines" }, + { "expr": "vault_runtime_heap_objects", "legendFormat": "heap objects" } + ], + "fieldConfig": { "defaults": { "unit": "short" } } + }, + { + "title": "GC Activity", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 33 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "rate(vault_runtime_total_gc_runs[5m])", "legendFormat": "GC runs/s" }, + { "expr": "rate(vault_runtime_total_gc_pause_ns[5m])", "legendFormat": "GC pause ns/s" } + ], + "fieldConfig": { "defaults": { "unit": "short" } } } ], "schemaVersion": 39, "tags": ["vault", "openbao"], + "templating": { "list": [] }, "time": { "from": "now-1h", "to": "now" }, "title": "OpenBao / Vault", "uid": "openbao" @@ -78,6 +240,7 @@ data: { "annotations": { "list": [] }, "editable": true, + "graphTooltip": 1, "panels": [ { "title": "Volume Usage", @@ -100,7 +263,8 @@ data: 
"targets": [ { "expr": "sum(longhorn_volume_capacity_bytes)", - "legendFormat": "total" + "legendFormat": "total", + "instant": true } ], "fieldConfig": { "defaults": { "unit": "bytes" } } @@ -113,7 +277,8 @@ data: "targets": [ { "expr": "(longhorn_disk_usage_bytes / longhorn_disk_capacity_bytes) * 100", - "legendFormat": "{{node}}" + "legendFormat": "{{node}}", + "instant": true } ], "fieldConfig": { @@ -132,7 +297,8 @@ data: "targets": [ { "expr": "longhorn_node_count_total", - "legendFormat": "nodes" + "legendFormat": "nodes", + "instant": true } ] }, @@ -153,6 +319,7 @@ data: ], "schemaVersion": 39, "tags": ["longhorn", "storage"], + "templating": { "list": [] }, "time": { "from": "now-1h", "to": "now" }, "title": "Longhorn Storage", "uid": "longhorn" @@ -172,78 +339,203 @@ data: { "annotations": { "list": [] }, "editable": true, + "graphTooltip": 1, "panels": [ { - "title": "Database Size", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "cnpg_pg_database_size_bytes", - "legendFormat": "{{datname}}" - } - ], - "fieldConfig": { "defaults": { "unit": "bytes" } } + "title": "Health", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "collapsed": false }, { - "title": "Active Connections", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "cnpg_pg_stat_activity_count", - "legendFormat": "{{state}}" - } - ] - }, - { - "title": "Collector Status", + "title": "Status", "type": "stat", - "gridPos": { "h": 4, "w": 6, "x": 0, "y": 8 }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "cnpg_collector_up", - "legendFormat": "{{instance}}" - } - ], - "fieldConfig": { - "defaults": { "mappings": [{"type":"value","options":{"0":{"text":"DOWN","color":"red"},"1":{"text":"UP","color":"green"}}}] } - } + 
"targets": [{ "expr": "cnpg_collector_up", "legendFormat": "", "instant": true }], + "fieldConfig": { "defaults": { "mappings": [{"type":"value","options":{"0":{"text":"DOWN","color":"red"},"1":{"text":"UP","color":"green"}}}] } } }, { - "title": "Transactions/sec", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 6, "y": 8 }, + "title": "Backends", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "rate(cnpg_pg_stat_database_xact_commit[5m]) + rate(cnpg_pg_stat_database_xact_rollback[5m])", - "legendFormat": "{{datname}}" - } - ], - "fieldConfig": { "defaults": { "unit": "ops" } } + "targets": [{ "expr": "sum(cnpg_backends_total)", "legendFormat": "", "instant": true }], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":60},{"color":"red","value":80}] } } } + }, + { + "title": "Waiting Backends", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [{ "expr": "sum(cnpg_backends_waiting_total)", "legendFormat": "", "instant": true }], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":5},{"color":"red","value":15}] } } } + }, + { + "title": "Longest Transaction", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [{ "expr": "max(cnpg_backends_max_tx_duration_seconds)", "legendFormat": "", "instant": true }], + "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":30},{"color":"red","value":300}] } } } }, { "title": "Cache Hit Ratio", "type": "gauge", - "gridPos": { "h": 4, "w": 6, "x": 18, "y": 8 }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [{ "expr": 
"sum(cnpg_pg_stat_database_blks_hit) / (sum(cnpg_pg_stat_database_blks_hit) + sum(cnpg_pg_stat_database_blks_read))", "legendFormat": "", "instant": true }], + "fieldConfig": { "defaults": { "unit": "percentunit", "min": 0, "max": 1, "thresholds": { "steps": [{"color":"red","value":null},{"color":"yellow","value":0.9},{"color":"green","value":0.99}] } } } + }, + { + "title": "WAL Size", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [{ "expr": "cnpg_collector_pg_wal", "legendFormat": "", "instant": true }], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "Databases", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "collapsed": false + }, + { + "title": "Database Size", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "datasource": { "uid": "prometheus" }, + "targets": [{ "expr": "cnpg_pg_database_size_bytes{datname!~'template.*|postgres'}", "legendFormat": "{{datname}}" }], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "Connections by Database", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "datasource": { "uid": "prometheus" }, + "targets": [{ "expr": "cnpg_backends_total{datname!~'template.*'}", "legendFormat": "{{datname}}" }], + "fieldConfig": { "defaults": { "unit": "short" } } + }, + { + "title": "Transactions / sec (commits)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "cnpg_pg_stat_database_blks_hit / (cnpg_pg_stat_database_blks_hit + cnpg_pg_stat_database_blks_read)", - "legendFormat": "{{datname}}" - } + { "expr": "rate(cnpg_pg_stat_database_xact_commit{datname!~'template.*|postgres'}[5m])", "legendFormat": "{{datname}} commits" }, + { "expr": "rate(cnpg_pg_stat_database_xact_rollback{datname!~'template.*|postgres'}[5m])", "legendFormat": "{{datname}} 
rollbacks" } ], - "fieldConfig": { - "defaults": { "unit": "percentunit", "max": 1 } - } + "fieldConfig": { "defaults": { "unit": "ops" } } + }, + { + "title": "Rows Fetched / sec", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, + "datasource": { "uid": "prometheus" }, + "targets": [{ "expr": "rate(cnpg_pg_stat_database_tup_fetched{datname!~'template.*|postgres'}[5m])", "legendFormat": "{{datname}}" }], + "fieldConfig": { "defaults": { "unit": "ops" } } + }, + { + "title": "I/O & Caching", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, + "collapsed": false + }, + { + "title": "Block I/O (reads vs cache hits)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 23 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "sum(rate(cnpg_pg_stat_database_blks_read[5m]))", "legendFormat": "disk reads" }, + { "expr": "sum(rate(cnpg_pg_stat_database_blks_hit[5m]))", "legendFormat": "cache hits" } + ], + "fieldConfig": { "defaults": { "unit": "ops" } } + }, + { + "title": "Temp Files / Bytes Written", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 23 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "rate(cnpg_pg_stat_database_temp_bytes[5m])", "legendFormat": "temp bytes/s" }, + { "expr": "rate(cnpg_pg_stat_database_temp_files[5m])", "legendFormat": "temp files/s" } + ], + "fieldConfig": { "defaults": { "unit": "Bps" } }, + "options": { "tooltip": { "mode": "multi" } } + }, + { + "title": "Row Mutations / sec", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "sum(rate(cnpg_pg_stat_database_tup_inserted[5m]))", "legendFormat": "inserts" }, + { "expr": "sum(rate(cnpg_pg_stat_database_tup_updated[5m]))", "legendFormat": "updates" }, + { "expr": "sum(rate(cnpg_pg_stat_database_tup_deleted[5m]))", "legendFormat": "deletes" } + ], + "fieldConfig": { "defaults": 
{ "unit": "ops" } } + }, + { + "title": "Deadlocks & Conflicts", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "rate(cnpg_pg_stat_database_deadlocks[5m])", "legendFormat": "deadlocks/s" }, + { "expr": "rate(cnpg_pg_stat_database_conflicts[5m])", "legendFormat": "conflicts/s" } + ], + "fieldConfig": { "defaults": { "unit": "ops" } } + }, + { + "title": "WAL & Archival", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 }, + "collapsed": false + }, + { + "title": "WAL Generation Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }, + "datasource": { "uid": "prometheus" }, + "targets": [{ "expr": "rate(cnpg_collector_wal_bytes[5m])", "legendFormat": "WAL bytes/s" }], + "fieldConfig": { "defaults": { "unit": "Bps" } } + }, + { + "title": "WAL Archival", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "rate(cnpg_pg_stat_archiver_archived_count[5m])", "legendFormat": "archived/s" }, + { "expr": "rate(cnpg_pg_stat_archiver_failed_count[5m])", "legendFormat": "failed/s" } + ], + "fieldConfig": { "defaults": { "unit": "ops" } } + }, + { + "title": "Seconds Since Last Archive", + "type": "stat", + "gridPos": { "h": 4, "w": 8, "x": 0, "y": 48 }, + "datasource": { "uid": "prometheus" }, + "targets": [{ "expr": "cnpg_pg_stat_archiver_seconds_since_last_archival", "legendFormat": "", "instant": true }], + "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":300},{"color":"red","value":900}] } } } + }, + { + "title": "Checkpoints", + "type": "timeseries", + "gridPos": { "h": 8, "w": 16, "x": 8, "y": 48 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "rate(cnpg_pg_stat_checkpointer_checkpoints_timed[5m])", "legendFormat": "timed" }, + { "expr": 
"rate(cnpg_pg_stat_checkpointer_checkpoints_req[5m])", "legendFormat": "requested" } + ], + "fieldConfig": { "defaults": { "unit": "ops" } } } ], "schemaVersion": 39, "tags": ["postgres", "cnpg"], + "templating": { "list": [] }, "time": { "from": "now-1h", "to": "now" }, "title": "PostgreSQL / CNPG", "uid": "postgres-cnpg" @@ -263,6 +555,7 @@ data: { "annotations": { "list": [] }, "editable": true, + "graphTooltip": 1, "panels": [ { "title": "Certificates Expiring", @@ -292,7 +585,8 @@ data: "targets": [ { "expr": "certmanager_certificate_ready_status{condition=\"True\"}", - "legendFormat": "{{name}}" + "legendFormat": "{{name}}", + "instant": true } ], "fieldConfig": { @@ -315,6 +609,7 @@ data: ], "schemaVersion": 39, "tags": ["cert-manager", "tls"], + "templating": { "list": [] }, "time": { "from": "now-1h", "to": "now" }, "title": "Cert-Manager", "uid": "cert-manager" diff --git a/base/monitoring/dashboards-ingress.yaml b/base/monitoring/dashboards-ingress.yaml index 20711ae..307fc64 100644 --- a/base/monitoring/dashboards-ingress.yaml +++ b/base/monitoring/dashboards-ingress.yaml @@ -19,111 +19,210 @@ data: "links": [], "panels": [ { - "title": "Requests / sec", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "title": "Traffic Overview", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "collapsed": false + }, + { + "title": "Active Connections", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "sum(rate(pingora_http_requests_total[5m]))", - "legendFormat": "total" - }, - { - "expr": "sum(rate(pingora_http_requests_total[5m])) by (status_code)", - "legendFormat": "{{status_code}}" - } + { "expr": "sum(sunbeam_active_connections)", "legendFormat": "", "instant": true } ], "fieldConfig": { - "defaults": { "unit": "reqps" } + "defaults": { "thresholds": { "steps": 
[{"color":"green","value":null},{"color":"yellow","value":500},{"color":"red","value":1000}] } } } }, { - "title": "Error Rate (5xx)", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "title": "Request Rate", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "sum(rate(pingora_http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(pingora_http_requests_total[5m]))", - "legendFormat": "5xx ratio" - } + { "expr": "sum(rate(sunbeam_requests_total[5m]))", "legendFormat": "", "instant": true } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Error Rate (5xx)", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "sum(rate(sunbeam_requests_total{status=~\"5..\"}[5m])) / sum(rate(sunbeam_requests_total[5m]))", "legendFormat": "", "instant": true } ], "fieldConfig": { "defaults": { "unit": "percentunit", "max": 1, "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.01},{"color":"red","value":0.05}] } } } }, { - "title": "Request Latency (p50 / p95 / p99)", + "title": "Latency (p95)", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "histogram_quantile(0.95, sum(rate(sunbeam_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "", "instant": true } + ], + "fieldConfig": { "defaults": { "unit": "s", "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":0.5},{"color":"red","value":2}] } } } + }, + { + "title": "Requests & Latency", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "collapsed": false + }, + { + "title": "Requests / sec by Status", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 6 }, 
"datasource": { "uid": "prometheus" }, "targets": [ { - "expr": "histogram_quantile(0.50, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))", + "expr": "sum(rate(sunbeam_requests_total[5m])) by (status)", + "legendFormat": "{{status}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Requests / sec by Backend", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 6 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(sunbeam_requests_total[5m])) by (backend)", + "legendFormat": "{{backend}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Requests / sec by Method", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 6 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(sunbeam_requests_total[5m])) by (method)", + "legendFormat": "{{method}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Request Latency (p50 / p95 / p99)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(sunbeam_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p50" }, { - "expr": "histogram_quantile(0.95, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(sunbeam_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95" }, { - "expr": "histogram_quantile(0.99, sum(rate(pingora_http_request_duration_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(sunbeam_request_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p99" } ], - "fieldConfig": { - "defaults": { "unit": "s" } - } + "fieldConfig": { "defaults": { "unit": "s" } } }, { "title": "Active Connections", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, 
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, "datasource": { "uid": "prometheus" }, "targets": [ { - "expr": "pingora_active_connections", - "legendFormat": "active" - } - ] - }, - { - "title": "Upstream Latency by Backend", - "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "histogram_quantile(0.95, sum(rate(pingora_upstream_duration_seconds_bucket[5m])) by (le, backend))", - "legendFormat": "{{backend}} p95" + "expr": "sunbeam_active_connections", + "legendFormat": "{{instance}}" } ], - "fieldConfig": { - "defaults": { "unit": "s" } - } + "fieldConfig": { "defaults": { "unit": "short" } } }, { - "title": "DDoS / Scanner Detections", + "title": "Security & Rate Limiting", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, + "collapsed": false + }, + { + "title": "DDoS Decisions", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 23 }, "datasource": { "uid": "prometheus" }, "targets": [ { - "expr": "sum(rate(pingora_ddos_detections_total[5m]))", - "legendFormat": "DDoS" - }, - { - "expr": "sum(rate(pingora_scanner_detections_total[5m]))", - "legendFormat": "Scanner" - }, - { - "expr": "sum(rate(pingora_rate_limit_rejected_total[5m]))", - "legendFormat": "Rate-limited" + "expr": "sum(rate(sunbeam_ddos_decisions_total[5m])) by (decision)", + "legendFormat": "{{decision}}" } ], - "fieldConfig": { - "defaults": { "unit": "reqps" } - } + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Scanner Decisions", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 23 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(sunbeam_scanner_decisions_total[5m])) by (decision)", + "legendFormat": "{{decision}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Rate Limit Decisions", + "type": 
"timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 23 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(sunbeam_rate_limit_decisions_total[5m])) by (decision)", + "legendFormat": "{{decision}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Ensemble Decision Paths", + "type": "row", + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 31 }, + "collapsed": false + }, + { + "title": "DDoS Ensemble Paths", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 32 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(sunbeam_ddos_ensemble_path_total[5m])) by (path)", + "legendFormat": "{{path}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Scanner Ensemble Paths", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 32 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(sunbeam_scanner_ensemble_path_total[5m])) by (path)", + "legendFormat": "{{path}}" + } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } } ], "schemaVersion": 39, diff --git a/base/monitoring/dashboards-storage.yaml b/base/monitoring/dashboards-storage.yaml index 90f9c53..bccdfca 100644 --- a/base/monitoring/dashboards-storage.yaml +++ b/base/monitoring/dashboards-storage.yaml @@ -14,6 +14,7 @@ data: { "annotations": { "list": [] }, "editable": true, + "graphTooltip": 1, "panels": [ { "title": "Cluster Overview", @@ -22,41 +23,50 @@ data: "collapsed": false }, { - "title": "Data Nodes", + "title": "Master Leader", "type": "stat", - "gridPos": { "h": 4, "w": 8, "x": 0, "y": 1 }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "seaweedfs_master_data_nodes or count(up{job=~\".*seaweedfs-volume.*\"})", - "legendFormat": "nodes" - } - ] - }, - { - "title": "Total Volume Count", - "type": "stat", - "gridPos": { "h": 4, "w": 8, "x": 8, "y": 1 
}, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "seaweedfs_master_volumes_count or sum(seaweedfs_volume_count)", - "legendFormat": "volumes" - } - ] - }, - { - "title": "Total Disk Free", - "type": "stat", - "gridPos": { "h": 4, "w": 8, "x": 16, "y": 1 }, - "datasource": { "uid": "prometheus" }, - "targets": [ - { - "expr": "sum(seaweedfs_disk_free_bytes)", - "legendFormat": "free" - } + { "expr": "SeaweedFS_master_is_leader", "legendFormat": "", "instant": true } ], - "fieldConfig": { "defaults": { "unit": "bytes" } } + "fieldConfig": { + "defaults": { + "mappings": [ + { "type": "value", "options": { "1": { "text": "Leader", "color": "green" }, "0": { "text": "Follower", "color": "red" } } } + ] + } + } + }, + { + "title": "Writable Volumes", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "SeaweedFS_master_volume_layout_writable", "legendFormat": "", "instant": true } + ], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{"color":"red","value":null},{"color":"yellow","value":1},{"color":"green","value":3}] } } } + }, + { + "title": "Crowded Volumes", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "SeaweedFS_master_volume_layout_crowded", "legendFormat": "", "instant": true } + ], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":1},{"color":"red","value":5}] } } } + }, + { + "title": "Leader Changes", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "increase(SeaweedFS_master_leader_changes[1h])", "legendFormat": "", "instant": true } + ], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":1},{"color":"red","value":3}] } } } }, { "title": 
"Volume Server", @@ -65,98 +75,253 @@ data: "collapsed": false }, { - "title": "Read Rate", - "type": "timeseries", - "gridPos": { "h": 8, "w": 8, "x": 0, "y": 6 }, + "title": "Total Volumes", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 6 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "sum(rate(seaweedfs_volume_read_total[5m]))", - "legendFormat": "reads" - } - ], - "fieldConfig": { "defaults": { "unit": "ops" } } + { "expr": "sum(SeaweedFS_volumeServer_volumes)", "legendFormat": "", "instant": true } + ] }, { - "title": "Write Rate", - "type": "timeseries", - "gridPos": { "h": 8, "w": 8, "x": 8, "y": 6 }, + "title": "Max Volumes", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 6 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "sum(rate(seaweedfs_volume_write_total[5m]))", - "legendFormat": "writes" - } - ], - "fieldConfig": { "defaults": { "unit": "ops" } } + { "expr": "sum(SeaweedFS_volumeServer_max_volumes)", "legendFormat": "", "instant": true } + ] }, { - "title": "Disk Usage", - "type": "timeseries", - "gridPos": { "h": 8, "w": 8, "x": 16, "y": 6 }, + "title": "Disk Size", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 6 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "seaweedfs_disk_used_bytes", - "legendFormat": "{{instance}}" - } + { "expr": "sum(SeaweedFS_volumeServer_total_disk_size)", "legendFormat": "", "instant": true } ], "fieldConfig": { "defaults": { "unit": "bytes" } } }, + { + "title": "Read-Only Volumes", + "type": "stat", + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 6 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "sum(SeaweedFS_volumeServer_read_only_volumes)", "legendFormat": "", "instant": true } + ], + "fieldConfig": { "defaults": { "thresholds": { "steps": [{"color":"green","value":null},{"color":"yellow","value":1},{"color":"red","value":5}] } } } + }, + { + "title": "Volume Request Rate", + "type": 
"timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 10 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "sum(rate(SeaweedFS_volumeServer_request_total[5m]))", "legendFormat": "requests/s" } + ], + "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "Volume Latency (p95)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 10 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "histogram_quantile(0.95, sum(rate(SeaweedFS_volumeServer_request_seconds_bucket[5m])) by (le))", "legendFormat": "p95" } + ], + "fieldConfig": { "defaults": { "unit": "s" } } + }, + { + "title": "Volume In-Flight Requests", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 10 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "sum(SeaweedFS_volumeServer_in_flight_requests)", "legendFormat": "in-flight" } + ], + "fieldConfig": { "defaults": { "unit": "short" } } + }, + { + "title": "Master Heartbeats & Errors", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "rate(SeaweedFS_master_received_heartbeats[5m])", "legendFormat": "heartbeats/s" }, + { "expr": "rate(SeaweedFS_master_pick_for_write_error[5m])", "legendFormat": "write errors/s" }, + { "expr": "rate(SeaweedFS_master_broadcast_to_full[5m])", "legendFormat": "broadcast full/s" } + ], + "fieldConfig": { "defaults": { "unit": "ops" } } + }, + { + "title": "Vacuuming Activity", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "SeaweedFS_volumeServer_resource{type=\"vacuuming\"}", "legendFormat": "{{instance}}" } + ], + "fieldConfig": { "defaults": { "unit": "short" } } + }, { "title": "Filer", "type": "row", - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }, "collapsed": false }, { 
"title": "Filer Request Rate", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 27 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "sum(rate(seaweedfs_filer_request_total[5m])) by (type)", - "legendFormat": "{{type}}" - } + { "expr": "sum(rate(SeaweedFS_filer_request_total[5m]))", "legendFormat": "requests/s" } ], "fieldConfig": { "defaults": { "unit": "reqps" } } }, { - "title": "Filer Latency p95", + "title": "Filer Latency (p95)", "type": "timeseries", - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 27 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "histogram_quantile(0.95, sum(rate(seaweedfs_filer_request_duration_seconds_bucket[5m])) by (le))", - "legendFormat": "p95" - } + { "expr": "histogram_quantile(0.95, sum(rate(SeaweedFS_filer_request_seconds_bucket[5m])) by (le))", "legendFormat": "p95" } ], "fieldConfig": { "defaults": { "unit": "s" } } }, + { + "title": "Filer In-Flight Requests", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 27 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "sum(SeaweedFS_filer_in_flight_requests)", "legendFormat": "in-flight" } + ], + "fieldConfig": { "defaults": { "unit": "short" } } + }, + { + "title": "Filer Handler Ops", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 35 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "sum(rate(SeaweedFS_filer_handler_total[5m])) by (type)", "legendFormat": "{{type}}" } + ], + "fieldConfig": { "defaults": { "unit": "ops" } } + }, + { + "title": "Filer Store Ops", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 35 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "sum(rate(SeaweedFS_filerStore_request_total[5m])) by (type)", "legendFormat": "{{type}}" }, + { "expr": "histogram_quantile(0.95, 
sum(rate(SeaweedFS_filerStore_request_seconds_bucket[5m])) by (le))", "legendFormat": "store p95 latency" } + ], + "fieldConfig": { "defaults": { "unit": "ops" } } + }, { "title": "S3 API", "type": "row", - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 43 }, "collapsed": false }, { - "title": "S3 Requests", + "title": "S3 Request Rate", "type": "timeseries", - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 24 }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 44 }, "datasource": { "uid": "prometheus" }, "targets": [ - { - "expr": "sum(rate(seaweedfs_s3_request_total[5m])) by (bucket, method)", - "legendFormat": "{{bucket}} {{method}}" - } + { "expr": "sum(rate(SeaweedFS_s3_request_total[5m])) by (bucket)", "legendFormat": "{{bucket}}" } ], "fieldConfig": { "defaults": { "unit": "reqps" } } + }, + { + "title": "S3 Latency (p95)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 44 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "histogram_quantile(0.95, sum(rate(SeaweedFS_s3_request_seconds_bucket[5m])) by (le))", "legendFormat": "p95" } + ], + "fieldConfig": { "defaults": { "unit": "s" } } + }, + { + "title": "S3 Time to First Byte (p95)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 44 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "histogram_quantile(0.95, sum(rate(SeaweedFS_s3_time_to_first_byte_millisecond_bucket[5m])) by (le))", "legendFormat": "p95" } + ], + "fieldConfig": { "defaults": { "unit": "ms" } } + }, + { + "title": "S3 Bucket Sizes", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 52 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "SeaweedFS_s3_bucket_size_bytes", "legendFormat": "{{bucket}}" } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "S3 Bucket Physical Sizes", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 52 }, + 
"datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "SeaweedFS_s3_bucket_physical_size_bytes", "legendFormat": "{{bucket}}" } + ], + "fieldConfig": { "defaults": { "unit": "bytes" } } + }, + { + "title": "S3 Object Count per Bucket", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 52 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "SeaweedFS_s3_bucket_object_count", "legendFormat": "{{bucket}}" } + ], + "fieldConfig": { "defaults": { "unit": "short" } } + }, + { + "title": "S3 Traffic Received", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 60 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "sum(rate(SeaweedFS_s3_bucket_traffic_received_bytes_total[5m])) by (bucket)", "legendFormat": "{{bucket}}" } + ], + "fieldConfig": { "defaults": { "unit": "Bps" } } + }, + { + "title": "S3 Traffic Sent", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 60 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "sum(rate(SeaweedFS_s3_bucket_traffic_sent_bytes_total[5m])) by (bucket)", "legendFormat": "{{bucket}}" } + ], + "fieldConfig": { "defaults": { "unit": "Bps" } } + }, + { + "title": "S3 Object Uploads / Deletes", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 60 }, + "datasource": { "uid": "prometheus" }, + "targets": [ + { "expr": "sum(rate(SeaweedFS_s3_uploaded_objects[5m]))", "legendFormat": "uploads/s" }, + { "expr": "sum(rate(SeaweedFS_s3_deleted_objects[5m]))", "legendFormat": "deletes/s" } + ], + "fieldConfig": { "defaults": { "unit": "ops" } } } ], "schemaVersion": 39, "tags": ["seaweedfs", "storage", "s3"], + "templating": { "list": [] }, "time": { "from": "now-1h", "to": "now" }, "title": "SeaweedFS", "uid": "seaweedfs"