From 3722972ddfa4d702cd789a9cdac87d3b763f7296 Mon Sep 17 00:00:00 2001
From: Sienna Meridian Satterwhite
Date: Tue, 10 Mar 2026 23:38:21 +0000
Subject: [PATCH] feat(cluster): add Prometheus metrics for cluster gossip and bandwidth

New metrics: cluster peers gauge, bandwidth in/out gauges, gossip message
counter, aggregate rate gauges (in/out/total bytes/sec), model update
counter, and bandwidth limit enforcement decision counter.

Signed-off-by: Sienna Meridian Satterwhite
---

Reviewer notes (this section is not part of the commit message):

 * Every metric below is a `LazyLock` static: its closure builds the
   metric, registers a clone with `REGISTRY`, and returns the handle the
   first time the static is dereferenced.
 * Both construction and registration end in `.unwrap()`, so a failure in
   either step panics at that first use.
 * Plain gauges are `LazyLock<Gauge>`; labelled counters are
   `LazyLock<IntCounterVec>` with these label names:
     sunbeam_cluster_gossip_messages_total    -> "channel"
     sunbeam_bandwidth_limit_decisions_total  -> "decision"
     sunbeam_cluster_model_updates_total      -> "model_type", "result"

 src/metrics.rs | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)

diff --git a/src/metrics.rs b/src/metrics.rs
index c6da4ab..5c12004 100644
--- a/src/metrics.rs
+++ b/src/metrics.rs
@@ -95,6 +95,105 @@ pub static ACTIVE_CONNECTIONS: LazyLock = LazyLock::new(|| {
     g
 });
 
+pub static CLUSTER_PEERS: LazyLock<Gauge> = LazyLock::new(|| {
+    let g = Gauge::new(
+        "sunbeam_cluster_peers",
+        "Number of active cluster peers",
+    )
+    .unwrap();
+    REGISTRY.register(Box::new(g.clone())).unwrap();
+    g
+});
+
+pub static CLUSTER_BANDWIDTH_IN: LazyLock<Gauge> = LazyLock::new(|| {
+    let g = Gauge::new(
+        "sunbeam_cluster_bandwidth_in_bytes",
+        "Total cluster-wide inbound bytes",
+    )
+    .unwrap();
+    REGISTRY.register(Box::new(g.clone())).unwrap();
+    g
+});
+
+pub static CLUSTER_BANDWIDTH_OUT: LazyLock<Gauge> = LazyLock::new(|| {
+    let g = Gauge::new(
+        "sunbeam_cluster_bandwidth_out_bytes",
+        "Total cluster-wide outbound bytes",
+    )
+    .unwrap();
+    REGISTRY.register(Box::new(g.clone())).unwrap();
+    g
+});
+
+pub static CLUSTER_GOSSIP_MESSAGES: LazyLock<IntCounterVec> = LazyLock::new(|| {
+    let c = IntCounterVec::new(
+        Opts::new(
+            "sunbeam_cluster_gossip_messages_total",
+            "Gossip messages sent and received",
+        ),
+        &["channel"],
+    )
+    .unwrap();
+    REGISTRY.register(Box::new(c.clone())).unwrap();
+    c
+});
+
+pub static CLUSTER_AGGREGATE_IN_RATE: LazyLock<Gauge> = LazyLock::new(|| {
+    let g = Gauge::new(
+        "sunbeam_cluster_aggregate_in_bytes_per_sec",
+        "Cluster-wide aggregate inbound bandwidth (bytes/sec, sliding window)",
+    )
+    .unwrap();
+    REGISTRY.register(Box::new(g.clone())).unwrap();
+    g
+});
+
+pub static CLUSTER_AGGREGATE_OUT_RATE: LazyLock<Gauge> = LazyLock::new(|| {
+    let g = Gauge::new(
+        "sunbeam_cluster_aggregate_out_bytes_per_sec",
+        "Cluster-wide aggregate outbound bandwidth (bytes/sec, sliding window)",
+    )
+    .unwrap();
+    REGISTRY.register(Box::new(g.clone())).unwrap();
+    g
+});
+
+pub static CLUSTER_AGGREGATE_TOTAL_RATE: LazyLock<Gauge> = LazyLock::new(|| {
+    let g = Gauge::new(
+        "sunbeam_cluster_aggregate_total_bytes_per_sec",
+        "Cluster-wide aggregate total bandwidth (bytes/sec, sliding window)",
+    )
+    .unwrap();
+    REGISTRY.register(Box::new(g.clone())).unwrap();
+    g
+});
+
+pub static BANDWIDTH_LIMIT_DECISIONS: LazyLock<IntCounterVec> = LazyLock::new(|| {
+    let c = IntCounterVec::new(
+        Opts::new(
+            "sunbeam_bandwidth_limit_decisions_total",
+            "Cluster bandwidth limit enforcement decisions",
+        ),
+        &["decision"],
+    )
+    .unwrap();
+    REGISTRY.register(Box::new(c.clone())).unwrap();
+    c
+});
+
+pub static CLUSTER_MODEL_UPDATES: LazyLock<IntCounterVec> = LazyLock::new(|| {
+    let c = IntCounterVec::new(
+        Opts::new(
+            "sunbeam_cluster_model_updates_total",
+            "Model distribution events",
+        ),
+        &["model_type", "result"],
+    )
+    .unwrap();
+    REGISTRY.register(Box::new(c.clone())).unwrap();
+    c
+});
+
 /// Spawn a lightweight HTTP server on `port` serving `/metrics` and `/health`.
 /// Returns immediately; the server runs in the background on the tokio runtime.
 /// Port 0 = disabled.