feat(cluster): add Prometheus metrics for cluster gossip and bandwidth
New metrics: cluster peers gauge, bandwidth in/out gauges, gossip message counter, aggregate rate gauges (in/out/total bytes/sec), model update counter, and bandwidth limit enforcement decision counter. Signed-off-by: Sienna Meridian Satterwhite <sienna@sunbeam.pt>
This commit is contained in:
@@ -95,6 +95,105 @@ pub static ACTIVE_CONNECTIONS: LazyLock<Gauge> = LazyLock::new(|| {
|
|||||||
g
|
g
|
||||||
});
|
});
|
||||||
|
|
||||||
|
pub static CLUSTER_PEERS: LazyLock<Gauge> = LazyLock::new(|| {
|
||||||
|
let g = Gauge::new(
|
||||||
|
"sunbeam_cluster_peers",
|
||||||
|
"Number of active cluster peers",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
REGISTRY.register(Box::new(g.clone())).unwrap();
|
||||||
|
g
|
||||||
|
});
|
||||||
|
|
||||||
|
pub static CLUSTER_BANDWIDTH_IN: LazyLock<Gauge> = LazyLock::new(|| {
|
||||||
|
let g = Gauge::new(
|
||||||
|
"sunbeam_cluster_bandwidth_in_bytes",
|
||||||
|
"Total cluster-wide inbound bytes",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
REGISTRY.register(Box::new(g.clone())).unwrap();
|
||||||
|
g
|
||||||
|
});
|
||||||
|
|
||||||
|
pub static CLUSTER_BANDWIDTH_OUT: LazyLock<Gauge> = LazyLock::new(|| {
|
||||||
|
let g = Gauge::new(
|
||||||
|
"sunbeam_cluster_bandwidth_out_bytes",
|
||||||
|
"Total cluster-wide outbound bytes",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
REGISTRY.register(Box::new(g.clone())).unwrap();
|
||||||
|
g
|
||||||
|
});
|
||||||
|
|
||||||
|
pub static CLUSTER_GOSSIP_MESSAGES: LazyLock<IntCounterVec> = LazyLock::new(|| {
|
||||||
|
let c = IntCounterVec::new(
|
||||||
|
Opts::new(
|
||||||
|
"sunbeam_cluster_gossip_messages_total",
|
||||||
|
"Gossip messages sent and received",
|
||||||
|
),
|
||||||
|
&["channel"],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
REGISTRY.register(Box::new(c.clone())).unwrap();
|
||||||
|
c
|
||||||
|
});
|
||||||
|
|
||||||
|
pub static CLUSTER_AGGREGATE_IN_RATE: LazyLock<Gauge> = LazyLock::new(|| {
|
||||||
|
let g = Gauge::new(
|
||||||
|
"sunbeam_cluster_aggregate_in_bytes_per_sec",
|
||||||
|
"Cluster-wide aggregate inbound bandwidth (bytes/sec, sliding window)",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
REGISTRY.register(Box::new(g.clone())).unwrap();
|
||||||
|
g
|
||||||
|
});
|
||||||
|
|
||||||
|
pub static CLUSTER_AGGREGATE_OUT_RATE: LazyLock<Gauge> = LazyLock::new(|| {
|
||||||
|
let g = Gauge::new(
|
||||||
|
"sunbeam_cluster_aggregate_out_bytes_per_sec",
|
||||||
|
"Cluster-wide aggregate outbound bandwidth (bytes/sec, sliding window)",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
REGISTRY.register(Box::new(g.clone())).unwrap();
|
||||||
|
g
|
||||||
|
});
|
||||||
|
|
||||||
|
pub static CLUSTER_AGGREGATE_TOTAL_RATE: LazyLock<Gauge> = LazyLock::new(|| {
|
||||||
|
let g = Gauge::new(
|
||||||
|
"sunbeam_cluster_aggregate_total_bytes_per_sec",
|
||||||
|
"Cluster-wide aggregate total bandwidth (bytes/sec, sliding window)",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
REGISTRY.register(Box::new(g.clone())).unwrap();
|
||||||
|
g
|
||||||
|
});
|
||||||
|
|
||||||
|
pub static BANDWIDTH_LIMIT_DECISIONS: LazyLock<IntCounterVec> = LazyLock::new(|| {
|
||||||
|
let c = IntCounterVec::new(
|
||||||
|
Opts::new(
|
||||||
|
"sunbeam_bandwidth_limit_decisions_total",
|
||||||
|
"Cluster bandwidth limit enforcement decisions",
|
||||||
|
),
|
||||||
|
&["decision"],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
REGISTRY.register(Box::new(c.clone())).unwrap();
|
||||||
|
c
|
||||||
|
});
|
||||||
|
|
||||||
|
pub static CLUSTER_MODEL_UPDATES: LazyLock<IntCounterVec> = LazyLock::new(|| {
|
||||||
|
let c = IntCounterVec::new(
|
||||||
|
Opts::new(
|
||||||
|
"sunbeam_cluster_model_updates_total",
|
||||||
|
"Model distribution events",
|
||||||
|
),
|
||||||
|
&["model_type", "result"],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
REGISTRY.register(Box::new(c.clone())).unwrap();
|
||||||
|
c
|
||||||
|
});
|
||||||
|
|
||||||
/// Spawn a lightweight HTTP server on `port` serving `/metrics` and `/health`.
|
/// Spawn a lightweight HTTP server on `port` serving `/metrics` and `/health`.
|
||||||
/// Returns immediately; the server runs in the background on the tokio runtime.
|
/// Returns immediately; the server runs in the background on the tokio runtime.
|
||||||
/// Port 0 = disabled.
|
/// Port 0 = disabled.
|
||||||
|
|||||||
Reference in New Issue
Block a user