- Remove async-nats from Cargo.toml dependencies - Rename nats_publish_failed metric → redis_publish_failed - Update queue lib doc comment: Redis Streams + Redis Pub/Sub - Add Paused/Cancelled task statuses to agent_task model - Add issue_id and retry_count fields to agent_task - Switch tool executor Mutex from std::sync → tokio::sync (async context) - Add timeout/rate-limited/retryable/tool-not-found error variants
194 lines
6.9 KiB
Rust
194 lines
6.9 KiB
Rust
use std::collections::HashMap;
|
|
use std::sync::Arc;
|
|
|
|
use metrics::{
|
|
describe_counter, describe_gauge, describe_histogram, register_counter, register_gauge, register_histogram, Counter,
|
|
Gauge, Histogram, Unit,
|
|
};
|
|
use tokio::sync::RwLock;
|
|
use uuid::Uuid;
|
|
|
|
pub struct RoomMetrics {
|
|
pub rooms_online: Gauge,
|
|
pub users_online: Gauge,
|
|
pub ws_connections_active: Gauge,
|
|
pub ws_connections_total: Counter,
|
|
pub ws_disconnections_total: Counter,
|
|
pub messages_sent: Counter,
|
|
pub messages_persisted: Counter,
|
|
pub messages_persist_failed: Counter,
|
|
pub broadcasts_sent: Counter,
|
|
pub broadcasts_dropped: Counter,
|
|
pub duplicates_skipped: Counter,
|
|
pub redis_publish_failed: Counter,
|
|
pub message_latency_ms: Histogram,
|
|
pub ws_rate_limit_hits: Counter,
|
|
pub ws_auth_failures: Counter,
|
|
pub ws_heartbeat_sent_total: Counter,
|
|
pub ws_heartbeat_timeout_total: Counter,
|
|
pub ws_idle_timeout_total: Counter,
|
|
room_connections: RwLock<HashMap<Uuid, Gauge>>,
|
|
room_messages: RwLock<HashMap<Uuid, Counter>>,
|
|
}
|
|
|
|
impl Default for RoomMetrics {
|
|
fn default() -> Self {
|
|
describe_gauge!("room_online_rooms", "Number of rooms with active workers");
|
|
describe_gauge!(
|
|
"room_online_users",
|
|
"Total number of online WebSocket users"
|
|
);
|
|
describe_gauge!(
|
|
"room_ws_connections_active",
|
|
"Current number of active WebSocket connections"
|
|
);
|
|
describe_counter!(
|
|
"room_ws_connections_total",
|
|
Unit::Count,
|
|
"Total WebSocket connections established"
|
|
);
|
|
describe_counter!(
|
|
"room_ws_disconnections_total",
|
|
Unit::Count,
|
|
"Total WebSocket disconnections"
|
|
);
|
|
describe_counter!(
|
|
"room_messages_sent_total",
|
|
Unit::Count,
|
|
"Total messages sent to rooms"
|
|
);
|
|
describe_counter!(
|
|
"room_messages_persisted_total",
|
|
Unit::Count,
|
|
"Total messages persisted to database"
|
|
);
|
|
describe_counter!(
|
|
"room_messages_persist_failed_total",
|
|
Unit::Count,
|
|
"Total message persistence failures"
|
|
);
|
|
describe_counter!(
|
|
"room_broadcasts_sent_total",
|
|
Unit::Count,
|
|
"Total WebSocket broadcasts sent"
|
|
);
|
|
describe_counter!(
|
|
"room_duplicates_skipped_total",
|
|
Unit::Count,
|
|
"Total duplicate messages skipped (idempotency)"
|
|
);
|
|
describe_counter!(
|
|
"room_redis_publish_failed_total",
|
|
Unit::Count,
|
|
"Total Redis publish failures"
|
|
);
|
|
describe_histogram!(
|
|
"room_message_latency_ms",
|
|
Unit::Milliseconds,
|
|
"Message processing latency from publish to persist"
|
|
);
|
|
describe_counter!(
|
|
"room_ws_rate_limit_hits_total",
|
|
Unit::Count,
|
|
"Total WebSocket rate limit rejections"
|
|
);
|
|
describe_counter!(
|
|
"room_ws_auth_failures_total",
|
|
Unit::Count,
|
|
"Total WebSocket authentication/authorization failures"
|
|
);
|
|
describe_counter!(
|
|
"room_ws_heartbeat_sent_total",
|
|
Unit::Count,
|
|
"Total WebSocket heartbeat pings sent by server"
|
|
);
|
|
describe_counter!(
|
|
"room_ws_heartbeat_timeout_total",
|
|
Unit::Count,
|
|
"Total WebSocket connections closed due to heartbeat timeout"
|
|
);
|
|
describe_counter!(
|
|
"room_ws_idle_timeout_total",
|
|
Unit::Count,
|
|
"Total WebSocket connections closed due to idle timeout"
|
|
);
|
|
describe_counter!(
|
|
"room_broadcasts_dropped_total",
|
|
Unit::Count,
|
|
"Total broadcasts dropped due to channel full"
|
|
);
|
|
|
|
Self {
|
|
rooms_online: register_gauge!("room_online_rooms"),
|
|
users_online: register_gauge!("room_online_users"),
|
|
ws_connections_active: register_gauge!("room_ws_connections_active"),
|
|
ws_connections_total: register_counter!("room_ws_connections_total"),
|
|
ws_disconnections_total: register_counter!("room_ws_disconnections_total"),
|
|
messages_sent: register_counter!("room_messages_sent_total"),
|
|
messages_persisted: register_counter!("room_messages_persisted_total"),
|
|
messages_persist_failed: register_counter!("room_messages_persist_failed_total"),
|
|
broadcasts_sent: register_counter!("room_broadcasts_sent_total"),
|
|
broadcasts_dropped: register_counter!("room_broadcasts_dropped_total"),
|
|
duplicates_skipped: register_counter!("room_duplicates_skipped_total"),
|
|
redis_publish_failed: register_counter!("room_redis_publish_failed_total"),
|
|
message_latency_ms: register_histogram!("room_message_latency_ms"),
|
|
ws_rate_limit_hits: register_counter!("room_ws_rate_limit_hits_total"),
|
|
ws_auth_failures: register_counter!("room_ws_auth_failures_total"),
|
|
ws_heartbeat_sent_total: register_counter!("room_ws_heartbeat_sent_total"),
|
|
ws_heartbeat_timeout_total: register_counter!("room_ws_heartbeat_timeout_total"),
|
|
ws_idle_timeout_total: register_counter!("room_ws_idle_timeout_total"),
|
|
room_connections: RwLock::new(HashMap::new()),
|
|
room_messages: RwLock::new(HashMap::new()),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl RoomMetrics {
|
|
pub fn new() -> Self {
|
|
Self::default()
|
|
}
|
|
|
|
pub fn record_message_latency(&self, ms: f64) {
|
|
self.message_latency_ms.record(ms);
|
|
}
|
|
|
|
pub fn incr_duplicates_skipped(&self) {
|
|
self.duplicates_skipped.increment(1);
|
|
}
|
|
|
|
pub async fn incr_room_connections(&self, room_id: Uuid) {
|
|
let mut map = self.room_connections.write().await;
|
|
let counter = map.entry(room_id).or_insert_with(|| {
|
|
register_gauge!(format!("room_connections{{room_id=\"{}\"}}", room_id))
|
|
});
|
|
counter.increment(1.0);
|
|
}
|
|
|
|
pub async fn dec_room_connections(&self, room_id: Uuid) {
|
|
let map = self.room_connections.read().await;
|
|
if let Some(counter) = map.get(&room_id) {
|
|
counter.decrement(1.0);
|
|
}
|
|
}
|
|
|
|
pub async fn incr_room_messages(&self, room_id: Uuid) {
|
|
let mut map = self.room_messages.write().await;
|
|
let counter = map.entry(room_id).or_insert_with(|| {
|
|
register_counter!(format!("room_messages_total{{room_id=\"{}\"}}", room_id))
|
|
});
|
|
counter.increment(1);
|
|
}
|
|
|
|
pub async fn cleanup_stale_rooms(&self, active_room_ids: &[Uuid]) {
|
|
let mut conn_map = self.room_connections.write().await;
|
|
conn_map.retain(|room_id, _| active_room_ids.contains(room_id));
|
|
|
|
let mut msg_map = self.room_messages.write().await;
|
|
msg_map.retain(|room_id, _| active_room_ids.contains(room_id));
|
|
}
|
|
|
|
pub fn into_arc(self) -> Arc<RoomMetrics> {
|
|
Arc::new(self)
|
|
}
|
|
}
|