feat: add real-time metrics collection with ring buffer storage (MON-01)

Implements monitoring/collector.rs, which collects per-container CPU/RAM/network/disk usage,
system-wide metrics, RPC latency, and the WebSocket connection count every 60 seconds.
Data is stored in dual ring buffers: 1-min resolution (24h) and 15-min resolution (7d).
Adds three new RPC endpoints: monitoring.current, monitoring.history, monitoring.containers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Dorian
2026-03-11 11:11:02 +00:00
parent 47c783ceac
commit baeeb72f27
8 changed files with 1285 additions and 40 deletions

View File

@@ -2,6 +2,7 @@ use crate::api::ApiHandler;
use crate::config::{Config, ContainerRuntime};
use crate::container::{docker_packages, DockerPackageScanner};
use crate::identity::{self, NodeIdentity};
use crate::monitoring::MetricsStore;
use crate::node_message;
use crate::nostr_discovery;
use crate::peers;
@@ -13,7 +14,7 @@ use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;
use tokio::net::TcpListener;
use tracing::{debug, error, info};
use tracing::{debug, error, info, warn};
pub struct Server {
_config: Config,
@@ -74,7 +75,14 @@ impl Server {
info!("🔑 Node identity: {} (pubkey: {}...)", identity.node_id(), &identity.pubkey_hex()[..16.min(identity.pubkey_hex().len())]);
let identity = Arc::new(identity);
let api_handler = Arc::new(ApiHandler::new(config.clone(), state_manager.clone()).await?);
// Create metrics store and spawn background collector
let metrics_store = Arc::new(MetricsStore::new());
crate::monitoring::spawn_metrics_collector(metrics_store.clone());
let api_handler = Arc::new(
ApiHandler::new(config.clone(), state_manager.clone(), metrics_store).await?,
);
// Periodic Tor address refresh (runs regardless of dev_mode)
// Picks up hostname when Tor creates it after startup/rotation (30-60s delay)
@@ -131,6 +139,9 @@ impl Server {
});
}
// Container health monitoring — auto-restart unhealthy containers
crate::health_monitor::spawn_health_monitor(state_manager.clone());
Ok(Self {
_config: config,
_identity: identity,
@@ -140,37 +151,71 @@ impl Server {
}
/// Runs the server on `addr` without a shutdown signal.
///
/// Delegates to `serve_with_shutdown`, passing `std::future::pending()` (a
/// future that never completes) as the shutdown future, and returns whatever
/// that call returns.
pub async fn serve(&self, addr: SocketAddr) -> Result<()> {
self.serve_with_shutdown(addr, std::future::pending()).await
}
/// Serve with a graceful shutdown signal.
/// When the shutdown future completes, stop accepting new connections and drain in-flight requests.
pub async fn serve_with_shutdown(
&self,
addr: SocketAddr,
shutdown: impl std::future::Future<Output = ()>,
) -> Result<()> {
let listener = TcpListener::bind(addr).await?;
let active_connections = Arc::new(tokio::sync::Semaphore::new(1024));
tokio::pin!(shutdown);
loop {
let (stream, peer_addr) = match listener.accept().await {
Ok(conn) => conn,
Err(e) => {
error!("Failed to accept connection: {}", e);
continue;
tokio::select! {
result = listener.accept() => {
let (stream, peer_addr) = match result {
Ok(conn) => conn,
Err(e) => {
error!("Failed to accept connection: {}", e);
continue;
}
};
let handler = self.api_handler.clone();
let permit = active_connections.clone().acquire_owned().await;
tokio::spawn(async move {
let _permit = permit;
let service = service_fn(move |req| {
let handler = handler.clone();
async move {
handler.handle_request(req).await
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("{}", e)))
}
});
if let Err(e) = Http::new()
.http1_keep_alive(false)
.serve_connection(stream, service)
.with_upgrades()
.await
{
error!("Error serving connection from {}: {}", peer_addr, e);
}
});
}
};
let handler = self.api_handler.clone();
tokio::spawn(async move {
let service = service_fn(move |req| {
let handler = handler.clone();
async move {
handler.handle_request(req).await
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("{}", e)))
_ = &mut shutdown => {
info!("Shutdown signal received, draining connections...");
// Wait up to 5 seconds for in-flight requests to complete
let drain_start = std::time::Instant::now();
let drain_timeout = std::time::Duration::from_secs(5);
while active_connections.available_permits() < 1024 {
if drain_start.elapsed() > drain_timeout {
warn!("Drain timeout reached, forcing shutdown");
break;
}
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
}
});
if let Err(e) = Http::new()
.http1_keep_alive(false)
.serve_connection(stream, service)
.with_upgrades()
.await
{
error!("Error serving connection from {}: {}", peer_addr, e);
info!("Shutdown complete");
return Ok(());
}
});
}
}
}
}