feat: add real-time metrics collection with ring buffer storage (MON-01)
Implements monitoring/collector.rs, which collects per-container CPU/RAM/network/disk metrics, system-wide metrics, RPC latency, and the WebSocket connection count every 60 seconds. Data is stored in dual ring buffers: 1-min resolution (24h) and 15-min resolution (7d). Adds three new RPC endpoints: monitoring.current, monitoring.history, and monitoring.containers. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,6 +2,7 @@ use crate::api::ApiHandler;
|
||||
use crate::config::{Config, ContainerRuntime};
|
||||
use crate::container::{docker_packages, DockerPackageScanner};
|
||||
use crate::identity::{self, NodeIdentity};
|
||||
use crate::monitoring::MetricsStore;
|
||||
use crate::node_message;
|
||||
use crate::nostr_discovery;
|
||||
use crate::peers;
|
||||
@@ -13,7 +14,7 @@ use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::net::TcpListener;
|
||||
use tracing::{debug, error, info};
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
pub struct Server {
|
||||
_config: Config,
|
||||
@@ -74,7 +75,14 @@ impl Server {
|
||||
info!("🔑 Node identity: {} (pubkey: {}...)", identity.node_id(), &identity.pubkey_hex()[..16.min(identity.pubkey_hex().len())]);
|
||||
|
||||
let identity = Arc::new(identity);
|
||||
let api_handler = Arc::new(ApiHandler::new(config.clone(), state_manager.clone()).await?);
|
||||
|
||||
// Create metrics store and spawn background collector
|
||||
let metrics_store = Arc::new(MetricsStore::new());
|
||||
crate::monitoring::spawn_metrics_collector(metrics_store.clone());
|
||||
|
||||
let api_handler = Arc::new(
|
||||
ApiHandler::new(config.clone(), state_manager.clone(), metrics_store).await?,
|
||||
);
|
||||
|
||||
// Periodic Tor address refresh (runs regardless of dev_mode)
|
||||
// Picks up hostname when Tor creates it after startup/rotation (30-60s delay)
|
||||
@@ -131,6 +139,9 @@ impl Server {
|
||||
});
|
||||
}
|
||||
|
||||
// Container health monitoring — auto-restart unhealthy containers
|
||||
crate::health_monitor::spawn_health_monitor(state_manager.clone());
|
||||
|
||||
Ok(Self {
|
||||
_config: config,
|
||||
_identity: identity,
|
||||
@@ -140,37 +151,71 @@ impl Server {
|
||||
}
|
||||
|
||||
pub async fn serve(&self, addr: SocketAddr) -> Result<()> {
|
||||
self.serve_with_shutdown(addr, std::future::pending()).await
|
||||
}
|
||||
|
||||
/// Serve with a graceful shutdown signal.
|
||||
/// When the shutdown future completes, stop accepting new connections and drain in-flight requests.
|
||||
pub async fn serve_with_shutdown(
|
||||
&self,
|
||||
addr: SocketAddr,
|
||||
shutdown: impl std::future::Future<Output = ()>,
|
||||
) -> Result<()> {
|
||||
let listener = TcpListener::bind(addr).await?;
|
||||
let active_connections = Arc::new(tokio::sync::Semaphore::new(1024));
|
||||
|
||||
tokio::pin!(shutdown);
|
||||
|
||||
loop {
|
||||
let (stream, peer_addr) = match listener.accept().await {
|
||||
Ok(conn) => conn,
|
||||
Err(e) => {
|
||||
error!("Failed to accept connection: {}", e);
|
||||
continue;
|
||||
tokio::select! {
|
||||
result = listener.accept() => {
|
||||
let (stream, peer_addr) = match result {
|
||||
Ok(conn) => conn,
|
||||
Err(e) => {
|
||||
error!("Failed to accept connection: {}", e);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let handler = self.api_handler.clone();
|
||||
let permit = active_connections.clone().acquire_owned().await;
|
||||
|
||||
tokio::spawn(async move {
|
||||
let _permit = permit;
|
||||
let service = service_fn(move |req| {
|
||||
let handler = handler.clone();
|
||||
async move {
|
||||
handler.handle_request(req).await
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("{}", e)))
|
||||
}
|
||||
});
|
||||
|
||||
if let Err(e) = Http::new()
|
||||
.http1_keep_alive(false)
|
||||
.serve_connection(stream, service)
|
||||
.with_upgrades()
|
||||
.await
|
||||
{
|
||||
error!("Error serving connection from {}: {}", peer_addr, e);
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
let handler = self.api_handler.clone();
|
||||
|
||||
tokio::spawn(async move {
|
||||
let service = service_fn(move |req| {
|
||||
let handler = handler.clone();
|
||||
async move {
|
||||
handler.handle_request(req).await
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("{}", e)))
|
||||
_ = &mut shutdown => {
|
||||
info!("Shutdown signal received, draining connections...");
|
||||
// Wait up to 5 seconds for in-flight requests to complete
|
||||
let drain_start = std::time::Instant::now();
|
||||
let drain_timeout = std::time::Duration::from_secs(5);
|
||||
while active_connections.available_permits() < 1024 {
|
||||
if drain_start.elapsed() > drain_timeout {
|
||||
warn!("Drain timeout reached, forcing shutdown");
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
|
||||
}
|
||||
});
|
||||
|
||||
if let Err(e) = Http::new()
|
||||
.http1_keep_alive(false)
|
||||
.serve_connection(stream, service)
|
||||
.with_upgrades()
|
||||
.await
|
||||
{
|
||||
error!("Error serving connection from {}: {}", peer_addr, e);
|
||||
info!("Shutdown complete");
|
||||
return Ok(());
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user