feat: add systemd watchdog, OOM detection, disk growth alerting
MEM-01: OOM kill detection via dmesg checks every 5 minutes
MEM-03: Disk growth rate tracking (288 samples over 24h), warns at >1GB/day
MEM-04: Systemd watchdog (WatchdogSec=60, sd_notify::Watchdog every 30s)
Service Type=notify for proper startup notification
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -80,6 +80,9 @@ qrcode = "0.14"
|
||||
data-encoding = "2.6"
|
||||
zeroize = { version = "1.7", features = ["derive"] }
|
||||
|
||||
# Systemd watchdog notification
|
||||
sd-notify = "0.4"
|
||||
|
||||
[dev-dependencies]
|
||||
tokio-test = "0.4"
|
||||
tempfile = "3.10"
|
||||
|
||||
@@ -95,8 +95,29 @@ async fn auto_cleanup() -> Result<u64> {
|
||||
Ok(freed)
|
||||
}
|
||||
|
||||
/// Check for OOM kills in kernel logs.
|
||||
/// Returns a list of process names that were OOM-killed since boot.
|
||||
async fn check_oom_kills() -> Vec<String> {
|
||||
let output = tokio::process::Command::new("sudo")
|
||||
.args(["dmesg", "--level=err,crit", "--notime"])
|
||||
.output()
|
||||
.await;
|
||||
|
||||
match output {
|
||||
Ok(out) if out.status.success() => {
|
||||
let stdout = String::from_utf8_lossy(&out.stdout);
|
||||
stdout
|
||||
.lines()
|
||||
.filter(|l| l.contains("oom-kill") || l.contains("Out of memory"))
|
||||
.map(|l| l.to_string())
|
||||
.collect()
|
||||
}
|
||||
_ => Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Spawn a background task that monitors disk usage every 5 minutes.
|
||||
/// Triggers automatic cleanup at 90% and logs warnings at 85%.
|
||||
/// Also checks for OOM kills and tracks disk growth rate.
|
||||
pub fn spawn_disk_monitor(data_dir: std::path::PathBuf) {
|
||||
tokio::spawn(async move {
|
||||
// Initial delay to let system stabilize
|
||||
@@ -104,12 +125,57 @@ pub fn spawn_disk_monitor(data_dir: std::path::PathBuf) {
|
||||
|
||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(300));
|
||||
let mut last_warning_level: Option<&str> = None;
|
||||
let mut last_disk_used: Option<u64> = None;
|
||||
let mut last_oom_count: usize = 0;
|
||||
let mut disk_samples: Vec<(std::time::Instant, u64)> = Vec::new();
|
||||
|
||||
loop {
|
||||
interval.tick().await;
|
||||
|
||||
// Check for OOM kills
|
||||
let oom_lines = check_oom_kills().await;
|
||||
if oom_lines.len() > last_oom_count {
|
||||
let new_kills = &oom_lines[last_oom_count..];
|
||||
for kill in new_kills {
|
||||
warn!("OOM kill detected: {}", kill);
|
||||
}
|
||||
// Write OOM alert for frontend
|
||||
let alert_path = data_dir.join("oom-alert.json");
|
||||
let _ = tokio::fs::write(
|
||||
&alert_path,
|
||||
serde_json::json!({
|
||||
"count": oom_lines.len(),
|
||||
"latest": oom_lines.last(),
|
||||
"timestamp": chrono::Utc::now().to_rfc3339(),
|
||||
})
|
||||
.to_string(),
|
||||
)
|
||||
.await;
|
||||
last_oom_count = oom_lines.len();
|
||||
}
|
||||
|
||||
match check_disk_usage().await {
|
||||
Ok((_used, _total, percent)) => {
|
||||
Ok((used, _total, percent)) => {
|
||||
// Track disk growth rate
|
||||
let now = std::time::Instant::now();
|
||||
disk_samples.push((now, used));
|
||||
// Keep only last 288 samples (24h at 5min intervals)
|
||||
if disk_samples.len() > 288 {
|
||||
disk_samples.remove(0);
|
||||
}
|
||||
// Calculate daily growth rate from oldest to newest sample
|
||||
if disk_samples.len() >= 12 {
|
||||
let (oldest_time, oldest_used) = disk_samples.first().unwrap();
|
||||
let elapsed_hours = now.duration_since(*oldest_time).as_secs() as f64 / 3600.0;
|
||||
if elapsed_hours > 0.5 {
|
||||
let growth_bytes = used.saturating_sub(*oldest_used);
|
||||
let daily_growth_gb = (growth_bytes as f64 / 1_073_741_824.0) * (24.0 / elapsed_hours);
|
||||
if daily_growth_gb > 1.0 {
|
||||
warn!("Disk growing at {:.1} GB/day — may fill up", daily_growth_gb);
|
||||
}
|
||||
}
|
||||
}
|
||||
let _ = last_disk_used.insert(used);
|
||||
if percent >= 90.0 {
|
||||
if last_warning_level != Some("critical") {
|
||||
warn!("Disk usage critical: {:.1}% — triggering automatic cleanup", percent);
|
||||
|
||||
@@ -122,6 +122,18 @@ async fn main() -> Result<()> {
|
||||
info!("RPC API: http://{}/rpc/v1", addr);
|
||||
info!("WebSocket: ws://{}/ws", addr);
|
||||
|
||||
// Notify systemd that we're ready (Type=notify)
|
||||
let _ = sd_notify::notify(true, &[sd_notify::NotifyState::Ready]);
|
||||
|
||||
// Spawn systemd watchdog ping (WatchdogSec=60, ping every 30s)
|
||||
tokio::spawn(async {
|
||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(30));
|
||||
loop {
|
||||
interval.tick().await;
|
||||
let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Watchdog]);
|
||||
}
|
||||
});
|
||||
|
||||
// Graceful shutdown: wait for SIGTERM or SIGINT
|
||||
let shutdown = async {
|
||||
let mut sigterm = signal::unix::signal(signal::unix::SignalKind::terminate())
|
||||
|
||||
Reference in New Issue
Block a user