feat: add CPU load alert, lower disk/RAM thresholds (SCALE-04)

- Add CpuLoad alert rule: fires when the 5-minute load average exceeds 2x the CPU core count
- Lower disk usage alert from 90% to 80%
- Lower RAM usage alert from 90% to 80%
- Add num_cpus dependency to detect the CPU core count at runtime

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dorian
2026-03-14 03:29:29 +00:00
parent a38cd87fbb
commit ebad38cdaf
3 changed files with 33 additions and 4 deletions

View File

@@ -83,6 +83,9 @@ zeroize = { version = "1.7", features = ["derive"] }
# Systemd watchdog notification
sd-notify = "0.4"
# CPU core count detection
num_cpus = "1.16"
[dev-dependencies]
tokio-test = "0.4"
tempfile = "3.10"

View File

@@ -64,6 +64,7 @@ const MAX_ALERT_HISTORY: usize = 100;
pub enum AlertRuleKind {
DiskUsage,
RamUsage,
CpuLoad,
ContainerCrash,
BackendErrorSpike,
SslCertExpiry,
@@ -95,15 +96,21 @@ impl AlertRule {
vec![
AlertRule {
kind: AlertRuleKind::DiskUsage,
threshold: 90.0,
threshold: 80.0,
enabled: true,
description: "Disk usage exceeds threshold".to_string(),
},
AlertRule {
kind: AlertRuleKind::RamUsage,
threshold: 90.0,
threshold: 80.0,
enabled: true,
description: "RAM usage exceeds threshold".to_string(),
description: "Total memory usage exceeds threshold".to_string(),
},
AlertRule {
kind: AlertRuleKind::CpuLoad,
threshold: 2.0,
enabled: true,
description: "CPU load exceeds 2x core count for 5 minutes".to_string(),
},
AlertRule {
kind: AlertRuleKind::ContainerCrash,
@@ -335,6 +342,25 @@ impl MetricsStore {
}
}
}
AlertRuleKind::CpuLoad => {
// Alert if 5-min load average exceeds threshold * core count
let cores = num_cpus::get() as f64;
let max_load = rule.threshold * cores;
if snapshot.system.load_avg_5 > max_load {
new_alerts.push(FiredAlert {
id: format!("cpu-{}", ts),
kind: AlertRuleKind::CpuLoad,
message: format!(
"CPU load at {:.1} (threshold: {:.0} = {:.0}x {} cores)",
snapshot.system.load_avg_5, max_load, rule.threshold, cores as u32
),
value: snapshot.system.load_avg_5,
threshold: max_load,
timestamp: ts,
acknowledged: false,
});
}
}
AlertRuleKind::BackendErrorSpike => {
if snapshot.rpc_latency_ms > rule.threshold {
new_alerts.push(FiredAlert {