feat: add CPU load alert, lower disk/RAM thresholds (SCALE-04)
- Add CpuLoad alert rule: fires when the 5-minute load average exceeds 2x the core count
- Lower disk usage alert from 90% to 80%
- Lower RAM usage alert from 90% to 80%
- Add num_cpus dependency for runtime core detection

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -64,6 +64,7 @@ const MAX_ALERT_HISTORY: usize = 100;
|
||||
pub enum AlertRuleKind {
|
||||
DiskUsage,
|
||||
RamUsage,
|
||||
CpuLoad,
|
||||
ContainerCrash,
|
||||
BackendErrorSpike,
|
||||
SslCertExpiry,
|
||||
@@ -95,15 +96,21 @@ impl AlertRule {
|
||||
vec![
|
||||
AlertRule {
|
||||
kind: AlertRuleKind::DiskUsage,
|
||||
threshold: 90.0,
|
||||
threshold: 80.0,
|
||||
enabled: true,
|
||||
description: "Disk usage exceeds threshold".to_string(),
|
||||
},
|
||||
AlertRule {
|
||||
kind: AlertRuleKind::RamUsage,
|
||||
threshold: 90.0,
|
||||
threshold: 80.0,
|
||||
enabled: true,
|
||||
description: "RAM usage exceeds threshold".to_string(),
|
||||
description: "Total memory usage exceeds threshold".to_string(),
|
||||
},
|
||||
AlertRule {
|
||||
kind: AlertRuleKind::CpuLoad,
|
||||
threshold: 2.0,
|
||||
enabled: true,
|
||||
description: "CPU load exceeds 2x core count for 5 minutes".to_string(),
|
||||
},
|
||||
AlertRule {
|
||||
kind: AlertRuleKind::ContainerCrash,
|
||||
@@ -335,6 +342,25 @@ impl MetricsStore {
|
||||
}
|
||||
}
|
||||
}
|
||||
AlertRuleKind::CpuLoad => {
|
||||
// Alert if 5-min load average exceeds threshold * core count
|
||||
let cores = num_cpus::get() as f64;
|
||||
let max_load = rule.threshold * cores;
|
||||
if snapshot.system.load_avg_5 > max_load {
|
||||
new_alerts.push(FiredAlert {
|
||||
id: format!("cpu-{}", ts),
|
||||
kind: AlertRuleKind::CpuLoad,
|
||||
message: format!(
|
||||
"CPU load at {:.1} (threshold: {:.0} = {:.0}x {} cores)",
|
||||
snapshot.system.load_avg_5, max_load, rule.threshold, cores as u32
|
||||
),
|
||||
value: snapshot.system.load_avg_5,
|
||||
threshold: max_load,
|
||||
timestamp: ts,
|
||||
acknowledged: false,
|
||||
});
|
||||
}
|
||||
}
|
||||
AlertRuleKind::BackendErrorSpike => {
|
||||
if snapshot.rpc_latency_ms > rule.threshold {
|
||||
new_alerts.push(FiredAlert {
|
||||
|
||||
Reference in New Issue
Block a user