feat: auto-start stopped containers on boot, add failure recovery tests
Added start_stopped_containers() to crash_recovery.rs, which starts all exited/created containers on backend startup. This fixes the issue where containers didn't come back after a clean reboot (the PID marker is removed by systemd stop). Created test-failure-recovery.sh covering 5 failure scenarios: container crash, backend restart, Tor restart, full reboot, and Tor traffic block (UPTIME-02).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -222,6 +222,37 @@ fn is_process_running(pid: u32) -> bool {
|
||||
std::path::Path::new(&format!("/proc/{}", pid)).exists()
|
||||
}
|
||||
|
||||
/// Start all stopped containers that were previously installed.
|
||||
/// Runs on every startup to ensure containers come back after clean reboots.
|
||||
/// The crash recovery (PID-based) handles dirty shutdowns; this handles clean ones.
|
||||
pub async fn start_stopped_containers() -> RecoveryReport {
|
||||
let output = tokio::process::Command::new("sudo")
|
||||
.args(["podman", "ps", "-a", "--filter", "status=exited", "--filter", "status=created", "--format", "{{.Names}}"])
|
||||
.output()
|
||||
.await;
|
||||
|
||||
let names: Vec<String> = match output {
|
||||
Ok(o) if o.status.success() => {
|
||||
String::from_utf8_lossy(&o.stdout)
|
||||
.lines()
|
||||
.filter(|l| !l.is_empty())
|
||||
.map(|s| s.to_string())
|
||||
.collect()
|
||||
}
|
||||
_ => Vec::new(),
|
||||
};
|
||||
|
||||
if names.is_empty() {
|
||||
return RecoveryReport { total: 0, recovered: 0, failed: Vec::new() };
|
||||
}
|
||||
|
||||
info!("Starting {} stopped containers after boot...", names.len());
|
||||
let records: Vec<RunningContainerRecord> = names.iter()
|
||||
.map(|n| RunningContainerRecord { name: n.clone(), image: String::new() })
|
||||
.collect();
|
||||
recover_containers(&records).await
|
||||
}
|
||||
|
||||
/// Spawn a background task that periodically saves the container snapshot.
|
||||
pub fn spawn_snapshot_task(data_dir: PathBuf) {
|
||||
tokio::spawn(async move {
|
||||
|
||||
@@ -76,6 +76,15 @@ async fn main() -> Result<()> {
|
||||
);
|
||||
}
|
||||
|
||||
// Start any stopped containers (handles clean reboot where PID was removed)
|
||||
let boot_report = crash_recovery::start_stopped_containers().await;
|
||||
if boot_report.total > 0 {
|
||||
info!(
|
||||
"🔄 Boot startup: {}/{} containers started (failed: {:?})",
|
||||
boot_report.recovered, boot_report.total, boot_report.failed
|
||||
);
|
||||
}
|
||||
|
||||
// Write PID marker so we can detect crashes on next startup
|
||||
crash_recovery::write_pid_marker(&config.data_dir).await?;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user