feat: auto-start stopped containers on boot, add failure recovery tests

Added start_stopped_containers() to crash_recovery.rs that starts all
exited/created containers on backend startup, fixing the issue where
containers didn't come back after clean reboot (PID marker removed by
systemd stop). Created test-failure-recovery.sh covering 5 failure
scenarios: container crash, backend restart, Tor restart, full reboot,
and Tor traffic block (UPTIME-02).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Dorian
2026-03-13 03:55:14 +00:00
parent 4500e949d8
commit 3e121b525f
4 changed files with 232 additions and 1 deletions

View File

@@ -222,6 +222,37 @@ fn is_process_running(pid: u32) -> bool {
std::path::Path::new(&format!("/proc/{}", pid)).exists()
}
/// Start all stopped containers that were previously installed.
/// Runs on every startup to ensure containers come back after clean reboots.
/// The crash recovery (PID-based) handles dirty shutdowns; this handles clean ones.
pub async fn start_stopped_containers() -> RecoveryReport {
let output = tokio::process::Command::new("sudo")
.args(["podman", "ps", "-a", "--filter", "status=exited", "--filter", "status=created", "--format", "{{.Names}}"])
.output()
.await;
let names: Vec<String> = match output {
Ok(o) if o.status.success() => {
String::from_utf8_lossy(&o.stdout)
.lines()
.filter(|l| !l.is_empty())
.map(|s| s.to_string())
.collect()
}
_ => Vec::new(),
};
if names.is_empty() {
return RecoveryReport { total: 0, recovered: 0, failed: Vec::new() };
}
info!("Starting {} stopped containers after boot...", names.len());
let records: Vec<RunningContainerRecord> = names.iter()
.map(|n| RunningContainerRecord { name: n.clone(), image: String::new() })
.collect();
recover_containers(&records).await
}
/// Spawn a background task that periodically saves the container snapshot.
pub fn spawn_snapshot_task(data_dir: PathBuf) {
tokio::spawn(async move {

View File

@@ -76,6 +76,15 @@ async fn main() -> Result<()> {
);
}
// Start any stopped containers (handles clean reboot where PID was removed)
let boot_report = crash_recovery::start_stopped_containers().await;
if boot_report.total > 0 {
info!(
"🔄 Boot startup: {}/{} containers started (failed: {:?})",
boot_report.recovered, boot_report.total, boot_report.failed
);
}
// Write PID marker so we can detect crashes on next startup
crash_recovery::write_pid_marker(&config.data_dir).await?;