perf: move crash recovery to background for instant health endpoint

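Crash recovery (check_for_crash + recover_containers +
start_stopped_containers) now runs in a background tokio task.
The health endpoint is available immediately on startup instead of
blocking for 260+ seconds while containers restart sequentially.

Two behavior changes come with this: write_pid_marker now runs before
check_for_crash instead of after it, and a check_for_crash error is
logged as a warning instead of being returned from main.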

This directly fixes the .198 boot recovery timeout issue where the
backend took 260s to become healthy after restart.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dorian
2026-03-14 03:44:33 +00:00
parent 75d63d26b4
commit 6c05b27ec2
2 changed files with 33 additions and 21 deletions

View File

@@ -66,28 +66,40 @@ async fn main() -> Result<()> {
let config = Config::load().await?;
info!("📁 Data directory: {}", config.data_dir.display());
// Crash recovery: check if previous instance shut down cleanly
if let Some(containers) = crash_recovery::check_for_crash(&config.data_dir).await? {
info!("🔧 Recovering {} containers from previous crash...", containers.len());
let report = crash_recovery::recover_containers(&containers).await;
info!(
"🔧 Recovery complete: {}/{} containers restarted (failed: {:?})",
report.recovered, report.total, report.failed
);
}
// Start any stopped containers (handles clean reboot where PID was removed)
let boot_report = crash_recovery::start_stopped_containers().await;
if boot_report.total > 0 {
info!(
"🔄 Boot startup: {}/{} containers started (failed: {:?})",
boot_report.recovered, boot_report.total, boot_report.failed
);
}
// Write PID marker so we can detect crashes on next startup
// Write PID marker early so we can detect crashes on next startup
crash_recovery::write_pid_marker(&config.data_dir).await?;
// Crash recovery runs in background so health endpoint is available immediately
{
let data_dir = config.data_dir.clone();
tokio::spawn(async move {
// Check if previous instance shut down cleanly
match crash_recovery::check_for_crash(&data_dir).await {
Ok(Some(containers)) => {
info!("🔧 Recovering {} containers from previous crash...", containers.len());
let report = crash_recovery::recover_containers(&containers).await;
info!(
"🔧 Recovery complete: {}/{} containers restarted (failed: {:?})",
report.recovered, report.total, report.failed
);
}
Ok(None) => {}
Err(e) => {
tracing::warn!("Crash recovery check failed: {}", e);
}
}
// Start any stopped containers (handles clean reboot)
let boot_report = crash_recovery::start_stopped_containers().await;
if boot_report.total > 0 {
info!(
"🔄 Boot startup: {}/{} containers started (failed: {:?})",
boot_report.recovered, boot_report.total, boot_report.failed
);
}
});
}
// In dev mode, ensure a default user exists so login works without manual setup
if config.dev_mode {
let auth = AuthManager::new(config.data_dir.clone());