perf: move crash recovery to background for instant health endpoint
Crash recovery (check_for_crash + recover_containers + start_stopped_containers) now runs in a background tokio task. The health endpoint is available immediately on startup instead of blocking for 260+ seconds while containers restart sequentially.

This directly fixes the .198 boot recovery timeout issue, where the backend took 260s to become healthy after a restart.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -66,28 +66,40 @@ async fn main() -> Result<()> {
|
||||
let config = Config::load().await?;
|
||||
info!("📁 Data directory: {}", config.data_dir.display());
|
||||
|
||||
// Crash recovery: check if previous instance shut down cleanly
|
||||
if let Some(containers) = crash_recovery::check_for_crash(&config.data_dir).await? {
|
||||
info!("🔧 Recovering {} containers from previous crash...", containers.len());
|
||||
let report = crash_recovery::recover_containers(&containers).await;
|
||||
info!(
|
||||
"🔧 Recovery complete: {}/{} containers restarted (failed: {:?})",
|
||||
report.recovered, report.total, report.failed
|
||||
);
|
||||
}
|
||||
|
||||
// Start any stopped containers (handles clean reboot where PID was removed)
|
||||
let boot_report = crash_recovery::start_stopped_containers().await;
|
||||
if boot_report.total > 0 {
|
||||
info!(
|
||||
"🔄 Boot startup: {}/{} containers started (failed: {:?})",
|
||||
boot_report.recovered, boot_report.total, boot_report.failed
|
||||
);
|
||||
}
|
||||
|
||||
// Write PID marker so we can detect crashes on next startup
|
||||
// Write PID marker early so we can detect crashes on next startup
|
||||
crash_recovery::write_pid_marker(&config.data_dir).await?;
|
||||
|
||||
// Crash recovery runs in background so health endpoint is available immediately
|
||||
{
|
||||
let data_dir = config.data_dir.clone();
|
||||
tokio::spawn(async move {
|
||||
// Check if previous instance shut down cleanly
|
||||
match crash_recovery::check_for_crash(&data_dir).await {
|
||||
Ok(Some(containers)) => {
|
||||
info!("🔧 Recovering {} containers from previous crash...", containers.len());
|
||||
let report = crash_recovery::recover_containers(&containers).await;
|
||||
info!(
|
||||
"🔧 Recovery complete: {}/{} containers restarted (failed: {:?})",
|
||||
report.recovered, report.total, report.failed
|
||||
);
|
||||
}
|
||||
Ok(None) => {}
|
||||
Err(e) => {
|
||||
tracing::warn!("Crash recovery check failed: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
// Start any stopped containers (handles clean reboot)
|
||||
let boot_report = crash_recovery::start_stopped_containers().await;
|
||||
if boot_report.total > 0 {
|
||||
info!(
|
||||
"🔄 Boot startup: {}/{} containers started (failed: {:?})",
|
||||
boot_report.recovered, boot_report.total, boot_report.failed
|
||||
);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// In dev mode, ensure a default user exists so login works without manual setup
|
||||
if config.dev_mode {
|
||||
let auth = AuthManager::new(config.data_dir.clone());
|
||||
|
||||
Reference in New Issue
Block a user