perf: move crash recovery to background for instant health endpoint

Crash recovery (check_for_crash + recover_containers + start_stopped_containers) now runs in a background tokio task. The health endpoint is available immediately on startup instead of blocking for 260+ seconds while containers restart sequentially.

This directly fixes the .198 boot recovery timeout issue, where the backend took 260s to become healthy after a restart.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-14 03:44:33 +00:00
parent 75d63d26b4
commit 6c05b27ec2
2 changed files with 33 additions and 21 deletions
--- a/core/archipelago/src/main.rs
+++ b/core/archipelago/src/main.rs
@@ -66,28 +66,40 @@ async fn main() -> Result<()> {
    let config = Config::load().await?;
    info!("📁 Data directory: {}", config.data_dir.display());

-    // Crash recovery: check if previous instance shut down cleanly
-    if let Some(containers) = crash_recovery::check_for_crash(&config.data_dir).await? {
-        info!("🔧 Recovering {} containers from previous crash...", containers.len());
-        let report = crash_recovery::recover_containers(&containers).await;
-        info!(
-            "🔧 Recovery complete: {}/{} containers restarted (failed: {:?})",
-            report.recovered, report.total, report.failed
-        );
-    }
-
-    // Start any stopped containers (handles clean reboot where PID was removed)
-    let boot_report = crash_recovery::start_stopped_containers().await;
-    if boot_report.total > 0 {
-        info!(
-            "🔄 Boot startup: {}/{} containers started (failed: {:?})",
-            boot_report.recovered, boot_report.total, boot_report.failed
-        );
-    }
-
-    // Write PID marker so we can detect crashes on next startup
+    // Write PID marker early so we can detect crashes on next startup
    crash_recovery::write_pid_marker(&config.data_dir).await?;

+    // Crash recovery runs in background so health endpoint is available immediately
+    {
+        let data_dir = config.data_dir.clone();
+        tokio::spawn(async move {
+            // Check if previous instance shut down cleanly
+            match crash_recovery::check_for_crash(&data_dir).await {
+                Ok(Some(containers)) => {
+                    info!("🔧 Recovering {} containers from previous crash...", containers.len());
+                    let report = crash_recovery::recover_containers(&containers).await;
+                    info!(
+                        "🔧 Recovery complete: {}/{} containers restarted (failed: {:?})",
+                        report.recovered, report.total, report.failed
+                    );
+                }
+                Ok(None) => {}
+                Err(e) => {
+                    tracing::warn!("Crash recovery check failed: {}", e);
+                }
+            }
+
+            // Start any stopped containers (handles clean reboot)
+            let boot_report = crash_recovery::start_stopped_containers().await;
+            if boot_report.total > 0 {
+                info!(
+                    "🔄 Boot startup: {}/{} containers started (failed: {:?})",
+                    boot_report.recovered, boot_report.total, boot_report.failed
+                );
+            }
+        });
+    }
+
    // In dev mode, ensure a default user exists so login works without manual setup
    if config.dev_mode {
        let auth = AuthManager::new(config.data_dir.clone());