From a9c2685d8ba5fa91a6afa1445246b31e43740a64 Mon Sep 17 00:00:00 2001 From: archipelago Date: Sat, 2 May 2026 06:37:37 -0400 Subject: [PATCH] fix(quadlet): http:// double-prefix + companion migration race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs surfaced by the first real-node validation of Phase 3.2-3.4 on .228 (2026-05-02), both caught before flipping the default. Bug 1 — translate_health_check double-prefixed http://. Manifests in the wild carry the scheme inside the endpoint string ("http://localhost:8175"), and we were prepending another http:// unconditionally. Result on .228: every backend HealthCmd read `curl -fsS -m 5 http://http://localhost...`, every probe failed, fedimint hit a 14-restart loop. Now we accept either form and skip appending hc.path when the endpoint already carries one. Regression test asserts no double-prefix and that an in-endpoint path is honoured. Bug 2 — Phase 3.3 migration ran for UI companions (bitcoin-ui / electrs-ui / lnd-ui) that have shipped via Quadlet since v1.7.41. Migration tore down the running companion + raced companion.rs render, producing "Phase 3.3: re-install archy-bitcoin-ui via Quadlet" reconcile errors and leaving archy-bitcoin-ui down. Companions now short-circuit out of migrate_to_quadlet_if_needed before any IO. Also: when try_exists returns Err for an unrelated reason (permissions, EIO), we now skip migration instead of treating "I can't tell" as "go ahead and migrate" — migrating on top of a possibly-existing unit is destructive. What this does not fix yet: * the orchestrator's reconciler iterating every manifest in /opt/archipelago/apps/, not just installed apps. Pre-existing behavior (also affects the legacy path) — separate scope. * fedimint /data UID mismatch surfaced when Quadlet started fedimint fresh. Likely orthogonal — defer. * no rollback when install_via_quadlet fails after a remove_container. Tracked as Phase 3.3.1 — defer. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/container/prod_orchestrator.rs | 30 +++++++- core/archipelago/src/container/quadlet.rs | 69 +++++++++++++++++-- 2 files changed, 92 insertions(+), 7 deletions(-) diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs index 1abc5dee..e0b54a2c 100644 --- a/core/archipelago/src/container/prod_orchestrator.rs +++ b/core/archipelago/src/container/prod_orchestrator.rs @@ -532,13 +532,37 @@ impl ProdContainerOrchestrator { lm: &LoadedManifest, name: &str, ) -> Result> { + // Skip companion apps — bitcoin-ui / electrs-ui / lnd-ui have shipped + // via Quadlet since v1.7.41 (companion.rs renders the unit). Running + // migration for them races companion rendering: when migration ran + // for archy-bitcoin-ui on .228 2026-05-02 it tore down the running + // companion + tried to start a duplicate unit and failed. Companions + // are managed by their own path, never the legacy podman-create one, + // so there's nothing here to migrate. + let app_id = lm.manifest.app.id.as_str(); + if UI_APP_IDS.contains(&app_id) { + return Ok(None); + } + let unit_dir = quadlet::unit_dir() .await .context("locate user quadlet unit dir for migration check")?; let unit_path = unit_dir.join(format!("{name}.container")); - if tokio::fs::try_exists(&unit_path).await.unwrap_or(false) { - // Already on the Quadlet path — nothing to migrate. - return Ok(None); + // Treat any IO error from try_exists as "the unit might be there" — + // migrating on top of an existing unit is destructive (we'd podman rm + // the running container then fight the existing service over startup + // ordering), so be cautious when we can't read the answer. + match tokio::fs::try_exists(&unit_path).await { + Ok(true) => return Ok(None), + Ok(false) => {} // proceed + Err(e) => { + tracing::warn!( + path = %unit_path.display(), + error = %e, + "Phase 3.3: skipping migration — couldn't determine if Quadlet unit already exists" + ); + return Ok(None); + } } // No unit on disk. If a pre-Phase-3 container exists for this diff --git a/core/archipelago/src/container/quadlet.rs b/core/archipelago/src/container/quadlet.rs index 5c3a06b8..ecd5b07a 100644 --- a/core/archipelago/src/container/quadlet.rs +++ b/core/archipelago/src/container/quadlet.rs @@ -365,8 +365,14 @@ impl QuadletUnit { /// /// Supported shapes: /// - type: tcp, endpoint: "host:port" → `nc -z host port` -/// - type: http, endpoint: "host:port", path → `curl -fsS http://host:port` +/// - type: http, endpoint: "host:port" or "http(s)://host:port", path → curl /// - type: cmd, endpoint: "" → `` verbatim +/// +/// For type=http we accept the endpoint with or without scheme; manifests +/// in the wild use both forms (`localhost:8175` and +/// `http://localhost:8175/`). Earlier we blindly prepended `http://` even +/// when one was already there, producing `http://http://...` HealthCmds +/// that pasted on .228 2026-05-02 and failed every probe. fn translate_health_check( hc: &archipelago_container::HealthCheck, ) -> Option { @@ -379,11 +385,34 @@ fn translate_health_check( format!("nc -z {host} {port}") } "http" => { - let endpoint = hc.endpoint.as_deref()?; - let path = hc.path.as_deref().unwrap_or("/"); + let endpoint = hc.endpoint.as_deref()?.trim(); + // Accept either bare host:port or a full URL. If endpoint + // already includes a scheme we use it as-is; otherwise we + // prepend http://. This keeps existing http://foo manifests + // working and stops the http://http:// double-prefix bug. + let url = if endpoint.starts_with("http://") || endpoint.starts_with("https://") { + endpoint.to_string() + } else { + format!("http://{endpoint}") + }; + // If the endpoint already carried a path component, honour it + // and ignore hc.path (manifests that bake the path into the + // endpoint don't expect to merge a separate path field). + // Otherwise append hc.path (default "/"). + let already_has_path = url + .splitn(4, '/') + .nth(3) + .map(|p| !p.is_empty()) + .unwrap_or(false); + let final_url = if already_has_path { + url + } else { + let path = hc.path.as_deref().unwrap_or("/"); + format!("{url}{path}") + }; // -fsS: fail on non-2xx, silent except on error, show errors. // -m 5: per-request timeout matches the default manifest timeout. - format!("curl -fsS -m 5 http://{endpoint}{path}") + format!("curl -fsS -m 5 {final_url}") } "cmd" => hc.endpoint.as_deref()?.to_string(), _ => return None, @@ -921,6 +950,38 @@ app: assert!(translate_health_check(&badtcp).is_none()); } + #[test] + fn translate_health_check_http_does_not_double_prefix_scheme() { + // Regression: on .228 2026-05-02 we shipped HealthCmds reading + // `curl -fsS -m 5 http://http://localhost:8175/` because manifests + // in the wild carry the scheme inside the endpoint string. Every + // probe failed and the unit looped. Now we accept either form. + use archipelago_container::HealthCheck; + let with_scheme = HealthCheck { + check_type: "http".into(), + endpoint: Some("http://localhost:8175".into()), + path: Some("/".into()), + interval: "30s".into(), + timeout: "5s".into(), + retries: 3, + }; + let h = translate_health_check(&with_scheme).expect("with-scheme must translate"); + assert_eq!(h.cmd, "curl -fsS -m 5 http://localhost:8175/"); + assert!(!h.cmd.contains("http://http://"), "got: {}", h.cmd); + + let with_https = HealthCheck { + check_type: "http".into(), + endpoint: Some("https://example.local/health".into()), + path: None, + interval: "30s".into(), + timeout: "5s".into(), + retries: 3, + }; + let h = translate_health_check(&with_https).expect("https must translate"); + // Endpoint already has /health → don't append the default "/". + assert_eq!(h.cmd, "curl -fsS -m 5 https://example.local/health"); + } + #[test] fn from_manifest_picks_up_health_check() { let yaml = r#"