fix(quadlet): http:// double-prefix + companion migration race
Two bugs surfaced by the first real-node validation of Phase 3.2-3.4
on .228 (2026-05-02), both caught before flipping the default.
Bug 1 — translate_health_check double-prefixed http://. Manifests in
the wild carry the scheme inside the endpoint string ("http://localhost:8175"),
and we were prepending another http:// unconditionally. Result on .228:
every backend HealthCmd read `curl -fsS -m 5 http://http://localhost...`,
every probe failed, fedimint hit a 14-restart loop. Now we accept either
form and skip appending hc.path when the endpoint already carries one.
Regression test asserts no double-prefix and that an in-endpoint path
is honoured.
Bug 2 — Phase 3.3 migration ran for UI companions (bitcoin-ui /
electrs-ui / lnd-ui) that have shipped via Quadlet since v1.7.41.
Migration tore down the running companion + raced companion.rs render,
producing "Phase 3.3: re-install archy-bitcoin-ui via Quadlet" reconcile
errors and leaving archy-bitcoin-ui down. Companions now short-circuit
out of migrate_to_quadlet_if_needed before any IO. Also: when try_exists
returns Err for an unrelated reason (permissions, EIO), we now skip
migration instead of treating "I can't tell" as "go ahead and migrate" —
migrating on top of a possibly-existing unit is destructive.
What this does not fix yet:
* the orchestrator's reconciler iterating every manifest in
/opt/archipelago/apps/, not just installed apps. Pre-existing
behavior (also affects the legacy path) — separate scope.
* fedimint /data UID mismatch surfaced when Quadlet started fedimint
fresh. Likely orthogonal — defer.
* no rollback when install_via_quadlet fails after a remove_container.
Tracked as Phase 3.3.1 — defer.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -532,13 +532,37 @@ impl ProdContainerOrchestrator {
|
||||
lm: &LoadedManifest,
|
||||
name: &str,
|
||||
) -> Result<Option<ReconcileAction>> {
|
||||
// Skip companion apps — bitcoin-ui / electrs-ui / lnd-ui have shipped
|
||||
// via Quadlet since v1.7.41 (companion.rs renders the unit). Running
|
||||
// migration for them races companion rendering: when migration ran
|
||||
// for archy-bitcoin-ui on .228 2026-05-02 it tore down the running
|
||||
// companion + tried to start a duplicate unit and failed. Companions
|
||||
// are managed by their own path, never the legacy podman-create one,
|
||||
// so there's nothing here to migrate.
|
||||
let app_id = lm.manifest.app.id.as_str();
|
||||
if UI_APP_IDS.contains(&app_id) {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let unit_dir = quadlet::unit_dir()
|
||||
.await
|
||||
.context("locate user quadlet unit dir for migration check")?;
|
||||
let unit_path = unit_dir.join(format!("{name}.container"));
|
||||
if tokio::fs::try_exists(&unit_path).await.unwrap_or(false) {
|
||||
// Already on the Quadlet path — nothing to migrate.
|
||||
return Ok(None);
|
||||
// Treat any IO error from try_exists as "the unit might be there" —
|
||||
// migrating on top of an existing unit is destructive (we'd podman rm
|
||||
// the running container then fight the existing service over startup
|
||||
// ordering), so be cautious when we can't read the answer.
|
||||
match tokio::fs::try_exists(&unit_path).await {
|
||||
Ok(true) => return Ok(None),
|
||||
Ok(false) => {} // proceed
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
path = %unit_path.display(),
|
||||
error = %e,
|
||||
"Phase 3.3: skipping migration — couldn't determine if Quadlet unit already exists"
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
|
||||
// No unit on disk. If a pre-Phase-3 container exists for this
|
||||
|
||||
@@ -365,8 +365,14 @@ impl QuadletUnit {
|
||||
///
|
||||
/// Supported shapes:
|
||||
/// - type: tcp, endpoint: "host:port" → `nc -z host port`
|
||||
/// - type: http, endpoint: "host:port", path → `curl -fsS http://host:port<path>`
|
||||
/// - type: http, endpoint: "host:port" or "http(s)://host:port", path → curl
|
||||
/// - type: cmd, endpoint: "<shell command>" → `<shell command>` verbatim
|
||||
///
|
||||
/// For type=http we accept the endpoint with or without scheme; manifests
|
||||
/// in the wild use both forms (`localhost:8175` and
|
||||
/// `http://localhost:8175/`). Earlier we blindly prepended `http://` even
|
||||
/// when one was already there, producing `http://http://...` HealthCmds
|
||||
/// that pasted on .228 2026-05-02 and failed every probe.
|
||||
fn translate_health_check(
|
||||
hc: &archipelago_container::HealthCheck,
|
||||
) -> Option<HealthSpec> {
|
||||
@@ -379,11 +385,34 @@ fn translate_health_check(
|
||||
format!("nc -z {host} {port}")
|
||||
}
|
||||
"http" => {
|
||||
let endpoint = hc.endpoint.as_deref()?;
|
||||
let path = hc.path.as_deref().unwrap_or("/");
|
||||
let endpoint = hc.endpoint.as_deref()?.trim();
|
||||
// Accept either bare host:port or a full URL. If endpoint
|
||||
// already includes a scheme we use it as-is; otherwise we
|
||||
// prepend http://. This keeps existing http://foo manifests
|
||||
// working and stops the http://http:// double-prefix bug.
|
||||
let url = if endpoint.starts_with("http://") || endpoint.starts_with("https://") {
|
||||
endpoint.to_string()
|
||||
} else {
|
||||
format!("http://{endpoint}")
|
||||
};
|
||||
// If the endpoint already carried a path component, honour it
|
||||
// and ignore hc.path (manifests that bake the path into the
|
||||
// endpoint don't expect to merge a separate path field).
|
||||
// Otherwise append hc.path (default "/").
|
||||
let already_has_path = url
|
||||
.splitn(4, '/')
|
||||
.nth(3)
|
||||
.map(|p| !p.is_empty())
|
||||
.unwrap_or(false);
|
||||
let final_url = if already_has_path {
|
||||
url
|
||||
} else {
|
||||
let path = hc.path.as_deref().unwrap_or("/");
|
||||
format!("{url}{path}")
|
||||
};
|
||||
// -fsS: fail on non-2xx, silent except on error, show errors.
|
||||
// -m 5: per-request timeout matches the default manifest timeout.
|
||||
format!("curl -fsS -m 5 http://{endpoint}{path}")
|
||||
format!("curl -fsS -m 5 {final_url}")
|
||||
}
|
||||
"cmd" => hc.endpoint.as_deref()?.to_string(),
|
||||
_ => return None,
|
||||
@@ -921,6 +950,38 @@ app:
|
||||
assert!(translate_health_check(&badtcp).is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn translate_health_check_http_does_not_double_prefix_scheme() {
|
||||
// Regression: on .228 2026-05-02 we shipped HealthCmds reading
|
||||
// `curl -fsS -m 5 http://http://localhost:8175/` because manifests
|
||||
// in the wild carry the scheme inside the endpoint string. Every
|
||||
// probe failed and the unit looped. Now we accept either form.
|
||||
use archipelago_container::HealthCheck;
|
||||
let with_scheme = HealthCheck {
|
||||
check_type: "http".into(),
|
||||
endpoint: Some("http://localhost:8175".into()),
|
||||
path: Some("/".into()),
|
||||
interval: "30s".into(),
|
||||
timeout: "5s".into(),
|
||||
retries: 3,
|
||||
};
|
||||
let h = translate_health_check(&with_scheme).expect("with-scheme must translate");
|
||||
assert_eq!(h.cmd, "curl -fsS -m 5 http://localhost:8175/");
|
||||
assert!(!h.cmd.contains("http://http://"), "got: {}", h.cmd);
|
||||
|
||||
let with_https = HealthCheck {
|
||||
check_type: "http".into(),
|
||||
endpoint: Some("https://example.local/health".into()),
|
||||
path: None,
|
||||
interval: "30s".into(),
|
||||
timeout: "5s".into(),
|
||||
retries: 3,
|
||||
};
|
||||
let h = translate_health_check(&with_https).expect("https must translate");
|
||||
// Endpoint already has /health → don't append the default "/".
|
||||
assert_eq!(h.cmd, "curl -fsS -m 5 https://example.local/health");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn from_manifest_picks_up_health_check() {
|
||||
let yaml = r#"
|
||||
|
||||
Reference in New Issue
Block a user