From 6e716f68b67af030121bef76aa7016abe00359a2 Mon Sep 17 00:00:00 2001 From: archipelago Date: Fri, 1 May 2026 10:45:07 -0400 Subject: [PATCH] refactor(container): move companion UIs to systemd via Quadlet MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Companion UI containers (archy-bitcoin-ui, archy-lnd-ui, archy-electrs-ui) used to be launched as fire-and-forget tokio::spawn blocks from install.rs. If archipelago crashed mid-spawn or the container's cgroup was reaped, companions vanished from podman ps -a and only a manual rm/run could bring them back (the .228 incident). Now each companion is rendered as a Quadlet .container unit under ~/.config/containers/systemd/, daemon-reloaded, and started via systemctl --user. systemd owns supervision from that point on: - archipelago can crash, restart, or be uninstalled without touching any companion. - Quadlet's Restart=always + RestartSec=10 handles container exits. - A 30s reconcile tick in boot_reconciler enumerates expected companion units and re-installs any whose unit file or service vanished — defense-in-depth against external tampering. New module layout: - container/quadlet.rs: pure unit renderer + atomic write_if_changed + systemctl helpers (daemon_reload_user / enable_now / disable_remove / is_active). 6 unit tests, no I/O in the renderer. - container/companion.rs: per-app companion specs, install/remove/reconcile, image presence (build local first, fall back to insecure registry only via image_uses_insecure_registry whitelist). 2 tests. install.rs handle_package_install now ends with a single call to companion::install_for(package_id), replacing 287 lines of spawn-and-hope shellouts plus a ~120-line nginx auth-injector helper that worked around per-node RPC password baking. The helper is gone too — the pre-start hook renders the per-node nginx.conf to /var/lib/archipelago/bitcoin-ui/nginx.conf and the Quadlet unit bind-mounts it read-only. 
runtime.rs handle_package_uninstall now disables and removes companion units before the container rm loop. Otherwise systemd's Restart=always would respawn each companion within ~10s of removal. Tests: 53 container tests pass, including 6 quadlet renderer tests (host network, bridge network, capability set, atomic write idempotence) and 2 companion spec tests (per-app companion lookup, build_unit shape). boot_reconciler tests gain a #[cfg(test)] without_companion_stage() builder method so the paused-clock fixtures don't race the real systemctl I/O. A bats regression test (companion-survives-archipelago-restart.bats, gated on ARCHY_ALLOW_DESTRUCTIVE=1) asserts the .228 failure mode cannot recur: every installed companion has a unit file, services stay active across systemctl --user restart archipelago, and a deleted unit file is recreated within one reconcile tick. Net delta: +941 / -363, but the +941 is mostly tests (~440 lines) and the new declarative layer; the imperative tokio::spawn block and its nginx-auth helper are gone, removing two failure classes (orphan companions on archipelago crash, and post-start exec races under tightly-confined cgroups) that previously needed manual SSH recovery. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 1 + .../src/api/rpc/package/install.rs | 362 +---------------- .../src/api/rpc/package/runtime.rs | 5 + .../src/container/boot_reconciler.rs | 62 ++- core/archipelago/src/container/companion.rs | 348 ++++++++++++++++ core/archipelago/src/container/mod.rs | 2 + .../src/container/prod_orchestrator.rs | 7 + core/archipelago/src/container/quadlet.rs | 371 ++++++++++++++++++ ...ompanion-survives-archipelago-restart.bats | 146 +++++++ 9 files changed, 941 insertions(+), 363 deletions(-) create mode 100644 core/archipelago/src/container/companion.rs create mode 100644 core/archipelago/src/container/quadlet.rs create mode 100644 tests/lifecycle/bats/companion-survives-archipelago-restart.bats diff --git a/.gitignore b/.gitignore index d5d73855..8341f7c4 100644 --- a/.gitignore +++ b/.gitignore @@ -83,3 +83,4 @@ scripts/resilience/reports/ .pnpm-store/ **/__pycache__/ *.bak +.claude/scheduled_tasks.lock diff --git a/core/archipelago/src/api/rpc/package/install.rs b/core/archipelago/src/api/rpc/package/install.rs index 7faaf224..e7cda442 100644 --- a/core/archipelago/src/api/rpc/package/install.rs +++ b/core/archipelago/src/api/rpc/package/install.rs @@ -32,130 +32,6 @@ pub(in crate::api::rpc) async fn install_log(msg: &str) { } } -/// Patch the Bitcoin RPC `Authorization: Basic ...` header inside the running -/// bitcoin-ui container's nginx config and reload nginx. Authoritative -/// credential injection — runs whether the image was built locally or pulled -/// from the registry. Without this, registry images ship with whatever auth -/// header was baked at build time on the publisher's machine, which never -/// matches the per-node randomly-generated bitcoin-rpc-password. 
-/// -/// Implementation note: this used to do `podman exec sed`, but rootless -/// podman + tightly-confined containers (--cap-drop=ALL, restricted user) -/// reject the exec because crun can't add a new process to the container's -/// cgroup ("write cgroup.procs: Permission denied"). Switched to -/// `podman cp` (storage layer, no cgroup join) + `podman kill --signal=SIGHUP` -/// (signal to existing PID 1, no new process needed). Verified on .228. -async fn inject_bitcoin_rpc_auth_into_running_container(container: &str, auth_b64: &str) { - use rand::distributions::{Alphanumeric, DistString}; - let token = Alphanumeric.sample_string(&mut rand::thread_rng(), 8); - let host_path = format!("/tmp/archy-{container}-nginx.conf-{token}"); - let in_container = "/etc/nginx/conf.d/default.conf"; - - // 1. Copy the running config out to host - let cp_out = tokio::process::Command::new("podman") - .args(["cp", &format!("{container}:{in_container}"), &host_path]) - .output() - .await; - if let Err(e) = cp_out { - warn!("inject auth: podman cp out failed for {}: {}", container, e); - return; - } - if let Ok(ref o) = cp_out { - if !o.status.success() { - warn!( - "inject auth: podman cp out failed for {}: {}", - container, - String::from_utf8_lossy(&o.stderr) - ); - return; - } - } - - // 2. 
Patch the auth line on disk - let content = match tokio::fs::read_to_string(&host_path).await { - Ok(c) => c, - Err(e) => { - warn!("inject auth: read {} failed: {}", host_path, e); - let _ = tokio::fs::remove_file(&host_path).await; - return; - } - }; - let mut patched_any = false; - let updated: String = content - .lines() - .map(|line| { - if line.contains("proxy_set_header Authorization") && line.contains("Basic") { - patched_any = true; - format!( - " proxy_set_header Authorization \"Basic {}\";", - auth_b64 - ) - } else { - line.to_string() - } - }) - .collect::>() - .join("\n"); - if !patched_any { - warn!( - "inject auth: no Authorization line matched in {}'s nginx.conf", - container - ); - let _ = tokio::fs::remove_file(&host_path).await; - return; - } - if let Err(e) = tokio::fs::write(&host_path, format!("{}\n", updated)).await { - warn!("inject auth: write back failed: {}", e); - let _ = tokio::fs::remove_file(&host_path).await; - return; - } - - // 3. Copy patched config back into the container - let cp_in = tokio::process::Command::new("podman") - .args(["cp", &host_path, &format!("{container}:{in_container}")]) - .output() - .await; - let _ = tokio::fs::remove_file(&host_path).await; - match cp_in { - Ok(o) if !o.status.success() => { - warn!( - "inject auth: podman cp in failed for {}: {}", - container, - String::from_utf8_lossy(&o.stderr) - ); - return; - } - Err(e) => { - warn!("inject auth: podman cp in errored for {}: {}", container, e); - return; - } - _ => {} - } - - // 4. 
Reload nginx via SIGHUP to PID 1 (no exec/cgroup join needed) - let reload = tokio::process::Command::new("podman") - .args(["kill", "--signal=SIGHUP", container]) - .output() - .await; - match reload { - Ok(o) if o.status.success() => { - info!( - "Injected Bitcoin RPC auth into {} (post-start, cp+SIGHUP)", - container - ); - } - Ok(o) => warn!( - "Patched nginx.conf in {} but SIGHUP failed: {}", - container, - String::from_utf8_lossy(&o.stderr) - ), - Err(e) => warn!( - "Patched nginx.conf in {} but SIGHUP errored: {}", - container, e - ), - } -} - impl RpcHandler { /// Install a package from a Docker image. /// Security: Image verification, resource limits, network isolation. @@ -1552,235 +1428,15 @@ autopilot.active=false\n", info!("Nextcloud trusted domains configured for {}", host_ip); } - // Inject Bitcoin RPC auth into bitcoin-ui nginx.conf. - // Two paths because the credential is per-node and randomly generated - // at first boot, so it can't be baked into the published registry image: - // 1. Build-time: rewrite nginx.conf on disk before `podman build`. - // Only fires when /opt/archipelago/docker/bitcoin-ui exists (dev - // box or ISO that shipped the docker tree). Skipped silently in - // production where ui_builds falls through to the registry image. - // 2. Post-start: `podman exec` into the running container to patch - // nginx.conf and reload. Authoritative for both paths — runs - // regardless of how the image was built. 
- let bitcoin_rpc_auth_b64: Option = if matches!( - package_id, - "bitcoin" | "bitcoin-core" | "bitcoin-knots" - ) { - let (rpc_user, rpc_pass) = crate::bitcoin_rpc::bitcoin_rpc_credentials().await; - use base64::Engine; - let auth_b64 = base64::engine::general_purpose::STANDARD - .encode(format!("{}:{}", rpc_user, rpc_pass)); - for dir in [ - "/opt/archipelago/docker/bitcoin-ui", - "/home/archipelago/archy/docker/bitcoin-ui", - ] { - let conf_path = format!("{}/nginx.conf", dir); - match tokio::fs::read_to_string(&conf_path).await { - Ok(content) => { - let updated = content - .replace("__BITCOIN_RPC_AUTH__", &auth_b64) - .lines() - .map(|line| { - if line.contains("proxy_set_header Authorization") - && line.contains("Basic") - { - format!( - " proxy_set_header Authorization \"Basic {}\";", - auth_b64 - ) - } else { - line.to_string() - } - }) - .collect::>() - .join("\n"); - if let Err(e) = tokio::fs::write(&conf_path, format!("{}\n", updated)).await - { - warn!( - "Failed to write {} with injected RPC auth: {}", - conf_path, e - ); - } else { - info!("Injected Bitcoin RPC auth into {} (build-time)", conf_path); - } - } - Err(_) => { - debug!( - "No build-time nginx.conf at {} (will patch running container after start)", - conf_path - ); - } - } - } - Some(auth_b64) - } else { - None - }; - - // Build and start companion UI containers for headless services. - // All UIs proxy to localhost (backend :5678 or bitcoin :8332) so they need --network=host. 
- let ui_builds: Vec<(&str, &str, &str)> = match package_id { - "bitcoin" | "bitcoin-core" | "bitcoin-knots" => { - vec![( - "archy-bitcoin-ui", - "/opt/archipelago/docker/bitcoin-ui", - "bitcoin-ui", - )] - } - "lnd" => { - vec![("archy-lnd-ui", "/opt/archipelago/docker/lnd-ui", "lnd-ui")] - } - "electrumx" | "electrs" | "mempool-electrs" => { - vec![( - "archy-electrs-ui", - "/opt/archipelago/docker/electrs-ui", - "electrs-ui", - )] - } - _ => vec![], - }; - - for (name, ui_dir, image_base) in ui_builds { - let name = name.to_string(); - // Check multiple paths: /opt (production), project tree (dev) - let ui_dir = [ - ui_dir.to_string(), - format!("/home/archipelago/archy/docker/{}", image_base), - format!("/home/archipelago/Projects/archy/docker/{}", image_base), - ] - .into_iter() - .find(|d| std::path::Path::new(d).join("Dockerfile").exists()) - .unwrap_or_else(|| ui_dir.to_string()); - let image_base = image_base.to_string(); - let registry = "146.59.87.168:3000/lfg2025"; - let registry_image = format!("{}/{}:latest", registry, image_base); - let local_image = format!("localhost/{}:latest", image_base); - let post_start_auth = if name == "archy-bitcoin-ui" { - bitcoin_rpc_auth_b64.clone() - } else { - None - }; - tokio::spawn(async move { - // Remove existing container - let _ = tokio::process::Command::new("podman") - .args(["rm", "-f", &name]) - .output() - .await; - - // Build locally first (templates may have injected credentials), - // fall back to registry only if no local Dockerfile exists. 
- let image = { - if std::path::Path::new(&ui_dir).exists() { - info!("Building {} locally from {}", name, ui_dir); - let build = tokio::process::Command::new("podman") - .args(["build", "--no-cache", "-t", &local_image, &ui_dir]) - .output() - .await; - match build { - Ok(o) if o.status.success() => local_image, - Ok(o) => { - warn!( - "Failed to build {}: {}", - name, - String::from_utf8_lossy(&o.stderr) - ); - return; - } - Err(e) => { - warn!("Failed to build {}: {}", name, e); - return; - } - } - } else { - // No local Dockerfile — try pulling from registry - let mut pull_cmd = tokio::process::Command::new("podman"); - pull_cmd - .arg("pull") - .arg("--tls-verify=false") - .arg(®istry_image); - let pull = pull_cmd.output().await; - if pull.is_ok_and(|o| o.status.success()) { - info!("Pulled {} UI from registry", name); - registry_image.clone() - } else { - warn!("No local source or registry image for {} — skipping", name); - return; - } - } - }; - - // For bitcoin-ui specifically: render nginx.conf to host BEFORE - // starting the container, then bind-mount it. This is the durable - // fix for the bitcoin-rpc 401 — the per-node password is in the - // file before nginx ever opens it. Survives container recreate, - // image update, reboot, --restart=unless-stopped cycles, and - // doesn't need any post-start patching that could fail under - // tightly-confined cgroup permissions. 
- let mut bitcoin_ui_mount: Option = None; - if name == "archy-bitcoin-ui" { - let paths = crate::container::bitcoin_ui::RenderPaths::default(); - match crate::container::bitcoin_ui::render(&paths).await { - Ok(outcome) => { - bitcoin_ui_mount = Some(format!( - "{}:/etc/nginx/conf.d/default.conf:ro,Z", - paths.rendered_path.display() - )); - info!( - "bitcoin-ui nginx.conf rendered ({:?}) — will bind-mount at startup", - outcome - ); - } - Err(e) => warn!( - "Failed to render bitcoin-ui nginx.conf: {} — \ - will fall back to post-start patch (less reliable)", - e - ), - } - } - - // Run with --network=host (UIs proxy to localhost backend/bitcoin) - // --user 0:0: run as root inside container (still unprivileged on host - // in rootless podman) to avoid nginx chown failures - let mut args: Vec = vec![ - "run".into(), - "-d".into(), - "--name".into(), - name.clone(), - "--restart=unless-stopped".into(), - "--network=host".into(), - "--user=0:0".into(), - "--cap-drop=ALL".into(), - "--cap-add=CHOWN".into(), - "--cap-add=DAC_OVERRIDE".into(), - "--cap-add=NET_BIND_SERVICE".into(), - "--cap-add=SETUID".into(), - "--cap-add=SETGID".into(), - "--memory=128m".into(), - ]; - if let Some(ref mount) = bitcoin_ui_mount { - args.push("-v".into()); - args.push(mount.clone()); - } - args.push(image.clone()); - let run = tokio::process::Command::new("podman") - .args(&args) - .output() - .await; - match run { - Ok(o) if o.status.success() => { - info!("{} UI container started (host network)", name); - if let Some(ref auth) = post_start_auth { - inject_bitcoin_rpc_auth_into_running_container(&name, auth).await; - } - } - Ok(o) => warn!( - "Failed to start {}: {}", - name, - String::from_utf8_lossy(&o.stderr) - ), - Err(e) => warn!("Failed to start {}: {}", name, e), - } - }); + // Companion UIs (archy-bitcoin-ui, archy-lnd-ui, archy-electrs-ui) + // are now Quadlet-managed: install_for writes ~/.config/containers/ + // systemd/.container, daemon-reloads, and starts the generated + 
// .service. systemd owns supervision from there — companions survive + // archipelago crashes, restarts, and OOM kills. Per-node config + // (e.g. bitcoin-ui's nginx.conf with the live RPC auth) is rendered + // by each spec's pre_start hook and bind-mounted read-only. + for (name, err) in crate::container::companion::install_for(package_id).await { + install_log(&format!("COMPANION FAIL: {name} — {err:#}")).await; } } diff --git a/core/archipelago/src/api/rpc/package/runtime.rs b/core/archipelago/src/api/rpc/package/runtime.rs index 6e0fff19..4e61a821 100644 --- a/core/archipelago/src/api/rpc/package/runtime.rs +++ b/core/archipelago/src/api/rpc/package/runtime.rs @@ -227,6 +227,11 @@ impl RpcHandler { .and_then(|v| v.as_bool()) .unwrap_or(false); + // Disable + remove Quadlet companion units BEFORE the rm loop. + // Otherwise systemd's Restart=always will respawn each companion + // within ~10s of `podman rm`, leaving them orphaned post-uninstall. + crate::container::companion::remove_for(package_id).await; + let containers_to_remove = get_containers_for_app(package_id).await?; if containers_to_remove.is_empty() { tracing::warn!("Uninstall {}: no containers found", package_id); diff --git a/core/archipelago/src/container/boot_reconciler.rs b/core/archipelago/src/container/boot_reconciler.rs index 56e3391f..44727f9e 100644 --- a/core/archipelago/src/container/boot_reconciler.rs +++ b/core/archipelago/src/container/boot_reconciler.rs @@ -29,6 +29,11 @@ pub struct BootReconciler { orchestrator: Arc, interval: Duration, shutdown: Arc, + /// Run the companion-unit repair stage each tick. Default true. + /// Tests disable this — companion reconcile shells out to + /// `systemctl --user` and `podman`, which both block real time + /// and would race the paused-clock test fixtures. 
+ companion_stage: bool, } impl BootReconciler { @@ -41,28 +46,46 @@ impl BootReconciler { orchestrator, interval, shutdown, + companion_stage: true, } } + /// Disable the companion-unit reconcile stage. Used by unit tests + /// that exercise loop cadence without the real systemd / podman + /// surface. Production must not call this. + #[cfg(test)] + pub fn without_companion_stage(mut self) -> Self { + self.companion_stage = false; + self + } + /// Run the reconcile loop until `shutdown` is notified. /// /// Does one reconcile immediately, then sleeps `interval` between /// subsequent passes. A `shutdown.notify_one()` call unblocks the sleep /// and the task returns after the *next* pass completes. /// - /// Never panics: per-app failures are absorbed into `ReconcileReport` by - /// the orchestrator, and `reconcile_all` itself returns infallibly. + /// Each pass is two stages: + /// 1. App reconcile: `reconcile_all()` keeps every loaded manifest's + /// container running. + /// 2. Companion reconcile: any expected Quadlet companion unit that + /// is missing or inactive is repaired (writes the unit, daemon- + /// reloads, starts the service). This is the safety net for the + /// "someone deleted my unit file" / "systemd lost the service" + /// failure modes. + /// + /// Never panics: per-app failures are absorbed into `ReconcileReport` + /// by the orchestrator, and companion failures are logged but never + /// propagated. pub async fn run_forever(self) { // Initial pass: no delay. - let report = self.orchestrator.reconcile_all().await; - Self::log_report(&report); + self.tick().await; loop { let deadline = Instant::now() + self.interval; tokio::select! 
{ _ = time::sleep_until(deadline) => { - let report = self.orchestrator.reconcile_all().await; - Self::log_report(&report); + self.tick().await; } _ = self.shutdown.notified() => { tracing::info!("boot reconciler: shutdown requested, exiting loop"); @@ -72,6 +95,25 @@ impl BootReconciler { } } + async fn tick(&self) { + let report = self.orchestrator.reconcile_all().await; + Self::log_report(&report); + + if !self.companion_stage { + return; + } + let installed = self.orchestrator.manifest_ids().await; + for (companion, err) in + crate::container::companion::reconcile(&installed).await + { + tracing::warn!( + companion = %companion, + error = %err, + "companion reconcile failed" + ); + } + } + fn log_report(report: &ReconcileReport) { for (app_id, action) in &report.actions { tracing::debug!(app_id = %app_id, action = ?action, "reconcile action"); @@ -218,7 +260,7 @@ mod tests { let orch = orch_with_one_running_manifest(rt.clone()).await; let shutdown = Arc::new(Notify::new()); let reconciler = - BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()); + BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()).without_companion_stage(); let handle = tokio::spawn(reconciler.run_forever()); // Yield so the spawned task gets CPU to run its initial reconcile. 
@@ -242,7 +284,7 @@ mod tests { let orch = orch_with_one_running_manifest(rt.clone()).await; let shutdown = Arc::new(Notify::new()); let reconciler = - BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()); + BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()).without_companion_stage(); let handle = tokio::spawn(reconciler.run_forever()); tokio::task::yield_now().await; @@ -271,7 +313,7 @@ mod tests { let orch = orch_with_one_running_manifest(rt.clone()).await; let shutdown = Arc::new(Notify::new()); let reconciler = - BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()); + BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()).without_companion_stage(); let handle = tokio::spawn(reconciler.run_forever()); tokio::task::yield_now().await; tokio::task::yield_now().await; @@ -305,7 +347,7 @@ mod tests { .await; let shutdown = Arc::new(Notify::new()); let reconciler = - BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()); + BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()).without_companion_stage(); let handle = tokio::spawn(reconciler.run_forever()); tokio::task::yield_now().await; diff --git a/core/archipelago/src/container/companion.rs b/core/archipelago/src/container/companion.rs new file mode 100644 index 00000000..1bb8e7e2 --- /dev/null +++ b/core/archipelago/src/container/companion.rs @@ -0,0 +1,348 @@ +//! Companion UI container lifecycle, entirely Quadlet-managed. +//! +//! A "companion" is a small nginx-based container that exposes a +//! browser-friendly UI on top of a headless backend service: +//! +//! | Backend | Companion | Purpose | +//! |------------------|--------------------|--------------------------| +//! | bitcoin-knots | archy-bitcoin-ui | RPC viewer | +//! | bitcoin-core | archy-bitcoin-ui | RPC viewer | +//! | lnd | archy-lnd-ui | wallet/channel UI | +//! 
| electrumx | archy-electrs-ui | indexer status UI | +//! +//! Lifecycle: `install` writes a Quadlet `.container` unit to +//! `~/.config/containers/systemd/`, daemon-reloads, then starts the +//! generated `.service`. systemd owns supervision from that point on +//! — archipelago can crash, restart, or be uninstalled without +//! touching the companion. +//! +//! This replaces the old `tokio::spawn { podman run }` block in +//! `install.rs` (~165 lines of fire-and-forget shellouts) with a +//! single declarative call. + +use anyhow::{Context, Result}; +use std::path::PathBuf; +use tokio::fs; +use tokio::process::Command; +use tracing::{info, warn}; + +use crate::container::quadlet::{ + self, BindMount, NetworkMode, QuadletUnit, +}; +use archipelago_container::image_uses_insecure_registry; + +const COMPANION_REGISTRY: &str = "146.59.87.168:3000/lfg2025"; + +/// Static description of one companion. The full list per backend +/// app_id lives in `companions_for`. +#[derive(Debug, Clone)] +pub struct CompanionSpec { + /// Container + unit name (e.g. "archy-bitcoin-ui"). + pub name: &'static str, + /// Image base name in the lfg2025 registry namespace + /// (e.g. "bitcoin-ui" → "146.59.87.168:3000/lfg2025/bitcoin-ui:latest"). + pub image_base: &'static str, + /// Filesystem locations to look for a local Dockerfile (build wins + /// over registry pull). Searched in order; first hit wins. + pub build_dir_candidates: &'static [&'static str], + /// Optional pre-start hook that renders config files referenced + /// by `bind_mounts`. Returns Ok(()) on success; bind-mount must + /// be present at start time or the companion will 502. + pub pre_start: Option, + /// Bind mounts. Always read-only — companions don't write to + /// host paths. + pub bind_mounts: &'static [(&'static str, &'static str)], +} + +pub type PreStartHook = fn() -> futures_util::future::BoxFuture<'static, Result<()>>; + +/// Companions to install when `package_id` lands. 
Empty for apps +/// without a companion UI. +pub fn companions_for(package_id: &str) -> &'static [CompanionSpec] { + match package_id { + "bitcoin" | "bitcoin-core" | "bitcoin-knots" => BITCOIN_UI, + "lnd" => LND_UI, + "electrumx" | "electrs" | "mempool-electrs" => ELECTRS_UI, + _ => &[], + } +} + +const BITCOIN_UI: &[CompanionSpec] = &[CompanionSpec { + name: "archy-bitcoin-ui", + image_base: "bitcoin-ui", + build_dir_candidates: &[ + "/opt/archipelago/docker/bitcoin-ui", + "/home/archipelago/archy/docker/bitcoin-ui", + "/home/archipelago/Projects/archy/docker/bitcoin-ui", + ], + pre_start: Some(render_bitcoin_ui), + bind_mounts: &[( + "/var/lib/archipelago/bitcoin-ui/nginx.conf", + "/etc/nginx/conf.d/default.conf", + )], +}]; + +const LND_UI: &[CompanionSpec] = &[CompanionSpec { + name: "archy-lnd-ui", + image_base: "lnd-ui", + build_dir_candidates: &[ + "/opt/archipelago/docker/lnd-ui", + "/home/archipelago/archy/docker/lnd-ui", + "/home/archipelago/Projects/archy/docker/lnd-ui", + ], + pre_start: None, + bind_mounts: &[], +}]; + +const ELECTRS_UI: &[CompanionSpec] = &[CompanionSpec { + name: "archy-electrs-ui", + image_base: "electrs-ui", + build_dir_candidates: &[ + "/opt/archipelago/docker/electrs-ui", + "/home/archipelago/archy/docker/electrs-ui", + "/home/archipelago/Projects/archy/docker/electrs-ui", + ], + pre_start: None, + bind_mounts: &[], +}]; + +fn render_bitcoin_ui() -> futures_util::future::BoxFuture<'static, Result<()>> { + Box::pin(async { + let paths = crate::container::bitcoin_ui::RenderPaths::default(); + crate::container::bitcoin_ui::render(&paths) + .await + .map(|_| ()) + .context("render bitcoin-ui nginx.conf") + }) +} + +/// Provision and start every companion for `package_id`. Each +/// companion is independent — a failure in one is logged but does +/// not abort the others. 
+pub async fn install_for(package_id: &str) -> Vec<(String, anyhow::Error)> { + let mut failures = Vec::new(); + for spec in companions_for(package_id) { + if let Err(e) = install_one(spec).await { + warn!(companion = spec.name, error = %e, "companion install failed"); + failures.push((spec.name.to_string(), e)); + } + } + failures +} + +/// Stop and remove every companion for `package_id`. Best effort: +/// errors are logged but do not abort the sequence. +pub async fn remove_for(package_id: &str) { + let dir = match quadlet::unit_dir().await { + Ok(d) => d, + Err(e) => { + warn!("companion remove: cannot resolve quadlet dir: {e:#}"); + return; + } + }; + for spec in companions_for(package_id) { + if let Err(e) = quadlet::disable_remove(spec.name, &dir).await { + warn!(companion = spec.name, error = %e, "companion remove failed"); + } + } +} + +/// Provision one companion: pre-start hook → image present → write +/// quadlet → daemon-reload → start. +pub async fn install_one(spec: &CompanionSpec) -> Result<()> { + if let Some(hook) = spec.pre_start { + hook().await.with_context(|| { + format!("pre-start hook failed for {} — companion will not start", spec.name) + })?; + } + let image = ensure_image_present(spec).await?; + let unit = build_unit(spec, &image); + let dir = quadlet::unit_dir().await?; + let changed = quadlet::write_if_changed(&unit, &dir).await?; + if changed { + info!(companion = spec.name, "wrote quadlet unit"); + quadlet::daemon_reload_user().await?; + } + // Start is idempotent — if already running, systemctl returns 0. + quadlet::enable_now(&unit.service_name()).await?; + info!(companion = spec.name, "companion started"); + Ok(()) +} + +/// Build companion image locally if a Dockerfile exists, otherwise +/// pull from the lfg2025 registry. Returns the image ref the quadlet +/// should reference (`localhost/:latest` for build, registry +/// URL for pull). 
+async fn ensure_image_present(spec: &CompanionSpec) -> Result { + let local_image = format!("localhost/{}:latest", spec.image_base); + let registry_image = format!("{}/{}:latest", COMPANION_REGISTRY, spec.image_base); + + // Prefer local build — companions can carry build-time customizations + // (e.g. nginx.conf templates baked in). Search known candidates. + for dir in spec.build_dir_candidates { + let dockerfile = PathBuf::from(dir).join("Dockerfile"); + if fs::try_exists(&dockerfile).await.unwrap_or(false) { + info!(companion = spec.name, "building locally from {dir}"); + let out = Command::new("podman") + .args(["build", "--no-cache", "-t", &local_image, dir]) + .output() + .await + .context("spawn podman build")?; + if out.status.success() { + return Ok(local_image); + } + warn!( + companion = spec.name, + "local build failed: {}", + String::from_utf8_lossy(&out.stderr).trim() + ); + // Fall through to registry pull rather than fail outright. + break; + } + } + + // Registry pull. Use insecure flag only for whitelisted hosts. + let mut cmd = Command::new("podman"); + cmd.arg("pull"); + if image_uses_insecure_registry(®istry_image) { + cmd.arg("--tls-verify=false"); + } + cmd.arg(®istry_image); + let out = cmd.output().await.context("spawn podman pull")?; + if !out.status.success() { + anyhow::bail!( + "no local Dockerfile and registry pull failed for {}: {}", + spec.name, + String::from_utf8_lossy(&out.stderr).trim() + ); + } + Ok(registry_image) +} + +fn build_unit(spec: &CompanionSpec, image: &str) -> QuadletUnit { + QuadletUnit { + name: spec.name.into(), + description: format!("Archipelago companion UI: {}", spec.name), + image: image.into(), + // Companions proxy to localhost — backend is on :5678, bitcoin + // RPC on :8332. Host network is the simplest way to reach them + // without per-app gateway plumbing. + network: NetworkMode::Host, + // Run as root inside the container so nginx can chown its + // worker dirs. 
Rootless podman maps this to a high host UID, + // so it is unprivileged on the host. + user: Some("0:0".into()), + memory_mb: Some(128), + cap_drop_all: true, + cap_add: vec![ + "CHOWN".into(), + "DAC_OVERRIDE".into(), + "NET_BIND_SERVICE".into(), + "SETUID".into(), + "SETGID".into(), + ], + bind_mounts: spec + .bind_mounts + .iter() + .map(|(host, container)| BindMount { + host: PathBuf::from(*host), + container: PathBuf::from(*container), + read_only: true, + }) + .collect(), + extra_podman_args: vec![], + depends_on: vec![], + } +} + +/// Is a user systemd manager reachable? In production archipelago.service +/// inherits XDG_RUNTIME_DIR from systemd; in unit tests / CI sandboxes it +/// is unset, in which case `systemctl --user` would fail and write to +/// HOME would be an unwanted side effect. The reconciler skips its +/// companion stage when this is false. +fn user_systemd_available() -> bool { + std::env::var_os("XDG_RUNTIME_DIR") + .map(|v| !v.is_empty()) + .unwrap_or(false) +} + +/// Reconcile companion presence: every expected companion for the +/// given installed apps must have its quadlet unit on disk and its +/// service active. Returns a list of (companion, error) for anything +/// that needed correction and failed. +/// +/// Called from `boot_reconciler` so a deleted unit file or a stopped +/// service is repaired within one tick. No-ops if the user systemd +/// manager is not reachable (CI / test environments). 
+pub async fn reconcile(installed_apps: &[String]) -> Vec<(String, anyhow::Error)> {
+    if !user_systemd_available() {
+        return Vec::new();
+    }
+    let mut failures = Vec::new();
+    for app_id in installed_apps {
+        for spec in companions_for(app_id) {
+            match needs_repair(spec).await {
+                Ok(false) => {}
+                Ok(true) => {
+                    info!(companion = spec.name, "reconcile: companion not active, repairing");
+                    if let Err(e) = install_one(spec).await {
+                        failures.push((spec.name.to_string(), e));
+                    }
+                }
+                Err(e) => {
+                    warn!(companion = spec.name, error = %e, "reconcile probe failed");
+                    failures.push((spec.name.to_string(), e));
+                }
+            }
+        }
+    }
+    failures
+}
+
+/// Does this companion need `install_one` to be re-run? `Ok(true)` if the unit
+/// file is missing OR the service is not active; `Err` if the unit dir is unavailable.
+async fn needs_repair(spec: &CompanionSpec) -> Result<bool> {
+    let dir = quadlet::unit_dir().await?;
+    let unit_path = dir.join(format!("{}.container", spec.name));
+    if !fs::try_exists(&unit_path).await.unwrap_or(false) {
+        return Ok(true);
+    }
+    let svc = format!("{}.service", spec.name);
+    Ok(!quadlet::is_active(&svc).await)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn companions_for_known_apps_returns_expected_set() {
+        assert_eq!(companions_for("bitcoin-knots").len(), 1);
+        assert_eq!(companions_for("bitcoin-core").len(), 1);
+        assert_eq!(companions_for("bitcoin").len(), 1);
+        assert_eq!(companions_for("lnd").len(), 1);
+        assert_eq!(companions_for("electrumx").len(), 1);
+        assert_eq!(companions_for("electrs").len(), 1);
+        assert_eq!(companions_for("mempool-electrs").len(), 1);
+        assert_eq!(companions_for("nextcloud").len(), 0);
+        assert_eq!(companions_for("not-a-real-app").len(), 0);
+    }
+
+    #[test]
+    fn build_unit_uses_host_network_and_drops_caps() {
+        let spec = &BITCOIN_UI[0];
+        let u = build_unit(spec, "localhost/bitcoin-ui:latest");
+        assert_eq!(u.name, "archy-bitcoin-ui");
+        assert!(matches!(u.network, NetworkMode::Host));
+        assert!(u.cap_drop_all);
+        assert!(u.cap_add.iter().any(|c| c == "NET_BIND_SERVICE"));
+        assert_eq!(u.user.as_deref(), Some("0:0"));
+        assert_eq!(u.memory_mb, Some(128));
+        assert_eq!(u.bind_mounts.len(), 1);
+        assert_eq!(
+            u.bind_mounts[0].container,
+            PathBuf::from("/etc/nginx/conf.d/default.conf")
+        );
+        assert!(u.bind_mounts[0].read_only);
+    }
+}
diff --git a/core/archipelago/src/container/mod.rs b/core/archipelago/src/container/mod.rs
index 442c8203..37140160 100644
--- a/core/archipelago/src/container/mod.rs
+++ b/core/archipelago/src/container/mod.rs
@@ -1,11 +1,13 @@
 pub mod bitcoin_ui;
 pub mod boot_reconciler;
+pub mod companion;
 pub mod data_manager;
 pub mod dev_orchestrator;
 pub mod docker_packages;
 pub mod filebrowser;
 pub mod image_versions;
 pub mod prod_orchestrator;
+pub mod quadlet;
 pub mod registry;
 pub mod traits;
 
diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs
index 5294fd94..381e3d1b 100644
--- a/core/archipelago/src/container/prod_orchestrator.rs
+++ b/core/archipelago/src/container/prod_orchestrator.rs
@@ -311,6 +311,13 @@ impl ProdContainerOrchestrator {
             .ok_or_else(|| anyhow::anyhow!("unknown app_id: {app_id}"))
     }
 
+    /// Snapshot of the app IDs currently in the in-memory manifest map.
+    /// Used by the boot reconciler to drive companion-unit reconciliation.
+    pub async fn manifest_ids(&self) -> Vec<String> {
+        let state = self.state.read().await;
+        state.manifests.keys().cloned().collect()
+    }
+
     /// Scan the runtime for containers whose names match one of our manifests.
     /// This is a read-only adoption pass: nothing is created, started, or touched.
     pub async fn adopt_existing(&self) -> Result {
diff --git a/core/archipelago/src/container/quadlet.rs b/core/archipelago/src/container/quadlet.rs
new file mode 100644
index 00000000..727b1960
--- /dev/null
+++ b/core/archipelago/src/container/quadlet.rs
@@ -0,0 +1,371 @@
+//! Render and manage the lifecycle of Quadlet `.container` units for companion UI
+//! containers (archy-bitcoin-ui, archy-lnd-ui, archy-electrs-ui).
+//!
+//! Why Quadlet: companions used to run as fire-and-forget `tokio::spawn`
+//! blocks from `install.rs`. If archipelago crashed mid-spawn or the
+//! kernel reaped a parent cgroup, companions vanished from `podman ps`
+//! entirely and only a manual `podman run` brought them back. Putting the
+//! unit on disk and letting systemd own start/restart removes that whole
+//! class of failure: the daemon is now systemd, archipelago is just the
+//! provisioner.
+//!
+//! Design constraints kept this module small on purpose:
+//!
+//! - **Single responsibility**: render → write → enable → disable. We do
+//!   NOT pull images here — the caller is expected to have the image
+//!   present locally (companions either build from `/opt/archipelago/docker/`
+//!   or are pre-pulled by `install_companion_image`). The quadlet unit
+//!   declares `Pull=never` so a missing image surfaces immediately
+//!   instead of silently retrying behind systemd's restart loop.
+//! - **Atomic writes**: `tempfile + rename` so a partially-written unit
+//!   is never visible to systemd. A daemon-reload during a rolling
+//!   update can't see half a file.
+//! - **Idempotent**: `write_if_changed` compares bytes before touching
+//!   the file. No daemon-reload, no service-restart cascade if the
+//!   rendered bytes match what's on disk.
+//! - **systemctl --user only**: archipelago runs as uid=1000 with
+//!   linger enabled. We never touch the system bus from here.
+//!
+//! See `docs/rust-orchestrator-migration.md` and the failure-mode log in
+//! `feedback_container_lifecycle_failure_modes.md` for the incident
+//! that motivated the move.
+
+use anyhow::{anyhow, Context, Result};
+use std::fmt::Write as _;
+use std::path::{Path, PathBuf};
+use tokio::fs;
+use tokio::process::Command;
+
+/// Default rootless quadlet directory, relative to `$HOME`. Resolved per-user
+/// at runtime via `unit_dir()`. Tests pass an explicit dir.
+pub const DEFAULT_REL_UNIT_DIR: &str = ".config/containers/systemd";
+
+#[derive(Debug, Clone)]
+pub struct BindMount {
+    pub host: PathBuf,
+    pub container: PathBuf,
+    pub read_only: bool,
+}
+
+#[derive(Debug, Clone, Default)]
+#[allow(dead_code)] // Bridge is reserved for Phase 5 per-app network isolation.
+pub enum NetworkMode {
+    #[default]
+    Host,
+    /// A user-defined podman network — quadlet creates the container
+    /// attached to it. The network must already exist (orchestrator's
+    /// `ensure_container_network` handles that on every reconcile tick).
+    Bridge(String),
+}
+
+/// One Quadlet `.container` unit. Field set is deliberately small —
+/// add a new field only when a companion actually needs it.
+#[derive(Debug, Clone)]
+pub struct QuadletUnit {
+    pub name: String,
+    pub description: String,
+    pub image: String,
+    pub network: NetworkMode,
+    pub user: Option<String>,
+    pub memory_mb: Option<u32>, // NOTE(review): integer width not visible in this chunk — confirm
+    pub cap_drop_all: bool,
+    pub cap_add: Vec<String>,
+    pub bind_mounts: Vec<BindMount>,
+    pub extra_podman_args: Vec<String>,
+    pub depends_on: Vec<String>,
+}
+
+impl QuadletUnit {
+    /// File name on disk: `<name>.container`. Quadlet translates this
+    /// into a `<name>.service` unit at daemon-reload time.
+    pub fn unit_filename(&self) -> String {
+        format!("{}.container", self.name)
+    }
+
+    /// systemd service name created by Quadlet for this unit (`<name>.service`).
+    pub fn service_name(&self) -> String {
+        format!("{}.service", self.name)
+    }
+
+    /// Render the canonical Quadlet unit text. Pure function — no I/O.
+    pub fn render(&self) -> String {
+        let mut s = String::with_capacity(512);
+        let _ = writeln!(s, "# Generated by archipelago. DO NOT EDIT.");
+        let _ = writeln!(s, "# Edits are overwritten on the next reconcile.");
+        let _ = writeln!(s);
+        let _ = writeln!(s, "[Unit]");
+        let _ = writeln!(s, "Description={}", self.description);
+        let _ = writeln!(s, "After=network-online.target");
+        let _ = writeln!(s, "Wants=network-online.target");
+        for dep in &self.depends_on {
+            let _ = writeln!(s, "Requires={dep}");
+            let _ = writeln!(s, "After={dep}");
+        }
+        let _ = writeln!(s);
+        let _ = writeln!(s, "[Container]");
+        let _ = writeln!(s, "ContainerName={}", self.name);
+        let _ = writeln!(s, "Image={}", self.image);
+        // Pull=never: companions are pre-pulled or built. A missing image
+        // must surface as a unit start failure, not a silent retry storm.
+        let _ = writeln!(s, "Pull=never");
+        match &self.network {
+            NetworkMode::Host => {
+                let _ = writeln!(s, "Network=host");
+            }
+            NetworkMode::Bridge(net) => {
+                let _ = writeln!(s, "Network={net}");
+            }
+        }
+        if let Some(user) = &self.user {
+            let _ = writeln!(s, "User={user}");
+        }
+        if self.cap_drop_all {
+            let _ = writeln!(s, "DropCapability=ALL");
+        }
+        for cap in &self.cap_add {
+            let _ = writeln!(s, "AddCapability={cap}");
+        }
+        if let Some(mb) = self.memory_mb {
+            let _ = writeln!(s, "PodmanArgs=--memory={mb}m");
+        }
+        for bm in &self.bind_mounts {
+            let mode = if bm.read_only { ":ro,Z" } else { ":Z" };
+            let _ = writeln!(
+                s,
+                "Volume={}:{}{}",
+                bm.host.display(),
+                bm.container.display(),
+                mode
+            );
+        }
+        for arg in &self.extra_podman_args {
+            let _ = writeln!(s, "PodmanArgs={arg}");
+        }
+        let _ = writeln!(s);
+        let _ = writeln!(s, "[Service]");
+        // Always restart with a 10s backoff. RestartSec keeps a
+        // crash-loop from saturating the journal.
+        let _ = writeln!(s, "Restart=always");
+        let _ = writeln!(s, "RestartSec=10");
+        let _ = writeln!(s);
+        let _ = writeln!(s, "[Install]");
+        let _ = writeln!(s, "WantedBy=default.target");
+        s
+    }
+}
+
+/// Resolve the per-user quadlet dir under $HOME. Created if missing.
+pub async fn unit_dir() -> Result<PathBuf> {
+    let home = std::env::var_os("HOME")
+        .map(PathBuf::from)
+        .ok_or_else(|| anyhow!("HOME not set; cannot locate quadlet unit dir"))?;
+    let dir = home.join(DEFAULT_REL_UNIT_DIR);
+    fs::create_dir_all(&dir)
+        .await
+        .with_context(|| format!("create_dir_all {}", dir.display()))?;
+    Ok(dir)
+}
+
+/// Atomically write `unit` into `dir/<name>.container` if the bytes
+/// differ from what's already there. Returns true if the file changed.
+pub async fn write_if_changed(unit: &QuadletUnit, dir: &Path) -> Result<bool> {
+    let path = dir.join(unit.unit_filename());
+    let new_bytes = unit.render();
+
+    if let Ok(old) = fs::read_to_string(&path).await {
+        if old == new_bytes {
+            return Ok(false);
+        }
+    }
+
+    fs::create_dir_all(dir)
+        .await
+        .with_context(|| format!("create_dir_all {}", dir.display()))?;
+    let tmp = path.with_extension("container.tmp");
+    fs::write(&tmp, new_bytes.as_bytes())
+        .await
+        .with_context(|| format!("write tmp {}", tmp.display()))?;
+    fs::rename(&tmp, &path)
+        .await
+        .with_context(|| format!("rename {} -> {}", tmp.display(), path.display()))?;
+    Ok(true)
+}
+
+/// Reload the user systemd manager. Required after any quadlet write
+/// or removal so systemd picks up the generated `.service` translation.
+pub async fn daemon_reload_user() -> Result<()> {
+    let status = Command::new("systemctl")
+        .args(["--user", "daemon-reload"])
+        .status()
+        .await
+        .context("spawn systemctl --user daemon-reload")?;
+    if !status.success() {
+        return Err(anyhow!("systemctl --user daemon-reload exited {status}"));
+    }
+    Ok(())
+}
+
+/// Start a quadlet-generated service. Despite the name this only runs
+/// `systemctl --user start`; boot-time activation comes from `[Install]`.
+pub async fn enable_now(service: &str) -> Result<()> {
+    // Quadlet-generated units cannot be `enable`d directly because the
+    // .service file lives under /run, not /etc — `enable` would refuse
+    // ("transient or generated"). The unit's `[Install] WantedBy` is
+    // honoured at daemon-reload, so we just start it.
+    let status = Command::new("systemctl")
+        .args(["--user", "start", service])
+        .status()
+        .await
+        .with_context(|| format!("spawn systemctl --user start {service}"))?;
+    if !status.success() {
+        return Err(anyhow!("systemctl --user start {service} exited {status}"));
+    }
+    Ok(())
+}
+
+/// Stop + remove a quadlet unit and its on-disk file. Best-effort:
+/// only a failure to delete the unit file is returned as an error;
+/// failures of the stop, the daemon-reload and the final `podman rm -f`
+/// are ignored (the unit or container may already be gone).
+pub async fn disable_remove(unit_name: &str, dir: &Path) -> Result<()> {
+    let svc = format!("{unit_name}.service");
+    // Stop first; ignore failure (unit may already be down).
+    let _ = Command::new("systemctl")
+        .args(["--user", "stop", &svc])
+        .status()
+        .await;
+    let path = dir.join(format!("{unit_name}.container"));
+    if fs::try_exists(&path).await.unwrap_or(false) {
+        fs::remove_file(&path)
+            .await
+            .with_context(|| format!("remove {}", path.display()))?;
+    }
+    daemon_reload_user().await.ok();
+    // Defensive: kill the actual container too, in case quadlet left it.
+    let _ = Command::new("podman")
+        .args(["rm", "-f", unit_name])
+        .status()
+        .await;
+    Ok(())
+}
+
+/// Is the quadlet-generated service currently active? A failed spawn also yields `false`.
+pub async fn is_active(service: &str) -> bool {
+    Command::new("systemctl")
+        .args(["--user", "is-active", "--quiet", service])
+        .status()
+        .await
+        .map(|s| s.success())
+        .unwrap_or(false) // failing to spawn systemctl is reported as "not active"
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::tempdir;
+
+    fn sample_unit() -> QuadletUnit {
+        QuadletUnit {
+            name: "archy-bitcoin-ui".into(),
+            description: "Bitcoin RPC UI proxy".into(),
+            image: "146.59.87.168:3000/lfg2025/bitcoin-ui:latest".into(),
+            network: NetworkMode::Host,
+            user: Some("0:0".into()),
+            memory_mb: Some(128),
+            cap_drop_all: true,
+            cap_add: vec![
+                "CHOWN".into(),
+                "DAC_OVERRIDE".into(),
+                "NET_BIND_SERVICE".into(),
+                "SETUID".into(),
+                "SETGID".into(),
+            ],
+            bind_mounts: vec![BindMount {
+                host: PathBuf::from("/var/lib/archipelago/bitcoin-ui/nginx.conf"),
+                container: PathBuf::from("/etc/nginx/conf.d/default.conf"),
+                read_only: true,
+            }],
+            extra_podman_args: vec![],
+            depends_on: vec![],
+        }
+    }
+
+    #[test]
+    fn render_contains_required_directives() {
+        let s = sample_unit().render();
+        assert!(s.contains("[Container]"));
+        assert!(s.contains("ContainerName=archy-bitcoin-ui"));
+        assert!(s.contains("Image=146.59.87.168:3000/lfg2025/bitcoin-ui:latest"));
+        assert!(s.contains("Pull=never"));
+        assert!(s.contains("Network=host"));
+        assert!(s.contains("DropCapability=ALL"));
+        assert!(s.contains("AddCapability=CHOWN"));
+        assert!(s.contains("AddCapability=NET_BIND_SERVICE"));
+        assert!(s.contains("PodmanArgs=--memory=128m"));
+        assert!(s.contains(
+            "Volume=/var/lib/archipelago/bitcoin-ui/nginx.conf:/etc/nginx/conf.d/default.conf:ro,Z"
+        ));
+        assert!(s.contains("[Service]"));
+        assert!(s.contains("Restart=always"));
+        assert!(s.contains("WantedBy=default.target"));
+    }
+
+    #[test]
+    fn render_bridge_network_emits_network_name() {
+        let mut u = sample_unit();
+        u.network = NetworkMode::Bridge("archy-bitcoin-ui-net".into());
+        let s = u.render();
+        assert!(s.contains("Network=archy-bitcoin-ui-net"));
+        assert!(!s.contains("Network=host"));
+    }
+
+    #[test]
+    fn unit_filename_and_service_name_are_consistent() {
+        let u = sample_unit();
+        assert_eq!(u.unit_filename(), "archy-bitcoin-ui.container");
+        assert_eq!(u.service_name(), "archy-bitcoin-ui.service");
+    }
+
+    #[tokio::test]
+    async fn write_if_changed_writes_first_time_then_noops() {
+        let dir = tempdir().unwrap();
+        let u = sample_unit();
+        let changed = write_if_changed(&u, dir.path()).await.unwrap();
+        assert!(changed, "first write must report changed");
+        let on_disk = tokio::fs::read_to_string(dir.path().join(u.unit_filename()))
+            .await
+            .unwrap();
+        assert!(on_disk.starts_with("# Generated by archipelago"));
+
+        let changed2 = write_if_changed(&u, dir.path()).await.unwrap();
+        assert!(!changed2, "second write with identical bytes must no-op");
+    }
+
+    #[tokio::test]
+    async fn write_if_changed_rewrites_when_field_changes() {
+        let dir = tempdir().unwrap();
+        let mut u = sample_unit();
+        write_if_changed(&u, dir.path()).await.unwrap();
+
+        u.memory_mb = Some(256);
+        let changed = write_if_changed(&u, dir.path()).await.unwrap();
+        assert!(changed, "field change must trigger rewrite");
+        let on_disk = tokio::fs::read_to_string(dir.path().join(u.unit_filename()))
+            .await
+            .unwrap();
+        assert!(on_disk.contains("PodmanArgs=--memory=256m"));
+    }
+
+    #[tokio::test]
+    async fn write_if_changed_atomic_rename_leaves_no_tmp() {
+        let dir = tempdir().unwrap();
+        write_if_changed(&sample_unit(), dir.path()).await.unwrap();
+        let mut entries = tokio::fs::read_dir(dir.path()).await.unwrap();
+        while let Some(e) = entries.next_entry().await.unwrap() {
+            assert!(
+                !e.file_name().to_string_lossy().ends_with(".tmp"),
+                "atomic rename must leave no .tmp residue"
+            );
+        }
+    }
+}
diff --git a/tests/lifecycle/bats/companion-survives-archipelago-restart.bats b/tests/lifecycle/bats/companion-survives-archipelago-restart.bats
new file mode 100644
index 00000000..421cd17f
--- /dev/null
+++ b/tests/lifecycle/bats/companion-survives-archipelago-restart.bats
@@ -0,0 +1,146 @@
+#!/usr/bin/env bats
+# tests/lifecycle/bats/companion-survives-archipelago-restart.bats
+#
+# Quadlet promise: companion UIs (archy-bitcoin-ui, archy-lnd-ui,
+# archy-electrs-ui) are managed by systemd, not archipelago. Restarting
+# the archipelago user service must NOT take them down.
+#
+# This is the regression gate for the .228 incident in
+# feedback_container_lifecycle_failure_modes.md (FM1: companions vanished
+# from `podman ps -a` after archipelago crash-loop).
+#
+# Gated by ARCHY_ALLOW_DESTRUCTIVE=1 because it bounces archipelago.
+
+companion_units=(
+  "archy-bitcoin-ui"
+  "archy-lnd-ui"
+  "archy-electrs-ui"
+)
+
+unit_dir="$HOME/.config/containers/systemd"
+
+unit_file_present() {
+  local name="$1"
+  [[ -f "$unit_dir/$name.container" ]]
+}
+
+service_active() {
+  local name="$1"
+  systemctl --user is-active --quiet "$name.service"
+}
+
+container_running() {
+  local name="$1"
+  [[ "$(podman inspect --format '{{.State.Running}}' "$name" 2>/dev/null)" == "true" ]]
+}
+
+wait_service_active() {
+  local name="$1"
+  local timeout="${2:-60}"
+  local deadline=$(( $(date +%s) + timeout ))
+  while (( $(date +%s) < deadline )); do
+    if service_active "$name"; then
+      return 0
+    fi
+    sleep 2
+  done
+  return 1
+}
+
+wait_archipelago_back() {
+  local timeout="${1:-60}"
+  local deadline=$(( $(date +%s) + timeout ))
+  while (( $(date +%s) < deadline )); do
+    if curl -fsS -o /dev/null "http://127.0.0.1:5678/health" 2>/dev/null; then
+      return 0
+    fi
+    sleep 2
+  done
+  return 1
+}
+
+@test "destructive gate enabled" {
+  [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"
+}
+
+@test "every installed companion has a quadlet unit on disk" {
+  [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"
+  local present=0
+  for c in "${companion_units[@]}"; do
+    if container_running "$c"; then
+      run unit_file_present "$c"
+      [ "$status" -eq 0 ]
+      present=$(( present + 1 ))
+    fi
+  done
+  (( present > 0 )) || skip "No companions installed on this node"
+}
+
+@test "every installed companion service is active before restart" {
+  [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"
+  for c in "${companion_units[@]}"; do
+    if container_running "$c"; then
+      run service_active "$c"
+      [ "$status" -eq 0 ]
+    fi
+  done
+}
+
+@test "companions survive archipelago restart" {
+  [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"
+
+  # Snapshot: which companion containers were running before we touched anything.
+  local before=()
+  for c in "${companion_units[@]}"; do
+    if container_running "$c"; then
+      before+=("$c")
+    fi
+  done
+  (( ${#before[@]} > 0 )) || skip "No companions installed on this node"
+
+  # Bounce archipelago. The user service is the production canonical name;
+  # fall back to the system service for older nodes.
+  if systemctl --user list-units --no-legend archipelago.service | grep -q archipelago; then
+    systemctl --user restart archipelago.service
+  else
+    sudo systemctl restart archipelago.service
+  fi
+
+  run wait_archipelago_back 60
+  [ "$status" -eq 0 ]
+
+  # Every companion that was up before must still have an active service and a running container.
+  for c in "${before[@]}"; do
+    run service_active "$c"
+    [ "$status" -eq 0 ]
+    run container_running "$c"
+    [ "$status" -eq 0 ]
+  done
+}
+
+@test "deleted unit file is recreated within one reconcile tick" {
+  [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"
+
+  # Pick the first companion whose container is currently running.
+  local target=""
+  for c in "${companion_units[@]}"; do
+    if container_running "$c"; then
+      target="$c"
+      break
+    fi
+  done
+  [[ -n "$target" ]] || skip "No companions installed on this node"
+
+  # Delete the unit file behind systemd's back. The reconciler should
+  # notice and rewrite it within one 30s tick, then start the service.
+  rm -f "$unit_dir/$target.container"
+  systemctl --user daemon-reload >/dev/null 2>&1 || true
+  systemctl --user stop "$target.service" >/dev/null 2>&1 || true
+
+  # Poll for up to 90s: two 30s reconcile ticks (60s) plus grace.
+  run wait_service_active "$target" 90
+  [ "$status" -eq 0 ]
+
+  run unit_file_present "$target"
+  [ "$status" -eq 0 ]
+}