From 3767c2670c292a47a65223370206756fc22b5a3b Mon Sep 17 00:00:00 2001 From: archipelago Date: Wed, 22 Apr 2026 17:46:36 -0400 Subject: [PATCH] feat(container): add build source to manifest schema ContainerConfig.image is now Option<String>, mutually exclusive with a new optional ContainerConfig.build: Option<BuildConfig>. Exactly one of image or build must be present, enforced in AppManifest::validate. Adds ResolvedSource enum (Pull | Build) and ContainerConfig::resolve + ::image_ref helpers so the orchestrator can treat pull and build uniformly. All 26 existing pull-only manifests continue to parse unchanged (covered by existing_pull_only_manifests_still_parse test). Call sites updated: podman_client, runtime::DockerRuntime, dev_orchestrator. Dev orchestrator errors out cleanly on Build sources until Step 2 lands build_image support on the runtime trait. Step 1 of docs/rust-orchestrator-migration.md. 10 new unit tests, all pass. Also includes: docs/rust-orchestrator-migration.md (design spec) and docs/STATUS.md resume section for the next session. 
--- .../src/container/dev_orchestrator.rs | 34 +- core/container/src/dependency_resolver.rs | 3 +- core/container/src/lib.rs | 5 +- core/container/src/manifest.rs | 299 +++++++++- core/container/src/podman_client.rs | 9 +- docs/STATUS.md | 191 +++++++ docs/rust-orchestrator-migration.md | 522 ++++++++++++++++++ 7 files changed, 1046 insertions(+), 17 deletions(-) create mode 100644 docs/STATUS.md create mode 100644 docs/rust-orchestrator-migration.md diff --git a/core/archipelago/src/container/dev_orchestrator.rs b/core/archipelago/src/container/dev_orchestrator.rs index d6cf0d36..ee8804dc 100644 --- a/core/archipelago/src/container/dev_orchestrator.rs +++ b/core/archipelago/src/container/dev_orchestrator.rs @@ -1,7 +1,7 @@ use anyhow::{Context, Result}; use archipelago_container::{ AppManifest, BitcoinSimulationMode, BitcoinSimulator, - ContainerRuntime as ContainerRuntimeTrait, ContainerStatus, PortManager, + ContainerRuntime as ContainerRuntimeTrait, ContainerStatus, PortManager, ResolvedSource, }; use std::sync::Arc; @@ -103,14 +103,30 @@ impl DevContainerOrchestrator { volume.source = dev_path.to_string_lossy().to_string(); } - // Pull image - self.runtime - .pull_image( - &manifest.app.container.image, - manifest.app.container.image_signature.as_deref(), - ) - .await - .context("Failed to pull image")?; + // Resolve pull-or-build. Dev orchestrator currently only supports pull; + // Build support lands in Step 2 of the rust-orchestrator migration. + match manifest + .app + .container + .resolve() + .ok_or_else(|| anyhow::anyhow!("manifest container config invalid (neither image nor build)"))? + { + ResolvedSource::Pull { + image, + image_signature, + .. 
+ } => { + self.runtime + .pull_image(&image, image_signature.as_deref()) + .await + .context("Failed to pull image")?; + } + ResolvedSource::Build(_) => { + anyhow::bail!( + "dev orchestrator does not yet support local image builds (see rust-orchestrator-migration.md Step 2)" + ); + } + } // Create container with port offset let port_offset = if self.config.dev_mode { diff --git a/core/container/src/dependency_resolver.rs b/core/container/src/dependency_resolver.rs index c75206e0..434b1366 100644 --- a/core/container/src/dependency_resolver.rs +++ b/core/container/src/dependency_resolver.rs @@ -213,9 +213,10 @@ mod tests { version: "1.0.0".to_string(), description: None, container: ContainerConfig { - image: format!("test/{}:latest", id), + image: Some(format!("test/{}:latest", id)), image_signature: None, pull_policy: "if-not-present".to_string(), + build: None, }, dependencies: deps, resources: Default::default(), diff --git a/core/container/src/lib.rs b/core/container/src/lib.rs index 795a1d4a..565e4601 100644 --- a/core/container/src/lib.rs +++ b/core/container/src/lib.rs @@ -9,7 +9,10 @@ pub mod runtime; pub use bitcoin_simulator::{BitcoinSimulationMode, BitcoinSimulator}; pub use dependency_resolver::DependencyResolver; pub use health_monitor::HealthMonitor; -pub use manifest::{AppManifest, Dependency, HealthCheck, ResourceLimits, SecurityPolicy}; +pub use manifest::{ + AppManifest, BuildConfig, Dependency, HealthCheck, ResolvedSource, ResourceLimits, + SecurityPolicy, +}; pub use podman_client::{ContainerState, ContainerStatus, PodmanClient}; pub use port_manager::{PortError, PortManager}; pub use runtime::{AutoRuntime, ContainerRuntime, DockerRuntime, PodmanRuntime}; diff --git a/core/container/src/manifest.rs b/core/container/src/manifest.rs index b1035c7e..3c1b39ca 100644 --- a/core/container/src/manifest.rs +++ b/core/container/src/manifest.rs @@ -57,17 +57,60 @@ pub struct AppDefinition { #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub 
struct ContainerConfig { - pub image: String, + /// Pull source. Mutually exclusive with `build`. Exactly one of the two must be present. + #[serde(default)] + pub image: Option<String>, #[serde(default)] pub image_signature: Option<String>, #[serde(default = "default_pull_policy")] pub pull_policy: String, + /// Local build source. Mutually exclusive with `image`. + #[serde(default)] + pub build: Option<BuildConfig>, } fn default_pull_policy() -> String { "if-not-present".to_string() } +/// Build a container image locally from a Dockerfile rather than pulling from a registry. +/// +/// When present on `ContainerConfig`, the orchestrator runs `podman build -t <tag> -f <dockerfile> <context>` +/// before starting the container. The resulting local image is referenced by `tag`. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct BuildConfig { + /// Build context directory (absolute path or relative to the manifest location). + pub context: String, + /// Dockerfile path relative to `context`. Defaults to `Dockerfile`. + #[serde(default = "default_dockerfile")] + pub dockerfile: String, + /// Tag applied to the built image. Used as the container's image reference. + pub tag: String, + /// Optional `--build-arg KEY=VALUE` pairs passed to the build. + #[serde(default)] + pub build_args: HashMap<String, String>, +} + +fn default_dockerfile() -> String { + "Dockerfile".to_string() +} + +/// Resolved pull-or-build decision after manifest validation. +/// +/// `ContainerConfig::resolve()` produces this. The orchestrator matches on it +/// to decide whether to pull a registry image or invoke a local build. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ResolvedSource { + /// Pull `image` from a registry using `pull_policy` semantics. + Pull { + image: String, + pull_policy: String, + image_signature: Option<String>, + }, + /// Build locally. The resulting tag is the image reference for `podman create`. 
+ Build(BuildConfig), +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(untagged)] pub enum Dependency { @@ -182,10 +225,33 @@ impl AppManifest { return Err(ManifestError::Invalid("app.id cannot be empty".to_string())); } - if self.app.container.image.is_empty() { - return Err(ManifestError::Invalid( - "container.image cannot be empty".to_string(), - )); + // Exactly one of container.image or container.build must be set. We can't + // default either side, because an empty-string image or an empty build block + // would be silently wrong downstream. + match (&self.app.container.image, &self.app.container.build) { + (Some(img), None) if !img.is_empty() => {} + (None, Some(b)) => { + if b.context.is_empty() { + return Err(ManifestError::Invalid( + "container.build.context cannot be empty".to_string(), + )); + } + if b.tag.is_empty() { + return Err(ManifestError::Invalid( + "container.build.tag cannot be empty".to_string(), + )); + } + } + (Some(_), Some(_)) => { + return Err(ManifestError::Invalid( + "container.image and container.build are mutually exclusive".to_string(), + )); + } + _ => { + return Err(ManifestError::Invalid( + "container must specify either image or build".to_string(), + )); + } } // Validate version format (semantic versioning) @@ -199,6 +265,37 @@ impl AppManifest { } } +impl ContainerConfig { + /// Collapse the (image, build) pair into a single resolved source. + /// + /// Returns `None` if the config is in an invalid state (e.g. neither field set + /// or both set). Callers should have already run `AppManifest::validate()` to + /// surface a user-facing error; this method is for internal orchestrator use + /// after validation has passed. 
+ pub fn resolve(&self) -> Option<ResolvedSource> { + match (&self.image, &self.build) { + (Some(img), None) if !img.is_empty() => Some(ResolvedSource::Pull { + image: img.clone(), + pull_policy: self.pull_policy.clone(), + image_signature: self.image_signature.clone(), + }), + (None, Some(b)) => Some(ResolvedSource::Build(b.clone())), + _ => None, + } + } + + /// The image reference used to create/inspect a container for this config. + /// + /// For Pull sources this is the registry image. For Build sources this is + /// the locally-built tag. Returns `None` only for an invalid config. + pub fn image_ref(&self) -> Option<String> { + self.resolve().map(|r| match r { + ResolvedSource::Pull { image, .. } => image, + ResolvedSource::Build(b) => b.tag, + }) + } +} + #[cfg(test)] mod tests { use super::*; @@ -234,4 +331,196 @@ app: let result = AppManifest::parse(yaml); assert!(result.is_err()); } + + #[test] + fn pull_source_resolves_to_pull() { + let yaml = r#" +app: + id: test-app + name: Test + version: 1.0.0 + container: + image: docker.io/library/nginx:1.27 + pull_policy: always +"#; + let m = AppManifest::parse(yaml).unwrap(); + let src = m.app.container.resolve().unwrap(); + match src { + ResolvedSource::Pull { + image, pull_policy, .. 
+ } => { + assert_eq!(image, "docker.io/library/nginx:1.27"); + assert_eq!(pull_policy, "always"); + } + _ => panic!("expected Pull"), + } + assert_eq!( + m.app.container.image_ref().as_deref(), + Some("docker.io/library/nginx:1.27") + ); + } + + #[test] + fn build_source_resolves_to_build() { + let yaml = r#" +app: + id: bitcoin-ui + name: Bitcoin UI + version: 1.0.0 + container: + build: + context: /opt/archipelago/docker/bitcoin-ui + dockerfile: Dockerfile + tag: archy-bitcoin-ui:local + build_args: + NGINX_VERSION: "1.27" +"#; + let m = AppManifest::parse(yaml).unwrap(); + let src = m.app.container.resolve().unwrap(); + match src { + ResolvedSource::Build(b) => { + assert_eq!(b.context, "/opt/archipelago/docker/bitcoin-ui"); + assert_eq!(b.dockerfile, "Dockerfile"); + assert_eq!(b.tag, "archy-bitcoin-ui:local"); + assert_eq!(b.build_args.get("NGINX_VERSION").unwrap(), "1.27"); + } + _ => panic!("expected Build"), + } + assert_eq!( + m.app.container.image_ref().as_deref(), + Some("archy-bitcoin-ui:local") + ); + } + + #[test] + fn dockerfile_defaults_to_dockerfile() { + let yaml = r#" +app: + id: x + name: X + version: 1.0.0 + container: + build: + context: /tmp + tag: x:local +"#; + let m = AppManifest::parse(yaml).unwrap(); + match m.app.container.resolve().unwrap() { + ResolvedSource::Build(b) => assert_eq!(b.dockerfile, "Dockerfile"), + _ => unreachable!(), + } + } + + #[test] + fn image_and_build_both_set_is_rejected() { + let yaml = r#" +app: + id: x + name: X + version: 1.0.0 + container: + image: foo:latest + build: + context: /tmp + tag: x:local +"#; + let err = AppManifest::parse(yaml).unwrap_err(); + let msg = format!("{err}"); + assert!( + msg.contains("mutually exclusive"), + "unexpected error: {msg}" + ); + } + + #[test] + fn neither_image_nor_build_is_rejected() { + let yaml = r#" +app: + id: x + name: X + version: 1.0.0 + container: {} +"#; + let err = AppManifest::parse(yaml).unwrap_err(); + let msg = format!("{err}"); + assert!( + 
+ msg.contains("either image or build"), + "unexpected error: {msg}" + ); + } + + #[test] + fn empty_image_string_is_rejected() { + let yaml = r#" +app: + id: x + name: X + version: 1.0.0 + container: + image: "" +"#; + let err = AppManifest::parse(yaml).unwrap_err(); + let msg = format!("{err}"); + assert!( + msg.contains("either image or build"), + "unexpected error: {msg}" + ); + } + + #[test] + fn empty_build_context_is_rejected() { + let yaml = r#" +app: + id: x + name: X + version: 1.0.0 + container: + build: + context: "" + tag: x:local +"#; + let err = AppManifest::parse(yaml).unwrap_err(); + let msg = format!("{err}"); + assert!(msg.contains("context"), "unexpected error: {msg}"); + } + + #[test] + fn empty_build_tag_is_rejected() { + let yaml = r#" +app: + id: x + name: X + version: 1.0.0 + container: + build: + context: /tmp + tag: "" +"#; + let err = AppManifest::parse(yaml).unwrap_err(); + let msg = format!("{err}"); + assert!(msg.contains("tag"), "unexpected error: {msg}"); + } + + #[test] + fn existing_pull_only_manifests_still_parse() { + // Backwards-compat smoke: the shape every file in apps/*/manifest.yml uses today. + let yaml = r#" +app: + id: legacy + name: Legacy App + version: 0.1.0 + description: existing shape + container: + image: registry.example.com/legacy:1.2.3 + image_signature: sha256:abc + ports: + - { host: 8080, container: 80 } +"#; + let m = AppManifest::parse(yaml).unwrap(); + assert_eq!(m.app.container.pull_policy, "if-not-present"); + assert!(matches!( + m.app.container.resolve().unwrap(), + ResolvedSource::Pull { .. 
} + )); + } } diff --git a/core/container/src/podman_client.rs b/core/container/src/podman_client.rs index 0b7d1ea6..d81759d0 100644 --- a/core/container/src/podman_client.rs +++ b/core/container/src/podman_client.rs @@ -306,9 +306,16 @@ impl PodmanClient { let cap_add: Vec = manifest.app.security.capabilities.clone(); let cap_drop = vec!["ALL".to_string()]; + let image_ref = manifest.app.container.image_ref().ok_or_else(|| { + anyhow::anyhow!( + "container config for {} has neither a valid image nor build source", + manifest.app.id + ) + })?; + let body = serde_json::json!({ "name": name, - "image": manifest.app.container.image, + "image": image_ref, "portmappings": port_mappings, "mounts": mounts, "env": env_map, diff --git a/docs/STATUS.md b/docs/STATUS.md new file mode 100644 index 00000000..583f69a8 --- /dev/null +++ b/docs/STATUS.md @@ -0,0 +1,191 @@ +# RESUME HERE — Rust orchestrator migration + +Updated: 2026-04-22 (late session, pivoted from laptop to ThinkPad) + +**To resume this work, SSH into the ThinkPad and run `opencode` from `~/Projects/archy/`.** + +## Where we are + +Working through the 11-step plan in [`rust-orchestrator-migration.md`](./rust-orchestrator-migration.md). + +- [x] **Step 1** — `ContainerConfig` schema extended with `build:` (mutually exclusive with `image:`), new `ResolvedSource` enum, `resolve()` method, 10 new tests +- [x] **Step 2** — `ContainerRuntime` trait gained `image_exists` + `build_image` on all three impls (PodmanRuntime, DockerRuntime, AutoRuntime), 4 new argv-construction tests +- [ ] **Step 3** — `ProdContainerOrchestrator` (next up) +- [ ] Steps 4-11 — see design doc + +## Acceptance evidence + +`cargo test -p archipelago-container --lib` passes 25/25 on the ThinkPad (cargo 1.95.0). 
+ +## Uncommitted state + +The 6 modified files in `git status` ARE the Step 1+2 work: + +``` +core/archipelago/src/container/dev_orchestrator.rs +core/container/src/dependency_resolver.rs +core/container/src/lib.rs +core/container/src/manifest.rs +core/container/src/podman_client.rs +core/container/src/runtime.rs +``` + +Plus `docs/rust-orchestrator-migration.md` (the design spec, untracked). +Plus `tests/` (bats harness, uncommitted leftover from prior session). + +## Answered design questions (no need to re-ask) + +1. UI container naming → `archy-` for UIs only; existing bitcoin-knots/lnd/electrumx keep bare names +2. BITCOIN_RPC_AUTH injection → runtime bind-mount of nginx.conf (no build-args, no envsubst) +3. Reconciler interval → 30 seconds +4. Concurrency → per-app `Mutex<()>` in a `DashMap` +5. Bash scripts → delete immediately (first-boot-containers.sh, reconcile-containers.sh, container-specs.sh, + their systemd units) + +## Context: which host is what + +| Host | IP | Role | Dashboard pw | Sudo pw | +|---|---|---|---|---| +| `archy` (this one) | 192.168.1.116 | **Dev ThinkPad** (Lenovo X250, Debian 13, archi-thinkpad), also runs v1.7.42-alpha | archipelago | ThisIsWeb54321@ | +| `archy228` | 192.168.1.228 | Kiosk HP ProDesk, runs v1.7.41-alpha, missing bitcoin-ui + lnd-ui | password123 | archipelago | + +Both are development alpha nodes — **full destructive latitude**, no need to ask before stop/start/rebuild. + +## Next action + +Step 3: create `core/archipelago/src/container/prod_orchestrator.rs` (new file, ~400 LOC). See the design doc section for "Step 3" for the full public surface + acceptance criteria. Write it, add unit tests against a `MockRuntime`, verify `cargo test -p archipelago` builds. + +--- + +# Archipelago — Current State, Plan, and Releases + +Updated: 2026-04-22 + +This is the "pick this up tomorrow" page. One-stop summary of where we are, what the plan is, and what's shipped. 
Detailed plan lives in [`bulletproof-containers.md`](./bulletproof-containers.md). + +--- + +## Current state + +### Fleet status + +All four Gitea mirrors are synced to v1.7.40-alpha: + +| Mirror | Host | Status | +|---|---|---| +| tx1138 | https://git.tx1138.com | ✅ v1.7.40-alpha live | +| gitea-local | http://localhost:3000 | ✅ v1.7.40-alpha live | +| .160 | http://23.182.128.160:3000 | ✅ v1.7.40-alpha live (Gitea recovered via `podman system renumber` — see below) | +| .168 | http://146.59.87.168:3000 | ✅ v1.7.40-alpha live | + +Fleet test nodes: + +| Node | Version | State | +|---|---|---| +| .103 (dev) | 1.7.40 | running, being developed against | +| .116 (this box) | 1.7.40 | healed manually via `systemd-run chmod 755 /opt/archipelago/web-ui` after v1.7.38/39 bug | +| .198 | 1.7.39 → 1.7.40-alpha | healed manually | +| .228 (primary test) | 1.7.40-alpha | healed manually; bitcoin-core + lnd + electrumx running; UI companions currently missing; bitcoin.conf rpcauth patched live | +| .249 (ISO test) | unreachable today | | +| .253 | 1.7.39 → 1.7.40-alpha | healed manually | + +### Known open issues (drives the plan below) + +1. **UI companion containers disappear** on .228 after daemon restarts — no auto-recreate (fixed by v1.7.45 Quadlet migration) +2. **bitcoin.conf rpcauth drifts** from canonical secret → ElectrumX "Daemon connection problem" (fixed by v1.7.43 reconcile::derived) +3. **`host.containers.internal`** resolves to LAN gateway inside containers on some versions (fixed by v1.7.42 containers.conf) +4. **Podman state DB loss** requires manual recovery (fixed by v1.7.44 startup self-heal) +5. **LND "Connect Wallet" info** vanishing after crashes — symptom of the same drift class as #2 +6. **ElectrumX not syncing** on .228 — downstream of #2; will resolve when bitcoin.conf is reconciled + +### Recent field incident (2026-04-22) + +- Shipped v1.7.38 + v1.7.39, both broke nginx fleet-wide because the frontend tarball's root dir was `drwx------` (700). 
Every node that OTA'd got 500 errors on every page. +- Root-cause fix shipped in v1.7.40 (`create-release-manifest.sh` chmod + pre-ship assertion that `tar tvzf | head -1` shows `drwxr-xr-x`). +- .160 Gitea was down all day (502) because its rootless podman's `libpod/bolt_state.db` had vanished. Recovered via clearing `/run/user/$UID/{containers,libpod,podman}` + `podman system renumber`. +- Full failure-mode audit is in [`bulletproof-containers.md`](./bulletproof-containers.md). + +--- + +## Plan + +We're shipping a level-triggered **reconciler + Quadlet** architecture over six incremental releases. Each release closes one failure mode. See [`bulletproof-containers.md`](./bulletproof-containers.md) for the full design, code layout, test harness, chaos matrix, sources. + +### Release roadmap + +| Release | Closes | What lands | Status | +|---|---|---|---| +| **v1.7.41** | FM5 (bad OTA nginx 500) | Post-OTA auto-rollback. New binary probes `https://127.0.0.1/` on boot; if non-200 within 90s, restores `web-ui.bak` + calls `rollback_update()` + restarts | **in flight — deploying to .228 for test** | +| **v1.7.42** | FM4 (`host.containers.internal` wrong) | `/etc/containers/containers.conf` w/ `host_containers_internal_ip = 10.89.0.1`; every container gets `--add-host=host.archipelago:10.89.0.1` | pending | +| **v1.7.43** | FM2 (config drift) | `reconcile::derived::render_bitcoin_conf` — pure fn over canonical secret, rewrites on drift. Same for `lnd.conf` | pending | +| **v1.7.44** | FM6 (podman state loss) | Startup probe detects broken podman state, auto-recovers via `/run/user/$UID/*` clear + `system renumber` | pending | +| **v1.7.45** | FM1 + FM3 (companion orphans) | `archy-bitcoin-ui` → Quadlet `.container` unit in `/etc/containers/systemd/`. 
systemd (not archipelago) owns it | pending | +| **v1.7.46** | — | `archy-lnd-ui` → Quadlet | pending | +| **v1.7.47** | — | `archy-electrs-ui` → Quadlet | pending | +| **v1.7.48+** | all (full daemon refactor) | `core/archipelago/src/reconcile/` module replaces imperative `install.rs` container management. Main app containers become Quadlet too | pending | + +Test harness (bats + Goss + Chaos Toolkit + vmtest) lands scaffold in v1.7.41, first lifecycle tests blocking v1.7.45, full matrix blocking beta tag. + +--- + +## Release history + +### [v1.7.41-alpha](/releases/v1.7.41-alpha/) — IN FLIGHT — 2026-04-22 +**Post-OTA auto-rollback.** After an update lands, the node probes its own web UI through nginx — if the frontend isn't answering cleanly within 90 seconds, the node automatically rolls back to the previous version and restarts. A bad release can no longer leave the fleet stranded on an unreachable node. + +Changes: +- `core/archipelago/src/update.rs`: `PendingVerification` struct, write marker before service restart, `verify_pending_update()` on new binary boot — probes `https://127.0.0.1/`, on fail restores `web-ui.bak` + calls `rollback_update()` + `systemctl restart archipelago` +- `core/archipelago/src/main.rs`: startup task invokes verifier concurrently with server + +### [v1.7.40-alpha](https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/v1.7.40-alpha/) — 2026-04-22 +**Proper fix for the 500 error.** Fixed the v1.7.38/39 tarball-perms bug at its source — staging dir is now explicitly `chmod 755` before tar; `--mode=u=rwX,go=rX` normalizes archive perms; pre-ship assertion aborts release if `tar tvzf | head -1` isn't `drwxr-xr-x`. 
+ +Changes: +- `scripts/create-release-manifest.sh`: pre-tar chmod + tar --mode flag + post-tar verify +- Everything from .38 + .39 still in place (onboarding auto-heal, silent logins, app purge, AIUI in tarball) + +### [v1.7.39-alpha](https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/v1.7.39-alpha/) — 2026-04-22 +**Hotfix attempt** for v1.7.38's nginx 500 (didn't fully work — still shipped broken tarball perms). Added startup self-heal chmod in `main.rs` and post-extract chmod in `update.rs` OTA applier. + +### [v1.7.38-alpha](https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/v1.7.38-alpha/) — 2026-04-22 +**Onboarding auto-heal + silent logins + App Store trim.** + +Changes: +- `auth.rs`: `is_onboarding_complete()` auto-heals from `setup_complete` + `password_hash` (prevents clear-cache → onboarding wizard bug) +- `useOnboarding`: tri-state — backend-unreachable no longer defaults to `/onboarding/intro` +- Login sounds gated by `isFirstInstallPhase()` — silent after onboarding, typing sounds unaffected +- Removed FIPS app, Nostr Relay, Nostr VPN, Routstr, Penpot from catalog + Rust + docker + icons +- Deleted 15 image versions from tx1138, .168, gitea-local registries +- AIUI baked into release tarball via `demo/aiui/` +- `prebuild` hook syncs `app-catalog/catalog.json` → `public/catalog.json` + +(Shipped with tarball-perms bug; fleet had to be healed before v1.7.40.) + +### [v1.7.37-alpha](https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/v1.7.37-alpha/) — 2026-04-22 +**Bitcoin Core install fixes + dynamic node UI + full-archive default.** + +- Bitcoin Core passes explicit `-rpcbind/-rpcallowip/etc.` CLI args so vanilla image exposes RPC +- Split `bitcoin-core` from `bitcoin-knots` in backend `AppMetadata` +- bitcoin-ui auto-detects Core vs. 
Knots from subversion, swaps branding at runtime +- Storage (Full Archive · X GB / Pruned) indicator on dashboard +- Node Settings modal shows real values (network, storage, txindex, ZMQ, RPC port) +- Pull fallback to `docker.io` when no mirror carries the image +- Removed `prune=550` hardcode — full archive default + +--- + +## Key docs + +- [`bulletproof-containers.md`](./bulletproof-containers.md) — full reconcile architecture, code layout, test matrix, chaos scenarios, sources +- [`BETA-RELEASE-CHECKLIST.md`](./BETA-RELEASE-CHECKLIST.md) — existing beta checklist +- [`BETA-ISSUES-20260328.md`](./BETA-ISSUES-20260328.md) — prior beta-blocker tracking +- [`hotfix-process.md`](./hotfix-process.md) — release workflow +- [`architecture.md`](./architecture.md) — system architecture overview + +--- + +## How to resume + +1. Check fleet mirrors are all live: `curl -sS https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/manifest.json | jq .version` +2. Read [`bulletproof-containers.md`](./bulletproof-containers.md) for the current plan +3. Check task list (`/list` or via Claude Code) for the in-flight release +4. Latest in-flight work: v1.7.41 deploying to .228 for test; will ship to all 4 mirrors once verified diff --git a/docs/rust-orchestrator-migration.md b/docs/rust-orchestrator-migration.md new file mode 100644 index 00000000..1508c172 --- /dev/null +++ b/docs/rust-orchestrator-migration.md @@ -0,0 +1,522 @@ +# Rust Orchestrator Migration — Design Doc + +Status: **DRAFT — pending user approval** +Author: OpenCode session, 2026-04-22 +Supersedes planning in `docs/bulletproof-containers.md` v1.7.43 slot + +## Problem statement + +Today, the archipelago backend has **no production container orchestrator**. 
Production containers (bitcoin-knots, lnd, electrumx, btcpay, filebrowser, and the three custom UIs archy-bitcoin-ui / archy-electrs-ui / archy-lnd-ui) are installed by **bash scripts** at first boot (`scripts/first-boot-containers.sh`) and optionally reconciled by another bash script (`scripts/reconcile-containers.sh`) that is **not enabled by default**. The existing `DevContainerOrchestrator` (`core/archipelago/src/container/dev_orchestrator.rs`) is hardcoded to append `-dev` suffixes and gated behind `config.dev_mode`, so it has never managed a production container. + +This design migrates production container management into Rust, under a single orchestrator that owns install, start, stop, restart, upgrade, uninstall, health, and self-healing for every container. The three custom UI containers are the first-class test fixture: they exercise the "build image from local Dockerfile" path (which today doesn't exist in the manifest schema) and their lifecycle was the original failure class the user asked to fix. + +## Non-goals + +- Backwards compatibility with `first-boot-containers.sh`: we **delete** it and its systemd unit after verifying Rust parity. +- Backwards compatibility with the existing `package-install` RPC’s podman shell-outs: those get rewritten to call the orchestrator. +- Registry signature verification: `image_signature` stays optional. Sigstore/cosign integration is out of scope. +- Network isolation improvements: existing SecurityPolicy fields stay as-is. +- Dev mode removal: `DevContainerOrchestrator` keeps existing behavior for local development; prod code path is separate. + +## Scope of this migration + +In scope: +1. Extend `ContainerConfig` schema with a `source:` variant supporting `{type: build, context, dockerfile, tag}` alongside `{type: pull, image, pull_policy}`. +2. Extend `ContainerRuntime` trait + `PodmanRuntime` impl with `build_image(...)` and `image_exists(...)`. +3. 
Introduce `ProdContainerOrchestrator` (new type) with identical public surface to `DevContainerOrchestrator` but **no `-dev` suffix**, **no port offset**, **no data-path rewriting**, **no bitcoin_simulator gate**. It is wired into `RpcHandler::orchestrator` in prod (currently `None`). +4. Add `AdoptionScan` at orchestrator startup: enumerate `podman ps -a`, match by container name against declared manifests, adopt into orchestrator state without recreating. +5. Add `BootReconciler` task spawned from `main.rs` (replacing the commented-out `run_boot_reconciliation` hook). Walks the manifest set on startup and periodically, ensures each is present-and-running, builds/pulls/creates anything missing, logs failures non-silently. +6. Ship three manifests in the repo: `apps/bitcoin-ui/manifest.yml`, `apps/electrs-ui/manifest.yml`, `apps/lnd-ui/manifest.yml`. They use the new `source: build` variant pointing at `/opt/archipelago/docker//`. +7. Delete `scripts/first-boot-containers.sh`, `scripts/reconcile-containers.sh`, `scripts/container-specs.sh`, `image-recipe/configs/archipelago-first-boot-containers.service`, `image-recipe/configs/archipelago-reconcile.service`. Remove enablement from ISO builder. + +Out of scope this migration (tracked separately): +- Migrating btcpay / mempool / fedimint multi-container stacks to manifests (they currently live in `core/archipelago/src/api/rpc/package/stacks.rs`). They keep working via `package-install` RPC. Phase 2. +- Rewriting the 26 existing `apps/*/manifest.yml` files to use the new `source:` schema. They stay on `image:` for now; the schema is **additive and backwards-compatible**. +- Re-enabling signature verification; stays todo. + +## Data model changes + +### 1. 
`ContainerConfig` gets a `source` enum + +File: `core/container/src/manifest.rs:58` + +**Before:** +```rust +pub struct ContainerConfig { + pub image: String, + pub image_signature: Option, + pub pull_policy: String, +} +``` + +**After:** +```rust +pub struct ContainerConfig { + // Legacy shorthand (backwards compatible with all 26 existing manifests): + // if `source` is absent, `image` + `pull_policy` are interpreted as + // `source: { type: pull, image, pull_policy }`. + #[serde(default)] + pub image: String, + #[serde(default)] + pub image_signature: Option, + #[serde(default = "default_pull_policy")] + pub pull_policy: String, + + // New: explicit source. If present, overrides the legacy shorthand. + #[serde(default)] + pub source: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "lowercase")] +pub enum ContainerSource { + /// Pull an image from a registry. + Pull { + image: String, + #[serde(default)] + image_signature: Option, + #[serde(default = "default_pull_policy")] + pull_policy: String, + }, + /// Build an image from a local Dockerfile. + Build { + /// Filesystem path to build context, absolute or relative to manifest dir. + context: String, + /// Dockerfile path relative to context. Defaults to "Dockerfile". + #[serde(default = "default_dockerfile")] + dockerfile: String, + /// Tag to assign to the built image, e.g. "localhost/bitcoin-ui:local". + tag: String, + /// `--build-arg` key=value pairs. + #[serde(default)] + build_args: HashMap, + /// If true, rebuild on every reconcile. If false, only build when tag is missing. + #[serde(default)] + always_rebuild: bool, + }, +} +``` + +Validation in `AppManifest::validate`: +- If `source` is absent AND `image` is empty → error (unchanged rule just rephrased). +- If `source` is present, legacy `image` field is ignored with a warning. +- `Build::context` must resolve to an existing directory that contains `dockerfile`. 
+ +Tests to add: +- Parse a legacy manifest → works, produces `ContainerSource::Pull` at resolution time. +- Parse a `source: { type: build, ... }` manifest → works. +- Parse a manifest with both legacy `image:` and `source:` → warning logged, `source:` wins. +- Parse a manifest with neither → rejected. + +### 2. `ContainerRuntime` trait gets `build_image` + `image_exists` + +File: `core/container/src/runtime.rs:10` + +```rust +#[async_trait] +pub trait ContainerRuntime: Send + Sync { + // existing methods unchanged... + async fn pull_image(&self, image: &str, signature: Option<&str>) -> Result<()>; + async fn create_container(...) -> Result<()>; + // ... + + // NEW: + /// Build an image from a local Dockerfile. Returns Ok(()) if the image now + /// exists under the given tag (whether newly built or already present and + /// `force=false`). Returns Err if the build failed. + async fn build_image( + &self, + context: &Path, + dockerfile: &str, + tag: &str, + build_args: &HashMap, + force: bool, + ) -> Result<()>; + + /// Check if an image exists in the local image store. + async fn image_exists(&self, tag: &str) -> Result; +} +``` + +`PodmanRuntime::build_image` shells out: +``` +podman build --tag \ + --file / \ + --build-arg KEY=VALUE ... \ + +``` + +Force-rebuild semantics: if `force=false`, skip when `image_exists(tag) == true`. If `force=true`, always build (podman's own layer cache handles the fast path). + +Tests: +- `build_image` happy path on a minimal Dockerfile (using a throwaway context in tmpdir). +- `build_image` failure path (nonsense Dockerfile) → Err. +- `image_exists` returns false for nonexistent tag. +- `image_exists` returns true after `build_image`. + +### 3. 
Manifest resolution: `ContainerConfig::resolve(manifest_dir) -> ResolvedSource` + +New method that turns the raw manifest into something the orchestrator can act on: + +```rust +pub enum ResolvedSource { + Pull { image: String, signature: Option<String>, pull_policy: PullPolicy }, + Build { context: PathBuf, dockerfile: String, tag: String, build_args: HashMap<String, String>, always_rebuild: bool }, +} + +impl ContainerConfig { + pub fn resolve(&self, manifest_dir: &Path) -> Result<ResolvedSource> { + match &self.source { + Some(ContainerSource::Pull { image, image_signature, pull_policy }) => Ok(ResolvedSource::Pull { ... }), + Some(ContainerSource::Build { context, dockerfile, tag, build_args, always_rebuild }) => { + let abs_context = if Path::new(context).is_absolute() { + PathBuf::from(context) + } else { + manifest_dir.join(context) + }; + Ok(ResolvedSource::Build { context: abs_context, ... }) + } + None => { + // Legacy shorthand + if self.image.is_empty() { + return Err(...); + } + Ok(ResolvedSource::Pull { image: self.image.clone(), ... }) + } + } + } +} +``` + +## Runtime architecture + +### `ProdContainerOrchestrator` + +New file: `core/archipelago/src/container/prod_orchestrator.rs` + +```rust +pub struct ProdContainerOrchestrator { + runtime: Arc<dyn ContainerRuntime>, + manifests_dir: PathBuf, // e.g. /opt/archipelago/apps + data_dir: PathBuf, // e.g. /var/lib/archipelago + state: Arc<RwLock<OrchestratorState>>, + config: Config, +} + +struct OrchestratorState { + /// app_id → known manifest (loaded from disk at startup, refreshed on reconcile) + manifests: HashMap<String, AppManifest>, + /// app_id → current known state (from adoption scan or our own ops) + containers: HashMap<String, ContainerState>, + /// app_id → last install/health/build timestamp + last_reconciled: HashMap, +} +``` + +Public surface mirrors `DevContainerOrchestrator` but **container name = `archy-<app_id>` for UI apps, `<app_id>` for backends, matching existing .116 naming**: + +```rust +impl ProdContainerOrchestrator { + pub async fn new(config: Config) -> Result<Self> { ... 
} + pub async fn load_manifests(&self) -> Result<()> { /* walks manifests_dir */ } + pub async fn adopt_existing(&self) -> Result<AdoptionReport> { /* scans podman ps -a */ } + pub async fn reconcile_all(&self) -> Result<ReconcileReport> { /* ensures every manifest has a running container */ } + pub async fn install(&self, app_id: &str) -> Result<()> { /* build-or-pull + create + start */ } + pub async fn start(&self, app_id: &str) -> Result<()> { ... } + pub async fn stop(&self, app_id: &str) -> Result<()> { ... } + pub async fn restart(&self, app_id: &str) -> Result<()> { ... } + pub async fn remove(&self, app_id: &str, preserve_data: bool) -> Result<()> { ... } + pub async fn upgrade(&self, app_id: &str) -> Result<()> { /* re-read manifest, rebuild/pull, recreate */ } + pub async fn status(&self, app_id: &str) -> Result { ... } + pub async fn list(&self) -> Result> { ... } + pub async fn logs(&self, app_id: &str, lines: u32) -> Result> { ... } + pub async fn health(&self, app_id: &str) -> Result { ... } +} +``` + +**Container naming rule** (matches `.116` existing fixture so adoption works): +- If the manifest has `extensions["container_name"]` → use that verbatim. +- Else if the app_id starts with `bitcoin-ui` / `electrs-ui` / `lnd-ui` → `archy-<app_id>`. +- Else → `<app_id>`. + +This is codified and tested; no ad-hoc naming in the codebase. + +### `AdoptionScan` + +On orchestrator startup, before any reconcile: + +```rust +async fn adopt_existing(&self) -> Result<AdoptionReport> { + let all = self.runtime.list_containers().await?; // podman ps -a + let mut report = AdoptionReport::default(); + for c in all { + // For each manifest we have loaded, check if the expected container name matches + for (app_id, manifest) in self.state.read().await.manifests.iter() { + let expected_name = compute_container_name(manifest); + if c.name == expected_name { + // This container is ours. Record its state. 
+ self.state.write().await.containers.insert(app_id.clone(), c.state.clone()); + report.adopted.push(app_id.clone()); + } + } + } + Ok(report) +} +``` + +No recreate. No touching data volumes. Just "we now know this container belongs to app X and its current state is Y". + +### `BootReconciler` + +New file: `core/archipelago/src/container/boot_reconciler.rs` + +```rust +pub struct BootReconciler { + orchestrator: Arc<ProdContainerOrchestrator>, + interval: Duration, // e.g. 5 minutes + shutdown: CancellationToken, +} + +impl BootReconciler { + pub async fn run_forever(self) { + // Initial reconcile immediately (after adoption). + let _ = self.orchestrator.reconcile_all().await; + loop { + tokio::select! { + _ = tokio::time::sleep(self.interval) => { + let _ = self.orchestrator.reconcile_all().await; + } + _ = self.shutdown.cancelled() => break, + } + } + } +} +``` + +`reconcile_all`: +```rust +async fn reconcile_all(&self) -> Result<ReconcileReport> { + let manifests: Vec<_> = self.state.read().await.manifests.values().cloned().collect(); + let mut report = ReconcileReport::default(); + for manifest in manifests { + let app_id = &manifest.app.id; + match self.ensure_running(&manifest).await { + Ok(action) => report.record(app_id, action), + Err(e) => { + tracing::error!(app_id, error = %e, "Reconcile failed for app"); + report.failures.push((app_id.clone(), e.to_string())); + } + } + } + if !report.failures.is_empty() { + // Surface via WebSocket so the UI can show a banner. 
+ self.notify_failures(&report).await; + } + Ok(report) +} + +async fn ensure_running(&self, manifest: &AppManifest) -> Result<ReconcileAction> { + let name = compute_container_name(manifest); + match self.runtime.get_container_status(&name).await { + Ok(status) if matches!(status.state, ContainerState::Running) => Ok(ReconcileAction::NoOp), + Ok(status) if matches!(status.state, ContainerState::Exited | ContainerState::Stopped) => { + self.runtime.start_container(&name).await?; + Ok(ReconcileAction::Started) + } + Ok(_) => Ok(ReconcileAction::NoOp), // Created / Paused — leave alone + Err(_) => { + // Container doesn't exist. Install it. + self.install_fresh(manifest).await?; + Ok(ReconcileAction::Installed) + } + } +} + +async fn install_fresh(&self, manifest: &AppManifest) -> Result<()> { + let manifest_dir = ...; // directory of manifest.yml + let resolved = manifest.app.container.resolve(manifest_dir)?; + match resolved { + ResolvedSource::Pull { image, signature, .. } => { + self.runtime.pull_image(&image, signature.as_deref()).await?; + } + ResolvedSource::Build { context, dockerfile, tag, build_args, always_rebuild } => { + if always_rebuild || !self.runtime.image_exists(&tag).await? { + self.runtime.build_image(&context, &dockerfile, &tag, &build_args, always_rebuild).await?; + } + } + } + self.runtime.create_container(manifest, &compute_container_name(manifest), 0).await?; + self.runtime.start_container(&compute_container_name(manifest)).await?; + Ok(()) +} +``` + +### Wire-up in `main.rs` + +File: `core/archipelago/src/main.rs` + +Replace the commented-out `run_boot_reconciliation` block (`main.rs:107-111`) with: + +```rust +// Load manifests + adopt existing + start reconciler loop. 
+let orchestrator = Arc::new(ProdContainerOrchestrator::new(config.clone()).await?); +orchestrator.load_manifests().await?; +let adoption = orchestrator.adopt_existing().await?; +tracing::info!(adopted = adoption.adopted.len(), "Container adoption complete"); +let reconciler = BootReconciler::new(orchestrator.clone(), Duration::from_secs(300), shutdown_token.clone()); +tokio::spawn(reconciler.run_forever()); +``` + +`RpcHandler` gets the orchestrator regardless of `dev_mode`: +```rust +// core/archipelago/src/api/rpc/mod.rs:83 +let orchestrator: Option<Arc<dyn ContainerOrchestrator>> = if config.dev_mode { + Some(Arc::new(DevContainerOrchestrator::new(config.clone()).await?)) +} else { + Some(Arc::new(prod_orch.clone())) +}; +``` + +Where `ContainerOrchestrator` becomes a trait implemented by both `DevContainerOrchestrator` and `ProdContainerOrchestrator`. + +### First-boot replacement + +There is no separate first-boot code. The reconciler handles it: when the archipelago service starts on a fresh node, `adopt_existing` finds nothing, `reconcile_all` sees no running container for any manifest, and installs each one in dependency order (bitcoin-core first, then everything else). On subsequent boots, adoption finds existing containers and reconcile mostly no-ops. 
+ +**Removes completely**: +- `/var/lib/archipelago/.first-boot-containers-done` marker (no longer needed) +- `/var/lib/archipelago/.unbundled` handling in first-boot script (becomes a config flag in archipelago.conf if we still need it) +- `scripts/first-boot-containers.sh` (1392 lines) +- `scripts/reconcile-containers.sh` +- `scripts/container-specs.sh` +- `image-recipe/configs/archipelago-first-boot-containers.service` +- `image-recipe/configs/archipelago-reconcile.service` +- Related enable/disable in ISO builder + +## The three UI manifests + +Example: `apps/bitcoin-ui/manifest.yml` + +```yaml +app: + id: bitcoin-ui + name: Bitcoin Knots UI + version: 1.0.0 + description: Custom Archipelago UI for Bitcoin Knots + container: + source: + type: build + context: /opt/archipelago/docker/bitcoin-ui + dockerfile: Dockerfile + tag: localhost/bitcoin-ui:local + build_args: + BITCOIN_RPC_AUTH: ${BITCOIN_RPC_AUTH} # injected from host-ip.env or secrets + always_rebuild: false + dependencies: + - app_id: bitcoin-core + resources: + memory_limit: 128Mi + security: + network_policy: host + readonly_root: false + ports: [] # host networking + volumes: [] + environment: [] + health_check: + type: http + endpoint: http://127.0.0.1:8334 + path: / + interval: 30s + extensions: + container_name: archy-bitcoin-ui +``` + +The `extensions.container_name` is how we match the existing running container on .116 for adoption. Same pattern for `electrs-ui` (container_name: `archy-electrs-ui`, port probe 50002) and `lnd-ui` (container_name: `archy-lnd-ui`, port probe 8081). + +**BITCOIN_RPC_AUTH injection**: today `first-boot-containers.sh` `sed`s this value into `nginx.conf` (destructively). In the new world, it's a `--build-arg` — the Dockerfile gets `ARG BITCOIN_RPC_AUTH` and templates `nginx.conf` from a template file. Fixes the "sed destroys the source" bug from the mapping. 
+ +## Migration path (.116 and .228 specifically) + +### .116 (all 3 UIs currently running, adopted from bash install) +1. Ship the new archipelago binary with the prod orchestrator. +2. On archipelago restart, `adopt_existing` scans `podman ps -a`, sees `archy-bitcoin-ui`, `archy-electrs-ui`, `archy-lnd-ui` already running. +3. Matches them against the new manifests by `extensions.container_name`. +4. Records state. Reconciler sees them Running → NoOp. +5. Manual test: `podman stop archy-bitcoin-ui` → within 5 minutes, reconciler starts it again. `podman rm -f archy-bitcoin-ui` → reconciler rebuilds from `/opt/archipelago/docker/bitcoin-ui/Dockerfile` and re-creates. + +### .228 (no bitcoin-ui, no lnd-ui, has electrs-ui from bash first-boot) +1. Ship same binary. +2. Adoption finds only `archy-electrs-ui`. +3. Reconciler sees `bitcoin-ui` and `lnd-ui` missing → triggers `install_fresh` for each. +4. For `bitcoin-ui`: `image_exists("localhost/bitcoin-ui:local")` → false. `build_image(/opt/archipelago/docker/bitcoin-ui, Dockerfile, localhost/bitcoin-ui:local, {BITCOIN_RPC_AUTH: ...}, force=false)`. Then create + start. +5. Same for `lnd-ui`. +6. Manual test: HTTP probe ports 8334 and 8081 return 200 within ~5 minutes of service restart. 
+ +## Test plan + +Unit tests (Rust, in-process): +- `manifest::tests::legacy_image_parses_as_pull_source` +- `manifest::tests::explicit_pull_source_parses` +- `manifest::tests::explicit_build_source_parses` +- `manifest::tests::source_build_requires_tag` +- `runtime::tests::build_image_happy_path` (uses a minimal Dockerfile in `tempfile::TempDir`) +- `runtime::tests::build_image_failure` +- `runtime::tests::image_exists_roundtrip` +- `prod_orchestrator::tests::install_fresh_pull` +- `prod_orchestrator::tests::install_fresh_build` +- `prod_orchestrator::tests::adopt_existing_matches_by_name` +- `prod_orchestrator::tests::reconcile_starts_exited_container` (with a mock runtime) +- `prod_orchestrator::tests::reconcile_installs_missing_container` +- `prod_orchestrator::tests::compute_container_name_ui_apps_prefixed` +- `prod_orchestrator::tests::compute_container_name_backend_apps_bare` + +Integration tests (require real podman, run on archy node): +- Fresh-install path: wipe containers + images, start archipelago, verify all 3 UIs up within 60s. +- Adoption path: containers pre-running, start archipelago, verify no recreate (compare container IDs before/after). +- Reconcile-start path: `podman stop archy-bitcoin-ui`, wait, verify restart. +- Reconcile-recreate path: `podman rm -f archy-bitcoin-ui`, wait, verify rebuild+recreate. +- Rebuild-on-Dockerfile-change path: edit Dockerfile, call `upgrade` RPC, verify image rebuilt and container recreated. + +Chaos matrix (bash + Playwright, the original goal): +- For each UI (bitcoin-ui, electrs-ui, lnd-ui) × each event (stop, start, restart, remove+reconcile, SIGKILL, archipelago-service-restart, host-reboot) × each node (.116, .228): assert HTTP 200 + page-title marker returns within 60s of event. 
+ +## Risks + mitigations + +| Risk | Mitigation | +|------|------------| +| Adoption mismatches and re-creates a container we already had, losing its data | Adoption matches by exact name; `install_fresh` only runs when `get_container_status` returns Err (container doesn't exist), not when it returns Stopped/Exited. Unit tested. | +| Build loop: reconciler rebuilds on every tick | `always_rebuild: false` + `image_exists` check. Only rebuilds when image tag is missing OR `upgrade` RPC is called. | +| Reconciler runs while user is mid-install via the UI | Orchestrator state has per-app mutex; reconcile waits. Install path takes the same mutex. | +| Auto-rollback (v1.7.41) fires during testing | `reconcile_all` is spawned AFTER server is healthy and responding; if it fails, archipelago the service still passes verification. Individual container failures are logged, not fatal. | +| Dependency ordering: bitcoin-ui needs BITCOIN_RPC_AUTH which is generated at first boot | Reconciler handles dependency order by reading `manifest.app.dependencies` and installing in topological order. If the dep doesn't exist yet, skip and retry next tick. | +| Moving `/opt/archipelago/docker/` content breaks the build context | That path is stable per the ISO builder at `image-recipe/build-auto-installer-iso.sh:1671-1685`. Manifests reference it absolutely. | +| Dropping bash scripts breaks existing ISOs in the field | Target release cycle is disposable alpha nodes. For existing alpha nodes (.116, .228) we hot-swap the binary and let the reconciler take over, then the next reboot doesn't need the systemd units; we mask them manually. | +| User wants to downgrade to v1.7.42 | Auto-rollback mechanism already handles that; binary swap is reversible. The removed bash scripts are still in git history. | + +## Implementation order + +1. **Schema first**: extend `ContainerConfig` + `ContainerSource` + `resolve()` + validation + unit tests. ~100 LOC Rust + ~80 LOC tests. +2. 
**Runtime**: `build_image` + `image_exists` in trait, `PodmanRuntime`, `DockerRuntime` (can stub), `AutoRuntime`. ~150 LOC + tests with throwaway tempdir Dockerfile. +3. **ProdContainerOrchestrator**: new type with `install/start/stop/restart/remove/status/list/logs/health/adopt_existing/reconcile_all/ensure_running/install_fresh`. ~400 LOC + unit tests with mocked runtime. +4. **ContainerOrchestrator trait**: abstract over Dev and Prod so `RpcHandler` is polymorphic. ~50 LOC refactor. +5. **BootReconciler**: task spawner with loop + cancellation. ~80 LOC + unit tests. +6. **main.rs wire-up**: adopt + spawn reconciler. ~20 LOC. +7. **3 UI manifests + Dockerfile BITCOIN_RPC_AUTH refactor** (use ARG + template file, not sed). ~60 lines of YAML + ~20 lines of Dockerfile. +8. **Remove bash scripts + services**: `git rm` + ISO-builder edits + changelog. +9. **Live test on .228**: hot-swap binary, expect 3 UIs to come up within 60s of service restart. +10. **Live test on .116**: hot-swap binary, expect zero container recreation + adoption-confirmed log lines. +11. **Chaos matrix** on both nodes. + +Each step is a separate commit. Steps 1–6 are independent enough that they can each have their own test gate. + +## Estimated total + +~1000 LOC Rust added, ~1500 lines bash deleted, ~50 LOC Rust deleted. 8–12 hours of focused work across multiple sessions. No release pressure per user decision. + +## Open questions for user + +1. **Container naming**: I propose `archy-<app_id>` for UIs, `<app_id>` for backends (matches current .116 fixture). Alternative: unify on `archy-<app_id>` for everything and migrate existing backends by renaming at adoption. Which? +2. **BITCOIN_RPC_AUTH injection**: the build-arg approach rebuilds the UI image when the auth value changes. Fine during normal operation (rare). Alternative: mount the nginx.conf at runtime as a volume, never bake auth into the image. Which? +3. **Reconciler interval**: 5 minutes. 
Too slow for a dropped container (user sees a broken UI for up to 5 min). Alternative: 30 seconds + more expensive `podman ps` calls. Which? +4. **Concurrent reconcile + user install**: per-app mutex is the simple answer. Alternative: a single orchestrator-wide mutex (simpler, slower). Which? +5. **Delete bash scripts in this migration, or keep them around as fallback?** I recommend delete (single source of truth), but deleting `first-boot-containers.sh` is a one-way door in terms of field recovery.