fix: bulletproof first-boot container creation and install reliability

Remove the Bitcoin RPC 60-second gate that blocked 13+ dependent containers
(mempool, electrumx, btcpay, lnd, fedimint) from being created on first boot.
Containers now always get created and auto-restart via health monitor once
Bitcoin becomes responsive — the designed recovery path.

Additional hardening:
- Validate archy-net creation with retry (silent failure broke DNS)
- Verify critical images are loaded, re-load from tarballs if missing
- Create SearXNG settings.yml before container start (was missing)
- Run reconciler automatically after first-boot failures
- Add load-images as explicit systemd dependency with 900s timeout
- Propagate config write errors in install.rs (bitcoin.conf, lnd.conf)
- FileBrowser password change: retry loop (6 attempts) + 0o600 perms
- Post-start verification: detect containers that exit immediately
- Add 2s dependency waits between container starts

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dorian
2026-03-31 18:31:00 +01:00
parent a896ecd431
commit 08f7f58a9d
5 changed files with 239 additions and 68 deletions

View File

@@ -412,11 +412,13 @@ load_spec_searxng() {
SPEC_IMAGE="${SEARXNG_IMAGE}"
SPEC_PORTS="8888:8080"
SPEC_MEMORY="$(mem_limit searxng)"
SPEC_VOLUMES="/var/lib/archipelago/searxng:/etc/searxng"
SPEC_HEALTH_CMD="curl -sf http://localhost:8080/ || exit 1"
SPEC_READONLY="true"
SPEC_TMPFS="/tmp:rw,noexec,nosuid,size=256m /run:rw,noexec,nosuid,size=64m /etc/searxng:rw,noexec,nosuid,size=16m"
SPEC_TMPFS="/tmp:rw,noexec,nosuid,size=256m /run:rw,noexec,nosuid,size=64m"
SPEC_TIER="3"
SPEC_CAPS=""
SPEC_DATA_DIR="/var/lib/archipelago/searxng"
}
load_spec_onlyoffice() {

View File

@@ -233,8 +233,19 @@ chmod 700 /run/user/1000
runuser -u archipelago -- env XDG_RUNTIME_DIR=/run/user/1000 \
systemctl --user start podman.socket 2>/dev/null || true
# Ensure network exists (matches deploy)
# Ensure archy-net exists — critical for inter-container DNS (mempool→bitcoin, etc.)
# The exit status of `network create` is ignored on purpose; the
# `network exists` probe below is what decides whether the network is usable.
$DOCKER network create archy-net 2>/dev/null || true
if ! $DOCKER network exists archy-net 2>/dev/null; then
  log "WARNING: archy-net creation failed, retrying in 5s..."
  sleep 5
  # Keep stderr in $LOG for diagnosis, but never let a failed retry end the
  # script on its own (e.g. under `set -e`): the FATAL lines below must still
  # be written before exiting.
  $DOCKER network create archy-net 2>>"$LOG" || true
  if ! $DOCKER network exists archy-net 2>/dev/null; then
    log "FATAL: Cannot create archy-net — inter-container DNS will not work."
    log " All containers requiring archy-net will fail. Exiting."
    exit 1
  fi
fi
log "archy-net network ready"
# Rootless podman UID mapping: fix data dir ownership so container processes
# can write. Rootless podman maps container UIDs via subuid (container UID N
@@ -299,6 +310,43 @@ mem_limit() {
esac
}
# ── Verify critical images are loaded ──────────────────────────────────
# archipelago-load-images.service should have loaded these from tarballs.
# If any are missing (corrupt tarball, disk full, etc.), try re-loading.
log "Verifying container images..."

# List local images once and match every expected image against that cached
# text. This replaces one `$DOCKER images` call per image, and matching a
# here-string avoids a `producer | grep -q` pipeline, whose status can be
# spoiled by SIGPIPE when pipefail is enabled. A failed listing leaves the
# cache empty, so every image is reported missing and a re-load is attempted.
LOADED_IMAGES="$($DOCKER images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null || true)"

MISSING_IMAGES=""
for img_var in BITCOIN_KNOTS_IMAGE MARIADB_IMAGE ELECTRUMX_IMAGE \
  MEMPOOL_BACKEND_IMAGE MEMPOOL_WEB_IMAGE BTCPAY_POSTGRES_IMAGE \
  NBXPLORER_IMAGE BTCPAY_IMAGE LND_IMAGE FEDIMINT_IMAGE \
  FEDIMINT_GATEWAY_IMAGE HOMEASSISTANT_IMAGE GRAFANA_IMAGE \
  UPTIME_KUMA_IMAGE JELLYFIN_IMAGE VAULTWARDEN_IMAGE \
  NEXTCLOUD_IMAGE SEARXNG_IMAGE FILEBROWSER_IMAGE; do
  # `:-` keeps the indirect expansion from aborting under `set -u` when the
  # variable is not defined at all; such entries are simply skipped.
  img="${!img_var:-}"
  if [ -z "$img" ]; then
    continue # Variable not defined in image-versions.sh
  fi
  # Fixed-string substring match against the "repository:tag" listing.
  if ! grep -qF -- "$img" <<<"$LOADED_IMAGES"; then
    MISSING_IMAGES="$MISSING_IMAGES $img_var"
  fi
done

if [ -n "$MISSING_IMAGES" ]; then
  log "WARNING: Missing images:$MISSING_IMAGES"
  log "Attempting to re-load from /opt/archipelago/container-images/..."
  RELOAD_COUNT=0
  for tarfile in /opt/archipelago/container-images/*.tar; do
    # The -f test also covers the no-match case, where the glob stays literal.
    if [ -f "$tarfile" ]; then
      if $DOCKER load -i "$tarfile" 2>>"$LOG"; then
        RELOAD_COUNT=$((RELOAD_COUNT + 1))
      else
        log " Failed to load: $tarfile"
      fi
    fi
  done
  log "Re-loaded $RELOAD_COUNT image tarballs"
else
  log "All critical images verified"
fi
# ── Tier 1: Databases & Core Infrastructure ──────────────────────────────
log "=== Tier 1: Databases & Core Infrastructure ==="
@@ -337,13 +385,16 @@ else
$DOCKER network connect archy-net bitcoin-knots 2>/dev/null || true
log "Bitcoin Knots already running"
fi
# Wait for Bitcoin Knots RPC to be responsive
# Check Bitcoin Knots RPC (informational — containers created regardless)
# Dependent containers use --restart=unless-stopped and the health monitor
# will auto-restart them once Bitcoin becomes responsive.
if wait_for_container "Bitcoin Knots RPC" "$DOCKER exec bitcoin-knots bitcoin-cli -rpcuser='$BITCOIN_RPC_USER' -rpcpassword='$BITCOIN_RPC_PASS' getblockchaininfo" 60; then
BITCOIN_READY=true
log "Bitcoin Knots is ready — dependent containers will proceed"
log "Bitcoin Knots is ready"
else
BITCOIN_READY=false
log "WARNING: Bitcoin Knots NOT ready — skipping dependent containers (electrumx, lnd, mempool, btcpay, fedimint)"
log "Bitcoin Knots not yet responsive (normal during IBD) — creating dependent containers anyway"
log " They will auto-restart via health monitor once Bitcoin is ready"
fi
track_container "bitcoin-knots"
@@ -355,7 +406,8 @@ if ! $DOCKER exec bitcoin-knots bitcoin-cli "-rpcuser=$BITCOIN_RPC_USER" "-rpcpa
fi
# 2. Mempool stack (matches deploy) — depends on Bitcoin
if [ "$BITCOIN_READY" = "true" ]; then
# Note: containers created regardless of BITCOIN_READY — they will restart
# automatically once Bitcoin becomes responsive (--restart=unless-stopped).
if ! $DOCKER ps -a --format '{{.Names}}' 2>/dev/null | grep -qE 'archy-mempool-db|mysql-mempool'; then
log "Creating mysql-mempool..."
mkdir -p /var/lib/archipelago/mysql-mempool
@@ -624,9 +676,7 @@ if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -q fedimint-gateway; th
fi
track_container "fedimint-gateway"
else
log "SKIPPED: mempool stack, electrumx, btcpay stack, lnd, fedimint (Bitcoin not ready)"
fi # end BITCOIN_READY
# (Bitcoin-dependent containers created above regardless of BITCOIN_READY)
# ── Tier 3: Applications (independent — always attempt) ───────────────────
log "=== Tier 3: Applications ==="
@@ -742,12 +792,33 @@ fi
track_container "nextcloud"
# Create the SearXNG container unless one is already running.
if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -q searxng; then
  log "Creating SearXNG..."
  # SearXNG requires settings.yml or it exits immediately
  SEARXNG_CONF="/var/lib/archipelago/searxng"
  if [ ! -f "$SEARXNG_CONF/settings.yml" ]; then
    mkdir -p "$SEARXNG_CONF"
    # Never write an empty secret_key: fall back to /dev/urandom when openssl
    # is unavailable or fails.
    SEARX_SECRET=$(openssl rand -hex 32 2>/dev/null || true)
    if [ -z "$SEARX_SECRET" ]; then
      SEARX_SECRET=$(head -c 32 /dev/urandom | od -An -tx1 | tr -d ' \n')
    fi
    # Heredoc delimiter is unquoted so $SEARX_SECRET is expanded. Options must
    # be nested (indented) under their general/server/ui section keys.
    cat > "$SEARXNG_CONF/settings.yml" <<SEARXCFG
use_default_settings: true
general:
  instance_name: Archipelago Search
server:
  secret_key: "$SEARX_SECRET"
  bind_address: "0.0.0.0"
  port: 8080
  limiter: false
ui:
  default_theme: simple
SEARXCFG
    # Best-effort ownership fix; report a failure instead of hiding it, and do
    # not let it abort the rest of the script.
    chown -R 100000:100000 "$SEARXNG_CONF" 2>/dev/null \
      || log " WARNING: could not chown $SEARXNG_CONF to 100000:100000"
    log " Created SearXNG settings.yml"
  fi
  # The guard above only lists running containers. An exited container that
  # still owns the name "searxng" would make `run --name searxng` fail with a
  # name conflict (hidden by `|| true`), so drop any such leftover first.
  $DOCKER rm -f searxng >/dev/null 2>&1 || true
  $DOCKER run -d --name searxng --restart unless-stopped \
    --health-cmd="curl -sf http://localhost:8080/ || exit 1" --health-interval=120s --health-timeout=5s --health-retries=3 \
    --memory="$(mem_limit searxng)" \
    --cap-drop ALL --security-opt no-new-privileges:true \
    --read-only --tmpfs /tmp:rw,noexec,nosuid,size=256m --tmpfs /run:rw,noexec,nosuid,size=64m \
    -p 8888:8080 \
    -v /var/lib/archipelago/searxng:/etc/searxng \
    "${SEARXNG_IMAGE}" 2>>"$LOG" || true
fi
track_container "searxng"
@@ -979,8 +1050,29 @@ elif [ -x "/opt/archipelago/scripts/container-doctor.sh" ]; then
bash "/opt/archipelago/scripts/container-doctor.sh" --local 2>&1 | tee -a "$LOG"
fi
# 12. Final summary
# 11b. If any containers failed, run the reconciler to attempt recovery
FAILED=$((TOTAL - SUCCESS))
if [ "$FAILED" -gt 0 ]; then
  log "Attempting to recover $FAILED failed container(s) via reconciler..."
  # Prefer the copy next to this script, then the installed location.
  RECONCILE_SCRIPT=""
  if [ -x "$SCRIPT_DIR/reconcile-containers.sh" ]; then
    RECONCILE_SCRIPT="$SCRIPT_DIR/reconcile-containers.sh"
  elif [ -x "/opt/archipelago/scripts/reconcile-containers.sh" ]; then
    RECONCILE_SCRIPT="/opt/archipelago/scripts/reconcile-containers.sh"
  fi
  if [ -n "$RECONCILE_SCRIPT" ]; then
    runuser -u archipelago -- bash "$RECONCILE_SCRIPT" 2>&1 | tee -a "$LOG"
    # Recount after reconciliation: one word per running container name.
    RUNNING_NAMES="$($DOCKER ps --format '{{.Names}}' 2>/dev/null || true)"
    SUCCESS=$(($(wc -w <<<"$RUNNING_NAMES")))
    # This counts every running container, tracked or not, so it can exceed
    # TOTAL; cap it so FAILED never goes negative in the summary.
    # NOTE(review): the figure is still approximate, and FAILED_LIST is not
    # refreshed here — recounting only tracked names would be more accurate.
    if [ "$SUCCESS" -gt "$TOTAL" ]; then
      SUCCESS=$TOTAL
    fi
    FAILED=$((TOTAL - SUCCESS))
    log "After reconciliation: $SUCCESS running, $FAILED still failed"
  fi
fi
# 12. Final summary
log "============================================="
log " FIRST-BOOT CONTAINER SUMMARY"
log "============================================="
@@ -988,7 +1080,7 @@ log " Total tracked: $TOTAL"
log " Running: $SUCCESS"
log " Failed: $FAILED"
if [ "$BITCOIN_READY" != "true" ]; then
log " Bitcoin: NOT READY (dependent containers skipped)"
log " Bitcoin: NOT READY (dependent containers will auto-restart when ready)"
fi
if [ -n "$FAILED_LIST" ]; then
log " Failed list: $FAILED_LIST"