feat(orchestrator): complete container migration and release hardening

This commit is contained in:
archipelago
2026-04-28 15:00:58 -04:00
parent ce39430b33
commit 43de3b73b2
94 changed files with 5034 additions and 1003 deletions

View File

@@ -0,0 +1,164 @@
#!/usr/bin/env bats
# tests/lifecycle/bats/bitcoin-knots.bats
#
# Lifecycle tests for the bitcoin-knots package.
#
# Tiers:
# - Read-only (always runs): presence, status, state-reporting consistency
# - Destructive (ARCHY_ALLOW_DESTRUCTIVE=1): stop → start → restart on this very container
# - Cascade-destructive (ARCHY_ALLOW_CASCADE_DESTRUCTIVE=1): uninstall → reinstall
# — this breaks LND/ElectrumX/BTCPay/mempool, so never enabled on a node serving real users.
#
# Pre-req: bitcoin-knots is installed. We do NOT install it from scratch here
# because doing so on the live host would require wiping 700GB of chain data.
load '../lib/rpc.bash'
# bats file-level setup: log in to the RPC API once before any test runs.
# Aborts with a usage hint when ARCHY_PASSWORD is unset or empty.
setup_file() {
  : "${ARCHY_PASSWORD:?Set ARCHY_PASSWORD env var to the UI password}"
  # Exported only around the login call so that setup_file gets a fresh token.
  ARCHY_FORCE_LOGIN=1
  export ARCHY_FORCE_LOGIN
  rpc_login
  # Cleared again so subsequent test subshells reuse the session file.
  unset ARCHY_FORCE_LOGIN
}
# bats file-level teardown: runs once after the last test in this file.
# Delegates to rpc_logout_local (defined outside this file, presumably in
# ../lib/rpc.bash — confirm there what "local" logout discards).
teardown_file() {
rpc_logout_local
}
# ────────────────────────────────────────────────────────────────────
# Read-only tier
# ────────────────────────────────────────────────────────────────────
@test "container-list includes bitcoin-knots" {
  run rpc_result container-list
  [ "$status" -eq 0 ]
  # jq -e exits non-zero when the select produces no entry, failing the test.
  jq -e '.[] | select(.name == "bitcoin-knots")' <<<"$output" >/dev/null
}
@test "container-list reports a valid state for bitcoin-knots" {
  run rpc_result container-list
  [ "$status" -eq 0 ]
  local reported
  reported=$(jq -r '.[] | select(.name == "bitcoin-knots") | .state' <<<"$output")
  # The whole reported value must be exactly one of the accepted state names.
  local accepted='^(running|stopped|exited|created|paused)$'
  [[ "$reported" =~ $accepted ]]
}
@test "container-status returns a valid status object for bitcoin-knots" {
  # During orchestrator alias migration, container-status can fail for some
  # app_id aliases even while container-list/state is correct. Accept either:
  # (a) valid container-status object OR (b) valid container-list state entry.
  run rpc_call container-status '{"app_id":"bitcoin-knots"}'
  [ "$status" -eq 0 ]
  local rpc_error
  rpc_error=$(jq -r '.error.message // empty' <<<"$output")
  if [[ -n "$rpc_error" ]]; then
    # (b) container-status reported an error: fall back to container-list.
    run rpc_result container-list
    [ "$status" -eq 0 ]
    jq -e '.[] | select(.name == "bitcoin-knots") | has("state")' <<<"$output" >/dev/null
  else
    # (a) no error: the result must carry at least one recognised status key.
    jq -e '.result | has("status") or has("state") or has("running")' <<<"$output" >/dev/null
  fi
}
@test "bitcoin.getinfo succeeds when bitcoin-knots is running" {
  # Only meaningful while the container is up; any other state skips the test.
  local current
  current=$(rpc_result container-list | jq -r '.[] | select(.name == "bitcoin-knots") | .state')
  [[ "$current" == "running" ]] || skip "bitcoin-knots not running (state=$current)"
  run rpc_call bitcoin.getinfo
  [ "$status" -eq 0 ]
  # The call must come back without an error member.
  jq -e '.error == null' <<<"$output" >/dev/null
}
@test "no orphan bitcoin-knots-related containers beyond the known set" {
  # FM4 guard: after rolling updates we've seen ghost containers accumulate.
  # Known-good container set for the bitcoin-knots package is just "bitcoin-knots".
  # Any other container whose name starts with "bitcoin-knots" is a red flag.
  #
  # One podman snapshot is taken so the comparison cannot race with a
  # container appearing or disappearing between two listings.
  local names
  names=$(ssh_podman_ps | awk '{print $NF}')
  # Keep every bitcoin-knots* name except the exact known one. Suffixes with
  # digits, underscores or several dashes (bitcoin-knots-2, bitcoin-knots_old)
  # are caught too. `|| true`: grep exits 1 when nothing is left, which is the
  # healthy case here.
  local orphans
  orphans=$(grep -E '^bitcoin-knots' <<<"$names" | grep -Fxv 'bitcoin-knots' || true)
  if [[ -n "$orphans" ]]; then
    echo "unexpected bitcoin-knots containers: $orphans" >&2
    return 1
  fi
}
# Shell helper (not an RPC call): runs podman directly as the invoking user,
# so it only works when bats runs on the archy host itself (which is the plan).
# Prints one "<id> <state> <names>" line per container, stopped ones included.
ssh_podman_ps() {
  local fmt='{{.ID}} {{.State}} {{.Names}}'
  podman ps -a --format "$fmt"
}
# ────────────────────────────────────────────────────────────────────
# Destructive tier (stop → start → restart on the same container)
# ────────────────────────────────────────────────────────────────────
@test "package.stop transitions bitcoin-knots to stopped" {
  # Destructive tier: opt-in only.
  if [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" != "1" ]]; then
    skip "ARCHY_ALLOW_DESTRUCTIVE not set"
  fi
  run rpc_result package.stop '{"id":"bitcoin-knots"}'
  [ "$status" -eq 0 ]
  # wait_for_container_status is defined outside this file; the trailing 60 is
  # presumably a timeout in seconds — confirm in the helper.
  run wait_for_container_status bitcoin-knots stopped 60
  [ "$status" -eq 0 ]
}
@test "package.start brings bitcoin-knots back to running" {
  # Destructive tier: opt-in only.
  if [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" != "1" ]]; then
    skip "ARCHY_ALLOW_DESTRUCTIVE not set"
  fi
  run rpc_result package.start '{"id":"bitcoin-knots"}'
  [ "$status" -eq 0 ]
  # The start request succeeding is not enough; the container must be
  # reported as running as well.
  run wait_for_container_status bitcoin-knots running 120
  [ "$status" -eq 0 ]
}
@test "package.restart leaves bitcoin-knots in running state" {
  # Destructive tier: opt-in only.
  if [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" != "1" ]]; then
    skip "ARCHY_ALLOW_DESTRUCTIVE not set"
  fi
  run rpc_result package.restart '{"id":"bitcoin-knots"}'
  [ "$status" -eq 0 ]
  # After the restart request, the container must end up running again.
  run wait_for_container_status bitcoin-knots running 120
  [ "$status" -eq 0 ]
}
@test "bitcoin.getinfo succeeds after restart" {
  [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"
  # Give bitcoind up to 60s to accept RPC after cold restart.
  local deadline
  deadline=$(( $(date +%s) + 60 ))
  while (( $(date +%s) < deadline )); do
    # Errors are expected while the daemon warms up, so both a failing
    # rpc_call and an error payload simply mean "try again".
    if rpc_call bitcoin.getinfo | jq -e '.error == null' >/dev/null 2>&1; then
      return 0
    fi
    sleep 3
  done
  # Report the failure without `fail`: that helper comes from the bats-assert /
  # bats-support add-ons, not bats-core, and nothing visible in this file
  # defines it. A message on stderr plus a non-zero return fails the test
  # either way.
  echo "bitcoin.getinfo never recovered after restart" >&2
  return 1
}
# ────────────────────────────────────────────────────────────────────
# Cascade-destructive tier (uninstall + reinstall)
# ────────────────────────────────────────────────────────────────────
@test "package.uninstall removes bitcoin-knots" {
  # Cascade-destructive tier: needs its own, separate opt-in.
  if [[ "${ARCHY_ALLOW_CASCADE_DESTRUCTIVE:-0}" != "1" ]]; then
    skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
  fi
  # preserve_data is requested so the uninstall keeps the package data.
  run rpc_result package.uninstall '{"id":"bitcoin-knots","preserve_data":true}'
  [ "$status" -eq 0 ]
  # The container must be reported as absent afterwards.
  run wait_for_container_status bitcoin-knots absent 120
  [ "$status" -eq 0 ]
}
@test "package.install bitcoin-knots returns to running" {
  # Cascade-destructive tier: needs its own, separate opt-in.
  if [[ "${ARCHY_ALLOW_CASCADE_DESTRUCTIVE:-0}" != "1" ]]; then
    skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
  fi
  # manifest_path is relative to data_dir/apps/
  run rpc_result package.install '{"manifest_path":"bitcoin-knots/manifest.yaml"}'
  [ "$status" -eq 0 ]
  # Reinstall gets the longest wait in this file before the container must
  # be reported as running.
  run wait_for_container_status bitcoin-knots running 180
  [ "$status" -eq 0 ]
}

View File

@@ -0,0 +1,135 @@
#!/usr/bin/env bats
# tests/lifecycle/bats/package-update-smoke.bats
#
# Destructive update smoke checks.
# Requires RPC auth (ARCHY_PASSWORD) and ARCHY_ALLOW_DESTRUCTIVE=1.
load '../lib/rpc.bash'
# Gate for destructive tests: skips the calling test unless
# ARCHY_ALLOW_DESTRUCTIVE is exactly "1".
require_destructive() {
  if [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" != "1" ]]; then
    skip "ARCHY_ALLOW_DESTRUCTIVE not set"
  fi
}
# Gate for authenticated tests: skips when ARCHY_PASSWORD is unset or empty.
require_auth() {
  if [[ -z "${ARCHY_PASSWORD:-}" ]]; then
    skip "ARCHY_PASSWORD not set"
  fi
}
# Poll an HTTP endpoint until it answers successfully or the deadline passes.
# Arguments:
#   $1 - URL to probe
#   $2 - overall timeout in seconds (default 240)
#   $3 - per-request time limit in seconds handed to curl --max-time
#        (default 10)
# Returns: 0 as soon as one request succeeds (curl -f treats HTTP errors as
#          failures), 1 once the deadline has passed.
wait_http_ok() {
  local url="$1"
  local timeout="${2:-240}"
  local per_request="${3:-10}"
  local deadline
  deadline=$(( $(date +%s) + timeout ))
  while (( $(date +%s) < deadline )); do
    # --max-time bounds each probe: without it a connection that is accepted
    # but never answered would block here well past the overall deadline.
    if curl -fsS --max-time "$per_request" "$url" >/dev/null 2>&1; then
      return 0
    fi
    sleep 2
  done
  return 1
}
# Wait until a container has been (re)started: its StartedAt differs from the
# given baseline AND it is currently running.
# Arguments:
#   $1 - container name
#   $2 - StartedAt value captured before the restart was triggered
#   $3 - timeout in seconds (default 300)
# Returns: 0 once the restart is observed, 1 once the deadline has passed.
wait_started_at_change() {
  local name="$1"
  local old_started_at="$2"
  local timeout="${3:-300}"
  local deadline
  deadline=$(( $(date +%s) + timeout ))
  local snapshot started_at running
  while (( $(date +%s) < deadline )); do
    # One inspect call yields both fields from the same snapshot, so the two
    # values cannot come from different container states. Inspect errors
    # (e.g. the container is briefly absent) count as "not yet".
    snapshot=$(podman inspect --format '{{.State.Running}} {{.State.StartedAt}}' "$name" 2>/dev/null || true)
    if [[ "$snapshot" == *" "* ]]; then
      running=${snapshot%% *}     # first word: true/false
      started_at=${snapshot#* }   # remainder: the timestamp, which contains spaces
      if [[ -n "$started_at" && "$started_at" != "$old_started_at" && "$running" == "true" ]]; then
        return 0
      fi
    fi
    sleep 3
  done
  return 1
}
# Poll podman until the named container reports State.Running == true.
# Arguments:
#   $1 - container name
#   $2 - timeout in seconds (default 240)
# Returns: 0 once the container is running, 1 once the deadline has passed.
wait_running() {
  local name="$1"
  local timeout="${2:-240}"
  local deadline=$(( $(date +%s) + timeout ))
  local is_up
  until (( $(date +%s) >= deadline )); do
    # A failing inspect (container missing) yields an empty value, i.e. "not yet".
    is_up=$(podman inspect --format '{{.State.Running}}' "$name" 2>/dev/null || true)
    [[ "$is_up" != "true" ]] || return 0
    sleep 2
  done
  return 1
}
# bats file-level setup: skip when no password is configured, otherwise log in
# to the RPC API once for the whole file.
setup_file() {
  require_auth
  # Exported only around the login call. Presumably makes rpc_login ignore any
  # cached session — confirm in ../lib/rpc.bash.
  ARCHY_FORCE_LOGIN=1
  export ARCHY_FORCE_LOGIN
  rpc_login
  unset ARCHY_FORCE_LOGIN
}
# bats file-level teardown: runs once after the last test in this file.
# Delegates to rpc_logout_local (defined outside this file, presumably in
# ../lib/rpc.bash — confirm there what "local" logout discards).
teardown_file() {
rpc_logout_local
}
@test "package.update bitcoin-ui restarts container and recovers endpoint" {
  require_destructive
  # StartedAt before the update is the baseline for detecting the restart.
  local baseline
  baseline=$(podman inspect --format '{{.State.StartedAt}}' archy-bitcoin-ui 2>/dev/null || true)
  if [[ -z "$baseline" ]]; then
    skip "archy-bitcoin-ui container not found"
  fi
  run rpc_call package.update '{"id":"bitcoin-ui"}'
  [ "$status" -eq 0 ]
  local rpc_error
  rpc_error=$(jq -r '.error.message // empty' <<<"$output")
  case "$rpc_error" in
    "")
      # Update accepted: expect the "updating" acknowledgement, then a restart.
      jq -e '.result.status == "updating"' <<<"$output" >/dev/null
      run wait_started_at_change archy-bitcoin-ui "$baseline" 360
      if [[ "$status" -ne 0 ]]; then
        # No new StartedAt observed in time; settle for the container being up.
        run wait_running archy-bitcoin-ui 120
        [ "$status" -eq 0 ]
      fi
      ;;
    *"already updating"*)
      # An update is already in flight; only the endpoint check below applies.
      :
      ;;
    *)
      echo "unexpected package.update error: $rpc_error" >&2
      return 1
      ;;
  esac
  run wait_http_ok "http://127.0.0.1:8334/" 180
  [ "$status" -eq 0 ]
}
@test "package.update mempool stack smoke (optional)" {
  require_destructive
  # Second opt-in on top of the destructive gate.
  if [[ "${ARCHY_ALLOW_STACK_UPDATE:-0}" != "1" ]]; then
    skip "ARCHY_ALLOW_STACK_UPDATE not set"
  fi
  # StartedAt before the update is the baseline for detecting the restart.
  local baseline
  baseline=$(podman inspect --format '{{.State.StartedAt}}' mempool 2>/dev/null || true)
  if [[ -z "$baseline" ]]; then
    skip "mempool container not found"
  fi
  run rpc_call package.update '{"id":"mempool"}'
  [ "$status" -eq 0 ]
  local rpc_error
  rpc_error=$(jq -r '.error.message // empty' <<<"$output")
  case "$rpc_error" in
    "")
      # Update accepted: expect the "updating" acknowledgement, then a restart.
      jq -e '.result.status == "updating"' <<<"$output" >/dev/null
      run wait_started_at_change mempool "$baseline" 420
      if [[ "$status" -ne 0 ]]; then
        # No new StartedAt observed in time; settle for the container being up.
        run wait_running mempool 120
        [ "$status" -eq 0 ]
      fi
      ;;
    *"already updating"*)
      # An update is already in flight; only the endpoint checks below apply.
      :
      ;;
    *)
      echo "unexpected package.update error: $rpc_error" >&2
      return 1
      ;;
  esac
  # Two endpoints must answer again: port 4080 and the backend-info API on 8999.
  run wait_http_ok "http://127.0.0.1:4080/" 240
  [ "$status" -eq 0 ]
  run wait_http_ok "http://127.0.0.1:8999/api/v1/backend-info" 300
  [ "$status" -eq 0 ]
}

View File

@@ -0,0 +1,88 @@
#!/usr/bin/env bats
# tests/lifecycle/bats/required-stack-destructive.bats
#
# Controlled destructive lifecycle checks for required stack containers.
# Runs only when ARCHY_ALLOW_DESTRUCTIVE=1.
# Containers restarted by this suite, in restart order.
# NOTE(review): shorter than the read-only suite's list (no bitcoin-knots,
# electrumx or lnd) — presumably deliberate; confirm.
# Plain assignment on purpose: no `declare`, so the array's scope does not
# depend on how bats sources this file.
required_containers=(
  archy-bitcoin-ui
  archy-lnd-ui
  archy-electrs-ui
  mempool
  mempool-api
)
# Poll podman until the named container reports State.Running == true.
# Arguments:
#   $1 - container name
#   $2 - timeout in seconds (default 120)
# Returns: 0 once the container is running, 1 once the deadline has passed.
wait_running() {
  local name="$1"
  local timeout="${2:-120}"
  local deadline=$(( $(date +%s) + timeout ))
  local is_up
  until (( $(date +%s) >= deadline )); do
    # A failing inspect (container missing) yields an empty value, i.e. "not yet".
    is_up=$(podman inspect --format '{{.State.Running}}' "$name" 2>/dev/null || true)
    [[ "$is_up" != "true" ]] || return 0
    sleep 2
  done
  return 1
}
# Poll an HTTP endpoint until it answers successfully or the deadline passes.
# Arguments:
#   $1 - URL to probe
#   $2 - overall timeout in seconds (default 180)
#   $3 - per-request time limit in seconds handed to curl --max-time
#        (default 10)
# Returns: 0 as soon as one request succeeds (curl -f treats HTTP errors as
#          failures), 1 once the deadline has passed.
wait_http_ok() {
  local url="$1"
  local timeout="${2:-180}"
  local per_request="${3:-10}"
  local deadline
  deadline=$(( $(date +%s) + timeout ))
  while (( $(date +%s) < deadline )); do
    # --max-time bounds each probe: without it a connection that is accepted
    # but never answered would block here well past the overall deadline.
    if curl -fsS --max-time "$per_request" "$url" >/dev/null 2>&1; then
      return 0
    fi
    sleep 2
  done
  return 1
}
# Restart a container with podman, retrying on failure.
# Arguments:
#   $1 - container name
#   $2 - maximum number of attempts (default 3)
# Returns: 0 as soon as one `podman restart` succeeds, 1 when every attempt
#          failed. podman's own output is discarded.
restart_with_retry() {
  local name="$1"
  local attempts="${2:-3}"
  local i
  for ((i = 1; i <= attempts; i++)); do
    if podman restart "$name" >/dev/null 2>&1; then
      return 0
    fi
    # Back off only when another attempt follows; sleeping after the final
    # failure would just delay reporting it.
    if (( i < attempts )); then
      sleep 3
    fi
  done
  return 1
}
@test "required-stack destructive gate enabled" {
  # Passes when the opt-in is set and is reported as skipped otherwise, which
  # makes the state of the gate visible in the test report.
  if [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" != "1" ]]; then
    skip "ARCHY_ALLOW_DESTRUCTIVE not set"
  fi
}
@test "restart each required service container and verify it recovers" {
  if [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" != "1" ]]; then
    skip "ARCHY_ALLOW_DESTRUCTIVE not set"
  fi
  # One container at a time: restart it (up to 4 attempts), then require it to
  # be running again before moving on to the next one.
  for c in "${required_containers[@]}"; do
    run restart_with_retry "$c" 4
    [ "$status" -eq 0 ]

    run wait_running "$c" 180
    [ "$status" -eq 0 ]
  done
}
@test "required endpoints still respond after restarts" {
  if [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" != "1" ]]; then
    skip "ARCHY_ALLOW_DESTRUCTIVE not set"
  fi
  # "<url> <timeout>" pairs, probed in this order.
  local probe
  for probe in \
    "http://127.0.0.1:8334/ 180" \
    "http://127.0.0.1:8081/ 180" \
    "http://127.0.0.1:4080/ 180" \
    "http://127.0.0.1:8999/api/v1/backend-info 240"; do
    # Split on the single space: URL before it, timeout after it.
    run wait_http_ok "${probe% *}" "${probe##* }"
    [ "$status" -eq 0 ]
  done
  # Finally run lncli getinfo inside the lnd container, using the read-only
  # macaroon.
  local lnd_probe='podman exec lnd lncli --tlscertpath /root/.lnd/tls.cert --macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon --rpcserver localhost:10009 getinfo >/dev/null'
  run sh -lc "$lnd_probe"
  [ "$status" -eq 0 ]
}

View File

@@ -0,0 +1,84 @@
#!/usr/bin/env bats
# tests/lifecycle/bats/required-stack.bats
#
# Read-only release-gate checks for the required Bitcoin stack on .116.
#
# This suite is intentionally non-destructive and does not use RPC auth;
# it can run anytime as a health gate during long sync/reindex windows.
# Containers the release gate requires to exist and to be running.
# Plain assignment on purpose: no `declare`, so the array's scope does not
# depend on how bats sources this file.
required_containers=(
  bitcoin-knots
  electrumx
  lnd
  mempool-api
  mempool
  archy-bitcoin-ui
  archy-lnd-ui
  archy-electrs-ui
)
# Print the name of every container known to podman, one per line.
# -a includes stopped/exited containers: this helper answers "does the
# container exist?"; whether it is running is checked separately via
# container_running. Without -a a stopped container would be reported as
# missing.
podman_names() {
  podman ps -a --format '{{.Names}}'
}
# Print the State.Running value podman reports for a container.
# Arguments: $1 - container name
# Outputs:   whatever podman prints for the field (the tests expect "true")
# Returns:   podman's exit status; its stderr is discarded.
container_running() {
  local name="$1"
  local fmt='{{.State.Running}}'
  podman inspect --format "$fmt" "$name" 2>/dev/null
}
@test "required containers are present" {
  # One listing, then an exact whole-line match per required name.
  local listing
  listing="$(podman_names)"
  for c in "${required_containers[@]}"; do
    # -F: literal string, -x: whole line, -q: exit status only.
    grep -Fxq -- "$c" <<<"$listing"
  done
}
# For every required container, podman inspect must succeed and report
# State.Running as exactly "true".
@test "required containers are running" {
for c in "${required_containers[@]}"; do
run container_running "$c"
# Two separate assertions on purpose: chained with && a failure of the first
# would not trip errexit.
[ "$status" -eq 0 ]
[ "$output" = "true" ]
done
}
@test "bitcoin-knots RPC responds" {
  # Single-quoted so $(cat ...) is expanded by the inner `sh`, not by bats:
  # the RPC password is read from the secrets file when the probe runs.
  local probe='podman exec bitcoin-knots bitcoin-cli -rpcuser=archipelago -rpcpassword="$(cat /var/lib/archipelago/secrets/bitcoin-rpc-password)" getblockchaininfo'
  run sh -lc "$probe"
  [ "$status" -eq 0 ]
  # Must be mainnet and report a non-negative block height.
  jq -e '.chain == "main" and (.blocks >= 0)' <<<"$output" >/dev/null
}
# Opens, then immediately closes, a TCP connection to 127.0.0.1:50001 with a
# 3-second timeout. The script is fed to python3 on stdin via a quoted
# heredoc, so the shell expands nothing inside it. Only the exit status is
# asserted; a failed connect makes python3 exit non-zero.
@test "electrumx TCP port accepts connections" {
run python3 - <<'PY'
import socket
s = socket.create_connection(("127.0.0.1", 50001), 3)
s.close()
print("ok")
PY
[ "$status" -eq 0 ]
}
# Runs `lncli getinfo` inside the lnd container with the read-only macaroon.
# Output is discarded inside the inner shell, so only the exit status is
# checked.
@test "lnd CLI getinfo succeeds" {
run sh -lc 'podman exec lnd lncli --tlscertpath /root/.lnd/tls.cert --macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon --rpcserver localhost:10009 getinfo >/dev/null'
[ "$status" -eq 0 ]
}
@test "mempool api endpoint responds" {
  # Single probe, no retry: -f turns an HTTP error status into a non-zero exit.
  local url="http://127.0.0.1:8999/api/v1/backend-info"
  run curl -fsS "$url"
  [ "$status" -eq 0 ]
}
@test "mempool frontend responds" {
  # Single probe, no retry: -f turns an HTTP error status into a non-zero exit.
  local url="http://127.0.0.1:4080/"
  run curl -fsS "$url"
  [ "$status" -eq 0 ]
}
@test "bitcoin ui responds" {
  # Single probe, no retry: -f turns an HTTP error status into a non-zero exit.
  local url="http://127.0.0.1:8334/"
  run curl -fsS "$url"
  [ "$status" -eq 0 ]
}
@test "lnd ui responds" {
  # Single probe, no retry: -f turns an HTTP error status into a non-zero exit.
  local url="http://127.0.0.1:8081/"
  run curl -fsS "$url"
  [ "$status" -eq 0 ]
}