From 5eec0c143c034b5fce105888957a6509fdb5f9e9 Mon Sep 17 00:00:00 2001
From: archipelago
Date: Sat, 2 May 2026 05:34:47 -0400
Subject: [PATCH] test(lifecycle): post-condition gate for use_quadlet_backends path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A six-test bats suite that validates what install_via_quadlet (Phase 3.2)
is supposed to leave behind:

  * `.container` unit on disk in $XDG_CONFIG_HOME/containers/systemd/
    with [Container] / [Service] / [Install] sections, Image= present,
    and Restart=on-failure (the backend invariant — companions use
    Always)
  * Phase 3.4 cross-check: any unit with HealthCmd= must also emit
    Notify=healthy, otherwise systemctl start won't gate on health
  * `systemctl --user is-active` returns 0 for the .service
  * podman shows the container running
  * the container's cgroup is under user.slice/, NOT under
    archipelago.service — the kernel-level proof that FM3 cgroup
    cascade SIGKILL is structurally fixed for this container

Auto-skips on every test when no backend Quadlet units exist (today's
default state, use_quadlet_backends=false) — so the suite is a no-op on
current fleet boxes and turns into a hard regression gate the moment
anyone flips the flag and reinstalls.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 tests/lifecycle/TESTING.md                    |   2 +-
 .../bats/use-quadlet-backends-install.bats    | 160 ++++++++++++++++++
 2 files changed, 161 insertions(+), 1 deletion(-)
 create mode 100644 tests/lifecycle/bats/use-quadlet-backends-install.bats

diff --git a/tests/lifecycle/TESTING.md b/tests/lifecycle/TESTING.md
index a527b7d5..86d273ee 100644
--- a/tests/lifecycle/TESTING.md
+++ b/tests/lifecycle/TESTING.md
@@ -57,7 +57,7 @@ v1.7.52 tags.
 | L0 unit | 628 | n/a | ● green |
 | L1 RPC | 70 | bitcoin-knots, lnd, electrumx, btcpay, mempool, fedimint, required-stack, package-update-smoke | ● for the 6 core apps |
 | L2 UI | 9 | ui-coverage | ● for dashboard + 7 proxy paths + bitcoin-ui:8334 |
-| L3 lifecycle survival | 8 | companion-survives-archipelago-restart, backend-survives-archipelago-restart, required-stack-destructive | ◐ companions ● ; backends ◐ regression-gate (will fail until Phase 3 Quadlet ships) |
+| L3 lifecycle survival | 14 | companion-survives-archipelago-restart, backend-survives-archipelago-restart, required-stack-destructive, use-quadlet-backends-install | ◐ companions ● ; backends ◐ regression-gate (will fail until Phase 3 Quadlet ships); quadlet post-condition gate ✅ skip-clean today, hard gate when flag flipped |
 | L4 browser journey | 0 | none | ○ not started |
 | L5 chaos | 0 | none | ○ not started |
 | L6 performance | 0 | none | ○ not started |
diff --git a/tests/lifecycle/bats/use-quadlet-backends-install.bats b/tests/lifecycle/bats/use-quadlet-backends-install.bats
new file mode 100644
index 00000000..02b3abab
--- /dev/null
+++ b/tests/lifecycle/bats/use-quadlet-backends-install.bats
@@ -0,0 +1,160 @@
+#!/usr/bin/env bats
+# tests/lifecycle/bats/use-quadlet-backends-install.bats
+#
+# Validates the post-condition of Phase 3.2's `use_quadlet_backends`
+# install path. When the orchestrator routed at least one backend
+# install through `install_via_quadlet`, this suite asserts that the
+# resulting state has the four properties the Phase 3 design promises:
+#
+#   1. A `.container` unit file exists in ~/.config/containers/systemd/
+#      and is well-formed (required sections + directives).
+#   2. The corresponding `.service` is active under `systemctl --user`.
+#   3. The container is in `podman ps` (running).
+#   4. The container's cgroup is under `user.slice/...`, NOT under
+#      `archipelago.service` — proving FM3 (cgroup cascade SIGKILL on
+#      archipelago restart) is structurally fixed for that container.
+#
+# Auto-skips if no Quadlet-managed backend exists yet — so it runs as a
+# no-op on nodes where `use_quadlet_backends` is still false (today's
+# default), and turns into a hard regression gate as soon as anyone
+# flips the flag and reinstalls.
+#
+# Run on a node with rootless podman + systemd-user (every alpha-fleet
+# box). Every check is read-only and no env vars are required.
+
+# `fail` ships with bats-support, not bats-core, and this file loads no
+# helper library. Define a minimal fallback so `|| fail "msg"` reports
+# its diagnostic instead of dying with "fail: command not found". The
+# guard keeps any `fail` that is already defined in the environment.
+if ! declare -F fail >/dev/null; then
+  fail() {
+    printf '%s\n' "$*" >&2
+    return 1
+  }
+fi
+
+# Print the directory this suite scans for Quadlet `.container` units.
+quadlet_dir() {
+  echo "${XDG_CONFIG_HOME:-$HOME/.config}/containers/systemd"
+}
+
+# List Quadlet `.container` units that correspond to backend containers
+# (i.e., NOT companions like archy-*-ui, which already shipped via Quadlet
+# in v1.7.41 and have their own coverage in companion-survives-archipelago-
+# restart.bats). Echoes one container name per line; empty if none found.
+backend_quadlet_units() {
+  local d
+  d="$(quadlet_dir)"
+  [[ -d "$d" ]] || return 0
+  # Strip the .container extension; filter out archy-*-ui companions.
+  for f in "$d"/*.container; do
+    [[ -e "$f" ]] || continue
+    local name
+    name="$(basename "$f" .container)"
+    [[ "$name" =~ ^archy-.*-ui$ ]] && continue
+    echo "$name"
+  done
+}
+
+# Read the cgroup path of a running container's main process. For
+# rootless podman the conmon-run target lands the container's pid1 in
+# the cgroup that owns its supervising .service.
+container_cgroup_path() {
+  local name="$1"
+  local pid
+  pid="$(podman inspect --format '{{.State.Pid}}' "$name" 2>/dev/null)"
+  [[ -n "$pid" && "$pid" != "0" ]] || return 1
+  # cgroup v2 line: "0::/path/to/cgroup"
+  awk -F: '$1=="0"{print $3}' "/proc/$pid/cgroup" 2>/dev/null
+}
+
+# Per-test gate. Each @test calls this so the suite is a clean no-op on
+# nodes where use_quadlet_backends is still false (today's default) —
+# bats doesn't propagate setup-level skip semantics across @test blocks.
+require_quadlet_backends() {
+  local count
+  count="$(backend_quadlet_units | wc -l)"
+  (( count > 0 )) || skip "no backend .container units in $(quadlet_dir) — use_quadlet_backends not enabled or no backends installed"
+}
+
+@test "Quadlet unit dir exists or is plausibly creatable" {
+  local d
+  d="$(quadlet_dir)"
+  # Either it already exists, or its parent does (so quadlet can mkdir it).
+  [[ -d "$d" ]] || [[ -d "$(dirname "$d")" ]] \
+    || skip "no XDG_CONFIG_HOME and no \$HOME/.config — not a desktop-style host"
+}
+
+@test "each backend Quadlet unit has the required sections + directives" {
+  require_quadlet_backends
+  local d
+  d="$(quadlet_dir)"
+  while read -r name; do
+    [[ -z "$name" ]] && continue
+    local body
+    body="$(<"$d/$name.container")"
+    # [Container] section + Image=
+    [[ "$body" == *"[Container]"* ]] || fail "$name: missing [Container] section"
+    [[ "$body" == *"Image="* ]] || fail "$name: missing Image= directive"
+    # [Service] section with the Phase 3.2 backend invariant: Restart=on-failure.
+    # Companions use Restart=always; backends use on-failure so an operator-issued
+    # `systemctl stop` actually stays stopped.
+    [[ "$body" == *"[Service]"* ]] || fail "$name: missing [Service] section"
+    [[ "$body" == *"Restart=on-failure"* ]] \
+      || fail "$name: backend unit must use Restart=on-failure (got companion-style Restart=always)"
+    # [Install] section so `systemctl --user enable` is well-defined.
+    [[ "$body" == *"[Install]"* ]] || fail "$name: missing [Install] section"
+    [[ "$body" == *"WantedBy="* ]] || fail "$name: missing WantedBy= in [Install]"
+  done < <(backend_quadlet_units)
+}
+
+@test "Phase 3.4: any unit emitting HealthCmd= also emits Notify=healthy" {
+  require_quadlet_backends
+  local d
+  d="$(quadlet_dir)"
+  while read -r name; do
+    [[ -z "$name" ]] && continue
+    local body
+    body="$(<"$d/$name.container")"
+    if [[ "$body" == *"HealthCmd="* ]]; then
+      [[ "$body" == *"Notify=healthy"* ]] \
+        || fail "$name: HealthCmd= present but Notify=healthy missing — systemctl start won't gate on health"
+    fi
+  done < <(backend_quadlet_units)
+}
+
+@test "every backend Quadlet unit's .service is active in systemctl --user" {
+  require_quadlet_backends
+  while read -r name; do
+    [[ -z "$name" ]] && continue
+    run systemctl --user is-active "$name.service"
+    [[ "$status" -eq 0 ]] || fail "$name.service is '$output' — expected 'active'"
+  done < <(backend_quadlet_units)
+}
+
+@test "every backend Quadlet unit has a running podman container" {
+  require_quadlet_backends
+  while read -r name; do
+    [[ -z "$name" ]] && continue
+    run podman inspect --format '{{.State.Running}}' "$name"
+    [[ "$status" -eq 0 ]] || fail "$name not present in podman"
+    [[ "$output" == "true" ]] || fail "$name container exists but not running (state=$output)"
+  done < <(backend_quadlet_units)
+}
+
+@test "FM3 fix: backend cgroup is under user.slice, not archipelago.service" {
+  require_quadlet_backends
+  # The whole point of Phase 3 — verify the kernel-level invariant.
+  while read -r name; do
+    [[ -z "$name" ]] && continue
+    local cg
+    cg="$(container_cgroup_path "$name")" || skip "$name has no readable PID; container may have crashed mid-test"
+    [[ -n "$cg" ]] || fail "$name: empty cgroup path"
+    # Acceptable: anything under user.slice (rootless podman lands here when
+    # quadlet-managed). Forbidden: anything under archipelago.service's tree.
+    [[ "$cg" == *"user.slice"* ]] \
+      || fail "$name: cgroup '$cg' is not under user.slice — FM3 cascade still possible"
+    [[ "$cg" != *"archipelago.service"* ]] \
+      || fail "$name: cgroup '$cg' is under archipelago.service — Phase 3 promise broken"
+  done < <(backend_quadlet_units)
+}