fix: overhaul container lifecycle — recovery, health, uninstall, UI state
Some checks failed
Build Archipelago ISO (dev) / build-iso (push) Failing after 13m44s
Container Orchestration Tests / unit-tests (push) Failing after 7m30s
Container Orchestration Tests / smoke-tests (push) Has been skipped

Container recovery:
- Health monitor: MAX_RESTART_ATTEMPTS 3→10, interval 60s→120s
- Dependency-aware restarts: won't restart services before their deps
- Reset dependent counters when a dependency recovers
- Handle "created" state containers (were invisible to health monitor)
- Added IndeedHub, mempool-api, mysql to tier system
- Crash recovery: podman start timeout 30s→120s with retry
- Podman client: socket timeout 5s→30s, added restart policy

UI state representation:
- Exit code 0 shows "stopped" (gray), not "crashed" (red)
- Exit code 137 shows "killed (OOM)"
- Non-zero exit shows "crashed" (red)
- Added exit_code field to PackageDataEntry

Install/uninstall fixes:
- Install returns error when container doesn't start (was silent success)
- Post-install hooks awaited instead of fire-and-forget tokio::spawn
- Uninstall: graceful rm before force, volume prune, network cleanup
- Uninstall returns error on partial failure (was 200 OK)

Config consistency:
- DB passwords read from /var/lib/archipelago/secrets/ (was hardcoded)
- Bitcoin: added ZMQ ports 28332/28333 for LND block notifications
- IndeedHub port 7777→8190 (was conflicting with strfry)
- Marketplace versions: LND 0.17.4→0.18.4, Mempool 2.5.0→3.0.0

Performance:
- Metrics collector interval 60s→300s (was duplicating health monitor)
- Podman client: proper error propagation instead of unwrap_or_default

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dorian
2026-03-31 07:03:57 +01:00
parent cdff10a8bc
commit 64b57dca7d
65 changed files with 3950 additions and 298 deletions

View File

@@ -389,6 +389,15 @@ pub(super) fn get_data_dirs_for_app(package_id: &str) -> Vec<String> {
}
}
/// Read a secret from `/var/lib/archipelago/secrets/{name}`.
///
/// The file content is returned with surrounding whitespace trimmed (so a
/// trailing newline left by an editor or `echo` is not part of the secret).
///
/// `default` is returned when:
/// - the file cannot be read for any reason (missing, permission denied,
///   not valid UTF-8, ...), or
/// - the file is empty or whitespace-only — a blank secret file must not
///   silently become an empty password in a container's environment.
fn read_secret(name: &str, default: &str) -> String {
    let path = format!("/var/lib/archipelago/secrets/{}", name);
    std::fs::read_to_string(&path)
        .ok()
        .map(|raw| raw.trim().to_string())
        // Treat a blank secret exactly like a missing one.
        .filter(|secret| !secret.is_empty())
        .unwrap_or_else(|| default.to_string())
}
/// Get app-specific configuration
/// Returns: (ports, volumes, env_vars, custom_command, custom_args)
pub(super) async fn get_app_config(
@@ -413,7 +422,12 @@ pub(super) async fn get_app_config(
None,
),
"bitcoin" | "bitcoin-core" | "bitcoin-knots" => (
vec!["8332:8332".to_string(), "8333:8333".to_string()],
vec![
"8332:8332".to_string(),
"8333:8333".to_string(),
"28332:28332".to_string(),
"28333:28333".to_string(),
],
vec!["/var/lib/archipelago/bitcoin:/home/bitcoin/.bitcoin".to_string()],
vec![],
None,
@@ -453,7 +467,8 @@ pub(super) async fn get_app_config(
format!("BTCPAY_BTCRPCURL=http://{}:8332", host_ip),
format!("BTCPAY_BTCRPCUSER={}", rpc_user),
format!("BTCPAY_BTCRPCPASSWORD={}", rpc_pass),
"BTCPAY_POSTGRES=User ID=btcpay;Password=btcpaypass;Host=archy-btcpay-db;Port=5432;Database=btcpay;Include Error Detail=true".to_string(),
format!("BTCPAY_POSTGRES=User ID=btcpay;Password={};Host=archy-btcpay-db;Port=5432;Database=btcpay;Include Error Detail=true",
read_secret("btcpay-db-password", "btcpaypass")),
],
None,
None,
@@ -481,7 +496,7 @@ pub(super) async fn get_app_config(
"DATABASE_HOST=archy-mempool-db".to_string(),
"DATABASE_DATABASE=mempool".to_string(),
"DATABASE_USERNAME=mempool".to_string(),
"DATABASE_PASSWORD=mempoolpass".to_string(),
format!("DATABASE_PASSWORD={}", read_secret("mempool-db-password", "mempoolpass")),
],
None,
None,
@@ -511,8 +526,8 @@ pub(super) async fn get_app_config(
vec![
"MYSQL_DATABASE=mempool".to_string(),
"MYSQL_USER=mempool".to_string(),
"MYSQL_PASSWORD=mempoolpass".to_string(),
"MYSQL_ROOT_PASSWORD=rootpass".to_string(),
format!("MYSQL_PASSWORD={}", read_secret("mempool-db-password", "mempoolpass")),
format!("MYSQL_ROOT_PASSWORD={}", read_secret("mempool-db-root-password", "rootpass")),
],
None,
None,
@@ -607,7 +622,7 @@ pub(super) async fn get_app_config(
vec![
"DB_HOSTNAME=immich_postgres".to_string(),
"DB_USERNAME=postgres".to_string(),
"DB_PASSWORD=immichpass".to_string(),
format!("DB_PASSWORD={}", read_secret("immich-db-password", "immichpass")),
"DB_DATABASE_NAME=immich".to_string(),
"REDIS_HOSTNAME=immich_redis".to_string(),
"UPLOAD_LOCATION=/usr/src/app/upload".to_string(),

View File

@@ -256,8 +256,9 @@ impl RpcHandler {
.trim()
.to_string();
// Post-start health verification: wait up to 30s for container to be running
for i in 0..6u32 {
// Post-start health verification: wait up to 60s for container to be running
let mut container_running = false;
for i in 0..12u32 {
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
let status = tokio::process::Command::new("podman")
.args(["inspect", container_name, "--format", "{{.State.Status}}"])
@@ -266,6 +267,7 @@ impl RpcHandler {
if let Ok(o) = status {
let state = String::from_utf8_lossy(&o.stdout).trim().to_string();
if state == "running" {
container_running = true;
break;
}
if state == "exited" {
@@ -288,12 +290,19 @@ impl RpcHandler {
));
}
}
if i == 5 {
debug!("Container {} health check timeout (30s)continuing anyway", container_name);
if i == 11 {
warn!("Container {} not running after 60s — install may have failed", container_name);
}
}
// Post-install hooks
if !container_running {
return Err(anyhow::anyhow!(
"Container {} did not reach running state within 60s. Check logs with: podman logs {}",
container_name, container_name
));
}
// Post-install hooks — await completion before returning success
self.run_post_install_hooks(package_id).await;
Ok(serde_json::json!({
@@ -536,98 +545,106 @@ printtoconsole=1\n",
}
/// Run post-install hooks (Nextcloud trusted domains, Bitcoin UI container).
/// Critical hooks (credential setup, config) are awaited; UI container builds are background.
async fn run_post_install_hooks(&self, package_id: &str) {
if package_id == "filebrowser" {
tokio::spawn(async move {
// Wait for filebrowser to start and initialize its database
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
// Wait for filebrowser to start and initialize its database
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
// Generate a random password (32 bytes, hex-encoded)
let mut buf = [0u8; 32];
rand::RngCore::fill_bytes(&mut rand::rngs::OsRng, &mut buf);
let password = hex::encode(buf);
// Generate a random password (32 bytes, hex-encoded)
let mut buf = [0u8; 32];
rand::RngCore::fill_bytes(&mut rand::rngs::OsRng, &mut buf);
let password = hex::encode(buf);
// Get a JWT token with default credentials
let login_res = reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(10))
.build()
.unwrap_or_default()
.post("http://127.0.0.1:8083/api/login")
.json(&serde_json::json!({"username": "admin", "password": "admin"}))
.send()
.await;
// Get a JWT token with default credentials
let client = match reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(10))
.build()
{
Ok(c) => c,
Err(e) => {
tracing::warn!("Failed to create HTTP client for FileBrowser hook: {}", e);
return;
}
};
let token = match login_res {
Ok(resp) if resp.status().is_success() => {
resp.text().await.unwrap_or_default().trim_matches('"').to_string()
}
_ => {
tracing::warn!("FileBrowser not ready for password change — keeping default");
return;
}
};
let login_res = client
.post("http://127.0.0.1:8083/api/login")
.json(&serde_json::json!({"username": "admin", "password": "admin"}))
.send()
.await;
// Change admin password via filebrowser API
let change_res = reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(10))
.build()
.unwrap_or_default()
.put("http://127.0.0.1:8083/api/users/1")
.header("X-Auth", &token)
.json(&serde_json::json!({"password": password}))
.send()
.await;
match change_res {
Ok(resp) if resp.status().is_success() => {
let secret_dir = "/var/lib/archipelago/secrets/filebrowser";
let _ = tokio::fs::create_dir_all(secret_dir).await;
let _ = tokio::fs::write(
format!("{}/password", secret_dir),
&password,
).await;
info!("FileBrowser admin password secured (default credentials replaced)");
}
Ok(resp) => {
tracing::warn!("FileBrowser password change failed: {}", resp.status());
}
Err(e) => {
tracing::warn!("FileBrowser password change error: {}", e);
let token = match login_res {
Ok(resp) if resp.status().is_success() => {
match resp.text().await {
Ok(t) => t.trim_matches('"').to_string(),
Err(e) => {
tracing::warn!("FileBrowser login response parse failed: {}", e);
return;
}
}
}
});
_ => {
tracing::warn!("FileBrowser not ready for password change — keeping default");
return;
}
};
// Change admin password via filebrowser API
let change_res = client
.put("http://127.0.0.1:8083/api/users/1")
.header("X-Auth", &token)
.json(&serde_json::json!({"password": password}))
.send()
.await;
match change_res {
Ok(resp) if resp.status().is_success() => {
let secret_dir = "/var/lib/archipelago/secrets/filebrowser";
let _ = tokio::fs::create_dir_all(secret_dir).await;
let _ = tokio::fs::write(
format!("{}/password", secret_dir),
&password,
).await;
info!("FileBrowser admin password secured (default credentials replaced)");
}
Ok(resp) => {
tracing::warn!("FileBrowser password change failed: {}", resp.status());
}
Err(e) => {
tracing::warn!("FileBrowser password change error: {}", e);
}
}
}
if package_id == "nextcloud" {
let host_ip = self.config.host_ip.clone();
tokio::spawn(async move {
// Wait for Nextcloud to finish first-run initialization
tokio::time::sleep(std::time::Duration::from_secs(30)).await;
for domain_idx in 1..=2u8 {
let value = if domain_idx == 1 {
host_ip.as_str()
} else {
"localhost"
};
let _ = tokio::process::Command::new("podman")
.args([
"exec",
"-u",
"33",
"nextcloud",
"php",
"occ",
"config:system:set",
"trusted_domains",
&domain_idx.to_string(),
"--value",
value,
])
.output()
.await;
}
info!("Nextcloud trusted domains configured for {}", host_ip);
});
let host_ip = &self.config.host_ip;
// Wait for Nextcloud to finish first-run initialization
tokio::time::sleep(std::time::Duration::from_secs(30)).await;
for domain_idx in 1..=2u8 {
let value = if domain_idx == 1 {
host_ip.as_str()
} else {
"localhost"
};
let _ = tokio::process::Command::new("podman")
.args([
"exec",
"-u",
"33",
"nextcloud",
"php",
"occ",
"config:system:set",
"trusted_domains",
&domain_idx.to_string(),
"--value",
value,
])
.output()
.await;
}
info!("Nextcloud trusted domains configured for {}", host_ip);
}
// Build and start companion UI containers for headless services

View File

@@ -58,6 +58,7 @@ fn create_installing_entry(package_id: &str) -> PackageDataEntry {
PackageDataEntry {
state: PackageState::Installing,
health: None,
exit_code: None,
static_files: StaticFiles {
license: String::new(),
instructions: String::new(),

View File

@@ -221,18 +221,30 @@ impl RpcHandler {
}
}
// Remove container (without -f to respect graceful shutdown above)
tracing::info!("Uninstall {}: removing container {}", package_id, name);
let rm_out = tokio::process::Command::new("podman")
.args(["rm", "-f", name])
.args(["rm", name])
.output()
.await;
match rm_out {
Ok(o) if o.status.success() => removed += 1,
Ok(o) => {
// If normal rm fails (e.g., still running), force as fallback
let stderr = String::from_utf8_lossy(&o.stderr);
let msg = format!("Failed to remove {}: {}", name, stderr.trim());
tracing::error!("Uninstall {}: {}", package_id, msg);
errors.push(msg);
tracing::warn!("Uninstall {}: rm {} failed ({}), trying force", package_id, name, stderr.trim());
let force_rm = tokio::process::Command::new("podman")
.args(["rm", "-f", name])
.output()
.await;
match force_rm {
Ok(o2) if o2.status.success() => removed += 1,
_ => {
let msg = format!("Failed to remove {}: {}", name, stderr.trim());
tracing::error!("Uninstall {}: {}", package_id, msg);
errors.push(msg);
}
}
}
Err(e) => {
let msg = format!("Failed to remove {}: {}", name, e);
@@ -242,6 +254,26 @@ impl RpcHandler {
}
}
// Clean up dangling volumes associated with removed containers
let _ = tokio::process::Command::new("podman")
.args(["volume", "prune", "-f"])
.output()
.await;
// Clean up app-specific networks (only if no other containers use them)
let app_networks: Vec<&str> = match package_id {
"immich" | "immich_server" => vec!["immich-net"],
"penpot" | "penpot-frontend" => vec!["penpot-net"],
"indeedhub" | "indeedhub-api" => vec!["indeedhub-net"],
_ => vec![],
};
for net in &app_networks {
let _ = tokio::process::Command::new("podman")
.args(["network", "rm", net])
.output()
.await;
}
// Release port allocation
{
let mut allocator = self.port_allocator.lock().await;
@@ -257,10 +289,19 @@ impl RpcHandler {
.args(["rm", "-rf", dir])
.output()
.await;
if let Ok(o) = rm_out {
if !o.status.success() {
tracing::warn!("Uninstall {}: rm {} failed", package_id, dir);
match rm_out {
Ok(o) if !o.status.success() => {
let stderr = String::from_utf8_lossy(&o.stderr);
let msg = format!("Failed to remove data {}: {}", dir, stderr.trim());
tracing::error!("Uninstall {}: {}", package_id, msg);
errors.push(msg);
}
Err(e) => {
let msg = format!("Failed to remove data {}: {}", dir, e);
tracing::error!("Uninstall {}: {}", package_id, msg);
errors.push(msg);
}
_ => {}
}
}
}
@@ -271,20 +312,24 @@ impl RpcHandler {
package_id,
errors
);
} else {
tracing::info!(
"Uninstall {} complete: stopped={}, removed={}",
return Err(anyhow::anyhow!(
"Uninstall {} partially failed: {}",
package_id,
stopped,
removed
);
errors.join("; ")
));
}
tracing::info!(
"Uninstall {} complete: stopped={}, removed={}",
package_id,
stopped,
removed
);
Ok(serde_json::json!({
"status": if errors.is_empty() { "uninstalled" } else { "partial" },
"status": "uninstalled",
"stopped": stopped,
"removed": removed,
"errors": errors,
}))
}