fix: overhaul container lifecycle — recovery, health, uninstall, UI state

Container recovery:
- Health monitor: MAX_RESTART_ATTEMPTS 3→10, interval 60s→120s
- Dependency-aware restarts: won't restart services before their deps
- Reset dependent counters when a dependency recovers
- Handle "created" state containers (were invisible to health monitor)
- Added IndeedHub, mempool-api, mysql to tier system
- Crash recovery: podman start timeout 30s→120s with retry
- Podman client: socket timeout 5s→30s, added restart policy

UI state representation:
- Exit code 0 shows "stopped" (gray), not "crashed" (red)
- Exit code 137 shows "killed (OOM)"
- Non-zero exit shows "crashed" (red)
- Added exit_code field to PackageDataEntry

Install/uninstall fixes:
- Install returns error when container doesn't start (was silent success)
- Post-install hooks awaited instead of fire-and-forget tokio::spawn
- Uninstall: graceful rm before force, volume prune, network cleanup
- Uninstall returns error on partial failure (was 200 OK)

Config consistency:
- DB passwords read from /var/lib/archipelago/secrets/ (was hardcoded)
- Bitcoin: added ZMQ ports 28332/28333 for LND block notifications
- IndeedHub port 7777→8190 (was conflicting with strfry)
- Marketplace versions: LND 0.17.4→0.18.4, Mempool 2.5.0→3.0.0

Performance:
- Metrics collector interval 60s→300s (was duplicating health monitor)
- Podman client: proper error propagation instead of unwrap_or_default

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dorian
2026-03-31 07:03:57 +01:00
parent 795e74bc50
commit 1e283daf13
65 changed files with 3950 additions and 298 deletions

View File

@@ -389,6 +389,15 @@ pub(super) fn get_data_dirs_for_app(package_id: &str) -> Vec<String> {
}
}
/// Load the secret called `name` from `/var/lib/archipelago/secrets/`.
///
/// Returns the file contents with leading and trailing whitespace trimmed.
/// If the file cannot be read for any reason (missing, unreadable, or not
/// valid UTF-8), `default` is returned unchanged instead.
fn read_secret(name: &str, default: &str) -> String {
    let secret_file = format!("/var/lib/archipelago/secrets/{}", name);
    match std::fs::read_to_string(&secret_file) {
        Ok(contents) => contents.trim().to_owned(),
        // Any read error is treated as "no secret provisioned".
        Err(_) => default.to_owned(),
    }
}
/// Get app-specific configuration
/// Returns: (ports, volumes, env_vars, custom_command, custom_args)
pub(super) async fn get_app_config(
@@ -413,7 +422,12 @@ pub(super) async fn get_app_config(
None,
),
"bitcoin" | "bitcoin-core" | "bitcoin-knots" => (
vec!["8332:8332".to_string(), "8333:8333".to_string()],
vec![
"8332:8332".to_string(),
"8333:8333".to_string(),
"28332:28332".to_string(),
"28333:28333".to_string(),
],
vec!["/var/lib/archipelago/bitcoin:/home/bitcoin/.bitcoin".to_string()],
vec![],
None,
@@ -453,7 +467,8 @@ pub(super) async fn get_app_config(
format!("BTCPAY_BTCRPCURL=http://{}:8332", host_ip),
format!("BTCPAY_BTCRPCUSER={}", rpc_user),
format!("BTCPAY_BTCRPCPASSWORD={}", rpc_pass),
"BTCPAY_POSTGRES=User ID=btcpay;Password=btcpaypass;Host=archy-btcpay-db;Port=5432;Database=btcpay;Include Error Detail=true".to_string(),
format!("BTCPAY_POSTGRES=User ID=btcpay;Password={};Host=archy-btcpay-db;Port=5432;Database=btcpay;Include Error Detail=true",
read_secret("btcpay-db-password", "btcpaypass")),
],
None,
None,
@@ -481,7 +496,7 @@ pub(super) async fn get_app_config(
"DATABASE_HOST=archy-mempool-db".to_string(),
"DATABASE_DATABASE=mempool".to_string(),
"DATABASE_USERNAME=mempool".to_string(),
"DATABASE_PASSWORD=mempoolpass".to_string(),
format!("DATABASE_PASSWORD={}", read_secret("mempool-db-password", "mempoolpass")),
],
None,
None,
@@ -511,8 +526,8 @@ pub(super) async fn get_app_config(
vec![
"MYSQL_DATABASE=mempool".to_string(),
"MYSQL_USER=mempool".to_string(),
"MYSQL_PASSWORD=mempoolpass".to_string(),
"MYSQL_ROOT_PASSWORD=rootpass".to_string(),
format!("MYSQL_PASSWORD={}", read_secret("mempool-db-password", "mempoolpass")),
format!("MYSQL_ROOT_PASSWORD={}", read_secret("mempool-db-root-password", "rootpass")),
],
None,
None,
@@ -607,7 +622,7 @@ pub(super) async fn get_app_config(
vec![
"DB_HOSTNAME=immich_postgres".to_string(),
"DB_USERNAME=postgres".to_string(),
"DB_PASSWORD=immichpass".to_string(),
format!("DB_PASSWORD={}", read_secret("immich-db-password", "immichpass")),
"DB_DATABASE_NAME=immich".to_string(),
"REDIS_HOSTNAME=immich_redis".to_string(),
"UPLOAD_LOCATION=/usr/src/app/upload".to_string(),

View File

@@ -256,8 +256,9 @@ impl RpcHandler {
.trim()
.to_string();
// Post-start health verification: wait up to 30s for container to be running
for i in 0..6u32 {
// Post-start health verification: wait up to 60s for container to be running
let mut container_running = false;
for i in 0..12u32 {
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
let status = tokio::process::Command::new("podman")
.args(["inspect", container_name, "--format", "{{.State.Status}}"])
@@ -266,6 +267,7 @@ impl RpcHandler {
if let Ok(o) = status {
let state = String::from_utf8_lossy(&o.stdout).trim().to_string();
if state == "running" {
container_running = true;
break;
}
if state == "exited" {
@@ -288,12 +290,19 @@ impl RpcHandler {
));
}
}
if i == 5 {
debug!("Container {} health check timeout (30s)continuing anyway", container_name);
if i == 11 {
warn!("Container {} not running after 60s — install may have failed", container_name);
}
}
// Post-install hooks
if !container_running {
return Err(anyhow::anyhow!(
"Container {} did not reach running state within 60s. Check logs with: podman logs {}",
container_name, container_name
));
}
// Post-install hooks — await completion before returning success
self.run_post_install_hooks(package_id).await;
Ok(serde_json::json!({
@@ -536,98 +545,106 @@ printtoconsole=1\n",
}
/// Run post-install hooks (Nextcloud trusted domains, Bitcoin UI container).
/// Critical hooks (credential setup, config) are awaited; UI container builds are background.
async fn run_post_install_hooks(&self, package_id: &str) {
if package_id == "filebrowser" {
tokio::spawn(async move {
// Wait for filebrowser to start and initialize its database
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
// Wait for filebrowser to start and initialize its database
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
// Generate a random password (32 bytes, hex-encoded)
let mut buf = [0u8; 32];
rand::RngCore::fill_bytes(&mut rand::rngs::OsRng, &mut buf);
let password = hex::encode(buf);
// Generate a random password (32 bytes, hex-encoded)
let mut buf = [0u8; 32];
rand::RngCore::fill_bytes(&mut rand::rngs::OsRng, &mut buf);
let password = hex::encode(buf);
// Get a JWT token with default credentials
let login_res = reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(10))
.build()
.unwrap_or_default()
.post("http://127.0.0.1:8083/api/login")
.json(&serde_json::json!({"username": "admin", "password": "admin"}))
.send()
.await;
// Get a JWT token with default credentials
let client = match reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(10))
.build()
{
Ok(c) => c,
Err(e) => {
tracing::warn!("Failed to create HTTP client for FileBrowser hook: {}", e);
return;
}
};
let token = match login_res {
Ok(resp) if resp.status().is_success() => {
resp.text().await.unwrap_or_default().trim_matches('"').to_string()
}
_ => {
tracing::warn!("FileBrowser not ready for password change — keeping default");
return;
}
};
let login_res = client
.post("http://127.0.0.1:8083/api/login")
.json(&serde_json::json!({"username": "admin", "password": "admin"}))
.send()
.await;
// Change admin password via filebrowser API
let change_res = reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(10))
.build()
.unwrap_or_default()
.put("http://127.0.0.1:8083/api/users/1")
.header("X-Auth", &token)
.json(&serde_json::json!({"password": password}))
.send()
.await;
match change_res {
Ok(resp) if resp.status().is_success() => {
let secret_dir = "/var/lib/archipelago/secrets/filebrowser";
let _ = tokio::fs::create_dir_all(secret_dir).await;
let _ = tokio::fs::write(
format!("{}/password", secret_dir),
&password,
).await;
info!("FileBrowser admin password secured (default credentials replaced)");
}
Ok(resp) => {
tracing::warn!("FileBrowser password change failed: {}", resp.status());
}
Err(e) => {
tracing::warn!("FileBrowser password change error: {}", e);
let token = match login_res {
Ok(resp) if resp.status().is_success() => {
match resp.text().await {
Ok(t) => t.trim_matches('"').to_string(),
Err(e) => {
tracing::warn!("FileBrowser login response parse failed: {}", e);
return;
}
}
}
});
_ => {
tracing::warn!("FileBrowser not ready for password change — keeping default");
return;
}
};
// Change admin password via filebrowser API
let change_res = client
.put("http://127.0.0.1:8083/api/users/1")
.header("X-Auth", &token)
.json(&serde_json::json!({"password": password}))
.send()
.await;
match change_res {
Ok(resp) if resp.status().is_success() => {
let secret_dir = "/var/lib/archipelago/secrets/filebrowser";
let _ = tokio::fs::create_dir_all(secret_dir).await;
let _ = tokio::fs::write(
format!("{}/password", secret_dir),
&password,
).await;
info!("FileBrowser admin password secured (default credentials replaced)");
}
Ok(resp) => {
tracing::warn!("FileBrowser password change failed: {}", resp.status());
}
Err(e) => {
tracing::warn!("FileBrowser password change error: {}", e);
}
}
}
if package_id == "nextcloud" {
let host_ip = self.config.host_ip.clone();
tokio::spawn(async move {
// Wait for Nextcloud to finish first-run initialization
tokio::time::sleep(std::time::Duration::from_secs(30)).await;
for domain_idx in 1..=2u8 {
let value = if domain_idx == 1 {
host_ip.as_str()
} else {
"localhost"
};
let _ = tokio::process::Command::new("podman")
.args([
"exec",
"-u",
"33",
"nextcloud",
"php",
"occ",
"config:system:set",
"trusted_domains",
&domain_idx.to_string(),
"--value",
value,
])
.output()
.await;
}
info!("Nextcloud trusted domains configured for {}", host_ip);
});
let host_ip = &self.config.host_ip;
// Wait for Nextcloud to finish first-run initialization
tokio::time::sleep(std::time::Duration::from_secs(30)).await;
for domain_idx in 1..=2u8 {
let value = if domain_idx == 1 {
host_ip.as_str()
} else {
"localhost"
};
let _ = tokio::process::Command::new("podman")
.args([
"exec",
"-u",
"33",
"nextcloud",
"php",
"occ",
"config:system:set",
"trusted_domains",
&domain_idx.to_string(),
"--value",
value,
])
.output()
.await;
}
info!("Nextcloud trusted domains configured for {}", host_ip);
}
// Build and start companion UI containers for headless services

View File

@@ -58,6 +58,7 @@ fn create_installing_entry(package_id: &str) -> PackageDataEntry {
PackageDataEntry {
state: PackageState::Installing,
health: None,
exit_code: None,
static_files: StaticFiles {
license: String::new(),
instructions: String::new(),

View File

@@ -221,18 +221,30 @@ impl RpcHandler {
}
}
// Remove container (without -f to respect graceful shutdown above)
tracing::info!("Uninstall {}: removing container {}", package_id, name);
let rm_out = tokio::process::Command::new("podman")
.args(["rm", "-f", name])
.args(["rm", name])
.output()
.await;
match rm_out {
Ok(o) if o.status.success() => removed += 1,
Ok(o) => {
// If normal rm fails (e.g., still running), force as fallback
let stderr = String::from_utf8_lossy(&o.stderr);
let msg = format!("Failed to remove {}: {}", name, stderr.trim());
tracing::error!("Uninstall {}: {}", package_id, msg);
errors.push(msg);
tracing::warn!("Uninstall {}: rm {} failed ({}), trying force", package_id, name, stderr.trim());
let force_rm = tokio::process::Command::new("podman")
.args(["rm", "-f", name])
.output()
.await;
match force_rm {
Ok(o2) if o2.status.success() => removed += 1,
_ => {
let msg = format!("Failed to remove {}: {}", name, stderr.trim());
tracing::error!("Uninstall {}: {}", package_id, msg);
errors.push(msg);
}
}
}
Err(e) => {
let msg = format!("Failed to remove {}: {}", name, e);
@@ -242,6 +254,26 @@ impl RpcHandler {
}
}
// Clean up dangling volumes associated with removed containers
let _ = tokio::process::Command::new("podman")
.args(["volume", "prune", "-f"])
.output()
.await;
// Clean up app-specific networks (only if no other containers use them)
let app_networks: Vec<&str> = match package_id {
"immich" | "immich_server" => vec!["immich-net"],
"penpot" | "penpot-frontend" => vec!["penpot-net"],
"indeedhub" | "indeedhub-api" => vec!["indeedhub-net"],
_ => vec![],
};
for net in &app_networks {
let _ = tokio::process::Command::new("podman")
.args(["network", "rm", net])
.output()
.await;
}
// Release port allocation
{
let mut allocator = self.port_allocator.lock().await;
@@ -257,10 +289,19 @@ impl RpcHandler {
.args(["rm", "-rf", dir])
.output()
.await;
if let Ok(o) = rm_out {
if !o.status.success() {
tracing::warn!("Uninstall {}: rm {} failed", package_id, dir);
match rm_out {
Ok(o) if !o.status.success() => {
let stderr = String::from_utf8_lossy(&o.stderr);
let msg = format!("Failed to remove data {}: {}", dir, stderr.trim());
tracing::error!("Uninstall {}: {}", package_id, msg);
errors.push(msg);
}
Err(e) => {
let msg = format!("Failed to remove data {}: {}", dir, e);
tracing::error!("Uninstall {}: {}", package_id, msg);
errors.push(msg);
}
_ => {}
}
}
}
@@ -271,20 +312,24 @@ impl RpcHandler {
package_id,
errors
);
} else {
tracing::info!(
"Uninstall {} complete: stopped={}, removed={}",
return Err(anyhow::anyhow!(
"Uninstall {} partially failed: {}",
package_id,
stopped,
removed
);
errors.join("; ")
));
}
tracing::info!(
"Uninstall {} complete: stopped={}, removed={}",
package_id,
stopped,
removed
);
Ok(serde_json::json!({
"status": if errors.is_empty() { "uninstalled" } else { "partial" },
"status": "uninstalled",
"stopped": stopped,
"removed": removed,
"errors": errors,
}))
}

View File

@@ -146,6 +146,7 @@ impl DockerPackageScanner {
let package = PackageDataEntry {
state: package_state.clone(),
health: container.health.clone(),
exit_code: if package_state == PackageState::Exited { container.exit_code } else { None },
static_files: StaticFiles {
license: "MIT".to_string(),
instructions: metadata.description.clone(),

View File

@@ -262,33 +262,47 @@ pub async fn recover_containers(containers: &[RunningContainerRecord]) -> Recove
tokio::time::sleep(std::time::Duration::from_secs(3)).await;
}
let result = tokio::time::timeout(
std::time::Duration::from_secs(30),
tokio::process::Command::new("podman")
.args(["start", &record.name])
.output(),
)
.await;
// Try up to 2 attempts with increasing timeout (120s first, 180s retry)
let mut started = false;
for attempt in 0..2u32 {
let timeout_secs = if attempt == 0 { 120 } else { 180 };
if attempt > 0 {
info!("Retrying container {} (attempt {})", record.name, attempt + 1);
tokio::time::sleep(std::time::Duration::from_secs(10)).await;
}
let result = tokio::time::timeout(
std::time::Duration::from_secs(timeout_secs),
tokio::process::Command::new("podman")
.args(["start", &record.name])
.output(),
)
.await;
match result {
Ok(Ok(output)) if output.status.success() => {
info!("Successfully restarted container: {}", record.name);
report.recovered += 1;
}
Ok(Ok(output)) => {
let stderr = String::from_utf8_lossy(&output.stderr);
warn!("Failed to restart container {}: {}", record.name, stderr.trim());
report.failed.push(record.name.clone());
}
Ok(Err(e)) => {
warn!("Failed to execute podman start for {}: {}", record.name, e);
report.failed.push(record.name.clone());
}
Err(_) => {
warn!("Timeout starting container {} (30s)", record.name);
report.failed.push(record.name.clone());
match result {
Ok(Ok(output)) if output.status.success() => {
info!("Successfully restarted container: {}", record.name);
report.recovered += 1;
started = true;
break;
}
Ok(Ok(output)) => {
let stderr = String::from_utf8_lossy(&output.stderr);
warn!("Failed to restart container {} (attempt {}): {}",
record.name, attempt + 1, stderr.trim());
}
Ok(Err(e)) => {
warn!("Failed to execute podman start for {} (attempt {}): {}",
record.name, attempt + 1, e);
}
Err(_) => {
warn!("Timeout starting container {} ({}s, attempt {})",
record.name, timeout_secs, attempt + 1);
}
}
}
if !started {
report.failed.push(record.name.clone());
}
}
report
@@ -313,7 +327,7 @@ fn is_process_running(pid: u32) -> bool {
/// Skips containers that the user intentionally stopped via the UI.
pub async fn start_stopped_containers(data_dir: &Path) -> RecoveryReport {
let output = match tokio::time::timeout(
std::time::Duration::from_secs(30),
std::time::Duration::from_secs(60),
tokio::process::Command::new("podman")
.args(["ps", "-a", "--filter", "status=exited", "--filter", "status=created", "--format", "{{.Names}}"])
.output(),
@@ -322,7 +336,7 @@ pub async fn start_stopped_containers(data_dir: &Path) -> RecoveryReport {
{
Ok(result) => result,
Err(_) => {
warn!("Timeout listing stopped containers (30s)");
warn!("Timeout listing stopped containers (60s)");
return RecoveryReport { total: 0, recovered: 0, failed: Vec::new() };
}
};
@@ -374,12 +388,21 @@ pub async fn start_stopped_containers(data_dir: &Path) -> RecoveryReport {
fn container_boot_tier(name: &str) -> u8 {
    // Returns the boot tier (0..=4) for a container. The lookup key is the
    // container name with an optional "archy-" prefix removed. Every known
    // name is listed in exactly one arm; unknown names fall into tier 3.
    // NOTE(review): presumably lower tiers are started first — confirm against caller.
    let id = name.strip_prefix("archy-").unwrap_or(name);
    match id {
        // Tier 0: Databases and data stores
        "btcpay-db" | "mempool-db" | "mysql-mempool" | "penpot-postgres"
        | "immich_postgres" | "immich_redis" | "penpot-valkey"
        | "endurain-db" | "nextcloud-db"
        | "indeedhub-postgres" | "indeedhub-redis" | "indeedhub-minio" => 0,
        // Tier 1: Core infrastructure
        "bitcoin-knots" | "bitcoin-core" | "bitcoin" => 1,
        // Tier 2: Dependent services
        "lnd" | "electrumx" | "mempool-electrs" | "electrs" | "nbxplorer"
        | "mempool-api" | "indeedhub-api" => 2,
        // Tier 4: Frontend/UI
        "mempool-web" | "bitcoin-ui" | "lnd-ui" | "electrs-ui"
        | "penpot-frontend" | "penpot-exporter"
        | "indeedhub" => 4,
        // Tier 3: Everything else
        _ => 3,
    }
}

View File

@@ -124,6 +124,9 @@ pub struct PackageDataEntry {
/// Container health: "healthy", "unhealthy", "starting", or null
#[serde(skip_serializing_if = "Option::is_none")]
pub health: Option<String>,
/// Container exit code (only set when state is Exited): 0 = clean, non-zero = crash
#[serde(rename = "exit-code", skip_serializing_if = "Option::is_none")]
pub exit_code: Option<i32>,
#[serde(rename = "static-files")]
pub static_files: StaticFiles,
pub manifest: Manifest,

View File

@@ -1,6 +1,7 @@
// Container Health Monitor
// Checks container health every 120s, auto-restarts unhealthy containers (max 10 times)
// with escalating backoff (10s..120s), dependency-aware restart ordering (deps first),
// handles "created" state containers, resets dependent counters when deps recover,
// and sends WebSocket notifications to the UI on failure.
use crate::data_model::{Notification, NotificationLevel};
@@ -13,10 +14,10 @@ use std::sync::Arc;
use std::time::Instant;
use tracing::{debug, info, warn};
/// Upper bound on automatic restart attempts per container.
const MAX_RESTART_ATTEMPTS: u32 = 10;
/// Seconds between container health checks.
const CHECK_INTERVAL_SECS: u64 = 120;
/// Backoff delays per attempt — escalating from 10s to 120s
const BACKOFF_DELAYS_SECS: [u64; 10] = [10, 15, 20, 30, 30, 45, 60, 60, 90, 120];
/// Reset restart counter after 1 hour of stability
const STABILITY_RESET_SECS: u64 = 3600;
@@ -39,25 +40,83 @@ enum StartupTier {
fn container_tier(name: &str) -> StartupTier {
    // Classifies a container into its startup tier. The lookup key is the
    // container name with an optional "archy-" prefix removed. Every known
    // name is listed in exactly one arm; anything unrecognised is treated as
    // an application-layer container.
    let id = name.strip_prefix("archy-").unwrap_or(name);
    match id {
        // Tier 0: Databases and data stores
        "btcpay-db" | "mempool-db" | "mysql-mempool" | "penpot-postgres"
        | "immich_postgres" | "immich_redis" | "penpot-valkey"
        | "endurain-db" | "nextcloud-db"
        | "indeedhub-postgres" | "indeedhub-redis" | "indeedhub-minio" => StartupTier::Database,
        // Tier 1: Core infrastructure
        "bitcoin-knots" | "bitcoin-core" | "bitcoin" => StartupTier::CoreInfra,
        // Tier 2: Dependent services (need databases or bitcoin)
        "lnd" | "electrumx" | "mempool-electrs" | "electrs" | "nbxplorer"
        | "mempool-api" | "indeedhub-api" => StartupTier::DependentService,
        // Tier 4: Frontend/UI
        "mempool-web" | "bitcoin-ui" | "lnd-ui" | "electrs-ui"
        | "penpot-frontend" | "penpot-exporter"
        | "indeedhub" => StartupTier::Frontend,
        // Tier 3: Application layer (everything else)
        _ => StartupTier::Application,
    }
}
/// Look up the containers that `name` requires in order to run.
///
/// An optional `archy-` prefix on `name` is ignored. The returned names are
/// always un-prefixed. Containers without a known dependency yield an empty
/// slice. The health monitor uses this table to hold back a restart until the
/// dependencies are running and to reset restart counters of dependents once
/// a dependency has recovered.
///
/// NOTE(review): Bitcoin-backed services list only "bitcoin-knots"; confirm
/// this is intended for installs whose node container is named
/// "bitcoin-core" or "bitcoin".
fn container_dependencies(name: &str) -> &'static [&'static str] {
    let bare = match name.strip_prefix("archy-") {
        Some(rest) => rest,
        None => name,
    };
    match bare {
        // Everything that talks directly to the Bitcoin node (incl. its UI)
        "lnd" | "electrumx" | "mempool-electrs" | "electrs" | "nbxplorer"
        | "fedimint" | "bitcoin-ui" => &["bitcoin-knots"],
        // Lightning consumers
        "fedimint-gateway" | "lnd-ui" => &["lnd"],
        // Electrum UI
        "electrs-ui" => &["electrumx"],
        // BTCPay and Mempool stacks
        "btcpay-server" => &["btcpay-db", "nbxplorer"],
        "mempool-api" => &["mempool-db", "electrumx"],
        "mempool-web" => &["mempool-api"],
        // IndeedHub stack
        "indeedhub-api" => &["indeedhub-postgres", "indeedhub-redis"],
        "indeedhub" | "indeedhub-ffmpeg" => &["indeedhub-api"],
        "indeedhub-relay" => &["indeedhub-postgres"],
        // Other multi-container stacks
        "immich_server" => &["immich_postgres", "immich_redis"],
        "penpot-backend" => &["penpot-postgres", "penpot-valkey"],
        "penpot-frontend" => &["penpot-backend"],
        // No known dependencies
        _ => &[],
    }
}
/// Report whether every dependency of `name` is present in `containers` with
/// state `"running"`.
///
/// A dependency matches a container either by its full name or by its name
/// with the `archy-` prefix removed. Containers that have no dependencies
/// are always considered satisfied; a dependency missing from `containers`
/// counts as not running.
fn deps_are_running(name: &str, containers: &[ContainerHealth]) -> bool {
    container_dependencies(name).iter().all(|required| {
        containers.iter().any(|candidate| {
            let bare = candidate.name.strip_prefix("archy-").unwrap_or(&candidate.name);
            (bare == *required || candidate.name == *required) && candidate.state == "running"
        })
    })
}
/// Track restart attempts per container with exponential backoff and stability reset.
struct RestartTracker {
attempts: HashMap<String, u32>,
@@ -372,7 +431,7 @@ async fn check_containers() -> Vec<ContainerHealth> {
async fn restart_container(name: &str) -> bool {
info!("Auto-restarting unhealthy container: {}", name);
let result = tokio::time::timeout(
std::time::Duration::from_secs(30),
std::time::Duration::from_secs(120),
tokio::process::Command::new("podman")
.args(["start", name])
.output(),
@@ -394,7 +453,7 @@ async fn restart_container(name: &str) -> bool {
false
}
Err(_) => {
warn!("Timeout starting container {} (30s)", name);
warn!("Timeout starting container {} (120s)", name);
false
}
}
@@ -466,13 +525,33 @@ pub fn spawn_health_monitor(state: Arc<StateManager>, data_dir: PathBuf) {
if container.healthy {
if tracker.attempt_count(&container.name) > 0 {
info!("Container {} is healthy again after restart", container.name);
// Reset attempt counters for containers that depend on this one,
// since their previous failures may have been caused by this
// dependency being down
let recovered_id = container.name.strip_prefix("archy-")
.unwrap_or(&container.name).to_string();
for other in &containers {
let deps = container_dependencies(&other.name);
if deps.iter().any(|d| *d == recovered_id || *d == container.name) {
if tracker.attempt_count(&other.name) > 0 {
info!("Resetting restart counter for {} (dependency {} recovered)",
other.name, container.name);
tracker.clear(&other.name);
restart_history.clear(&other.name);
history_dirty = true;
}
}
}
tracker.clear(&container.name);
restart_history.clear(&container.name);
history_dirty = true;
}
continue;
}
if container.state == "exited" || container.state == "stopped" {
// Handle exited, stopped, AND created state containers
if container.state == "exited" || container.state == "stopped"
|| container.state == "created"
{
// Skip user-stopped containers
if user_stopped.contains(&container.name) {
debug!("Skipping user-stopped container: {}", container.name);
@@ -509,6 +588,13 @@ pub fn spawn_health_monitor(state: Arc<StateManager>, data_dir: PathBuf) {
continue;
}
// Skip if dependencies aren't running — they need to start first
if !deps_are_running(&container.name, &containers) {
let deps = container_dependencies(&container.name);
debug!("Container {} waiting for dependencies {:?}", container.name, deps);
continue;
}
// When transitioning to a higher tier, wait briefly for previous tier to stabilize
if let Some(prev) = prev_tier {
if tier > prev {
@@ -695,13 +781,13 @@ mod tests {
#[test]
fn test_max_restart_attempts_constant() {
    // Sanity bounds first, then pin the exact configured value. The value is
    // asserted once only, so the test cannot contradict itself.
    assert!(MAX_RESTART_ATTEMPTS >= 1);
    assert!(MAX_RESTART_ATTEMPTS <= 20);
    assert_eq!(MAX_RESTART_ATTEMPTS, 10);
}
#[test]
fn test_check_interval_constant() {
    // Pins the health-check period; a single assertion so it cannot
    // contradict itself.
    assert_eq!(CHECK_INTERVAL_SECS, 120);
}
#[test]
@@ -740,6 +826,44 @@ mod tests {
assert_eq!(container_tier("archy-btcpay-db"), StartupTier::Database);
assert_eq!(container_tier("immich_postgres"), StartupTier::Database);
assert_eq!(container_tier("penpot-valkey"), StartupTier::Database);
assert_eq!(container_tier("indeedhub-postgres"), StartupTier::Database);
assert_eq!(container_tier("indeedhub-redis"), StartupTier::Database);
assert_eq!(container_tier("indeedhub-minio"), StartupTier::Database);
}
#[test]
fn test_container_tier_indeedhub_api() {
    // The IndeedHub API belongs to the dependent-service tier.
    let tier = container_tier("indeedhub-api");
    assert_eq!(tier, StartupTier::DependentService);
}
#[test]
fn test_container_tier_mempool_api() {
    // The Mempool API belongs to the dependent-service tier.
    let tier = container_tier("mempool-api");
    assert_eq!(tier, StartupTier::DependentService);
}
#[test]
fn test_container_dependencies() {
    // Single-dependency service
    let lnd = container_dependencies("lnd");
    assert!(lnd.contains(&"bitcoin-knots"));

    // Services with two dependencies each
    let indeedhub_api = container_dependencies("indeedhub-api");
    assert!(indeedhub_api.contains(&"indeedhub-postgres"));
    assert!(indeedhub_api.contains(&"indeedhub-redis"));

    let mempool_api = container_dependencies("mempool-api");
    assert!(mempool_api.contains(&"mempool-db"));
    assert!(mempool_api.contains(&"electrumx"));

    // A container that is not in the table has no dependencies
    let nextcloud = container_dependencies("nextcloud");
    assert!(nextcloud.is_empty());
}
#[test]
fn test_deps_are_running() {
    // Builds a healthy, running container entry whose app id equals its name.
    let running = |name: &str| ContainerHealth {
        name: name.into(),
        app_id: name.into(),
        state: "running".into(),
        healthy: true,
    };

    // Both dependencies of indeedhub-api are up, the API itself has exited.
    let api_down = ContainerHealth {
        name: "indeedhub-api".into(),
        app_id: "indeedhub-api".into(),
        state: "exited".into(),
        healthy: false,
    };
    let containers = vec![
        running("indeedhub-postgres"),
        running("indeedhub-redis"),
        api_down,
    ];
    assert!(deps_are_running("indeedhub-api", &containers));

    // Missing postgres
    let partial = vec![running("indeedhub-redis")];
    assert!(!deps_are_running("indeedhub-api", &partial));
}
#[test]

View File

@@ -14,18 +14,21 @@ use std::path::PathBuf;
use std::sync::Arc;
use tracing::{debug, warn};
/// Spawn the background metrics collector (runs every 60 seconds).
/// Spawn the background metrics collector (runs every 300 seconds / 5 minutes).
/// Evaluates alert rules on each snapshot and dispatches notifications.
/// Note: health_monitor.rs handles container state polling at 120s intervals.
/// This collector handles system-level metrics (CPU, disk, network) and only
/// calls podman stats every 5 minutes to avoid duplicate subprocess overhead.
pub fn spawn_metrics_collector(
store: Arc<MetricsStore>,
state: Option<Arc<crate::state::StateManager>>,
data_dir: Option<PathBuf>,
) {
tokio::spawn(async move {
// Wait 30s for system to stabilize after boot
tokio::time::sleep(std::time::Duration::from_secs(30)).await;
// Wait 60s for system to stabilize after boot
tokio::time::sleep(std::time::Duration::from_secs(60)).await;
let mut interval = tokio::time::interval(std::time::Duration::from_secs(60));
let mut interval = tokio::time::interval(std::time::Duration::from_secs(300));
loop {
interval.tick().await;

View File

@@ -34,6 +34,7 @@ pub struct ContainerStatus {
pub name: String,
pub state: ContainerState,
pub health: Option<String>,
pub exit_code: Option<i32>,
pub started_at: Option<String>,
pub image: String,
pub created: String,
@@ -150,13 +151,13 @@ impl PodmanClient {
) -> Result<serde_json::Value> {
let socket_path = self.socket_path.clone();
// Connect to the unix socket
// Connect to the unix socket (30s timeout — podman can be slow under load on boot)
let stream = tokio::time::timeout(
std::time::Duration::from_secs(5),
std::time::Duration::from_secs(30),
UnixStream::connect(&socket_path),
)
.await
.map_err(|_| anyhow::anyhow!("Podman socket connection timed out"))?
.map_err(|_| anyhow::anyhow!("Podman socket connection timed out (30s)"))?
.context(format!("Cannot connect to Podman socket at {}", socket_path.display()))?;
// Build the hyper client with the unix stream
@@ -179,8 +180,11 @@ impl PodmanClient {
let req = match method {
"POST" => {
let body_str = body.map(|b| serde_json::to_string(&b).unwrap_or_default())
.unwrap_or_default();
let body_str = match body {
Some(b) => serde_json::to_string(&b)
.context("Failed to serialize request body to JSON")?,
None => String::new(),
};
Request::builder()
.method("POST")
.uri(uri)
@@ -326,6 +330,8 @@ impl PodmanClient {
"cap_drop": cap_drop,
"read_only_filesystem": manifest.app.security.readonly_root,
"no_new_privileges": true,
"restart_policy": "unless-stopped",
"restart_tries": 5,
"netns": {
"nsmode": match manifest.app.security.network_policy.as_str() {
"host" => "host",
@@ -342,8 +348,9 @@ impl PodmanClient {
).await?;
let id = result["Id"].as_str()
.unwrap_or("")
.to_string();
.filter(|s| !s.is_empty())
.map(|s| s.to_string())
.context("Podman API returned no container ID — creation may have failed")?;
Ok(id)
}
@@ -396,11 +403,14 @@ impl PodmanClient {
let ports = parse_port_bindings(&data["HostConfig"]["PortBindings"]);
let lan_address = Self::lan_address_for(&container_name);
let exit_code = data["State"]["ExitCode"].as_i64().map(|c| c as i32);
Ok(ContainerStatus {
id: data["Id"].as_str().unwrap_or("").to_string(),
name: container_name,
state: ContainerState::from(state_str),
health,
exit_code,
started_at,
image: data["ImageName"].as_str()
.or_else(|| data["Config"]["Image"].as_str())
@@ -477,11 +487,16 @@ impl PodmanClient {
.map(|s| s.to_string());
let lan_address = Self::lan_address_for(&name);
let exit_code = c["ExitCode"].as_i64()
.or_else(|| c["State"]["ExitCode"].as_i64())
.map(|c| c as i32);
result.push(ContainerStatus {
id: c["Id"].as_str().unwrap_or("").to_string(),
name,
state: ContainerState::from(c["State"].as_str().unwrap_or("unknown")),
health,
exit_code,
started_at,
image: c["Image"].as_str().unwrap_or("").to_string(),
created: c["Created"].as_str().unwrap_or("").to_string(),

View File

@@ -285,6 +285,7 @@ impl ContainerRuntime for DockerRuntime {
name: parts[1].to_string(),
state: crate::podman_client::ContainerState::from(parts[2]),
health: None,
exit_code: None,
started_at: None,
image: parts[3].to_string(),
created: parts[4].to_string(),
@@ -359,6 +360,7 @@ impl ContainerRuntime for DockerRuntime {
container["State"].as_str().unwrap_or("unknown")
),
health: None,
exit_code: container["ExitCode"].as_i64().map(|c| c as i32),
started_at: None,
image: container["Image"].as_str().unwrap_or("").to_string(),
created: container["CreatedAt"].as_str().unwrap_or("").to_string(),