fix: overhaul container lifecycle — recovery, health, uninstall, UI state
Container recovery: - Health monitor: MAX_RESTART_ATTEMPTS 3→10, interval 60s→120s - Dependency-aware restarts: won't restart services before their deps - Reset dependent counters when a dependency recovers - Handle "created" state containers (were invisible to health monitor) - Added IndeedHub, mempool-api, mysql to tier system - Crash recovery: podman start timeout 30s→120s with retry - Podman client: socket timeout 5s→30s, added restart policy UI state representation: - Exit code 0 shows "stopped" (gray), not "crashed" (red) - Exit code 137 shows "killed (OOM)" - Non-zero exit shows "crashed" (red) - Added exit_code field to PackageDataEntry Install/uninstall fixes: - Install returns error when container doesn't start (was silent success) - Post-install hooks awaited instead of fire-and-forget tokio::spawn - Uninstall: graceful rm before force, volume prune, network cleanup - Uninstall returns error on partial failure (was 200 OK) Config consistency: - DB passwords read from /var/lib/archipelago/secrets/ (was hardcoded) - Bitcoin: added ZMQ ports 28332/28333 for LND block notifications - IndeedHub port 7777→8190 (was conflicting with strfry) - Marketplace versions: LND 0.17.4→0.18.4, Mempool 2.5.0→3.0.0 Performance: - Metrics collector interval 60s→300s (was duplicating health monitor) - Podman client: proper error propagation instead of unwrap_or_default Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -389,6 +389,15 @@ pub(super) fn get_data_dirs_for_app(package_id: &str) -> Vec<String> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Read a secret from `/var/lib/archipelago/secrets/{name}`.
///
/// The file contents are returned with leading/trailing whitespace trimmed
/// (secret files commonly end in a newline).
///
/// Falls back to `default` when the file cannot be read for any reason
/// (missing, permission denied, not valid UTF-8) or when it contains only
/// whitespace, so callers never end up interpolating a blank credential
/// into a container environment variable.
fn read_secret(name: &str, default: &str) -> String {
    let path = format!("/var/lib/archipelago/secrets/{}", name);
    std::fs::read_to_string(&path)
        .ok()
        .map(|contents| contents.trim().to_string())
        // An empty secret file is treated the same as a missing one.
        .filter(|secret| !secret.is_empty())
        .unwrap_or_else(|| default.to_string())
}
|
||||
|
||||
/// Get app-specific configuration
|
||||
/// Returns: (ports, volumes, env_vars, custom_command, custom_args)
|
||||
pub(super) async fn get_app_config(
|
||||
@@ -413,7 +422,12 @@ pub(super) async fn get_app_config(
|
||||
None,
|
||||
),
|
||||
"bitcoin" | "bitcoin-core" | "bitcoin-knots" => (
|
||||
vec!["8332:8332".to_string(), "8333:8333".to_string()],
|
||||
vec![
|
||||
"8332:8332".to_string(),
|
||||
"8333:8333".to_string(),
|
||||
"28332:28332".to_string(),
|
||||
"28333:28333".to_string(),
|
||||
],
|
||||
vec!["/var/lib/archipelago/bitcoin:/home/bitcoin/.bitcoin".to_string()],
|
||||
vec![],
|
||||
None,
|
||||
@@ -453,7 +467,8 @@ pub(super) async fn get_app_config(
|
||||
format!("BTCPAY_BTCRPCURL=http://{}:8332", host_ip),
|
||||
format!("BTCPAY_BTCRPCUSER={}", rpc_user),
|
||||
format!("BTCPAY_BTCRPCPASSWORD={}", rpc_pass),
|
||||
"BTCPAY_POSTGRES=User ID=btcpay;Password=btcpaypass;Host=archy-btcpay-db;Port=5432;Database=btcpay;Include Error Detail=true".to_string(),
|
||||
format!("BTCPAY_POSTGRES=User ID=btcpay;Password={};Host=archy-btcpay-db;Port=5432;Database=btcpay;Include Error Detail=true",
|
||||
read_secret("btcpay-db-password", "btcpaypass")),
|
||||
],
|
||||
None,
|
||||
None,
|
||||
@@ -481,7 +496,7 @@ pub(super) async fn get_app_config(
|
||||
"DATABASE_HOST=archy-mempool-db".to_string(),
|
||||
"DATABASE_DATABASE=mempool".to_string(),
|
||||
"DATABASE_USERNAME=mempool".to_string(),
|
||||
"DATABASE_PASSWORD=mempoolpass".to_string(),
|
||||
format!("DATABASE_PASSWORD={}", read_secret("mempool-db-password", "mempoolpass")),
|
||||
],
|
||||
None,
|
||||
None,
|
||||
@@ -511,8 +526,8 @@ pub(super) async fn get_app_config(
|
||||
vec![
|
||||
"MYSQL_DATABASE=mempool".to_string(),
|
||||
"MYSQL_USER=mempool".to_string(),
|
||||
"MYSQL_PASSWORD=mempoolpass".to_string(),
|
||||
"MYSQL_ROOT_PASSWORD=rootpass".to_string(),
|
||||
format!("MYSQL_PASSWORD={}", read_secret("mempool-db-password", "mempoolpass")),
|
||||
format!("MYSQL_ROOT_PASSWORD={}", read_secret("mempool-db-root-password", "rootpass")),
|
||||
],
|
||||
None,
|
||||
None,
|
||||
@@ -607,7 +622,7 @@ pub(super) async fn get_app_config(
|
||||
vec![
|
||||
"DB_HOSTNAME=immich_postgres".to_string(),
|
||||
"DB_USERNAME=postgres".to_string(),
|
||||
"DB_PASSWORD=immichpass".to_string(),
|
||||
format!("DB_PASSWORD={}", read_secret("immich-db-password", "immichpass")),
|
||||
"DB_DATABASE_NAME=immich".to_string(),
|
||||
"REDIS_HOSTNAME=immich_redis".to_string(),
|
||||
"UPLOAD_LOCATION=/usr/src/app/upload".to_string(),
|
||||
|
||||
@@ -256,8 +256,9 @@ impl RpcHandler {
|
||||
.trim()
|
||||
.to_string();
|
||||
|
||||
// Post-start health verification: wait up to 30s for container to be running
|
||||
for i in 0..6u32 {
|
||||
// Post-start health verification: wait up to 60s for container to be running
|
||||
let mut container_running = false;
|
||||
for i in 0..12u32 {
|
||||
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
|
||||
let status = tokio::process::Command::new("podman")
|
||||
.args(["inspect", container_name, "--format", "{{.State.Status}}"])
|
||||
@@ -266,6 +267,7 @@ impl RpcHandler {
|
||||
if let Ok(o) = status {
|
||||
let state = String::from_utf8_lossy(&o.stdout).trim().to_string();
|
||||
if state == "running" {
|
||||
container_running = true;
|
||||
break;
|
||||
}
|
||||
if state == "exited" {
|
||||
@@ -288,12 +290,19 @@ impl RpcHandler {
|
||||
));
|
||||
}
|
||||
}
|
||||
if i == 5 {
|
||||
debug!("Container {} health check timeout (30s) — continuing anyway", container_name);
|
||||
if i == 11 {
|
||||
warn!("Container {} not running after 60s — install may have failed", container_name);
|
||||
}
|
||||
}
|
||||
|
||||
// Post-install hooks
|
||||
if !container_running {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Container {} did not reach running state within 60s. Check logs with: podman logs {}",
|
||||
container_name, container_name
|
||||
));
|
||||
}
|
||||
|
||||
// Post-install hooks — await completion before returning success
|
||||
self.run_post_install_hooks(package_id).await;
|
||||
|
||||
Ok(serde_json::json!({
|
||||
@@ -536,98 +545,106 @@ printtoconsole=1\n",
|
||||
}
|
||||
|
||||
/// Run post-install hooks (Nextcloud trusted domains, Bitcoin UI container).
|
||||
/// Critical hooks (credential setup, config) are awaited; UI container builds are background.
|
||||
async fn run_post_install_hooks(&self, package_id: &str) {
|
||||
if package_id == "filebrowser" {
|
||||
tokio::spawn(async move {
|
||||
// Wait for filebrowser to start and initialize its database
|
||||
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
|
||||
// Wait for filebrowser to start and initialize its database
|
||||
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
|
||||
|
||||
// Generate a random password (32 bytes, hex-encoded)
|
||||
let mut buf = [0u8; 32];
|
||||
rand::RngCore::fill_bytes(&mut rand::rngs::OsRng, &mut buf);
|
||||
let password = hex::encode(buf);
|
||||
// Generate a random password (32 bytes, hex-encoded)
|
||||
let mut buf = [0u8; 32];
|
||||
rand::RngCore::fill_bytes(&mut rand::rngs::OsRng, &mut buf);
|
||||
let password = hex::encode(buf);
|
||||
|
||||
// Get a JWT token with default credentials
|
||||
let login_res = reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(10))
|
||||
.build()
|
||||
.unwrap_or_default()
|
||||
.post("http://127.0.0.1:8083/api/login")
|
||||
.json(&serde_json::json!({"username": "admin", "password": "admin"}))
|
||||
.send()
|
||||
.await;
|
||||
// Get a JWT token with default credentials
|
||||
let client = match reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(10))
|
||||
.build()
|
||||
{
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to create HTTP client for FileBrowser hook: {}", e);
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let token = match login_res {
|
||||
Ok(resp) if resp.status().is_success() => {
|
||||
resp.text().await.unwrap_or_default().trim_matches('"').to_string()
|
||||
}
|
||||
_ => {
|
||||
tracing::warn!("FileBrowser not ready for password change — keeping default");
|
||||
return;
|
||||
}
|
||||
};
|
||||
let login_res = client
|
||||
.post("http://127.0.0.1:8083/api/login")
|
||||
.json(&serde_json::json!({"username": "admin", "password": "admin"}))
|
||||
.send()
|
||||
.await;
|
||||
|
||||
// Change admin password via filebrowser API
|
||||
let change_res = reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(10))
|
||||
.build()
|
||||
.unwrap_or_default()
|
||||
.put("http://127.0.0.1:8083/api/users/1")
|
||||
.header("X-Auth", &token)
|
||||
.json(&serde_json::json!({"password": password}))
|
||||
.send()
|
||||
.await;
|
||||
|
||||
match change_res {
|
||||
Ok(resp) if resp.status().is_success() => {
|
||||
let secret_dir = "/var/lib/archipelago/secrets/filebrowser";
|
||||
let _ = tokio::fs::create_dir_all(secret_dir).await;
|
||||
let _ = tokio::fs::write(
|
||||
format!("{}/password", secret_dir),
|
||||
&password,
|
||||
).await;
|
||||
info!("FileBrowser admin password secured (default credentials replaced)");
|
||||
}
|
||||
Ok(resp) => {
|
||||
tracing::warn!("FileBrowser password change failed: {}", resp.status());
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("FileBrowser password change error: {}", e);
|
||||
let token = match login_res {
|
||||
Ok(resp) if resp.status().is_success() => {
|
||||
match resp.text().await {
|
||||
Ok(t) => t.trim_matches('"').to_string(),
|
||||
Err(e) => {
|
||||
tracing::warn!("FileBrowser login response parse failed: {}", e);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
_ => {
|
||||
tracing::warn!("FileBrowser not ready for password change — keeping default");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// Change admin password via filebrowser API
|
||||
let change_res = client
|
||||
.put("http://127.0.0.1:8083/api/users/1")
|
||||
.header("X-Auth", &token)
|
||||
.json(&serde_json::json!({"password": password}))
|
||||
.send()
|
||||
.await;
|
||||
|
||||
match change_res {
|
||||
Ok(resp) if resp.status().is_success() => {
|
||||
let secret_dir = "/var/lib/archipelago/secrets/filebrowser";
|
||||
let _ = tokio::fs::create_dir_all(secret_dir).await;
|
||||
let _ = tokio::fs::write(
|
||||
format!("{}/password", secret_dir),
|
||||
&password,
|
||||
).await;
|
||||
info!("FileBrowser admin password secured (default credentials replaced)");
|
||||
}
|
||||
Ok(resp) => {
|
||||
tracing::warn!("FileBrowser password change failed: {}", resp.status());
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("FileBrowser password change error: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if package_id == "nextcloud" {
|
||||
let host_ip = self.config.host_ip.clone();
|
||||
tokio::spawn(async move {
|
||||
// Wait for Nextcloud to finish first-run initialization
|
||||
tokio::time::sleep(std::time::Duration::from_secs(30)).await;
|
||||
for domain_idx in 1..=2u8 {
|
||||
let value = if domain_idx == 1 {
|
||||
host_ip.as_str()
|
||||
} else {
|
||||
"localhost"
|
||||
};
|
||||
let _ = tokio::process::Command::new("podman")
|
||||
.args([
|
||||
"exec",
|
||||
"-u",
|
||||
"33",
|
||||
"nextcloud",
|
||||
"php",
|
||||
"occ",
|
||||
"config:system:set",
|
||||
"trusted_domains",
|
||||
&domain_idx.to_string(),
|
||||
"--value",
|
||||
value,
|
||||
])
|
||||
.output()
|
||||
.await;
|
||||
}
|
||||
info!("Nextcloud trusted domains configured for {}", host_ip);
|
||||
});
|
||||
let host_ip = &self.config.host_ip;
|
||||
// Wait for Nextcloud to finish first-run initialization
|
||||
tokio::time::sleep(std::time::Duration::from_secs(30)).await;
|
||||
for domain_idx in 1..=2u8 {
|
||||
let value = if domain_idx == 1 {
|
||||
host_ip.as_str()
|
||||
} else {
|
||||
"localhost"
|
||||
};
|
||||
let _ = tokio::process::Command::new("podman")
|
||||
.args([
|
||||
"exec",
|
||||
"-u",
|
||||
"33",
|
||||
"nextcloud",
|
||||
"php",
|
||||
"occ",
|
||||
"config:system:set",
|
||||
"trusted_domains",
|
||||
&domain_idx.to_string(),
|
||||
"--value",
|
||||
value,
|
||||
])
|
||||
.output()
|
||||
.await;
|
||||
}
|
||||
info!("Nextcloud trusted domains configured for {}", host_ip);
|
||||
}
|
||||
|
||||
// Build and start companion UI containers for headless services
|
||||
|
||||
@@ -58,6 +58,7 @@ fn create_installing_entry(package_id: &str) -> PackageDataEntry {
|
||||
PackageDataEntry {
|
||||
state: PackageState::Installing,
|
||||
health: None,
|
||||
exit_code: None,
|
||||
static_files: StaticFiles {
|
||||
license: String::new(),
|
||||
instructions: String::new(),
|
||||
|
||||
@@ -221,18 +221,30 @@ impl RpcHandler {
|
||||
}
|
||||
}
|
||||
|
||||
// Remove container (without -f to respect graceful shutdown above)
|
||||
tracing::info!("Uninstall {}: removing container {}", package_id, name);
|
||||
let rm_out = tokio::process::Command::new("podman")
|
||||
.args(["rm", "-f", name])
|
||||
.args(["rm", name])
|
||||
.output()
|
||||
.await;
|
||||
match rm_out {
|
||||
Ok(o) if o.status.success() => removed += 1,
|
||||
Ok(o) => {
|
||||
// If normal rm fails (e.g., still running), force as fallback
|
||||
let stderr = String::from_utf8_lossy(&o.stderr);
|
||||
let msg = format!("Failed to remove {}: {}", name, stderr.trim());
|
||||
tracing::error!("Uninstall {}: {}", package_id, msg);
|
||||
errors.push(msg);
|
||||
tracing::warn!("Uninstall {}: rm {} failed ({}), trying force", package_id, name, stderr.trim());
|
||||
let force_rm = tokio::process::Command::new("podman")
|
||||
.args(["rm", "-f", name])
|
||||
.output()
|
||||
.await;
|
||||
match force_rm {
|
||||
Ok(o2) if o2.status.success() => removed += 1,
|
||||
_ => {
|
||||
let msg = format!("Failed to remove {}: {}", name, stderr.trim());
|
||||
tracing::error!("Uninstall {}: {}", package_id, msg);
|
||||
errors.push(msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
let msg = format!("Failed to remove {}: {}", name, e);
|
||||
@@ -242,6 +254,26 @@ impl RpcHandler {
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up dangling volumes associated with removed containers
|
||||
let _ = tokio::process::Command::new("podman")
|
||||
.args(["volume", "prune", "-f"])
|
||||
.output()
|
||||
.await;
|
||||
|
||||
// Clean up app-specific networks (only if no other containers use them)
|
||||
let app_networks: Vec<&str> = match package_id {
|
||||
"immich" | "immich_server" => vec!["immich-net"],
|
||||
"penpot" | "penpot-frontend" => vec!["penpot-net"],
|
||||
"indeedhub" | "indeedhub-api" => vec!["indeedhub-net"],
|
||||
_ => vec![],
|
||||
};
|
||||
for net in &app_networks {
|
||||
let _ = tokio::process::Command::new("podman")
|
||||
.args(["network", "rm", net])
|
||||
.output()
|
||||
.await;
|
||||
}
|
||||
|
||||
// Release port allocation
|
||||
{
|
||||
let mut allocator = self.port_allocator.lock().await;
|
||||
@@ -257,10 +289,19 @@ impl RpcHandler {
|
||||
.args(["rm", "-rf", dir])
|
||||
.output()
|
||||
.await;
|
||||
if let Ok(o) = rm_out {
|
||||
if !o.status.success() {
|
||||
tracing::warn!("Uninstall {}: rm {} failed", package_id, dir);
|
||||
match rm_out {
|
||||
Ok(o) if !o.status.success() => {
|
||||
let stderr = String::from_utf8_lossy(&o.stderr);
|
||||
let msg = format!("Failed to remove data {}: {}", dir, stderr.trim());
|
||||
tracing::error!("Uninstall {}: {}", package_id, msg);
|
||||
errors.push(msg);
|
||||
}
|
||||
Err(e) => {
|
||||
let msg = format!("Failed to remove data {}: {}", dir, e);
|
||||
tracing::error!("Uninstall {}: {}", package_id, msg);
|
||||
errors.push(msg);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -271,20 +312,24 @@ impl RpcHandler {
|
||||
package_id,
|
||||
errors
|
||||
);
|
||||
} else {
|
||||
tracing::info!(
|
||||
"Uninstall {} complete: stopped={}, removed={}",
|
||||
return Err(anyhow::anyhow!(
|
||||
"Uninstall {} partially failed: {}",
|
||||
package_id,
|
||||
stopped,
|
||||
removed
|
||||
);
|
||||
errors.join("; ")
|
||||
));
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
"Uninstall {} complete: stopped={}, removed={}",
|
||||
package_id,
|
||||
stopped,
|
||||
removed
|
||||
);
|
||||
|
||||
Ok(serde_json::json!({
|
||||
"status": if errors.is_empty() { "uninstalled" } else { "partial" },
|
||||
"status": "uninstalled",
|
||||
"stopped": stopped,
|
||||
"removed": removed,
|
||||
"errors": errors,
|
||||
}))
|
||||
}
|
||||
|
||||
|
||||
@@ -146,6 +146,7 @@ impl DockerPackageScanner {
|
||||
let package = PackageDataEntry {
|
||||
state: package_state.clone(),
|
||||
health: container.health.clone(),
|
||||
exit_code: if package_state == PackageState::Exited { container.exit_code } else { None },
|
||||
static_files: StaticFiles {
|
||||
license: "MIT".to_string(),
|
||||
instructions: metadata.description.clone(),
|
||||
|
||||
@@ -262,33 +262,47 @@ pub async fn recover_containers(containers: &[RunningContainerRecord]) -> Recove
|
||||
tokio::time::sleep(std::time::Duration::from_secs(3)).await;
|
||||
}
|
||||
|
||||
let result = tokio::time::timeout(
|
||||
std::time::Duration::from_secs(30),
|
||||
tokio::process::Command::new("podman")
|
||||
.args(["start", &record.name])
|
||||
.output(),
|
||||
)
|
||||
.await;
|
||||
// Try up to 2 attempts with increasing timeout (120s first, 180s retry)
|
||||
let mut started = false;
|
||||
for attempt in 0..2u32 {
|
||||
let timeout_secs = if attempt == 0 { 120 } else { 180 };
|
||||
if attempt > 0 {
|
||||
info!("Retrying container {} (attempt {})", record.name, attempt + 1);
|
||||
tokio::time::sleep(std::time::Duration::from_secs(10)).await;
|
||||
}
|
||||
let result = tokio::time::timeout(
|
||||
std::time::Duration::from_secs(timeout_secs),
|
||||
tokio::process::Command::new("podman")
|
||||
.args(["start", &record.name])
|
||||
.output(),
|
||||
)
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(Ok(output)) if output.status.success() => {
|
||||
info!("Successfully restarted container: {}", record.name);
|
||||
report.recovered += 1;
|
||||
}
|
||||
Ok(Ok(output)) => {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
warn!("Failed to restart container {}: {}", record.name, stderr.trim());
|
||||
report.failed.push(record.name.clone());
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
warn!("Failed to execute podman start for {}: {}", record.name, e);
|
||||
report.failed.push(record.name.clone());
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Timeout starting container {} (30s)", record.name);
|
||||
report.failed.push(record.name.clone());
|
||||
match result {
|
||||
Ok(Ok(output)) if output.status.success() => {
|
||||
info!("Successfully restarted container: {}", record.name);
|
||||
report.recovered += 1;
|
||||
started = true;
|
||||
break;
|
||||
}
|
||||
Ok(Ok(output)) => {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
warn!("Failed to restart container {} (attempt {}): {}",
|
||||
record.name, attempt + 1, stderr.trim());
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
warn!("Failed to execute podman start for {} (attempt {}): {}",
|
||||
record.name, attempt + 1, e);
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Timeout starting container {} ({}s, attempt {})",
|
||||
record.name, timeout_secs, attempt + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
if !started {
|
||||
report.failed.push(record.name.clone());
|
||||
}
|
||||
}
|
||||
|
||||
report
|
||||
@@ -313,7 +327,7 @@ fn is_process_running(pid: u32) -> bool {
|
||||
/// Skips containers that the user intentionally stopped via the UI.
|
||||
pub async fn start_stopped_containers(data_dir: &Path) -> RecoveryReport {
|
||||
let output = match tokio::time::timeout(
|
||||
std::time::Duration::from_secs(30),
|
||||
std::time::Duration::from_secs(60),
|
||||
tokio::process::Command::new("podman")
|
||||
.args(["ps", "-a", "--filter", "status=exited", "--filter", "status=created", "--format", "{{.Names}}"])
|
||||
.output(),
|
||||
@@ -322,7 +336,7 @@ pub async fn start_stopped_containers(data_dir: &Path) -> RecoveryReport {
|
||||
{
|
||||
Ok(result) => result,
|
||||
Err(_) => {
|
||||
warn!("Timeout listing stopped containers (30s)");
|
||||
warn!("Timeout listing stopped containers (60s)");
|
||||
return RecoveryReport { total: 0, recovered: 0, failed: Vec::new() };
|
||||
}
|
||||
};
|
||||
@@ -374,12 +388,21 @@ pub async fn start_stopped_containers(data_dir: &Path) -> RecoveryReport {
|
||||
/// Map a container name to its numeric boot tier.
///
/// An optional `archy-` prefix is stripped before matching, so
/// `"archy-btcpay-db"` and `"btcpay-db"` land in the same tier.
/// Unknown containers fall into tier 3.
///
/// NOTE(review): presumably lower tiers are started first during recovery —
/// confirm against the caller, which is not visible here.
fn container_boot_tier(name: &str) -> u8 {
    let id = name.strip_prefix("archy-").unwrap_or(name);
    match id {
        // Tier 0: Databases and data stores
        "btcpay-db" | "mempool-db" | "mysql-mempool" | "penpot-postgres"
        | "immich_postgres" | "immich_redis" | "penpot-valkey"
        | "endurain-db" | "nextcloud-db"
        | "indeedhub-postgres" | "indeedhub-redis" | "indeedhub-minio" => 0,
        // Tier 1: Core infrastructure
        "bitcoin-knots" | "bitcoin-core" | "bitcoin" => 1,
        // Tier 2: Dependent services
        "lnd" | "electrumx" | "mempool-electrs" | "electrs" | "nbxplorer"
        | "mempool-api" | "indeedhub-api" => 2,
        // Tier 4: Frontend/UI
        "mempool-web" | "bitcoin-ui" | "lnd-ui" | "electrs-ui"
        | "penpot-frontend" | "penpot-exporter"
        | "indeedhub" => 4,
        // Tier 3: Everything else
        _ => 3,
    }
}
|
||||
|
||||
@@ -124,6 +124,9 @@ pub struct PackageDataEntry {
|
||||
/// Container health: "healthy", "unhealthy", "starting", or null
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub health: Option<String>,
|
||||
/// Container exit code (only set when state is Exited): 0 = clean, non-zero = crash
|
||||
#[serde(rename = "exit-code", skip_serializing_if = "Option::is_none")]
|
||||
pub exit_code: Option<i32>,
|
||||
#[serde(rename = "static-files")]
|
||||
pub static_files: StaticFiles,
|
||||
pub manifest: Manifest,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
// Container Health Monitor
|
||||
// Checks container health every 60s, auto-restarts unhealthy containers (max 3 times)
|
||||
// with exponential backoff (10s, 30s, 90s), dependency-aware startup ordering,
|
||||
// Checks container health every 120s, auto-restarts unhealthy containers (max 10 times)
|
||||
// with exponential backoff (10s..120s), dependency-aware restart ordering (deps first),
|
||||
// handles "created" state containers, resets dependent counters when deps recover,
|
||||
// and sends WebSocket notifications to the UI on failure.
|
||||
|
||||
use crate::data_model::{Notification, NotificationLevel};
|
||||
@@ -13,10 +14,10 @@ use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
const MAX_RESTART_ATTEMPTS: u32 = 3;
|
||||
const CHECK_INTERVAL_SECS: u64 = 60;
|
||||
/// Backoff delays per attempt: 10s, 30s, 90s
|
||||
const BACKOFF_DELAYS_SECS: [u64; 3] = [10, 30, 90];
|
||||
const MAX_RESTART_ATTEMPTS: u32 = 10;
|
||||
const CHECK_INTERVAL_SECS: u64 = 120;
|
||||
/// Backoff delays per attempt — escalating from 10s to 120s
|
||||
const BACKOFF_DELAYS_SECS: [u64; 10] = [10, 15, 20, 30, 30, 45, 60, 60, 90, 120];
|
||||
/// Reset restart counter after 1 hour of stability
|
||||
const STABILITY_RESET_SECS: u64 = 3600;
|
||||
|
||||
@@ -39,25 +40,83 @@ enum StartupTier {
|
||||
fn container_tier(name: &str) -> StartupTier {
|
||||
let id = name.strip_prefix("archy-").unwrap_or(name);
|
||||
match id {
|
||||
// Tier 0: Databases
|
||||
"btcpay-db" | "mempool-db" | "penpot-postgres" | "immich_postgres"
|
||||
| "immich_redis" | "penpot-valkey" | "endurain-db" | "nextcloud-db" => StartupTier::Database,
|
||||
// Tier 0: Databases and data stores
|
||||
"btcpay-db" | "mempool-db" | "mysql-mempool" | "penpot-postgres"
|
||||
| "immich_postgres" | "immich_redis" | "penpot-valkey"
|
||||
| "endurain-db" | "nextcloud-db"
|
||||
| "indeedhub-postgres" | "indeedhub-redis" | "indeedhub-minio" => StartupTier::Database,
|
||||
|
||||
// Tier 1: Core infrastructure
|
||||
"bitcoin-knots" | "bitcoin-core" | "bitcoin" => StartupTier::CoreInfra,
|
||||
|
||||
// Tier 2: Dependent services
|
||||
"lnd" | "electrumx" | "mempool-electrs" | "electrs" | "nbxplorer" => StartupTier::DependentService,
|
||||
// Tier 2: Dependent services (need databases or bitcoin)
|
||||
"lnd" | "electrumx" | "mempool-electrs" | "electrs" | "nbxplorer"
|
||||
| "mempool-api" | "indeedhub-api" => StartupTier::DependentService,
|
||||
|
||||
// Tier 4: Frontend/UI
|
||||
"mempool-web" | "bitcoin-ui" | "lnd-ui" | "electrs-ui"
|
||||
| "penpot-frontend" | "penpot-exporter" => StartupTier::Frontend,
|
||||
| "penpot-frontend" | "penpot-exporter"
|
||||
| "indeedhub" => StartupTier::Frontend,
|
||||
|
||||
// Tier 3: Everything else
|
||||
// Tier 3: Application layer (everything else)
|
||||
_ => StartupTier::Application,
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up the containers that `name` requires in order to run.
///
/// The optional `archy-` prefix is ignored when matching. The returned slice
/// holds un-prefixed container ids; it is empty for containers with no
/// declared dependencies.
///
/// NOTE(review): only `bitcoin-knots` is named as the Bitcoin dependency,
/// while `container_tier` also recognises `bitcoin-core` and `bitcoin` —
/// confirm which container name is actually deployed.
fn container_dependencies(name: &str) -> &'static [&'static str] {
    let bare = match name.strip_prefix("archy-") {
        Some(rest) => rest,
        None => name,
    };

    // Arms are grouped by dependency set rather than by application stack.
    match bare {
        // Everything that talks directly to the Bitcoin node
        "lnd" | "electrumx" | "mempool-electrs" | "electrs" | "nbxplorer"
        | "fedimint" | "bitcoin-ui" => &["bitcoin-knots"],

        // Lightning consumers
        "fedimint-gateway" | "lnd-ui" => &["lnd"],

        // BTCPay and Mempool
        "btcpay-server" => &["btcpay-db", "nbxplorer"],
        "mempool-api" => &["mempool-db", "electrumx"],
        "mempool-web" => &["mempool-api"],
        "electrs-ui" => &["electrumx"],

        // IndeedHub stack
        "indeedhub-api" => &["indeedhub-postgres", "indeedhub-redis"],
        "indeedhub" | "indeedhub-ffmpeg" => &["indeedhub-api"],
        "indeedhub-relay" => &["indeedhub-postgres"],

        // Immich and Penpot
        "immich_server" => &["immich_postgres", "immich_redis"],
        "penpot-backend" => &["penpot-postgres", "penpot-valkey"],
        "penpot-frontend" => &["penpot-backend"],

        // No declared dependencies
        _ => &[],
    }
}
|
||||
|
||||
/// Check if all of a container's dependencies are currently running.
|
||||
fn deps_are_running(name: &str, containers: &[ContainerHealth]) -> bool {
|
||||
let deps = container_dependencies(name);
|
||||
if deps.is_empty() {
|
||||
return true;
|
||||
}
|
||||
for dep in deps {
|
||||
// Check both plain name and archy- prefixed name
|
||||
let dep_running = containers.iter().any(|c| {
|
||||
let c_id = c.name.strip_prefix("archy-").unwrap_or(&c.name);
|
||||
(c_id == *dep || c.name == *dep) && c.state == "running"
|
||||
});
|
||||
if !dep_running {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// Track restart attempts per container with exponential backoff and stability reset.
|
||||
struct RestartTracker {
|
||||
attempts: HashMap<String, u32>,
|
||||
@@ -372,7 +431,7 @@ async fn check_containers() -> Vec<ContainerHealth> {
|
||||
async fn restart_container(name: &str) -> bool {
|
||||
info!("Auto-restarting unhealthy container: {}", name);
|
||||
let result = tokio::time::timeout(
|
||||
std::time::Duration::from_secs(30),
|
||||
std::time::Duration::from_secs(120),
|
||||
tokio::process::Command::new("podman")
|
||||
.args(["start", name])
|
||||
.output(),
|
||||
@@ -394,7 +453,7 @@ async fn restart_container(name: &str) -> bool {
|
||||
false
|
||||
}
|
||||
Err(_) => {
|
||||
warn!("Timeout starting container {} (30s)", name);
|
||||
warn!("Timeout starting container {} (120s)", name);
|
||||
false
|
||||
}
|
||||
}
|
||||
@@ -466,13 +525,33 @@ pub fn spawn_health_monitor(state: Arc<StateManager>, data_dir: PathBuf) {
|
||||
if container.healthy {
|
||||
if tracker.attempt_count(&container.name) > 0 {
|
||||
info!("Container {} is healthy again after restart", container.name);
|
||||
// Reset attempt counters for containers that depend on this one,
|
||||
// since their previous failures may have been caused by this
|
||||
// dependency being down
|
||||
let recovered_id = container.name.strip_prefix("archy-")
|
||||
.unwrap_or(&container.name).to_string();
|
||||
for other in &containers {
|
||||
let deps = container_dependencies(&other.name);
|
||||
if deps.iter().any(|d| *d == recovered_id || *d == container.name) {
|
||||
if tracker.attempt_count(&other.name) > 0 {
|
||||
info!("Resetting restart counter for {} (dependency {} recovered)",
|
||||
other.name, container.name);
|
||||
tracker.clear(&other.name);
|
||||
restart_history.clear(&other.name);
|
||||
history_dirty = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
tracker.clear(&container.name);
|
||||
restart_history.clear(&container.name);
|
||||
history_dirty = true;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if container.state == "exited" || container.state == "stopped" {
|
||||
// Handle exited, stopped, AND created state containers
|
||||
if container.state == "exited" || container.state == "stopped"
|
||||
|| container.state == "created"
|
||||
{
|
||||
// Skip user-stopped containers
|
||||
if user_stopped.contains(&container.name) {
|
||||
debug!("Skipping user-stopped container: {}", container.name);
|
||||
@@ -509,6 +588,13 @@ pub fn spawn_health_monitor(state: Arc<StateManager>, data_dir: PathBuf) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip if dependencies aren't running — they need to start first
|
||||
if !deps_are_running(&container.name, &containers) {
|
||||
let deps = container_dependencies(&container.name);
|
||||
debug!("Container {} waiting for dependencies {:?}", container.name, deps);
|
||||
continue;
|
||||
}
|
||||
|
||||
// When transitioning to a higher tier, wait briefly for previous tier to stabilize
|
||||
if let Some(prev) = prev_tier {
|
||||
if tier > prev {
|
||||
@@ -695,13 +781,13 @@ mod tests {
|
||||
#[test]
|
||||
fn test_max_restart_attempts_constant() {
|
||||
assert!(MAX_RESTART_ATTEMPTS >= 1);
|
||||
assert!(MAX_RESTART_ATTEMPTS <= 10);
|
||||
assert_eq!(MAX_RESTART_ATTEMPTS, 3);
|
||||
assert!(MAX_RESTART_ATTEMPTS <= 20);
|
||||
assert_eq!(MAX_RESTART_ATTEMPTS, 10);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_check_interval_constant() {
|
||||
assert_eq!(CHECK_INTERVAL_SECS, 60);
|
||||
assert_eq!(CHECK_INTERVAL_SECS, 120);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -740,6 +826,44 @@ mod tests {
|
||||
assert_eq!(container_tier("archy-btcpay-db"), StartupTier::Database);
|
||||
assert_eq!(container_tier("immich_postgres"), StartupTier::Database);
|
||||
assert_eq!(container_tier("penpot-valkey"), StartupTier::Database);
|
||||
assert_eq!(container_tier("indeedhub-postgres"), StartupTier::Database);
|
||||
assert_eq!(container_tier("indeedhub-redis"), StartupTier::Database);
|
||||
assert_eq!(container_tier("indeedhub-minio"), StartupTier::Database);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_container_tier_indeedhub_api() {
|
||||
assert_eq!(container_tier("indeedhub-api"), StartupTier::DependentService);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_container_tier_mempool_api() {
|
||||
assert_eq!(container_tier("mempool-api"), StartupTier::DependentService);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_container_dependencies() {
|
||||
assert!(container_dependencies("lnd").contains(&"bitcoin-knots"));
|
||||
assert!(container_dependencies("indeedhub-api").contains(&"indeedhub-postgres"));
|
||||
assert!(container_dependencies("indeedhub-api").contains(&"indeedhub-redis"));
|
||||
assert!(container_dependencies("mempool-api").contains(&"mempool-db"));
|
||||
assert!(container_dependencies("mempool-api").contains(&"electrumx"));
|
||||
assert!(container_dependencies("nextcloud").is_empty());
|
||||
}
|
||||
|
||||
#[test]
fn test_deps_are_running() {
    // Build a fixture entry whose container name doubles as its app id.
    let entry = |id: &str, state: &str, healthy: bool| ContainerHealth {
        name: id.into(),
        app_id: id.into(),
        state: state.into(),
        healthy,
    };

    // Both dependencies of indeedhub-api are present and running.
    let containers = vec![
        entry("indeedhub-postgres", "running", true),
        entry("indeedhub-redis", "running", true),
        entry("indeedhub-api", "exited", false),
    ];
    assert!(deps_are_running("indeedhub-api", &containers));

    // Missing postgres
    let partial = vec![entry("indeedhub-redis", "running", true)];
    assert!(!deps_are_running("indeedhub-api", &partial));
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -14,18 +14,21 @@ use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
/// Spawn the background metrics collector (runs every 60 seconds).
|
||||
/// Spawn the background metrics collector (runs every 300 seconds / 5 minutes).
|
||||
/// Evaluates alert rules on each snapshot and dispatches notifications.
|
||||
/// Note: health_monitor.rs handles container state polling at 120s intervals.
|
||||
/// This collector handles system-level metrics (CPU, disk, network) and only
|
||||
/// calls podman stats every 5 minutes to avoid duplicate subprocess overhead.
|
||||
pub fn spawn_metrics_collector(
|
||||
store: Arc<MetricsStore>,
|
||||
state: Option<Arc<crate::state::StateManager>>,
|
||||
data_dir: Option<PathBuf>,
|
||||
) {
|
||||
tokio::spawn(async move {
|
||||
// Wait 30s for system to stabilize after boot
|
||||
tokio::time::sleep(std::time::Duration::from_secs(30)).await;
|
||||
// Wait 60s for system to stabilize after boot
|
||||
tokio::time::sleep(std::time::Duration::from_secs(60)).await;
|
||||
|
||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(60));
|
||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(300));
|
||||
|
||||
loop {
|
||||
interval.tick().await;
|
||||
|
||||
@@ -34,6 +34,7 @@ pub struct ContainerStatus {
|
||||
pub name: String,
|
||||
pub state: ContainerState,
|
||||
pub health: Option<String>,
|
||||
pub exit_code: Option<i32>,
|
||||
pub started_at: Option<String>,
|
||||
pub image: String,
|
||||
pub created: String,
|
||||
@@ -150,13 +151,13 @@ impl PodmanClient {
|
||||
) -> Result<serde_json::Value> {
|
||||
let socket_path = self.socket_path.clone();
|
||||
|
||||
// Connect to the unix socket
|
||||
// Connect to the unix socket (30s timeout — podman can be slow under load on boot)
|
||||
let stream = tokio::time::timeout(
|
||||
std::time::Duration::from_secs(5),
|
||||
std::time::Duration::from_secs(30),
|
||||
UnixStream::connect(&socket_path),
|
||||
)
|
||||
.await
|
||||
.map_err(|_| anyhow::anyhow!("Podman socket connection timed out"))?
|
||||
.map_err(|_| anyhow::anyhow!("Podman socket connection timed out (30s)"))?
|
||||
.context(format!("Cannot connect to Podman socket at {}", socket_path.display()))?;
|
||||
|
||||
// Build the hyper client with the unix stream
|
||||
@@ -179,8 +180,11 @@ impl PodmanClient {
|
||||
|
||||
let req = match method {
|
||||
"POST" => {
|
||||
let body_str = body.map(|b| serde_json::to_string(&b).unwrap_or_default())
|
||||
.unwrap_or_default();
|
||||
let body_str = match body {
|
||||
Some(b) => serde_json::to_string(&b)
|
||||
.context("Failed to serialize request body to JSON")?,
|
||||
None => String::new(),
|
||||
};
|
||||
Request::builder()
|
||||
.method("POST")
|
||||
.uri(uri)
|
||||
@@ -326,6 +330,8 @@ impl PodmanClient {
|
||||
"cap_drop": cap_drop,
|
||||
"read_only_filesystem": manifest.app.security.readonly_root,
|
||||
"no_new_privileges": true,
|
||||
"restart_policy": "unless-stopped",
|
||||
"restart_tries": 5,
|
||||
"netns": {
|
||||
"nsmode": match manifest.app.security.network_policy.as_str() {
|
||||
"host" => "host",
|
||||
@@ -342,8 +348,9 @@ impl PodmanClient {
|
||||
).await?;
|
||||
|
||||
let id = result["Id"].as_str()
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
.filter(|s| !s.is_empty())
|
||||
.map(|s| s.to_string())
|
||||
.context("Podman API returned no container ID — creation may have failed")?;
|
||||
|
||||
Ok(id)
|
||||
}
|
||||
@@ -396,11 +403,14 @@ impl PodmanClient {
|
||||
let ports = parse_port_bindings(&data["HostConfig"]["PortBindings"]);
|
||||
let lan_address = Self::lan_address_for(&container_name);
|
||||
|
||||
let exit_code = data["State"]["ExitCode"].as_i64().map(|c| c as i32);
|
||||
|
||||
Ok(ContainerStatus {
|
||||
id: data["Id"].as_str().unwrap_or("").to_string(),
|
||||
name: container_name,
|
||||
state: ContainerState::from(state_str),
|
||||
health,
|
||||
exit_code,
|
||||
started_at,
|
||||
image: data["ImageName"].as_str()
|
||||
.or_else(|| data["Config"]["Image"].as_str())
|
||||
@@ -477,11 +487,16 @@ impl PodmanClient {
|
||||
.map(|s| s.to_string());
|
||||
let lan_address = Self::lan_address_for(&name);
|
||||
|
||||
let exit_code = c["ExitCode"].as_i64()
|
||||
.or_else(|| c["State"]["ExitCode"].as_i64())
|
||||
.map(|c| c as i32);
|
||||
|
||||
result.push(ContainerStatus {
|
||||
id: c["Id"].as_str().unwrap_or("").to_string(),
|
||||
name,
|
||||
state: ContainerState::from(c["State"].as_str().unwrap_or("unknown")),
|
||||
health,
|
||||
exit_code,
|
||||
started_at,
|
||||
image: c["Image"].as_str().unwrap_or("").to_string(),
|
||||
created: c["Created"].as_str().unwrap_or("").to_string(),
|
||||
|
||||
@@ -285,6 +285,7 @@ impl ContainerRuntime for DockerRuntime {
|
||||
name: parts[1].to_string(),
|
||||
state: crate::podman_client::ContainerState::from(parts[2]),
|
||||
health: None,
|
||||
exit_code: None,
|
||||
started_at: None,
|
||||
image: parts[3].to_string(),
|
||||
created: parts[4].to_string(),
|
||||
@@ -359,6 +360,7 @@ impl ContainerRuntime for DockerRuntime {
|
||||
container["State"].as_str().unwrap_or("unknown")
|
||||
),
|
||||
health: None,
|
||||
exit_code: container["ExitCode"].as_i64().map(|c| c as i32),
|
||||
started_at: None,
|
||||
image: container["Image"].as_str().unwrap_or("").to_string(),
|
||||
created: container["CreatedAt"].as_str().unwrap_or("").to_string(),
|
||||
|
||||
Reference in New Issue
Block a user