fix: overhaul container lifecycle — recovery, health, uninstall, UI state
Container recovery:
- Health monitor: MAX_RESTART_ATTEMPTS 3→10, interval 60s→120s
- Dependency-aware restarts: won't restart services before their deps
- Reset dependent counters when a dependency recovers
- Handle "created" state containers (were invisible to health monitor)
- Added IndeedHub, mempool-api, mysql to tier system
- Crash recovery: podman start timeout 30s→120s with retry
- Podman client: socket timeout 5s→30s, added restart policy

UI state representation:
- Exit code 0 shows "stopped" (gray), not "crashed" (red)
- Exit code 137 shows "killed (OOM)"
- Non-zero exit shows "crashed" (red)
- Added exit_code field to PackageDataEntry

Install/uninstall fixes:
- Install returns error when container doesn't start (was silent success)
- Post-install hooks awaited instead of fire-and-forget tokio::spawn
- Uninstall: graceful rm before force, volume prune, network cleanup
- Uninstall returns error on partial failure (was 200 OK)

Config consistency:
- DB passwords read from /var/lib/archipelago/secrets/ (was hardcoded)
- Bitcoin: added ZMQ ports 28332/28333 for LND block notifications
- IndeedHub port 7777→8190 (was conflicting with strfry)
- Marketplace versions: LND 0.17.4→0.18.4, Mempool 2.5.0→3.0.0

Performance:
- Metrics collector interval 60s→300s (was duplicating health monitor)
- Podman client: proper error propagation instead of unwrap_or_default

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -34,6 +34,7 @@ pub struct ContainerStatus {
|
||||
pub name: String,
|
||||
pub state: ContainerState,
|
||||
pub health: Option<String>,
|
||||
pub exit_code: Option<i32>,
|
||||
pub started_at: Option<String>,
|
||||
pub image: String,
|
||||
pub created: String,
|
||||
@@ -150,13 +151,13 @@ impl PodmanClient {
|
||||
) -> Result<serde_json::Value> {
|
||||
let socket_path = self.socket_path.clone();
|
||||
|
||||
// Connect to the unix socket
|
||||
// Connect to the unix socket (30s timeout — podman can be slow under load on boot)
|
||||
let stream = tokio::time::timeout(
|
||||
std::time::Duration::from_secs(5),
|
||||
std::time::Duration::from_secs(30),
|
||||
UnixStream::connect(&socket_path),
|
||||
)
|
||||
.await
|
||||
.map_err(|_| anyhow::anyhow!("Podman socket connection timed out"))?
|
||||
.map_err(|_| anyhow::anyhow!("Podman socket connection timed out (30s)"))?
|
||||
.context(format!("Cannot connect to Podman socket at {}", socket_path.display()))?;
|
||||
|
||||
// Build the hyper client with the unix stream
|
||||
@@ -179,8 +180,11 @@ impl PodmanClient {
|
||||
|
||||
let req = match method {
|
||||
"POST" => {
|
||||
let body_str = body.map(|b| serde_json::to_string(&b).unwrap_or_default())
|
||||
.unwrap_or_default();
|
||||
let body_str = match body {
|
||||
Some(b) => serde_json::to_string(&b)
|
||||
.context("Failed to serialize request body to JSON")?,
|
||||
None => String::new(),
|
||||
};
|
||||
Request::builder()
|
||||
.method("POST")
|
||||
.uri(uri)
|
||||
@@ -326,6 +330,8 @@ impl PodmanClient {
|
||||
"cap_drop": cap_drop,
|
||||
"read_only_filesystem": manifest.app.security.readonly_root,
|
||||
"no_new_privileges": true,
|
||||
"restart_policy": "unless-stopped",
|
||||
"restart_tries": 5,
|
||||
"netns": {
|
||||
"nsmode": match manifest.app.security.network_policy.as_str() {
|
||||
"host" => "host",
|
||||
@@ -342,8 +348,9 @@ impl PodmanClient {
|
||||
).await?;
|
||||
|
||||
let id = result["Id"].as_str()
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
.filter(|s| !s.is_empty())
|
||||
.map(|s| s.to_string())
|
||||
.context("Podman API returned no container ID — creation may have failed")?;
|
||||
|
||||
Ok(id)
|
||||
}
|
||||
@@ -396,11 +403,14 @@ impl PodmanClient {
|
||||
let ports = parse_port_bindings(&data["HostConfig"]["PortBindings"]);
|
||||
let lan_address = Self::lan_address_for(&container_name);
|
||||
|
||||
let exit_code = data["State"]["ExitCode"].as_i64().map(|c| c as i32);
|
||||
|
||||
Ok(ContainerStatus {
|
||||
id: data["Id"].as_str().unwrap_or("").to_string(),
|
||||
name: container_name,
|
||||
state: ContainerState::from(state_str),
|
||||
health,
|
||||
exit_code,
|
||||
started_at,
|
||||
image: data["ImageName"].as_str()
|
||||
.or_else(|| data["Config"]["Image"].as_str())
|
||||
@@ -477,11 +487,16 @@ impl PodmanClient {
|
||||
.map(|s| s.to_string());
|
||||
let lan_address = Self::lan_address_for(&name);
|
||||
|
||||
let exit_code = c["ExitCode"].as_i64()
|
||||
.or_else(|| c["State"]["ExitCode"].as_i64())
|
||||
.map(|c| c as i32);
|
||||
|
||||
result.push(ContainerStatus {
|
||||
id: c["Id"].as_str().unwrap_or("").to_string(),
|
||||
name,
|
||||
state: ContainerState::from(c["State"].as_str().unwrap_or("unknown")),
|
||||
health,
|
||||
exit_code,
|
||||
started_at,
|
||||
image: c["Image"].as_str().unwrap_or("").to_string(),
|
||||
created: c["Created"].as_str().unwrap_or("").to_string(),
|
||||
|
||||
@@ -285,6 +285,7 @@ impl ContainerRuntime for DockerRuntime {
|
||||
name: parts[1].to_string(),
|
||||
state: crate::podman_client::ContainerState::from(parts[2]),
|
||||
health: None,
|
||||
exit_code: None,
|
||||
started_at: None,
|
||||
image: parts[3].to_string(),
|
||||
created: parts[4].to_string(),
|
||||
@@ -359,6 +360,7 @@ impl ContainerRuntime for DockerRuntime {
|
||||
container["State"].as_str().unwrap_or("unknown")
|
||||
),
|
||||
health: None,
|
||||
exit_code: container["ExitCode"].as_i64().map(|c| c as i32),
|
||||
started_at: None,
|
||||
image: container["Image"].as_str().unwrap_or("").to_string(),
|
||||
created: container["CreatedAt"].as_str().unwrap_or("").to_string(),
|
||||
|
||||
Reference in New Issue
Block a user