fix: harden container reconcile and launch behavior

This commit is contained in:
Dorian
2026-05-13 22:59:55 -04:00
parent 835c525218
commit 2ff47f88a7
9 changed files with 259 additions and 11 deletions

View File

@@ -62,6 +62,19 @@ fn is_required_baseline_app(app_id: &str) -> bool {
)
}
/// Returns `true` when `app_id` is on the fixed allow-list of apps that are
/// treated as restart-sensitive.
///
/// The boot-time reconcile path consults this to decide whether a running
/// container with drifted environment should be left alone instead of being
/// stopped and recreated.
fn is_restart_sensitive_app(app_id: &str) -> bool {
    // Exact, case-sensitive identifiers; no prefix or substring matching.
    const RESTART_SENSITIVE_APPS: [&str; 7] = [
        "bitcoin-knots",
        "bitcoin-core",
        "bitcoin",
        "lnd",
        "btcpay-server",
        "fedimint",
        "fedimint-gateway",
    ];
    RESTART_SENSITIVE_APPS.iter().any(|known| *known == app_id)
}
fn requires_archival_bitcoin(app_id: &str) -> bool {
matches!(
app_id,
@@ -713,6 +726,17 @@ impl ProdContainerOrchestrator {
return Ok(ReconcileAction::Started);
}
if self.container_env_drifted(&name, &resolved_manifest).await {
if mode == ReconcileMode::ExistingOnly
&& is_restart_sensitive_app(&app_id)
{
tracing::info!(
app_id = %app_id,
container = %name,
"container drift detected during boot reconcile; leaving running restart-sensitive app untouched"
);
self.run_post_start_hooks(&app_id).await?;
return Ok(ReconcileAction::NoOp);
}
tracing::info!(app_id = %app_id, container = %name, "container env drift detected — recreating");
let _ = self.runtime.stop_container(&name).await;
let _ = self.runtime.remove_container(&name).await;
@@ -2252,6 +2276,7 @@ mod tests {
runtime,
PathBuf::from("/nonexistent-for-tests"),
);
orch.set_data_dir(PathBuf::from("/nonexistent-for-tests"));
// Redirect the bitcoin-ui pre-start hook to a test-scoped
// tmpdir, seeded with a fake password file. Shared across
// every test in this module (OnceLock), so the hook can run
@@ -2259,6 +2284,7 @@ mod tests {
// this redirection, any test that installs the bitcoin-ui
// fixture would try to write under /var/lib/archipelago.
orch.set_bitcoin_ui_paths(test_bitcoin_ui_paths());
orch.set_filebrowser_paths(test_filebrowser_paths());
orch
}
@@ -2339,6 +2365,17 @@ app:
}
}
/// Builds the filebrowser path set used by tests, rooted in a temporary
/// directory.
///
/// The temporary directory is created on first use and kept in a
/// function-local `OnceLock`, so every caller gets paths under the same root.
fn test_filebrowser_paths() -> filebrowser::EnsurePaths {
    use std::sync::OnceLock;

    static DIR: OnceLock<tempfile::TempDir> = OnceLock::new();

    let root = DIR
        .get_or_init(|| tempfile::TempDir::new().expect("test tmpdir"))
        .path();
    let srv_root = root.join("filebrowser");
    let data_dir = root.join("filebrowser-data");
    let config_path = root.join("filebrowser-data/.filebrowser.json");

    filebrowser::EnsurePaths {
        srv_root,
        data_dir,
        config_path,
    }
}
#[tokio::test]
async fn install_fresh_pull() {
let rt = Arc::new(MockRuntime::default());

View File

@@ -8,7 +8,7 @@ use crate::data_model::{Notification, NotificationLevel, PackageState};
use crate::state::StateManager;
use crate::webhooks::{self, WebhookEvent};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Instant;
@@ -420,6 +420,9 @@ async fn check_containers() -> Vec<ContainerHealth> {
let stdout = String::from_utf8_lossy(&output.stdout);
let containers: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap_or_default();
let live_container_ids = live_container_ids(&containers);
cleanup_stale_podman_healthcheck_units(&live_container_ids).await;
// Monitor ALL long-running containers for health — backend services (databases,
// nbxplorer, mempool-api) and UI containers need auto-restart too.
// Only skip ephemeral containers (build infrastructure, init one-shots).
@@ -462,6 +465,154 @@ async fn check_containers() -> Vec<ContainerHealth> {
.collect()
}
/// Collects the container IDs present in a list of container JSON objects.
///
/// Each entry is looked up under the key `"Id"`, falling back to `"ID"` when
/// `"Id"` is absent. Entries where the selected value is not a JSON string
/// are skipped.
fn live_container_ids(containers: &[serde_json::Value]) -> HashSet<String> {
    let mut ids = HashSet::new();
    for container in containers {
        let raw = match container.get("Id") {
            Some(value) => Some(value),
            None => container.get("ID"),
        };
        if let Some(id) = raw.and_then(|value| value.as_str()) {
            ids.insert(id.to_string());
        }
    }
    ids
}
/// Stops leftover Podman healthcheck systemd user units whose container ID is
/// not in `live_container_ids`.
///
/// Stale units are discovered via `stale_healthcheck_units_from_systemd`,
/// which may report a healthcheck as its `.timer`, its `.service`, or both.
/// Every reported name is normalised to its base name (suffix removed) so
/// that each healthcheck is handled exactly once: the `<base>.timer` is
/// stopped, then `<base>.service` is stopped and its failed state reset.
///
/// All systemctl calls are best-effort; failures are only logged at debug
/// level by the helpers. The function does nothing when `live_container_ids`
/// is empty — presumably because an empty set cannot be told apart from a
/// failed container listing, which would make every unit look stale.
async fn cleanup_stale_podman_healthcheck_units(live_container_ids: &HashSet<String>) {
    if live_container_ids.is_empty() {
        return;
    }

    let units = stale_healthcheck_units_from_systemd(live_container_ids).await;

    // Reduce every reported unit to (base name, container id). Working on the
    // base name avoids building "<base>.service.service" when the reported
    // unit is already a service, and collapses a timer and its service into a
    // single entry.
    let mut stale: Vec<(&str, &str)> = units
        .iter()
        .filter_map(|unit| {
            let container_id = parse_podman_healthcheck_unit(unit)?;
            let base = unit
                .strip_suffix(".timer")
                .or_else(|| unit.strip_suffix(".service"))?;
            Some((base, container_id))
        })
        .collect();
    stale.sort_unstable();
    stale.dedup();

    let mut cleaned = 0;
    for (base, container_id) in stale {
        let timer = format!("{}.timer", base);
        let service = format!("{}.service", base);

        // Only timers that were actually stopped count towards the summary,
        // which keeps the "timer(s)" wording of the log line accurate.
        if stop_user_unit(&timer).await {
            cleaned += 1;
        }
        // The service may be running, failed, or already gone; stop it and
        // clear any failed state regardless of the timer outcome.
        let _ = stop_user_unit(&service).await;
        let _ = reset_failed_user_unit(&service).await;
        debug!(
            "Stopped stale Podman healthcheck unit {} for removed container {}",
            timer, container_id
        );
    }

    if cleaned > 0 {
        info!("Cleaned {} stale Podman healthcheck timer(s)", cleaned);
    }
}
/// Queries the systemd user manager and returns the names of Podman
/// healthcheck units whose container ID is not in `live_container_ids`.
///
/// Two listings are consulted in order: `systemctl --user list-timers` and
/// `systemctl --user list-units`, each bounded by a 20 second timeout. A
/// listing that times out, cannot be spawned, or exits unsuccessfully is
/// logged at debug level and skipped. The returned vector may contain
/// duplicates when both listings report the same unit.
async fn stale_healthcheck_units_from_systemd(live_container_ids: &HashSet<String>) -> Vec<String> {
    const LISTINGS: [[&str; 5]; 2] = [
        ["--user", "list-timers", "--all", "--no-legend", "--no-pager"],
        ["--user", "list-units", "--all", "--no-legend", "--no-pager"],
    ];

    let mut stale = Vec::new();
    for args in LISTINGS.iter() {
        let attempt = tokio::time::timeout(
            std::time::Duration::from_secs(20),
            tokio::process::Command::new("systemctl")
                .args(args.iter().copied())
                .output(),
        )
        .await;

        let output = match attempt {
            Err(_) => {
                debug!("systemctl {} timed out", args.join(" "));
                continue;
            }
            Ok(Err(e)) => {
                debug!("Failed to run systemctl {}: {}", args.join(" "), e);
                continue;
            }
            Ok(Ok(output)) => output,
        };

        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            debug!("systemctl {} failed: {}", args.join(" "), stderr.trim());
            continue;
        }

        let listing = String::from_utf8_lossy(&output.stdout);
        stale.extend(stale_healthcheck_units(&listing, live_container_ids));
    }
    stale
}
/// Scans `systemctl` listing text and returns every Podman healthcheck unit
/// name whose container ID is missing from `live_container_ids`.
///
/// Every whitespace-separated token is examined, so a unit name is found
/// regardless of which column it appears in. A leading `●` marker glued to a
/// token is removed before parsing. Results keep the order of appearance and
/// are not deduplicated.
fn stale_healthcheck_units(output: &str, live_container_ids: &HashSet<String>) -> Vec<String> {
    let mut stale = Vec::new();
    // Newlines are whitespace too, so splitting the whole text yields the
    // same token sequence as splitting line by line.
    for token in output.split_whitespace() {
        let candidate = token.trim_start_matches('●');
        let Some(container_id) = parse_podman_healthcheck_unit(candidate) else {
            continue;
        };
        if !live_container_ids.contains(container_id) {
            stale.push(candidate.to_string());
        }
    }
    stale
}
/// Extracts the container ID from a Podman healthcheck unit name.
///
/// A name is accepted when it ends in `.timer` or `.service` and the text
/// before the first `-` is exactly 64 ASCII hex digits. That leading part is
/// returned as a slice of the input. Anything else yields `None`, including
/// names with no `-` after the ID.
fn parse_podman_healthcheck_unit(unit: &str) -> Option<&str> {
    let stem = match unit.strip_suffix(".timer") {
        Some(stem) => stem,
        None => unit.strip_suffix(".service")?,
    };

    // Everything up to (not including) the first dash is the candidate ID.
    let dash = stem.find('-')?;
    let candidate = &stem[..dash];

    let is_full_hex_id =
        candidate.len() == 64 && candidate.chars().all(|c| c.is_ascii_hexdigit());
    is_full_hex_id.then_some(candidate)
}
/// Runs `systemctl --user stop <unit>` and reports whether it exited
/// successfully.
async fn stop_user_unit(unit: &str) -> bool {
    let args = ["stop", unit];
    run_systemctl_user(args).await
}
/// Runs `systemctl --user reset-failed <unit>` and reports whether it exited
/// successfully.
async fn reset_failed_user_unit(unit: &str) -> bool {
    let args = ["reset-failed", unit];
    run_systemctl_user(args).await
}
/// Runs `systemctl --user <args...>` with a 10 second timeout.
///
/// Returns `true` only when the process was spawned, finished within the
/// timeout, and exited successfully. Every failure mode is logged at debug
/// level and reported as `false`.
async fn run_systemctl_user<const N: usize>(args: [&str; N]) -> bool {
    let attempt = tokio::time::timeout(
        std::time::Duration::from_secs(10),
        tokio::process::Command::new("systemctl")
            .arg("--user")
            .args(args.iter().copied())
            .output(),
    )
    .await;

    let output = match attempt {
        Err(_) => {
            debug!("systemctl --user {} timed out", args.join(" "));
            return false;
        }
        Ok(Err(e)) => {
            debug!("Failed to run systemctl --user {}: {}", args.join(" "), e);
            return false;
        }
        Ok(Ok(output)) => output,
    };

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        debug!(
            "systemctl --user {} failed: {}",
            args.join(" "),
            stderr.trim()
        );
        return false;
    }
    true
}
fn parse_podman_health(c: &serde_json::Value, state: &str) -> Option<String> {
c.get("Status")
.and_then(|v| v.as_str())
@@ -1173,4 +1324,47 @@ mod tests {
Some("unhealthy")
);
}
/// Unit names with a 64-hex-digit ID prefix and a `.timer`/`.service` suffix
/// parse to that ID; unrelated or non-hex names are rejected.
#[test]
fn parses_podman_healthcheck_systemd_units() {
    let id = "c1f44a6369c91d65f9e9f6134a5591aa02792cff2f1a4e0f689b5a6c03b6c77c";

    // Accepted: both unit kinds resolve to the same container ID.
    let timer_unit = format!("{}-15c66ddfefa8a763.timer", id);
    let service_unit = format!("{}-15c66ddfefa8a763.service", id);
    for unit in [&timer_unit, &service_unit] {
        assert_eq!(parse_podman_healthcheck_unit(unit), Some(id));
    }

    // Rejected: an ordinary service name and a 64-character non-hex prefix.
    let rejected = [
        "grafana.service",
        "nothexzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz-x.timer",
    ];
    for unit in rejected {
        assert_eq!(parse_podman_healthcheck_unit(unit), None);
    }
}
/// Only units belonging to container IDs absent from the live set are
/// reported; units of live containers and unrelated services are ignored.
#[test]
fn stale_healthcheck_units_filters_only_removed_container_ids() {
    let live = "6467e25fd87d791a63fe9dbf6e2fabc7bf26533aa2c402b1089effeacf7ebbba";
    let stale = "c1f44a6369c91d65f9e9f6134a5591aa02792cff2f1a4e0f689b5a6c03b6c77c";
    let live_ids: HashSet<String> = std::iter::once(live.to_string()).collect();

    let output = format!(
        " {live}-6fdc497fd3ba3b62.timer loaded active waiting\n\
         {stale}-15c66ddfefa8a763.service loaded failed failed\n\
         grafana.service loaded active running\n\
         {stale}-1898d85de0bb707f.timer loaded active waiting\n"
    );

    // Expected names are listed in sorted order to match the sort below.
    let expected = vec![
        format!("{stale}-15c66ddfefa8a763.service"),
        format!("{stale}-1898d85de0bb707f.timer"),
    ];

    let mut found = stale_healthcheck_units(&output, &live_ids);
    found.sort();
    assert_eq!(found, expected);
}
}