fix: harden container reconcile and launch behavior
This commit is contained in:
@@ -62,6 +62,19 @@ fn is_required_baseline_app(app_id: &str) -> bool {
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns `true` when `app_id` is one of the apps treated as
/// restart-sensitive.
///
/// The reconcile path consults this when container env drift is detected in
/// `ReconcileMode::ExistingOnly`: a matching app is left running instead of
/// being stopped and recreated.
///
/// The match is exact and case-sensitive; any other id returns `false`.
fn is_restart_sensitive_app(app_id: &str) -> bool {
    matches!(
        app_id,
        "bitcoin-knots"
            | "bitcoin-core"
            | "bitcoin"
            | "lnd"
            | "btcpay-server"
            | "fedimint"
            | "fedimint-gateway"
    )
}
|
||||
|
||||
fn requires_archival_bitcoin(app_id: &str) -> bool {
|
||||
matches!(
|
||||
app_id,
|
||||
@@ -713,6 +726,17 @@ impl ProdContainerOrchestrator {
|
||||
return Ok(ReconcileAction::Started);
|
||||
}
|
||||
if self.container_env_drifted(&name, &resolved_manifest).await {
|
||||
if mode == ReconcileMode::ExistingOnly
|
||||
&& is_restart_sensitive_app(&app_id)
|
||||
{
|
||||
tracing::info!(
|
||||
app_id = %app_id,
|
||||
container = %name,
|
||||
"container drift detected during boot reconcile; leaving running restart-sensitive app untouched"
|
||||
);
|
||||
self.run_post_start_hooks(&app_id).await?;
|
||||
return Ok(ReconcileAction::NoOp);
|
||||
}
|
||||
tracing::info!(app_id = %app_id, container = %name, "container env drift detected — recreating");
|
||||
let _ = self.runtime.stop_container(&name).await;
|
||||
let _ = self.runtime.remove_container(&name).await;
|
||||
@@ -2252,6 +2276,7 @@ mod tests {
|
||||
runtime,
|
||||
PathBuf::from("/nonexistent-for-tests"),
|
||||
);
|
||||
orch.set_data_dir(PathBuf::from("/nonexistent-for-tests"));
|
||||
// Redirect the bitcoin-ui pre-start hook to a test-scoped
|
||||
// tmpdir, seeded with a fake password file. Shared across
|
||||
// every test in this module (OnceLock), so the hook can run
|
||||
@@ -2259,6 +2284,7 @@ mod tests {
|
||||
// this redirection, any test that installs the bitcoin-ui
|
||||
// fixture would try to write under /var/lib/archipelago.
|
||||
orch.set_bitcoin_ui_paths(test_bitcoin_ui_paths());
|
||||
orch.set_filebrowser_paths(test_filebrowser_paths());
|
||||
orch
|
||||
}
|
||||
|
||||
@@ -2339,6 +2365,17 @@ app:
|
||||
}
|
||||
}
|
||||
|
||||
fn test_filebrowser_paths() -> filebrowser::EnsurePaths {
|
||||
use std::sync::OnceLock;
|
||||
static DIR: OnceLock<tempfile::TempDir> = OnceLock::new();
|
||||
let dir = DIR.get_or_init(|| tempfile::TempDir::new().expect("test tmpdir"));
|
||||
filebrowser::EnsurePaths {
|
||||
srv_root: dir.path().join("filebrowser"),
|
||||
data_dir: dir.path().join("filebrowser-data"),
|
||||
config_path: dir.path().join("filebrowser-data/.filebrowser.json"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn install_fresh_pull() {
|
||||
let rt = Arc::new(MockRuntime::default());
|
||||
|
||||
@@ -8,7 +8,7 @@ use crate::data_model::{Notification, NotificationLevel, PackageState};
|
||||
use crate::state::StateManager;
|
||||
use crate::webhooks::{self, WebhookEvent};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
@@ -420,6 +420,9 @@ async fn check_containers() -> Vec<ContainerHealth> {
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
let containers: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap_or_default();
|
||||
|
||||
let live_container_ids = live_container_ids(&containers);
|
||||
cleanup_stale_podman_healthcheck_units(&live_container_ids).await;
|
||||
|
||||
// Monitor ALL long-running containers for health — backend services (databases,
|
||||
// nbxplorer, mempool-api) and UI containers need auto-restart too.
|
||||
// Only skip ephemeral containers (build infrastructure, init one-shots).
|
||||
@@ -462,6 +465,154 @@ async fn check_containers() -> Vec<ContainerHealth> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn live_container_ids(containers: &[serde_json::Value]) -> HashSet<String> {
|
||||
containers
|
||||
.iter()
|
||||
.filter_map(|c| {
|
||||
c.get("Id")
|
||||
.or_else(|| c.get("ID"))
|
||||
.and_then(|v| v.as_str())
|
||||
.map(|s| s.to_string())
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
async fn cleanup_stale_podman_healthcheck_units(live_container_ids: &HashSet<String>) {
|
||||
if live_container_ids.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut units = stale_healthcheck_units_from_systemd(live_container_ids).await;
|
||||
if units.is_empty() {
|
||||
return;
|
||||
}
|
||||
units.sort();
|
||||
units.dedup();
|
||||
|
||||
let mut cleaned = 0;
|
||||
for unit in units {
|
||||
let Some(container_id) = parse_podman_healthcheck_unit(&unit) else {
|
||||
continue;
|
||||
};
|
||||
let service = format!("{}.service", unit.trim_end_matches(".timer"));
|
||||
if stop_user_unit(&unit).await {
|
||||
cleaned += 1;
|
||||
}
|
||||
let _ = stop_user_unit(&service).await;
|
||||
let _ = reset_failed_user_unit(&service).await;
|
||||
debug!(
|
||||
"Stopped stale Podman healthcheck unit {} for removed container {}",
|
||||
unit, container_id
|
||||
);
|
||||
}
|
||||
|
||||
if cleaned > 0 {
|
||||
info!("Cleaned {} stale Podman healthcheck timer(s)", cleaned);
|
||||
}
|
||||
}
|
||||
|
||||
async fn stale_healthcheck_units_from_systemd(live_container_ids: &HashSet<String>) -> Vec<String> {
|
||||
let mut units = Vec::new();
|
||||
for args in [
|
||||
["--user", "list-timers", "--all", "--no-legend", "--no-pager"].as_slice(),
|
||||
["--user", "list-units", "--all", "--no-legend", "--no-pager"].as_slice(),
|
||||
] {
|
||||
let output = match tokio::time::timeout(
|
||||
std::time::Duration::from_secs(20),
|
||||
tokio::process::Command::new("systemctl")
|
||||
.args(args.iter().copied())
|
||||
.output(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(Ok(output)) if output.status.success() => output,
|
||||
Ok(Ok(output)) => {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
debug!("systemctl {} failed: {}", args.join(" "), stderr.trim());
|
||||
continue;
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
debug!("Failed to run systemctl {}: {}", args.join(" "), e);
|
||||
continue;
|
||||
}
|
||||
Err(_) => {
|
||||
debug!("systemctl {} timed out", args.join(" "));
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
units.extend(stale_healthcheck_units(&stdout, live_container_ids));
|
||||
}
|
||||
units
|
||||
}
|
||||
|
||||
fn stale_healthcheck_units(output: &str, live_container_ids: &HashSet<String>) -> Vec<String> {
|
||||
output
|
||||
.lines()
|
||||
.flat_map(|line| line.split_whitespace())
|
||||
.filter_map(|token| {
|
||||
let unit = token.trim_start_matches('●');
|
||||
let id = parse_podman_healthcheck_unit(unit)?;
|
||||
(!live_container_ids.contains(id)).then(|| unit.to_string())
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Extracts the container id from a Podman healthcheck unit name.
///
/// Accepts names of the form `<id>-<suffix>.timer` or `<id>-<suffix>.service`,
/// where `<id>` is everything before the first `-` and must be exactly 64
/// ASCII hex digits (either case). Returns the id as a slice of `unit`.
///
/// Returns `None` for any other suffix, when there is no `-` after the suffix
/// is removed, or when the id part is not 64 hex digits.
fn parse_podman_healthcheck_unit(unit: &str) -> Option<&str> {
    let unit = unit
        .strip_suffix(".timer")
        .or_else(|| unit.strip_suffix(".service"))?;
    let (container_id, _suffix) = unit.split_once('-')?;
    if container_id.len() == 64 && container_id.bytes().all(|b| b.is_ascii_hexdigit()) {
        Some(container_id)
    } else {
        None
    }
}
|
||||
|
||||
async fn stop_user_unit(unit: &str) -> bool {
|
||||
run_systemctl_user(["stop", unit]).await
|
||||
}
|
||||
|
||||
async fn reset_failed_user_unit(unit: &str) -> bool {
|
||||
run_systemctl_user(["reset-failed", unit]).await
|
||||
}
|
||||
|
||||
async fn run_systemctl_user<const N: usize>(args: [&str; N]) -> bool {
|
||||
let output = match tokio::time::timeout(
|
||||
std::time::Duration::from_secs(10),
|
||||
tokio::process::Command::new("systemctl")
|
||||
.arg("--user")
|
||||
.args(args.iter().copied())
|
||||
.output(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(Ok(output)) => output,
|
||||
Ok(Err(e)) => {
|
||||
debug!("Failed to run systemctl --user {}: {}", args.join(" "), e);
|
||||
return false;
|
||||
}
|
||||
Err(_) => {
|
||||
debug!("systemctl --user {} timed out", args.join(" "));
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
if output.status.success() {
|
||||
true
|
||||
} else {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
debug!(
|
||||
"systemctl --user {} failed: {}",
|
||||
args.join(" "),
|
||||
stderr.trim()
|
||||
);
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_podman_health(c: &serde_json::Value, state: &str) -> Option<String> {
|
||||
c.get("Status")
|
||||
.and_then(|v| v.as_str())
|
||||
@@ -1173,4 +1324,47 @@ mod tests {
|
||||
Some("unhealthy")
|
||||
);
|
||||
}
|
||||
|
||||
/// `parse_podman_healthcheck_unit` accepts `<64-hex-id>-<suffix>` names with a
/// `.timer` or `.service` extension and rejects everything else.
#[test]
fn parses_podman_healthcheck_systemd_units() {
    let id = "c1f44a6369c91d65f9e9f6134a5591aa02792cff2f1a4e0f689b5a6c03b6c77c";
    assert_eq!(
        parse_podman_healthcheck_unit(&format!("{}-15c66ddfefa8a763.timer", id)),
        Some(id)
    );
    assert_eq!(
        parse_podman_healthcheck_unit(&format!("{}-15c66ddfefa8a763.service", id)),
        Some(id)
    );
    assert_eq!(parse_podman_healthcheck_unit("grafana.service"), None);
    assert_eq!(
        parse_podman_healthcheck_unit("nothexzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz-x.timer"),
        None
    );
}
|
||||
|
||||
/// Only units whose container id is absent from the live set are reported;
/// units of live containers and unrelated units are ignored.
#[test]
fn stale_healthcheck_units_filters_only_removed_container_ids() {
    let live = "6467e25fd87d791a63fe9dbf6e2fabc7bf26533aa2c402b1089effeacf7ebbba";
    let stale = "c1f44a6369c91d65f9e9f6134a5591aa02792cff2f1a4e0f689b5a6c03b6c77c";
    let mut live_ids = HashSet::new();
    live_ids.insert(live.to_string());

    let output = format!(
        " {live}-6fdc497fd3ba3b62.timer loaded active waiting\n\
         ● {stale}-15c66ddfefa8a763.service loaded failed failed\n\
         grafana.service loaded active running\n\
         {stale}-1898d85de0bb707f.timer loaded active waiting\n"
    );

    let mut units = stale_healthcheck_units(&output, &live_ids);
    units.sort();
    assert_eq!(
        units,
        vec![
            format!("{stale}-15c66ddfefa8a763.service"),
            format!("{stale}-1898d85de0bb707f.timer"),
        ]
    );
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user