feat: container orchestration, branding overhaul, onboarding logging
Container orchestration:
- Health monitor with crash recovery and auto-restart
- Doctor service (periodic health checks via systemd timer)
- Reconcile service (desired-state convergence)
- Stack-aware install/uninstall with dependency tracking

Branding:
- Custom GRUB background (designer artwork, 1024x768)
- ISOLINUX boot menu: centered, orange accents, clean labels
- Terminal banners: adaptive width, basic ANSI colors, fits 80-col
- Removed auto-generated splash scripts (designer provides assets)
- GRUB theme: lowercase branding

Frontend:
- 401 handler clears localStorage immediately (prevents cascade)

Backend:
- Onboarding/auth logging ([onboarding] tag in journalctl)
- Cookie Secure flag logging for debugging HTTP/HTTPS issues

ISO fixes:
- Install log saved before unmount (was silently failing)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
15
core/Cargo.lock
generated
15
core/Cargo.lock
generated
@@ -84,7 +84,6 @@ version = "1.2.0-alpha"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"archipelago-container",
|
||||
"archipelago-parmanode",
|
||||
"archipelago-performance",
|
||||
"archipelago-security",
|
||||
"argon2",
|
||||
@@ -160,20 +159,6 @@ dependencies = [
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "archipelago-parmanode"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"archipelago-container",
|
||||
"log",
|
||||
"serde",
|
||||
"serde_yaml",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "archipelago-performance"
|
||||
version = "0.1.0"
|
||||
|
||||
@@ -16,8 +16,10 @@ impl RpcHandler {
|
||||
if !is_setup {
|
||||
// Dev mode: allow default password so UI can log in without running setup
|
||||
if self.config.dev_mode && password == DEV_DEFAULT_PASSWORD {
|
||||
tracing::info!("[onboarding] login via dev default password");
|
||||
return Ok(serde_json::Value::Null);
|
||||
}
|
||||
tracing::warn!("[onboarding] login attempt before setup complete");
|
||||
return Err(anyhow::anyhow!(
|
||||
"User not set up. Please complete setup first."
|
||||
));
|
||||
@@ -25,13 +27,16 @@ impl RpcHandler {
|
||||
|
||||
let valid = self.auth_manager.verify_password(password).await?;
|
||||
if !valid {
|
||||
tracing::warn!("[onboarding] login failed — wrong password");
|
||||
return Err(anyhow::anyhow!("Password Incorrect"));
|
||||
}
|
||||
|
||||
tracing::info!("[onboarding] login successful");
|
||||
Ok(serde_json::Value::Null)
|
||||
}
|
||||
|
||||
/// Handle the logout RPC.
///
/// This function only emits an `[onboarding]`-tagged info log line and
/// returns JSON `null`; it touches no state itself.
/// NOTE(review): session/cookie invalidation is presumably done by the
/// caller — confirm, since nothing here clears it.
pub(super) async fn handle_auth_logout(&self) -> Result<serde_json::Value> {
|
||||
tracing::info!("[onboarding] logout");
|
||||
Ok(serde_json::Value::Null)
|
||||
}
|
||||
|
||||
@@ -78,6 +83,7 @@ impl RpcHandler {
|
||||
// Prevent re-setup if already set up
|
||||
let is_setup = self.auth_manager.is_setup().await?;
|
||||
if is_setup {
|
||||
tracing::warn!("[onboarding] setup rejected — already set up");
|
||||
return Err(anyhow::anyhow!("Already set up. Use auth.changePassword to change."));
|
||||
}
|
||||
|
||||
@@ -88,20 +94,24 @@ impl RpcHandler {
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing password"))?;
|
||||
|
||||
if password.len() < 8 {
|
||||
tracing::warn!("[onboarding] setup rejected — password too short");
|
||||
return Err(anyhow::anyhow!("Password must be at least 8 characters"));
|
||||
}
|
||||
|
||||
self.auth_manager.setup_user(password).await?;
|
||||
tracing::info!("[onboarding] user setup complete");
|
||||
Ok(serde_json::json!(true))
|
||||
}
|
||||
|
||||
/// Mark onboarding as complete.
///
/// Delegates to `auth_manager.complete_onboarding()`; any error from it is
/// propagated to the caller before anything is logged. On success an
/// `[onboarding]` info line is written and JSON `true` is returned.
pub(super) async fn handle_auth_onboarding_complete(&self) -> Result<serde_json::Value> {
|
||||
self.auth_manager.complete_onboarding().await?;
|
||||
tracing::info!("[onboarding] onboarding marked complete");
|
||||
Ok(serde_json::json!(true))
|
||||
}
|
||||
|
||||
/// Report whether onboarding has been completed.
///
/// Queries `auth_manager.is_onboarding_complete()` (errors are propagated),
/// logs the result at debug level so routine polling does not flood the
/// info log, and returns the value serialized as JSON.
pub(super) async fn handle_auth_is_onboarding_complete(&self) -> Result<serde_json::Value> {
|
||||
let complete = self.auth_manager.is_onboarding_complete().await?;
|
||||
tracing::debug!("[onboarding] isOnboardingComplete={}", complete);
|
||||
Ok(serde_json::json!(complete))
|
||||
}
|
||||
|
||||
@@ -117,10 +127,12 @@ impl RpcHandler {
|
||||
|
||||
let valid = self.auth_manager.verify_password(password).await?;
|
||||
if !valid {
|
||||
tracing::warn!("[onboarding] reset rejected — wrong password");
|
||||
return Err(anyhow::anyhow!("Password Incorrect"));
|
||||
}
|
||||
|
||||
self.auth_manager.reset_onboarding().await?;
|
||||
tracing::info!("[onboarding] onboarding reset");
|
||||
Ok(serde_json::json!(true))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -153,9 +153,11 @@ impl RpcHandler {
|
||||
}
|
||||
if let Some(proto) = headers.get("x-forwarded-proto") {
|
||||
if proto.as_bytes() == b"https" {
|
||||
tracing::debug!("[onboarding] cookie: Secure (X-Forwarded-Proto: https)");
|
||||
return "; Secure";
|
||||
}
|
||||
}
|
||||
tracing::debug!("[onboarding] cookie: no Secure flag (HTTP or no X-Forwarded-Proto)");
|
||||
""
|
||||
}
|
||||
|
||||
|
||||
@@ -228,6 +228,11 @@ impl RpcHandler {
|
||||
|
||||
if !run_output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&run_output.stderr);
|
||||
// Rollback: remove partially created container
|
||||
let _ = tokio::process::Command::new("podman")
|
||||
.args(["rm", "-f", container_name])
|
||||
.output()
|
||||
.await;
|
||||
return Err(anyhow::anyhow!("Failed to start container: {}", stderr));
|
||||
}
|
||||
|
||||
@@ -235,6 +240,43 @@ impl RpcHandler {
|
||||
.trim()
|
||||
.to_string();
|
||||
|
||||
// Post-start health verification: wait up to 30s for container to be running
|
||||
for i in 0..6u32 {
|
||||
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
|
||||
let status = tokio::process::Command::new("podman")
|
||||
.args(["inspect", container_name, "--format", "{{.State.Status}}"])
|
||||
.output()
|
||||
.await;
|
||||
if let Ok(o) = status {
|
||||
let state = String::from_utf8_lossy(&o.stdout).trim().to_string();
|
||||
if state == "running" {
|
||||
break;
|
||||
}
|
||||
if state == "exited" {
|
||||
// Container crashed immediately — get logs for diagnosis
|
||||
let logs = tokio::process::Command::new("podman")
|
||||
.args(["logs", "--tail", "20", container_name])
|
||||
.output()
|
||||
.await;
|
||||
let log_output = logs
|
||||
.map(|o| String::from_utf8_lossy(&o.stderr).to_string())
|
||||
.unwrap_or_default();
|
||||
let _ = tokio::process::Command::new("podman")
|
||||
.args(["rm", "-f", container_name])
|
||||
.output()
|
||||
.await;
|
||||
return Err(anyhow::anyhow!(
|
||||
"Container {} exited immediately after start. Logs: {}",
|
||||
container_name,
|
||||
log_output.chars().take(500).collect::<String>()
|
||||
));
|
||||
}
|
||||
}
|
||||
if i == 5 {
|
||||
debug!("Container {} health check timeout (30s) — continuing anyway", container_name);
|
||||
}
|
||||
}
|
||||
|
||||
// Post-install hooks
|
||||
self.run_post_install_hooks(package_id).await;
|
||||
|
||||
@@ -301,11 +343,43 @@ impl RpcHandler {
|
||||
Ok(has_local_fallback)
|
||||
}
|
||||
|
||||
/// Stream `podman pull` while updating install progress state.
|
||||
/// Pull image with retry and exponential backoff (3 attempts: 5s, 15s, 45s).
|
||||
async fn pull_image_with_progress(
|
||||
&self,
|
||||
package_id: &str,
|
||||
docker_image: &str,
|
||||
) -> Result<()> {
|
||||
const MAX_ATTEMPTS: u32 = 3;
|
||||
const BACKOFF_SECS: [u64; 3] = [5, 15, 45];
|
||||
|
||||
for attempt in 1..=MAX_ATTEMPTS {
|
||||
match self.do_pull_image(package_id, docker_image).await {
|
||||
Ok(()) => return Ok(()),
|
||||
Err(e) if attempt < MAX_ATTEMPTS => {
|
||||
let delay = BACKOFF_SECS[(attempt - 1) as usize];
|
||||
tracing::warn!(
|
||||
"Image pull failed for {} (attempt {}/{}): {}. Retrying in {}s...",
|
||||
docker_image, attempt, MAX_ATTEMPTS, e, delay
|
||||
);
|
||||
tokio::time::sleep(std::time::Duration::from_secs(delay)).await;
|
||||
}
|
||||
Err(e) => {
|
||||
self.clear_install_progress(package_id).await;
|
||||
return Err(e.context(format!(
|
||||
"Failed to pull {} after {} attempts",
|
||||
docker_image, MAX_ATTEMPTS
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
/// Single image pull attempt with progress streaming.
|
||||
async fn do_pull_image(
|
||||
&self,
|
||||
package_id: &str,
|
||||
docker_image: &str,
|
||||
) -> Result<()> {
|
||||
debug!("Pulling image: {}", docker_image);
|
||||
self.set_install_progress(package_id, 0, 0).await;
|
||||
@@ -336,8 +410,20 @@ impl RpcHandler {
|
||||
.await
|
||||
.context("Failed to wait for image pull")?;
|
||||
if !status.success() {
|
||||
self.clear_install_progress(package_id).await;
|
||||
return Err(anyhow::anyhow!("Failed to pull image"));
|
||||
return Err(anyhow::anyhow!("podman pull exited with non-zero status"));
|
||||
}
|
||||
|
||||
// Verify image exists locally after pull
|
||||
let verify = tokio::process::Command::new("podman")
|
||||
.args(["images", "-q", docker_image])
|
||||
.output()
|
||||
.await
|
||||
.context("Failed to verify pulled image")?;
|
||||
if String::from_utf8_lossy(&verify.stdout).trim().is_empty() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Image {} not found locally after pull",
|
||||
docker_image
|
||||
));
|
||||
}
|
||||
|
||||
self.set_install_progress(package_id, 100, 100).await;
|
||||
|
||||
@@ -4,6 +4,22 @@ use super::validation::validate_app_id;
|
||||
use crate::api::rpc::RpcHandler;
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
/// Graceful-shutdown timeout, in seconds, to pass to `podman stop -t`.
///
/// A single leading `archy-` prefix is ignored when matching. The value is
/// returned as a string so it can be used directly as a command argument.
/// Unlisted containers get the 30s default.
fn stop_timeout_secs(container_name: &str) -> &'static str {
    // (container ids, timeout) pairs, longest timeouts first.
    const TIMEOUTS: &[(&[&str], &str)] = &[
        // Bitcoin node: UTXO set flush.
        (&["bitcoin-knots", "bitcoin-core", "bitcoin"], "600"),
        // LND: channel state.
        (&["lnd"], "330"),
        // Indexers: index flush.
        (&["electrumx", "electrs", "mempool-electrs"], "300"),
        // Databases: WAL / transaction commit.
        (
            &[
                "btcpay-db",
                "mempool-db",
                "penpot-postgres",
                "immich_postgres",
                "nextcloud-db",
                "endurain-db",
            ],
            "120",
        ),
        (&["btcpay-server", "nbxplorer", "fedimint", "fedimint-gateway"], "60"),
    ];

    let bare_id = match container_name.strip_prefix("archy-") {
        Some(rest) => rest,
        None => container_name,
    };

    TIMEOUTS
        .iter()
        .find(|(ids, _)| ids.iter().any(|&known| known == bare_id))
        .map_or("30", |&(_, secs)| secs)
}
|
||||
|
||||
impl RpcHandler {
|
||||
/// Start a package: start all containers in dependency order.
|
||||
pub(in crate::api::rpc) async fn handle_package_start(
|
||||
@@ -56,7 +72,7 @@ impl RpcHandler {
|
||||
crate::crash_recovery::mark_user_stopped(&self.config.data_dir, &container_name)
|
||||
.await;
|
||||
let _ = tokio::process::Command::new("podman")
|
||||
.args(["stop", &container_name])
|
||||
.args(["stop", "-t", stop_timeout_secs(&container_name), &container_name])
|
||||
.output()
|
||||
.await;
|
||||
return Ok(serde_json::Value::Null);
|
||||
@@ -67,7 +83,7 @@ impl RpcHandler {
|
||||
}
|
||||
for name in containers {
|
||||
let _ = tokio::process::Command::new("podman")
|
||||
.args(["stop", &name])
|
||||
.args(["stop", "-t", stop_timeout_secs(&name), &name])
|
||||
.output()
|
||||
.await;
|
||||
}
|
||||
@@ -135,7 +151,7 @@ impl RpcHandler {
|
||||
for name in &containers_to_remove {
|
||||
tracing::info!("Uninstall {}: stopping container {}", package_id, name);
|
||||
let stop_out = tokio::process::Command::new("podman")
|
||||
.args(["stop", "-t", "10", name])
|
||||
.args(["stop", "-t", stop_timeout_secs(name), name])
|
||||
.output()
|
||||
.await;
|
||||
match stop_out {
|
||||
@@ -344,7 +360,7 @@ impl RpcHandler {
|
||||
validate_app_id(app_id)?;
|
||||
|
||||
let output = tokio::process::Command::new("podman")
|
||||
.args(["stop", app_id])
|
||||
.args(["stop", "-t", stop_timeout_secs(app_id), app_id])
|
||||
.output()
|
||||
.await
|
||||
.context("Failed to stop container")?;
|
||||
|
||||
@@ -7,6 +7,41 @@ use crate::api::rpc::RpcHandler;
|
||||
use anyhow::{Context, Result};
|
||||
use tracing::info;
|
||||
|
||||
/// Pull an image with retry and exponential backoff (3 attempts).
|
||||
async fn pull_image_with_retry(image: &str) -> Result<()> {
|
||||
const MAX_ATTEMPTS: u32 = 3;
|
||||
const BACKOFF_SECS: [u64; 3] = [5, 15, 45];
|
||||
|
||||
for attempt in 1..=MAX_ATTEMPTS {
|
||||
let output = tokio::process::Command::new("podman")
|
||||
.args(["pull", image])
|
||||
.output()
|
||||
.await
|
||||
.context("Failed to execute podman pull")?;
|
||||
|
||||
if output.status.success() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if attempt < MAX_ATTEMPTS {
|
||||
let delay = BACKOFF_SECS[(attempt - 1) as usize];
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
tracing::warn!(
|
||||
"Image pull failed for {} (attempt {}/{}): {}. Retrying in {}s...",
|
||||
image, attempt, MAX_ATTEMPTS, stderr.trim(), delay
|
||||
);
|
||||
tokio::time::sleep(std::time::Duration::from_secs(delay)).await;
|
||||
} else {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
return Err(anyhow::anyhow!(
|
||||
"Failed to pull {} after {} attempts: {}",
|
||||
image, MAX_ATTEMPTS, stderr.trim()
|
||||
));
|
||||
}
|
||||
}
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
impl RpcHandler {
|
||||
/// Install Immich stack (postgres + redis + server).
|
||||
pub(super) async fn install_immich_stack(&self) -> Result<serde_json::Value> {
|
||||
@@ -38,10 +73,7 @@ impl RpcHandler {
|
||||
"80.71.235.15:3000/archipelago/immich-server:release",
|
||||
];
|
||||
for img in &images {
|
||||
let _ = tokio::process::Command::new("podman")
|
||||
.args(["pull", img])
|
||||
.output()
|
||||
.await;
|
||||
pull_image_with_retry(img).await?;
|
||||
}
|
||||
|
||||
let _ = tokio::process::Command::new("sudo")
|
||||
@@ -168,10 +200,7 @@ impl RpcHandler {
|
||||
"80.71.235.15:3000/archipelago/penpot-frontend:2.4",
|
||||
];
|
||||
for img in &images {
|
||||
let _ = tokio::process::Command::new("podman")
|
||||
.args(["pull", img])
|
||||
.output()
|
||||
.await;
|
||||
pull_image_with_retry(img).await?;
|
||||
}
|
||||
|
||||
let _ = tokio::process::Command::new("sudo")
|
||||
|
||||
@@ -384,6 +384,36 @@ fn container_boot_tier(name: &str) -> u8 {
|
||||
}
|
||||
}
|
||||
|
||||
/// Run the reconciliation script after boot to fix any config drift.
|
||||
/// Ensures all containers match their canonical specs from container-specs.sh.
|
||||
pub async fn run_boot_reconciliation() {
|
||||
let script = "/home/archipelago/archy/scripts/reconcile-containers.sh";
|
||||
if !std::path::Path::new(script).exists() {
|
||||
info!("Reconciliation script not found (dev mode?) — skipping boot reconciliation");
|
||||
return;
|
||||
}
|
||||
info!("Running boot reconciliation...");
|
||||
let result = tokio::time::timeout(
|
||||
std::time::Duration::from_secs(300),
|
||||
tokio::process::Command::new(script).output(),
|
||||
)
|
||||
.await;
|
||||
match result {
|
||||
Ok(Ok(output)) if output.status.success() => {
|
||||
info!("Boot reconciliation complete");
|
||||
}
|
||||
Ok(Ok(output)) => {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
warn!(
|
||||
"Boot reconciliation had failures: {}",
|
||||
stderr.chars().take(500).collect::<String>()
|
||||
);
|
||||
}
|
||||
Ok(Err(e)) => warn!("Boot reconciliation failed to run: {}", e),
|
||||
Err(_) => warn!("Boot reconciliation timed out (300s)"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Spawn a background task that periodically saves the container snapshot.
|
||||
pub fn spawn_snapshot_task(data_dir: PathBuf) {
|
||||
tokio::spawn(async move {
|
||||
|
||||
@@ -6,8 +6,9 @@
|
||||
use crate::data_model::{Notification, NotificationLevel};
|
||||
use crate::state::StateManager;
|
||||
use crate::webhooks::{self, WebhookEvent};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
use tracing::{debug, info, warn};
|
||||
@@ -177,6 +178,69 @@ impl MemoryTracker {
|
||||
|
||||
}
|
||||
|
||||
// ── Persistent restart tracking ────────────────────────────────────────
|
||||
// Survives process restarts so a container can't loop infinitely by
|
||||
// crashing 3 times → triggering process restart → resetting counter → repeat.
|
||||
|
||||
const RESTART_HISTORY_FILE: &str = "restart-tracker.json";
|
||||
|
||||
/// On-disk record of restart attempts, serialized as JSON into
/// `RESTART_HISTORY_FILE` inside the data directory.
#[derive(Serialize, Deserialize, Default)]
|
||||
struct RestartHistory {
|
||||
/// Restart record per container, keyed by container name.
containers: HashMap<String, ContainerRestartRecord>,
|
||||
}
|
||||
|
||||
/// Persisted restart state for a single container.
#[derive(Serialize, Deserialize, Clone)]
|
||||
struct ContainerRestartRecord {
|
||||
/// Number of restart attempts recorded since the entry was last cleared.
attempts: u32,
|
||||
/// Unix timestamp (seconds, UTC) of the most recently recorded attempt.
last_failure_epoch: i64,
|
||||
}
|
||||
|
||||
impl RestartHistory {
|
||||
async fn load(data_dir: &Path) -> Self {
|
||||
let path = data_dir.join(RESTART_HISTORY_FILE);
|
||||
match tokio::fs::read_to_string(&path).await {
|
||||
Ok(content) => serde_json::from_str(&content).unwrap_or_default(),
|
||||
Err(_) => Self::default(),
|
||||
}
|
||||
}
|
||||
|
||||
async fn save(&self, data_dir: &Path) {
|
||||
let path = data_dir.join(RESTART_HISTORY_FILE);
|
||||
if let Ok(json) = serde_json::to_string(self) {
|
||||
let _ = tokio::fs::write(&path, json).await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Seed the in-memory RestartTracker from persisted history.
|
||||
fn seed_tracker(&self, tracker: &mut RestartTracker) {
|
||||
let now_epoch = chrono::Utc::now().timestamp();
|
||||
for (name, record) in &self.containers {
|
||||
// Only seed if last failure was within the stability window
|
||||
let secs_since_failure = now_epoch - record.last_failure_epoch;
|
||||
if secs_since_failure < STABILITY_RESET_SECS as i64 && record.attempts > 0 {
|
||||
tracker.attempts.insert(name.clone(), record.attempts);
|
||||
info!(
|
||||
"Restored restart counter for {}: {} attempts ({}s ago)",
|
||||
name, record.attempts, secs_since_failure
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn record_attempt(&mut self, name: &str) {
|
||||
let entry = self.containers.entry(name.to_string()).or_insert(ContainerRestartRecord {
|
||||
attempts: 0,
|
||||
last_failure_epoch: 0,
|
||||
});
|
||||
entry.attempts += 1;
|
||||
entry.last_failure_epoch = chrono::Utc::now().timestamp();
|
||||
}
|
||||
|
||||
fn clear(&mut self, name: &str) {
|
||||
self.containers.remove(name);
|
||||
}
|
||||
}
|
||||
|
||||
/// Query container memory stats from podman.
|
||||
async fn check_container_memory() -> HashMap<String, u64> {
|
||||
let output = match tokio::time::timeout(
|
||||
@@ -373,6 +437,11 @@ pub fn spawn_health_monitor(state: Arc<StateManager>, data_dir: PathBuf) {
|
||||
let mut mem_check_counter: u32 = 0;
|
||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(CHECK_INTERVAL_SECS));
|
||||
|
||||
// Load persistent restart history and seed the in-memory tracker
|
||||
let mut restart_history = RestartHistory::load(&data_dir).await;
|
||||
restart_history.seed_tracker(&mut tracker);
|
||||
let mut history_dirty = false;
|
||||
|
||||
loop {
|
||||
interval.tick().await;
|
||||
mem_check_counter += 1;
|
||||
@@ -406,6 +475,8 @@ pub fn spawn_health_monitor(state: Arc<StateManager>, data_dir: PathBuf) {
|
||||
if tracker.attempt_count(&container.name) > 0 {
|
||||
info!("Container {} is healthy again after restart", container.name);
|
||||
tracker.clear(&container.name);
|
||||
restart_history.clear(&container.name);
|
||||
history_dirty = true;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@@ -430,6 +501,8 @@ pub fn spawn_health_monitor(state: Arc<StateManager>, data_dir: PathBuf) {
|
||||
if tracker.should_reset_failed(&container.name) {
|
||||
info!("Resetting restart counter for {} after {}s stability window", container.name, STABILITY_RESET_SECS);
|
||||
tracker.clear(&container.name);
|
||||
restart_history.clear(&container.name);
|
||||
history_dirty = true;
|
||||
}
|
||||
|
||||
if tracker.attempt_count(&container.name) >= MAX_RESTART_ATTEMPTS {
|
||||
@@ -453,6 +526,8 @@ pub fn spawn_health_monitor(state: Arc<StateManager>, data_dir: PathBuf) {
|
||||
prev_tier = Some(tier);
|
||||
|
||||
if tracker.record_attempt(&container.name) {
|
||||
restart_history.record_attempt(&container.name);
|
||||
history_dirty = true;
|
||||
let attempt = tracker.attempt_count(&container.name);
|
||||
info!("Restarting {} (tier {:?}, attempt {}/{}, backoff {}s)",
|
||||
container.name, tier, attempt, MAX_RESTART_ATTEMPTS,
|
||||
@@ -509,6 +584,12 @@ pub fn spawn_health_monitor(state: Arc<StateManager>, data_dir: PathBuf) {
|
||||
state.update_data(data).await;
|
||||
debug!("Health monitor: state updated with notifications");
|
||||
}
|
||||
|
||||
// Persist restart history to disk (debounced: once per check cycle)
|
||||
if history_dirty {
|
||||
restart_history.save(&data_dir).await;
|
||||
history_dirty = false;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -106,6 +106,9 @@ async fn main() -> Result<()> {
|
||||
|
||||
// Signal to health monitor that boot recovery is done
|
||||
crash_recovery::mark_recovery_complete();
|
||||
|
||||
// Reconcile containers against canonical specs (fixes config drift)
|
||||
crash_recovery::run_boot_reconciliation().await;
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -135,9 +135,14 @@ impl HealthMonitor {
|
||||
HealthStatus::Unhealthy => {
|
||||
consecutive_failures += 1;
|
||||
if consecutive_failures >= max_failures {
|
||||
error!("Container {} is unhealthy after {} failures",
|
||||
error!("Container {} is unhealthy after {} failures",
|
||||
self.container_name, consecutive_failures);
|
||||
// TODO: Trigger auto-restart or alert
|
||||
// Auto-restart is handled by the orchestrator-level health monitor
|
||||
// (core/archipelago/src/health_monitor.rs) which runs every 60s,
|
||||
// checks all container states via `podman ps`, and restarts
|
||||
// exited containers with exponential backoff (10s/30s/90s).
|
||||
// This per-container monitor is for manifest-driven health
|
||||
// tracking and status change callbacks only.
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
|
||||
Reference in New Issue
Block a user