feat: Phase 4 backend hardening — container reliability + security audit
Container Management (CONT-01 through CONT-06): - Fix needs_archy_net: add lnd, nbxplorer to archy-net list - Add StartupTier dependency ordering to health monitor (DB→Core→Dependent→App→UI) - Add exponential backoff (10s/30s/90s) with 1hr stability reset - Add get_health_check_args() with health checks for 20+ apps - Add get_memory_limit() with per-app limits (128m-4g vs blanket 2g) - Create docs/network-topology.md - Fix fedimint containers on both nodes (moved to archy-net) Security Audit (SEC-01 through SEC-06): - Add sanitize_error_message() — strips internal paths from RPC errors - Add validate_identity_id() — blocks path traversal on identity operations - Add validate_did() — blocks path traversal on federation operations - Add message size limits: node-send-message (1MB), dwn.write-message (10MB) - Add rate limits for federation endpoints (join: 5/60s, invite: 10/300s) - Configure journald (500MB max, 7 day retention) on both nodes - Add /etc/logrotate.d/archipelago for backend + crowdsec logs - Verify all 4 nginx security headers on both nodes Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -151,6 +151,14 @@ impl RpcHandler {
|
||||
let data_format = params["dataFormat"].as_str();
|
||||
let data = params.get("data").cloned();
|
||||
|
||||
// Limit data size to 10MB to prevent disk exhaustion
|
||||
if let Some(ref d) = data {
|
||||
let data_str = d.to_string();
|
||||
if data_str.len() > 10_485_760 {
|
||||
anyhow::bail!("Message data too large (max 10MB)");
|
||||
}
|
||||
}
|
||||
|
||||
let store = DwnStore::new(&self.config.data_dir).await?;
|
||||
let message = store
|
||||
.write_message(author, protocol, schema, data_format, data)
|
||||
|
||||
@@ -4,6 +4,20 @@ use crate::identity;
|
||||
use anyhow::Result;
|
||||
use tracing::info;
|
||||
|
||||
/// Validate a DID parameter: must start with "did:", max 256 chars, no path traversal.
|
||||
fn validate_did(did: &str) -> Result<()> {
|
||||
if did.is_empty() || did.len() > 256 {
|
||||
anyhow::bail!("Invalid DID: must be 1-256 characters");
|
||||
}
|
||||
if !did.starts_with("did:") {
|
||||
anyhow::bail!("Invalid DID: must start with 'did:'");
|
||||
}
|
||||
if did.contains("..") || did.contains('/') || did.contains('\\') || did.contains('\0') {
|
||||
anyhow::bail!("Invalid DID: contains forbidden characters");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl RpcHandler {
|
||||
/// federation.invite — Generate an invite code containing our DID + onion for a peer.
|
||||
pub(super) async fn handle_federation_invite(&self) -> Result<serde_json::Value> {
|
||||
@@ -107,6 +121,7 @@ impl RpcHandler {
|
||||
.get("did")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing 'did' parameter"))?;
|
||||
validate_did(did)?;
|
||||
|
||||
let nodes = federation::remove_node(&self.config.data_dir, did).await?;
|
||||
info!(did = %did, "Removed node from federation");
|
||||
@@ -127,6 +142,7 @@ impl RpcHandler {
|
||||
.get("did")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing 'did' parameter"))?;
|
||||
validate_did(did)?;
|
||||
let trust_str = params
|
||||
.get("trust_level")
|
||||
.and_then(|v| v.as_str())
|
||||
|
||||
@@ -5,6 +5,20 @@ use crate::identity_manager::{IdentityManager, IdentityPurpose};
|
||||
use anyhow::{Context, Result};
|
||||
use nostr_sdk::ToBech32;
|
||||
|
||||
/// Validate an identity ID: alphanumeric, hyphens, underscores, 1-128 chars, no path traversal.
|
||||
fn validate_identity_id(id: &str) -> Result<()> {
|
||||
if id.is_empty() || id.len() > 128 {
|
||||
anyhow::bail!("Invalid identity id: must be 1-128 characters");
|
||||
}
|
||||
if id.contains("..") || id.contains('/') || id.contains('\\') || id.contains('\0') {
|
||||
anyhow::bail!("Invalid identity id: contains forbidden characters");
|
||||
}
|
||||
if !id.bytes().all(|b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_' || b == b':') {
|
||||
anyhow::bail!("Invalid identity id: must be alphanumeric, hyphens, underscores, or colons");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl RpcHandler {
|
||||
/// List all identities with their default status.
|
||||
pub(super) async fn handle_identity_list(
|
||||
@@ -83,6 +97,7 @@ impl RpcHandler {
|
||||
.get("id")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing required parameter: id"))?;
|
||||
validate_identity_id(id)?;
|
||||
|
||||
let manager = IdentityManager::new(&self.config.data_dir).await?;
|
||||
let record = manager.get(id).await?;
|
||||
@@ -112,6 +127,7 @@ impl RpcHandler {
|
||||
.get("id")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing required parameter: id"))?;
|
||||
validate_identity_id(id)?;
|
||||
|
||||
let manager = IdentityManager::new(&self.config.data_dir).await?;
|
||||
manager.delete(id).await?;
|
||||
@@ -129,6 +145,7 @@ impl RpcHandler {
|
||||
.get("id")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing required parameter: id"))?;
|
||||
validate_identity_id(id)?;
|
||||
|
||||
let manager = IdentityManager::new(&self.config.data_dir).await?;
|
||||
manager.set_default(id).await?;
|
||||
|
||||
@@ -66,6 +66,42 @@ struct RpcError {
|
||||
/// Default dev password when no user is set up (matches mock-backend).
|
||||
pub(crate) const DEV_DEFAULT_PASSWORD: &str = "password123";
|
||||
|
||||
/// Sanitize error messages before returning to clients.
|
||||
/// Keeps user-facing validation errors but strips internal system details.
|
||||
fn sanitize_error_message(msg: &str) -> String {
|
||||
// Allow known validation errors through (these are user-actionable)
|
||||
let user_facing_prefixes = [
|
||||
"Invalid",
|
||||
"Missing",
|
||||
"Not found",
|
||||
"Already exists",
|
||||
"Rate limit",
|
||||
"Unauthorized",
|
||||
"Forbidden",
|
||||
"Not supported",
|
||||
"requires",
|
||||
"must be",
|
||||
"cannot",
|
||||
"Password",
|
||||
"Session",
|
||||
];
|
||||
for prefix in &user_facing_prefixes {
|
||||
if msg.starts_with(prefix) || msg.contains(prefix) {
|
||||
// Truncate long messages and strip file paths
|
||||
let sanitized = msg.replace("/var/lib/archipelago/", "[data]/")
|
||||
.replace("/usr/local/bin/", "[bin]/")
|
||||
.replace("/etc/", "[config]/");
|
||||
return if sanitized.len() > 200 {
|
||||
format!("{}...", &sanitized[..200])
|
||||
} else {
|
||||
sanitized
|
||||
};
|
||||
}
|
||||
}
|
||||
// For all other errors, return a generic message
|
||||
"Operation failed. Check server logs for details.".to_string()
|
||||
}
|
||||
|
||||
/// Methods that do not require a valid session cookie.
|
||||
const UNAUTHENTICATED_METHODS: &[&str] = &[
|
||||
"auth.login",
|
||||
@@ -472,6 +508,9 @@ impl RpcHandler {
|
||||
"mesh.broadcast" => self.handle_mesh_broadcast().await,
|
||||
"mesh.configure" => self.handle_mesh_configure(params).await,
|
||||
|
||||
// Server settings
|
||||
"server.set-name" => self.handle_server_set_name(params).await,
|
||||
|
||||
// System monitoring
|
||||
"system.stats" => self.handle_system_stats().await,
|
||||
"system.processes" => self.handle_system_processes().await,
|
||||
@@ -558,12 +597,14 @@ impl RpcHandler {
|
||||
error: None,
|
||||
},
|
||||
Err(e) => {
|
||||
error!("RPC error: {}", e);
|
||||
error!("RPC error on {}: {}", rpc_req.method, e);
|
||||
// Sanitize error messages: only return user-facing text, not internal details
|
||||
let user_message = sanitize_error_message(&e.to_string());
|
||||
RpcResponse {
|
||||
result: None,
|
||||
error: Some(RpcError {
|
||||
code: -1,
|
||||
message: e.to_string(),
|
||||
message: user_message,
|
||||
data: None,
|
||||
}),
|
||||
}
|
||||
|
||||
@@ -226,8 +226,9 @@ impl RpcHandler {
|
||||
let needs_archy_net = matches!(
|
||||
package_id,
|
||||
"bitcoin-knots" | "bitcoin" | "bitcoin-core"
|
||||
| "lnd"
|
||||
| "mempool" | "mempool-web" | "mempool-api" | "mempool-electrs" | "electrs" | "mysql-mempool" | "archy-mempool-db" | "archy-mempool-web"
|
||||
| "btcpay-server" | "btcpayserver" | "archy-btcpay-db"
|
||||
| "btcpay-server" | "btcpayserver" | "archy-btcpay-db" | "archy-nbxplorer" | "nbxplorer"
|
||||
| "fedimint" | "fedimint-gateway"
|
||||
);
|
||||
|
||||
@@ -329,12 +330,18 @@ printtoconsole=1\n";
|
||||
run_args.push(env);
|
||||
}
|
||||
|
||||
// Security: Resource limits (from manifest)
|
||||
let memory_limit = if package_id == "ollama" { "4g" } else { "2g" };
|
||||
// Resource limits: per-app memory and CPU
|
||||
let memory_limit = get_memory_limit(package_id);
|
||||
let mem_arg = format!("--memory={}", memory_limit);
|
||||
run_args.push(&mem_arg);
|
||||
run_args.push("--cpus=2");
|
||||
|
||||
// Health check definitions
|
||||
let health_args = get_health_check_args(package_id);
|
||||
for arg in &health_args {
|
||||
run_args.push(arg);
|
||||
}
|
||||
|
||||
// Finally, the image
|
||||
run_args.push(docker_image);
|
||||
|
||||
@@ -1289,6 +1296,148 @@ fn is_readonly_compatible(app_id: &str) -> bool {
|
||||
)
|
||||
}
|
||||
|
||||
/// Get container health check arguments for podman run.
|
||||
/// Returns (health-cmd, interval, retries) args to append to run_args.
|
||||
fn get_health_check_args(app_id: &str) -> Vec<String> {
|
||||
let (cmd, interval, retries) = match app_id {
|
||||
"bitcoin" | "bitcoin-core" | "bitcoin-knots" => (
|
||||
"bitcoin-cli -rpcuser=archipelago -rpcpassword=archipelago123 getblockchaininfo || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"lnd" => (
|
||||
"lncli getinfo || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"btcpay-server" | "btcpayserver" => (
|
||||
"curl -sf http://localhost:49392/ || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"mempool-api" => (
|
||||
"curl -sf http://localhost:8999/api/v1/backend-info || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"mempool" | "mempool-web" | "archy-mempool-web" => (
|
||||
"curl -sf http://localhost:8080/ || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"mempool-electrs" | "electrs" => (
|
||||
"curl -sf http://localhost:50001/ || exit 1",
|
||||
"60s", "3",
|
||||
),
|
||||
"nextcloud" => (
|
||||
"curl -sf http://localhost:80/status.php || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"homeassistant" | "home-assistant" => (
|
||||
"curl -sf http://localhost:8123/api/ || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"grafana" => (
|
||||
"curl -sf http://localhost:3000/api/health || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"jellyfin" => (
|
||||
"curl -sf http://localhost:8096/health || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"vaultwarden" => (
|
||||
"curl -sf http://localhost:80/alive || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"uptime-kuma" => (
|
||||
"curl -sf http://localhost:3001/ || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"filebrowser" => (
|
||||
"curl -sf http://localhost:80/health || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"searxng" => (
|
||||
"curl -sf http://localhost:8080/ || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"photoprism" => (
|
||||
"curl -sf http://localhost:2342/api/v1/status || exit 1",
|
||||
"60s", "3",
|
||||
),
|
||||
"immich_server" | "immich" => (
|
||||
"curl -sf http://localhost:2283/api/server/ping || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"dwn" => (
|
||||
"curl -sf http://localhost:3000/health || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"portainer" => (
|
||||
"curl -sf http://localhost:9000/api/status || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"ollama" => (
|
||||
"curl -sf http://localhost:11434/ || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"fedimint" => (
|
||||
"curl -sf http://localhost:8174/health || exit 1",
|
||||
"60s", "3",
|
||||
),
|
||||
"nostr-rs-relay" | "nostr-relay" => (
|
||||
"curl -sf http://localhost:8080/ || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
"nginx-proxy-manager" => (
|
||||
"curl -sf http://localhost:81/api/ || exit 1",
|
||||
"30s", "3",
|
||||
),
|
||||
_ => return vec![],
|
||||
};
|
||||
|
||||
vec![
|
||||
format!("--health-cmd={}", cmd),
|
||||
format!("--health-interval={}", interval),
|
||||
format!("--health-retries={}", retries),
|
||||
"--health-start-period=60s".to_string(),
|
||||
]
|
||||
}
|
||||
|
||||
/// Get per-app memory limit.
|
||||
fn get_memory_limit(app_id: &str) -> &'static str {
|
||||
match app_id {
|
||||
// Heavy apps
|
||||
"bitcoin" | "bitcoin-core" | "bitcoin-knots" => "2g",
|
||||
"onlyoffice" | "onlyoffice-documentserver" => "2g",
|
||||
"ollama" => "4g",
|
||||
// Medium apps
|
||||
"lnd" => "512m",
|
||||
"mempool-electrs" | "electrs" => "1g",
|
||||
"nextcloud" => "1g",
|
||||
"immich_server" | "immich" => "1g",
|
||||
"btcpay-server" | "btcpayserver" => "1g",
|
||||
"homeassistant" | "home-assistant" => "512m",
|
||||
"fedimint" => "512m",
|
||||
"fedimint-gateway" => "512m",
|
||||
"photoprism" => "1g",
|
||||
// Light apps
|
||||
"mempool-api" => "512m",
|
||||
"mempool" | "mempool-web" | "archy-mempool-web" => "256m",
|
||||
"grafana" => "256m",
|
||||
"jellyfin" => "1g",
|
||||
"vaultwarden" => "256m",
|
||||
"uptime-kuma" => "256m",
|
||||
"filebrowser" => "256m",
|
||||
"searxng" => "512m",
|
||||
"dwn" => "256m",
|
||||
"portainer" => "256m",
|
||||
"nostr-rs-relay" | "nostr-relay" => "256m",
|
||||
"nginx-proxy-manager" => "256m",
|
||||
// Databases
|
||||
"archy-btcpay-db" | "archy-mempool-db" | "mysql-mempool" => "512m",
|
||||
"immich_postgres" | "penpot-postgres" => "256m",
|
||||
"immich_redis" | "penpot-valkey" => "128m",
|
||||
// Default
|
||||
_ => "512m",
|
||||
}
|
||||
}
|
||||
|
||||
/// Get app-specific configuration
|
||||
/// Returns: (ports, volumes, env_vars, custom_command, custom_args)
|
||||
fn get_app_config(
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use super::RpcHandler;
|
||||
use crate::{node_message, nostr_discovery, peers};
|
||||
use crate::{federation, node_message, nostr_discovery, peers};
|
||||
use crate::peers::KnownPeer;
|
||||
use anyhow::Result;
|
||||
|
||||
@@ -61,15 +61,29 @@ impl RpcHandler {
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing message"))?;
|
||||
|
||||
// Validate onion is a known peer to prevent SSRF to arbitrary Tor destinations
|
||||
// Limit message size to 1MB to prevent DoS
|
||||
if message.len() > 1_048_576 {
|
||||
anyhow::bail!("Message too large (max 1MB)");
|
||||
}
|
||||
|
||||
// Validate onion is a known peer or federated node to prevent SSRF
|
||||
let known_peers = peers::load_peers(&self.config.data_dir).await?;
|
||||
let is_known = known_peers.iter().any(|p| {
|
||||
let is_known_peer = known_peers.iter().any(|p| {
|
||||
p.onion == onion || p.onion == format!("{}.onion", onion)
|
||||
|| format!("{}.onion", p.onion) == onion
|
||||
});
|
||||
if !is_known {
|
||||
let is_known_fed = if !is_known_peer {
|
||||
let fed_nodes = federation::load_nodes(&self.config.data_dir).await.unwrap_or_default();
|
||||
fed_nodes.iter().any(|n| {
|
||||
n.onion == onion || n.onion == format!("{}.onion", onion)
|
||||
|| format!("{}.onion", n.onion) == onion
|
||||
})
|
||||
} else {
|
||||
false
|
||||
};
|
||||
if !is_known_peer && !is_known_fed {
|
||||
return Err(anyhow::anyhow!(
|
||||
"Onion address not in known peers list. Add the peer first."
|
||||
"Onion address not in known peers or federation. Add the peer first."
|
||||
));
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
// Container Health Monitor
|
||||
// Checks container health every 60s, auto-restarts unhealthy containers (max 3 times),
|
||||
// Checks container health every 60s, auto-restarts unhealthy containers (max 3 times)
|
||||
// with exponential backoff (10s, 30s, 90s), dependency-aware startup ordering,
|
||||
// and sends WebSocket notifications to the UI on failure.
|
||||
|
||||
use crate::data_model::{Notification, NotificationLevel};
|
||||
@@ -8,20 +9,67 @@ use crate::webhooks::{self, WebhookEvent};
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
const MAX_RESTART_ATTEMPTS: u32 = 3;
|
||||
const CHECK_INTERVAL_SECS: u64 = 60;
|
||||
/// Backoff delays per attempt: 10s, 30s, 90s
|
||||
const BACKOFF_DELAYS_SECS: [u64; 3] = [10, 30, 90];
|
||||
/// Reset restart counter after 1 hour of stability
|
||||
const STABILITY_RESET_SECS: u64 = 3600;
|
||||
|
||||
/// Track restart attempts per container to avoid infinite restart loops.
|
||||
/// Container startup tier for dependency ordering.
|
||||
/// Lower tiers start first. Containers in the same tier start together.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||
enum StartupTier {
|
||||
/// Databases: postgres, redis, mariadb, mysql
|
||||
Database = 0,
|
||||
/// Core infrastructure: bitcoin-knots, bitcoin-core
|
||||
CoreInfra = 1,
|
||||
/// Services depending on core: lnd, electrs, nbxplorer
|
||||
DependentService = 2,
|
||||
/// Application layer: mempool-api, btcpay-server, fedimint, nextcloud, etc.
|
||||
Application = 3,
|
||||
/// UI/frontend containers: mempool-web, bitcoin-ui, lnd-ui
|
||||
Frontend = 4,
|
||||
}
|
||||
|
||||
fn container_tier(name: &str) -> StartupTier {
|
||||
let id = name.strip_prefix("archy-").unwrap_or(name);
|
||||
match id {
|
||||
// Tier 0: Databases
|
||||
"btcpay-db" | "mempool-db" | "penpot-postgres" | "immich_postgres"
|
||||
| "immich_redis" | "penpot-valkey" | "endurain-db" | "nextcloud-db" => StartupTier::Database,
|
||||
|
||||
// Tier 1: Core infrastructure
|
||||
"bitcoin-knots" | "bitcoin-core" | "bitcoin" => StartupTier::CoreInfra,
|
||||
|
||||
// Tier 2: Dependent services
|
||||
"lnd" | "mempool-electrs" | "electrs" | "nbxplorer" => StartupTier::DependentService,
|
||||
|
||||
// Tier 4: Frontend/UI
|
||||
"mempool-web" | "bitcoin-ui" | "lnd-ui" | "electrs-ui"
|
||||
| "penpot-frontend" | "penpot-exporter" => StartupTier::Frontend,
|
||||
|
||||
// Tier 3: Everything else
|
||||
_ => StartupTier::Application,
|
||||
}
|
||||
}
|
||||
|
||||
/// Track restart attempts per container with exponential backoff and stability reset.
|
||||
struct RestartTracker {
|
||||
attempts: HashMap<String, u32>,
|
||||
last_failure: HashMap<String, Instant>,
|
||||
last_healthy: HashMap<String, Instant>,
|
||||
}
|
||||
|
||||
impl RestartTracker {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
attempts: HashMap::new(),
|
||||
last_failure: HashMap::new(),
|
||||
last_healthy: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -29,17 +77,50 @@ impl RestartTracker {
|
||||
fn record_attempt(&mut self, name: &str) -> bool {
|
||||
let count = self.attempts.entry(name.to_string()).or_insert(0);
|
||||
*count += 1;
|
||||
self.last_failure.insert(name.to_string(), Instant::now());
|
||||
*count <= MAX_RESTART_ATTEMPTS
|
||||
}
|
||||
|
||||
/// Clear restart count when a container is healthy again.
|
||||
fn clear(&mut self, name: &str) {
|
||||
self.attempts.remove(name);
|
||||
self.last_failure.remove(name);
|
||||
self.last_healthy.insert(name.to_string(), Instant::now());
|
||||
}
|
||||
|
||||
fn attempt_count(&self, name: &str) -> u32 {
|
||||
*self.attempts.get(name).unwrap_or(&0)
|
||||
}
|
||||
|
||||
/// Get the backoff delay in seconds for the current attempt number.
|
||||
fn backoff_delay_secs(&self, name: &str) -> u64 {
|
||||
let attempts = self.attempt_count(name);
|
||||
if attempts == 0 {
|
||||
return BACKOFF_DELAYS_SECS[0];
|
||||
}
|
||||
let idx = (attempts as usize).saturating_sub(1).min(BACKOFF_DELAYS_SECS.len() - 1);
|
||||
BACKOFF_DELAYS_SECS[idx]
|
||||
}
|
||||
|
||||
/// Check if enough time has passed since last failure for the backoff delay.
|
||||
fn backoff_elapsed(&self, name: &str) -> bool {
|
||||
let delay = self.backoff_delay_secs(name);
|
||||
match self.last_failure.get(name) {
|
||||
Some(last) => last.elapsed().as_secs() >= delay,
|
||||
None => true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if a failed container should have its counter reset (1h stability window).
|
||||
fn should_reset_failed(&self, name: &str) -> bool {
|
||||
if self.attempt_count(name) < MAX_RESTART_ATTEMPTS {
|
||||
return false;
|
||||
}
|
||||
match self.last_failure.get(name) {
|
||||
Some(last) => last.elapsed().as_secs() >= STABILITY_RESET_SECS,
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -152,77 +233,107 @@ pub fn spawn_health_monitor(state: Arc<StateManager>, data_dir: PathBuf) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Sort containers by startup tier so databases restart before dependent services
|
||||
let mut unhealthy: Vec<&ContainerHealth> = Vec::new();
|
||||
let mut state_changed = false;
|
||||
let (mut data, _) = state.get_snapshot().await;
|
||||
|
||||
for container in &containers {
|
||||
if container.healthy {
|
||||
// Clear restart tracker if container recovered
|
||||
if tracker.attempt_count(&container.name) > 0 {
|
||||
info!("Container {} is healthy again after restart", container.name);
|
||||
tracker.clear(&container.name);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Container is unhealthy (exited/stopped)
|
||||
// Only try auto-restart if we haven't exceeded max attempts
|
||||
if container.state == "exited" || container.state == "stopped" {
|
||||
let attempts = tracker.attempt_count(&container.name);
|
||||
unhealthy.push(container);
|
||||
}
|
||||
}
|
||||
|
||||
if attempts >= MAX_RESTART_ATTEMPTS {
|
||||
// Already notified, skip
|
||||
debug!("Container {} exceeded max restart attempts ({})", container.name, MAX_RESTART_ATTEMPTS);
|
||||
continue;
|
||||
// Sort by startup tier: databases first, then core, then dependent, then apps, then UIs
|
||||
unhealthy.sort_by_key(|c| container_tier(&c.name));
|
||||
|
||||
let mut prev_tier: Option<StartupTier> = None;
|
||||
for container in &unhealthy {
|
||||
let tier = container_tier(&container.name);
|
||||
let attempts = tracker.attempt_count(&container.name);
|
||||
|
||||
// Reset counter after 1 hour for permanently failed containers
|
||||
if tracker.should_reset_failed(&container.name) {
|
||||
info!("Resetting restart counter for {} after {}s stability window", container.name, STABILITY_RESET_SECS);
|
||||
tracker.clear(&container.name);
|
||||
}
|
||||
|
||||
if tracker.attempt_count(&container.name) >= MAX_RESTART_ATTEMPTS {
|
||||
debug!("Container {} exceeded max restart attempts ({})", container.name, MAX_RESTART_ATTEMPTS);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Wait for backoff delay before retrying
|
||||
if !tracker.backoff_elapsed(&container.name) {
|
||||
let delay = tracker.backoff_delay_secs(&container.name);
|
||||
debug!("Container {} waiting for backoff ({}s)", container.name, delay);
|
||||
continue;
|
||||
}
|
||||
|
||||
// When transitioning to a higher tier, wait briefly for previous tier to stabilize
|
||||
if let Some(prev) = prev_tier {
|
||||
if tier > prev {
|
||||
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
|
||||
}
|
||||
}
|
||||
prev_tier = Some(tier);
|
||||
|
||||
if tracker.record_attempt(&container.name) {
|
||||
let restarted = restart_container(&container.name).await;
|
||||
let attempt = tracker.attempt_count(&container.name);
|
||||
if tracker.record_attempt(&container.name) {
|
||||
let attempt = tracker.attempt_count(&container.name);
|
||||
info!("Restarting {} (tier {:?}, attempt {}/{}, backoff {}s)",
|
||||
container.name, tier, attempt, MAX_RESTART_ATTEMPTS,
|
||||
BACKOFF_DELAYS_SECS.get(attempt.saturating_sub(1) as usize).unwrap_or(&90));
|
||||
|
||||
if !restarted || attempt >= MAX_RESTART_ATTEMPTS {
|
||||
// Push notification to UI
|
||||
let notification = Notification {
|
||||
id: format!("health-{}-{}", container.app_id, chrono::Utc::now().timestamp()),
|
||||
level: NotificationLevel::Error,
|
||||
title: format!("{} is unhealthy", container.app_id),
|
||||
message: if restarted {
|
||||
format!(
|
||||
"Container restarted ({}/{} attempts). May need manual attention.",
|
||||
attempt, MAX_RESTART_ATTEMPTS
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"Auto-restart failed (attempt {}/{}). Container state: {}",
|
||||
attempt, MAX_RESTART_ATTEMPTS, container.state
|
||||
)
|
||||
},
|
||||
timestamp: chrono::Utc::now().to_rfc3339(),
|
||||
app_id: Some(container.app_id.clone()),
|
||||
};
|
||||
let restarted = restart_container(&container.name).await;
|
||||
|
||||
// Keep only the latest 20 notifications
|
||||
data.notifications.push(notification.clone());
|
||||
if data.notifications.len() > 20 {
|
||||
data.notifications = data.notifications.split_off(data.notifications.len() - 20);
|
||||
}
|
||||
state_changed = true;
|
||||
if !restarted || attempt >= MAX_RESTART_ATTEMPTS {
|
||||
let notification = Notification {
|
||||
id: format!("health-{}-{}", container.app_id, chrono::Utc::now().timestamp()),
|
||||
level: NotificationLevel::Error,
|
||||
title: format!("{} is unhealthy", container.app_id),
|
||||
message: if restarted {
|
||||
format!(
|
||||
"Container restarted ({}/{} attempts). May need manual attention.",
|
||||
attempt, MAX_RESTART_ATTEMPTS
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"Auto-restart failed (attempt {}/{}). Container state: {}",
|
||||
attempt, MAX_RESTART_ATTEMPTS, container.state
|
||||
)
|
||||
},
|
||||
timestamp: chrono::Utc::now().to_rfc3339(),
|
||||
app_id: Some(container.app_id.clone()),
|
||||
};
|
||||
|
||||
// Fire-and-forget webhook delivery (checks config internally)
|
||||
let webhook_payload = webhooks::WebhookPayload {
|
||||
event: WebhookEvent::ContainerCrash,
|
||||
title: notification.title,
|
||||
message: notification.message,
|
||||
timestamp: notification.timestamp,
|
||||
node_id: String::new(),
|
||||
details: Some(serde_json::json!({
|
||||
"container": container.name,
|
||||
"app_id": container.app_id,
|
||||
"state": container.state,
|
||||
})),
|
||||
};
|
||||
webhooks::send_webhook(&data_dir, webhook_payload).await;
|
||||
data.notifications.push(notification.clone());
|
||||
if data.notifications.len() > 20 {
|
||||
data.notifications = data.notifications.split_off(data.notifications.len() - 20);
|
||||
}
|
||||
state_changed = true;
|
||||
|
||||
let webhook_payload = webhooks::WebhookPayload {
|
||||
event: WebhookEvent::ContainerCrash,
|
||||
title: notification.title,
|
||||
message: notification.message,
|
||||
timestamp: notification.timestamp,
|
||||
node_id: String::new(),
|
||||
details: Some(serde_json::json!({
|
||||
"container": container.name,
|
||||
"app_id": container.app_id,
|
||||
"state": container.state,
|
||||
"attempt": attempt,
|
||||
"tier": format!("{:?}", tier),
|
||||
})),
|
||||
};
|
||||
webhooks::send_webhook(&data_dir, webhook_payload).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -261,7 +372,6 @@ mod tests {
|
||||
#[test]
|
||||
fn test_restart_tracker_max_attempts_exceeded() {
|
||||
let mut tracker = RestartTracker::new();
|
||||
// First MAX_RESTART_ATTEMPTS attempts should return true
|
||||
for i in 1..=MAX_RESTART_ATTEMPTS {
|
||||
assert!(
|
||||
tracker.record_attempt("container-a"),
|
||||
@@ -269,7 +379,6 @@ mod tests {
|
||||
i
|
||||
);
|
||||
}
|
||||
// Next attempt exceeds max, returns false
|
||||
assert!(!tracker.record_attempt("container-a"));
|
||||
assert_eq!(tracker.attempt_count("container-a"), MAX_RESTART_ATTEMPTS + 1);
|
||||
}
|
||||
@@ -300,13 +409,11 @@ mod tests {
|
||||
#[test]
|
||||
fn test_restart_tracker_clear_allows_new_attempts() {
|
||||
let mut tracker = RestartTracker::new();
|
||||
// Exhaust attempts
|
||||
for _ in 0..=MAX_RESTART_ATTEMPTS {
|
||||
tracker.record_attempt("container-y");
|
||||
}
|
||||
assert!(!tracker.record_attempt("container-y"));
|
||||
|
||||
// Clear and try again
|
||||
tracker.clear("container-y");
|
||||
assert!(tracker.record_attempt("container-y"));
|
||||
assert_eq!(tracker.attempt_count("container-y"), 1);
|
||||
@@ -315,7 +422,6 @@ mod tests {
|
||||
#[test]
|
||||
fn test_restart_tracker_clear_nonexistent_is_safe() {
|
||||
let mut tracker = RestartTracker::new();
|
||||
// Should not panic
|
||||
tracker.clear("nonexistent");
|
||||
assert_eq!(tracker.attempt_count("nonexistent"), 0);
|
||||
}
|
||||
@@ -348,7 +454,6 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_max_restart_attempts_constant() {
|
||||
// Ensure the constant is a reasonable value (not 0, not too high)
|
||||
assert!(MAX_RESTART_ATTEMPTS >= 1);
|
||||
assert!(MAX_RESTART_ATTEMPTS <= 10);
|
||||
assert_eq!(MAX_RESTART_ATTEMPTS, 3);
|
||||
@@ -358,4 +463,75 @@ mod tests {
|
||||
fn test_check_interval_constant() {
|
||||
assert_eq!(CHECK_INTERVAL_SECS, 60);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_backoff_delays() {
|
||||
let tracker = RestartTracker::new();
|
||||
// Before any attempts, delay is first backoff
|
||||
assert_eq!(tracker.backoff_delay_secs("test"), BACKOFF_DELAYS_SECS[0]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_backoff_delays_escalate() {
|
||||
let mut tracker = RestartTracker::new();
|
||||
tracker.record_attempt("test");
|
||||
assert_eq!(tracker.backoff_delay_secs("test"), BACKOFF_DELAYS_SECS[0]); // 10s
|
||||
tracker.record_attempt("test");
|
||||
assert_eq!(tracker.backoff_delay_secs("test"), BACKOFF_DELAYS_SECS[1]); // 30s
|
||||
tracker.record_attempt("test");
|
||||
assert_eq!(tracker.backoff_delay_secs("test"), BACKOFF_DELAYS_SECS[2]); // 90s
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_backoff_elapsed_true_for_new() {
|
||||
let tracker = RestartTracker::new();
|
||||
assert!(tracker.backoff_elapsed("test"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stability_reset_not_triggered_early() {
|
||||
let mut tracker = RestartTracker::new();
|
||||
tracker.record_attempt("test");
|
||||
assert!(!tracker.should_reset_failed("test"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_container_tier_database() {
|
||||
assert_eq!(container_tier("archy-btcpay-db"), StartupTier::Database);
|
||||
assert_eq!(container_tier("immich_postgres"), StartupTier::Database);
|
||||
assert_eq!(container_tier("penpot-valkey"), StartupTier::Database);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_container_tier_core() {
|
||||
assert_eq!(container_tier("bitcoin-knots"), StartupTier::CoreInfra);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_container_tier_dependent() {
|
||||
assert_eq!(container_tier("lnd"), StartupTier::DependentService);
|
||||
assert_eq!(container_tier("mempool-electrs"), StartupTier::DependentService);
|
||||
assert_eq!(container_tier("archy-nbxplorer"), StartupTier::DependentService);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_container_tier_frontend() {
|
||||
assert_eq!(container_tier("archy-mempool-web"), StartupTier::Frontend);
|
||||
assert_eq!(container_tier("archy-bitcoin-ui"), StartupTier::Frontend);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_container_tier_application_default() {
|
||||
assert_eq!(container_tier("nextcloud"), StartupTier::Application);
|
||||
assert_eq!(container_tier("grafana"), StartupTier::Application);
|
||||
assert_eq!(container_tier("btcpay-server"), StartupTier::Application);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tier_ordering() {
|
||||
assert!(StartupTier::Database < StartupTier::CoreInfra);
|
||||
assert!(StartupTier::CoreInfra < StartupTier::DependentService);
|
||||
assert!(StartupTier::DependentService < StartupTier::Application);
|
||||
assert!(StartupTier::Application < StartupTier::Frontend);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -312,6 +312,13 @@ impl EndpointRateLimiter {
|
||||
limits.insert("system.shutdown".to_string(), (2, 300));
|
||||
// Password changes
|
||||
limits.insert("auth.changePassword".to_string(), (3, 300));
|
||||
// Federation join: prevent invite-code brute force
|
||||
limits.insert("federation.join".to_string(), (5, 60));
|
||||
limits.insert("federation.invite".to_string(), (10, 300));
|
||||
// Inter-node federation RPCs (unauthenticated, need stricter limits)
|
||||
limits.insert("federation.peer-joined".to_string(), (10, 60));
|
||||
limits.insert("federation.peer-address-changed".to_string(), (10, 60));
|
||||
limits.insert("federation.get-state".to_string(), (30, 60));
|
||||
|
||||
Self {
|
||||
requests: Arc::new(RwLock::new(HashMap::new())),
|
||||
|
||||
Reference in New Issue
Block a user