From 7776191fac167e751b7170323265abb023379467 Mon Sep 17 00:00:00 2001
From: Dorian
Date: Sat, 14 Mar 2026 04:30:57 +0000
Subject: [PATCH] fix: watchdog killing backend every 60s on .198 (47 restarts/day)

Root cause: sd_notify::notify(true, ...) cleared NOTIFY_SOCKET env var,
so watchdog pings never reached systemd. Backend killed every 60s.

Fixes:
- Change sd_notify::notify first param to false (keep socket)
- Increase WatchdogSec from 60 to 300 (5min) for crash recovery
- Add TimeoutStartSec=300 for slow container startups
- Adjust watchdog ping interval to 120s

This was causing 47 restarts/day on .198 and blocking REBOOT-03,
FLEET-03, FLEET-04, VC-04.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 core/archipelago/src/main.rs             | 7 ++++---
 image-recipe/configs/archipelago.service | 3 ++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/core/archipelago/src/main.rs b/core/archipelago/src/main.rs
index 8959cfde..6614493a 100644
--- a/core/archipelago/src/main.rs
+++ b/core/archipelago/src/main.rs
@@ -135,11 +135,12 @@ async fn main() -> Result<()> {
     info!("WebSocket: ws://{}/ws", addr);
 
     // Notify systemd that we're ready (Type=notify)
-    let _ = sd_notify::notify(true, &[sd_notify::NotifyState::Ready]);
+    // Note: first param `false` keeps NOTIFY_SOCKET so watchdog pings work
+    let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Ready]);
 
-    // Spawn systemd watchdog ping (WatchdogSec=60, ping every 30s)
+    // Spawn systemd watchdog ping (WatchdogSec=300, ping every 120s)
     tokio::spawn(async {
-        let mut interval = tokio::time::interval(std::time::Duration::from_secs(30));
+        let mut interval = tokio::time::interval(std::time::Duration::from_secs(120));
         loop {
             interval.tick().await;
             let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Watchdog]);
diff --git a/image-recipe/configs/archipelago.service b/image-recipe/configs/archipelago.service
index 45acc4c8..4301836b 100644
--- a/image-recipe/configs/archipelago.service
+++ b/image-recipe/configs/archipelago.service
@@ -12,7 +12,8 @@ ExecStartPre=/bin/bash -c 'mkdir -p /etc/archipelago && echo "ARCHIPELAGO_HOST_I
 ExecStart=/usr/local/bin/archipelago
 Restart=on-failure
 RestartSec=5
-WatchdogSec=60
+WatchdogSec=300
+TimeoutStartSec=300
 
 [Install]
 WantedBy=multi-user.target