fix: watchdog killing backend every 60s on .198 (47 restarts/day)

Root cause: the readiness call sd_notify::notify(true, ...) unset the
NOTIFY_SOCKET env var, so the subsequent watchdog pings never reached
systemd and the backend was killed every 60s (WatchdogSec=60).

Fixes:
- Pass false as the first argument of the READY sd_notify::notify call
  (keeps NOTIFY_SOCKET set so watchdog pings work)
- Increase WatchdogSec from 60 to 300 (5min) for crash recovery
- Add TimeoutStartSec=300 for slow container startups
- Increase the watchdog ping interval from 30s to 120s

This was causing 47 restarts/day on .198 and blocking REBOOT-03,
FLEET-03, FLEET-04, VC-04.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dorian
2026-03-14 04:30:57 +00:00
parent ca45cf957c
commit 7776191fac
2 changed files with 6 additions and 4 deletions

View File

@@ -135,11 +135,12 @@ async fn main() -> Result<()> {
info!("WebSocket: ws://{}/ws", addr);
// Notify systemd that we're ready (Type=notify)
let _ = sd_notify::notify(true, &[sd_notify::NotifyState::Ready]);
// Note: first param `false` keeps NOTIFY_SOCKET so watchdog pings work
let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Ready]);
// Spawn systemd watchdog ping (WatchdogSec=60, ping every 30s)
// Spawn systemd watchdog ping (WatchdogSec=300, ping every 120s)
tokio::spawn(async {
let mut interval = tokio::time::interval(std::time::Duration::from_secs(30));
let mut interval = tokio::time::interval(std::time::Duration::from_secs(120));
loop {
interval.tick().await;
let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Watchdog]);

View File

@@ -12,7 +12,8 @@ ExecStartPre=/bin/bash -c 'mkdir -p /etc/archipelago && echo "ARCHIPELAGO_HOST_I
ExecStart=/usr/local/bin/archipelago
Restart=on-failure
RestartSec=5
WatchdogSec=60
WatchdogSec=300
TimeoutStartSec=300
[Install]
WantedBy=multi-user.target