Files
archy/scripts/uptime-monitor.sh
Dorian 38dc845f57 fix: WebSocket race conditions, Vue error handler, remove sudo podman, add container health checks
- F1: Guard connectWebSocket against concurrent calls with isWsConnecting flag
- F2: Serialize mesh send operations with sendQueue to prevent fetchMessages races
- F3: Add global Vue error handler with toast notification
- S1: Replace sudo podman with podman across all scripts (rootless Podman)
- S2: Add health-cmd to all 40 container run commands in first-boot-containers.sh

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 01:11:05 +00:00

98 lines
4.1 KiB
Bash
Executable File

#!/usr/bin/env bash
# Uptime Monitor for REL-05
# Runs every 5 minutes via cron, records metrics to a CSV file.
# Install: */5 * * * * /opt/archipelago/scripts/uptime-monitor.sh
#
# Tracks: timestamp, http_status, response_time_ms, cpu_percent,
# mem_used_mb, mem_total_mb, disk_used_gb, disk_total_gb,
# container_count, uptime_secs, restart_count
set -euo pipefail

# Where samples and state are kept, and the backend endpoints that get probed.
LOG_DIR="/var/lib/archipelago/uptime-monitor"
LOG_FILE="$LOG_DIR/metrics.csv"
RESTART_FILE="$LOG_DIR/restart-count"
BACKEND_URL="http://localhost:5678/health"
RPC_URL="http://localhost:5678/rpc/v1"

mkdir -p "$LOG_DIR"

# Write the CSV header when the file is missing or empty (-s is false for a
# zero-byte file, so a file truncated before its header was written is
# repaired instead of collecting header-less rows).
if [[ ! -s "$LOG_FILE" ]]; then
  echo "timestamp,http_status,response_ms,cpu_percent,mem_used_mb,mem_total_mb,disk_used_gb,disk_total_gb,containers,uptime_secs,restart_count" > "$LOG_FILE"
fi

# Track restart count: seed the counter file on the first run.
if [[ ! -f "$RESTART_FILE" ]]; then
  echo "0" > "$RESTART_FILE"
fi
RESTART_COUNT=$(cat "$RESTART_FILE" 2>/dev/null || echo "0")
# An empty or corrupted counter file would otherwise leak a blank or garbage
# value into the CSV row and the JSON summary, so anything that is not a plain
# non-negative integer is treated as 0. 10# forces base 10 so a value such as
# "08" is not rejected as invalid octal in later arithmetic.
if [[ "$RESTART_COUNT" =~ ^[0-9]+$ ]]; then
  RESTART_COUNT=$((10#$RESTART_COUNT))
else
  RESTART_COUNT=0
fi

# UTC timestamp of this run, used for the CSV row and the summary.
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
# Check HTTP health and time the request in milliseconds.
# curl's -w "%{http_code}" already prints "000" when the request fails, so the
# failure path must not echo a second "000" (that recorded "000000"); the
# non-zero exit status is simply tolerated here.
HTTP_START=$(date +%s%N)
HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$BACKEND_URL" 2>/dev/null) || true
HTTP_END=$(date +%s%N)
# Anything that is not a three-digit code (curl not installed, empty output)
# is recorded as "000" so the CSV column stays well-formed.
if [[ ! "$HTTP_STATUS" =~ ^[0-9]{3}$ ]]; then
  HTTP_STATUS="000"
fi
# date +%s%N yields nanoseconds; convert the elapsed time to milliseconds.
RESPONSE_MS=$(( (HTTP_END - HTTP_START) / 1000000 ))
# Authenticate for RPC access, then fetch and parse system.stats.
# Every step is best-effort: when the backend is unreachable the metrics fall
# back to 0 and the script carries on, so the outage itself still gets logged
# (an unguarded curl failure would abort the run under `set -e`).

# Private, unpredictable cookie jar instead of a fixed path in /tmp; it is
# removed when the script exits.
if COOKIE_JAR=$(mktemp "${TMPDIR:-/tmp}/uptime-cookies.XXXXXX" 2>/dev/null); then
  trap 'rm -f -- "$COOKIE_JAR"' EXIT
else
  # No temp file available: the RPC calls run without a stored session.
  COOKIE_JAR=/dev/null
fi

# NOTE(review): the RPC password is hard-coded in this script; it should be
# read from a root-only file or the environment instead.
curl -s -c "$COOKIE_JAR" --max-time 5 -X POST "$RPC_URL" \
  -H "Content-Type: application/json" \
  -d '{"method":"auth.login","params":{"password":"password123"}}' \
  >/dev/null 2>&1 || true

# Last field of the first cookie-jar line mentioning csrf_token; empty when
# login failed or no such cookie was set.
CSRF=$(awk '/csrf_token/ { print $NF; exit }' "$COOKIE_JAR" 2>/dev/null) || CSRF=""

# Get system stats from RPC; an empty result object stands in on failure.
STATS=$(curl -s --max-time 10 -b "$COOKIE_JAR" \
  -H "Content-Type: application/json" \
  -H "X-CSRF-Token: $CSRF" \
  -X POST "$RPC_URL" \
  -d '{"method":"system.stats"}' 2>/dev/null) || STATS='{"result":{}}'

# parse_stats: read a system.stats reply on stdin and print one line:
#   cpu_percent mem_used_mb mem_total_mb disk_used_gb disk_total_gb uptime_secs
# Memory is rounded to whole MiB, disk to one decimal of GiB. Missing or
# non-numeric fields count as 0; if python3 itself fails, all six are 0.
parse_stats() {
  python3 -c '
import json, sys

try:
    stats = json.load(sys.stdin).get("result", {})
except Exception:
    stats = {}
if not isinstance(stats, dict):
    stats = {}


def num(key):
    value = stats.get(key, 0)
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return 0
    return value


MIB, GIB = 1048576, 1073741824
print(
    num("cpu_usage_percent"),
    round(num("mem_used_bytes") / MIB),
    round(num("mem_total_bytes") / MIB),
    round(num("disk_used_bytes") / GIB, 1),
    round(num("disk_total_bytes") / GIB, 1),
    num("uptime_secs"),
)
' 2>/dev/null || echo "0 0 0 0 0 0"
}

# One interpreter launch for all six fields instead of one per field.
read -r CPU MEM_USED MEM_TOTAL DISK_USED DISK_TOTAL UPTIME \
  < <(printf '%s' "$STATS" | parse_stats) || true
CPU=${CPU:-0}
MEM_USED=${MEM_USED:-0}
MEM_TOTAL=${MEM_TOTAL:-0}
DISK_USED=${DISK_USED:-0}
DISK_TOTAL=${DISK_TOTAL:-0}
UPTIME=${UPTIME:-0}
# Count running containers.
# `podman ps` is allowed to fail (podman missing, no usable session): the
# inner `|| true` keeps pipefail from failing the pipeline, so `wc -l` alone
# produces the value. With a trailing `|| echo "0"` instead, a podman failure
# yielded wc's "0" plus a second "0" on a new line, corrupting the CSV row.
CONTAINERS=$({ podman ps --format "{{.Names}}" 2>/dev/null || true; } | wc -l)
# Some wc implementations pad the number with spaces; keep only the digits.
CONTAINERS=${CONTAINERS//[[:space:]]/}
CONTAINERS=${CONTAINERS:-0}
# Detect restart (uptime < 300s = likely just restarted).
# The uptime seen on the previous run is kept in a state file so that one
# restart is counted once: the counter only moves when the current uptime is
# below 300s and the previously stored one was above it. With no state file
# the previous value reads as 99999. Non-numeric values make the `[` tests
# fail quietly (stderr is discarded), which skips the count.
LAST_UPTIME_FILE="$LOG_DIR/last-uptime"

if [ "$UPTIME" -lt 300 ] 2>/dev/null; then
  PREV_UPTIME=$(cat "$LAST_UPTIME_FILE" 2>/dev/null || echo "99999")
  if [ "$PREV_UPTIME" -gt 300 ] 2>/dev/null; then
    RESTART_COUNT=$((RESTART_COUNT + 1))
    echo "$RESTART_COUNT" > "$RESTART_FILE"
  fi
fi

# Remember this run's uptime for the next comparison, restart or not.
echo "$UPTIME" > "$LAST_UPTIME_FILE"
# Append this run's sample to the metrics CSV as a single row; the field
# order follows the "Tracks:" list in the file header.
printf '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' \
  "$TIMESTAMP" "$HTTP_STATUS" "$RESPONSE_MS" \
  "$CPU" "$MEM_USED" "$MEM_TOTAL" "$DISK_USED" "$DISK_TOTAL" \
  "$CONTAINERS" "$UPTIME" "$RESTART_COUNT" >> "$LOG_FILE"
# Generate summary report (summary.json next to the metrics CSV).

# count_ok_checks FILE
# Print the number of data rows in the metrics CSV FILE whose http_status
# column (field 2) is exactly 200. Prints 0 when nothing matches or FILE
# cannot be read. Matching the column itself avoids two problems of a plain
# `grep -c ",200,"`: it also hit other columns (e.g. a 200 ms response time on
# a failed check), and on zero matches grep prints "0" while exiting 1, so a
# `|| echo "0"` fallback produced "0" twice.
count_ok_checks() {
  { awk -F, 'NR > 1 && $2 == "200" { ok++ } END { print ok + 0 }' < "$1"; } 2>/dev/null \
    || echo "0"
}

TOTAL_CHECKS=$(wc -l < "$LOG_FILE")
TOTAL_CHECKS=$((TOTAL_CHECKS - 1)) # exclude header
if [ "$TOTAL_CHECKS" -gt 0 ]; then
  OK_CHECKS=$(count_ok_checks "$LOG_FILE")
  # Values are passed as arguments rather than pasted into the Python source.
  UPTIME_PCT=$(python3 -c 'import sys; print(round(int(sys.argv[1]) / int(sys.argv[2]) * 100, 3))' \
    "$OK_CHECKS" "$TOTAL_CHECKS" 2>/dev/null || echo "0")
  # Timestamp of the first data row (line 2, column 1).
  FIRST_CHECK=$(awk -F, 'NR == 2 { print $1; exit }' "$LOG_FILE" 2>/dev/null || true)
  cat > "$LOG_DIR/summary.json" << EOF
{
"start": "$FIRST_CHECK",
"last_check": "$TIMESTAMP",
"total_checks": $TOTAL_CHECKS,
"ok_checks": $OK_CHECKS,
"uptime_percent": $UPTIME_PCT,
"restart_count": ${RESTART_COUNT:-0},
"current_status": "$HTTP_STATUS"
}
EOF
fi