- F1: Guard connectWebSocket against concurrent calls with isWsConnecting flag
- F2: Serialize mesh send operations with sendQueue to prevent fetchMessages races
- F3: Add global Vue error handler with toast notification
- S1: Replace sudo podman with podman across all scripts (rootless Podman)
- S2: Add health-cmd to all 40 container run commands in first-boot-containers.sh

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
98 lines
4.1 KiB
Bash
Executable File
98 lines
4.1 KiB
Bash
Executable File
#!/usr/bin/env bash
# Uptime Monitor for REL-05
# Runs every 5 minutes via cron, records metrics to a CSV file.
# Install: */5 * * * * /opt/archipelago/scripts/uptime-monitor.sh
#
# Tracks: timestamp, http_status, response_time_ms, cpu_percent,
#         mem_used_mb, mem_total_mb, disk_used_gb, disk_total_gb,
#         container_count, uptime_secs, restart_count
#
# Files written under $LOG_DIR:
#   metrics.csv    one row appended per run
#   restart-count  running count of detected restarts
#   last-uptime    uptime_secs value seen by the previous run
#   summary.json   totals recomputed from metrics.csv on every run
#
# A failed probe is itself a data point: every network/RPC step below is
# guarded so that `set -e` / `pipefail` can never abort the run before the
# CSV row for this interval has been written.

set -euo pipefail

readonly LOG_DIR="/var/lib/archipelago/uptime-monitor"
readonly LOG_FILE="$LOG_DIR/metrics.csv"
readonly RESTART_FILE="$LOG_DIR/restart-count"
readonly LAST_UPTIME_FILE="$LOG_DIR/last-uptime"
readonly SUMMARY_FILE="$LOG_DIR/summary.json"
readonly BACKEND_URL="http://localhost:5678/health"
readonly RPC_URL="http://localhost:5678/rpc/v1"

#######################################
# Print one value derived from the system.stats "result" object.
# Globals:   STATS (read) - raw JSON body returned by the RPC call
# Arguments: $1 - Python expression evaluated with `d` bound to the
#                 "result" dict; only ever called with literals below
# Outputs:   the value on stdout, or "0" if parsing/evaluation fails
#######################################
stat_field() {
  printf '%s\n' "$STATS" \
    | python3 -c "import sys,json; d=json.load(sys.stdin).get('result',{}); print($1)" 2>/dev/null \
    || echo "0"
}

mkdir -p "$LOG_DIR"

# Write CSV header if the file is missing or empty
if [ ! -s "$LOG_FILE" ]; then
  echo "timestamp,http_status,response_ms,cpu_percent,mem_used_mb,mem_total_mb,disk_used_gb,disk_total_gb,containers,uptime_secs,restart_count" > "$LOG_FILE"
fi

# Track restart count
if [ ! -f "$RESTART_FILE" ]; then
  echo "0" > "$RESTART_FILE"
fi
RESTART_COUNT=$(cat "$RESTART_FILE" 2>/dev/null) || RESTART_COUNT=0
# An empty or corrupted counter file must not break the arithmetic below.
[[ "$RESTART_COUNT" =~ ^[0-9]+$ ]] || RESTART_COUNT=0

TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

# Check HTTP health
HTTP_START=$(date +%s%N)
# On a failed transfer curl itself prints "000" for %{http_code}, so its
# non-zero exit is only swallowed here; appending a fallback string would
# yield "000000".
HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$BACKEND_URL" 2>/dev/null) || true
HTTP_END=$(date +%s%N)
# Covers curl being unavailable (empty output) or any unexpected output.
[[ "$HTTP_STATUS" =~ ^[0-9]{3}$ ]] || HTTP_STATUS="000"
RESPONSE_MS=$(( (HTTP_END - HTTP_START) / 1000000 ))

# Authenticate for RPC access.
# The session cookies go to a private temp file that is removed on exit
# instead of a predictable, world-guessable path in /tmp.
COOKIE_JAR=$(mktemp "${TMPDIR:-/tmp}/uptime-cookies.XXXXXX")
trap 'rm -f -- "$COOKIE_JAR"' EXIT

# NOTE(review): the login password is hard-coded here; it should come from a
# root-only config file or the environment. Left unchanged to preserve
# behaviour.
# `|| true`: when the backend is down this call fails, and the outage still
# has to be recorded further down.
curl -s -c "$COOKIE_JAR" --max-time 5 -X POST "$RPC_URL" \
  -H "Content-Type: application/json" \
  -d '{"method":"auth.login","params":{"password":"password123"}}' >/dev/null 2>&1 || true

# Last field of the csrf_token cookie line; empty when login did not succeed.
CSRF=$(awk '/csrf_token/ { tok = $NF } END { print tok }' "$COOKIE_JAR" 2>/dev/null) || CSRF=""

# Get system stats from RPC; fall back to an empty result so every field
# below resolves to its default.
STATS=$(curl -s --max-time 10 -b "$COOKIE_JAR" \
  -H "Content-Type: application/json" \
  -H "X-CSRF-Token: $CSRF" \
  -X POST "$RPC_URL" \
  -d '{"method":"system.stats"}' 2>/dev/null) || STATS='{"result":{}}'

CPU=$(stat_field "d.get('cpu_usage_percent',0)")
MEM_USED=$(stat_field "round(d.get('mem_used_bytes',0)/1048576)")
MEM_TOTAL=$(stat_field "round(d.get('mem_total_bytes',0)/1048576)")
DISK_USED=$(stat_field "round(d.get('disk_used_bytes',0)/1073741824,1)")
DISK_TOTAL=$(stat_field "round(d.get('disk_total_bytes',0)/1073741824,1)")
UPTIME=$(stat_field "d.get('uptime_secs',0)")

# Count running containers.
# Under pipefail a podman failure fails the whole pipeline even though
# `wc -l` already printed "0"; assign the fallback instead of echoing a
# second value into the same substitution.
CONTAINERS=$(podman ps --format "{{.Names}}" 2>/dev/null | wc -l) || CONTAINERS=0
CONTAINERS=${CONTAINERS//[[:space:]]/}

# Detect restart (uptime < 300s = likely just restarted).
# Only count it once: the previous run must have seen an uptime above 300s.
# `[ ... ] 2>/dev/null` makes a non-integer UPTIME simply fail the test.
# UPTIME defaults to 0 when stats are unavailable, so that case also takes
# this branch.
if [ "$UPTIME" -lt 300 ] 2>/dev/null; then
  LAST_UPTIME=$(cat "$LAST_UPTIME_FILE" 2>/dev/null) || LAST_UPTIME=99999
  if [ "$LAST_UPTIME" -gt 300 ] 2>/dev/null; then
    # 10# forces base 10 so a zero-padded counter is not read as octal.
    RESTART_COUNT=$((10#$RESTART_COUNT + 1))
    echo "$RESTART_COUNT" > "$RESTART_FILE"
  fi
fi
echo "$UPTIME" > "$LAST_UPTIME_FILE"

# Append metrics
printf '%s\n' "$TIMESTAMP,$HTTP_STATUS,$RESPONSE_MS,$CPU,$MEM_USED,$MEM_TOTAL,$DISK_USED,$DISK_TOTAL,$CONTAINERS,$UPTIME,$RESTART_COUNT" >> "$LOG_FILE"

# Generate summary report (header row excluded from all counts)
TOTAL_CHECKS=$(awk 'NR > 1 { n++ } END { print n + 0 }' "$LOG_FILE")
if [ "$TOTAL_CHECKS" -gt 0 ]; then
  # Match on the http_status column only: a plain ",200," search also hits
  # rows whose response time, memory or uptime happens to be 200, and
  # `grep -c` exits 1 after printing "0" when nothing matches.
  OK_CHECKS=$(awk -F, 'NR > 1 && $2 == "200" { n++ } END { print n + 0 }' "$LOG_FILE")
  UPTIME_PCT=$(python3 -c 'import sys; print(round(int(sys.argv[1]) / int(sys.argv[2]) * 100, 3))' \
    "$OK_CHECKS" "$TOTAL_CHECKS" 2>/dev/null) || UPTIME_PCT=0
  START_TS=$(awk -F, 'NR == 2 { print $1; exit }' "$LOG_FILE")

  # Write to a temp file and rename so readers never see a partial summary.
  cat > "$SUMMARY_FILE.tmp" << EOF
{
  "start": "$START_TS",
  "last_check": "$TIMESTAMP",
  "total_checks": $TOTAL_CHECKS,
  "ok_checks": $OK_CHECKS,
  "uptime_percent": $UPTIME_PCT,
  "restart_count": $RESTART_COUNT,
  "current_status": "$HTTP_STATUS"
}
EOF
  mv -f -- "$SUMMARY_FILE.tmp" "$SUMMARY_FILE"
fi