Files
archy/scripts/generate-stability-report.sh
Dorian 067df69ce9 feat: deploy daily reboot test + stability report generator (SOAK-03/04)
SOAK-03: daily-reboot-test.sh deployed on both nodes via cron (4 AM).
  Systemd oneshot verifies recovery on boot, logs to reboot-test.csv.

SOAK-04: generate-stability-report.sh compiles metrics from
  uptime-monitor, reboot-test, sync-check CSVs. Initial .228 report:
  99.847% uptime, 0 OOM kills, 32/32 containers.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-14 05:37:16 +00:00

125 lines
5.1 KiB
Bash
Executable File

#!/bin/bash
# generate-stability-report.sh — Compile stability report from monitoring data
#
# Meant to be run once the 30-day soak test period is over.
# Usage: ./scripts/generate-stability-report.sh [TARGET_IP]
#
# Globals set here and read by every report section below:
#   TARGET   - node to query; first argument, 192.168.1.228 when omitted/empty
#   SSH_KEY  - private key handed to ssh via -i
#   SSH_OPTS - ssh option string; the sections expand it unquoted on purpose
#
# NOTE(review): StrictHostKeyChecking=no turns off host key verification, and
# each section sends ssh's stderr to /dev/null, so an unreachable node shows up
# as empty sections rather than as an error — confirm both are intended.

TARGET="192.168.1.228"
if [ -n "${1:-}" ]; then
  TARGET="$1"
fi

SSH_KEY="$HOME/.ssh/archipelago-deploy"
SSH_OPTS="-i $SSH_KEY -o StrictHostKeyChecking=no -o ConnectTimeout=10"

# Banner followed by the node address and a UTC generation timestamp.
generated_at=$(date -u +%Y-%m-%dT%H:%M:%SZ)
printf '%s\n' \
  "╔════════════════════════════════════════════════════════════════╗" \
  "║ Archipelago Stability Report ║" \
  "╚════════════════════════════════════════════════════════════════╝" \
  "" \
  "Node: ${TARGET}" \
  "Generated: ${generated_at}" \
  ""
# Uptime metrics
#
# Summarises /var/lib/archipelago/uptime-monitor/metrics.csv on the node:
# the number of data rows, how many of them contain ",200,", the resulting
# percentage, and the first field of the first and last data rows.
echo "═══ Uptime Metrics ═══"

# The remote script is kept in a quoted here-doc so nothing is expanded locally
# and no backslash escaping is needed. read returns non-zero at end of input,
# which is expected, hence "|| true".
IFS= read -r -d '' uptime_script <<'REMOTE' || true
CSV=/var/lib/archipelago/uptime-monitor/metrics.csv
if [ -f "$CSV" ]; then
  # Line 1 is treated as a header, so both counts look at data rows only.
  TOTAL=$(tail -n +2 "$CSV" | wc -l)
  # grep -c already prints 0 (and exits 1) when nothing matches. Appending
  # "|| echo 0" would produce "0<newline>0" and corrupt the arithmetic below.
  OK=$(tail -n +2 "$CSV" | grep -c ',200,' 2>/dev/null)
  OK=${OK:-0}
  if [ "$TOTAL" -gt 0 ]; then
    # python3 is used for the floating point division; "?" when unavailable.
    PCT=$(python3 -c "print(round($OK / $TOTAL * 100, 3))" 2>/dev/null || echo '?')
    echo " Total checks: $TOTAL"
    echo " Healthy: $OK"
    echo " Uptime: ${PCT}%"
    FIRST=$(sed -n '2p' "$CSV" | cut -d, -f1)
    LAST=$(tail -n 1 "$CSV" | cut -d, -f1)
    echo " Period: $FIRST to $LAST"
  else
    # File exists but holds no data rows yet.
    echo ' No uptime data found'
  fi
else
  echo ' No uptime data found'
fi
REMOTE

# shellcheck disable=SC2086 # SSH_OPTS is an option string and must word-split
ssh $SSH_OPTS "archipelago@${TARGET}" "$uptime_script" 2>/dev/null
echo ""
# Reboot test results
#
# Summarises /var/lib/archipelago/monitoring/reboot-test.csv on the node:
# rows containing ",reboot,", rows containing ",verify,", verify rows that also
# contain ",OK,", and the integer mean of field 7 over the verify rows (printed
# as the average recovery time in seconds).
echo "═══ Daily Reboot Tests ═══"

# Quoted here-doc: the script reaches the node verbatim, no local expansion.
# read returns non-zero at end of input, which is expected, hence "|| true".
IFS= read -r -d '' reboot_script <<'REMOTE' || true
CSV=/var/lib/archipelago/monitoring/reboot-test.csv
if [ -f "$CSV" ]; then
  # grep -c prints 0 itself when nothing matches (exit status 1), so no
  # "|| echo 0" fallback: that would turn the value into "0<newline>0".
  REBOOTS=$(grep -c ',reboot,' "$CSV" 2>/dev/null)
  REBOOTS=${REBOOTS:-0}
  VERIFIED=$(grep -c ',verify,' "$CSV" 2>/dev/null)
  VERIFIED=${VERIFIED:-0}
  OK=$(grep -c ',verify,.*,OK,' "$CSV" 2>/dev/null)
  OK=${OK:-0}
  # Always print the counts: reboots with no verified recovery are exactly
  # the case a stability report must not hide.
  echo " Total reboots: $REBOOTS"
  echo " Verified recoveries: $VERIFIED"
  echo " Successful: $OK"
  if [ "$VERIFIED" -gt 0 ]; then
    AVG=$(awk -F, '/,verify,/ {sum += $7; n++} END {if (n > 0) print int(sum / n); else print 0}' "$CSV")
    echo " Avg recovery time: ${AVG}s"
  fi
else
  echo ' No reboot test data (starts at 4 AM daily)'
fi
REMOTE

# shellcheck disable=SC2086 # SSH_OPTS is an option string and must word-split
ssh $SSH_OPTS "archipelago@${TARGET}" "$reboot_script" 2>/dev/null
echo ""
# Federation sync
#
# Summarises /var/lib/archipelago/monitoring/sync-check.csv on the node:
# the number of data rows, how many of them have a second field greater than
# zero, and the resulting success percentage.
echo "═══ Federation Sync ═══"

# Quoted here-doc: the script reaches the node verbatim, no local expansion.
# read returns non-zero at end of input, which is expected, hence "|| true".
IFS= read -r -d '' sync_script <<'REMOTE' || true
CSV=/var/lib/archipelago/monitoring/sync-check.csv
if [ -f "$CSV" ]; then
  # Line 1 is treated as a header and skipped in both counts.
  TOTAL=$(tail -n +2 "$CSV" | wc -l)
  # NR > 1 matters: a non-numeric header field is compared to 0 as a string
  # and would otherwise be counted as a success, pushing OK above TOTAL.
  OK=$(awk -F, 'NR > 1 && $2 > 0 {n++} END {print n + 0}' "$CSV")
  if [ "$TOTAL" -gt 0 ]; then
    # python3 is used for the floating point division; "?" when unavailable.
    PCT=$(python3 -c "print(round($OK / $TOTAL * 100, 1))" 2>/dev/null || echo '?')
    echo " Total syncs: $TOTAL"
    echo " Successful: $OK"
    echo " Success rate: ${PCT}%"
  else
    # File exists but holds no data rows yet.
    echo ' No sync data yet'
  fi
else
  echo ' No sync data yet'
fi
REMOTE

# shellcheck disable=SC2086 # SSH_OPTS is an option string and must word-split
ssh $SSH_OPTS "archipelago@${TARGET}" "$sync_script" 2>/dev/null
echo ""
# Memory trend
#
# Shows two rows of /var/lib/archipelago/uptime-monitor/metrics.csv from the
# node — the second line of the file and its last line — formatted as
# "<field 1>: <field 7>/<field 8> MB used". Prints nothing if the file is absent.
# NOTE(review): fields 7 and 8 are presumably used/total memory in MB — confirm
# against the uptime monitor that writes the CSV.
echo "═══ Memory Trend ═══"

# Quoted here-doc: the script reaches the node verbatim, no local expansion.
# read returns non-zero at end of input, which is expected, hence "|| true".
IFS= read -r -d '' memory_script <<'REMOTE' || true
METRICS=/var/lib/archipelago/uptime-monitor/metrics.csv
[ -f "$METRICS" ] || exit 0
fmt_row() { awk -F, '{printf " %s: %s/%s MB used\n", $1, $7, $8}'; }
echo ' First reading:'
head -n 2 "$METRICS" | tail -n 1 | fmt_row
echo ' Latest reading:'
tail -n 1 "$METRICS" | fmt_row
REMOTE

# shellcheck disable=SC2086 # SSH_OPTS is an option string and must word-split
ssh $SSH_OPTS "archipelago@${TARGET}" "$memory_script" 2>/dev/null
echo ""
# Disk trend
#
# Shows two rows of /var/lib/archipelago/uptime-monitor/metrics.csv from the
# node — the second line of the file and its last line — formatted as
# "<field 1>: <field 9>/<field 10> GB". Prints nothing if the file is absent.
# NOTE(review): fields 9 and 10 are presumably used/total disk in GB — confirm
# against the uptime monitor that writes the CSV.
echo "═══ Disk Trend ═══"

# Quoted here-doc: the script reaches the node verbatim, no local expansion.
# read returns non-zero at end of input, which is expected, hence "|| true".
IFS= read -r -d '' disk_script <<'REMOTE' || true
METRICS=/var/lib/archipelago/uptime-monitor/metrics.csv
[ -f "$METRICS" ] || exit 0
fmt_row() { awk -F, '{printf " %s: %s/%s GB\n", $1, $9, $10}'; }
echo ' First reading:'
head -n 2 "$METRICS" | tail -n 1 | fmt_row
echo ' Latest reading:'
tail -n 1 "$METRICS" | fmt_row
REMOTE

# shellcheck disable=SC2086 # SSH_OPTS is an option string and must word-split
ssh $SSH_OPTS "archipelago@${TARGET}" "$disk_script" 2>/dev/null
echo ""
# Container health
#
# Counts running and exited containers on the node through sudo, using podman
# when it is on the PATH and docker otherwise, and lists name and status of
# every exited container when there is at least one.
echo "═══ Container Health ═══"

# Quoted here-doc: the script reaches the node verbatim, no local expansion.
# read returns non-zero at end of input, which is expected, hence "|| true".
IFS= read -r -d '' container_script <<'REMOTE' || true
if command -v podman >/dev/null 2>&1; then
  ENGINE=podman
else
  ENGINE=docker
fi
N_RUNNING=$(sudo "$ENGINE" ps --format '{{.Names}}' 2>/dev/null | wc -l)
N_EXITED=$(sudo "$ENGINE" ps -a --filter status=exited --format '{{.Names}}' 2>/dev/null | wc -l)
echo " Running: $N_RUNNING"
echo " Exited: $N_EXITED"
if [ "$N_EXITED" -gt 0 ]; then
  echo ' Exited containers:'
  sudo "$ENGINE" ps -a --filter status=exited --format ' {{.Names}}: {{.Status}}' 2>/dev/null
fi
REMOTE

# shellcheck disable=SC2086 # SSH_OPTS is an option string and must word-split
ssh $SSH_OPTS "archipelago@${TARGET}" "$container_script" 2>/dev/null
echo ""
# OOM kills
#
# Counts OOM-killer victims in the node's kernel ring buffer (err and crit
# levels only) and then prints the closing line of the report.
echo "═══ OOM Kills ═══"

# Quoted here-doc: the script reaches the node verbatim, no local expansion.
# read returns non-zero at end of input, which is expected, hence "|| true".
IFS= read -r -d '' oom_script <<'REMOTE' || true
# grep -c prints 0 itself when nothing matches (exit status 1); a trailing
# "|| echo 0" would print a second 0 on its own line.
# NOTE(review): the pattern is "Killed process" because that is the line the
# kernel emits per victim at err level; the "oom-kill:" summary line appears to
# be logged at info level and would be filtered out by --level=err,crit —
# confirm against the node's kernel. The ring buffer can also wrap, so this
# count may under-report on long uptimes.
OOM=$(sudo dmesg --level=err,crit 2>/dev/null | grep -c 'Killed process')
OOM=${OOM:-0}
echo " OOM kills since boot: $OOM"
REMOTE

# shellcheck disable=SC2086 # SSH_OPTS is an option string and must word-split
ssh $SSH_OPTS "archipelago@${TARGET}" "$oom_script" 2>/dev/null
echo ""
echo "═══ Report Complete ═══"