Files
archy/docs/operations-runbook.md
2026-03-22 03:30:21 +00:00

8.9 KiB

Archipelago Operations Runbook

Quick reference for common operational tasks on Archipelago nodes.

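Primary node: 192.168.1.228 (Arch 1)
Secondary node: 192.168.1.198 (Arch 2)
SSH: ssh -i ~/.ssh/archipelago-deploy archipelago@{IP}
Sudo: echo 'EwPDR8q45l0Upx@' | sudo -S {command}
Security note: the sudo password above is stored here in plaintext. Rotate it and move it to a secrets manager, and prefer an interactive sudo prompt so the password does not end up in shell history.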


1. Check Node Health

# Quick health check (from any machine)
curl http://192.168.1.228/health        # Should return "OK"
curl http://192.168.1.198/health

# Detailed system stats via RPC
curl -s -X POST -H "Content-Type: application/json" \
  -d '{"method":"system.stats"}' \
  http://192.168.1.228:5678/rpc/v1

# Check services
ssh archipelago@192.168.1.228
sudo systemctl status archipelago       # Backend service
sudo systemctl status nginx             # Web server
sudo systemctl status tor               # Tor hidden services

2. Check Container Status

# List all containers
podman ps -a

# Running count
podman ps --format '{{.Names}}' | wc -l

# Find exited/crashed containers
podman ps -a --filter status=exited

# Container logs
podman logs {container-name} --tail 50

# Container resource usage
podman stats --no-stream

3. Fix Crashed Containers

# Restart a specific container
podman restart {container-name}

# If container won't start, check logs first
podman logs {container-name} --tail 100

# Remove and recreate (last resort)
podman rm -f {container-name}
# Then redeploy from the development machine with: ./scripts/deploy-to-target.sh --live

# The health monitor checks containers every 60s and auto-restarts any that have stopped
# Check its status:
sudo journalctl -u archipelago --grep="health_monitor" --no-pager -n 20

4. Add/Remove Federation Peers

# Generate invite code (on inviting node)
# Via UI: Federation page > Generate Invite
# Via RPC:
curl -s -X POST -H "Content-Type: application/json" \
  -H "Cookie: session={session}; csrf_token={csrf}" \
  -H "X-CSRF-Token: {csrf}" \
  -d '{"method":"federation.invite"}' \
  http://localhost:5678/rpc/v1

# Join federation (on joining node)
curl -s -X POST -H "Content-Type: application/json" \
  -H "Cookie: session={session}; csrf_token={csrf}" \
  -H "X-CSRF-Token: {csrf}" \
  -d '{"method":"federation.join","params":{"invite_code":"{code}"}}' \
  http://localhost:5678/rpc/v1

# List peers
curl -s -X POST -H "Content-Type: application/json" \
  -d '{"method":"federation.list-nodes"}' \
  http://localhost:5678/rpc/v1

# Remove a peer
curl -s -X POST -H "Content-Type: application/json" \
  -H "Cookie: session={session}; csrf_token={csrf}" \
  -H "X-CSRF-Token: {csrf}" \
  -d '{"method":"federation.remove-node","params":{"did":"{peer-did}"}}' \
  http://localhost:5678/rpc/v1

5. Rotate Tor Address

# Delete current hidden service keys (irreversible: the old .onion address cannot be recovered)
sudo rm -rf /var/lib/tor/hidden_service/
sudo systemctl restart tor

# Wait for new hostname
sleep 15
sudo cat /var/lib/tor/hidden_service/hostname

# The backend picks up the new address automatically (30s refresh)
# Federation peers need to re-discover via sync

6. Create/Restore Backups

# Create encrypted backup (via RPC)
curl -s -X POST -H "Content-Type: application/json" \
  -H "Cookie: session={session}; csrf_token={csrf}" \
  -H "X-CSRF-Token: {csrf}" \
  -d '{"method":"backup.create","params":{"passphrase":"your-passphrase","description":"manual backup"}}' \
  http://localhost:5678/rpc/v1

# List backups
curl -s -X POST -H "Content-Type: application/json" \
  -H "Cookie: session={session}; csrf_token={csrf}" \
  -H "X-CSRF-Token: {csrf}" \
  -d '{"method":"backup.list"}' \
  http://localhost:5678/rpc/v1

# Verify backup integrity
curl -s -X POST -H "Content-Type: application/json" \
  -H "Cookie: session={session}; csrf_token={csrf}" \
  -H "X-CSRF-Token: {csrf}" \
  -d '{"method":"backup.verify","params":{"id":"{backup-id}","passphrase":"your-passphrase"}}' \
  http://localhost:5678/rpc/v1

# Restore (warning: overwrites current identity/data)
curl -s -X POST -H "Content-Type: application/json" \
  -H "Cookie: session={session}; csrf_token={csrf}" \
  -H "X-CSRF-Token: {csrf}" \
  -d '{"method":"backup.restore","params":{"id":"{backup-id}","passphrase":"your-passphrase"}}' \
  http://localhost:5678/rpc/v1

# Backup files stored at: /var/lib/archipelago/backups/

7. Update the Node

# From development machine:
./scripts/deploy-to-target.sh --live     # Deploy to .228
./scripts/deploy-to-target.sh --both     # Deploy to both nodes
./scripts/deploy-to-target.sh --dry-run --live  # Preview changes

# The deploy script:
# 1. Syncs code to target
# 2. Builds frontend (vue-tsc + vite)
# 3. Builds backend (cargo build --release)
# 4. Deploys binary, frontend, configs
# 5. Restarts services
# 6. Verifies health

8. Diagnose High CPU

# Check system load
uptime

# Find CPU-heavy processes
top -b -n 1 | head -15

# Check container CPU usage
podman stats --no-stream --format '{{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}'

# Common causes:
# - Bitcoin IBD (initial block download): normal, takes days
# - Container crash loops: check `podman ps -a --filter status=exited`
# - mempool-electrs indexing: normal after Bitcoin sync

9. Diagnose High Memory

# Check memory
free -h

# Check swap usage
swapon --show

# Per-container memory
podman stats --no-stream --format '{{.Name}}\t{{.MemUsage}}\t{{.MemPerc}}'

# Check for OOM kills
dmesg --level=err,crit | grep -i oom

# Add swap if missing
sudo fallocate -l 4G /swapfile
sudo chmod 600 /swapfile
sudo mkswap /swapfile
sudo swapon /swapfile
echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab

10. Diagnose Disk Space

# Disk usage overview
df -h /

# Find large directories
sudo du -h --max-depth=2 /var/lib/archipelago/ | sort -rh | head -20

# Container image sizes
podman images --format '{{.Repository}}:{{.Tag}}\t{{.Size}}'

# Clean unused images
podman image prune -a

# Clean old journal logs
sudo journalctl --vacuum-size=500M

11. Check Tor Connectivity

# Tor service status
sudo systemctl status tor

# Get onion address
sudo cat /var/lib/tor/hidden_service/hostname

# Test self-connection via Tor
curl --socks5-hostname 127.0.0.1:9050 http://$(sudo cat /var/lib/tor/hidden_service/hostname)/health

# Test cross-node Tor
curl --socks5-hostname 127.0.0.1:9050 http://{peer-onion}/health

12. Check DWN Sync

# DWN status (via RPC, needs auth)
curl -s -X POST -H "Content-Type: application/json" \
  -H "Cookie: session={session}; csrf_token={csrf}" \
  -H "X-CSRF-Token: {csrf}" \
  -d '{"method":"dwn.status"}' \
  http://localhost:5678/rpc/v1

# Trigger manual sync
curl -s -X POST -H "Content-Type: application/json" \
  -H "Cookie: session={session}; csrf_token={csrf}" \
  -H "X-CSRF-Token: {csrf}" \
  -d '{"method":"dwn.sync"}' \
  http://localhost:5678/rpc/v1

# Check message count
ls /var/lib/archipelago/dwn/messages/ | wc -l

13. Restart Services

# Restart backend only
sudo systemctl restart archipelago

# Restart nginx
sudo systemctl restart nginx

# Restart Tor
sudo systemctl restart tor

# Full service restart (backend + nginx)
sudo systemctl restart archipelago nginx

# Reboot (containers auto-recover via restart policy + health monitor)
sudo reboot

14. View Logs

# Backend logs
sudo journalctl -u archipelago --no-pager -n 100

# Follow logs in real time
sudo journalctl -u archipelago -f

# Nginx access log
sudo tail -f /var/log/nginx/access.log

# Nginx error log
sudo tail -f /var/log/nginx/error.log

# Container logs
podman logs {container-name} --tail 50 -f

15. Network Diagnostics

# Check listening ports
sudo ss -tlnp

# Check firewall rules
sudo ufw status verbose

# Required ports:
#   22  - SSH
#   80  - HTTP (nginx)
#   443 - HTTPS (nginx)
#   5678 - Backend API (localhost only, proxied by nginx)
#   8332 - Bitcoin RPC (container network only)
#   9050 - Tor SOCKS proxy (localhost only)

# If ports are blocked after reboot, re-add UFW rules:
sudo ufw allow ssh
sudo ufw allow 80/tcp
sudo ufw allow 443/tcp
sudo ufw allow from 10.88.0.0/16   # Podman container subnet
sudo ufw allow from 10.89.0.0/16   # Podman container subnet

16. Emergency: Node Unreachable After Boot

If a node responds to ping but SSH/HTTP are down:

  1. Check UFW: After reboot, UFW may block all ports

    # If you have console access:
    sudo ufw allow ssh
    sudo ufw allow 80/tcp
    sudo ufw allow 443/tcp
    sudo ufw reload
    
  2. Check services: SSH or nginx may not have started

    sudo systemctl start ssh
    sudo systemctl start nginx
    sudo systemctl start archipelago
    
  3. Check disk: If root filesystem is full, services won't start

    df -h /
    sudo journalctl --vacuum-size=200M
    podman image prune -a
    

17. Run Cross-Node Tests

# Full test suite (all features, 10 iterations)
./scripts/test-cross-node.sh --iterations 10

# Skip reboot tests
./scripts/test-cross-node.sh --iterations 10 --skip-reboot

# Reboot survival test (single node)
./scripts/test-reboot-survival.sh --node 192.168.1.228 --iterations 3