#!/usr/bin/env bash
# Discord Watchdog — detects reconnect storms and auto-restarts affected agents.
# Run via cron every 5 minutes.
#
# For every agent in AGENTS, the journal of its user unit
# "openclaw-atomizer@<agent>" is scanned over WINDOW for lines that look like
# websocket disconnects. When the number of such lines reaches THRESHOLD, the
# unit is restarted and the outcome is appended to LOG_FILE.

set -u

readonly AGENTS=(manager tech-lead secretary auditor webster optimizer study-builder nx-expert)
readonly THRESHOLD=10        # restart once at least this many disconnects occur in WINDOW
readonly WINDOW="5 min ago"  # passed verbatim to `journalctl --since`
readonly LOG_FILE="/home/papa/atomizer/logs/watchdog.log"

# `systemctl --user` needs the per-user runtime directory to reach the user
# manager; cron environments typically do not export it. Only fill it in when
# the caller has not set it. NOTE(review): assumes the standard /run/user/<uid>
# layout — confirm on the target host.
export XDG_RUNTIME_DIR="${XDG_RUNTIME_DIR:-/run/user/$(id -u)}"

#######################################
# Append a UTC-timestamped message to LOG_FILE.
# Falls back to stderr when the log file cannot be written, so the message
# is not silently lost.
# Globals:   LOG_FILE (read)
# Arguments: message words, joined with spaces
#######################################
log() {
  local line
  line="$(date -u '+%Y-%m-%d %H:%M:%S') $*"
  { printf '%s\n' "$line" >> "$LOG_FILE"; } 2>/dev/null \
    || printf '%s\n' "$line" >&2
}

#######################################
# Count the lines on stdin that look like a disconnect event.
# Outputs: exactly one integer on stdout (0 for empty or non-matching input)
#######################################
count_disconnect_events() {
  local matches
  # `grep -c` already prints 0 when nothing matches, but exits 1 in that case.
  # Ignore the status rather than echoing a second 0, which would yield the
  # two-line value "0\n0" and break the numeric comparison in the caller.
  matches=$(grep -c \
    -e 'discord.*close' \
    -e 'ws close' \
    -e 'code 100[56]') || true
  printf '%s\n' "${matches:-0}"
}

#######################################
# Inspect one agent and restart its unit when a reconnect storm is detected.
# Globals:   WINDOW, THRESHOLD (read)
# Arguments: $1 - agent name (instance part of the openclaw-atomizer@ unit)
#######################################
check_agent() {
  local agent=$1
  local unit="openclaw-atomizer@${agent}"
  local count

  # journalctl errors (e.g. unknown unit) are deliberately discarded; they
  # result in empty input and therefore a count of 0.
  count=$(journalctl --user -u "$unit" --since "$WINDOW" --no-pager 2>/dev/null \
    | count_disconnect_events)

  if (( count < THRESHOLD )); then
    return 0
  fi

  log "STORM DETECTED: ${agent} had ${count} disconnect events in 5min — restarting"
  systemctl --user restart "$unit"
  sleep 2  # give the unit a moment to settle before probing its state

  # Verify it came back
  if systemctl --user is-active "$unit" >/dev/null 2>&1; then
    log "RESTARTED: ${agent} is back up"
  else
    log "FAILED: ${agent} did not restart cleanly"
  fi
}

#######################################
# Entry point: make sure the log directory exists, then check every agent.
# Globals: AGENTS, LOG_FILE (read)
#######################################
main() {
  local agent
  mkdir -p "$(dirname "$LOG_FILE")" \
    || printf 'watchdog: cannot create log directory for %s\n' "$LOG_FILE" >&2

  for agent in "${AGENTS[@]}"; do
    check_agent "$agent"
  done
}

main