Files
Atomizer/hq/scripts/discord-watchdog.sh

31 lines
1.2 KiB
Bash
Executable File

#!/usr/bin/env bash
# Discord Watchdog — detects reconnect storms and auto-restarts affected agents
# Run via cron every 5 minutes
# --- Configuration ---------------------------------------------------------
# Agent names to watch; each one is substituted into the
# "openclaw-atomizer@<agent>" systemd user unit name.
AGENTS=(
  manager
  tech-lead
  secretary
  auditor
  webster
  optimizer
  study-builder
  nx-expert
)

# An agent is restarted once its disconnect count within WINDOW reaches this.
THRESHOLD=10

# Look-back period, passed verbatim to `journalctl --since`.
WINDOW="5 min ago"

# Destination of the watchdog's own log; make sure its directory exists.
LOG_FILE="/home/papa/atomizer/logs/watchdog.log"
mkdir -p "${LOG_FILE%/*}"
# Append one line to $LOG_FILE: a UTC timestamp (YYYY-MM-DD HH:MM:SS), a
# space, then all arguments joined into a single message ("$*").
log() {
  local stamp
  stamp=$(date -u '+%Y-%m-%d %H:%M:%S')
  printf '%s %s\n' "$stamp" "$*" >> "$LOG_FILE"
}
#######################################
# Print how many disconnect-like lines a user unit wrote to the journal.
# Arguments: $1 - systemd user unit name
#            $2 - time expression handed to `journalctl --since`
# Outputs:   one non-negative integer on stdout
#######################################
count_disconnects() {
  local unit=$1 since=$2 hits
  # grep -c prints "0" AND exits 1 when nothing matches. Its status is
  # therefore discarded; appending a fallback (`|| echo 0`) would yield the
  # two-line value "0\n0" and break the numeric comparison in the caller.
  # journalctl's stderr is silenced, so if it fails grep simply sees no input.
  hits=$(journalctl --user -u "$unit" --since "$since" --no-pager 2>/dev/null \
    | grep -cE 'discord.*close|ws close|code 100[56]') || true
  # Fall back to 0 if grep produced nothing usable (e.g. grep itself failed).
  [[ "$hits" =~ ^[0-9]+$ ]] || hits=0
  printf '%s\n' "$hits"
}

#######################################
# Restart one agent's unit if it is in a reconnect storm, then log whether
# the unit reports active again.
# Globals:   THRESHOLD, WINDOW (read); log (called)
# Arguments: $1 - agent name (instance part of openclaw-atomizer@<agent>)
#######################################
check_agent() {
  local agent=$1
  local unit="openclaw-atomizer@${agent}"
  local count
  # Count close/disconnect events in the window
  count=$(count_disconnects "$unit" "$WINDOW")
  if [ "$count" -ge "$THRESHOLD" ]; then
    log "STORM DETECTED: ${agent} had ${count} disconnect events in 5min — restarting"
    systemctl --user restart "$unit"
    # Short pause before probing the unit state.
    sleep 2
    # Verify it came back
    if systemctl --user is-active "$unit" >/dev/null 2>&1; then
      log "RESTARTED: ${agent} is back up"
    else
      log "FAILED: ${agent} did not restart cleanly"
    fi
  fi
}

for agent in "${AGENTS[@]}"; do
  check_agent "$agent"
done