#!/usr/bin/env bash
#
# Discord Watchdog — detects reconnect storms and auto-restarts affected agents.
#
# Intended to be run from cron every 5 minutes.
#
# Configuration (edit below):
#   AGENTS     instance names; each maps to the user unit
#              openclaw-atomizer@<agent>
#   THRESHOLD  number of disconnect events within WINDOW that triggers a
#              restart of the agent
#   WINDOW     look-back period, passed verbatim to `journalctl --since`
#   LOG_FILE   file the watchdog appends its own messages to

# cron starts jobs with a minimal environment. `systemctl --user` finds the
# per-user service manager through XDG_RUNTIME_DIR, so provide the usual
# location when the caller has not set one. An existing value is kept.
# NOTE(review): assumes the standard /run/user/<uid> layout — confirm on the
# target host.
export XDG_RUNTIME_DIR="${XDG_RUNTIME_DIR:-/run/user/$(id -u)}"

readonly -a AGENTS=(
  manager
  tech-lead
  secretary
  auditor
  webster
  optimizer
  study-builder
  nx-expert
)
readonly THRESHOLD=10 # max disconnects in WINDOW before restart
readonly WINDOW="5 min ago"
readonly LOG_FILE="/home/papa/atomizer/logs/watchdog.log"

# Make sure the log directory exists before the first message is written.
# Deliberately best-effort: a failure here is reported on stderr by mkdir but
# does not stop the watchdog from checking the agents.
mkdir -p -- "$(dirname -- "$LOG_FILE")"

#######################################
# Append one timestamped line to the watchdog log.
# Globals:   LOG_FILE (read) - file the line is appended to
# Arguments: $* - message words; joined by the first character of IFS
#                 (a space by default)
# Outputs:   "<YYYY-MM-DD HH:MM:SS in UTC> <message>" appended to LOG_FILE
# Returns:   status of the append (non-zero if LOG_FILE cannot be written)
#######################################
log() {
  local stamp
  stamp=$(date -u '+%Y-%m-%d %H:%M:%S')
  # printf with a fixed format writes the message verbatim, whatever it contains.
  printf '%s %s\n' "$stamp" "$*" >> "$LOG_FILE"
}

#######################################
# Count close/disconnect events a unit logged inside the look-back window.
# Globals:   WINDOW (read) - passed to `journalctl --since`
# Arguments: $1 - systemd user unit name
# Outputs:   a single integer on stdout (0 when nothing matched or the
#            journal could not be read)
# Returns:   status of the final printf
#######################################
count_disconnects() {
  local unit=$1
  local matches
  # `grep -c` always prints the count but exits 1 when that count is 0.
  # Tolerate the status here, outside the substitution, so exactly one
  # number is captured. The previous `… || echo 0` inside the substitution
  # appended a second "0" and broke the numeric comparison that follows.
  # journalctl's stderr is discarded on purpose: a unit with no journal
  # entries simply counts as 0 events.
  matches=$(journalctl --user -u "$unit" --since "$WINDOW" --no-pager 2>/dev/null \
    | grep -E -c 'discord.*close|ws close|code 100[56]') || true
  printf '%s\n' "${matches:-0}"
}

#######################################
# Restart one agent if it logged a reconnect storm, and record the outcome.
# Globals:   THRESHOLD (read) - minimum event count that triggers a restart
# Arguments: $1 - agent name (instance part of openclaw-atomizer@<agent>)
# Outputs:   writes STORM DETECTED / RESTARTED / FAILED lines through log()
# Returns:   0 when the agent is below the threshold; otherwise the status
#            of the last log call
#######################################
check_agent() {
  local agent=$1
  local unit="openclaw-atomizer@${agent}"
  local count
  count=$(count_disconnects "$unit")

  if (( count < THRESHOLD )); then
    return 0
  fi

  # NOTE(review): "5min" is hard-coded in this message and does not follow
  # WINDOW; kept as-is so existing log consumers see the same text.
  log "STORM DETECTED: ${agent} had ${count} disconnect events in 5min — restarting"
  systemctl --user restart "$unit"

  # Give the unit a moment to settle before checking its state.
  sleep 2

  # Verify it came back
  if systemctl --user is-active "$unit" >/dev/null 2>&1; then
    log "RESTARTED: ${agent} is back up"
  else
    log "FAILED: ${agent} did not restart cleanly"
  fi
}

for agent in "${AGENTS[@]}"; do
  check_agent "$agent"
done