diff --git a/scripts/auto_triage.py b/scripts/auto_triage.py index 23dd4ec..7635a42 100644 --- a/scripts/auto_triage.py +++ b/scripts/auto_triage.py @@ -29,6 +29,7 @@ import os import shutil import subprocess import sys +import time import tempfile import urllib.error import urllib.parse @@ -131,22 +132,33 @@ def triage_one(candidate, active_memories, model, timeout_s): user_message, ] - try: - completed = subprocess.run( - args, capture_output=True, text=True, - timeout=timeout_s, cwd=get_sandbox_cwd(), - encoding="utf-8", errors="replace", - ) - except subprocess.TimeoutExpired: - return {"verdict": "needs_human", "confidence": 0.0, "reason": "triage model timed out"} - except Exception as exc: - return {"verdict": "needs_human", "confidence": 0.0, "reason": f"subprocess error: {exc}"} + # Retry with exponential backoff on transient failures (rate limits etc) + last_error = "" + for attempt in range(3): + if attempt > 0: + time.sleep(2 ** attempt) # 2s, 4s + try: + completed = subprocess.run( + args, capture_output=True, text=True, + timeout=timeout_s, cwd=get_sandbox_cwd(), + encoding="utf-8", errors="replace", + ) + except subprocess.TimeoutExpired: + last_error = "triage model timed out" + continue + except Exception as exc: + last_error = f"subprocess error: {exc}" + continue - if completed.returncode != 0: - return {"verdict": "needs_human", "confidence": 0.0, "reason": f"claude exit {completed.returncode}"} + if completed.returncode == 0: + raw = (completed.stdout or "").strip() + return parse_verdict(raw) - raw = (completed.stdout or "").strip() - return parse_verdict(raw) + # Capture stderr for diagnostics (truncate to 200 chars) + stderr = (completed.stderr or "").strip()[:200] + last_error = f"claude exit {completed.returncode}: {stderr}" if stderr else f"claude exit {completed.returncode}" + + return {"verdict": "needs_human", "confidence": 0.0, "reason": last_error} def parse_verdict(raw): @@ -213,6 +225,13 @@ def main(): promoted = rejected = needs_human = errors = 0 for i, cand in enumerate(candidates, 1): + # Light rate-limit pacing: 0.5s between triage calls so a burst + # doesn't overwhelm the claude CLI's backend. With ~60s per call + # this is negligible overhead but avoids the "all-failed" pattern + # we saw on large batches. + if i > 1: + time.sleep(0.5) + project = cand.get("project") or "" if project not in active_cache: active_cache[project] = fetch_active_memories_for_project(args.base_url, project) diff --git a/scripts/batch_llm_extract_live.py b/scripts/batch_llm_extract_live.py index 9680faa..d108824 100644 --- a/scripts/batch_llm_extract_live.py +++ b/scripts/batch_llm_extract_live.py @@ -126,22 +126,34 @@ def extract_one(prompt, response, project, model, timeout_s): user_message, ] - try: - completed = subprocess.run( - args, capture_output=True, text=True, - timeout=timeout_s, cwd=get_sandbox_cwd(), - encoding="utf-8", errors="replace", - ) - except subprocess.TimeoutExpired: - return [], "timeout" - except Exception as exc: - return [], f"subprocess_error: {exc}" + # Retry with exponential backoff on transient failures (rate limits etc) + import time as _time + last_error = "" + for attempt in range(3): + if attempt > 0: + _time.sleep(2 ** attempt) # 2s, 4s + try: + completed = subprocess.run( + args, capture_output=True, text=True, + timeout=timeout_s, cwd=get_sandbox_cwd(), + encoding="utf-8", errors="replace", + ) + except subprocess.TimeoutExpired: + last_error = "timeout" + continue + except Exception as exc: + last_error = f"subprocess_error: {exc}" + continue - if completed.returncode != 0: - return [], f"exit_{completed.returncode}" + if completed.returncode == 0: + raw = (completed.stdout or "").strip() + return parse_candidates(raw, project), "" - raw = (completed.stdout or "").strip() - return parse_candidates(raw, project), "" + # Capture stderr for diagnostics (truncate to 200 chars) + stderr = (completed.stderr or "").strip()[:200] + last_error = f"exit_{completed.returncode}: {stderr}" if stderr else f"exit_{completed.returncode}" + + return [], last_error def parse_candidates(raw, interaction_project): @@ -192,10 +204,14 @@ def main(): total_persisted = 0 errors = 0 - for summary in interaction_summaries: + import time as _time + for ix, summary in enumerate(interaction_summaries): resp_chars = summary.get("response_chars", 0) or 0 if resp_chars < 50: continue + # Light pacing between calls to avoid bursting the claude CLI + if ix > 0: + _time.sleep(0.5) iid = summary["id"] try: raw = api_get(