From 3011aa77daeacfaf89a1991ce6432aedf9de6cc7 Mon Sep 17 00:00:00 2001 From: Anto01 Date: Thu, 16 Apr 2026 16:29:20 -0400 Subject: [PATCH] fix: retry + stderr capture + pacing in triage/extractor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both scripts now: - Make up to 3 attempts (i.e. 2 retries) with 2s/4s exponential backoff after any failure (timeout, subprocess error, non-zero exit), so transient failures (rate limits, capacity spikes) can recover - Capture claude CLI stderr in the error message (200 char cap) instead of just the exit code — so diagnostics are actually useful now - Sleep 0.5s between calls to avoid bursting the backend Context: the last batch run hit 100% failure in triage (every call exit 1) after 40% failure in extraction. The claude CLI worked fine immediately after, so the failures were capacity/rate-limit transients. With retry + pacing these batches should complete cleanly now. 439 candidates are already in the queue waiting for triage. Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/auto_triage.py | 47 ++++++++++++++++++++++--------- scripts/batch_llm_extract_live.py | 46 ++++++++++++++++++++---------- 2 files changed, 64 insertions(+), 29 deletions(-) diff --git a/scripts/auto_triage.py b/scripts/auto_triage.py index 23dd4ec..7635a42 100644 --- a/scripts/auto_triage.py +++ b/scripts/auto_triage.py @@ -29,6 +29,7 @@ import os import shutil import subprocess import sys +import time import tempfile import urllib.error import urllib.parse @@ -131,22 +132,33 @@ def triage_one(candidate, active_memories, model, timeout_s): user_message, ] - try: - completed = subprocess.run( - args, capture_output=True, text=True, - timeout=timeout_s, cwd=get_sandbox_cwd(), - encoding="utf-8", errors="replace", - ) - except subprocess.TimeoutExpired: - return {"verdict": "needs_human", "confidence": 0.0, "reason": "triage model timed out"} - except Exception as exc: - return {"verdict": "needs_human", "confidence": 0.0, "reason": f"subprocess error: {exc}"} + # Retry with exponential backoff on transient failures 
(rate limits etc) + last_error = "" + for attempt in range(3): + if attempt > 0: + time.sleep(2 ** attempt) # 2s, 4s + try: + completed = subprocess.run( + args, capture_output=True, text=True, + timeout=timeout_s, cwd=get_sandbox_cwd(), + encoding="utf-8", errors="replace", + ) + except subprocess.TimeoutExpired: + last_error = "triage model timed out" + continue + except Exception as exc: + last_error = f"subprocess error: {exc}" + continue - if completed.returncode != 0: - return {"verdict": "needs_human", "confidence": 0.0, "reason": f"claude exit {completed.returncode}"} + if completed.returncode == 0: + raw = (completed.stdout or "").strip() + return parse_verdict(raw) - raw = (completed.stdout or "").strip() - return parse_verdict(raw) + # Capture stderr for diagnostics (truncate to 200 chars) + stderr = (completed.stderr or "").strip()[:200] + last_error = f"claude exit {completed.returncode}: {stderr}" if stderr else f"claude exit {completed.returncode}" + + return {"verdict": "needs_human", "confidence": 0.0, "reason": last_error} def parse_verdict(raw): @@ -213,6 +225,13 @@ def main(): promoted = rejected = needs_human = errors = 0 for i, cand in enumerate(candidates, 1): + # Light rate-limit pacing: 0.5s between triage calls so a burst + # doesn't overwhelm the claude CLI's backend. With ~60s per call + # this is negligible overhead but avoids the "all-failed" pattern + # we saw on large batches. 
+ if i > 1: + time.sleep(0.5) + project = cand.get("project") or "" if project not in active_cache: active_cache[project] = fetch_active_memories_for_project(args.base_url, project) diff --git a/scripts/batch_llm_extract_live.py b/scripts/batch_llm_extract_live.py index 9680faa..d108824 100644 --- a/scripts/batch_llm_extract_live.py +++ b/scripts/batch_llm_extract_live.py @@ -126,22 +126,34 @@ def extract_one(prompt, response, project, model, timeout_s): user_message, ] - try: - completed = subprocess.run( - args, capture_output=True, text=True, - timeout=timeout_s, cwd=get_sandbox_cwd(), - encoding="utf-8", errors="replace", - ) - except subprocess.TimeoutExpired: - return [], "timeout" - except Exception as exc: - return [], f"subprocess_error: {exc}" + # Retry with exponential backoff on transient failures (rate limits etc) + import time as _time + last_error = "" + for attempt in range(3): + if attempt > 0: + _time.sleep(2 ** attempt) # 2s, 4s + try: + completed = subprocess.run( + args, capture_output=True, text=True, + timeout=timeout_s, cwd=get_sandbox_cwd(), + encoding="utf-8", errors="replace", + ) + except subprocess.TimeoutExpired: + last_error = "timeout" + continue + except Exception as exc: + last_error = f"subprocess_error: {exc}" + continue - if completed.returncode != 0: - return [], f"exit_{completed.returncode}" + if completed.returncode == 0: + raw = (completed.stdout or "").strip() + return parse_candidates(raw, project), "" - raw = (completed.stdout or "").strip() - return parse_candidates(raw, project), "" + # Capture stderr for diagnostics (truncate to 200 chars) + stderr = (completed.stderr or "").strip()[:200] + last_error = f"exit_{completed.returncode}: {stderr}" if stderr else f"exit_{completed.returncode}" + + return [], last_error def parse_candidates(raw, interaction_project): @@ -192,10 +204,14 @@ def main(): total_persisted = 0 errors = 0 - for summary in interaction_summaries: + import time as _time + for ix, summary in 
enumerate(interaction_summaries): resp_chars = summary.get("response_chars", 0) or 0 if resp_chars < 50: continue + # Light pacing between calls to avoid bursting the claude CLI + if ix > 0: + _time.sleep(0.5) iid = summary["id"] try: raw = api_get(