From 3011aa77daeacfaf89a1991ce6432aedf9de6cc7 Mon Sep 17 00:00:00 2001
From: Anto01 <antoine.letarte@gmail.com>
Date: Thu, 16 Apr 2026 16:29:20 -0400
Subject: [PATCH] fix: retry + stderr capture + pacing in triage/extractor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both scripts now:
- Make up to 3 attempts (2 retries) with 2s/4s exponential backoff
  on transient failures (rate limits, capacity spikes)
- Capture claude CLI stderr in the error message (200 char cap)
  instead of just the exit code — diagnostics are actually useful now
- Sleep 0.5s between calls to avoid bursting the backend

Context: last batch run hit 100% failure in triage (every call
exit 1) after 40% failure in extraction. claude CLI worked fine
immediately after, so the failures were capacity/rate-limit
transients. With retry + pacing these batches should complete
cleanly now. 439 candidates are already in the queue waiting
for triage.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 scripts/auto_triage.py            | 47 ++++++++++++++++++++++---------
 scripts/batch_llm_extract_live.py | 46 ++++++++++++++++++++----------
 2 files changed, 64 insertions(+), 29 deletions(-)

diff --git a/scripts/auto_triage.py b/scripts/auto_triage.py
index 23dd4ec..7635a42 100644
--- a/scripts/auto_triage.py
+++ b/scripts/auto_triage.py
@@ -29,6 +29,7 @@ import os
 import shutil
 import subprocess
 import sys
+import time
 import tempfile
 import urllib.error
 import urllib.parse
@@ -131,22 +132,33 @@ def triage_one(candidate, active_memories, model, timeout_s):
         user_message,
     ]
 
-    try:
-        completed = subprocess.run(
-            args, capture_output=True, text=True,
-            timeout=timeout_s, cwd=get_sandbox_cwd(),
-            encoding="utf-8", errors="replace",
-        )
-    except subprocess.TimeoutExpired:
-        return {"verdict": "needs_human", "confidence": 0.0, "reason": "triage model timed out"}
-    except Exception as exc:
-        return {"verdict": "needs_human", "confidence": 0.0, "reason": f"subprocess error: {exc}"}
+    # Retry with exponential backoff on transient failures (rate limits etc)
+    last_error = ""
+    for attempt in range(3):
+        if attempt > 0:
+            time.sleep(2 ** attempt)  # 2s, 4s
+        try:
+            completed = subprocess.run(
+                args, capture_output=True, text=True,
+                timeout=timeout_s, cwd=get_sandbox_cwd(),
+                encoding="utf-8", errors="replace",
+            )
+        except subprocess.TimeoutExpired:
+            last_error = "triage model timed out"
+            continue
+        except Exception as exc:
+            last_error = f"subprocess error: {exc}"
+            continue
 
-    if completed.returncode != 0:
-        return {"verdict": "needs_human", "confidence": 0.0, "reason": f"claude exit {completed.returncode}"}
+        if completed.returncode == 0:
+            raw = (completed.stdout or "").strip()
+            return parse_verdict(raw)
 
-    raw = (completed.stdout or "").strip()
-    return parse_verdict(raw)
+        # Capture stderr for diagnostics (truncate to 200 chars)
+        stderr = (completed.stderr or "").strip()[:200]
+        last_error = f"claude exit {completed.returncode}: {stderr}" if stderr else f"claude exit {completed.returncode}"
+
+    return {"verdict": "needs_human", "confidence": 0.0, "reason": last_error}
 
 
 def parse_verdict(raw):
@@ -213,6 +225,13 @@ def main():
     promoted = rejected = needs_human = errors = 0
 
     for i, cand in enumerate(candidates, 1):
+        # Light rate-limit pacing: 0.5s between triage calls so a burst
+        # doesn't overwhelm the claude CLI's backend. With ~60s per call
+        # this is negligible overhead but avoids the "all-failed" pattern
+        # we saw on large batches.
+        if i > 1:
+            time.sleep(0.5)
+
         project = cand.get("project") or ""
         if project not in active_cache:
             active_cache[project] = fetch_active_memories_for_project(args.base_url, project)
diff --git a/scripts/batch_llm_extract_live.py b/scripts/batch_llm_extract_live.py
index 9680faa..d108824 100644
--- a/scripts/batch_llm_extract_live.py
+++ b/scripts/batch_llm_extract_live.py
@@ -126,22 +126,34 @@ def extract_one(prompt, response, project, model, timeout_s):
         user_message,
     ]
 
-    try:
-        completed = subprocess.run(
-            args, capture_output=True, text=True,
-            timeout=timeout_s, cwd=get_sandbox_cwd(),
-            encoding="utf-8", errors="replace",
-        )
-    except subprocess.TimeoutExpired:
-        return [], "timeout"
-    except Exception as exc:
-        return [], f"subprocess_error: {exc}"
+    # Retry with exponential backoff on transient failures (rate limits etc)
+    import time as _time
+    last_error = ""
+    for attempt in range(3):
+        if attempt > 0:
+            _time.sleep(2 ** attempt)  # 2s, 4s
+        try:
+            completed = subprocess.run(
+                args, capture_output=True, text=True,
+                timeout=timeout_s, cwd=get_sandbox_cwd(),
+                encoding="utf-8", errors="replace",
+            )
+        except subprocess.TimeoutExpired:
+            last_error = "timeout"
+            continue
+        except Exception as exc:
+            last_error = f"subprocess_error: {exc}"
+            continue
 
-    if completed.returncode != 0:
-        return [], f"exit_{completed.returncode}"
+        if completed.returncode == 0:
+            raw = (completed.stdout or "").strip()
+            return parse_candidates(raw, project), ""
 
-    raw = (completed.stdout or "").strip()
-    return parse_candidates(raw, project), ""
+        # Capture stderr for diagnostics (truncate to 200 chars)
+        stderr = (completed.stderr or "").strip()[:200]
+        last_error = f"exit_{completed.returncode}: {stderr}" if stderr else f"exit_{completed.returncode}"
+
+    return [], last_error
 
 
 def parse_candidates(raw, interaction_project):
@@ -192,10 +204,14 @@ def main():
     total_persisted = 0
     errors = 0
 
-    for summary in interaction_summaries:
+    import time as _time
+    for ix, summary in enumerate(interaction_summaries):
         resp_chars = summary.get("response_chars", 0) or 0
         if resp_chars < 50:
             continue
+        # Light pacing between calls to avoid bursting the claude CLI
+        if ix > 0:
+            _time.sleep(0.5)
         iid = summary["id"]
         try:
             raw = api_get(