fix: retry + stderr capture + pacing in triage/extractor

Both scripts now:
- Retry up to 3x with 2s/4s exponential backoff on transient
  failures (rate limits, capacity spikes)
- Capture claude CLI stderr in the error message (200-char cap)
  instead of just the exit code, so diagnostics are actually useful
- Sleep 0.5s between calls to avoid bursting the backend

Context: the last batch run hit 100% failure in triage (every call
exited with code 1) after a 40% failure rate in extraction. The claude
CLI worked fine immediately afterward, so the failures were
capacity/rate-limit transients. With retry + pacing, these batches
should now complete cleanly. 439 candidates are already in the queue
awaiting triage.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-16 16:29:20 -04:00
parent ba36a28453
commit 3011aa77da
2 changed files with 64 additions and 29 deletions

View File

@@ -29,6 +29,7 @@ import os
import shutil import shutil
import subprocess import subprocess
import sys import sys
import time
import tempfile import tempfile
import urllib.error import urllib.error
import urllib.parse import urllib.parse
@@ -131,6 +132,11 @@ def triage_one(candidate, active_memories, model, timeout_s):
user_message, user_message,
] ]
# Retry with exponential backoff on transient failures (rate limits etc)
last_error = ""
for attempt in range(3):
if attempt > 0:
time.sleep(2 ** attempt) # 2s, 4s
try: try:
completed = subprocess.run( completed = subprocess.run(
args, capture_output=True, text=True, args, capture_output=True, text=True,
@@ -138,16 +144,22 @@ def triage_one(candidate, active_memories, model, timeout_s):
encoding="utf-8", errors="replace", encoding="utf-8", errors="replace",
) )
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
return {"verdict": "needs_human", "confidence": 0.0, "reason": "triage model timed out"} last_error = "triage model timed out"
continue
except Exception as exc: except Exception as exc:
return {"verdict": "needs_human", "confidence": 0.0, "reason": f"subprocess error: {exc}"} last_error = f"subprocess error: {exc}"
continue
if completed.returncode != 0:
return {"verdict": "needs_human", "confidence": 0.0, "reason": f"claude exit {completed.returncode}"}
if completed.returncode == 0:
raw = (completed.stdout or "").strip() raw = (completed.stdout or "").strip()
return parse_verdict(raw) return parse_verdict(raw)
# Capture stderr for diagnostics (truncate to 200 chars)
stderr = (completed.stderr or "").strip()[:200]
last_error = f"claude exit {completed.returncode}: {stderr}" if stderr else f"claude exit {completed.returncode}"
return {"verdict": "needs_human", "confidence": 0.0, "reason": last_error}
def parse_verdict(raw): def parse_verdict(raw):
"""Parse the triage model's JSON verdict.""" """Parse the triage model's JSON verdict."""
@@ -213,6 +225,13 @@ def main():
promoted = rejected = needs_human = errors = 0 promoted = rejected = needs_human = errors = 0
for i, cand in enumerate(candidates, 1): for i, cand in enumerate(candidates, 1):
# Light rate-limit pacing: 0.5s between triage calls so a burst
# doesn't overwhelm the claude CLI's backend. With ~60s per call
# this is negligible overhead but avoids the "all-failed" pattern
# we saw on large batches.
if i > 1:
time.sleep(0.5)
project = cand.get("project") or "" project = cand.get("project") or ""
if project not in active_cache: if project not in active_cache:
active_cache[project] = fetch_active_memories_for_project(args.base_url, project) active_cache[project] = fetch_active_memories_for_project(args.base_url, project)

View File

@@ -126,6 +126,12 @@ def extract_one(prompt, response, project, model, timeout_s):
user_message, user_message,
] ]
# Retry with exponential backoff on transient failures (rate limits etc)
import time as _time
last_error = ""
for attempt in range(3):
if attempt > 0:
_time.sleep(2 ** attempt) # 2s, 4s
try: try:
completed = subprocess.run( completed = subprocess.run(
args, capture_output=True, text=True, args, capture_output=True, text=True,
@@ -133,16 +139,22 @@ def extract_one(prompt, response, project, model, timeout_s):
encoding="utf-8", errors="replace", encoding="utf-8", errors="replace",
) )
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
return [], "timeout" last_error = "timeout"
continue
except Exception as exc: except Exception as exc:
return [], f"subprocess_error: {exc}" last_error = f"subprocess_error: {exc}"
continue
if completed.returncode != 0:
return [], f"exit_{completed.returncode}"
if completed.returncode == 0:
raw = (completed.stdout or "").strip() raw = (completed.stdout or "").strip()
return parse_candidates(raw, project), "" return parse_candidates(raw, project), ""
# Capture stderr for diagnostics (truncate to 200 chars)
stderr = (completed.stderr or "").strip()[:200]
last_error = f"exit_{completed.returncode}: {stderr}" if stderr else f"exit_{completed.returncode}"
return [], last_error
def parse_candidates(raw, interaction_project): def parse_candidates(raw, interaction_project):
"""Parse model JSON output into candidate dicts. """Parse model JSON output into candidate dicts.
@@ -192,10 +204,14 @@ def main():
total_persisted = 0 total_persisted = 0
errors = 0 errors = 0
for summary in interaction_summaries: import time as _time
for ix, summary in enumerate(interaction_summaries):
resp_chars = summary.get("response_chars", 0) or 0 resp_chars = summary.get("response_chars", 0) or 0
if resp_chars < 50: if resp_chars < 50:
continue continue
# Light pacing between calls to avoid bursting the claude CLI
if ix > 0:
_time.sleep(0.5)
iid = summary["id"] iid = summary["id"]
try: try:
raw = api_get( raw = api_get(