fix: retry + stderr capture + pacing in triage/extractor
Both scripts now:

- Make up to 3 attempts per call, sleeping 2s and then 4s between attempts (exponential backoff), so transient failures (rate limits, capacity spikes) get retried
- Capture the claude CLI's stderr in the error message (capped at 200 chars) instead of reporting just the exit code, so the diagnostics are actually useful now
- Sleep 0.5s between calls to avoid bursting the backend

Context: the last batch run hit 100% failure in triage (every call exited with code 1) after 40% failure in extraction. The claude CLI worked fine immediately afterwards, so the failures were capacity/rate-limit transients. With retry + pacing, these batches should now complete cleanly. 439 candidates are already in the queue waiting for triage.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -29,6 +29,7 @@ import os
|
|||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
import tempfile
|
import tempfile
|
||||||
import urllib.error
|
import urllib.error
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
@@ -131,22 +132,33 @@ def triage_one(candidate, active_memories, model, timeout_s):
|
|||||||
user_message,
|
user_message,
|
||||||
]
|
]
|
||||||
|
|
||||||
try:
|
# Retry with exponential backoff on transient failures (rate limits etc)
|
||||||
completed = subprocess.run(
|
last_error = ""
|
||||||
args, capture_output=True, text=True,
|
for attempt in range(3):
|
||||||
timeout=timeout_s, cwd=get_sandbox_cwd(),
|
if attempt > 0:
|
||||||
encoding="utf-8", errors="replace",
|
time.sleep(2 ** attempt) # 2s, 4s
|
||||||
)
|
try:
|
||||||
except subprocess.TimeoutExpired:
|
completed = subprocess.run(
|
||||||
return {"verdict": "needs_human", "confidence": 0.0, "reason": "triage model timed out"}
|
args, capture_output=True, text=True,
|
||||||
except Exception as exc:
|
timeout=timeout_s, cwd=get_sandbox_cwd(),
|
||||||
return {"verdict": "needs_human", "confidence": 0.0, "reason": f"subprocess error: {exc}"}
|
encoding="utf-8", errors="replace",
|
||||||
|
)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
last_error = "triage model timed out"
|
||||||
|
continue
|
||||||
|
except Exception as exc:
|
||||||
|
last_error = f"subprocess error: {exc}"
|
||||||
|
continue
|
||||||
|
|
||||||
if completed.returncode != 0:
|
if completed.returncode == 0:
|
||||||
return {"verdict": "needs_human", "confidence": 0.0, "reason": f"claude exit {completed.returncode}"}
|
raw = (completed.stdout or "").strip()
|
||||||
|
return parse_verdict(raw)
|
||||||
|
|
||||||
raw = (completed.stdout or "").strip()
|
# Capture stderr for diagnostics (truncate to 200 chars)
|
||||||
return parse_verdict(raw)
|
stderr = (completed.stderr or "").strip()[:200]
|
||||||
|
last_error = f"claude exit {completed.returncode}: {stderr}" if stderr else f"claude exit {completed.returncode}"
|
||||||
|
|
||||||
|
return {"verdict": "needs_human", "confidence": 0.0, "reason": last_error}
|
||||||
|
|
||||||
|
|
||||||
def parse_verdict(raw):
|
def parse_verdict(raw):
|
||||||
@@ -213,6 +225,13 @@ def main():
|
|||||||
promoted = rejected = needs_human = errors = 0
|
promoted = rejected = needs_human = errors = 0
|
||||||
|
|
||||||
for i, cand in enumerate(candidates, 1):
|
for i, cand in enumerate(candidates, 1):
|
||||||
|
# Light rate-limit pacing: 0.5s between triage calls so a burst
|
||||||
|
# doesn't overwhelm the claude CLI's backend. With ~60s per call
|
||||||
|
# this is negligible overhead but avoids the "all-failed" pattern
|
||||||
|
# we saw on large batches.
|
||||||
|
if i > 1:
|
||||||
|
time.sleep(0.5)
|
||||||
|
|
||||||
project = cand.get("project") or ""
|
project = cand.get("project") or ""
|
||||||
if project not in active_cache:
|
if project not in active_cache:
|
||||||
active_cache[project] = fetch_active_memories_for_project(args.base_url, project)
|
active_cache[project] = fetch_active_memories_for_project(args.base_url, project)
|
||||||
|
|||||||
@@ -126,22 +126,34 @@ def extract_one(prompt, response, project, model, timeout_s):
|
|||||||
user_message,
|
user_message,
|
||||||
]
|
]
|
||||||
|
|
||||||
try:
|
# Retry with exponential backoff on transient failures (rate limits etc)
|
||||||
completed = subprocess.run(
|
import time as _time
|
||||||
args, capture_output=True, text=True,
|
last_error = ""
|
||||||
timeout=timeout_s, cwd=get_sandbox_cwd(),
|
for attempt in range(3):
|
||||||
encoding="utf-8", errors="replace",
|
if attempt > 0:
|
||||||
)
|
_time.sleep(2 ** attempt) # 2s, 4s
|
||||||
except subprocess.TimeoutExpired:
|
try:
|
||||||
return [], "timeout"
|
completed = subprocess.run(
|
||||||
except Exception as exc:
|
args, capture_output=True, text=True,
|
||||||
return [], f"subprocess_error: {exc}"
|
timeout=timeout_s, cwd=get_sandbox_cwd(),
|
||||||
|
encoding="utf-8", errors="replace",
|
||||||
|
)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
last_error = "timeout"
|
||||||
|
continue
|
||||||
|
except Exception as exc:
|
||||||
|
last_error = f"subprocess_error: {exc}"
|
||||||
|
continue
|
||||||
|
|
||||||
if completed.returncode != 0:
|
if completed.returncode == 0:
|
||||||
return [], f"exit_{completed.returncode}"
|
raw = (completed.stdout or "").strip()
|
||||||
|
return parse_candidates(raw, project), ""
|
||||||
|
|
||||||
raw = (completed.stdout or "").strip()
|
# Capture stderr for diagnostics (truncate to 200 chars)
|
||||||
return parse_candidates(raw, project), ""
|
stderr = (completed.stderr or "").strip()[:200]
|
||||||
|
last_error = f"exit_{completed.returncode}: {stderr}" if stderr else f"exit_{completed.returncode}"
|
||||||
|
|
||||||
|
return [], last_error
|
||||||
|
|
||||||
|
|
||||||
def parse_candidates(raw, interaction_project):
|
def parse_candidates(raw, interaction_project):
|
||||||
@@ -192,10 +204,14 @@ def main():
|
|||||||
total_persisted = 0
|
total_persisted = 0
|
||||||
errors = 0
|
errors = 0
|
||||||
|
|
||||||
for summary in interaction_summaries:
|
import time as _time
|
||||||
|
for ix, summary in enumerate(interaction_summaries):
|
||||||
resp_chars = summary.get("response_chars", 0) or 0
|
resp_chars = summary.get("response_chars", 0) or 0
|
||||||
if resp_chars < 50:
|
if resp_chars < 50:
|
||||||
continue
|
continue
|
||||||
|
# Light pacing between calls to avoid bursting the claude CLI
|
||||||
|
if ix > 0:
|
||||||
|
_time.sleep(0.5)
|
||||||
iid = summary["id"]
|
iid = summary["id"]
|
||||||
try:
|
try:
|
||||||
raw = api_get(
|
raw = api_get(
|
||||||
|
|||||||
Reference in New Issue
Block a user