diff --git a/deploy/dalidou/batch-extract.sh b/deploy/dalidou/batch-extract.sh
index 6a26109..fc2e3f0 100644
--- a/deploy/dalidou/batch-extract.sh
+++ b/deploy/dalidou/batch-extract.sh
@@ -31,10 +31,11 @@ log() { printf '[%s] %s\n' "$TIMESTAMP" "$*"; }
 # The Python script needs the atocore source on PYTHONPATH
 export PYTHONPATH="$APP_DIR/src:${PYTHONPATH:-}"
 
-log "=== AtoCore batch LLM extraction starting ==="
+log "=== AtoCore batch extraction + triage starting ==="
 log "URL=$ATOCORE_URL LIMIT=$LIMIT"
 
-# Run the host-side extraction script
+# Step A: Extract candidates from recent interactions
+log "Step A: LLM extraction"
 python3 "$APP_DIR/scripts/batch_llm_extract_live.py" \
   --base-url "$ATOCORE_URL" \
   --limit "$LIMIT" \
@@ -42,4 +43,12 @@ python3 "$APP_DIR/scripts/batch_llm_extract_live.py" \
   log "WARN: batch extraction failed (non-blocking)"
 }
 
-log "=== AtoCore batch LLM extraction complete ==="
+# Step B: Auto-triage candidates in the queue
+log "Step B: auto-triage"
+python3 "$APP_DIR/scripts/auto_triage.py" \
+  --base-url "$ATOCORE_URL" \
+  2>&1 || {
+  log "WARN: auto-triage failed (non-blocking)"
+}
+
+log "=== AtoCore batch extraction + triage complete ==="
diff --git a/scripts/auto_triage.py b/scripts/auto_triage.py
new file mode 100644
index 0000000..85e1261
--- /dev/null
+++ b/scripts/auto_triage.py
@@ -0,0 +1,249 @@
+"""Auto-triage: LLM second-pass over candidate memories.
+
+Fetches all status=candidate memories from the AtoCore API, asks
+a triage model (via claude -p) to classify each as promote / reject /
+needs_human, and executes the verdict via the promote/reject endpoints.
+Only needs_human candidates remain in the queue for manual review.
+
+Trust model:
+- Auto-promote: model says promote AND confidence >= 0.8 AND no
+  duplicate content in existing active memories
+- Auto-reject: model says reject
+- needs_human: everything else stays in queue
+
+Runs host-side (same as batch extraction) because it needs the
+claude CLI. Intended to be called after batch-extract.sh in the
+nightly cron, or manually.
+
+Usage:
+
+    python3 scripts/auto_triage.py --base-url http://localhost:8100
+    python3 scripts/auto_triage.py --dry-run  # preview without executing
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+import urllib.error
+import urllib.parse
+import urllib.request
+
+DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://localhost:8100")
+DEFAULT_MODEL = os.environ.get("ATOCORE_TRIAGE_MODEL", "sonnet")
+DEFAULT_TIMEOUT_S = float(os.environ.get("ATOCORE_TRIAGE_TIMEOUT_S", "60"))
+AUTO_PROMOTE_MIN_CONFIDENCE = 0.8
+
+TRIAGE_SYSTEM_PROMPT = """You are a memory triage reviewer for a personal context engine called AtoCore. You review candidate memories extracted from LLM conversations and decide whether each should be promoted to active status, rejected, or flagged for human review.
+
+You will receive:
+- The candidate memory content and type
+- A list of existing active memories for the same project (to check for duplicates)
+
+For each candidate, output exactly one JSON object:
+
+{"verdict": "promote|reject|needs_human", "confidence": 0.0-1.0, "reason": "one sentence"}
+
+Rules:
+
+1. PROMOTE when the candidate states a durable architectural fact, ratified decision, standing rule, or engineering constraint that is NOT already covered by an existing active memory. Confidence should reflect how certain you are this is worth keeping.
+
+2. REJECT when the candidate is:
+   - A stale point-in-time snapshot ("live SHA is X", "36 active memories")
+   - An implementation detail too granular to be useful as standalone context
+   - A planned-but-not-implemented feature description
+   - A duplicate or near-duplicate of an existing active memory
+   - A session observation or conversational filler
+   - A process rule that belongs in DEV-LEDGER.md or AGENTS.md, not memory
+
+3. NEEDS_HUMAN when you're genuinely unsure — the candidate might be valuable but you can't tell without domain knowledge. This should be rare (< 20% of candidates).
+
+4. Output ONLY the JSON object. No prose, no markdown, no explanation outside the reason field."""
+
+# Lazily-created scratch directory used as cwd for the claude subprocess,
+# so triage sessions never run inside the repo working tree.
+_sandbox_cwd = None
+
+
+def get_sandbox_cwd():
+    global _sandbox_cwd
+    if _sandbox_cwd is None:
+        _sandbox_cwd = tempfile.mkdtemp(prefix="ato-triage-")
+    return _sandbox_cwd
+
+
+def api_get(base_url, path, timeout=10):
+    req = urllib.request.Request(f"{base_url}{path}")
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+def api_post(base_url, path, body=None, timeout=10):
+    data = json.dumps(body or {}).encode("utf-8")
+    req = urllib.request.Request(
+        f"{base_url}{path}", method="POST",
+        headers={"Content-Type": "application/json"}, data=data,
+    )
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+def fetch_active_memories_for_project(base_url, project):
+    """Fetch active memories for dedup checking."""
+    params = "active_only=true&limit=50"
+    if project:
+        params += f"&project={urllib.parse.quote(project)}"
+    result = api_get(base_url, f"/memory?{params}")
+    return result.get("memories", [])
+
+
+def triage_one(candidate, active_memories, model, timeout_s):
+    """Ask the triage model to classify one candidate."""
+    if not shutil.which("claude"):
+        return {"verdict": "needs_human", "confidence": 0.0, "reason": "claude CLI not available"}
+
+    active_summary = "\n".join(
+        f"- [{m['memory_type']}] {m['content'][:150]}"
+        for m in active_memories[:20]
+    ) or "(no active memories for this project)"
+
+    user_message = (
+        f"CANDIDATE TO TRIAGE:\n"
+        f"  type: {candidate['memory_type']}\n"
+        f"  project: {candidate.get('project') or '(none)'}\n"
+        f"  content: {candidate['content']}\n\n"
+        f"EXISTING ACTIVE MEMORIES FOR THIS PROJECT:\n{active_summary}\n\n"
+        f"Return the JSON verdict now."
+    )
+
+    args = [
+        "claude", "-p",
+        "--model", model,
+        "--append-system-prompt", TRIAGE_SYSTEM_PROMPT,
+        "--disable-slash-commands",
+        user_message,
+    ]
+
+    try:
+        completed = subprocess.run(
+            args, capture_output=True, text=True,
+            timeout=timeout_s, cwd=get_sandbox_cwd(),
+            encoding="utf-8", errors="replace",
+        )
+    except subprocess.TimeoutExpired:
+        return {"verdict": "needs_human", "confidence": 0.0, "reason": "triage model timed out"}
+    except Exception as exc:
+        return {"verdict": "needs_human", "confidence": 0.0, "reason": f"subprocess error: {exc}"}
+
+    if completed.returncode != 0:
+        return {"verdict": "needs_human", "confidence": 0.0, "reason": f"claude exit {completed.returncode}"}
+
+    raw = (completed.stdout or "").strip()
+    return parse_verdict(raw)
+
+
+def parse_verdict(raw):
+    """Parse the triage model's JSON verdict."""
+    text = raw.strip()
+    if text.startswith("```"):
+        text = text.strip("`")
+        nl = text.find("\n")
+        if nl >= 0:
+            text = text[nl + 1:]
+        if text.endswith("```"):
+            text = text[:-3]
+    text = text.strip()
+
+    if not text.lstrip().startswith("{"):
+        start = text.find("{")
+        end = text.rfind("}")
+        if start >= 0 and end > start:
+            text = text[start:end + 1]
+
+    try:
+        parsed = json.loads(text)
+    except json.JSONDecodeError:
+        return {"verdict": "needs_human", "confidence": 0.0, "reason": "failed to parse triage output"}
+
+    verdict = str(parsed.get("verdict", "needs_human")).strip().lower()
+    if verdict not in {"promote", "reject", "needs_human"}:
+        verdict = "needs_human"
+
+    confidence = parsed.get("confidence", 0.5)
+    try:
+        confidence = max(0.0, min(1.0, float(confidence)))
+    except (TypeError, ValueError):
+        confidence = 0.5
+
+    reason = str(parsed.get("reason", "")).strip()[:200]
+    return {"verdict": verdict, "confidence": confidence, "reason": reason}
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Auto-triage candidate memories")
+    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
+    parser.add_argument("--model", default=DEFAULT_MODEL)
+    parser.add_argument("--dry-run", action="store_true", help="preview without executing")
+    args = parser.parse_args()
+
+    # Fetch candidates
+    result = api_get(args.base_url, "/memory?status=candidate&limit=100")
+    candidates = result.get("memories", [])
+    print(f"candidates: {len(candidates)} model: {args.model} dry_run: {args.dry_run}")
+
+    if not candidates:
+        print("queue empty, nothing to triage")
+        return
+
+    # Cache active memories per project for dedup
+    active_cache = {}
+    promoted = rejected = needs_human = errors = 0
+
+    for i, cand in enumerate(candidates, 1):
+        project = cand.get("project") or ""
+        if project not in active_cache:
+            active_cache[project] = fetch_active_memories_for_project(args.base_url, project)
+
+        verdict_obj = triage_one(cand, active_cache[project], args.model, DEFAULT_TIMEOUT_S)
+        verdict = verdict_obj["verdict"]
+        conf = verdict_obj["confidence"]
+        reason = verdict_obj["reason"]
+
+        mid = cand["id"]
+        label = f"[{i:2d}/{len(candidates)}] {mid[:8]} [{cand['memory_type']}]"
+
+        if verdict == "promote" and conf >= AUTO_PROMOTE_MIN_CONFIDENCE:
+            if args.dry_run:
+                print(f" WOULD PROMOTE {label} conf={conf:.2f} {reason}")
+            else:
+                try:
+                    api_post(args.base_url, f"/memory/{mid}/promote")
+                    print(f" PROMOTED {label} conf={conf:.2f} {reason}")
+                    active_cache[project].append(cand)
+                except Exception:
+                    errors += 1
+                    continue  # failed API call counts as an error, not a promotion
+            promoted += 1
+        elif verdict == "reject":
+            if args.dry_run:
+                print(f" WOULD REJECT {label} conf={conf:.2f} {reason}")
+            else:
+                try:
+                    api_post(args.base_url, f"/memory/{mid}/reject")
+                    print(f" REJECTED {label} conf={conf:.2f} {reason}")
+                except Exception:
+                    errors += 1
+                    continue  # failed API call counts as an error, not a rejection
+            rejected += 1
+        else:
+            print(f" NEEDS_HUMAN {label} conf={conf:.2f} {reason}")
+            needs_human += 1
+
+    print(f"\npromoted={promoted} rejected={rejected} needs_human={needs_human} errors={errors}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/eval_data/candidate_queue_2026-04-12.json
b/scripts/eval_data/candidate_queue_2026-04-12.json new file mode 100644 index 0000000..1c4203d --- /dev/null +++ b/scripts/eval_data/candidate_queue_2026-04-12.json @@ -0,0 +1 @@ +{"memories":[{"id":"7d4bdab5-3a8b-4e85-a1b1-117d6d5c094e","memory_type":"project","content":"AtoCore extraction must stay off the hot capture path; batch endpoint only","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 15:47:16"},{"id":"af4dbc85-e5bd-4aac-927b-5ecee4497ec4","memory_type":"project","content":"Auto-promote gate: confidence ≥0.8 AND no duplicate in active memories","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 15:47:16"},{"id":"a6d9c14d-e64c-4e9d-aeeb-c9667c5fbda0","memory_type":"project","content":"AtoCore LLM extraction pipeline deployed on Dalidou host, runs via cron at 03:00 UTC via scripts/batch_llm_extract_live.py","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 15:47:05"},{"id":"06a43be1-026a-4085-b3e1-76348fd78250","memory_type":"project","content":"LLM extractor runs host-side (not in container) because claude CLI not available in container environment","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 15:47:05"},{"id":"5458852f-6001-44d9-960b-71f6ef5b93b5","memory_type":"project","content":"Host-side extraction script scripts/batch_llm_extract_live.py uses pure stdlib, no atocore imports for deployment simplicity","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 15:47:05"},{"id":"6ba13dc3-2157-46ec-bd75-9bf9bd37aef1","memory_type":"project","content":"POST /admin/extract-batch accepts mode: rule|llm, POST /interactions/{id}/extract now 
mode-aware","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 15:47:05"},{"id":"40db1e3f-c748-40e1-a224-01ad65b0e4fd","memory_type":"knowledge","content":"claude CLI 2.0.60 removed --no-session-persistence flag, extraction sessions now persist in claude history","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 15:47:05"},{"id":"e66efb99-9b3b-4056-8f65-2dcaa773eb7f","memory_type":"adaptation","content":"Durable memory extraction candidates must be <200 chars, stand-alone, typed as project|knowledge|preference|adaptation","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 15:47:05"},{"id":"557329d4-ddf1-4b0e-b628-aa29a88a243b","memory_type":"adaptation","content":"Memory extraction confidence defaults to 0.5, raise to 0.6 only for unambiguous committed claims","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 15:47:05"},{"id":"4ca2545d-9437-48a8-b197-6da6b716f12a","memory_type":"project","content":"Live Dalidou is on commit 39d73e9, not e2895b5","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:46"},{"id":"3c3c6cff-7a2b-41d6-b69a-ba12683d4bc2","memory_type":"project","content":"Live harness is reproducible at 16/18 PASS","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:46"},{"id":"119a2d0f-2405-45cb-b54c-984ffc12c6ea","memory_type":"project","content":"Live active memories count is 36","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 
14:59:46"},{"id":"4e7e0c38-1080-4631-a8b7-1a1216ed141e","memory_type":"project","content":"Wave 2 project-state entries on live: p04=5, p05=6, p06=6","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:46"},{"id":"8304fce1-7bbc-4e55-8c12-c5f127067235","memory_type":"project","content":"R6 is fixed by commit 39d73e9","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:46"},{"id":"9a27d81c-4e7f-41f0-9b49-7e296eaae91f","memory_type":"project","content":"R9: R6 fix only covers empty project fallback; wrong non-empty model project can still override known interaction scope","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:46"},{"id":"e9ace741-e8b1-4234-9641-4d023725d8d7","memory_type":"project","content":"R10: Phase 8 is baseline-complete but not primary-complete; OpenClaw client covers narrow read-oriented slice of API","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:46"},{"id":"6a3a5e36-c0a9-4943-a01e-3f44075d8a52","memory_type":"project","content":"Phase 8 is decent baseline integration milestone but not primary-ready yet","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:46"},{"id":"5f59595a-942d-43fb-b251-1dbe5e1358fc","memory_type":"project","content":"4-step roadmap complete: extractor → harness → Wave 2 → OpenClaw","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:46"},{"id":"80a3b9ef-aee1-482c-9979-06de3b8bdbc4","memory_type":"project","content":"Codex audit loop proven across two full round-trips in one 
session","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:46"},{"id":"72f0525b-8542-43c9-8ea0-597069238490","memory_type":"project","content":"Session end state: 36 active memories, 17 project-state entries, 16/18 harness, 280 tests, main at 54d84b5","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:46"},{"id":"fbdd3e86-5522-4abc-ad03-1e50ff7df05c","memory_type":"project","content":"AtoCore extraction stays off the hot capture path; LLM extraction runs as scheduled batch, not inline with POST /interactions.","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:37"},{"id":"cc4d9c8f-4c44-46e4-af29-927cdb5e3706","memory_type":"project","content":"AtoCore auto-triage trust model: auto-promote only when confidence ≥0.8 AND no duplicate active memory; else needs_human.","project":"atocore","confidence":0.6,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:37"},{"id":"826d6f52-21c2-4afd-bd0e-4c157be5bf74","memory_type":"project","content":"Multi-model triage: use different model for triage reviewer than extractor (sonnet for extract)","project":"atocore","confidence":0.5,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 15:47:16"},{"id":"feb56ac9-597d-4c46-a327-ea8fc9cbbd80","memory_type":"project","content":"R9 fix: when interaction has known project, prefer it over model's non-matching project unless model's is registered","project":"atocore","confidence":0.5,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 15:47:16"},{"id":"5760012e-ebd0-4283-9011-186e39e2efa1","memory_type":"project","content":"R7 ranking fix: add overlap-density as secondary signal (overlap_count / 
memory_token_count)","project":"atocore","confidence":0.5,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 15:47:16"},{"id":"aa0f82f5-f6b6-4e3d-9e2e-fb83d901204a","memory_type":"project","content":"Extraction pipeline skips interactions with response_chars < 50 to avoid low-signal content","project":"atocore","confidence":0.5,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 15:47:05"},{"id":"df19f021-8111-4441-87b4-5d6630d6c67b","memory_type":"project","content":"AtoCore triage uses independent model from extractor (extractor: sonnet, triage: different model or different prompt).","project":"atocore","confidence":0.5,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:37"},{"id":"a143ceb0-4ffb-498a-bd43-7120edba57f2","memory_type":"project","content":"AtoCore ranking scorer adds overlap-density (overlap_count / memory_tokens) as secondary signal to fix short-memory ranking.","project":"atocore","confidence":0.5,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:37"},{"id":"8d47092c-86be-41d6-9e4f-3a0bc7c1c9c9","memory_type":"project","content":"AtoCore project trust: when interaction has known project and model returns different project, prefer interaction's project unless model's is registered.","project":"atocore","confidence":0.5,"status":"candidate","reference_count":0,"last_referenced_at":"","updated_at":"2026-04-12 14:59:37"}],"types":["identity","preference","project","episodic","knowledge","adaptation"],"statuses":["candidate","active","superseded","invalid"]} \ No newline at end of file diff --git a/scripts/eval_data/candidate_queue_2026-04-12.txt b/scripts/eval_data/candidate_queue_2026-04-12.txt new file mode 100644 index 0000000..2d1e0d3 --- /dev/null +++ b/scripts/eval_data/candidate_queue_2026-04-12.txt @@ -0,0 +1,29 @@ + 1. 
[project ] proj=atocore AtoCore extraction must stay off the hot capture path; batch endpoint only + 2. [project ] proj=atocore Auto-promote gate: confidence ≥0.8 AND no duplicate in active memories + 3. [project ] proj=atocore AtoCore LLM extraction pipeline deployed on Dalidou host, runs via cron at 03:00 UTC via scripts/batch_llm_extract_live.py + 4. [project ] proj=atocore LLM extractor runs host-side (not in container) because claude CLI not available in container environment + 5. [project ] proj=atocore Host-side extraction script scripts/batch_llm_extract_live.py uses pure stdlib, no atocore imports for deployment simplicity + 6. [project ] proj=atocore POST /admin/extract-batch accepts mode: rule|llm, POST /interactions/{id}/extract now mode-aware + 7. [knowledge ] proj=atocore claude CLI 2.0.60 removed --no-session-persistence flag, extraction sessions now persist in claude history + 8. [adaptation ] proj=atocore Durable memory extraction candidates must be <200 chars, stand-alone, typed as project|knowledge|preference|adaptation + 9. [adaptation ] proj=atocore Memory extraction confidence defaults to 0.5, raise to 0.6 only for unambiguous committed claims +10. [project ] proj=atocore Live Dalidou is on commit 39d73e9, not e2895b5 +11. [project ] proj=atocore Live harness is reproducible at 16/18 PASS +12. [project ] proj=atocore Live active memories count is 36 +13. [project ] proj=atocore Wave 2 project-state entries on live: p04=5, p05=6, p06=6 +14. [project ] proj=atocore R6 is fixed by commit 39d73e9 +15. [project ] proj=atocore R9: R6 fix only covers empty project fallback; wrong non-empty model project can still override known interaction scope +16. [project ] proj=atocore R10: Phase 8 is baseline-complete but not primary-complete; OpenClaw client covers narrow read-oriented slice of API +17. [project ] proj=atocore Phase 8 is decent baseline integration milestone but not primary-ready yet +18. 
[project ] proj=atocore 4-step roadmap complete: extractor → harness → Wave 2 → OpenClaw +19. [project ] proj=atocore Codex audit loop proven across two full round-trips in one session +20. [project ] proj=atocore Session end state: 36 active memories, 17 project-state entries, 16/18 harness, 280 tests, main at 54d84b5 +21. [project ] proj=atocore AtoCore extraction stays off the hot capture path; LLM extraction runs as scheduled batch, not inline with POST /interactions. +22. [project ] proj=atocore AtoCore auto-triage trust model: auto-promote only when confidence ≥0.8 AND no duplicate active memory; else needs_human. +23. [project ] proj=atocore Multi-model triage: use different model for triage reviewer than extractor (sonnet for extract) +24. [project ] proj=atocore R9 fix: when interaction has known project, prefer it over model's non-matching project unless model's is registered +25. [project ] proj=atocore R7 ranking fix: add overlap-density as secondary signal (overlap_count / memory_token_count) +26. [project ] proj=atocore Extraction pipeline skips interactions with response_chars < 50 to avoid low-signal content +27. [project ] proj=atocore AtoCore triage uses independent model from extractor (extractor: sonnet, triage: different model or different prompt). +28. [project ] proj=atocore AtoCore ranking scorer adds overlap-density (overlap_count / memory_tokens) as secondary signal to fix short-memory ranking. +29. [project ] proj=atocore AtoCore project trust: when interaction has known project and model returns different project, prefer interaction's project unless