diff --git a/scripts/persist_llm_candidates.py b/scripts/persist_llm_candidates.py
new file mode 100644
index 0000000..9def81c
--- /dev/null
+++ b/scripts/persist_llm_candidates.py
@@ -0,0 +1,112 @@
+"""Persist LLM-extracted candidates from a baseline JSON to Dalidou.
+
+One-shot script: reads a saved extractor eval output file, filters to
+candidates the LLM actually produced, and POSTs each to the Dalidou
+memory API with ``status=candidate``. The script does no deduplication
+of its own: an HTTP 400 from the API is counted as "skipped" (presumably
+the server rejecting already-existing content -- confirm), which is what
+makes the script safe to re-run.
+
+Usage:
+
+    python scripts/persist_llm_candidates.py \\
+        scripts/eval_data/extractor_llm_baseline_2026-04-11.json
+
+Then triage via:
+
+    python scripts/atocore_client.py triage
+
+Exits non-zero when the arguments are wrong or any POST failed.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+import urllib.error
+import urllib.parse
+import urllib.request
+
+BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://dalidou:8100")
+TIMEOUT = int(os.environ.get("ATOCORE_TIMEOUT_SECONDS", "10"))
+
+
+def post_json(path: str, body: dict) -> dict:
+    """POST ``body`` as JSON to ``BASE_URL + path`` and return the decoded reply.
+
+    Raises ``urllib.error.HTTPError`` on an HTTP error status (4xx/5xx) and
+    ``urllib.error.URLError`` when the server cannot be reached.
+    """
+    data = json.dumps(body).encode("utf-8")
+    req = urllib.request.Request(
+        url=f"{BASE_URL}{path}",
+        method="POST",
+        headers={"Content-Type": "application/json"},
+        data=data,
+    )
+    with urllib.request.urlopen(req, timeout=TIMEOUT) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+def main() -> int:
+    """Replay every non-empty candidate in the given baseline file to the API.
+
+    Returns the process exit status: 1 on a usage error or if any POST
+    failed, 0 otherwise.
+    """
+    if len(sys.argv) < 2:
+        print(f"usage: {sys.argv[0]} <baseline.json>", file=sys.stderr)
+        return 1
+
+    # Context manager so the file handle is closed deterministically.
+    with open(sys.argv[1], encoding="utf-8") as fh:
+        data = json.load(fh)
+    results = data.get("results", [])
+
+    persisted = 0
+    skipped = 0
+    errors = 0
+
+    for r in results:
+        for c in r.get("actual_candidates", []):
+            content = (c.get("content") or "").strip()
+            if not content:
+                continue
+            # ``or``/None checks (rather than .get defaults) so an explicit
+            # JSON null falls back to the default like a missing key does.
+            mem_type = c.get("memory_type") or "knowledge"
+            project = c.get("project") or ""
+            confidence = c.get("confidence")
+            if confidence is None:
+                confidence = 0.5
+
+            try:
+                resp = post_json("/memory", {
+                    "memory_type": mem_type,
+                    "content": content,
+                    "project": project,
+                    "confidence": float(confidence),
+                    "status": "candidate",
+                })
+                persisted += 1
+                # str(): the id may not be a string; slicing an int here
+                # would miscount a successful POST as an error.
+                print(f" + {str(resp.get('id', '?'))[:8]} [{mem_type}] {content[:80]}")
+            except urllib.error.HTTPError as exc:
+                if exc.code == 400:
+                    skipped += 1
+                else:
+                    errors += 1
+                    print(f" ! error {exc.code}: {content[:60]}", file=sys.stderr)
+            except Exception as exc:
+                errors += 1
+                print(f" ! {exc}: {content[:60]}", file=sys.stderr)
+
+    print(f"\npersisted={persisted} skipped={skipped} errors={errors}")
+    # Surface failures to the calling shell instead of always reporting success.
+    return 1 if errors else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/atocore/api/routes.py b/src/atocore/api/routes.py
index 5ad8f80..847621c 100644
--- a/src/atocore/api/routes.py
+++ b/src/atocore/api/routes.py
@@ -141,6 +141,7 @@ class MemoryCreateRequest(BaseModel):
     content: str
     project: str = ""
     confidence: float = 1.0
+    status: str = "active"
 
 
 class MemoryUpdateRequest(BaseModel):
@@ -344,6 +345,7 @@ def api_create_memory(req: MemoryCreateRequest) -> dict:
             content=req.content,
             project=req.project,
             confidence=req.confidence,
+            status=req.status,
         )
     except ValueError as e:
         raise HTTPException(status_code=400, detail=str(e))