# ATOCore/scripts/persist_llm_candidates.py

"""Persist LLM-extracted candidates from a baseline JSON to Dalidou.

One-shot script: reads a saved extractor eval output file, filters to
candidates the LLM actually produced, and POSTs each to the Dalidou
memory API with ``status=candidate``. No client-side deduplication is
done: an HTTP 400 reply is counted as "skipped", which presumably is how
the API rejects already-existing content and makes re-runs safe (confirm).

Usage:

    python scripts/persist_llm_candidates.py \\
        scripts/eval_data/extractor_llm_baseline_2026-04-11.json

Then triage via:

    python scripts/atocore_client.py triage
"""

from __future__ import annotations

import json
import os
import sys
import urllib.error
import urllib.parse
import urllib.request

# Root URL that request paths are appended to; override with ATOCORE_BASE_URL.
BASE_URL = os.getenv("ATOCORE_BASE_URL", "http://dalidou:8100")
# Timeout handed to urlopen for each request; override with
# ATOCORE_TIMEOUT_SECONDS (must parse as an integer).
TIMEOUT = int(os.getenv("ATOCORE_TIMEOUT_SECONDS", "10"))


def post_json(path: str, body: dict) -> dict:
    """POST *body* as JSON to ``BASE_URL`` + *path* and return the decoded reply.

    The request carries a ``Content-Type: application/json`` header and uses
    the module-level ``TIMEOUT``. The response body is decoded as UTF-8 and
    parsed as JSON. Exceptions raised by ``urlopen`` or by JSON parsing are
    not handled here and propagate to the caller.
    """
    payload = json.dumps(body).encode("utf-8")
    request = urllib.request.Request(
        f"{BASE_URL}{path}",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=TIMEOUT) as response:
        raw = response.read()
    return json.loads(raw.decode("utf-8"))


def _field(candidate: dict, key: str, default):
    """Return ``candidate[key]``, or *default* when the key is missing or null."""
    value = candidate.get(key)
    return default if value is None else value


def main() -> int:
    """Persist every non-blank candidate from the baseline JSON named in argv[1].

    Walks ``results[*].actual_candidates[*]`` in the input file and POSTs each
    candidate whose ``content`` is non-blank to ``/memory`` via ``post_json``
    with ``status="candidate"``. Prints one line per persisted candidate, one
    line on stderr per failure, and a final summary of the three counters.

    An HTTP 400 reply is counted as "skipped" without a message; any other
    failure is counted as an error.

    Returns:
        1 when no input path was given on the command line, otherwise 0.
        NOTE(review): the exit status stays 0 even when individual POSTs
        failed; failures are only visible in the printed ``errors`` count.
    """
    if len(sys.argv) < 2:
        print(f"usage: {sys.argv[0]} <baseline_json>", file=sys.stderr)
        return 1

    # Context manager so the file handle is closed deterministically.
    with open(sys.argv[1], encoding="utf-8") as fh:
        data = json.load(fh)
    # ``or []`` also covers an explicit JSON null, not just a missing key.
    results = data.get("results") or []

    persisted = 0
    skipped = 0
    errors = 0

    for r in results:
        for c in r.get("actual_candidates") or []:
            content = (c.get("content") or "").strip()
            if not content:
                continue
            # Null fields fall back to the same defaults as missing fields.
            mem_type = _field(c, "memory_type", "knowledge")
            project = _field(c, "project", "")
            confidence = _field(c, "confidence", 0.5)

            try:
                resp = post_json("/memory", {
                    "memory_type": mem_type,
                    "content": content,
                    "project": project,
                    "confidence": float(confidence),
                    "status": "candidate",
                })
            except urllib.error.HTTPError as exc:
                if exc.code == 400:
                    skipped += 1
                else:
                    errors += 1
                    print(f"  ! error {exc.code}: {content[:60]}", file=sys.stderr)
            except Exception as exc:
                errors += 1
                print(f"  ! {exc}: {content[:60]}", file=sys.stderr)
            else:
                # Reporting lives outside the ``try`` so an unexpected reply
                # shape cannot turn a successful POST into a counted error.
                persisted += 1
                raw_id = resp.get("id") if isinstance(resp, dict) else None
                mem_id = "?" if raw_id is None else str(raw_id)
                print(f"  + {mem_id[:8]}  [{mem_type}]  {content[:80]}")

    print(f"\npersisted={persisted}  skipped={skipped}  errors={errors}")
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())