The API endpoint now passes the request's status field through to create_memory() so external scripts can create candidate memories directly without going through the extract endpoint. Default remains 'active' for backward compatibility. persist_llm_candidates.py reads a saved extractor eval baseline JSON (e.g. the Day 4 LLM run) and POSTs each candidate to Dalidou with status=candidate. Safe to re-run — duplicate content returns 400 which the script counts as 'skipped'. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
90 lines
2.7 KiB
Python
90 lines
2.7 KiB
Python
"""Persist LLM-extracted candidates from a baseline JSON to Dalidou.
|
|
|
|
One-shot script: reads a saved extractor eval output file, filters to
|
|
candidates the LLM actually produced, and POSTs each to the Dalidou
|
|
memory API with ``status=candidate``. The API answers duplicate content
with HTTP 400, which is counted as skipped, so the script is safe to re-run.
|
|
|
|
Usage:
|
|
|
|
python scripts/persist_llm_candidates.py \\
|
|
scripts/eval_data/extractor_llm_baseline_2026-04-11.json
|
|
|
|
Then triage via:
|
|
|
|
python scripts/atocore_client.py triage
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import urllib.error
|
|
import urllib.parse
|
|
import urllib.request
|
|
|
|
# Root URL of the memory API, overridable via ATOCORE_BASE_URL. Any trailing
# slash is stripped because request paths are appended verbatim and already
# start with "/", so "http://host:8100/" would otherwise yield "//memory".
BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://dalidou:8100").rstrip("/")
# Per-request timeout in whole seconds, overridable via ATOCORE_TIMEOUT_SECONDS.
TIMEOUT = int(os.environ.get("ATOCORE_TIMEOUT_SECONDS", "10"))
|
|
|
|
|
|
def post_json(path: str, body: dict) -> dict:
    """POST *body* as JSON to ``BASE_URL`` + *path* and return the decoded reply.

    The payload is serialised with ``json.dumps`` and sent UTF-8 encoded with
    a ``Content-Type: application/json`` header. The response body is decoded
    as UTF-8 and parsed as JSON. Nothing is caught here: any exception raised
    by ``urlopen`` or by JSON decoding propagates to the caller.
    """
    payload = json.dumps(body).encode("utf-8")
    request = urllib.request.Request(
        f"{BASE_URL}{path}",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=TIMEOUT) as response:
        raw = response.read()
    return json.loads(raw.decode("utf-8"))
|
|
|
|
|
|
def main() -> int:
    """Persist every LLM-produced candidate from a baseline JSON file.

    Reads the file named by ``sys.argv[1]``, walks each entry of its
    ``results`` list and POSTs every non-empty ``actual_candidates`` item to
    ``/memory`` with ``status=candidate``.

    Outcome counting:
      * persisted -- the POST succeeded.
      * skipped   -- the server answered HTTP 400 (the script treats this as
        "content already exists", which is what makes re-runs safe).
      * errors    -- any other HTTP status or exception; reported on stderr.

    Returns:
        ``1`` if the baseline path argument is missing, otherwise ``0``. The
        exit status stays ``0`` even when individual candidates failed; the
        printed summary line carries the counts.
    """
    if len(sys.argv) < 2:
        print(f"usage: {sys.argv[0]} <baseline_json>", file=sys.stderr)
        return 1

    # Context manager so the file handle is closed deterministically.
    with open(sys.argv[1], encoding="utf-8") as fh:
        data = json.load(fh)
    # ``or []`` tolerates an explicit JSON null as well as a missing key.
    results = data.get("results") or []

    persisted = 0
    skipped = 0
    errors = 0

    for r in results:
        for c in r.get("actual_candidates") or []:
            content = (c.get("content") or "").strip()
            if not content:
                continue
            mem_type = c.get("memory_type", "knowledge")
            project = c.get("project", "")
            confidence = c.get("confidence", 0.5)

            # Only payload construction and the request live inside the try:
            # a failure while *reporting* a success must not be re-counted as
            # an error for a candidate that was in fact persisted.
            try:
                resp = post_json("/memory", {
                    "memory_type": mem_type,
                    "content": content,
                    "project": project,
                    "confidence": float(confidence),
                    "status": "candidate",
                })
            except urllib.error.HTTPError as exc:
                if exc.code == 400:
                    skipped += 1
                else:
                    errors += 1
                    print(f" ! error {exc.code}: {content[:60]}", file=sys.stderr)
                continue
            except Exception as exc:
                # Best-effort batch: one bad candidate (network failure, bad
                # confidence value, ...) must not abort the remaining ones.
                errors += 1
                print(f" ! {exc}: {content[:60]}", file=sys.stderr)
                continue

            persisted += 1
            # The id may not be a string (or the reply may not be an object);
            # coerce before slicing so the progress line can never raise.
            mem_id = str(resp.get("id", "?")) if isinstance(resp, dict) else "?"
            print(f" + {mem_id[:8]} [{mem_type}] {content[:80]}")

    print(f"\npersisted={persisted} skipped={skipped} errors={errors}")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|