238 lines
8.2 KiB
Python
238 lines
8.2 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""Phase 5F — Memory → Entity graduation batch pass.
|
||
|
|
|
||
|
|
Takes active memories, asks `claude -p` whether each describes a typed
engineering entity, and creates entity candidates for the ones that do.
Each candidate carries source_refs back to its source memory so human
review can trace provenance.

Human reviews the entity candidates via /admin/triage (same UI as memory
triage). When a candidate is promoted, a post-promote hook marks the source
memory as `graduated` and sets `graduated_to_entity_id` for traceability.

This is THE population move: without it, the engineering graph stays sparse
and the killer queries (Q-006/009/011) have nothing to find gaps in.

Usage:
    python3 scripts/graduate_memories.py --base-url http://127.0.0.1:8100 \\
        --project p05-interferometer --limit 20

    # Dry run (don't create entities, just show decisions):
    python3 scripts/graduate_memories.py --project p05-interferometer --dry-run

    # Process all active memories across all projects (big run):
    python3 scripts/graduate_memories.py --limit 200

Host-side because claude CLI lives on Dalidou, not in the container.
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import shutil
|
||
|
|
import subprocess
|
||
|
|
import sys
|
||
|
|
import tempfile
|
||
|
|
import time
|
||
|
|
import urllib.error
|
||
|
|
import urllib.request
|
||
|
|
from typing import Any
|
||
|
|
|
||
|
|
# The stdlib-only prompt module lives under ../src relative to this script.
# Put that directory at the front of sys.path (once) so it can be imported.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_SRC_DIR = os.path.abspath(os.path.join(_SCRIPT_DIR, os.pardir, "src"))
if not any(entry == _SRC_DIR for entry in sys.path):
    sys.path.insert(0, _SRC_DIR)
|
||
|
|
|
||
|
|
from atocore.engineering._graduation_prompt import ( # noqa: E402
|
||
|
|
GRADUATION_PROMPT_VERSION,
|
||
|
|
SYSTEM_PROMPT,
|
||
|
|
build_user_message,
|
||
|
|
parse_graduation_output,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
# Runtime defaults. Each one is read from the environment when the module is
# loaded and falls back to the literal shown here when the variable is unset.
DEFAULT_BASE_URL = os.getenv("ATOCORE_BASE_URL", "http://127.0.0.1:8100")
DEFAULT_MODEL = os.getenv("ATOCORE_LLM_EXTRACTOR_MODEL", "sonnet")
# Seconds allowed for a single claude CLI invocation.
DEFAULT_TIMEOUT_S = float(os.getenv("ATOCORE_GRADUATION_TIMEOUT_S", "90"))
|
||
|
|
|
||
|
|
# Path of the scratch directory, created lazily by get_sandbox_cwd().
_sandbox_cwd = None


def get_sandbox_cwd() -> str:
    """Return a scratch directory to use as the cwd of claude CLI calls.

    Running the CLI from an empty temp directory keeps it from
    auto-discovering project CLAUDE.md files. The directory is created on the
    first call, cached in the module-level ``_sandbox_cwd`` and reused by
    every later call in the same process.

    The directory is removed when the interpreter exits, so repeated runs do
    not accumulate ``ato-graduate-*`` directories in the temp location.

    Returns:
        Path of the sandbox directory.
    """
    global _sandbox_cwd
    if _sandbox_cwd is None:
        import atexit  # local import: only needed once, on first use

        _sandbox_cwd = tempfile.mkdtemp(prefix="ato-graduate-")
        # Best-effort cleanup: ignore_errors so a failed removal can never
        # turn an otherwise successful run into an error at exit.
        atexit.register(shutil.rmtree, _sandbox_cwd, ignore_errors=True)
    return _sandbox_cwd
|
||
|
|
|
||
|
|
|
||
|
|
def api_get(base_url: str, path: str) -> dict:
    """GET ``base_url + path`` and return the response body parsed as JSON.

    The body is decoded as UTF-8 and the request uses a 15 second timeout.
    Errors raised by urllib or by the JSON decoder propagate to the caller.
    """
    url = f"{base_url}{path}"
    request = urllib.request.Request(url)
    with urllib.request.urlopen(request, timeout=15) as response:
        raw = response.read()
    return json.loads(raw.decode("utf-8"))
|
||
|
|
|
||
|
|
|
||
|
|
def api_post(base_url: str, path: str, body: dict | None = None) -> dict:
|
||
|
|
data = json.dumps(body or {}).encode("utf-8")
|
||
|
|
req = urllib.request.Request(
|
||
|
|
f"{base_url}{path}", method="POST",
|
||
|
|
headers={"Content-Type": "application/json"}, data=data,
|
||
|
|
)
|
||
|
|
with urllib.request.urlopen(req, timeout=15) as resp:
|
||
|
|
return json.loads(resp.read().decode("utf-8"))
|
||
|
|
|
||
|
|
|
||
|
|
def graduate_one(memory: dict, model: str, timeout_s: float) -> dict[str, Any] | None:
|
||
|
|
"""Ask claude whether this memory describes a typed entity.
|
||
|
|
|
||
|
|
Returns None on any failure (parse error, timeout, exit!=0).
|
||
|
|
Applies retry+pacing to match the pattern in auto_triage/batch_extract.
|
||
|
|
"""
|
||
|
|
if not shutil.which("claude"):
|
||
|
|
return None
|
||
|
|
|
||
|
|
user_msg = build_user_message(
|
||
|
|
memory_content=memory.get("content", "") or "",
|
||
|
|
memory_project=memory.get("project", "") or "",
|
||
|
|
memory_type=memory.get("memory_type", "") or "",
|
||
|
|
)
|
||
|
|
|
||
|
|
args = [
|
||
|
|
"claude", "-p",
|
||
|
|
"--model", model,
|
||
|
|
"--append-system-prompt", SYSTEM_PROMPT,
|
||
|
|
"--disable-slash-commands",
|
||
|
|
user_msg,
|
||
|
|
]
|
||
|
|
|
||
|
|
last_error = ""
|
||
|
|
for attempt in range(3):
|
||
|
|
if attempt > 0:
|
||
|
|
time.sleep(2 ** attempt)
|
||
|
|
try:
|
||
|
|
completed = subprocess.run(
|
||
|
|
args, capture_output=True, text=True,
|
||
|
|
timeout=timeout_s, cwd=get_sandbox_cwd(),
|
||
|
|
encoding="utf-8", errors="replace",
|
||
|
|
)
|
||
|
|
except subprocess.TimeoutExpired:
|
||
|
|
last_error = "timeout"
|
||
|
|
continue
|
||
|
|
except Exception as exc:
|
||
|
|
last_error = f"subprocess error: {exc}"
|
||
|
|
continue
|
||
|
|
|
||
|
|
if completed.returncode == 0:
|
||
|
|
return parse_graduation_output(completed.stdout or "")
|
||
|
|
|
||
|
|
stderr = (completed.stderr or "").strip()[:200]
|
||
|
|
last_error = f"exit_{completed.returncode}: {stderr}" if stderr else f"exit_{completed.returncode}"
|
||
|
|
|
||
|
|
print(f" ! claude failed after 3 tries: {last_error}", file=sys.stderr)
|
||
|
|
return None
|
||
|
|
|
||
|
|
|
||
|
|
def create_entity_candidate(
    base_url: str,
    decision: dict,
    memory: dict,
) -> str | None:
    """Create an entity candidate with source_refs pointing at the memory.

    POSTs to ``/entities`` with status ``candidate``. The payload records the
    source memory both in ``source_refs`` (as ``memory:<id>``) and in
    ``properties.graduated_from_memory``, together with the proposed
    relationships and the prompt version that produced the decision.

    Args:
        base_url: Root URL of the API.
        decision: Graduation decision; ``entity_type``, ``name``,
            ``description`` and ``confidence`` are required, while
            ``relationships`` is optional and defaults to an empty list.
        memory: Source memory; ``id`` is required, ``project`` is optional.

    Returns:
        The ``id`` field of the API response, or None if anything went wrong
        (the error is reported on stderr rather than raised).
    """
    try:
        result = api_post(base_url, "/entities", {
            "entity_type": decision["entity_type"],
            "name": decision["name"],
            "project": memory.get("project", "") or "",
            "description": decision["description"],
            "properties": {
                "graduated_from_memory": memory["id"],
                # Optional key: a decision without relationships is still a
                # valid candidate, so do not fail the whole create on it.
                "proposed_relationships": decision.get("relationships") or [],
                "prompt_version": GRADUATION_PROMPT_VERSION,
            },
            "status": "candidate",
            "confidence": decision["confidence"],
            "source_refs": [f"memory:{memory['id']}"],
        })
        return result.get("id")
    except Exception as exc:
        # Best-effort by design: one failed create must not abort the batch.
        print(f" ! entity create failed: {exc}", file=sys.stderr)
        return None
|
||
|
|
|
||
|
|
|
||
|
|
def main() -> None:
    """Command-line entry point: graduate active memories into entity candidates.

    Steps:
      1. Parse the CLI options.
      2. Fetch up to ``--limit`` memories with ``status=active`` (optionally
         restricted to ``--project``) from ``GET /memory``.
      3. Drop memories below ``--min-confidence`` or already marked graduated.
      4. Ask claude about each remaining memory and, unless ``--dry-run`` is
         set, create an entity candidate for every positive decision.
      5. Print one line per memory and a final tally.

    Exits with status 1 if the memory list cannot be fetched or decoded.
    """
    from urllib.parse import urlencode  # local: not among the file-level imports

    parser = argparse.ArgumentParser(description="Graduate active memories into entity candidates")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--project", default=None, help="Only graduate memories in this project")
    parser.add_argument("--limit", type=int, default=50, help="Max memories to process")
    parser.add_argument("--min-confidence", type=float, default=0.3,
                        help="Skip memories with confidence below this (they're probably noise)")
    parser.add_argument("--dry-run", action="store_true", help="Show decisions without creating entities")
    args = parser.parse_args()

    # Fetch active memories. urlencode() escapes the project name, so values
    # containing spaces, '&' or '#' cannot corrupt the query string.
    params = {"status": "active", "limit": args.limit}
    if args.project:
        params["project"] = args.project
    try:
        result = api_get(args.base_url, f"/memory?{urlencode(params)}")
    except (OSError, ValueError) as exc:
        # OSError covers urllib's URLError/HTTPError and socket timeouts;
        # ValueError covers an undecodable or non-JSON response body.
        print(f" ! could not fetch memories from {args.base_url}: {exc}", file=sys.stderr)
        sys.exit(1)
    memories = result.get("memories", [])

    # Filter by min_confidence + skip already-graduated. A null confidence is
    # treated as 0 instead of raising on the comparison.
    memories = [
        m for m in memories
        if (m.get("confidence") or 0) >= args.min_confidence
        and m.get("status") != "graduated"
    ]

    print(f"graduating: {len(memories)} memories project={args.project or '(all)'} "
          f"model={args.model} dry_run={args.dry_run}")

    graduated = 0
    skipped = 0
    errors = 0
    entities_created: list[str] = []

    for i, mem in enumerate(memories, 1):
        if i > 1:
            time.sleep(0.5)  # light pacing, matches auto_triage
        mid = mem["id"]
        label = f"[{i:3d}/{len(memories)}] {mid[:8]} [{mem.get('memory_type','?')}]"

        decision = graduate_one(mem, args.model, DEFAULT_TIMEOUT_S)
        if decision is None:
            print(f" ERROR {label} (graduate_one returned None)")
            errors += 1
            continue

        if not decision.get("graduate"):
            reason = decision.get("reason", "(no reason)")
            print(f" skip {label} {reason}")
            skipped += 1
            continue

        etype = decision["entity_type"]
        ename = decision["name"]
        nrel = len(decision.get("relationships") or [])

        if args.dry_run:
            # Dry runs count the entities that would have been created.
            print(f" WOULD {label} → [{etype}] {ename!r} ({nrel} rels)")
            graduated += 1
            continue

        entity_id = create_entity_candidate(args.base_url, decision, mem)
        if entity_id:
            print(f" CREATE {label} → [{etype}] {ename!r} ({nrel} rels) entity={entity_id[:8]}")
            graduated += 1
            entities_created.append(entity_id)
        else:
            errors += 1

    print(f"\ntotal: graduated={graduated} skipped={skipped} errors={errors}")
    if entities_created:
        print(f"Review at /admin/triage ({len(entities_created)} entity candidates created)")


if __name__ == "__main__":
    main()
|