#!/usr/bin/env python3 """Phase 5F — Memory → Entity graduation batch pass. Takes active memories, asks claude-p whether each describes a typed engineering entity, and creates entity candidates for the ones that do. Each candidate carries source_refs back to its source memory so human review can trace provenance. Human reviews the entity candidates via /admin/triage (same UI as memory triage). When a candidate is promoted, a post-promote hook marks the source memory as `graduated` and sets `graduated_to_entity_id` for traceability. This is THE population move: without it, the engineering graph stays sparse and the killer queries (Q-006/009/011) have nothing to find gaps in. Usage: python3 scripts/graduate_memories.py --base-url http://127.0.0.1:8100 \\ --project p05-interferometer --limit 20 # Dry run (don't create entities, just show decisions): python3 scripts/graduate_memories.py --project p05-interferometer --dry-run # Process all active memories across all projects (big run): python3 scripts/graduate_memories.py --limit 200 Host-side because claude CLI lives on Dalidou, not in the container. """ from __future__ import annotations import argparse import json import os import shutil import subprocess import sys import tempfile import time import urllib.error import urllib.request from typing import Any # Make src/ importable so we can reuse the stdlib-only prompt module _SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) _SRC_DIR = os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "src")) if _SRC_DIR not in sys.path: sys.path.insert(0, _SRC_DIR) from atocore.engineering._graduation_prompt import ( # noqa: E402 GRADUATION_PROMPT_VERSION, SYSTEM_PROMPT, build_user_message, parse_graduation_output, ) DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://127.0.0.1:8100") DEFAULT_MODEL = os.environ.get("ATOCORE_LLM_EXTRACTOR_MODEL", "sonnet") DEFAULT_TIMEOUT_S = float(os.environ.get("ATOCORE_GRADUATION_TIMEOUT_S", "90")) _sandbox_cwd = None def get_sandbox_cwd() -> str: """Temp cwd so claude CLI doesn't auto-discover project CLAUDE.md files.""" global _sandbox_cwd if _sandbox_cwd is None: _sandbox_cwd = tempfile.mkdtemp(prefix="ato-graduate-") return _sandbox_cwd def api_get(base_url: str, path: str) -> dict: req = urllib.request.Request(f"{base_url}{path}") with urllib.request.urlopen(req, timeout=15) as resp: return json.loads(resp.read().decode("utf-8")) def api_post(base_url: str, path: str, body: dict | None = None) -> dict: data = json.dumps(body or {}).encode("utf-8") req = urllib.request.Request( f"{base_url}{path}", method="POST", headers={"Content-Type": "application/json"}, data=data, ) with urllib.request.urlopen(req, timeout=15) as resp: return json.loads(resp.read().decode("utf-8")) def graduate_one(memory: dict, model: str, timeout_s: float) -> dict[str, Any] | None: """Ask claude whether this memory describes a typed entity. Returns None on any failure (parse error, timeout, exit!=0). Applies retry+pacing to match the pattern in auto_triage/batch_extract. """ if not shutil.which("claude"): return None user_msg = build_user_message( memory_content=memory.get("content", "") or "", memory_project=memory.get("project", "") or "", memory_type=memory.get("memory_type", "") or "", ) args = [ "claude", "-p", "--model", model, "--append-system-prompt", SYSTEM_PROMPT, "--disable-slash-commands", user_msg, ] last_error = "" for attempt in range(3): if attempt > 0: time.sleep(2 ** attempt) try: completed = subprocess.run( args, capture_output=True, text=True, timeout=timeout_s, cwd=get_sandbox_cwd(), encoding="utf-8", errors="replace", ) except subprocess.TimeoutExpired: last_error = "timeout" continue except Exception as exc: last_error = f"subprocess error: {exc}" continue if completed.returncode == 0: return parse_graduation_output(completed.stdout or "") stderr = (completed.stderr or "").strip()[:200] last_error = f"exit_{completed.returncode}: {stderr}" if stderr else f"exit_{completed.returncode}" print(f" ! claude failed after 3 tries: {last_error}", file=sys.stderr) return None def create_entity_candidate( base_url: str, decision: dict, memory: dict, ) -> str | None: """Create an entity candidate with source_refs pointing at the memory.""" try: result = api_post(base_url, "/entities", { "entity_type": decision["entity_type"], "name": decision["name"], "project": memory.get("project", "") or "", "description": decision["description"], "properties": { "graduated_from_memory": memory["id"], "proposed_relationships": decision["relationships"], "prompt_version": GRADUATION_PROMPT_VERSION, }, "status": "candidate", "confidence": decision["confidence"], "source_refs": [f"memory:{memory['id']}"], }) return result.get("id") except Exception as e: print(f" ! entity create failed: {e}", file=sys.stderr) return None def main() -> None: parser = argparse.ArgumentParser(description="Graduate active memories into entity candidates") parser.add_argument("--base-url", default=DEFAULT_BASE_URL) parser.add_argument("--model", default=DEFAULT_MODEL) parser.add_argument("--project", default=None, help="Only graduate memories in this project") parser.add_argument("--limit", type=int, default=50, help="Max memories to process") parser.add_argument("--min-confidence", type=float, default=0.3, help="Skip memories with confidence below this (they're probably noise)") parser.add_argument("--dry-run", action="store_true", help="Show decisions without creating entities") args = parser.parse_args() # Fetch active memories query = "status=active" query += f"&limit={args.limit}" if args.project: query += f"&project={args.project}" result = api_get(args.base_url, f"/memory?{query}") memories = result.get("memories", []) # Filter by min_confidence + skip already-graduated memories = [m for m in memories if m.get("confidence", 0) >= args.min_confidence and m.get("status") != "graduated"] print(f"graduating: {len(memories)} memories project={args.project or '(all)'} " f"model={args.model} dry_run={args.dry_run}") graduated = 0 skipped = 0 errors = 0 entities_created: list[str] = [] for i, mem in enumerate(memories, 1): if i > 1: time.sleep(0.5) # light pacing, matches auto_triage mid = mem["id"] label = f"[{i:3d}/{len(memories)}] {mid[:8]} [{mem.get('memory_type','?')}]" decision = graduate_one(mem, args.model, DEFAULT_TIMEOUT_S) if decision is None: print(f" ERROR {label} (graduate_one returned None)") errors += 1 continue if not decision.get("graduate"): reason = decision.get("reason", "(no reason)") print(f" skip {label} {reason}") skipped += 1 continue etype = decision["entity_type"] ename = decision["name"] nrel = len(decision.get("relationships", [])) if args.dry_run: print(f" WOULD {label} → [{etype}] {ename!r} ({nrel} rels)") graduated += 1 else: entity_id = create_entity_candidate(args.base_url, decision, mem) if entity_id: print(f" CREATE {label} → [{etype}] {ename!r} ({nrel} rels) entity={entity_id[:8]}") graduated += 1 entities_created.append(entity_id) else: errors += 1 print(f"\ntotal: graduated={graduated} skipped={skipped} errors={errors}") if entities_created: print(f"Review at /admin/triage ({len(entities_created)} entity candidates created)") if __name__ == "__main__": main()