# ATOCore/scripts/graduate_memories.py

#!/usr/bin/env python3
"""Phase 5F — Memory → Entity graduation batch pass.

Takes active memories, asks claude-p whether each describes a typed
engineering entity, and creates entity candidates for the ones that do.
Each candidate carries source_refs back to its source memory so human
review can trace provenance.

Human reviews the entity candidates via /admin/triage (same UI as memory
triage). When a candidate is promoted, a post-promote hook marks the source
memory as `graduated` and sets `graduated_to_entity_id` for traceability.

This is THE population move: without it, the engineering graph stays sparse
and the killer queries (Q-006/009/011) have nothing to find gaps in.

Usage:
  python3 scripts/graduate_memories.py --base-url http://127.0.0.1:8100 \\
      --project p05-interferometer --limit 20

  # Dry run (don't create entities, just show decisions):
  python3 scripts/graduate_memories.py --project p05-interferometer --dry-run

  # Process all active memories across all projects (big run):
  python3 scripts/graduate_memories.py --limit 200

Host-side because claude CLI lives on Dalidou, not in the container.
"""

from __future__ import annotations

import argparse
import json
import os
import shutil
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.request
from typing import Any

# Make src/ importable so we can reuse the stdlib-only prompt module
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_SRC_DIR = os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "src"))
if _SRC_DIR not in sys.path:
    sys.path.insert(0, _SRC_DIR)

from atocore.engineering._graduation_prompt import (  # noqa: E402
    GRADUATION_PROMPT_VERSION,
    SYSTEM_PROMPT,
    build_user_message,
    parse_graduation_output,
)


# Runtime defaults. Each one can be overridden through the environment
# variable named in the lookup; the second argument is the fallback value.
DEFAULT_BASE_URL = os.getenv("ATOCORE_BASE_URL", "http://127.0.0.1:8100")
DEFAULT_MODEL = os.getenv("ATOCORE_LLM_EXTRACTOR_MODEL", "sonnet")
# Parsed as seconds and handed to subprocess.run(timeout=...) per claude call.
DEFAULT_TIMEOUT_S = float(os.getenv("ATOCORE_GRADUATION_TIMEOUT_S", "90"))

# Lazily-created scratch directory shared by every claude invocation in this
# process; None until get_sandbox_cwd() is first called.
_sandbox_cwd: str | None = None


def get_sandbox_cwd() -> str:
    """Return a temp cwd so claude CLI doesn't auto-discover project CLAUDE.md files.

    The directory is created on first use with ``tempfile.mkdtemp`` and the
    same path is returned on every later call. ``mkdtemp`` never deletes what
    it creates, so a best-effort removal is registered to run at interpreter
    exit; otherwise each run would leave an ``ato-graduate-*`` directory
    behind.

    Returns:
        Absolute path of the sandbox directory.
    """
    global _sandbox_cwd
    if _sandbox_cwd is None:
        # Function-scope import: atexit is only needed on the first call.
        import atexit

        _sandbox_cwd = tempfile.mkdtemp(prefix="ato-graduate-")
        # ignore_errors: cleanup must never turn a successful run into a failure.
        atexit.register(shutil.rmtree, _sandbox_cwd, ignore_errors=True)
    return _sandbox_cwd


def api_get(base_url: str, path: str) -> dict:
    """Issue a GET to ``base_url + path`` and return the decoded JSON body.

    The request uses a fixed 15-second timeout. Errors raised by urllib or
    by JSON decoding are not caught here and propagate to the caller.
    """
    url = f"{base_url}{path}"
    request = urllib.request.Request(url)
    with urllib.request.urlopen(request, timeout=15) as response:
        raw = response.read()
    return json.loads(raw.decode("utf-8"))


def api_post(base_url: str, path: str, body: dict | None = None) -> dict:
    """POST ``body`` as JSON to ``base_url + path`` and return the decoded reply.

    A missing or empty ``body`` is sent as ``{}``. The request uses a fixed
    15-second timeout. Errors raised by urllib or by JSON decoding are not
    caught here and propagate to the caller.
    """
    payload = body if body else {}
    encoded = json.dumps(payload).encode("utf-8")
    request = urllib.request.Request(
        f"{base_url}{path}",
        data=encoded,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=15) as response:
        raw = response.read()
    return json.loads(raw.decode("utf-8"))


def graduate_one(memory: dict, model: str, timeout_s: float) -> dict[str, Any] | None:
    """Ask claude whether this memory describes a typed entity.

    Builds the user message from the memory's ``content``, ``project`` and
    ``memory_type`` fields (missing or falsy values become ``""``), runs
    ``claude -p`` in the sandbox cwd, and hands stdout of the first
    successful run to ``parse_graduation_output``.

    Args:
        memory: Memory record as returned by the API.
        model: Model name passed to ``claude --model``.
        timeout_s: Per-attempt subprocess timeout in seconds.

    Returns:
        Whatever ``parse_graduation_output`` returns for a zero-exit run, or
        None when the claude CLI is not on PATH or every attempt failed
        (timeout, subprocess error, non-zero exit). In both None cases a
        diagnostic is written to stderr.

    Applies retry+pacing to match the pattern in auto_triage/batch_extract.
    """
    if not shutil.which("claude"):
        # Say why: without this the caller only sees "returned None" for
        # every memory and cannot tell a missing CLI from a model failure.
        print("  ! claude CLI not found on PATH", file=sys.stderr)
        return None

    user_msg = build_user_message(
        memory_content=memory.get("content", "") or "",
        memory_project=memory.get("project", "") or "",
        memory_type=memory.get("memory_type", "") or "",
    )

    args = [
        "claude", "-p",
        "--model", model,
        "--append-system-prompt", SYSTEM_PROMPT,
        "--disable-slash-commands",
        user_msg,
    ]

    # Single source of truth for the retry count (loop and failure message).
    max_attempts = 3
    last_error = ""
    for attempt in range(max_attempts):
        if attempt > 0:
            # Exponential backoff: 2s before the 2nd try, 4s before the 3rd.
            time.sleep(2 ** attempt)
        try:
            completed = subprocess.run(
                args, capture_output=True, text=True,
                timeout=timeout_s, cwd=get_sandbox_cwd(),
                encoding="utf-8", errors="replace",
            )
        except subprocess.TimeoutExpired:
            last_error = "timeout"
            continue
        except Exception as exc:
            # Best-effort by design: any launch failure is retried, not raised.
            last_error = f"subprocess error: {exc}"
            continue

        if completed.returncode == 0:
            return parse_graduation_output(completed.stdout or "")

        # Keep only the first 200 chars of stderr so the log line stays short.
        stderr = (completed.stderr or "").strip()[:200]
        last_error = f"exit_{completed.returncode}: {stderr}" if stderr else f"exit_{completed.returncode}"

    print(f"  ! claude failed after {max_attempts} tries: {last_error}", file=sys.stderr)
    return None


def create_entity_candidate(
    base_url: str,
    decision: dict,
    memory: dict,
) -> str | None:
    """Create an entity candidate with source_refs pointing at the memory.

    POSTs to ``/entities`` with status ``candidate``. The source memory id is
    recorded both in ``properties.graduated_from_memory`` and as a
    ``memory:<id>`` entry in ``source_refs``.

    Args:
        base_url: API root, e.g. ``http://127.0.0.1:8100``.
        decision: Graduation decision; must carry ``entity_type``, ``name``,
            ``description`` and ``confidence``. ``relationships`` is optional
            and defaults to an empty list.
        memory: Source memory record; must carry ``id``.

    Returns:
        The ``id`` field of the API response, or None when anything fails
        (missing key, HTTP error, bad JSON). Failures are reported on stderr
        rather than raised so one bad candidate does not abort the batch.
    """
    try:
        memory_id = memory["id"]
        payload = {
            "entity_type": decision["entity_type"],
            "name": decision["name"],
            "project": memory.get("project", "") or "",
            "description": decision["description"],
            "properties": {
                "graduated_from_memory": memory_id,
                # Optional, as in main(): a decision without relationships
                # must still be creatable instead of failing on KeyError.
                "proposed_relationships": decision.get("relationships") or [],
                "prompt_version": GRADUATION_PROMPT_VERSION,
            },
            "status": "candidate",
            "confidence": decision["confidence"],
            "source_refs": [f"memory:{memory_id}"],
        }
        result = api_post(base_url, "/entities", payload)
        return result.get("id")
    except Exception as e:
        print(f"  ! entity create failed: {e}", file=sys.stderr)
        return None


def main() -> None:
    """Run one graduation batch pass from the command line.

    Steps:
      1. Fetch up to ``--limit`` active memories (optionally for one project).
      2. Drop memories below ``--min-confidence`` or already ``graduated``.
      3. Ask ``graduate_one`` for a decision on each remaining memory.
      4. Unless ``--dry-run``, create an entity candidate for every memory
         the model chose to graduate.

    Prints one line per memory plus a final summary. Exits with an error
    message if the initial memory fetch fails.
    """
    # Function-scope import: only needed to build the query string below.
    from urllib.parse import urlencode

    parser = argparse.ArgumentParser(description="Graduate active memories into entity candidates")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--project", default=None, help="Only graduate memories in this project")
    parser.add_argument("--limit", type=int, default=50, help="Max memories to process")
    parser.add_argument("--min-confidence", type=float, default=0.3,
                        help="Skip memories with confidence below this (they're probably noise)")
    parser.add_argument("--dry-run", action="store_true", help="Show decisions without creating entities")
    args = parser.parse_args()

    # Fetch active memories. urlencode escapes the project name so spaces,
    # '&' or '#' in it cannot corrupt the query string.
    params: dict[str, Any] = {"status": "active", "limit": args.limit}
    if args.project:
        params["project"] = args.project
    try:
        result = api_get(args.base_url, f"/memory?{urlencode(params)}")
    except (OSError, ValueError) as exc:
        # OSError covers URLError/HTTPError/timeouts; ValueError covers a
        # malformed URL or a non-JSON body. Exit code stays non-zero.
        sys.exit(f"failed to fetch memories from {args.base_url}: {exc}")
    memories = result.get("memories", [])

    # Filter by min_confidence + skip already-graduated. A missing or null
    # confidence counts as 0 instead of raising TypeError on comparison.
    memories = [
        m for m in memories
        if (m.get("confidence") or 0) >= args.min_confidence
        and m.get("status") != "graduated"
    ]

    print(f"graduating: {len(memories)} memories  project={args.project or '(all)'}  "
          f"model={args.model}  dry_run={args.dry_run}")

    graduated = 0
    skipped = 0
    errors = 0
    entities_created: list[str] = []

    for i, mem in enumerate(memories, 1):
        if i > 1:
            time.sleep(0.5)  # light pacing, matches auto_triage
        mid = mem["id"]
        label = f"[{i:3d}/{len(memories)}] {mid[:8]} [{mem.get('memory_type','?')}]"

        decision = graduate_one(mem, args.model, DEFAULT_TIMEOUT_S)
        if decision is None:
            print(f"  ERROR  {label}  (graduate_one returned None)")
            errors += 1
            continue

        if not decision.get("graduate"):
            reason = decision.get("reason", "(no reason)")
            print(f"  skip   {label}  {reason}")
            skipped += 1
            continue

        # NOTE(review): assumes the parser always supplies entity_type and
        # name when "graduate" is truthy — confirm against the prompt module.
        etype = decision["entity_type"]
        ename = decision["name"]
        nrel = len(decision.get("relationships") or [])

        if args.dry_run:
            print(f"  WOULD  {label}  → [{etype}] {ename!r}  ({nrel} rels)")
            graduated += 1
            continue

        entity_id = create_entity_candidate(args.base_url, decision, mem)
        if entity_id:
            print(f"  CREATE {label}  → [{etype}] {ename!r}  ({nrel} rels)  entity={entity_id[:8]}")
            graduated += 1
            entities_created.append(entity_id)
        else:
            # Emit a labelled line so the failing memory is identifiable in
            # the stdout log, matching the graduate_one failure path.
            print(f"  ERROR  {label}  (entity create failed)")
            errors += 1

    print(f"\ntotal: graduated={graduated} skipped={skipped} errors={errors}")
    if entities_created:
        print(f"Review at /admin/triage ({len(entities_created)} entity candidates created)")

# Run the batch pass only when executed as a script, not when imported.
if __name__ == "__main__":
    main()