feat: Karpathy-inspired upgrades — contradiction, lint, synthesis

Three additive upgrades borrowed from Karpathy's LLM Wiki pattern: 1. CONTRADICTION DETECTION: auto-triage now has a fourth verdict — "contradicts". When a candidate conflicts with an existing memory (not a duplicate, but a genuine disagreement such as "Option A selected" vs "Option B selected"), the triage model flags it and leaves it in the queue for human review instead of silently rejecting or double-storing it. This preserves source tension rather than suppressing it. 2. WEEKLY LINT PASS: scripts/lint_knowledge_base.py checks for: - Orphan memories (active but zero references after 14 days) - Stale candidates (>7 days unreviewed) - Unused entities (no relationships) - Empty-state projects - Unregistered projects auto-detected in memories. It runs on Sundays via cron and outputs a report. 3. WEEKLY SYNTHESIS: scripts/synthesize_projects.py uses sonnet to generate a 3-5 sentence "current state" paragraph per project from state + memories + entities. The result is cached in project_state under status/synthesis_cache. Wiki project pages now show this at the top under "Current State (auto-synthesis)" and fall back to a deterministic summary if no cache exists. deploy/dalidou/batch-extract.sh: added Step C (synthesis) and Step D (lint), gated to Sundays via a date check. All changes are additive — no existing behavior changes. The database remains the source of truth; these operations just produce better synthesized views and catch rot. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 21:08:13 -04:00
parent 761c483474
commit c1f5b3bdee
5 changed files with 421 additions and 5 deletions
--- a/scripts/synthesize_projects.py
+++ b/scripts/synthesize_projects.py
@@ -0,0 +1,168 @@
+"""Weekly project synthesis — LLM-generated 'current state' paragraph per project.
+
+Reads each registered project's state entries, memories, and entities,
+asks sonnet for a 3-5 sentence synthesis, and caches it under
+project_state/status/synthesis_cache. The wiki's project page reads
+this cached synthesis as the top band.
+
+Runs weekly via cron (or manually). Cheap — one LLM call per project.
+
+Usage:
+  python3 scripts/synthesize_projects.py --base-url http://localhost:8100
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+import subprocess
+import tempfile
+import urllib.request
+
+# Root URL of the AtoCore HTTP API. The ATOCORE_BASE_URL environment variable
+# replaces the localhost default; the --base-url flag overrides either.
+DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://localhost:8100")
+# Model name forwarded to `claude --model`. Override with the
+# ATOCORE_SYNTHESIS_MODEL environment variable or the --model flag.
+DEFAULT_MODEL = os.environ.get("ATOCORE_SYNTHESIS_MODEL", "sonnet")
+# Timeout, in seconds, for each `claude` subprocess call. The HTTP helpers do
+# not use this value; they pass their own fixed 15-second timeout.
+TIMEOUT_S = 60
+
+# System prompt handed to the claude CLI via --append-system-prompt for every
+# project. It asks for under 500 characters; synthesize_project() separately
+# rejects replies shorter than 50 characters and truncates at 1000.
+SYSTEM_PROMPT = """You are summarizing the current state of an engineering project for a personal context engine called AtoCore.
+
+You will receive:
+- Project state entries (decisions, requirements, status)
+- Active memories tagged to this project
+- Entity graph (subsystems, components, materials, decisions)
+
+Write a 3-5 sentence synthesis covering:
+1. What the project is and its current stage
+2. The key locked-in decisions and architecture
+3. What the next focus is
+
+Rules:
+- Plain prose, no bullet lists
+- Factual, grounded in what the data says — don't invent or speculate
+- Present tense
+- Under 500 characters total
+- No markdown formatting, just prose
+- If the data is sparse, say so honestly ("limited project data available")
+
+Output ONLY the synthesis paragraph. No preamble, no JSON, no markdown headers."""
+
+
+# Scratch directory used as the working directory of the claude subprocess.
+# Stays None until get_cwd() creates it on first use.
+_cwd = None
+
+
+def get_cwd():
+    """Return the scratch working directory for claude calls, creating it once.
+
+    The directory is created with tempfile.mkdtemp() on the first call and
+    the same path is returned by every later call in this process. It is
+    removed at interpreter exit so that repeated cron runs do not pile up
+    "ato-synth-*" directories in the system temp location.
+
+    Returns:
+        Path of the scratch directory as a string.
+    """
+    global _cwd
+    if _cwd is None:
+        import atexit  # local import: only needed when the directory is created
+
+        _cwd = tempfile.mkdtemp(prefix="ato-synth-")
+        # mkdtemp() leaves cleanup to the caller; without this hook every
+        # run would leak one directory.
+        atexit.register(shutil.rmtree, _cwd, ignore_errors=True)
+    return _cwd
+
+
+def api_get(base_url, path):
+    """Fetch ``base_url + path`` with a GET request and return the parsed JSON.
+
+    The request uses a 15-second timeout. Network errors and JSON decoding
+    errors are not handled here and propagate to the caller.
+    """
+    url = f"{base_url}{path}"
+    with urllib.request.urlopen(url, timeout=15) as response:
+        payload = response.read()
+    return json.loads(payload)
+
+
+def api_post(base_url, path, body):
+    """Send ``body`` as a JSON POST to ``base_url + path`` and return the reply.
+
+    ``body`` is serialized with json.dumps and sent UTF-8 encoded with a
+    ``Content-Type: application/json`` header. The request uses a 15-second
+    timeout; the response body is parsed as JSON and returned. Errors
+    propagate to the caller.
+    """
+    request = urllib.request.Request(
+        f"{base_url}{path}",
+        data=json.dumps(body).encode("utf-8"),
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    with urllib.request.urlopen(request, timeout=15) as response:
+        raw = response.read()
+    return json.loads(raw)
+
+
+def synthesize_project(base_url, project_id, model):
+    """Generate a short "current state" paragraph for one project.
+
+    Fetches the project's state entries, active memories and entities from
+    the API, formats them into a plain-text prompt and asks the claude CLI
+    for a synthesis.
+
+    Args:
+        base_url: Root URL of the API.
+        project_id: Project identifier; URL-quoted before it is placed in
+            request URLs, shown unmodified in the prompt and in messages.
+        model: Model name forwarded to ``claude --model``.
+
+    Returns:
+        The synthesis text, truncated to 1000 characters, or None when the
+        project has no source data, an API request fails, the claude CLI is
+        missing or fails, or the reply is shorter than 50 characters.
+    """
+    from urllib.parse import quote  # local import keeps this block self-contained
+
+    # The id lands in a path segment and in query strings; an unquoted "&",
+    # "#", "/" or space would silently change which resource is requested.
+    pid = quote(str(project_id), safe="")
+
+    # Gather context. A failure for one project is reported and skipped, the
+    # same way claude failures are handled below, so the batch can continue.
+    try:
+        state = api_get(base_url, f"/project/state/{pid}").get("entries", [])
+        memories = api_get(
+            base_url, f"/memory?project={pid}&active_only=true&limit=20"
+        ).get("memories", [])
+        entities = api_get(
+            base_url, f"/entities?project={pid}&limit=50"
+        ).get("entities", [])
+    except (OSError, ValueError) as e:
+        # OSError covers urllib's URLError/HTTPError and socket timeouts;
+        # ValueError covers malformed JSON in the response body.
+        print(f"  ! API fetch failed for {project_id}: {e}")
+        return None
+
+    # The synthesis cached by a previous run is itself a state entry. Drop it
+    # before anything else so it is neither fed back to the model, counted as
+    # source data, nor allowed to use up one of the 15 state slots.
+    state = [e for e in state if e.get("key") != "synthesis_cache"]
+
+    if not (state or memories or entities):
+        return None
+
+    lines = [f"PROJECT: {project_id}\n"]
+    if state:
+        lines.append("STATE ENTRIES:")
+        for e in state[:15]:
+            # str() guards the slice against non-string values.
+            lines.append(f"  [{e['category']}] {e['key']}: {str(e['value'])[:200]}")
+
+    if memories:
+        lines.append("\nACTIVE MEMORIES:")
+        for m in memories[:10]:
+            lines.append(f"  [{m['memory_type']}] {str(m['content'])[:200]}")
+
+    if entities:
+        lines.append("\nENTITIES:")
+        # Group entity names by type; at most 8 names are listed per type.
+        by_type = {}
+        for e in entities:
+            by_type.setdefault(e["entity_type"], []).append(e["name"])
+        for t, names in by_type.items():
+            lines.append(f"  {t}: {', '.join(names[:8])}")
+
+    user_msg = "\n".join(lines) + "\n\nWrite the synthesis paragraph now."
+
+    if not shutil.which("claude"):
+        print(f"  ! claude CLI not available, skipping {project_id}")
+        return None
+
+    try:
+        # Argument list (no shell), so the prompt text is never shell-parsed.
+        result = subprocess.run(
+            ["claude", "-p", "--model", model,
+             "--append-system-prompt", SYSTEM_PROMPT,
+             "--disable-slash-commands",
+             user_msg],
+            capture_output=True, text=True, timeout=TIMEOUT_S,
+            cwd=get_cwd(), encoding="utf-8", errors="replace",
+        )
+    except (OSError, ValueError, subprocess.SubprocessError) as e:
+        # SubprocessError includes TimeoutExpired; ValueError is raised for
+        # arguments that cannot be passed to a process (embedded NUL).
+        print(f"  ! subprocess failed for {project_id}: {e}")
+        return None
+
+    if result.returncode != 0:
+        print(f"  ! claude exit {result.returncode} for {project_id}")
+        return None
+
+    synthesis = (result.stdout or "").strip()
+    # Very short replies are treated as failures rather than cached.
+    if len(synthesis) < 50:
+        return None
+    # Hard cap on what gets stored, whatever length the model produced.
+    return synthesis[:1000]
+
+
+def main():
+    """Command-line entry point: synthesize and cache a summary per project.
+
+    Parses --base-url, --model and --project, lists the registered projects
+    from the API (optionally narrowed to the one named by --project), and for
+    each project that yields a synthesis posts it to /project/state under
+    category "status", key "synthesis_cache". A failed save is reported and
+    the loop moves on to the next project.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--base-url", default=DEFAULT_BASE_URL)
+    cli.add_argument("--model", default=DEFAULT_MODEL)
+    cli.add_argument("--project", default=None, help="single project to synthesize")
+    opts = cli.parse_args()
+
+    targets = api_get(opts.base_url, "/projects").get("projects", [])
+    if opts.project:
+        # Keep only the registered project whose id matches the flag.
+        targets = [entry for entry in targets if entry["id"] == opts.project]
+
+    print(f"Synthesizing {len(targets)} project(s) with {opts.model}...")
+
+    for entry in targets:
+        project_id = entry["id"]
+        print(f"\n- {project_id}")
+        text = synthesize_project(opts.base_url, project_id, opts.model)
+        if not text:
+            # Nothing to cache for this project; move on.
+            continue
+
+        print(f"  {text[:200]}...")
+        payload = {
+            "project": project_id,
+            "category": "status",
+            "key": "synthesis_cache",
+            "value": text,
+            "source": "weekly synthesis pass",
+        }
+        try:
+            api_post(opts.base_url, "/project/state", payload)
+            print("  + cached")
+        except Exception as err:
+            # Best effort: one failed save must not stop the other projects.
+            print(f"  ! save failed: {err}")
+
+
+if __name__ == "__main__":
+    main()