feat: Karpathy-inspired upgrades — contradiction, lint, synthesis
Three additive upgrades borrowed from Karpathy's LLM Wiki pattern: 1. CONTRADICTION DETECTION: auto-triage now has a fourth verdict — "contradicts". When a candidate conflicts with an existing memory (not duplicates, genuine disagreement like "Option A selected" vs "Option B selected"), the triage model flags it and leaves it in the queue for human review instead of silently rejecting or double-storing. Preserves source tension rather than suppressing it. 2. WEEKLY LINT PASS: scripts/lint_knowledge_base.py checks for: - Orphan memories (active but zero references after 14 days) - Stale candidates (>7 days unreviewed) - Unused entities (no relationships) - Empty-state projects - Unregistered projects auto-detected in memories Runs Sundays via the cron. Outputs a report. 3. WEEKLY SYNTHESIS: scripts/synthesize_projects.py uses sonnet to generate a 3-5 sentence "current state" paragraph per project from state + memories + entities. Cached in project_state under status/synthesis_cache. Wiki project pages now show this at the top under "Current State (auto-synthesis)". Falls back to a deterministic summary if no cache exists. deploy/dalidou/batch-extract.sh: added Step C (synthesis) and Step D (lint) gated to Sundays via date check. All additive — nothing existing changes behavior. The database remains the source of truth; these operations just produce better synthesized views and catch rot. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
168
scripts/synthesize_projects.py
Normal file
168
scripts/synthesize_projects.py
Normal file
@@ -0,0 +1,168 @@
|
||||
"""Weekly project synthesis — LLM-generated 'current state' paragraph per project.
|
||||
|
||||
Reads each registered project's state entries, memories, and entities,
|
||||
asks sonnet for a 3-5 sentence synthesis, and caches it under
|
||||
project_state/status/synthesis_cache. The wiki's project page reads
|
||||
this cached synthesis as the top band.
|
||||
|
||||
Runs weekly via cron (or manually). Cheap — one LLM call per project.
|
||||
|
||||
Usage:
|
||||
python3 scripts/synthesize_projects.py --base-url http://localhost:8100
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import urllib.request
|
||||
|
||||
# Defaults for the CLI flags; each can be overridden through the environment.
DEFAULT_BASE_URL = os.getenv("ATOCORE_BASE_URL", "http://localhost:8100")
DEFAULT_MODEL = os.getenv("ATOCORE_SYNTHESIS_MODEL", "sonnet")

# Seconds allowed for a single claude CLI invocation (passed to subprocess.run).
TIMEOUT_S = 60
|
||||
|
||||
SYSTEM_PROMPT = """You are summarizing the current state of an engineering project for a personal context engine called AtoCore.
|
||||
|
||||
You will receive:
|
||||
- Project state entries (decisions, requirements, status)
|
||||
- Active memories tagged to this project
|
||||
- Entity graph (subsystems, components, materials, decisions)
|
||||
|
||||
Write a 3-5 sentence synthesis covering:
|
||||
1. What the project is and its current stage
|
||||
2. The key locked-in decisions and architecture
|
||||
3. What the next focus is
|
||||
|
||||
Rules:
|
||||
- Plain prose, no bullet lists
|
||||
- Factual, grounded in what the data says — don't invent or speculate
|
||||
- Present tense
|
||||
- Under 500 characters total
|
||||
- No markdown formatting, just prose
|
||||
- If the data is sparse, say so honestly ("limited project data available")
|
||||
|
||||
Output ONLY the synthesis paragraph. No preamble, no JSON, no markdown headers."""
|
||||
|
||||
|
||||
# Path of the per-process scratch directory; created on first get_cwd() call.
_cwd = None


def get_cwd():
    """Return the scratch directory used as cwd for claude CLI calls.

    The directory is created lazily on the first call with
    ``tempfile.mkdtemp`` (prefix ``ato-synth-``); every later call in the
    same process returns the same path.

    ``mkdtemp`` never deletes what it creates, so a cleanup is registered to
    remove the directory when the interpreter exits. Without it each run
    would leave one more ``ato-synth-*`` directory behind.

    Returns:
        str: path of the scratch directory.
    """
    global _cwd
    if _cwd is None:
        import atexit

        _cwd = tempfile.mkdtemp(prefix="ato-synth-")
        # Best-effort removal: ignore_errors keeps a failed cleanup from
        # turning an otherwise successful run into a traceback at exit.
        atexit.register(shutil.rmtree, _cwd, ignore_errors=True)
    return _cwd
|
||||
|
||||
|
||||
def api_get(base_url, path):
    """GET ``base_url + path`` and return the response body parsed as JSON.

    The request uses a 15 second timeout. Errors raised by ``urllib`` or by
    JSON decoding are not caught here.
    """
    url = f"{base_url}{path}"
    with urllib.request.urlopen(url, timeout=15) as response:
        raw = response.read()
    return json.loads(raw)
|
||||
|
||||
|
||||
def api_post(base_url, path, body):
    """POST ``body`` as UTF-8 JSON to ``base_url + path``; return the parsed JSON reply.

    The request uses a 15 second timeout. Errors raised by ``urllib`` or by
    JSON encoding/decoding are not caught here.
    """
    request = urllib.request.Request(
        f"{base_url}{path}",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=15) as response:
        raw = response.read()
    return json.loads(raw)
|
||||
|
||||
|
||||
def _clip(value, limit=200):
    """Return ``value`` as text cut to at most ``limit`` characters (None -> "")."""
    return ("" if value is None else str(value))[:limit]


def _build_user_message(project_id, state, memories, entities):
    """Render the gathered project data as the plain-text prompt body."""
    lines = [f"PROJECT: {project_id}\n"]

    if state:
        lines.append("STATE ENTRIES:")
        for entry in state[:15]:
            # .get() + _clip(): a missing key or a non-string value must not
            # abort the whole synthesis for this project.
            lines.append(
                f" [{entry.get('category', '?')}] {entry.get('key', '?')}: "
                f"{_clip(entry.get('value'))}"
            )

    if memories:
        lines.append("\nACTIVE MEMORIES:")
        for mem in memories[:10]:
            lines.append(f" [{mem.get('memory_type', '?')}] {_clip(mem.get('content'))}")

    if entities:
        lines.append("\nENTITIES:")
        by_type = {}
        for ent in entities:
            by_type.setdefault(ent.get("entity_type", "?"), []).append(str(ent.get("name", "?")))
        for etype, names in by_type.items():
            # At most 8 names per entity type keep the prompt compact.
            lines.append(f" {etype}: {', '.join(names[:8])}")

    return "\n".join(lines) + "\n\nWrite the synthesis paragraph now."


def _run_claude(project_id, model, user_msg):
    """Run the claude CLI once; return its stripped stdout, or None on failure."""
    try:
        result = subprocess.run(
            ["claude", "-p", "--model", model,
             "--append-system-prompt", SYSTEM_PROMPT,
             "--disable-slash-commands",
             user_msg],
            capture_output=True, text=True, timeout=TIMEOUT_S,
            cwd=get_cwd(), encoding="utf-8", errors="replace",
        )
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # SubprocessError covers the timeout, OSError a failed spawn,
        # ValueError arguments the OS rejects (e.g. an embedded NUL byte).
        print(f" ! subprocess failed for {project_id}: {e}")
        return None

    if result.returncode != 0:
        print(f" ! claude exit {result.returncode} for {project_id}")
        detail = (result.stderr or "").strip()
        if detail:
            # Only the tail: the actionable error is normally at the end.
            print(f"   stderr: {detail[-300:]}")
        return None

    return (result.stdout or "").strip()


def synthesize_project(base_url, project_id, model):
    """Generate the 'current state' paragraph for one project.

    Fetches the project's state entries, active memories (requesting at most
    20) and entities (requesting at most 50) from the API, renders them into
    a text prompt and asks the claude CLI for a synthesis.

    Args:
        base_url: Root URL of the API, e.g. ``http://localhost:8100``.
        project_id: Identifier of the project; it is percent-encoded before
            being placed in the request URLs.
        model: Model name passed to ``claude --model``.

    Returns:
        The synthesis text, truncated to 1000 characters, or ``None`` when
        nothing usable was produced: the API fetch failed, the project has no
        data besides a previously cached synthesis, the claude CLI is not
        installed, the CLI failed or timed out, or its output was shorter
        than 50 characters. Every failure path prints a one-line reason.
    """
    from urllib.parse import quote

    # Escape the id so spaces, '&', '#', '/' etc. cannot corrupt the path or
    # the query string.
    pid = quote(str(project_id), safe="")

    # Gather context. Network, HTTP and JSON-decoding errors (all OSError or
    # ValueError subclasses) are reported and mapped to None, like the other
    # failure paths below, so one bad project does not abort a batch run.
    try:
        state = api_get(base_url, f"/project/state/{pid}").get("entries") or []
        memories = api_get(
            base_url, f"/memory?project={pid}&active_only=true&limit=20"
        ).get("memories") or []
        entities = api_get(base_url, f"/entities?project={pid}&limit=50").get("entities") or []
    except (OSError, ValueError) as e:
        print(f" ! API fetch failed for {project_id}: {e}")
        return None

    # Drop the previously cached synthesis before anything else: it must not
    # feed back into the new one, use one of the 15 state slots, or make an
    # otherwise empty project look like it has data.
    state = [e for e in state if e.get("key") != "synthesis_cache"]

    if not (state or memories or entities):
        return None

    user_msg = _build_user_message(project_id, state, memories, entities)

    if not shutil.which("claude"):
        print(f" ! claude CLI not available, skipping {project_id}")
        return None

    synthesis = _run_claude(project_id, model, user_msg)
    if synthesis is None:
        return None

    # Anything under 50 characters is treated as a non-answer.
    if len(synthesis) < 50:
        print(f" ! synthesis too short ({len(synthesis)} chars) for {project_id}, skipping")
        return None
    return synthesis[:1000]
|
||||
|
||||
|
||||
def main():
    """CLI entry point: synthesize and cache a paragraph for each project.

    Flags:
        --base-url  API root (default ``DEFAULT_BASE_URL``).
        --model     Model name for the claude CLI (default ``DEFAULT_MODEL``).
        --project   Restrict the run to the project with this id.

    The project list is read from ``/projects``; a failure of that request
    is fatal and propagates. Each project is then processed independently:
    an error while synthesizing or saving one project is printed and the
    loop moves on to the next. A successful synthesis is stored through
    ``POST /project/state`` under category ``status``, key
    ``synthesis_cache``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--project", default=None, help="single project to synthesize")
    args = parser.parse_args()

    projects = api_get(args.base_url, "/projects").get("projects", [])
    if args.project:
        projects = [p for p in projects if p.get("id") == args.project]
        if not projects:
            # Otherwise a typo in --project looks like a successful empty run.
            print(f" ! project {args.project!r} is not registered; nothing to do")

    print(f"Synthesizing {len(projects)} project(s) with {args.model}...")

    for p in projects:
        pid = p.get("id")
        if not pid:
            # A registry entry without an id cannot be queried or saved.
            continue
        print(f"\n- {pid}")

        # Top-level boundary: report and continue so one failing project
        # does not cancel the remaining ones.
        try:
            synthesis = synthesize_project(args.base_url, pid, args.model)
        except Exception as e:
            print(f" ! synthesis failed: {e}")
            continue

        if not synthesis:
            continue

        print(f" {synthesis[:200]}...")
        try:
            api_post(args.base_url, "/project/state", {
                "project": pid,
                "category": "status",
                "key": "synthesis_cache",
                "value": synthesis,
                "source": "weekly synthesis pass",
            })
        except Exception as e:
            print(f" ! save failed: {e}")
        else:
            print(" + cached")
||||
|
||||
|
||||
# Run the synthesis pass only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user