Three additive upgrades borrowed from Karpathy's LLM Wiki pattern: 1. CONTRADICTION DETECTION: auto-triage now has a fourth verdict — "contradicts". When a candidate conflicts with an existing memory (not duplicates, genuine disagreement like "Option A selected" vs "Option B selected"), the triage model flags it and leaves it in the queue for human review instead of silently rejecting or double-storing. Preserves source tension rather than suppressing it. 2. WEEKLY LINT PASS: scripts/lint_knowledge_base.py checks for: - Orphan memories (active but zero references after 14 days) - Stale candidates (>7 days unreviewed) - Unused entities (no relationships) - Empty-state projects - Unregistered projects auto-detected in memories Runs on Sundays via cron. Outputs a report. 3. WEEKLY SYNTHESIS: scripts/synthesize_projects.py uses Sonnet to generate a 3-5 sentence "current state" paragraph per project from state + memories + entities. Cached in project_state under status/synthesis_cache. Wiki project pages now show this at the top under "Current State (auto-synthesis)". Falls back to a deterministic summary if no cache exists. deploy/dalidou/batch-extract.sh: added Step C (synthesis) and Step D (lint), gated to Sundays via a date check. All additive — nothing existing changes behavior. The database remains the source of truth; these operations just produce better synthesized views and catch rot. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
171 lines
6.0 KiB
Python
"""Weekly lint pass — health check for the AtoCore knowledge base.
|
|
|
|
Inspired by Karpathy's LLM Wiki pattern (the 'lint' operation).
|
|
Checks for orphans, stale claims, contradictions, and gaps.
|
|
Outputs a report that can be posted to the wiki as needs_review.
|
|
|
|
Usage:
|
|
python3 scripts/lint_knowledge_base.py --base-url http://dalidou:8100
|
|
|
|
Run weekly via cron, or on-demand when the knowledge base feels stale.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import urllib.request
|
|
from datetime import datetime, timezone, timedelta
|
|
|
|
# Base URL of the AtoCore API; overridable via the ATOCORE_BASE_URL env var
# or the --base-url CLI flag.
DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://localhost:8100")

# An active memory with zero references and no update for this many days is
# flagged as an orphan.
ORPHAN_AGE_DAYS = 14
|
|
|
|
|
|
def api_get(base_url: str, path: str, timeout: float = 15):
    """GET ``{base_url}{path}`` and return the decoded JSON body.

    Args:
        base_url: API root, e.g. ``http://dalidou:8100`` (no trailing slash).
        path: Absolute path including any query string, e.g. ``/memory?limit=500``.
        timeout: Socket timeout in seconds; defaults to the previous
            hard-coded 15s but is now tunable by callers.

    Raises:
        urllib.error.URLError / HTTPError on connection or HTTP failure,
        json.JSONDecodeError on a non-JSON body. Callers that want
        best-effort behavior wrap this themselves.
    """
    with urllib.request.urlopen(f"{base_url}{path}", timeout=timeout) as resp:
        return json.loads(resp.read())
|
|
|
|
|
|
def parse_ts(ts: str) -> datetime | None:
|
|
if not ts:
|
|
return None
|
|
try:
|
|
return datetime.strptime(ts[:19], "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def _print_report(findings: dict) -> None:
    """Print the human-readable lint report, one section per check."""
    print(f"## Orphan memories (active, no reinforcement, >{ORPHAN_AGE_DAYS} days old)")
    if findings["orphan_memories"]:
        print(f" Found: {len(findings['orphan_memories'])}")
        for o in findings["orphan_memories"][:10]:
            print(f" - [{o['type']}] {o['project']} ({o['age_days']}d): {o['content']}")
    else:
        print(" (none)")

    print("\n## Stale candidates (>7 days in queue)")
    if findings["stale_candidates"]:
        print(f" Found: {len(findings['stale_candidates'])}")
        for s in findings["stale_candidates"][:10]:
            print(f" - ({s['age_days']}d): {s['content']}")
    else:
        print(" (none)")

    print("\n## Unused entities (no relationships)")
    if findings["unused_entities"]:
        print(f" Found: {len(findings['unused_entities'])}")
        for u in findings["unused_entities"][:10]:
            print(f" - [{u['type']}] {u['project']}: {u['name']}")
    else:
        print(" (none)")

    print("\n## Empty-state projects")
    if findings["empty_state_projects"]:
        print(f" Found: {len(findings['empty_state_projects'])}")
        for p in findings["empty_state_projects"]:
            print(f" - {p}")
    else:
        print(" (none)")

    print("\n## Unregistered projects detected in memories")
    if findings["unregistered_projects"]:
        print(f" Found: {len(findings['unregistered_projects'])}")
        print(" These were auto-detected by extraction — consider registering them:")
        for p in findings["unregistered_projects"]:
            print(f" - {p}")
    else:
        print(" (none)")


def main():
    """Run all lint checks against the AtoCore API and print a report.

    Checks, in order:
      1. Orphan memories   — active, zero references, untouched for ORPHAN_AGE_DAYS.
      2. Stale candidates  — in the triage queue for more than 7 days.
      3. Unused entities   — no relationships in either direction.
      4. Empty-state projects — registered but with no state entries.
      5. Unregistered projects — project tags in memories with no registration.

    Returns:
        0 when the knowledge base is clean, 1 when any findings exist
        (suitable as a cron/CI exit code).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    args = parser.parse_args()
    b = args.base_url
    now = datetime.now(timezone.utc)
    orphan_threshold = now - timedelta(days=ORPHAN_AGE_DAYS)

    print(f"=== AtoCore Lint — {now.strftime('%Y-%m-%d %H:%M UTC')} ===\n")

    findings = {
        "orphan_memories": [],
        "stale_candidates": [],
        "unused_entities": [],
        "empty_state_projects": [],
        "unregistered_projects": [],
    }

    # 1. Orphan memories: active but never reinforced after N days
    memories = api_get(b, "/memory?active_only=true&limit=500").get("memories", [])
    for m in memories:
        updated = parse_ts(m.get("updated_at", ""))
        if m.get("reference_count", 0) == 0 and updated and updated < orphan_threshold:
            findings["orphan_memories"].append({
                "id": m["id"],
                "type": m["memory_type"],
                "project": m.get("project") or "(none)",
                "age_days": (now - updated).days,
                "content": m["content"][:120],
            })

    # 2. Stale candidates: been in queue > 7 days without triage
    candidates = api_get(b, "/memory?status=candidate&limit=500").get("memories", [])
    stale_threshold = now - timedelta(days=7)
    for c in candidates:
        updated = parse_ts(c.get("updated_at", ""))
        if updated and updated < stale_threshold:
            findings["stale_candidates"].append({
                "id": c["id"],
                "age_days": (now - updated).days,
                "content": c["content"][:120],
            })

    # 3. Unused entities: no relationships in either direction.
    # Best-effort per entity: a failed detail fetch (or unexpected shape)
    # skips that entity rather than aborting the whole lint pass.
    entities = api_get(b, "/entities?limit=500").get("entities", [])
    for e in entities:
        try:
            detail = api_get(b, f"/entities/{e['id']}")
            if not detail.get("relationships"):
                findings["unused_entities"].append({
                    "id": e["id"],
                    "type": e["entity_type"],
                    "name": e["name"],
                    "project": e.get("project") or "(none)",
                })
        except Exception:
            continue

    # 4. Registered projects with no state entries.
    # BUG FIX: `projects` was previously assigned only inside this try block,
    # so a failed /projects call made step 5 below crash with NameError
    # instead of degrading gracefully. Default to an empty list first.
    projects = []
    try:
        projects = api_get(b, "/projects").get("projects", [])
        for p in projects:
            state = api_get(b, f"/project/state/{p['id']}").get("entries", [])
            if not state:
                findings["empty_state_projects"].append(p["id"])
    except Exception:
        pass

    # 5. Memories tagged to unregistered projects (auto-detection candidates).
    # A project counts as registered if it matches an id or any alias.
    registered_ids = {p["id"] for p in projects} | {
        a for p in projects for a in p.get("aliases", [])
    }
    all_mems = api_get(b, "/memory?limit=500").get("memories", [])
    for m in all_mems:
        proj = m.get("project", "")
        if proj and proj not in registered_ids and proj != "(none)":
            if proj not in findings["unregistered_projects"]:
                findings["unregistered_projects"].append(proj)

    _print_report(findings)

    # Every findings value is a list, so a plain len() sum suffices.
    total_findings = sum(len(v) for v in findings.values())
    print(f"\n=== Total findings: {total_findings} ===")

    # Return exit code based on findings count (for CI)
    return 0 if total_findings == 0 else 1
|
|
|
|
|
|
if __name__ == "__main__":
|
|
raise SystemExit(main())
|