feat: Phase 7C — tag canonicalization (autonomous, weekly)

LLM proposes alias→canonical mappings for domain_tags; confidence >= 0.8
auto-apply, below goes to human triage. Protects project identifiers
(p04, p05, p06, atocore, apm, etc.) from ever being canonicalized
since they're their own namespace, not concepts.

Problem solved: tag drift fragments retrieval. "fw" vs "firmware" vs
"firmware-control" all mean the same thing, but cross-cutting queries
that filter by tag only hit one variant. Weekly canonicalization pass
keeps the tag graph clean.

- Schema: tag_aliases table (pending | approved | rejected)
- atocore.memory._tag_canon_prompt (stdlib-only, protected project tokens)
- service: get_tag_distribution, apply_tag_alias (atomic per-memory,
  dedupes if both alias + canonical present), create / approve / reject
  proposal lifecycle, per-memory audit rows with action="tag_canonicalized"
- scripts/canonicalize_tags.py: host-side detector, autonomous by default,
  --no-auto-approve kill switch
- 6 API endpoints under /admin/tags/* (distribution, list, propose,
  apply, approve/{id}, reject/{id})
- Step B4 in batch-extract.sh (Sundays only — weekly cadence)
- 26 new tests (prompt parser, normalizer protections, distribution
  counting, rewrite atomicity, dedup, audit, lifecycle). 414 → 440.

Design: aggressive protection of project tokens because a false
canonicalization (p04 → p04-gigabit, or vice versa) would scramble
cross-project filtering. Err toward preservation; the alias only
applies if the model is very confident AND both strings appear in
the current distribution.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-19 09:41:02 -04:00
parent e840ef4be3
commit 877b97ec78
7 changed files with 1085 additions and 0 deletions

View File

@@ -0,0 +1,254 @@
#!/usr/bin/env python3
"""Phase 7C — tag canonicalization detector.
Weekly (or on-demand) LLM pass that:
1. Fetches the tag distribution across all active memories via HTTP
2. Asks claude-p to propose alias→canonical mappings
3. AUTO-APPLIES aliases with confidence >= AUTO_APPROVE_CONF (0.8)
4. Submits lower-confidence proposals as pending for human review
Autonomous by default — matches the Phase 7A.1 pattern. Set
--no-auto-approve to force every proposal into human review.
Host-side because claude CLI lives on Dalidou, not the container.
Reuses the PYTHONPATH=src pattern from scripts/memory_dedup.py.
Usage:
python3 scripts/canonicalize_tags.py [--base-url URL] [--dry-run] [--no-auto-approve]
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.request
# --- Import-path bootstrap --------------------------------------------------
# Host-side script: make ../src importable without installing the package
# (same PYTHONPATH=src pattern as scripts/memory_dedup.py, per the docstring).
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_SRC_DIR = os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "src"))
if _SRC_DIR not in sys.path:
    sys.path.insert(0, _SRC_DIR)

# Project-local prompt helpers; must come after the sys.path tweak, hence the
# E402 suppression. PROTECTED_PROJECT_TOKENS is re-exported for callers even
# though this script does not reference it directly.
from atocore.memory._tag_canon_prompt import (  # noqa: E402
    PROTECTED_PROJECT_TOKENS,
    SYSTEM_PROMPT,
    TAG_CANON_PROMPT_VERSION,
    build_user_message,
    normalize_alias_item,
    parse_canon_output,
)

# Tunables — each overridable via environment variable.
DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://127.0.0.1:8100")
DEFAULT_MODEL = os.environ.get("ATOCORE_TAG_CANON_MODEL", "sonnet")
# Per-LLM-call subprocess timeout, seconds.
DEFAULT_TIMEOUT_S = float(os.environ.get("ATOCORE_TAG_CANON_TIMEOUT_S", "90"))
# Proposals at or above this confidence are applied without human review.
AUTO_APPROVE_CONF = float(os.environ.get("ATOCORE_TAG_CANON_AUTO_APPROVE_CONF", "0.8"))
# An alias must occur at least this many times in the distribution to be considered.
MIN_ALIAS_COUNT = int(os.environ.get("ATOCORE_TAG_CANON_MIN_ALIAS_COUNT", "1"))
# Lazily-created scratch directory used as the cwd for claude CLI runs.
_sandbox_cwd = None


def get_sandbox_cwd() -> str:
    """Return a process-wide throwaway working directory for the claude CLI.

    Created on first use and reused for every subsequent call; the prefix
    makes any leftover directories easy to spot in the temp area.
    """
    global _sandbox_cwd
    if _sandbox_cwd is not None:
        return _sandbox_cwd
    _sandbox_cwd = tempfile.mkdtemp(prefix="ato-tagcanon-")
    return _sandbox_cwd
def api_get(base_url: str, path: str) -> dict:
    """GET ``base_url + path`` and return the JSON-decoded response body."""
    request = urllib.request.Request(f"{base_url}{path}")
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    return json.loads(payload.decode("utf-8"))
def api_post(base_url: str, path: str, body: dict | None = None) -> dict:
    """POST *body* (default ``{}``) as JSON to ``base_url + path``.

    Returns the JSON-decoded response body.
    """
    payload = json.dumps(body if body is not None else {}).encode("utf-8")
    request = urllib.request.Request(
        f"{base_url}{path}",
        method="POST",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        return json.loads(response.read().decode("utf-8"))
def call_claude(user_message: str, model: str, timeout_s: float) -> tuple[str | None, str | None]:
    """Invoke the claude CLI and return ``(stdout, error)``.

    Exactly one element is ``None``: stripped stdout on success, otherwise a
    short description of the last failure. Makes up to 3 attempts with
    exponential backoff (2s, 4s), each run inside an isolated temp cwd so the
    CLI cannot pick up local project state.
    """
    if not shutil.which("claude"):
        return None, "claude CLI not available"
    cmd = [
        "claude", "-p",
        "--model", model,
        "--append-system-prompt", SYSTEM_PROMPT,
        "--disable-slash-commands",
        user_message,
    ]
    failure = ""
    for attempt in range(3):
        if attempt:
            time.sleep(2 ** attempt)  # backoff before each retry
        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout_s,
                cwd=get_sandbox_cwd(),
                encoding="utf-8",
                errors="replace",
            )
        except subprocess.TimeoutExpired:
            failure = f"{model} timed out"
        except Exception as exc:
            failure = f"subprocess error: {exc}"
        else:
            if proc.returncode == 0:
                return (proc.stdout or "").strip(), None
            trimmed = (proc.stderr or "").strip()[:200]
            failure = f"{model} exit {proc.returncode}: {trimmed}"
    return None, failure
def fetch_tag_distribution(base_url: str) -> dict[str, int]:
    """Count tag occurrences across active memories (client-side).

    Tags are stripped and lowercased before counting. Malformed entries
    (non-list ``domain_tags``, non-string tags, unparseable JSON strings) are
    silently skipped. On API failure, logs to stderr and returns ``{}``.
    """
    # NOTE(review): limit=2000 caps the fetch — distributions with more
    # memories are silently truncated; confirm against expected corpus size.
    try:
        payload = api_get(base_url, "/memory?active_only=true&limit=2000")
    except Exception as exc:
        print(f"ERROR: could not fetch memories: {exc}", file=sys.stderr)
        return {}
    counts: dict[str, int] = {}
    for memory in payload.get("memories", []):
        raw_tags = memory.get("domain_tags") or []
        if isinstance(raw_tags, str):
            # Some rows store tags as a JSON-encoded string; decode best-effort.
            try:
                raw_tags = json.loads(raw_tags)
            except Exception:
                raw_tags = []
        if not isinstance(raw_tags, list):
            continue
        for tag in raw_tags:
            if not isinstance(tag, str):
                continue
            normalized = tag.strip().lower()
            if normalized:
                counts[normalized] = counts.get(normalized, 0) + 1
    return counts
def main() -> None:
    """Entry point: fetch the tag distribution, ask the LLM for
    alias→canonical mappings, auto-apply high-confidence ones (when
    autonomous), and queue the rest for human review.

    Exits quietly (status 0) when there are no tags or the LLM call fails;
    individual apply/propose failures are logged to stderr and skipped so
    one bad item cannot abort the weekly pass.
    """
    parser = argparse.ArgumentParser(description="Phase 7C tag canonicalization detector")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--timeout-s", type=float, default=DEFAULT_TIMEOUT_S)
    parser.add_argument("--no-auto-approve", action="store_true",
                        help="Disable autonomous apply; all proposals → human queue")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print decisions without touching state")
    args = parser.parse_args()

    base = args.base_url.rstrip("/")
    autonomous = not args.no_auto_approve
    print(
        f"canonicalize_tags {TAG_CANON_PROMPT_VERSION} | model={args.model} | "
        f"autonomous={autonomous} | auto-approve conf>={AUTO_APPROVE_CONF}"
    )

    dist = fetch_tag_distribution(base)
    print(f"tag distribution: {len(dist)} unique tags, "
          f"{sum(dist.values())} total references")
    if not dist:
        print("no tags found — nothing to canonicalize")
        return

    raw, err = call_claude(build_user_message(dist), args.model, args.timeout_s)
    if err or raw is None:
        print(f"ERROR: LLM call failed: {err}", file=sys.stderr)
        return
    aliases_raw = parse_canon_output(raw)
    print(f"LLM returned {len(aliases_raw)} raw alias proposals")

    # Counters for the end-of-run summary line.
    auto_applied = 0
    auto_skipped_missing_canonical = 0
    proposals_created = 0
    duplicates_skipped = 0

    for item in aliases_raw:
        norm = normalize_alias_item(item)
        if norm is None:
            continue
        alias = norm["alias"]
        canonical = norm["canonical"]
        confidence = norm["confidence"]
        alias_count = dist.get(alias, 0)
        canonical_count = dist.get(canonical, 0)

        # Sanity: alias must actually exist in the current distribution
        if alias_count < MIN_ALIAS_COUNT:
            # BUGFIX: alias and canonical previously ran together with no
            # separator (f"{alias!r}{canonical!r}"), producing unreadable logs.
            print(f" SKIP {alias!r} → {canonical!r}: alias not in distribution")
            continue
        # Never rewrite toward a canonical tag nobody currently uses.
        if canonical_count == 0:
            auto_skipped_missing_canonical += 1
            print(f" SKIP {alias!r} → {canonical!r}: canonical missing from distribution")
            continue

        label = f"{alias!r} ({alias_count}) → {canonical!r} ({canonical_count}) conf={confidence:.2f}"

        if autonomous and confidence >= AUTO_APPROVE_CONF:
            if args.dry_run:
                auto_applied += 1
                print(f" [dry-run] would auto-apply: {label}")
                continue
            try:
                result = api_post(base, "/admin/tags/aliases/apply", {
                    "alias": alias, "canonical": canonical,
                    "confidence": confidence, "reason": norm["reason"],
                    "alias_count": alias_count, "canonical_count": canonical_count,
                    "actor": "auto-tag-canon",
                })
                touched = result.get("memories_touched", 0)
                auto_applied += 1
                print(f" ✅ auto-applied: {label} ({touched} memories)")
            except Exception as e:
                # BUGFIX: separator between label and error was missing.
                print(f" ⚠️ auto-apply failed: {label}: {e}", file=sys.stderr)
            time.sleep(0.2)  # gentle pacing against the API
            continue

        # Lower confidence → human review
        if args.dry_run:
            proposals_created += 1
            print(f" [dry-run] would propose for review: {label}")
            continue
        try:
            result = api_post(base, "/admin/tags/aliases/propose", {
                "alias": alias, "canonical": canonical,
                "confidence": confidence, "reason": norm["reason"],
                "alias_count": alias_count, "canonical_count": canonical_count,
            })
            # The service returns no proposal_id when an identical proposal
            # is already pending — count it as a duplicate, not a failure.
            if result.get("proposal_id"):
                proposals_created += 1
                print(f" → pending proposal: {label}")
            else:
                duplicates_skipped += 1
                print(f" (duplicate pending proposal): {label}")
        except Exception as e:
            # BUGFIX: separator between label and error was missing.
            print(f" ⚠️ propose failed: {label}: {e}", file=sys.stderr)
        time.sleep(0.2)

    print(
        f"\nsummary: proposals_seen={len(aliases_raw)} "
        f"auto_applied={auto_applied} "
        f"proposals_created={proposals_created} "
        f"duplicates_skipped={duplicates_skipped} "
        f"skipped_missing_canonical={auto_skipped_missing_canonical}"
    )


if __name__ == "__main__":
    main()