feat: Phase 7C — tag canonicalization (autonomous, weekly)

LLM proposes alias→canonical mappings for domain_tags; confidence >= 0.8
auto-apply, below goes to human triage. Protects project identifiers
(p04, p05, p06, atocore, apm, etc.) from ever being canonicalized
since they're their own namespace, not concepts.

Problem solved: tag drift fragments retrieval. "fw" vs "firmware" vs
"firmware-control" all mean the same thing, but cross-cutting queries
that filter by tag only hit one variant. Weekly canonicalization pass
keeps the tag graph clean.

- Schema: tag_aliases table (pending | approved | rejected)
- atocore.memory._tag_canon_prompt (stdlib-only, protected project tokens)
- service: get_tag_distribution, apply_tag_alias (atomic per-memory,
  dedupes if both alias + canonical present), create / approve / reject
  proposal lifecycle, per-memory audit rows with action="tag_canonicalized"
- scripts/canonicalize_tags.py: host-side detector, autonomous by default,
  --no-auto-approve kill switch
- 6 API endpoints under /admin/tags/* (distribution, list, propose,
  apply, approve/{id}, reject/{id})
- Step B4 in batch-extract.sh (Sundays only — weekly cadence)
- 26 new tests (prompt parser, normalizer protections, distribution
  counting, rewrite atomicity, dedup, audit, lifecycle). 414 → 440.

Design: aggressive protection of project tokens because a false
canonicalization (p04 → p04-gigabit, or vice versa) would scramble
cross-project filtering. Err toward preservation; the alias only
applies if the model is very confident AND both strings appear in
the current distribution.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-19 09:41:02 -04:00
parent e840ef4be3
commit 877b97ec78
7 changed files with 1085 additions and 0 deletions

View File

@@ -0,0 +1,254 @@
#!/usr/bin/env python3
"""Phase 7C — tag canonicalization detector.
Weekly (or on-demand) LLM pass that:
1. Fetches the tag distribution across all active memories via HTTP
2. Asks claude-p to propose alias→canonical mappings
3. AUTO-APPLIES aliases with confidence >= AUTO_APPROVE_CONF (0.8)
4. Submits lower-confidence proposals as pending for human review
Autonomous by default — matches the Phase 7A.1 pattern. Set
--no-auto-approve to force every proposal into human review.
Host-side because claude CLI lives on Dalidou, not the container.
Reuses the PYTHONPATH=src pattern from scripts/memory_dedup.py.
Usage:
python3 scripts/canonicalize_tags.py [--base-url URL] [--dry-run] [--no-auto-approve]
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.request
# --- Import-path bootstrap --------------------------------------------------
# Host-side script: make ../src importable without installing the package
# (same PYTHONPATH=src pattern as scripts/memory_dedup.py, per the docstring).
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_SRC_DIR = os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "src"))
if _SRC_DIR not in sys.path:
    sys.path.insert(0, _SRC_DIR)

# Project-local prompt helpers; must come after the sys.path tweak, hence the
# E402 suppression. PROTECTED_PROJECT_TOKENS is re-exported for callers even
# though this script does not reference it directly.
from atocore.memory._tag_canon_prompt import (  # noqa: E402
    PROTECTED_PROJECT_TOKENS,
    SYSTEM_PROMPT,
    TAG_CANON_PROMPT_VERSION,
    build_user_message,
    normalize_alias_item,
    parse_canon_output,
)

# Tunables — each overridable via environment variable.
DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://127.0.0.1:8100")
DEFAULT_MODEL = os.environ.get("ATOCORE_TAG_CANON_MODEL", "sonnet")
# Per-LLM-call subprocess timeout, seconds.
DEFAULT_TIMEOUT_S = float(os.environ.get("ATOCORE_TAG_CANON_TIMEOUT_S", "90"))
# Proposals at or above this confidence are applied without human review.
AUTO_APPROVE_CONF = float(os.environ.get("ATOCORE_TAG_CANON_AUTO_APPROVE_CONF", "0.8"))
# An alias must occur at least this many times in the distribution to be considered.
MIN_ALIAS_COUNT = int(os.environ.get("ATOCORE_TAG_CANON_MIN_ALIAS_COUNT", "1"))
# Lazily-created scratch directory used as the cwd for claude CLI runs.
_sandbox_cwd = None


def get_sandbox_cwd() -> str:
    """Return a process-wide throwaway working directory for the claude CLI.

    Created on first use and reused for every subsequent call; the prefix
    makes any leftover directories easy to spot in the temp area.
    """
    global _sandbox_cwd
    if _sandbox_cwd is not None:
        return _sandbox_cwd
    _sandbox_cwd = tempfile.mkdtemp(prefix="ato-tagcanon-")
    return _sandbox_cwd
def api_get(base_url: str, path: str) -> dict:
    """GET ``base_url + path`` and return the JSON-decoded response body."""
    request = urllib.request.Request(f"{base_url}{path}")
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read()
    return json.loads(payload.decode("utf-8"))
def api_post(base_url: str, path: str, body: dict | None = None) -> dict:
    """POST *body* (default ``{}``) as JSON to ``base_url + path``.

    Returns the JSON-decoded response body.
    """
    payload = json.dumps(body if body is not None else {}).encode("utf-8")
    request = urllib.request.Request(
        f"{base_url}{path}",
        method="POST",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        return json.loads(response.read().decode("utf-8"))
def call_claude(user_message: str, model: str, timeout_s: float) -> tuple[str | None, str | None]:
    """Invoke the claude CLI and return ``(stdout, error)``.

    Exactly one element is ``None``: stripped stdout on success, otherwise a
    short description of the last failure. Makes up to 3 attempts with
    exponential backoff (2s, 4s), each run inside an isolated temp cwd so the
    CLI cannot pick up local project state.
    """
    if not shutil.which("claude"):
        return None, "claude CLI not available"
    cmd = [
        "claude", "-p",
        "--model", model,
        "--append-system-prompt", SYSTEM_PROMPT,
        "--disable-slash-commands",
        user_message,
    ]
    failure = ""
    for attempt in range(3):
        if attempt:
            time.sleep(2 ** attempt)  # backoff before each retry
        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout_s,
                cwd=get_sandbox_cwd(),
                encoding="utf-8",
                errors="replace",
            )
        except subprocess.TimeoutExpired:
            failure = f"{model} timed out"
        except Exception as exc:
            failure = f"subprocess error: {exc}"
        else:
            if proc.returncode == 0:
                return (proc.stdout or "").strip(), None
            trimmed = (proc.stderr or "").strip()[:200]
            failure = f"{model} exit {proc.returncode}: {trimmed}"
    return None, failure
def fetch_tag_distribution(base_url: str) -> dict[str, int]:
    """Count tag occurrences across active memories (client-side).

    Tags are stripped and lowercased before counting. Malformed entries
    (non-list ``domain_tags``, non-string tags, unparseable JSON strings) are
    silently skipped. On API failure, logs to stderr and returns ``{}``.
    """
    # NOTE(review): limit=2000 caps the fetch — distributions with more
    # memories are silently truncated; confirm against expected corpus size.
    try:
        payload = api_get(base_url, "/memory?active_only=true&limit=2000")
    except Exception as exc:
        print(f"ERROR: could not fetch memories: {exc}", file=sys.stderr)
        return {}
    counts: dict[str, int] = {}
    for memory in payload.get("memories", []):
        raw_tags = memory.get("domain_tags") or []
        if isinstance(raw_tags, str):
            # Some rows store tags as a JSON-encoded string; decode best-effort.
            try:
                raw_tags = json.loads(raw_tags)
            except Exception:
                raw_tags = []
        if not isinstance(raw_tags, list):
            continue
        for tag in raw_tags:
            if not isinstance(tag, str):
                continue
            normalized = tag.strip().lower()
            if normalized:
                counts[normalized] = counts.get(normalized, 0) + 1
    return counts
def main() -> None:
    """Entry point: fetch the tag distribution, ask the LLM for
    alias→canonical mappings, auto-apply high-confidence ones (when
    autonomous), and queue the rest for human review.

    Exits quietly (status 0) when there are no tags or the LLM call fails;
    individual apply/propose failures are logged to stderr and skipped so
    one bad item cannot abort the weekly pass.
    """
    parser = argparse.ArgumentParser(description="Phase 7C tag canonicalization detector")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--timeout-s", type=float, default=DEFAULT_TIMEOUT_S)
    parser.add_argument("--no-auto-approve", action="store_true",
                        help="Disable autonomous apply; all proposals → human queue")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print decisions without touching state")
    args = parser.parse_args()

    base = args.base_url.rstrip("/")
    autonomous = not args.no_auto_approve
    print(
        f"canonicalize_tags {TAG_CANON_PROMPT_VERSION} | model={args.model} | "
        f"autonomous={autonomous} | auto-approve conf>={AUTO_APPROVE_CONF}"
    )

    dist = fetch_tag_distribution(base)
    print(f"tag distribution: {len(dist)} unique tags, "
          f"{sum(dist.values())} total references")
    if not dist:
        print("no tags found — nothing to canonicalize")
        return

    raw, err = call_claude(build_user_message(dist), args.model, args.timeout_s)
    if err or raw is None:
        print(f"ERROR: LLM call failed: {err}", file=sys.stderr)
        return
    aliases_raw = parse_canon_output(raw)
    print(f"LLM returned {len(aliases_raw)} raw alias proposals")

    # Counters for the end-of-run summary line.
    auto_applied = 0
    auto_skipped_missing_canonical = 0
    proposals_created = 0
    duplicates_skipped = 0

    for item in aliases_raw:
        norm = normalize_alias_item(item)
        if norm is None:
            continue
        alias = norm["alias"]
        canonical = norm["canonical"]
        confidence = norm["confidence"]
        alias_count = dist.get(alias, 0)
        canonical_count = dist.get(canonical, 0)

        # Sanity: alias must actually exist in the current distribution
        if alias_count < MIN_ALIAS_COUNT:
            # BUGFIX: alias and canonical previously ran together with no
            # separator (f"{alias!r}{canonical!r}"), producing unreadable logs.
            print(f" SKIP {alias!r} → {canonical!r}: alias not in distribution")
            continue
        # Never rewrite toward a canonical tag nobody currently uses.
        if canonical_count == 0:
            auto_skipped_missing_canonical += 1
            print(f" SKIP {alias!r} → {canonical!r}: canonical missing from distribution")
            continue

        label = f"{alias!r} ({alias_count}) → {canonical!r} ({canonical_count}) conf={confidence:.2f}"

        if autonomous and confidence >= AUTO_APPROVE_CONF:
            if args.dry_run:
                auto_applied += 1
                print(f" [dry-run] would auto-apply: {label}")
                continue
            try:
                result = api_post(base, "/admin/tags/aliases/apply", {
                    "alias": alias, "canonical": canonical,
                    "confidence": confidence, "reason": norm["reason"],
                    "alias_count": alias_count, "canonical_count": canonical_count,
                    "actor": "auto-tag-canon",
                })
                touched = result.get("memories_touched", 0)
                auto_applied += 1
                print(f" ✅ auto-applied: {label} ({touched} memories)")
            except Exception as e:
                # BUGFIX: separator between label and error was missing.
                print(f" ⚠️ auto-apply failed: {label}: {e}", file=sys.stderr)
            time.sleep(0.2)  # gentle pacing against the API
            continue

        # Lower confidence → human review
        if args.dry_run:
            proposals_created += 1
            print(f" [dry-run] would propose for review: {label}")
            continue
        try:
            result = api_post(base, "/admin/tags/aliases/propose", {
                "alias": alias, "canonical": canonical,
                "confidence": confidence, "reason": norm["reason"],
                "alias_count": alias_count, "canonical_count": canonical_count,
            })
            # The service returns no proposal_id when an identical proposal
            # is already pending — count it as a duplicate, not a failure.
            if result.get("proposal_id"):
                proposals_created += 1
                print(f" → pending proposal: {label}")
            else:
                duplicates_skipped += 1
                print(f" (duplicate pending proposal): {label}")
        except Exception as e:
            # BUGFIX: separator between label and error was missing.
            print(f" ⚠️ propose failed: {label}: {e}", file=sys.stderr)
        time.sleep(0.2)

    print(
        f"\nsummary: proposals_seen={len(aliases_raw)} "
        f"auto_applied={auto_applied} "
        f"proposals_created={proposals_created} "
        f"duplicates_skipped={duplicates_skipped} "
        f"skipped_missing_canonical={auto_skipped_missing_canonical}"
    )


if __name__ == "__main__":
    main()