feat: Phase 7C — tag canonicalization (autonomous, weekly)
LLM proposes alias→canonical mappings for domain_tags; confidence >= 0.8
auto-apply, below goes to human triage. Protects project identifiers
(p04, p05, p06, atocore, apm, etc.) from ever being canonicalized
since they're their own namespace, not concepts.
Problem solved: tag drift fragments retrieval. "fw" vs "firmware" vs
"firmware-control" all mean the same thing, but cross-cutting queries
that filter by tag only hit one variant. Weekly canonicalization pass
keeps the tag graph clean.
- Schema: tag_aliases table (pending | approved | rejected)
- atocore.memory._tag_canon_prompt (stdlib-only, protected project tokens)
- service: get_tag_distribution, apply_tag_alias (atomic per-memory,
dedupes if both alias + canonical present), create / approve / reject
proposal lifecycle, per-memory audit rows with action="tag_canonicalized"
- scripts/canonicalize_tags.py: host-side detector, autonomous by default,
--no-auto-approve kill switch
- 6 API endpoints under /admin/tags/* (distribution, list, propose,
apply, approve/{id}, reject/{id})
- Step B4 in batch-extract.sh (Sundays only — weekly cadence)
- 26 new tests (prompt parser, normalizer protections, distribution
counting, rewrite atomicity, dedup, audit, lifecycle). 414 → 440.
Design: aggressive protection of project tokens because a false
canonicalization (p04 → p04-gigabit, or vice versa) would scramble
cross-project filtering. Err toward preservation; the alias only
applies if the model is very confident AND both strings appear in
the current distribution.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit adds scripts/canonicalize_tags.py — new file, 254 lines.
|
||||
#!/usr/bin/env python3
|
||||
"""Phase 7C — tag canonicalization detector.
|
||||
|
||||
Weekly (or on-demand) LLM pass that:
|
||||
1. Fetches the tag distribution across all active memories via HTTP
|
||||
2. Asks claude-p to propose alias→canonical mappings
|
||||
3. AUTO-APPLIES aliases with confidence >= AUTO_APPROVE_CONF (0.8)
|
||||
4. Submits lower-confidence proposals as pending for human review
|
||||
|
||||
Autonomous by default — matches the Phase 7A.1 pattern. Set
|
||||
--no-auto-approve to force every proposal into human review.
|
||||
|
||||
Host-side because claude CLI lives on Dalidou, not the container.
|
||||
Reuses the PYTHONPATH=src pattern from scripts/memory_dedup.py.
|
||||
|
||||
Usage:
|
||||
python3 scripts/canonicalize_tags.py [--base-url URL] [--dry-run] [--no-auto-approve]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
|
||||
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
_SRC_DIR = os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "src"))
|
||||
if _SRC_DIR not in sys.path:
|
||||
sys.path.insert(0, _SRC_DIR)
|
||||
|
||||
from atocore.memory._tag_canon_prompt import ( # noqa: E402
|
||||
PROTECTED_PROJECT_TOKENS,
|
||||
SYSTEM_PROMPT,
|
||||
TAG_CANON_PROMPT_VERSION,
|
||||
build_user_message,
|
||||
normalize_alias_item,
|
||||
parse_canon_output,
|
||||
)
|
||||
|
||||
DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://127.0.0.1:8100")
|
||||
DEFAULT_MODEL = os.environ.get("ATOCORE_TAG_CANON_MODEL", "sonnet")
|
||||
DEFAULT_TIMEOUT_S = float(os.environ.get("ATOCORE_TAG_CANON_TIMEOUT_S", "90"))
|
||||
|
||||
AUTO_APPROVE_CONF = float(os.environ.get("ATOCORE_TAG_CANON_AUTO_APPROVE_CONF", "0.8"))
|
||||
MIN_ALIAS_COUNT = int(os.environ.get("ATOCORE_TAG_CANON_MIN_ALIAS_COUNT", "1"))
|
||||
|
||||
# Lazily-created scratch directory shared by all `claude` CLI invocations
# in this run (None until first use).
_sandbox_cwd = None


def get_sandbox_cwd() -> str:
    """Return a per-run scratch directory used as cwd for `claude` CLI calls.

    The directory is created lazily on first call and cached in the module
    global ``_sandbox_cwd`` so every invocation in the run shares it.

    Fix: the original leaked one ``mkdtemp`` directory per run; we now
    register a best-effort removal at interpreter exit.
    """
    global _sandbox_cwd
    if _sandbox_cwd is None:
        _sandbox_cwd = tempfile.mkdtemp(prefix="ato-tagcanon-")
        # Best-effort cleanup on exit; ignore_errors in case the CLI left
        # files open or the directory was already removed.
        atexit.register(shutil.rmtree, _sandbox_cwd, ignore_errors=True)
    return _sandbox_cwd
|
||||
|
||||
|
||||
def api_get(base_url: str, path: str) -> dict:
    """Issue a GET to ``base_url + path`` and return the decoded JSON body."""
    request = urllib.request.Request(f"{base_url}{path}")
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read().decode("utf-8")
    return json.loads(payload)
|
||||
def api_post(base_url: str, path: str, body: dict | None = None) -> dict:
    """POST ``body`` (or ``{}``) as JSON to ``base_url + path``; return the decoded reply."""
    payload = json.dumps(body or {}).encode("utf-8")
    request = urllib.request.Request(
        f"{base_url}{path}",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        return json.loads(response.read().decode("utf-8"))
|
||||
|
||||
|
||||
def call_claude(user_message: str, model: str, timeout_s: float) -> tuple[str | None, str | None]:
    """Run the `claude` CLI with the canonicalization system prompt.

    Makes up to 3 attempts with exponential backoff (2s, then 4s) on
    timeout, subprocess failure, or non-zero exit. Returns
    ``(stripped_stdout, None)`` on success, or ``(None, error_message)``
    after all attempts fail (or immediately if the CLI is not installed).
    """
    if shutil.which("claude") is None:
        return None, "claude CLI not available"

    cmd = [
        "claude", "-p",
        "--model", model,
        "--append-system-prompt", SYSTEM_PROMPT,
        "--disable-slash-commands",
        user_message,
    ]

    last_error = ""
    for attempt in range(3):
        if attempt:
            time.sleep(2 ** attempt)  # backoff before each retry
        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout_s,
                cwd=get_sandbox_cwd(),
                encoding="utf-8",
                errors="replace",
            )
        except subprocess.TimeoutExpired:
            last_error = f"{model} timed out"
        except Exception as exc:
            last_error = f"subprocess error: {exc}"
        else:
            if proc.returncode == 0:
                return (proc.stdout or "").strip(), None
            stderr_tail = (proc.stderr or "").strip()[:200]
            last_error = f"{model} exit {proc.returncode}: {stderr_tail}"
    return None, last_error
|
||||
|
||||
|
||||
def fetch_tag_distribution(base_url: str) -> dict[str, int]:
    """Count tag occurrences across active memories (client-side)."""
    try:
        payload = api_get(base_url, "/memory?active_only=true&limit=2000")
    except Exception as e:
        print(f"ERROR: could not fetch memories: {e}", file=sys.stderr)
        return {}

    counts: dict[str, int] = {}
    for memory in payload.get("memories", []):
        raw_tags = memory.get("domain_tags") or []
        # Tags may arrive JSON-encoded as a string; decode defensively and
        # fall back to an empty list on malformed data.
        if isinstance(raw_tags, str):
            try:
                raw_tags = json.loads(raw_tags)
            except Exception:
                raw_tags = []
        if not isinstance(raw_tags, list):
            continue
        for tag in raw_tags:
            if not isinstance(tag, str):
                continue
            normalized = tag.strip().lower()
            if normalized:
                counts[normalized] = counts.get(normalized, 0) + 1
    return counts
|
||||
|
||||
|
||||
def main() -> None:
    """Run one tag-canonicalization pass.

    Flow: fetch the live tag distribution, ask the LLM for alias→canonical
    proposals, auto-apply high-confidence ones (unless --no-auto-approve),
    and submit the rest as pending proposals for human review. --dry-run
    prints every decision without touching server state.
    """
    parser = argparse.ArgumentParser(description="Phase 7C tag canonicalization detector")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--timeout-s", type=float, default=DEFAULT_TIMEOUT_S)
    parser.add_argument("--no-auto-approve", action="store_true",
                        help="Disable autonomous apply; all proposals → human queue")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print decisions without touching state")
    args = parser.parse_args()

    base = args.base_url.rstrip("/")
    autonomous = not args.no_auto_approve

    print(
        f"canonicalize_tags {TAG_CANON_PROMPT_VERSION} | model={args.model} | "
        f"autonomous={autonomous} | auto-approve conf>={AUTO_APPROVE_CONF}"
    )

    dist = fetch_tag_distribution(base)
    print(f"tag distribution: {len(dist)} unique tags, "
          f"{sum(dist.values())} total references")
    if not dist:
        print("no tags found — nothing to canonicalize")
        return

    user_msg = build_user_message(dist)
    raw, err = call_claude(user_msg, args.model, args.timeout_s)
    if err or raw is None:
        print(f"ERROR: LLM call failed: {err}", file=sys.stderr)
        return

    aliases_raw = parse_canon_output(raw)
    print(f"LLM returned {len(aliases_raw)} raw alias proposals")

    # Outcome counters for the end-of-run summary line.
    auto_applied = 0
    auto_skipped_missing_canonical = 0
    proposals_created = 0
    duplicates_skipped = 0

    for item in aliases_raw:
        # None means the proposal failed normalization (presumably including
        # the protected-project-token rules — see _tag_canon_prompt).
        norm = normalize_alias_item(item)
        if norm is None:
            continue
        alias = norm["alias"]
        canonical = norm["canonical"]
        confidence = norm["confidence"]

        alias_count = dist.get(alias, 0)
        canonical_count = dist.get(canonical, 0)

        # Sanity: alias must actually exist in the current distribution
        if alias_count < MIN_ALIAS_COUNT:
            print(f" SKIP {alias!r} → {canonical!r}: alias not in distribution")
            continue
        # The canonical form must already be in use too — rejects mappings
        # whose target the LLM invented out of thin air.
        if canonical_count == 0:
            auto_skipped_missing_canonical += 1
            print(f" SKIP {alias!r} → {canonical!r}: canonical missing from distribution")
            continue

        label = f"{alias!r} ({alias_count}) → {canonical!r} ({canonical_count}) conf={confidence:.2f}"

        # High confidence + autonomous mode → apply immediately.
        auto_apply = autonomous and confidence >= AUTO_APPROVE_CONF
        if auto_apply:
            if args.dry_run:
                auto_applied += 1
                print(f" [dry-run] would auto-apply: {label}")
                continue
            try:
                result = api_post(base, "/admin/tags/aliases/apply", {
                    "alias": alias, "canonical": canonical,
                    "confidence": confidence, "reason": norm["reason"],
                    "alias_count": alias_count, "canonical_count": canonical_count,
                    "actor": "auto-tag-canon",
                })
                touched = result.get("memories_touched", 0)
                auto_applied += 1
                print(f" ✅ auto-applied: {label} ({touched} memories)")
            except Exception as e:
                # Best-effort: a failed apply is logged, not fatal.
                print(f" ⚠️ auto-apply failed: {label} — {e}", file=sys.stderr)
            time.sleep(0.2)  # pacing between API writes
            continue

        # Lower confidence → human review
        if args.dry_run:
            proposals_created += 1
            print(f" [dry-run] would propose for review: {label}")
            continue
        try:
            result = api_post(base, "/admin/tags/aliases/propose", {
                "alias": alias, "canonical": canonical,
                "confidence": confidence, "reason": norm["reason"],
                "alias_count": alias_count, "canonical_count": canonical_count,
            })
            # No proposal_id in the reply is treated as "an equivalent
            # proposal is already pending" — counted as a duplicate.
            if result.get("proposal_id"):
                proposals_created += 1
                print(f" → pending proposal: {label}")
            else:
                duplicates_skipped += 1
                print(f" (duplicate pending proposal): {label}")
        except Exception as e:
            print(f" ⚠️ propose failed: {label} — {e}", file=sys.stderr)
        time.sleep(0.2)  # pacing between API writes

    print(
        f"\nsummary: proposals_seen={len(aliases_raw)} "
        f"auto_applied={auto_applied} "
        f"proposals_created={proposals_created} "
        f"duplicates_skipped={duplicates_skipped} "
        f"skipped_missing_canonical={auto_skipped_missing_canonical}"
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user