feat: Phase 7C — tag canonicalization (autonomous, weekly)

LLM proposes alias→canonical mappings for domain_tags; mappings with
confidence >= 0.8 are auto-applied, anything below that goes to human
triage. Protects project identifiers (p04, p05, p06, atocore, apm, etc.)
from ever being canonicalized, since they're their own namespace, not
concepts.

Problem solved: tag drift fragments retrieval. "fw" vs "firmware" vs
"firmware-control" all mean the same thing, but cross-cutting queries
that filter by tag only hit one variant. Weekly canonicalization pass
keeps the tag graph clean.

- Schema: tag_aliases table (pending | approved | rejected)
- atocore.memory._tag_canon_prompt (stdlib-only, protected project tokens)
- service: get_tag_distribution, apply_tag_alias (atomic per-memory,
  dedupes if both alias + canonical present), create / approve / reject
  proposal lifecycle, per-memory audit rows with action="tag_canonicalized"
- scripts/canonicalize_tags.py: host-side detector, autonomous by default,
  --no-auto-approve kill switch
- 6 API endpoints under /admin/tags/* (distribution, list, propose,
  apply, approve/{id}, reject/{id})
- Step B4 in batch-extract.sh (Sundays only — weekly cadence)
- 26 new tests (prompt parser, normalizer protections, distribution
  counting, rewrite atomicity, dedup, audit, lifecycle). 414 → 440.

Design: aggressive protection of project tokens because a false
canonicalization (p04 → p04-gigabit, or vice versa) would scramble
cross-project filtering. Err toward preservation; the alias only
applies if the model is very confident AND both strings appear in
the current distribution.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-19 09:41:02 -04:00
parent e840ef4be3
commit 877b97ec78
7 changed files with 1085 additions and 0 deletions

View File

@@ -1038,6 +1038,224 @@ def _validate_confidence(confidence: float) -> None:
raise ValueError("Confidence must be between 0.0 and 1.0")
# ---------------------------------------------------------------------
# Phase 7C — Tag canonicalization
# ---------------------------------------------------------------------
def get_tag_distribution(
    active_only: bool = True,
    min_count: int = 1,
) -> dict[str, int]:
    """Return {tag: occurrence_count} across memories for LLM input.

    Used by the canonicalization detector to spot alias clusters like
    {firmware: 23, fw: 5, firmware-control: 3}.

    Args:
        active_only: When True (default), only count memories with
            status='active' so superseded/invalid rows don't bias the
            distribution.
        min_count: Drop tags occurring fewer than this many times.

    Returns:
        Mapping of normalized (stripped, lowercased) tag -> count.
    """
    import json as _json
    from collections import Counter

    query = "SELECT domain_tags FROM memories"
    if active_only:
        query += " WHERE status = 'active'"
    with get_connection() as conn:
        rows = conn.execute(query).fetchall()

    counts: Counter[str] = Counter()
    for r in rows:
        raw = r["domain_tags"]
        try:
            tags = _json.loads(raw) if raw else []
        except (ValueError, TypeError):
            # json.loads raises JSONDecodeError (a ValueError) on bad
            # JSON and TypeError on non-str input; one malformed row
            # must not abort the whole scan.
            tags = []
        if not isinstance(tags, list):
            continue
        for t in tags:
            if isinstance(t, str):
                key = t.strip().lower()
                if key:
                    counts[key] += 1
    if min_count > 1:
        return {k: v for k, v in counts.items() if v >= min_count}
    return dict(counts)
def apply_tag_alias(
    alias: str,
    canonical: str,
    actor: str = "tag-canon",
) -> dict:
    """Rewrite every active memory's domain_tags, replacing *alias* with *canonical*.

    Atomic per-memory. Dedupes within each memory's tag list (so if a
    memory already has both alias AND canonical, we drop the alias and
    keep canonical without duplicating). Matching is case/whitespace
    insensitive so it agrees with get_tag_distribution(), which counts
    normalized tags — a proposal built from the distribution must not
    silently skip "FW" when the alias is "fw". Writes one audit row per
    touched memory with action="tag_canonicalized" so the full trail is
    recoverable.

    Args:
        alias: Tag to replace; normalized to stripped lowercase.
        canonical: Replacement tag; normalized to stripped lowercase.
        actor: Recorded on each audit row.

    Returns:
        {"memories_touched": int, "alias": ..., "canonical": ...}

    Raises:
        ValueError: If alias/canonical are empty or equal after
            normalization.
    """
    import json as _json

    alias = (alias or "").strip().lower()
    canonical = (canonical or "").strip().lower()
    if not alias or not canonical:
        raise ValueError("alias and canonical must be non-empty")
    if alias == canonical:
        raise ValueError("alias cannot equal canonical")

    touched: list[tuple[str, list[str], list[str]]] = []
    with get_connection() as conn:
        rows = conn.execute(
            "SELECT id, domain_tags FROM memories WHERE status = 'active'"
        ).fetchall()
        for r in rows:
            raw = r["domain_tags"]
            try:
                tags = _json.loads(raw) if raw else []
            except (ValueError, TypeError):
                tags = []
            if not isinstance(tags, list):
                continue
            # Non-string entries are dropped as part of the rewrite.
            old_tags = [t for t in tags if isinstance(t, str)]
            # Normalized comparison: the distribution that proposed this
            # alias is lowercased, so the match must be too.
            if not any(t.strip().lower() == alias for t in old_tags):
                continue
            new_tags: list[str] = []
            for t in old_tags:
                rewritten = canonical if t.strip().lower() == alias else t
                if rewritten not in new_tags:
                    new_tags.append(rewritten)
            if new_tags == old_tags:
                continue
            conn.execute(
                "UPDATE memories SET domain_tags = ?, updated_at = CURRENT_TIMESTAMP "
                "WHERE id = ?",
                (_json.dumps(new_tags), r["id"]),
            )
            touched.append((r["id"], old_tags, new_tags))
    # Audit rows outside the transaction: a failed audit write must not
    # roll back (or hold locks over) the tag rewrites themselves.
    for mem_id, old_tags, new_tags in touched:
        _audit_memory(
            memory_id=mem_id,
            action="tag_canonicalized",
            actor=actor,
            before={"domain_tags": old_tags},
            after={"domain_tags": new_tags},
            note=f"{alias} → {canonical}",
        )
    if touched:
        log.info(
            "tag_alias_applied",
            alias=alias,
            canonical=canonical,
            memories_touched=len(touched),
        )
    return {
        "memories_touched": len(touched),
        "alias": alias,
        "canonical": canonical,
    }
def create_tag_alias_proposal(
    alias: str,
    canonical: str,
    confidence: float,
    alias_count: int = 0,
    canonical_count: int = 0,
    reason: str = "",
) -> str | None:
    """Insert a tag_aliases row in status='pending'.

    Idempotent: if a pending proposal for (alias, canonical) already
    exists, returns None. Degenerate input (empty or identical strings
    after normalization) also returns None rather than raising, so the
    detector can submit candidates blindly.

    Args:
        alias: Proposed alias; normalized to stripped lowercase.
        canonical: Proposed canonical form; normalized likewise.
        confidence: Detector confidence; clamped into [0.0, 1.0].
        alias_count: Occurrences of alias in the current distribution.
        canonical_count: Occurrences of canonical in the distribution.
        reason: Free-text rationale; truncated to 500 chars.

    Returns:
        The new proposal id, or None if nothing was inserted.
    """
    alias = (alias or "").strip().lower()
    canonical = (canonical or "").strip().lower()
    if not alias or not canonical or alias == canonical:
        return None
    # Clamp rather than raise: detector confidence is advisory input.
    confidence = max(0.0, min(1.0, float(confidence)))
    proposal_id = str(uuid.uuid4())
    with get_connection() as conn:
        existing = conn.execute(
            "SELECT id FROM tag_aliases WHERE alias = ? AND canonical = ? "
            "AND status = 'pending'",
            (alias, canonical),
        ).fetchone()
        if existing:
            return None
        conn.execute(
            "INSERT INTO tag_aliases (id, alias, canonical, status, confidence, "
            "alias_count, canonical_count, reason) "
            "VALUES (?, ?, ?, 'pending', ?, ?, ?, ?)",
            (proposal_id, alias, canonical, confidence,
             # (reason or "") guards against a None passed by callers.
             int(alias_count), int(canonical_count), (reason or "")[:500]),
        )
    log.info(
        "tag_alias_proposed",
        proposal_id=proposal_id,
        alias=alias,
        canonical=canonical,
        confidence=round(confidence, 3),
    )
    return proposal_id
def get_tag_alias_proposals(status: str = "pending", limit: int = 100) -> list[dict]:
    """Fetch tag alias proposals in the given status, highest confidence first."""
    sql = (
        "SELECT * FROM tag_aliases WHERE status = ? "
        "ORDER BY confidence DESC, created_at DESC LIMIT ?"
    )
    with get_connection() as conn:
        fetched = conn.execute(sql, (status, limit)).fetchall()
    return [dict(row) for row in fetched]
def approve_tag_alias(
    proposal_id: str,
    actor: str = "human-triage",
) -> dict | None:
    """Apply the alias rewrite and mark the proposal approved.

    Returns the apply_tag_alias() result dict, or None when the
    proposal does not exist or is already resolved.
    """
    resolved_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    with get_connection() as conn:
        proposal = conn.execute(
            "SELECT alias, canonical, status FROM tag_aliases WHERE id = ?",
            (proposal_id,),
        ).fetchone()
    if proposal is None or proposal["status"] != "pending":
        return None
    outcome = apply_tag_alias(proposal["alias"], proposal["canonical"], actor=actor)
    with get_connection() as conn:
        conn.execute(
            "UPDATE tag_aliases SET status = 'approved', resolved_at = ?, "
            "resolved_by = ?, applied_to_memories = ? WHERE id = ?",
            (resolved_at, actor, outcome["memories_touched"], proposal_id),
        )
    return outcome
def reject_tag_alias(proposal_id: str, actor: str = "human-triage") -> bool:
    """Mark a pending tag alias proposal as rejected without applying it.

    Returns True if a pending proposal was updated, False otherwise.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    sql = (
        "UPDATE tag_aliases SET status = 'rejected', resolved_at = ?, "
        "resolved_by = ? WHERE id = ? AND status = 'pending'"
    )
    with get_connection() as conn:
        cursor = conn.execute(sql, (stamp, actor, proposal_id))
        return cursor.rowcount > 0
# ---------------------------------------------------------------------
# Phase 7A — Memory Consolidation: merge-candidate lifecycle
# ---------------------------------------------------------------------