feat: Phase 7C — tag canonicalization (autonomous, weekly)

LLM proposes alias→canonical mappings for domain_tags; mappings with
confidence >= 0.8 are auto-applied, anything below that goes to human
triage. Protects project identifiers (p04, p05, p06, atocore, apm, etc.)
from ever being canonicalized, since they're their own namespace, not
concepts.

Problem solved: tag drift fragments retrieval. "fw" vs "firmware" vs
"firmware-control" all mean the same thing, but cross-cutting queries
that filter by tag only hit one variant. Weekly canonicalization pass
keeps the tag graph clean.

- Schema: tag_aliases table (pending | approved | rejected)
- atocore.memory._tag_canon_prompt (stdlib-only, protected project tokens)
- service: get_tag_distribution, apply_tag_alias (atomic per-memory,
  dedupes if both alias + canonical present), create / approve / reject
  proposal lifecycle, per-memory audit rows with action="tag_canonicalized"
- scripts/canonicalize_tags.py: host-side detector, autonomous by default,
  --no-auto-approve kill switch
- 6 API endpoints under /admin/tags/* (distribution, list, propose,
  apply, approve/{id}, reject/{id})
- Step B4 in batch-extract.sh (Sundays only — weekly cadence)
- 26 new tests (prompt parser, normalizer protections, distribution
  counting, rewrite atomicity, dedup, audit, lifecycle). 414 → 440.

Design: aggressive protection of project tokens because a false
canonicalization (p04 → p04-gigabit, or vice versa) would scramble
cross-project filtering. Err toward preservation; the alias only
applies if the model is very confident AND both strings appear in
the current distribution.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-19 09:41:02 -04:00
parent e840ef4be3
commit 877b97ec78
7 changed files with 1085 additions and 0 deletions

View File

@@ -1038,6 +1038,224 @@ def _validate_confidence(confidence: float) -> None:
raise ValueError("Confidence must be between 0.0 and 1.0")
# ---------------------------------------------------------------------
# Phase 7C — Tag canonicalization
# ---------------------------------------------------------------------
def get_tag_distribution(
    active_only: bool = True,
    min_count: int = 1,
) -> dict[str, int]:
    """Return {tag: occurrence_count} across memories for LLM input.

    Used by the canonicalization detector to spot alias clusters like
    {firmware: 23, fw: 5, firmware-control: 3}.

    Args:
        active_only: When True (default), only count memories with
            status='active' so superseded/invalid rows don't bias the
            distribution.
        min_count: Drop tags occurring fewer than this many times.

    Returns:
        Mapping of normalized (stripped, lowercased) tag -> count.
    """
    import json as _json
    from collections import Counter

    query = "SELECT domain_tags FROM memories"
    if active_only:
        query += " WHERE status = 'active'"
    with get_connection() as conn:
        rows = conn.execute(query).fetchall()

    counts: Counter[str] = Counter()
    for r in rows:
        raw = r["domain_tags"]
        try:
            tags = _json.loads(raw) if raw else []
        except (ValueError, TypeError):
            # json.loads raises JSONDecodeError (a ValueError) on bad
            # JSON and TypeError on non-str input; one malformed row
            # must not abort the whole scan.
            tags = []
        if not isinstance(tags, list):
            continue
        for t in tags:
            if isinstance(t, str):
                key = t.strip().lower()
                if key:
                    counts[key] += 1
    if min_count > 1:
        return {k: v for k, v in counts.items() if v >= min_count}
    return dict(counts)
def apply_tag_alias(
    alias: str,
    canonical: str,
    actor: str = "tag-canon",
) -> dict:
    """Rewrite every active memory's domain_tags, replacing *alias* with *canonical*.

    Atomic per-memory. Dedupes within each memory's tag list (so if a
    memory already has both alias AND canonical, we drop the alias and
    keep canonical without duplicating). Matching is case/whitespace
    insensitive so it agrees with get_tag_distribution(), which counts
    normalized tags — a proposal built from the distribution must not
    silently skip "FW" when the alias is "fw". Writes one audit row per
    touched memory with action="tag_canonicalized" so the full trail is
    recoverable.

    Args:
        alias: Tag to replace; normalized to stripped lowercase.
        canonical: Replacement tag; normalized to stripped lowercase.
        actor: Recorded on each audit row.

    Returns:
        {"memories_touched": int, "alias": ..., "canonical": ...}

    Raises:
        ValueError: If alias/canonical are empty or equal after
            normalization.
    """
    import json as _json

    alias = (alias or "").strip().lower()
    canonical = (canonical or "").strip().lower()
    if not alias or not canonical:
        raise ValueError("alias and canonical must be non-empty")
    if alias == canonical:
        raise ValueError("alias cannot equal canonical")

    touched: list[tuple[str, list[str], list[str]]] = []
    with get_connection() as conn:
        rows = conn.execute(
            "SELECT id, domain_tags FROM memories WHERE status = 'active'"
        ).fetchall()
        for r in rows:
            raw = r["domain_tags"]
            try:
                tags = _json.loads(raw) if raw else []
            except (ValueError, TypeError):
                tags = []
            if not isinstance(tags, list):
                continue
            # Non-string entries are dropped as part of the rewrite.
            old_tags = [t for t in tags if isinstance(t, str)]
            # Normalized comparison: the distribution that proposed this
            # alias is lowercased, so the match must be too.
            if not any(t.strip().lower() == alias for t in old_tags):
                continue
            new_tags: list[str] = []
            for t in old_tags:
                rewritten = canonical if t.strip().lower() == alias else t
                if rewritten not in new_tags:
                    new_tags.append(rewritten)
            if new_tags == old_tags:
                continue
            conn.execute(
                "UPDATE memories SET domain_tags = ?, updated_at = CURRENT_TIMESTAMP "
                "WHERE id = ?",
                (_json.dumps(new_tags), r["id"]),
            )
            touched.append((r["id"], old_tags, new_tags))
    # Audit rows outside the transaction: a failed audit write must not
    # roll back (or hold locks over) the tag rewrites themselves.
    for mem_id, old_tags, new_tags in touched:
        _audit_memory(
            memory_id=mem_id,
            action="tag_canonicalized",
            actor=actor,
            before={"domain_tags": old_tags},
            after={"domain_tags": new_tags},
            note=f"{alias} → {canonical}",
        )
    if touched:
        log.info(
            "tag_alias_applied",
            alias=alias,
            canonical=canonical,
            memories_touched=len(touched),
        )
    return {
        "memories_touched": len(touched),
        "alias": alias,
        "canonical": canonical,
    }
def create_tag_alias_proposal(
    alias: str,
    canonical: str,
    confidence: float,
    alias_count: int = 0,
    canonical_count: int = 0,
    reason: str = "",
) -> str | None:
    """Insert a tag_aliases row in status='pending'.

    Idempotent: if a pending proposal for (alias, canonical) already
    exists, returns None. Degenerate input (empty or identical strings
    after normalization) also returns None rather than raising, so the
    detector can submit candidates blindly.

    Args:
        alias: Proposed alias; normalized to stripped lowercase.
        canonical: Proposed canonical form; normalized likewise.
        confidence: Detector confidence; clamped into [0.0, 1.0].
        alias_count: Occurrences of alias in the current distribution.
        canonical_count: Occurrences of canonical in the distribution.
        reason: Free-text rationale; truncated to 500 chars.

    Returns:
        The new proposal id, or None if nothing was inserted.
    """
    alias = (alias or "").strip().lower()
    canonical = (canonical or "").strip().lower()
    if not alias or not canonical or alias == canonical:
        return None
    # Clamp rather than raise: detector confidence is advisory input.
    confidence = max(0.0, min(1.0, float(confidence)))
    proposal_id = str(uuid.uuid4())
    with get_connection() as conn:
        existing = conn.execute(
            "SELECT id FROM tag_aliases WHERE alias = ? AND canonical = ? "
            "AND status = 'pending'",
            (alias, canonical),
        ).fetchone()
        if existing:
            return None
        conn.execute(
            "INSERT INTO tag_aliases (id, alias, canonical, status, confidence, "
            "alias_count, canonical_count, reason) "
            "VALUES (?, ?, ?, 'pending', ?, ?, ?, ?)",
            (proposal_id, alias, canonical, confidence,
             # (reason or "") guards against a None passed by callers.
             int(alias_count), int(canonical_count), (reason or "")[:500]),
        )
    log.info(
        "tag_alias_proposed",
        proposal_id=proposal_id,
        alias=alias,
        canonical=canonical,
        confidence=round(confidence, 3),
    )
    return proposal_id
def get_tag_alias_proposals(status: str = "pending", limit: int = 100) -> list[dict]:
    """Fetch tag alias proposals in the given status, highest confidence first."""
    sql = (
        "SELECT * FROM tag_aliases WHERE status = ? "
        "ORDER BY confidence DESC, created_at DESC LIMIT ?"
    )
    with get_connection() as conn:
        fetched = conn.execute(sql, (status, limit)).fetchall()
    return [dict(row) for row in fetched]
def approve_tag_alias(
    proposal_id: str,
    actor: str = "human-triage",
) -> dict | None:
    """Apply the alias rewrite and mark the proposal approved.

    Returns the apply_tag_alias() result dict, or None when the
    proposal does not exist or is already resolved.
    """
    resolved_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    with get_connection() as conn:
        proposal = conn.execute(
            "SELECT alias, canonical, status FROM tag_aliases WHERE id = ?",
            (proposal_id,),
        ).fetchone()
    if proposal is None or proposal["status"] != "pending":
        return None
    outcome = apply_tag_alias(proposal["alias"], proposal["canonical"], actor=actor)
    with get_connection() as conn:
        conn.execute(
            "UPDATE tag_aliases SET status = 'approved', resolved_at = ?, "
            "resolved_by = ?, applied_to_memories = ? WHERE id = ?",
            (resolved_at, actor, outcome["memories_touched"], proposal_id),
        )
    return outcome
def reject_tag_alias(proposal_id: str, actor: str = "human-triage") -> bool:
    """Mark a pending tag alias proposal as rejected without applying it.

    Returns True if a pending proposal was updated, False otherwise.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    sql = (
        "UPDATE tag_aliases SET status = 'rejected', resolved_at = ?, "
        "resolved_by = ? WHERE id = ? AND status = 'pending'"
    )
    with get_connection() as conn:
        cursor = conn.execute(sql, (stamp, actor, proposal_id))
        return cursor.rowcount > 0
# ---------------------------------------------------------------------
# Phase 7A — Memory Consolidation: merge-candidate lifecycle
# ---------------------------------------------------------------------