feat: Phase 7C — tag canonicalization (autonomous, weekly)
LLM proposes alias→canonical mappings for domain_tags; confidence >= 0.8
auto-apply, below goes to human triage. Protects project identifiers
(p04, p05, p06, atocore, apm, etc.) from ever being canonicalized
since they're their own namespace, not concepts.
Problem solved: tag drift fragments retrieval. "fw" vs "firmware" vs
"firmware-control" all mean the same thing, but cross-cutting queries
that filter by tag only hit one variant. Weekly canonicalization pass
keeps the tag graph clean.
- Schema: tag_aliases table (pending | approved | rejected)
- atocore.memory._tag_canon_prompt (stdlib-only, protected project tokens)
- service: get_tag_distribution, apply_tag_alias (atomic per-memory,
dedupes if both alias + canonical present), create / approve / reject
proposal lifecycle, per-memory audit rows with action="tag_canonicalized"
- scripts/canonicalize_tags.py: host-side detector, autonomous by default,
--no-auto-approve kill switch
- 6 API endpoints under /admin/tags/* (distribution, list, propose,
apply, approve/{id}, reject/{id})
- Step B4 in batch-extract.sh (Sundays only — weekly cadence)
- 26 new tests (prompt parser, normalizer protections, distribution
counting, rewrite atomicity, dedup, audit, lifecycle). 414 → 440.
Design: aggressive protection of project tokens because a false
canonicalization (p04 → p04-gigabit, or vice versa) would scramble
cross-project filtering. Err toward preservation; the alias only
applies if the model is very confident AND both strings appear in
the current distribution.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1038,6 +1038,224 @@ def _validate_confidence(confidence: float) -> None:
|
||||
raise ValueError("Confidence must be between 0.0 and 1.0")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# Phase 7C — Tag canonicalization
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
|
||||
def get_tag_distribution(
    active_only: bool = True,
    min_count: int = 1,
) -> dict[str, int]:
    """Return {tag: occurrence_count} across memories for LLM input.

    Feeds the canonicalization detector, which looks for alias clusters
    such as {firmware: 23, fw: 5, firmware-control: 3}. By default only
    active memories are counted so superseded/invalid rows don't skew
    the distribution; tags are normalized (strip + lowercase) before
    counting, and non-string / unparseable entries are ignored.

    Args:
        active_only: restrict the count to memories with status='active'.
        min_count: drop tags occurring fewer than this many times.
    """
    import json as _json

    sql = "SELECT domain_tags FROM memories"
    if active_only:
        sql += " WHERE status = 'active'"

    with get_connection() as conn:
        rows = conn.execute(sql).fetchall()

    counts: dict[str, int] = {}
    for row in rows:
        raw = row["domain_tags"]
        try:
            parsed = _json.loads(raw) if raw else []
        except Exception:
            # Malformed JSON in a legacy row — treat as untagged.
            parsed = []
        if not isinstance(parsed, list):
            continue
        normalized = (tag.strip().lower() for tag in parsed if isinstance(tag, str))
        for key in normalized:
            if key:
                counts[key] = counts.get(key, 0) + 1

    if min_count > 1:
        return {tag: n for tag, n in counts.items() if n >= min_count}
    return counts
|
||||
|
||||
|
||||
def apply_tag_alias(
    alias: str,
    canonical: str,
    actor: str = "tag-canon",
) -> dict:
    """Rewrite every active memory's domain_tags: alias → canonical.

    Each memory is updated atomically. The rewritten tag list is deduped,
    so a memory that already carries both the alias and the canonical tag
    ends up with a single canonical entry. One audit row is written per
    touched memory (action="tag_canonicalized") so the full trail is
    recoverable.

    Args:
        alias: tag to replace (normalized to stripped lowercase).
        canonical: replacement tag (normalized the same way).
        actor: recorded in the audit trail.

    Returns:
        {"memories_touched": int, "alias": ..., "canonical": ...}.

    Raises:
        ValueError: if either string is empty after normalization, or
            the two are equal.
    """
    import json as _json

    alias = (alias or "").strip().lower()
    canonical = (canonical or "").strip().lower()
    if not alias or not canonical:
        raise ValueError("alias and canonical must be non-empty")
    if alias == canonical:
        raise ValueError("alias cannot equal canonical")

    rewritten: list[tuple[str, list[str], list[str]]] = []

    with get_connection() as conn:
        rows = conn.execute(
            "SELECT id, domain_tags FROM memories WHERE status = 'active'"
        ).fetchall()

        for row in rows:
            raw = row["domain_tags"]
            try:
                parsed = _json.loads(raw) if raw else []
            except Exception:
                parsed = []
            # Skip rows with malformed tags or without the alias at all.
            if not isinstance(parsed, list) or alias not in parsed:
                continue

            before = [tag for tag in parsed if isinstance(tag, str)]
            after: list[str] = []
            seen: set[str] = set()
            for tag in before:
                candidate = canonical if tag == alias else tag
                # Dedup while preserving first-seen order.
                if candidate not in seen:
                    seen.add(candidate)
                    after.append(candidate)
            if after == before:
                continue

            conn.execute(
                "UPDATE memories SET domain_tags = ?, updated_at = CURRENT_TIMESTAMP "
                "WHERE id = ?",
                (_json.dumps(after), row["id"]),
            )
            rewritten.append((row["id"], before, after))

    # Audit rows outside the transaction
    for mem_id, before, after in rewritten:
        _audit_memory(
            memory_id=mem_id,
            action="tag_canonicalized",
            actor=actor,
            before={"domain_tags": before},
            after={"domain_tags": after},
            note=f"{alias} → {canonical}",
        )

    if rewritten:
        log.info("tag_alias_applied", alias=alias, canonical=canonical, memories_touched=len(rewritten))
    return {
        "memories_touched": len(rewritten),
        "alias": alias,
        "canonical": canonical,
    }
|
||||
|
||||
|
||||
def create_tag_alias_proposal(
    alias: str,
    canonical: str,
    confidence: float,
    alias_count: int = 0,
    canonical_count: int = 0,
    reason: str = "",
) -> str | None:
    """Insert a tag_aliases row in status=pending.

    Idempotent: if a pending proposal for (alias, canonical) already
    exists, returns None. Inputs are normalized (strip + lowercase) and
    confidence is clamped to [0.0, 1.0]; the reason is truncated to 500
    characters.

    Args:
        alias: tag proposed for replacement.
        canonical: proposed canonical form.
        confidence: model confidence for the mapping.
        alias_count: occurrences of the alias in the current distribution.
        canonical_count: occurrences of the canonical tag.
        reason: free-text justification from the detector.

    Returns:
        The new proposal id, or None if the inputs are invalid or a
        pending duplicate exists.
    """
    # (Dead `import json` removed — this helper never serializes JSON.)
    alias = (alias or "").strip().lower()
    canonical = (canonical or "").strip().lower()
    if not alias or not canonical or alias == canonical:
        return None
    confidence = max(0.0, min(1.0, float(confidence)))

    proposal_id = str(uuid.uuid4())
    with get_connection() as conn:
        existing = conn.execute(
            "SELECT id FROM tag_aliases WHERE alias = ? AND canonical = ? "
            "AND status = 'pending'",
            (alias, canonical),
        ).fetchone()
        if existing:
            return None

        conn.execute(
            "INSERT INTO tag_aliases (id, alias, canonical, status, confidence, "
            "alias_count, canonical_count, reason) "
            "VALUES (?, ?, ?, 'pending', ?, ?, ?, ?)",
            (proposal_id, alias, canonical, confidence,
             # Guard against an explicit reason=None from a caller.
             int(alias_count), int(canonical_count), (reason or "")[:500]),
        )
    log.info(
        "tag_alias_proposed",
        proposal_id=proposal_id,
        alias=alias,
        canonical=canonical,
        confidence=round(confidence, 3),
    )
    return proposal_id
|
||||
|
||||
|
||||
def get_tag_alias_proposals(status: str = "pending", limit: int = 100) -> list[dict]:
    """List tag alias proposals in the given status, highest confidence first."""
    sql = (
        "SELECT * FROM tag_aliases WHERE status = ? "
        "ORDER BY confidence DESC, created_at DESC LIMIT ?"
    )
    with get_connection() as conn:
        return [dict(row) for row in conn.execute(sql, (status, limit)).fetchall()]
|
||||
|
||||
|
||||
def approve_tag_alias(
    proposal_id: str,
    actor: str = "human-triage",
) -> dict | None:
    """Apply the alias rewrite + mark the proposal approved.

    Args:
        proposal_id: id of a pending tag_aliases row.
        actor: recorded as the resolver and passed through to the
            per-memory audit trail.

    Returns:
        The apply_tag_alias result dict, or None if the proposal is not
        found or already resolved.
    """
    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

    with get_connection() as conn:
        proposal = conn.execute(
            "SELECT alias, canonical, status FROM tag_aliases WHERE id = ?",
            (proposal_id,),
        ).fetchone()
    if proposal is None or proposal["status"] != "pending":
        return None

    # Rewrite memories first; only then flip the proposal to approved.
    result = apply_tag_alias(proposal["alias"], proposal["canonical"], actor=actor)

    with get_connection() as conn:
        conn.execute(
            "UPDATE tag_aliases SET status = 'approved', resolved_at = ?, "
            "resolved_by = ?, applied_to_memories = ? WHERE id = ?",
            (now_str, actor, result["memories_touched"], proposal_id),
        )
    return result
|
||||
|
||||
|
||||
def reject_tag_alias(proposal_id: str, actor: str = "human-triage") -> bool:
    """Mark a tag alias proposal as rejected without applying it.

    Only pending proposals can be rejected; returns True if a row was
    actually transitioned, False otherwise.
    """
    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    with get_connection() as conn:
        cursor = conn.execute(
            "UPDATE tag_aliases SET status = 'rejected', resolved_at = ?, "
            "resolved_by = ? WHERE id = ? AND status = 'pending'",
            (now_str, actor, proposal_id),
        )
        return cursor.rowcount > 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# Phase 7A — Memory Consolidation: merge-candidate lifecycle
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user