feat: Phase 7C — tag canonicalization (autonomous, weekly)

LLM proposes alias→canonical mappings for domain_tags; confidence >= 0.8
auto-applies, below that goes to human triage. Protects project identifiers
(p04, p05, p06, atocore, apm, etc.) from ever being canonicalized
since they're their own namespace, not concepts.

Problem solved: tag drift fragments retrieval. "fw" vs "firmware" vs
"firmware-control" all mean the same thing, but cross-cutting queries
that filter by tag only hit one variant. Weekly canonicalization pass
keeps the tag graph clean.

- Schema: tag_aliases table (pending | approved | rejected)
- atocore.memory._tag_canon_prompt (stdlib-only, protected project tokens)
- service: get_tag_distribution, apply_tag_alias (atomic per-memory,
  dedupes if both alias + canonical present), create / approve / reject
  proposal lifecycle, per-memory audit rows with action="tag_canonicalized"
- scripts/canonicalize_tags.py: host-side detector, autonomous by default,
  --no-auto-approve kill switch
- 6 API endpoints under /admin/tags/* (distribution, list, propose,
  apply, approve/{id}, reject/{id})
- Step B4 in batch-extract.sh (Sundays only — weekly cadence)
- 26 new tests (prompt parser, normalizer protections, distribution
  counting, rewrite atomicity, dedup, audit, lifecycle). 414 → 440.

Design: aggressive protection of project tokens because a false
canonicalization (p04 → p04-gigabit, or vice versa) would scramble
cross-project filtering. Err toward preservation; the alias only
applies if the model is very confident AND both strings appear in
the current distribution.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-19 09:41:02 -04:00
parent e840ef4be3
commit 877b97ec78
7 changed files with 1085 additions and 0 deletions

View File

@@ -1517,6 +1517,125 @@ def api_graduation_status() -> dict:
return out
# --- Phase 7C: tag canonicalization ---
class TagAliasProposalBody(BaseModel):
alias: str
canonical: str
confidence: float = 0.0
reason: str = ""
alias_count: int = 0
canonical_count: int = 0
class TagAliasApplyBody(BaseModel):
alias: str
canonical: str
confidence: float = 0.9
reason: str = ""
alias_count: int = 0
canonical_count: int = 0
actor: str = "auto-tag-canon"
class TagAliasResolveBody(BaseModel):
actor: str = "human-triage"
@router.get("/admin/tags/distribution")
def api_tag_distribution() -> dict:
"""Current tag distribution across active memories (for UI / debug)."""
from atocore.memory.service import get_tag_distribution
dist = get_tag_distribution()
sorted_tags = sorted(dist.items(), key=lambda x: x[1], reverse=True)
return {"total_references": sum(dist.values()), "unique_tags": len(dist),
"tags": [{"tag": t, "count": c} for t, c in sorted_tags]}
@router.get("/admin/tags/aliases")
def api_list_tag_aliases(status: str = "pending", limit: int = 100) -> dict:
"""List tag alias proposals (default: pending for review)."""
from atocore.memory.service import get_tag_alias_proposals
rows = get_tag_alias_proposals(status=status, limit=limit)
return {"proposals": rows, "count": len(rows)}
@router.post("/admin/tags/aliases/propose")
def api_propose_tag_alias(body: TagAliasProposalBody) -> dict:
"""Submit a low-confidence alias proposal for human review."""
from atocore.memory.service import create_tag_alias_proposal
pid = create_tag_alias_proposal(
alias=body.alias, canonical=body.canonical,
confidence=body.confidence, alias_count=body.alias_count,
canonical_count=body.canonical_count, reason=body.reason,
)
if pid is None:
return {"proposal_id": None, "duplicate": True}
return {"proposal_id": pid, "duplicate": False}
@router.post("/admin/tags/aliases/apply")
def api_apply_tag_alias(body: TagAliasApplyBody) -> dict:
"""Apply an alias rewrite directly (used by the auto-approval path).
Creates a tag_aliases row in status=approved with the apply result
recorded, so autonomous merges land in the same audit surface as
human approvals.
"""
from datetime import datetime as _dt, timezone as _tz
from atocore.memory.service import apply_tag_alias, create_tag_alias_proposal
from atocore.models.database import get_connection
# Record proposal + apply + mark approved in one flow
pid = create_tag_alias_proposal(
alias=body.alias, canonical=body.canonical,
confidence=body.confidence, alias_count=body.alias_count,
canonical_count=body.canonical_count, reason=body.reason,
)
if pid is None:
# A pending proposal already exists — don't double-apply.
raise HTTPException(status_code=409, detail="A pending proposal already exists for this (alias, canonical) pair — approve it via /admin/tags/aliases/{id}/approve")
try:
result = apply_tag_alias(body.alias, body.canonical, actor=body.actor)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
now_str = _dt.now(_tz.utc).strftime("%Y-%m-%d %H:%M:%S")
with get_connection() as conn:
conn.execute(
"UPDATE tag_aliases SET status = 'approved', resolved_at = ?, "
"resolved_by = ?, applied_to_memories = ? WHERE id = ?",
(now_str, body.actor, result["memories_touched"], pid),
)
return {
"proposal_id": pid,
"memories_touched": result["memories_touched"],
"alias": body.alias, "canonical": body.canonical,
}
@router.post("/admin/tags/aliases/{proposal_id}/approve")
def api_approve_tag_alias(proposal_id: str, body: TagAliasResolveBody) -> dict:
"""Human-in-the-loop approve for a pending proposal."""
from atocore.memory.service import approve_tag_alias
result = approve_tag_alias(proposal_id, actor=body.actor)
if result is None:
raise HTTPException(status_code=404, detail="Proposal not found or already resolved")
return {"status": "approved", "proposal_id": proposal_id,
"memories_touched": result["memories_touched"]}
@router.post("/admin/tags/aliases/{proposal_id}/reject")
def api_reject_tag_alias(proposal_id: str, body: TagAliasResolveBody) -> dict:
"""Human-in-the-loop reject for a pending proposal."""
from atocore.memory.service import reject_tag_alias
if not reject_tag_alias(proposal_id, actor=body.actor):
raise HTTPException(status_code=404, detail="Proposal not found or already resolved")
return {"status": "rejected", "proposal_id": proposal_id}
class DecayRunBody(BaseModel):
idle_days_threshold: int = 30
daily_decay_factor: float = 0.97

View File

@@ -0,0 +1,158 @@
"""Shared LLM prompt + parser for tag canonicalization (Phase 7C).
Stdlib-only, importable from both the in-container service layer and the
host-side batch script that shells out to ``claude -p``.
The prompt instructs the model to propose a map of domain_tag aliases
to their canonical form. Confidence is key here — we AUTO-APPLY high-
confidence aliases; low-confidence go to human review. Over-merging
distinct concepts ("optics" vs "optical" — sometimes equivalent,
sometimes not) destroys cross-cutting retrieval, so the model is
instructed to err conservative.
"""
from __future__ import annotations
import json
from typing import Any
TAG_CANON_PROMPT_VERSION = "tagcanon-0.1.0"
MAX_TAGS_IN_PROMPT = 100
SYSTEM_PROMPT = """You canonicalize domain tags for AtoCore's memory layer.
Input: a distribution of lowercase domain tags (keyword → usage count across active memories). Examples: "firmware: 23", "fw: 5", "firmware-control: 3", "optics: 18", "optical: 2".
Your job: identify aliases — distinct strings that refer to the SAME concept — and map them to a single canonical form. The canonical should be the clearest / most-used / most-descriptive variant.
STRICT RULES:
1. ONLY propose aliases that are UNAMBIGUOUSLY equivalent. Examples:
- "fw""firmware" (abbreviation)
- "firmware-control""firmware" (compound narrowing — only if usage context makes it clear the narrower one is never used to DISTINGUISH from firmware-in-general)
- "py""python"
- "ml""machine-learning"
Do NOT merge:
- "optics" vs "optical" — these CAN diverge ("optics" = subsystem/product domain; "optical" = adjective used in non-optics contexts)
- "p04" vs "p04-gigabit" — project ids are their own namespace, never canonicalize
- "thermal" vs "temperature" — related but distinct
- Anything where you're not sure — skip it, human review will catch real aliases next week
2. Confidence scale:
0.9+ obvious abbreviation, very high usage disparity, no plausible alternative meaning
0.7-0.9 likely alias, one-word-diff or standard contraction
0.5-0.7 plausible but requires context — low count on alias side
<0.5 DO NOT PROPOSE — if you're under 0.5, skip the pair entirely
AtoCore auto-applies aliases at confidence >= 0.8; anything below goes to human review.
3. The CANONICAL must actually appear in the input list (don't invent a new term).
4. Never propose `alias == canonical`. Never propose circular mappings.
5. Project tags (p04, p05, p06, abb-space, atomizer-v2, atocore, apm) are OFF LIMITS — they are project identifiers, not concepts. Leave them alone entirely.
OUTPUT — raw JSON, no prose, no markdown fences:
{
"aliases": [
{"alias": "fw", "canonical": "firmware", "confidence": 0.95, "reason": "fw is a standard abbreviation of firmware; 5 uses vs 23"},
{"alias": "ml", "canonical": "machine-learning", "confidence": 0.90, "reason": "ml is the universal abbreviation"}
]
}
Empty aliases list is fine if nothing in the distribution is a clear alias. Err conservative — one false merge can pollute retrieval for hundreds of memories."""
def build_user_message(tag_distribution: dict[str, int]) -> str:
"""Format the tag distribution for the model.
Limited to MAX_TAGS_IN_PROMPT entries, sorted by count descending
so high-usage tags appear first (the LLM uses them as anchor points
for canonical selection).
"""
if not tag_distribution:
return "Empty tag distribution — return {\"aliases\": []}."
sorted_tags = sorted(tag_distribution.items(), key=lambda x: x[1], reverse=True)
top = sorted_tags[:MAX_TAGS_IN_PROMPT]
lines = [f"{tag}: {count}" for tag, count in top]
return (
f"Tag distribution across {sum(tag_distribution.values())} total tag references "
f"(showing top {len(top)} of {len(tag_distribution)} unique tags):\n\n"
+ "\n".join(lines)
+ "\n\nReturn the JSON aliases map now. Only propose UNAMBIGUOUS equivalents."
)
def parse_canon_output(raw_output: str) -> list[dict[str, Any]]:
"""Strip markdown fences / prose and return the parsed aliases list."""
text = (raw_output or "").strip()
if text.startswith("```"):
text = text.strip("`")
nl = text.find("\n")
if nl >= 0:
text = text[nl + 1:]
if text.endswith("```"):
text = text[:-3]
text = text.strip()
if not text.lstrip().startswith("{"):
start = text.find("{")
end = text.rfind("}")
if start >= 0 and end > start:
text = text[start:end + 1]
try:
parsed = json.loads(text)
except json.JSONDecodeError:
return []
if not isinstance(parsed, dict):
return []
aliases = parsed.get("aliases") or []
if not isinstance(aliases, list):
return []
return [a for a in aliases if isinstance(a, dict)]
# Project tokens that must never be canonicalized — they're project ids,
# not concepts. Keep this list in sync with the registered projects.
# Safe to be over-inclusive; extra entries just skip canonicalization.
PROTECTED_PROJECT_TOKENS = frozenset({
"p04", "p04-gigabit",
"p05", "p05-interferometer",
"p06", "p06-polisher",
"p08", "abb-space",
"atomizer", "atomizer-v2",
"atocore", "apm",
})
def normalize_alias_item(item: dict[str, Any]) -> dict[str, Any] | None:
"""Validate one raw alias proposal. Returns None if unusable.
Filters: non-strings, empty strings, identity mappings, protected
project tokens on either side.
"""
alias = str(item.get("alias") or "").strip().lower()
canonical = str(item.get("canonical") or "").strip().lower()
if not alias or not canonical:
return None
if alias == canonical:
return None
if alias in PROTECTED_PROJECT_TOKENS or canonical in PROTECTED_PROJECT_TOKENS:
return None
try:
confidence = float(item.get("confidence", 0.0))
except (TypeError, ValueError):
confidence = 0.0
confidence = max(0.0, min(1.0, confidence))
reason = str(item.get("reason") or "").strip()[:300]
return {
"alias": alias,
"canonical": canonical,
"confidence": confidence,
"reason": reason,
}

View File

@@ -1038,6 +1038,224 @@ def _validate_confidence(confidence: float) -> None:
raise ValueError("Confidence must be between 0.0 and 1.0")
# ---------------------------------------------------------------------
# Phase 7C — Tag canonicalization
# ---------------------------------------------------------------------
def get_tag_distribution(
active_only: bool = True,
min_count: int = 1,
) -> dict[str, int]:
"""Return {tag: occurrence_count} across memories for LLM input.
Used by the canonicalization detector to spot alias clusters like
{firmware: 23, fw: 5, firmware-control: 3}. Only counts memories
in the requested status (active by default) so superseded/invalid
rows don't bias the distribution.
"""
import json as _json
counts: dict[str, int] = {}
query = "SELECT domain_tags FROM memories"
if active_only:
query += " WHERE status = 'active'"
with get_connection() as conn:
rows = conn.execute(query).fetchall()
for r in rows:
tags_raw = r["domain_tags"]
try:
tags = _json.loads(tags_raw) if tags_raw else []
except Exception:
tags = []
if not isinstance(tags, list):
continue
for t in tags:
if not isinstance(t, str):
continue
key = t.strip().lower()
if key:
counts[key] = counts.get(key, 0) + 1
if min_count > 1:
counts = {k: v for k, v in counts.items() if v >= min_count}
return counts
def apply_tag_alias(
alias: str,
canonical: str,
actor: str = "tag-canon",
) -> dict:
"""Rewrite every active memory's domain_tags: alias → canonical.
Atomic per-memory. Dedupes within each memory's tag list (so if a
memory already has both alias AND canonical, we drop the alias and
keep canonical without duplicating). Writes one audit row per
touched memory with action="tag_canonicalized" so the full trail
is recoverable.
Returns {"memories_touched": int, "alias": ..., "canonical": ...}.
"""
import json as _json
alias = (alias or "").strip().lower()
canonical = (canonical or "").strip().lower()
if not alias or not canonical:
raise ValueError("alias and canonical must be non-empty")
if alias == canonical:
raise ValueError("alias cannot equal canonical")
touched: list[tuple[str, list[str], list[str]]] = []
with get_connection() as conn:
rows = conn.execute(
"SELECT id, domain_tags FROM memories WHERE status = 'active'"
).fetchall()
for r in rows:
raw = r["domain_tags"]
try:
tags = _json.loads(raw) if raw else []
except Exception:
tags = []
if not isinstance(tags, list):
continue
if alias not in tags:
continue
old_tags = [t for t in tags if isinstance(t, str)]
new_tags: list[str] = []
for t in old_tags:
rewritten = canonical if t == alias else t
if rewritten not in new_tags:
new_tags.append(rewritten)
if new_tags == old_tags:
continue
conn.execute(
"UPDATE memories SET domain_tags = ?, updated_at = CURRENT_TIMESTAMP "
"WHERE id = ?",
(_json.dumps(new_tags), r["id"]),
)
touched.append((r["id"], old_tags, new_tags))
# Audit rows outside the transaction
for mem_id, old_tags, new_tags in touched:
_audit_memory(
memory_id=mem_id,
action="tag_canonicalized",
actor=actor,
before={"domain_tags": old_tags},
after={"domain_tags": new_tags},
note=f"{alias}{canonical}",
)
if touched:
log.info("tag_alias_applied", alias=alias, canonical=canonical, memories_touched=len(touched))
return {
"memories_touched": len(touched),
"alias": alias,
"canonical": canonical,
}
def create_tag_alias_proposal(
alias: str,
canonical: str,
confidence: float,
alias_count: int = 0,
canonical_count: int = 0,
reason: str = "",
) -> str | None:
"""Insert a tag_aliases row in status=pending.
Idempotent: if a pending proposal for (alias, canonical) already
exists, returns None.
"""
import json as _json # noqa: F401 — kept for parity with other helpers
alias = (alias or "").strip().lower()
canonical = (canonical or "").strip().lower()
if not alias or not canonical or alias == canonical:
return None
confidence = max(0.0, min(1.0, float(confidence)))
proposal_id = str(uuid.uuid4())
with get_connection() as conn:
existing = conn.execute(
"SELECT id FROM tag_aliases WHERE alias = ? AND canonical = ? "
"AND status = 'pending'",
(alias, canonical),
).fetchone()
if existing:
return None
conn.execute(
"INSERT INTO tag_aliases (id, alias, canonical, status, confidence, "
"alias_count, canonical_count, reason) "
"VALUES (?, ?, ?, 'pending', ?, ?, ?, ?)",
(proposal_id, alias, canonical, confidence,
int(alias_count), int(canonical_count), reason[:500]),
)
log.info(
"tag_alias_proposed",
proposal_id=proposal_id,
alias=alias,
canonical=canonical,
confidence=round(confidence, 3),
)
return proposal_id
def get_tag_alias_proposals(status: str = "pending", limit: int = 100) -> list[dict]:
"""List tag alias proposals."""
with get_connection() as conn:
rows = conn.execute(
"SELECT * FROM tag_aliases WHERE status = ? "
"ORDER BY confidence DESC, created_at DESC LIMIT ?",
(status, limit),
).fetchall()
return [dict(r) for r in rows]
def approve_tag_alias(
proposal_id: str,
actor: str = "human-triage",
) -> dict | None:
"""Apply the alias rewrite + mark the proposal approved.
Returns the apply_tag_alias result dict, or None if the proposal
is not found or already resolved.
"""
now_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
with get_connection() as conn:
row = conn.execute(
"SELECT alias, canonical, status FROM tag_aliases WHERE id = ?",
(proposal_id,),
).fetchone()
if row is None or row["status"] != "pending":
return None
alias, canonical = row["alias"], row["canonical"]
result = apply_tag_alias(alias, canonical, actor=actor)
with get_connection() as conn:
conn.execute(
"UPDATE tag_aliases SET status = 'approved', resolved_at = ?, "
"resolved_by = ?, applied_to_memories = ? WHERE id = ?",
(now_str, actor, result["memories_touched"], proposal_id),
)
return result
def reject_tag_alias(proposal_id: str, actor: str = "human-triage") -> bool:
"""Mark a tag alias proposal as rejected without applying it."""
now_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
with get_connection() as conn:
result = conn.execute(
"UPDATE tag_aliases SET status = 'rejected', resolved_at = ?, "
"resolved_by = ? WHERE id = ? AND status = 'pending'",
(now_str, actor, proposal_id),
)
return result.rowcount > 0
# ---------------------------------------------------------------------
# Phase 7A — Memory Consolidation: merge-candidate lifecycle
# ---------------------------------------------------------------------

View File

@@ -287,6 +287,33 @@ def _apply_migrations(conn: sqlite3.Connection) -> None:
"CREATE INDEX IF NOT EXISTS idx_mmc_created_at ON memory_merge_candidates(created_at)"
)
# Phase 7C (Memory Consolidation — tag canonicalization): alias → canonical
# map for domain_tags. A weekly LLM pass proposes rows here; high-confidence
# ones auto-apply (rewrite domain_tags across all memories), low-confidence
# ones stay pending for human approval. Immutable history: resolved rows
# keep status=approved/rejected; the same alias can re-appear with a new
# id if the tag reaches a different canonical later.
conn.execute(
"""
CREATE TABLE IF NOT EXISTS tag_aliases (
id TEXT PRIMARY KEY,
alias TEXT NOT NULL,
canonical TEXT NOT NULL,
status TEXT DEFAULT 'pending',
confidence REAL DEFAULT 0.0,
alias_count INTEGER DEFAULT 0,
canonical_count INTEGER DEFAULT 0,
reason TEXT DEFAULT '',
applied_to_memories INTEGER DEFAULT 0,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
resolved_at DATETIME,
resolved_by TEXT
)
"""
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_tag_aliases_status ON tag_aliases(status)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_tag_aliases_alias ON tag_aliases(alias)")
def _column_exists(conn: sqlite3.Connection, table: str, column: str) -> bool:
rows = conn.execute(f"PRAGMA table_info({table})").fetchall()