feat: Phase 7C — tag canonicalization (autonomous, weekly)

An LLM proposes alias→canonical mappings for domain_tags. Proposals with
confidence >= 0.8 are auto-applied; anything below goes to human triage.
Project identifiers (p04, p05, p06, atocore, apm, etc.) are protected and
are never canonicalized, since they form their own namespace rather than
naming concepts.

Problem solved: tag drift fragments retrieval. "fw" vs "firmware" vs
"firmware-control" all mean the same thing, but cross-cutting queries
that filter by tag only hit one variant. Weekly canonicalization pass
keeps the tag graph clean.

- Schema: tag_aliases table (pending | approved | rejected)
- atocore.memory._tag_canon_prompt (stdlib-only, protected project tokens)
- service: get_tag_distribution, apply_tag_alias (atomic per-memory,
  dedupes if both alias + canonical present), create / approve / reject
  proposal lifecycle, per-memory audit rows with action="tag_canonicalized"
- scripts/canonicalize_tags.py: host-side detector, autonomous by default,
  --no-auto-approve kill switch
- 6 API endpoints under /admin/tags/* (distribution, list, propose,
  apply, approve/{id}, reject/{id})
- Step B4 in batch-extract.sh (Sundays only — weekly cadence)
- 26 new tests (prompt parser, normalizer protections, distribution
  counting, rewrite atomicity, dedup, audit, lifecycle). 414 → 440.

Design: aggressive protection of project tokens because a false
canonicalization (p04 → p04-gigabit, or vice versa) would scramble
cross-project filtering. Err toward preservation; the alias only
applies if the model is very confident AND both strings appear in
the current distribution.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-19 09:41:02 -04:00
parent e840ef4be3
commit 877b97ec78
7 changed files with 1085 additions and 0 deletions

View File

@@ -1517,6 +1517,125 @@ def api_graduation_status() -> dict:
return out
# --- Phase 7C: tag canonicalization ---
class TagAliasProposalBody(BaseModel):
    # Request body for POST /admin/tags/aliases/propose. Every field is
    # forwarded unchanged to create_tag_alias_proposal by
    # api_propose_tag_alias; no validation happens in this model beyond
    # the declared types.
    alias: str  # tag variant that would be rewritten
    canonical: str  # tag the alias would be rewritten to
    confidence: float = 0.0  # proposer's confidence; presumably in [0, 1] — not enforced here
    reason: str = ""  # free-text justification for the mapping
    alias_count: int = 0  # presumably occurrences of `alias` in the tag distribution — TODO confirm
    canonical_count: int = 0  # presumably occurrences of `canonical` — TODO confirm
class TagAliasApplyBody(BaseModel):
    # Request body for POST /admin/tags/aliases/apply. The proposal fields
    # are forwarded to create_tag_alias_proposal, and alias / canonical /
    # actor are forwarded to apply_tag_alias by api_apply_tag_alias.
    alias: str  # tag variant to rewrite
    canonical: str  # tag the alias is rewritten to
    confidence: float = 0.9  # recorded on the proposal row; presumably in [0, 1] — not enforced here
    reason: str = ""  # free-text justification for the mapping
    alias_count: int = 0  # presumably occurrences of `alias` in the tag distribution — TODO confirm
    canonical_count: int = 0  # presumably occurrences of `canonical` — TODO confirm
    actor: str = "auto-tag-canon"  # passed to apply_tag_alias and stored as resolved_by
class TagAliasResolveBody(BaseModel):
    # Request body shared by the approve and reject endpoints under
    # /admin/tags/aliases/{proposal_id}/.
    actor: str = "human-triage"  # forwarded as `actor` to approve_tag_alias / reject_tag_alias
@router.get("/admin/tags/distribution")
def api_tag_distribution() -> dict:
    """Report tag usage across active memories, most-used tag first.

    Returns:
        A dict with ``total_references`` (sum of all counts),
        ``unique_tags`` (number of distinct tags) and ``tags`` (a list of
        ``{"tag", "count"}`` entries sorted by descending count).
    """
    from atocore.memory.service import get_tag_distribution

    counts = get_tag_distribution()
    # reverse=True keeps the sort stable, so equal counts retain the order
    # in which the service returned them.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    tag_rows = [{"tag": name, "count": uses} for name, uses in ranked]
    return {
        "total_references": sum(counts.values()),
        "unique_tags": len(counts),
        "tags": tag_rows,
    }
@router.get("/admin/tags/aliases")
def api_list_tag_aliases(status: str = "pending", limit: int = 100) -> dict:
    """Return tag alias proposals filtered by status (pending by default).

    Args:
        status: Proposal status to filter on; passed through to the service.
        limit: Maximum number of rows requested from the service.

    Returns:
        A dict with the ``proposals`` returned by the service and their
        ``count``.
    """
    from atocore.memory.service import get_tag_alias_proposals

    proposals = get_tag_alias_proposals(status=status, limit=limit)
    payload = {"proposals": proposals}
    payload["count"] = len(proposals)
    return payload
@router.post("/admin/tags/aliases/propose")
def api_propose_tag_alias(body: TagAliasProposalBody) -> dict:
    """Queue a low-confidence alias proposal for human review.

    Returns:
        ``{"proposal_id": <id>, "duplicate": False}`` when a proposal was
        created, or ``{"proposal_id": None, "duplicate": True}`` when the
        service returned ``None`` instead of an id.
    """
    from atocore.memory.service import create_tag_alias_proposal

    new_id = create_tag_alias_proposal(
        alias=body.alias,
        canonical=body.canonical,
        confidence=body.confidence,
        alias_count=body.alias_count,
        canonical_count=body.canonical_count,
        reason=body.reason,
    )
    # The service signals "nothing created" with None; this endpoint
    # reports that case as a duplicate.
    return {"proposal_id": new_id, "duplicate": new_id is None}
@router.post("/admin/tags/aliases/apply")
def api_apply_tag_alias(body: TagAliasApplyBody) -> dict:
    """Apply an alias rewrite directly (used by the auto-approval path).

    Creates a tag_aliases row, runs the rewrite, then marks that row
    status=approved with the apply result recorded, so autonomous merges
    land in the same audit surface as human approvals.

    Returns:
        A dict with ``proposal_id``, ``memories_touched`` and the echoed
        ``alias`` / ``canonical`` pair.

    Raises:
        HTTPException: 409 when create_tag_alias_proposal returns None
            (reported as "a pending proposal already exists"); 400 when
            apply_tag_alias raises ValueError.
    """
    from datetime import datetime as _dt, timezone as _tz

    from atocore.memory.service import apply_tag_alias, create_tag_alias_proposal
    from atocore.models.database import get_connection

    # Record proposal + apply + mark approved in one flow
    pid = create_tag_alias_proposal(
        alias=body.alias,
        canonical=body.canonical,
        confidence=body.confidence,
        alias_count=body.alias_count,
        canonical_count=body.canonical_count,
        reason=body.reason,
    )
    if pid is None:
        # A pending proposal already exists — don't double-apply.
        raise HTTPException(
            status_code=409,
            detail=(
                "A pending proposal already exists for this (alias, canonical) pair "
                "— approve it via /admin/tags/aliases/{id}/approve"
            ),
        )

    # NOTE(review): if the rewrite below fails, the row created above is not
    # updated by this function; presumably it stays pending, so a retry
    # would hit the 409 branch — confirm that this is intended.
    try:
        result = apply_tag_alias(body.alias, body.canonical, actor=body.actor)
    except ValueError as e:
        # Chain the cause explicitly so the original error stays attached
        # to the HTTP error in tracebacks and logs.
        raise HTTPException(status_code=400, detail=str(e)) from e

    memories_touched = result["memories_touched"]
    now_str = _dt.now(_tz.utc).strftime("%Y-%m-%d %H:%M:%S")
    with get_connection() as conn:
        conn.execute(
            "UPDATE tag_aliases SET status = 'approved', resolved_at = ?, "
            "resolved_by = ?, applied_to_memories = ? WHERE id = ?",
            (now_str, body.actor, memories_touched, pid),
        )
    return {
        "proposal_id": pid,
        "memories_touched": memories_touched,
        "alias": body.alias,
        "canonical": body.canonical,
    }
@router.post("/admin/tags/aliases/{proposal_id}/approve")
def api_approve_tag_alias(proposal_id: str, body: TagAliasResolveBody) -> dict:
    """Approve a pending alias proposal on behalf of a human reviewer.

    Returns:
        A dict with ``status`` set to ``"approved"``, the ``proposal_id``
        and the ``memories_touched`` count reported by the service.

    Raises:
        HTTPException: 404 when the service returns ``None`` for this id.
    """
    from atocore.memory.service import approve_tag_alias

    outcome = approve_tag_alias(proposal_id, actor=body.actor)
    if outcome is None:
        raise HTTPException(
            status_code=404,
            detail="Proposal not found or already resolved",
        )

    response = {"status": "approved", "proposal_id": proposal_id}
    response["memories_touched"] = outcome["memories_touched"]
    return response
@router.post("/admin/tags/aliases/{proposal_id}/reject")
def api_reject_tag_alias(proposal_id: str, body: TagAliasResolveBody) -> dict:
    """Reject a pending alias proposal on behalf of a human reviewer.

    Returns:
        A dict with ``status`` set to ``"rejected"`` and the ``proposal_id``.

    Raises:
        HTTPException: 404 when the service reports a falsy result for
            this id.
    """
    from atocore.memory.service import reject_tag_alias

    was_rejected = reject_tag_alias(proposal_id, actor=body.actor)
    if was_rejected:
        return {"status": "rejected", "proposal_id": proposal_id}
    raise HTTPException(
        status_code=404,
        detail="Proposal not found or already resolved",
    )
class DecayRunBody(BaseModel):
idle_days_threshold: int = 30
daily_decay_factor: float = 0.97