feat: Phase 7C — tag canonicalization (autonomous, weekly)
An LLM proposes alias→canonical mappings for domain_tags. Proposals with
confidence >= 0.8 are auto-applied; anything below that goes to human
triage. Project identifiers (p04, p05, p06, atocore, apm, etc.) are
protected from ever being canonicalized, since they are their own
namespace, not concepts.
Problem solved: tag drift fragments retrieval. "fw" vs "firmware" vs
"firmware-control" all mean the same thing, but cross-cutting queries
that filter by tag only hit one variant. Weekly canonicalization pass
keeps the tag graph clean.
- Schema: tag_aliases table (pending | approved | rejected)
- atocore.memory._tag_canon_prompt (stdlib-only, protected project tokens)
- service: get_tag_distribution, apply_tag_alias (atomic per-memory,
dedupes if both alias + canonical present), create / approve / reject
proposal lifecycle, per-memory audit rows with action="tag_canonicalized"
- scripts/canonicalize_tags.py: host-side detector, autonomous by default,
--no-auto-approve kill switch
- 6 API endpoints under /admin/tags/* (distribution, list, propose,
apply, approve/{id}, reject/{id})
- Step B4 in batch-extract.sh (Sundays only — weekly cadence)
- 26 new tests (prompt parser, normalizer protections, distribution
counting, rewrite atomicity, dedup, audit, lifecycle). 414 → 440.
Design: aggressive protection of project tokens because a false
canonicalization (p04 → p04-gigabit, or vice versa) would scramble
cross-project filtering. Err toward preservation; the alias only
applies if the model is very confident AND both strings appear in
the current distribution.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1517,6 +1517,125 @@ def api_graduation_status() -> dict:
|
||||
return out
|
||||
|
||||
|
||||
# --- Phase 7C: tag canonicalization ---
|
||||
|
||||
|
||||
class TagAliasProposalBody(BaseModel):
    # Request body for POST /admin/tags/aliases/propose.
    # Only `alias` and `canonical` are required; the remaining fields are
    # forwarded unchanged to create_tag_alias_proposal.

    # Tag spelling proposed for rewriting.
    alias: str
    # Tag the alias should be rewritten to.
    canonical: str
    # Confidence attached to the proposal; defaults to 0.0 when omitted.
    confidence: float = 0.0
    # Free-text justification for the mapping.
    reason: str = ""
    # Presumably the usage counts of each tag at proposal time; the
    # endpoint only passes them through -- TODO confirm with the detector.
    alias_count: int = 0
    canonical_count: int = 0
|
||||
|
||||
|
||||
class TagAliasApplyBody(BaseModel):
    # Request body for POST /admin/tags/aliases/apply (direct apply path).
    # Same shape as the proposal body plus the `actor` recorded on the
    # resolved row; note the higher default confidence (0.9).

    # Tag spelling to rewrite.
    alias: str
    # Tag the alias is rewritten to.
    canonical: str
    # Confidence stored with the proposal row; defaults to 0.9 when omitted.
    confidence: float = 0.9
    # Free-text justification for the mapping.
    reason: str = ""
    # Presumably the usage counts of each tag at detection time; the
    # endpoint only passes them through -- TODO confirm with the detector.
    alias_count: int = 0
    canonical_count: int = 0
    # Identity passed to apply_tag_alias and written to resolved_by.
    actor: str = "auto-tag-canon"
|
||||
|
||||
|
||||
class TagAliasResolveBody(BaseModel):
    # Request body shared by the approve and reject endpoints.

    # Identity forwarded to approve_tag_alias / reject_tag_alias.
    actor: str = "human-triage"
|
||||
|
||||
|
||||
@router.get("/admin/tags/distribution")
def api_tag_distribution() -> dict:
    """Report how often each tag is used (for UI / debug).

    Returns a dict with the summed reference count, the number of
    distinct tags, and a per-tag list ordered by descending count.
    """
    from atocore.memory.service import get_tag_distribution

    counts = get_tag_distribution()
    # Most-used tags first; ties keep the order the service returned them in.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    tag_rows = [{"tag": name, "count": uses} for name, uses in ranked]
    return {
        "total_references": sum(counts.values()),
        "unique_tags": len(counts),
        "tags": tag_rows,
    }
|
||||
|
||||
|
||||
@router.get("/admin/tags/aliases")
def api_list_tag_aliases(status: str = "pending", limit: int = 100) -> dict:
    """Return tag alias proposals for a given status (pending by default).

    Both query parameters are forwarded as-is to get_tag_alias_proposals;
    the response carries the rows plus their count.
    """
    from atocore.memory.service import get_tag_alias_proposals

    proposals = get_tag_alias_proposals(status=status, limit=limit)
    return {"proposals": proposals, "count": len(proposals)}
|
||||
|
||||
|
||||
@router.post("/admin/tags/aliases/propose")
def api_propose_tag_alias(body: TagAliasProposalBody) -> dict:
    """Queue an alias proposal for human review.

    Returns the new proposal id, or ``proposal_id: None`` with
    ``duplicate: True`` when create_tag_alias_proposal yields no id.
    """
    from atocore.memory.service import create_tag_alias_proposal

    new_id = create_tag_alias_proposal(
        alias=body.alias,
        canonical=body.canonical,
        confidence=body.confidence,
        alias_count=body.alias_count,
        canonical_count=body.canonical_count,
        reason=body.reason,
    )
    # A missing id is reported to the caller as a duplicate submission.
    return {"proposal_id": new_id, "duplicate": new_id is None}
|
||||
|
||||
|
||||
@router.post("/admin/tags/aliases/apply")
def api_apply_tag_alias(body: TagAliasApplyBody) -> dict:
    """Apply an alias rewrite directly (used by the auto-approval path).

    Creates a tag_aliases row, runs the rewrite, then marks that row
    status=approved with the apply result recorded, so autonomous merges
    land in the same audit surface as human approvals.

    Returns:
        The proposal id, the number of memories touched, and the
        alias/canonical pair that was applied.

    Raises:
        HTTPException: 409 when create_tag_alias_proposal returns no id
            (a pending proposal already exists for the pair); 400 when
            apply_tag_alias raises ValueError. In the 400 case the row
            created by this request is removed first, so a failed apply
            does not leave a pending proposal behind that would turn
            every retry into a 409.
    """
    from datetime import datetime as _dt, timezone as _tz

    from atocore.memory.service import apply_tag_alias, create_tag_alias_proposal
    from atocore.models.database import get_connection

    # Record proposal + apply + mark approved in one flow
    pid = create_tag_alias_proposal(
        alias=body.alias,
        canonical=body.canonical,
        confidence=body.confidence,
        alias_count=body.alias_count,
        canonical_count=body.canonical_count,
        reason=body.reason,
    )
    if pid is None:
        # A pending proposal already exists — don't double-apply.
        raise HTTPException(status_code=409, detail="A pending proposal already exists for this (alias, canonical) pair — approve it via /admin/tags/aliases/{id}/approve")

    try:
        result = apply_tag_alias(body.alias, body.canonical, actor=body.actor)
    except ValueError as e:
        # The rewrite never ran, so undo the row this request just created.
        # The status guard leaves the row alone if it was resolved meanwhile.
        with get_connection() as conn:
            conn.execute(
                "DELETE FROM tag_aliases WHERE id = ? AND status = 'pending'",
                (pid,),
            )
        raise HTTPException(status_code=400, detail=str(e)) from e

    # UTC timestamp in "YYYY-MM-DD HH:MM:SS" form for resolved_at.
    now_str = _dt.now(_tz.utc).strftime("%Y-%m-%d %H:%M:%S")
    with get_connection() as conn:
        conn.execute(
            "UPDATE tag_aliases SET status = 'approved', resolved_at = ?, "
            "resolved_by = ?, applied_to_memories = ? WHERE id = ?",
            (now_str, body.actor, result["memories_touched"], pid),
        )
    return {
        "proposal_id": pid,
        "memories_touched": result["memories_touched"],
        "alias": body.alias,
        "canonical": body.canonical,
    }
|
||||
|
||||
|
||||
@router.post("/admin/tags/aliases/{proposal_id}/approve")
def api_approve_tag_alias(proposal_id: str, body: TagAliasResolveBody) -> dict:
    """Approve a pending proposal on behalf of a human reviewer.

    Responds 404 when approve_tag_alias returns None; otherwise echoes
    the proposal id together with the number of memories touched.
    """
    from atocore.memory.service import approve_tag_alias

    outcome = approve_tag_alias(proposal_id, actor=body.actor)
    if outcome is None:
        raise HTTPException(status_code=404, detail="Proposal not found or already resolved")

    touched = outcome["memories_touched"]
    return {
        "status": "approved",
        "proposal_id": proposal_id,
        "memories_touched": touched,
    }
|
||||
|
||||
|
||||
@router.post("/admin/tags/aliases/{proposal_id}/reject")
def api_reject_tag_alias(proposal_id: str, body: TagAliasResolveBody) -> dict:
    """Reject a pending proposal on behalf of a human reviewer.

    Responds 404 when reject_tag_alias returns a falsy result.
    """
    from atocore.memory.service import reject_tag_alias

    was_rejected = reject_tag_alias(proposal_id, actor=body.actor)
    if was_rejected:
        return {"status": "rejected", "proposal_id": proposal_id}
    raise HTTPException(status_code=404, detail="Proposal not found or already resolved")
|
||||
|
||||
|
||||
class DecayRunBody(BaseModel):
|
||||
idle_days_threshold: int = 30
|
||||
daily_decay_factor: float = 0.97
|
||||
|
||||
Reference in New Issue
Block a user