feat: Phase 7C — tag canonicalization (autonomous, weekly)

LLM proposes alias→canonical mappings for domain_tags. Proposals with
confidence >= 0.8 are auto-applied; anything below goes to human triage.
Project identifiers (p04, p05, p06, atocore, apm, etc.) are protected
from ever being canonicalized, since they're their own namespace, not
concepts.

Problem solved: tag drift fragments retrieval. "fw" vs "firmware" vs
"firmware-control" all mean the same thing, but cross-cutting queries
that filter by tag only hit one variant. Weekly canonicalization pass
keeps the tag graph clean.

- Schema: tag_aliases table (pending | approved | rejected)
- atocore.memory._tag_canon_prompt (stdlib-only, protected project tokens)
- service: get_tag_distribution, apply_tag_alias (atomic per-memory,
  dedupes if both alias + canonical present), create / approve / reject
  proposal lifecycle, per-memory audit rows with action="tag_canonicalized"
- scripts/canonicalize_tags.py: host-side detector, autonomous by default,
  --no-auto-approve kill switch
- 6 API endpoints under /admin/tags/* (distribution, list, propose,
  apply, approve/{id}, reject/{id})
- Step B4 in batch-extract.sh (Sundays only — weekly cadence)
- 26 new tests (prompt parser, normalizer protections, distribution
  counting, rewrite atomicity, dedup, audit, lifecycle). 414 → 440.

Design: aggressive protection of project tokens because a false
canonicalization (p04 → p04-gigabit, or vice versa) would scramble
cross-project filtering. Err toward preservation; the alias only
applies if the model is very confident AND both strings appear in
the current distribution.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-19 09:41:02 -04:00
parent e840ef4be3
commit 877b97ec78
7 changed files with 1085 additions and 0 deletions

296
tests/test_tag_canon.py Normal file
View File

@@ -0,0 +1,296 @@
"""Phase 7C — tag canonicalization tests.
Covers:
- prompt parser (fences, prose, empty)
- normalizer (identity, protected tokens, empty)
- get_tag_distribution counts across active memories
- apply_tag_alias rewrites + dedupes + audits
- create / approve / reject lifecycle
- idempotency (dup proposals skipped)
"""
from __future__ import annotations
import pytest
from atocore.memory._tag_canon_prompt import (
PROTECTED_PROJECT_TOKENS,
build_user_message,
normalize_alias_item,
parse_canon_output,
)
from atocore.memory.service import (
apply_tag_alias,
approve_tag_alias,
create_memory,
create_tag_alias_proposal,
get_memory_audit,
get_tag_alias_proposals,
get_tag_distribution,
reject_tag_alias,
)
from atocore.models.database import get_connection, init_db
# --- Prompt parser ---
def test_parse_canon_output_handles_fences():
    """A payload wrapped in a Markdown json code fence parses to its one item."""
    fenced = (
        "```json\n"
        '{"aliases": [{"alias": "fw", "canonical": "firmware", "confidence": 0.9}]}\n'
        "```"
    )
    parsed = parse_canon_output(fenced)
    assert len(parsed) == 1
    assert parsed[0]["alias"] == "fw"
def test_parse_canon_output_handles_prose_prefix():
    """Prose ahead of the JSON object does not prevent the item being parsed."""
    chatty = (
        "Here you go:\n"
        '{"aliases": [{"alias": "ml", "canonical": "machine-learning", "confidence": 0.9}]}'
    )
    parsed = parse_canon_output(chatty)
    assert len(parsed) == 1
def test_parse_canon_output_empty_list():
    """An explicit empty ``aliases`` array yields an empty result list."""
    parsed = parse_canon_output('{"aliases": []}')
    assert parsed == []
def test_parse_canon_output_malformed():
    """Non-JSON text and the empty string both parse to an empty list."""
    for garbage in ("not json at all", ""):
        assert parse_canon_output(garbage) == []
# --- Normalizer ---
def test_normalize_alias_strips_and_lowercases():
    """``" FW "`` / ``"Firmware"`` come back as ``"fw"`` / ``"firmware"``.

    Confidence and reason are expected to pass through unchanged.
    """
    raw_item = {
        "alias": " FW ",
        "canonical": "Firmware",
        "confidence": 0.95,
        "reason": "abbrev",
    }
    expected = {
        "alias": "fw",
        "canonical": "firmware",
        "confidence": 0.95,
        "reason": "abbrev",
    }
    assert normalize_alias_item(raw_item) == expected
def test_normalize_rejects_identity():
    """An alias equal to its canonical is dropped: the normalizer returns None."""
    self_mapping = {"alias": "foo", "canonical": "foo", "confidence": 0.9}
    assert normalize_alias_item(self_mapping) is None
def test_normalize_rejects_empty():
    """A blank alias or a blank canonical makes the normalizer return None."""
    blank_cases = (
        {"alias": "", "canonical": "foo", "confidence": 0.9},
        {"alias": "foo", "canonical": "", "confidence": 0.9},
    )
    for item in blank_cases:
        assert normalize_alias_item(item) is None
def test_normalize_protects_project_tokens():
    """Mappings involving a project identifier are rejected in either direction."""
    # Project ids form a namespace of their own; they are not concepts to merge.
    assert "p04" in PROTECTED_PROJECT_TOKENS
    protected_pairs = (
        ("p04", "p04-gigabit"),
        ("p04-gigabit", "p04"),
        ("apm", "part-manager"),
    )
    for alias, canonical in protected_pairs:
        candidate = {"alias": alias, "canonical": canonical, "confidence": 1.0}
        assert normalize_alias_item(candidate) is None
def test_normalize_clamps_confidence():
    """Out-of-range confidence values are clamped: 2.5 -> 1.0 and -0.5 -> 0.0."""
    hi = normalize_alias_item({"alias": "a", "canonical": "b", "confidence": 2.5})
    # Guard against None first so a rejected item fails as a readable assertion
    # instead of a TypeError from subscripting None.
    assert hi is not None and hi["confidence"] == 1.0
    lo = normalize_alias_item({"alias": "a", "canonical": "b", "confidence": -0.5})
    assert lo is not None and lo["confidence"] == 0.0
def test_normalize_handles_non_numeric_confidence():
    """A non-numeric confidence keeps the item but reports confidence 0.0."""
    item = {"alias": "a", "canonical": "b", "confidence": "not a number"}
    normalized = normalize_alias_item(item)
    assert normalized is not None
    assert normalized["confidence"] == 0.0
# --- build_user_message ---
def test_build_user_message_includes_top_tags():
    """The message lists tags as ``name: count`` and mentions the output format."""
    tag_counts = {"firmware": 23, "fw": 5, "optics": 18, "optical": 2}
    prompt = build_user_message(tag_counts)
    for expected_line in ("firmware: 23", "optics: 18"):
        assert expected_line in prompt
    mentions_format = "aliases" in prompt.lower() or "JSON" in prompt
    assert mentions_format
def test_build_user_message_empty():
    """An empty distribution still produces a message that says it is empty."""
    prompt = build_user_message({})
    assert any(word in prompt for word in ("Empty", "empty"))
# --- get_tag_distribution ---
def test_tag_distribution_counts_active_only(tmp_data_dir):
    """Tags on a memory whose status is set to 'invalid' are left out of the counts."""
    init_db()
    for content, tags in (
        ("a", ["firmware", "p06"]),
        ("b", ["firmware"]),
        ("c", ["optics"]),
    ):
        create_memory("knowledge", content, domain_tags=tags)

    # This memory is flipped to 'invalid' below and should NOT be counted.
    excluded = create_memory("knowledge", "d", domain_tags=["firmware", "ignored"])
    with get_connection() as conn:
        conn.execute(
            "UPDATE memories SET status = 'invalid' WHERE id = ?",
            (excluded.id,),
        )

    counts = get_tag_distribution()
    assert counts.get("firmware") == 2  # two active memories
    assert counts.get("optics") == 1
    assert counts.get("p06") == 1
    assert "ignored" not in counts
def test_tag_distribution_min_count_filter(tmp_data_dir):
    """``min_count=2`` keeps a tag used twice and drops a tag used once."""
    init_db()
    for content, tag in (("a", "firmware"), ("b", "firmware"), ("c", "once")):
        create_memory("knowledge", content, domain_tags=[tag])

    counts = get_tag_distribution(min_count=2)
    assert "firmware" in counts
    assert "once" not in counts
# --- apply_tag_alias ---
def test_apply_tag_alias_rewrites_across_memories(tmp_data_dir):
    """Every memory tagged ``fw`` is rewritten to ``firmware``; others are untouched."""
    import json as _json

    init_db()
    tagged_with_project = create_memory("knowledge", "a", domain_tags=["fw", "p06"])
    tagged_alone = create_memory("knowledge", "b", domain_tags=["fw"])
    unrelated = create_memory("knowledge", "c", domain_tags=["optics"])  # untouched

    outcome = apply_tag_alias("fw", "firmware")
    assert outcome["memories_touched"] == 2

    query = "SELECT domain_tags FROM memories WHERE id = ?"
    with get_connection() as conn:
        rows = [
            conn.execute(query, (mem.id,)).fetchone()
            for mem in (tagged_with_project, tagged_alone, unrelated)
        ]

    # domain_tags is read back as a JSON-encoded list.
    first_tags, second_tags, third_tags = (
        _json.loads(row["domain_tags"]) for row in rows
    )
    assert "firmware" in first_tags
    assert "fw" not in first_tags
    assert "firmware" in second_tags
    assert third_tags == ["optics"]  # untouched
def test_apply_tag_alias_dedupes_when_both_present(tmp_data_dir):
    """A memory carrying both fw AND firmware collapses to a single firmware tag."""
    import json as _json

    init_db()
    dual = create_memory("knowledge", "dual-tagged", domain_tags=["fw", "firmware", "p06"])

    outcome = apply_tag_alias("fw", "firmware")
    assert outcome["memories_touched"] == 1

    with get_connection() as conn:
        row = conn.execute(
            "SELECT domain_tags FROM memories WHERE id = ?", (dual.id,)
        ).fetchone()
    stored = _json.loads(row["domain_tags"])

    assert stored.count("firmware") == 1
    assert "fw" not in stored
    assert "p06" in stored
def test_apply_tag_alias_skips_memories_without_alias(tmp_data_dir):
    """A memory that does not carry the alias is neither counted nor modified."""
    import json as _json

    init_db()
    m = create_memory("knowledge", "no match", domain_tags=["optics", "p04"])
    result = apply_tag_alias("fw", "firmware")
    assert result["memories_touched"] == 0

    # The zero count alone does not prove the row was left alone, so read the
    # stored tags back. Compare sorted so the check is independent of the order
    # in which tags are stored.
    with get_connection() as conn:
        r = conn.execute(
            "SELECT domain_tags FROM memories WHERE id = ?", (m.id,)
        ).fetchone()
    assert sorted(_json.loads(r["domain_tags"])) == ["optics", "p04"]
def test_apply_tag_alias_writes_audit(tmp_data_dir):
    """Applying an alias leaves a ``tag_canonicalized`` audit entry on the memory.

    The entry is expected to carry the actor, a note naming the mapping, and
    before/after snapshots of ``domain_tags``.
    """
    init_db()
    memory = create_memory("knowledge", "audited", domain_tags=["fw"])
    apply_tag_alias("fw", "firmware", actor="auto-tag-canon")

    trail = get_memory_audit(memory.id)
    canon_entries = [row for row in trail if row["action"] == "tag_canonicalized"]
    assert canon_entries

    first_entry = canon_entries[0]
    assert first_entry["actor"] == "auto-tag-canon"
    assert "fw → firmware" in first_entry["note"]
    assert "fw" in first_entry["before"]["domain_tags"]
    assert "firmware" in first_entry["after"]["domain_tags"]
def test_apply_tag_alias_rejects_identity(tmp_data_dir):
    """Mapping a tag onto itself raises ``ValueError``."""
    init_db()
    same_tag = "foo"
    with pytest.raises(ValueError):
        apply_tag_alias(same_tag, same_tag)
def test_apply_tag_alias_rejects_empty(tmp_data_dir):
    """An empty alias string raises ``ValueError``."""
    init_db()
    blank_alias = ""
    with pytest.raises(ValueError):
        apply_tag_alias(blank_alias, "firmware")
# --- Proposal lifecycle ---
def test_create_proposal_inserts_pending(tmp_data_dir):
    """A new proposal returns an id and is listed once under status 'pending'."""
    init_db()
    proposal_id = create_tag_alias_proposal(
        "fw",
        "firmware",
        confidence=0.65,
        alias_count=5,
        canonical_count=23,
        reason="standard abbreviation",
    )
    assert proposal_id is not None

    pending = get_tag_alias_proposals(status="pending")
    assert len(pending) == 1
    only = pending[0]
    assert only["alias"] == "fw"
    assert only["confidence"] == pytest.approx(0.65)
def test_create_proposal_idempotent(tmp_data_dir):
    """Re-proposing the same alias/canonical pair returns None the second time."""
    init_db()
    original, duplicate = [
        create_tag_alias_proposal("fw", "firmware", confidence=score)
        for score in (0.6, 0.7)
    ]
    assert original is not None
    assert duplicate is None
def test_approve_applies_rewrite(tmp_data_dir):
    """Approving a pending proposal rewrites memories and records how many."""
    import json as _json

    init_db()
    memory = create_memory("knowledge", "x", domain_tags=["fw"])
    proposal_id = create_tag_alias_proposal("fw", "firmware", confidence=0.7)

    outcome = approve_tag_alias(proposal_id, actor="human-triage")
    assert outcome is not None
    assert outcome["memories_touched"] == 1

    # The proposal is now listed as approved, with applied_to_memories recorded.
    approved = get_tag_alias_proposals(status="approved")
    assert len(approved) == 1
    assert approved[0]["applied_to_memories"] == 1

    # The memory row itself was rewritten.
    with get_connection() as conn:
        row = conn.execute(
            "SELECT domain_tags FROM memories WHERE id = ?", (memory.id,)
        ).fetchone()
    assert "firmware" in _json.loads(row["domain_tags"])
def test_approve_already_resolved_returns_none(tmp_data_dir):
    """Approving an already-approved proposal is a no-op that returns None."""
    init_db()
    pid = create_tag_alias_proposal("a", "b", confidence=0.6)
    # The first approval has to succeed; otherwise the None below would also be
    # produced by an approve that never works, and the test would pass vacuously.
    assert approve_tag_alias(pid) is not None
    assert approve_tag_alias(pid) is None  # second approve — no-op
def test_reject_leaves_memories_untouched(tmp_data_dir):
    """Rejecting a proposal marks it 'rejected' and leaves the original tag in place."""
    import json as _json

    init_db()
    memory = create_memory("knowledge", "x", domain_tags=["fw"])
    proposal_id = create_tag_alias_proposal("fw", "firmware", confidence=0.6)

    assert reject_tag_alias(proposal_id)
    rejected = get_tag_alias_proposals(status="rejected")
    assert len(rejected) == 1

    # The memory still has the original tag.
    with get_connection() as conn:
        row = conn.execute(
            "SELECT domain_tags FROM memories WHERE id = ?", (memory.id,)
        ).fetchone()
    assert "fw" in _json.loads(row["domain_tags"])
# --- Schema sanity ---
def test_tag_aliases_table_exists(tmp_data_dir):
    """After ``init_db`` the ``tag_aliases`` table has every required column."""
    init_db()
    required = {
        "id",
        "alias",
        "canonical",
        "status",
        "confidence",
        "alias_count",
        "canonical_count",
        "reason",
        "applied_to_memories",
        "created_at",
        "resolved_at",
        "resolved_by",
    }
    with get_connection() as conn:
        table_info = conn.execute("PRAGMA table_info(tag_aliases)").fetchall()
        present = {column["name"] for column in table_info}
    # Extra columns are fine; only missing ones fail the check.
    assert required <= present