feat: Phase 7C — tag canonicalization (autonomous, weekly)
An LLM proposes alias→canonical mappings for domain_tags; proposals with
confidence >= 0.8 are auto-applied, and anything below goes to human triage. Protects project identifiers
(p04, p05, p06, atocore, apm, etc.) from ever being canonicalized
since they're their own namespace, not concepts.
Problem solved: tag drift fragments retrieval. "fw" vs "firmware" vs
"firmware-control" all mean the same thing, but cross-cutting queries
that filter by tag only hit one variant. Weekly canonicalization pass
keeps the tag graph clean.
- Schema: tag_aliases table (pending | approved | rejected)
- atocore.memory._tag_canon_prompt (stdlib-only, protected project tokens)
- service: get_tag_distribution, apply_tag_alias (atomic per-memory,
dedupes if both alias + canonical present), create / approve / reject
proposal lifecycle, per-memory audit rows with action="tag_canonicalized"
- scripts/canonicalize_tags.py: host-side detector, autonomous by default,
--no-auto-approve kill switch
- 6 API endpoints under /admin/tags/* (distribution, list, propose,
apply, approve/{id}, reject/{id})
- Step B4 in batch-extract.sh (Sundays only — weekly cadence)
- 26 new tests (prompt parser, normalizer protections, distribution
counting, rewrite atomicity, dedup, audit, lifecycle). 414 → 440.
Design: aggressive protection of project tokens because a false
canonicalization (p04 → p04-gigabit, or vice versa) would scramble
cross-project filtering. Err toward preservation; the alias only
applies if the model is very confident AND both strings appear in
the current distribution.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
296
tests/test_tag_canon.py
Normal file
296
tests/test_tag_canon.py
Normal file
@@ -0,0 +1,296 @@
|
||||
"""Phase 7C — tag canonicalization tests.
|
||||
|
||||
Covers:
|
||||
- prompt parser (fences, prose, empty)
|
||||
- normalizer (identity, protected tokens, empty)
|
||||
- get_tag_distribution counts across active memories
|
||||
- apply_tag_alias rewrites + dedupes + audits
|
||||
- create / approve / reject lifecycle
|
||||
- idempotency (dup proposals skipped)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from atocore.memory._tag_canon_prompt import (
|
||||
PROTECTED_PROJECT_TOKENS,
|
||||
build_user_message,
|
||||
normalize_alias_item,
|
||||
parse_canon_output,
|
||||
)
|
||||
from atocore.memory.service import (
|
||||
apply_tag_alias,
|
||||
approve_tag_alias,
|
||||
create_memory,
|
||||
create_tag_alias_proposal,
|
||||
get_memory_audit,
|
||||
get_tag_alias_proposals,
|
||||
get_tag_distribution,
|
||||
reject_tag_alias,
|
||||
)
|
||||
from atocore.models.database import get_connection, init_db
|
||||
|
||||
|
||||
# --- Prompt parser ---
|
||||
|
||||
|
||||
def test_parse_canon_output_handles_fences():
    """Output wrapped in a Markdown json code fence still yields its one alias item."""
    fenced = "```json\n{\"aliases\": [{\"alias\": \"fw\", \"canonical\": \"firmware\", \"confidence\": 0.9}]}\n```"
    parsed = parse_canon_output(fenced)
    # Check the count before indexing so a wrong length is reported as such.
    assert len(parsed) == 1
    first = parsed[0]
    assert first["alias"] == "fw"
|
||||
|
||||
|
||||
def test_parse_canon_output_handles_prose_prefix():
    """Chatty text ahead of the JSON object does not hide the alias item."""
    chatty = "Here you go:\n{\"aliases\": [{\"alias\": \"ml\", \"canonical\": \"machine-learning\", \"confidence\": 0.9}]}"
    parsed = parse_canon_output(chatty)
    assert len(parsed) == 1
|
||||
|
||||
|
||||
def test_parse_canon_output_empty_list():
    """An explicit empty ``aliases`` array parses to an empty list."""
    payload = "{\"aliases\": []}"
    assert parse_canon_output(payload) == []
|
||||
|
||||
|
||||
def test_parse_canon_output_malformed():
    """Non-JSON text and the empty string both parse to an empty list."""
    for garbage in ("not json at all", ""):
        assert parse_canon_output(garbage) == []
|
||||
|
||||
|
||||
# --- Normalizer ---
|
||||
|
||||
|
||||
def test_normalize_alias_strips_and_lowercases():
    """Alias/canonical are trimmed and lowercased; confidence and reason are kept."""
    raw_item = {"alias": " FW ", "canonical": "Firmware", "confidence": 0.95, "reason": "abbrev"}
    expected = {"alias": "fw", "canonical": "firmware", "confidence": 0.95, "reason": "abbrev"}
    assert normalize_alias_item(raw_item) == expected
|
||||
|
||||
|
||||
def test_normalize_rejects_identity():
    """An item whose alias equals its canonical is dropped (normalizer returns None)."""
    self_mapping = {"alias": "foo", "canonical": "foo", "confidence": 0.9}
    assert normalize_alias_item(self_mapping) is None
|
||||
|
||||
|
||||
def test_normalize_rejects_empty():
    """An empty alias or an empty canonical makes the normalizer return None."""
    blank_alias = {"alias": "", "canonical": "foo", "confidence": 0.9}
    blank_canonical = {"alias": "foo", "canonical": "", "confidence": 0.9}
    assert normalize_alias_item(blank_alias) is None
    assert normalize_alias_item(blank_canonical) is None
|
||||
|
||||
|
||||
def test_normalize_protects_project_tokens():
    """Proposals involving project identifiers are rejected by the normalizer."""
    # Project ids must not be canonicalized — they're their own namespace
    assert "p04" in PROTECTED_PROJECT_TOKENS

    # Each (alias, canonical) pair involves a project identifier, so even at
    # full confidence the normalizer is expected to return None.
    blocked_pairs = (
        ("p04", "p04-gigabit"),
        ("p04-gigabit", "p04"),
        ("apm", "part-manager"),
    )
    for alias, canonical in blocked_pairs:
        proposal = {"alias": alias, "canonical": canonical, "confidence": 1.0}
        assert normalize_alias_item(proposal) is None
|
||||
|
||||
|
||||
def test_normalize_clamps_confidence():
    """Out-of-range confidences are clamped to 1.0 (too high) and 0.0 (too low)."""
    hi = normalize_alias_item({"alias": "a", "canonical": "b", "confidence": 2.5})
    # Guard against None first: if the normalizer ever rejects the item, the
    # test should fail on an assertion rather than a TypeError from subscripting.
    assert hi is not None and hi["confidence"] == 1.0

    lo = normalize_alias_item({"alias": "a", "canonical": "b", "confidence": -0.5})
    assert lo is not None and lo["confidence"] == 0.0
|
||||
|
||||
|
||||
def test_normalize_handles_non_numeric_confidence():
    """A confidence that is not a number keeps the item but scores it 0.0."""
    item = {"alias": "a", "canonical": "b", "confidence": "not a number"}
    normalized = normalize_alias_item(item)
    assert normalized is not None
    assert normalized["confidence"] == 0.0
|
||||
|
||||
|
||||
# --- build_user_message ---
|
||||
|
||||
|
||||
def test_build_user_message_includes_top_tags():
    """The prompt lists tags as ``name: count`` and mentions aliases or JSON."""
    tag_counts = {"firmware": 23, "fw": 5, "optics": 18, "optical": 2}
    prompt = build_user_message(tag_counts)

    for expected_line in ("firmware: 23", "optics: 18"):
        assert expected_line in prompt
    # The message must carry some instruction about the expected output.
    assert "aliases" in prompt.lower() or "JSON" in prompt
|
||||
|
||||
|
||||
def test_build_user_message_empty():
    """An empty distribution produces a message that says it is empty."""
    prompt = build_user_message({})
    assert any(marker in prompt for marker in ("Empty", "empty"))
|
||||
|
||||
|
||||
# --- get_tag_distribution ---
|
||||
|
||||
|
||||
def test_tag_distribution_counts_active_only(tmp_data_dir):
    """Tags on a memory whose status is 'invalid' are excluded from the counts.

    ``tmp_data_dir`` is a fixture defined outside this file; presumably it
    isolates the database per test — confirm in conftest.
    """
    init_db()
    active_seed = (
        ("a", ["firmware", "p06"]),
        ("b", ["firmware"]),
        ("c", ["optics"]),
    )
    for content, tags in active_seed:
        create_memory("knowledge", content, domain_tags=tags)

    # Add an invalid memory — should NOT be counted
    retired = create_memory("knowledge", "d", domain_tags=["firmware", "ignored"])
    with get_connection() as conn:
        conn.execute("UPDATE memories SET status = 'invalid' WHERE id = ?", (retired.id,))

    counts = get_tag_distribution()
    assert counts.get("firmware") == 2  # two active memories
    assert counts.get("optics") == 1
    assert counts.get("p06") == 1
    assert "ignored" not in counts
|
||||
|
||||
|
||||
def test_tag_distribution_min_count_filter(tmp_data_dir):
    """With ``min_count=2`` a tag used twice is kept and a tag used once is dropped."""
    init_db()
    seed = (
        ("a", ["firmware"]),
        ("b", ["firmware"]),
        ("c", ["once"]),
    )
    for content, tags in seed:
        create_memory("knowledge", content, domain_tags=tags)

    counts = get_tag_distribution(min_count=2)
    assert "firmware" in counts
    assert "once" not in counts
|
||||
|
||||
|
||||
# --- apply_tag_alias ---
|
||||
|
||||
|
||||
def test_apply_tag_alias_rewrites_across_memories(tmp_data_dir):
    """Applying fw → firmware rewrites every memory tagged fw and no others."""
    import json as _json

    init_db()
    m1 = create_memory("knowledge", "a", domain_tags=["fw", "p06"])
    m2 = create_memory("knowledge", "b", domain_tags=["fw"])
    m3 = create_memory("knowledge", "c", domain_tags=["optics"])  # untouched

    result = apply_tag_alias("fw", "firmware")
    assert result["memories_touched"] == 2

    # Read the stored tag lists back straight from the table.
    with get_connection() as conn:
        tags1, tags2, tags3 = [
            _json.loads(
                conn.execute(
                    "SELECT domain_tags FROM memories WHERE id = ?", (mem.id,)
                ).fetchone()["domain_tags"]
            )
            for mem in (m1, m2, m3)
        ]

    assert "firmware" in tags1
    assert "fw" not in tags1
    assert "firmware" in tags2
    # The alias must be gone from the second memory too; a rewrite that only
    # appended the canonical tag would otherwise slip through.
    assert "fw" not in tags2
    assert tags3 == ["optics"]  # untouched
|
||||
|
||||
|
||||
def test_apply_tag_alias_dedupes_when_both_present(tmp_data_dir):
    """A memory tagged with both alias and canonical ends up with one canonical tag."""
    import json

    init_db()
    memory = create_memory("knowledge", "dual-tagged", domain_tags=["fw", "firmware", "p06"])

    outcome = apply_tag_alias("fw", "firmware")
    assert outcome["memories_touched"] == 1

    with get_connection() as conn:
        row = conn.execute("SELECT domain_tags FROM memories WHERE id = ?", (memory.id,)).fetchone()
    stored = json.loads(row["domain_tags"])

    assert stored.count("firmware") == 1  # collapsed, not duplicated
    assert "fw" not in stored
    assert "p06" in stored  # unrelated tag survives the rewrite
|
||||
|
||||
|
||||
def test_apply_tag_alias_skips_memories_without_alias(tmp_data_dir):
    """When no memory carries the alias, the rewrite reports zero memories touched."""
    init_db()
    # Only the row's existence matters here, so the return value is not bound.
    create_memory("knowledge", "no match", domain_tags=["optics", "p04"])
    result = apply_tag_alias("fw", "firmware")
    assert result["memories_touched"] == 0
|
||||
|
||||
|
||||
def test_apply_tag_alias_writes_audit(tmp_data_dir):
    """A rewrite leaves a ``tag_canonicalized`` audit row with actor, note and before/after tags."""
    init_db()
    memory = create_memory("knowledge", "audited", domain_tags=["fw"])
    apply_tag_alias("fw", "firmware", actor="auto-tag-canon")

    trail = get_memory_audit(memory.id)
    canon_entries = [row for row in trail if row["action"] == "tag_canonicalized"]
    assert canon_entries

    entry = canon_entries[0]
    assert entry["actor"] == "auto-tag-canon"
    assert "fw → firmware" in entry["note"]
    assert "fw" in entry["before"]["domain_tags"]
    assert "firmware" in entry["after"]["domain_tags"]
|
||||
|
||||
|
||||
def test_apply_tag_alias_rejects_identity(tmp_data_dir):
    """Mapping a tag onto itself raises ValueError."""
    init_db()
    same_tag = "foo"
    with pytest.raises(ValueError):
        apply_tag_alias(same_tag, same_tag)
|
||||
|
||||
|
||||
def test_apply_tag_alias_rejects_empty(tmp_data_dir):
    """An empty alias string raises ValueError."""
    init_db()
    blank_alias = ""
    with pytest.raises(ValueError):
        apply_tag_alias(blank_alias, "firmware")
|
||||
|
||||
|
||||
# --- Proposal lifecycle ---
|
||||
|
||||
|
||||
def test_create_proposal_inserts_pending(tmp_data_dir):
    """A new proposal gets an id and is listed under status 'pending' with its fields."""
    init_db()
    proposal_id = create_tag_alias_proposal(
        "fw",
        "firmware",
        confidence=0.65,
        alias_count=5,
        canonical_count=23,
        reason="standard abbreviation",
    )
    assert proposal_id is not None

    pending = get_tag_alias_proposals(status="pending")
    assert len(pending) == 1
    only = pending[0]
    assert only["alias"] == "fw"
    # Compare approximately: the confidence is a float read back from storage.
    assert only["confidence"] == pytest.approx(0.65)
|
||||
|
||||
|
||||
def test_create_proposal_idempotent(tmp_data_dir):
    """Re-proposing the same alias/canonical pair returns None instead of a new id."""
    init_db()
    original_id = create_tag_alias_proposal("fw", "firmware", confidence=0.6)
    # Same pair with a different confidence is still treated as a duplicate.
    duplicate_id = create_tag_alias_proposal("fw", "firmware", confidence=0.7)
    assert original_id is not None
    assert duplicate_id is None
|
||||
|
||||
|
||||
def test_approve_applies_rewrite(tmp_data_dir):
    """Approving a pending proposal rewrites the tag and records how many memories changed."""
    import json as _json

    init_db()
    m = create_memory("knowledge", "x", domain_tags=["fw"])
    pid = create_tag_alias_proposal("fw", "firmware", confidence=0.7)
    result = approve_tag_alias(pid, actor="human-triage")
    assert result is not None
    assert result["memories_touched"] == 1

    # Proposal now approved with applied_to_memories recorded
    rows = get_tag_alias_proposals(status="approved")
    assert len(rows) == 1
    assert rows[0]["applied_to_memories"] == 1

    # Memory actually rewritten: the canonical tag is present AND the alias is
    # gone, so an append-only "rewrite" cannot pass.
    with get_connection() as conn:
        r = conn.execute("SELECT domain_tags FROM memories WHERE id = ?", (m.id,)).fetchone()
    tags = _json.loads(r["domain_tags"])
    assert "firmware" in tags
    assert "fw" not in tags
|
||||
|
||||
|
||||
def test_approve_already_resolved_returns_none(tmp_data_dir):
    """Approving a proposal a second time returns None."""
    init_db()
    proposal_id = create_tag_alias_proposal("a", "b", confidence=0.6)
    approve_tag_alias(proposal_id)  # first approval resolves the proposal
    repeat = approve_tag_alias(proposal_id)
    assert repeat is None  # second approve — no-op
|
||||
|
||||
|
||||
def test_reject_leaves_memories_untouched(tmp_data_dir):
    """Rejecting a proposal marks it 'rejected' and does not rewrite the memory's tags."""
    import json

    init_db()
    memory = create_memory("knowledge", "x", domain_tags=["fw"])
    proposal_id = create_tag_alias_proposal("fw", "firmware", confidence=0.6)
    assert reject_tag_alias(proposal_id)

    rejected = get_tag_alias_proposals(status="rejected")
    assert len(rejected) == 1

    # Memory still has the original tag
    with get_connection() as conn:
        row = conn.execute("SELECT domain_tags FROM memories WHERE id = ?", (memory.id,)).fetchone()
    stored = json.loads(row["domain_tags"])
    assert "fw" in stored
|
||||
|
||||
|
||||
# --- Schema sanity ---
|
||||
|
||||
|
||||
def test_tag_aliases_table_exists(tmp_data_dir):
    """After ``init_db`` the ``tag_aliases`` table has every column the service relies on."""
    init_db()
    with get_connection() as conn:
        table_info = conn.execute("PRAGMA table_info(tag_aliases)").fetchall()
    actual_columns = {column["name"] for column in table_info}

    required_columns = {
        "id",
        "alias",
        "canonical",
        "status",
        "confidence",
        "alias_count",
        "canonical_count",
        "reason",
        "applied_to_memories",
        "created_at",
        "resolved_at",
        "resolved_by",
    }
    # Subset, not equality: extra columns are allowed.
    assert required_columns <= actual_columns
|
||||
Reference in New Issue
Block a user