feat: Phase 7A.1 — autonomous merge tiering (sonnet → opus → human)

Dedup detector now merges high-confidence duplicates silently instead of piling every proposal into a human triage queue. Matches the 3-tier escalation pattern that auto_triage already uses.

Tiering decision per cluster:

  TIER-1 auto-approve: sonnet confidence >= 0.8 AND min_pairwise_sim >= 0.92 AND all sources share project+type → auto-merge silently (actor="auto-dedup-tier1" in audit log)

  TIER-2 escalation: sonnet 0.5-0.8 conf OR sim 0.85-0.92 → opus second opinion. Opus confirms with conf >= 0.8 → auto-merge (actor="auto-dedup-tier2"). Opus overrides (reject) → skip silently. Opus low conf → human triage with opus's refined draft.

  HUMAN triage: Only the genuinely ambiguous land in /admin/triage.

Env-tunable thresholds:

  ATOCORE_DEDUP_AUTO_APPROVE_CONF (0.8)
  ATOCORE_DEDUP_AUTO_APPROVE_SIM (0.92)
  ATOCORE_DEDUP_TIER2_MIN_CONF (0.5)
  ATOCORE_DEDUP_TIER2_MIN_SIM (0.85)
  ATOCORE_DEDUP_TIER2_MODEL (opus)

New flag --no-auto-approve for kill-switch testing (everything → human queue).

Tests: +6 (tier-2 prompt content, same_bucket edges, min_pairwise_similarity on identical + transitive clusters). 395 → 401.

Rationale: user asked for autonomous behavior — "this needs to be intelligent, I don't want to manually triage stuff". Matches the consolidation principle: never discard details, but let the brain tidy up on its own for the easy cases.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 15:46:26 -04:00
parent 028d4c3594
commit 56d5df0ab4
3 changed files with 374 additions and 29 deletions
--- a/tests/test_memory_dedup.py
+++ b/tests/test_memory_dedup.py
@@ -15,6 +15,8 @@ from __future__ import annotations
 import pytest

 from atocore.memory._dedup_prompt import (
+    TIER2_SYSTEM_PROMPT,
+    build_tier2_user_message,
    normalize_merge_verdict,
    parse_merge_verdict,
 )
@@ -119,6 +121,120 @@ def test_normalize_merge_verdict_rejects_unknown_action():
    assert normalize_merge_verdict({"action": "?", "content": "x"}) is None


+# --- Tier-2 (Phase 7A.1) ---
+
+
+def test_tier2_prompt_is_stricter():
+    # The tier-2 system prompt must explicitly instruct the model to be
+    # stricter than tier-1 — that's the whole point of escalation.
+    assert "STRICTER" in TIER2_SYSTEM_PROMPT
+    assert "REJECT" in TIER2_SYSTEM_PROMPT
+
+
+def test_build_tier2_user_message_includes_tier1_draft():
+    sources = [{
+        "id": "abc12345", "content": "source text A",
+        "memory_type": "knowledge", "project": "p04",
+        "domain_tags": ["optics"], "confidence": 0.6,
+        "valid_until": "", "reference_count": 2,
+    }, {
+        "id": "def67890", "content": "source text B",
+        "memory_type": "knowledge", "project": "p04",
+        "domain_tags": ["optics"], "confidence": 0.7,
+        "valid_until": "", "reference_count": 1,
+    }]
+    tier1 = {
+        "action": "merge",
+        "content": "unified draft by tier1",
+        "memory_type": "knowledge",
+        "project": "p04",
+        "domain_tags": ["optics"],
+        "confidence": 0.65,
+        "reason": "near-paraphrase",
+    }
+    msg = build_tier2_user_message(sources, tier1)
+    assert "source text A" in msg
+    assert "source text B" in msg
+    assert "TIER-1 DRAFT" in msg
+    assert "unified draft by tier1" in msg
+    assert "near-paraphrase" in msg
+    # Should end asking for a verdict
+    assert "verdict" in msg.lower()
+
+
+# --- Tiering helpers (min_pairwise_similarity, same_bucket) ---
+
+
+def test_same_bucket_true_for_matching():
+    import importlib.util
+    spec = importlib.util.spec_from_file_location(
+        "memory_dedup_for_test",
+        "scripts/memory_dedup.py",
+    )
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+
+    sources = [
+        {"memory_type": "knowledge", "project": "p04"},
+        {"memory_type": "knowledge", "project": "p04"},
+    ]
+    assert mod.same_bucket(sources) is True
+
+
+def test_same_bucket_false_for_mixed():
+    import importlib.util
+    spec = importlib.util.spec_from_file_location(
+        "memory_dedup_for_test",
+        "scripts/memory_dedup.py",
+    )
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+
+    # Different project
+    assert mod.same_bucket([
+        {"memory_type": "knowledge", "project": "p04"},
+        {"memory_type": "knowledge", "project": "p05"},
+    ]) is False
+    # Different memory_type
+    assert mod.same_bucket([
+        {"memory_type": "knowledge", "project": "p04"},
+        {"memory_type": "project", "project": "p04"},
+    ]) is False
+
+
+def test_min_pairwise_similarity_identical_texts():
+    import importlib.util
+    spec = importlib.util.spec_from_file_location(
+        "memory_dedup_for_test",
+        "scripts/memory_dedup.py",
+    )
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+
+    # Three identical texts — min should be ~1.0
+    ms = mod.min_pairwise_similarity(["hello world"] * 3)
+    assert 0.99 <= ms <= 1.0
+
+
+def test_min_pairwise_similarity_mixed_cluster():
+    """Transitive cluster A~B~C with A and C actually quite different
+    should expose a low min even though A~B and B~C are high."""
+    import importlib.util
+    spec = importlib.util.spec_from_file_location(
+        "memory_dedup_for_test",
+        "scripts/memory_dedup.py",
+    )
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+
+    ms = mod.min_pairwise_similarity([
+        "Antoine prefers OAuth over API keys",
+        "Antoine's OAuth preference",
+        "USB SSD mandatory for polisher firmware",
+    ])
+    assert ms < 0.6  # Third is unrelated; min is far below threshold
+
+
 # --- create_merge_candidate idempotency ---