feat: Phase 7A.1 — autonomous merge tiering (sonnet → opus → human)

Dedup detector now merges high-confidence duplicates silently instead
of piling every proposal into a human triage queue. Matches the 3-tier
escalation pattern that auto_triage already uses.

Tiering decision per cluster:
  TIER-1 auto-approve: sonnet confidence >= 0.8 AND min_pairwise_sim >= 0.92
                       AND all sources share project+type → auto-merge silently
                       (actor="auto-dedup-tier1" in audit log)
  TIER-2 escalation:   sonnet 0.5-0.8 conf OR sim 0.85-0.92 → opus second opinion.
                       Opus confirms with conf >= 0.8 → auto-merge (actor="auto-dedup-tier2").
                       Opus overrides (reject) → skip silently.
                       Opus low conf → human triage with opus's refined draft.
  HUMAN triage:        Only the genuinely ambiguous land in /admin/triage.

Env-tunable thresholds:
  ATOCORE_DEDUP_AUTO_APPROVE_CONF (0.8)
  ATOCORE_DEDUP_AUTO_APPROVE_SIM (0.92)
  ATOCORE_DEDUP_TIER2_MIN_CONF (0.5)
  ATOCORE_DEDUP_TIER2_MIN_SIM (0.85)
  ATOCORE_DEDUP_TIER2_MODEL (opus)

New flag --no-auto-approve for kill-switch testing (everything → human queue).

Tests: +6 (tier-2 prompt content, same_bucket edges, min_pairwise_similarity
on identical + transitive clusters). 395 → 401.

Rationale: user asked for autonomous behavior — "this needs to be intelligent,
I don't want to manually triage stuff". Matches the consolidation principle:
never discard details, but let the brain tidy up on its own for the easy cases.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-18 15:46:26 -04:00
parent 028d4c3594
commit 56d5df0ab4
3 changed files with 374 additions and 29 deletions

View File

@@ -15,6 +15,8 @@ from __future__ import annotations
import pytest
from atocore.memory._dedup_prompt import (
TIER2_SYSTEM_PROMPT,
build_tier2_user_message,
normalize_merge_verdict,
parse_merge_verdict,
)
@@ -119,6 +121,120 @@ def test_normalize_merge_verdict_rejects_unknown_action():
assert normalize_merge_verdict({"action": "?", "content": "x"}) is None
# --- Tier-2 (Phase 7A.1) ---
def test_tier2_prompt_is_stricter():
    """Check that the tier-2 system prompt contains its strictness markers."""
    # Escalating to a second opinion is only worthwhile if that prompt
    # explicitly asks for a stricter review than tier-1, so the test pins
    # the two literal marker words in the prompt text.
    for marker in ("STRICTER", "REJECT"):
        assert marker in TIER2_SYSTEM_PROMPT
def test_build_tier2_user_message_includes_tier1_draft():
    """Check that the tier-2 user message carries the sources and tier-1 draft."""
    first_source = {
        "id": "abc12345",
        "content": "source text A",
        "memory_type": "knowledge",
        "project": "p04",
        "domain_tags": ["optics"],
        "confidence": 0.6,
        "valid_until": "",
        "reference_count": 2,
    }
    second_source = {
        "id": "def67890",
        "content": "source text B",
        "memory_type": "knowledge",
        "project": "p04",
        "domain_tags": ["optics"],
        "confidence": 0.7,
        "valid_until": "",
        "reference_count": 1,
    }
    tier1_verdict = {
        "action": "merge",
        "content": "unified draft by tier1",
        "memory_type": "knowledge",
        "project": "p04",
        "domain_tags": ["optics"],
        "confidence": 0.65,
        "reason": "near-paraphrase",
    }

    message = build_tier2_user_message(
        [first_source, second_source],
        tier1_verdict,
    )

    # Both source bodies, the tier-1 section header, and the tier-1 draft
    # text and reason must all be present in the rendered message.
    expected_fragments = (
        "source text A",
        "source text B",
        "TIER-1 DRAFT",
        "unified draft by tier1",
        "near-paraphrase",
    )
    for fragment in expected_fragments:
        assert fragment in message

    # The message should ask for a verdict; this only checks that the word
    # appears somewhere (case-insensitively), not where it appears.
    assert "verdict" in message.lower()
# --- Tiering helpers (min_pairwise_similarity, same_bucket) ---
def test_same_bucket_true_for_matching():
    """same_bucket must be True when every source has the same type and project."""
    from importlib.util import module_from_spec, spec_from_file_location

    # The dedup script is loaded straight from its file path rather than
    # imported as a package module.
    # NOTE(review): the path is relative, so this presumably relies on
    # pytest being launched from the repository root — confirm.
    script_spec = spec_from_file_location(
        "memory_dedup_for_test",
        "scripts/memory_dedup.py",
    )
    dedup = module_from_spec(script_spec)
    script_spec.loader.exec_module(dedup)

    template = {"memory_type": "knowledge", "project": "p04"}
    matching_sources = [dict(template), dict(template)]
    assert dedup.same_bucket(matching_sources) is True
def test_same_bucket_false_for_mixed():
    """same_bucket must be False when sources differ in project or memory type."""
    from importlib.util import module_from_spec, spec_from_file_location

    # NOTE(review): relative path — presumably assumes the working
    # directory is the repository root; confirm.
    script_spec = spec_from_file_location(
        "memory_dedup_for_test",
        "scripts/memory_dedup.py",
    )
    dedup = module_from_spec(script_spec)
    script_spec.loader.exec_module(dedup)

    mixed_cases = (
        # Same memory_type, different project.
        [
            {"memory_type": "knowledge", "project": "p04"},
            {"memory_type": "knowledge", "project": "p05"},
        ],
        # Same project, different memory_type.
        [
            {"memory_type": "knowledge", "project": "p04"},
            {"memory_type": "project", "project": "p04"},
        ],
    )
    for sources in mixed_cases:
        assert dedup.same_bucket(sources) is False
def test_min_pairwise_similarity_identical_texts():
    """min_pairwise_similarity of identical texts must be approximately 1.0."""
    import importlib.util

    # The dedup script is loaded straight from its file path rather than
    # imported as a package module.
    # NOTE(review): relative path — presumably assumes the working
    # directory is the repository root; confirm.
    spec = importlib.util.spec_from_file_location(
        "memory_dedup_for_test",
        "scripts/memory_dedup.py",
    )
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)

    # Three identical texts — the minimum pairwise similarity should be ~1.0.
    ms = mod.min_pairwise_similarity(["hello world"] * 3)

    # Compare with a tolerance instead of a hard `<= 1.0` upper bound:
    # floating-point rounding can put the similarity of identical inputs a
    # hair above 1.0, which would fail an exact bound spuriously.
    assert ms == pytest.approx(1.0, abs=0.01)
def test_min_pairwise_similarity_mixed_cluster():
    """A cluster containing one unrelated text must yield a low minimum.

    Two of the texts are about the same OAuth preference and the third is
    about something else entirely, so the minimum over all pairs is
    expected to fall well below the merge thresholds even if the closest
    pair scores high.
    """
    from importlib.util import module_from_spec, spec_from_file_location

    # NOTE(review): relative path — presumably assumes the working
    # directory is the repository root; confirm.
    script_spec = spec_from_file_location(
        "memory_dedup_for_test",
        "scripts/memory_dedup.py",
    )
    dedup = module_from_spec(script_spec)
    script_spec.loader.exec_module(dedup)

    cluster_texts = [
        "Antoine prefers OAuth over API keys",
        "Antoine's OAuth preference",
        "USB SSD mandatory for polisher firmware",
    ]
    lowest = dedup.min_pairwise_similarity(cluster_texts)

    # The third text is unrelated to the other two, so the minimum must be
    # far below the auto-merge similarity thresholds.
    assert lowest < 0.6
# --- create_merge_candidate idempotency ---