Dedup detector now merges high-confidence duplicates silently instead
of piling every proposal into a human triage queue. Matches the 3-tier
escalation pattern that auto_triage already uses.
Tiering decision per cluster:
TIER-1 auto-approve: sonnet confidence >= 0.8 AND min_pairwise_sim >= 0.92
AND all sources share project+type → auto-merge silently
(actor="auto-dedup-tier1" in audit log)
TIER-2 escalation: sonnet 0.5-0.8 conf OR sim 0.85-0.92 → opus second opinion.
Opus confirms with conf >= 0.8 → auto-merge (actor="auto-dedup-tier2").
Opus overrides (reject) → skip silently.
Opus low conf → human triage with opus's refined draft.
HUMAN triage: Only the genuinely ambiguous land in /admin/triage.
Env-tunable thresholds:
ATOCORE_DEDUP_AUTO_APPROVE_CONF (0.8)
ATOCORE_DEDUP_AUTO_APPROVE_SIM (0.92)
ATOCORE_DEDUP_TIER2_MIN_CONF (0.5)
ATOCORE_DEDUP_TIER2_MIN_SIM (0.85)
ATOCORE_DEDUP_TIER2_MODEL (opus)
New flag --no-auto-approve for kill-switch testing (everything → human queue).
Tests: +6 (tier-2 prompt content, same_bucket edges, min_pairwise_similarity
on identical + transitive clusters). 395 → 401.
Rationale: user asked for autonomous behavior — "this needs to be intelligent,
I don't want to manually triage stuff". Matches the consolidation principle:
never discard details, but let the brain tidy up on its own for the easy cases.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
464 lines · 18 KiB · Python
#!/usr/bin/env python3
|
|
"""Phase 7A — semantic memory dedup detector.
|
|
|
|
Finds clusters of near-duplicate active memories and writes merge-
|
|
candidate proposals for human review in the triage UI.
|
|
|
|
Algorithm:
|
|
1. Fetch active memories via HTTP
|
|
2. Group by (project, memory_type) — cross-bucket merges are deferred
|
|
to Phase 7B contradiction flow
|
|
3. Within each group, embed contents via atocore.retrieval.embeddings
|
|
4. Greedy transitive cluster at similarity >= threshold
|
|
5. For each cluster of size >= 2, ask claude-p to draft unified content
|
|
6. POST the proposal to /admin/memory/merge-candidates/create (server-
|
|
side dedupes by the sorted memory-id set, so re-runs don't double-
|
|
create)
|
|
|
|
Host-side because claude CLI lives on Dalidou, not the container. Reuses
|
|
the same PYTHONPATH=src pattern as scripts/graduate_memories.py for
|
|
atocore imports (embeddings, similarity, prompt module).
|
|
|
|
Usage:
|
|
python3 scripts/memory_dedup.py --base-url http://127.0.0.1:8100 \\
|
|
--similarity-threshold 0.88 --max-batch 50
|
|
|
|
Threshold conventions (see Phase 7 doc):
|
|
0.88 interactive / default — balanced precision/recall
|
|
0.90 nightly cron — tight, only near-duplicates
|
|
0.85 weekly cron — deeper cleanup
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
import json
import os
import shutil
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.parse
import urllib.request
from collections import defaultdict
from typing import Any
|
|
|
|
# Make src/ importable — same pattern as scripts/graduate_memories.py.
# Must run BEFORE the atocore imports below; the script is executed from
# the host, so src/ is not on sys.path by default.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_SRC_DIR = os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "src"))
if _SRC_DIR not in sys.path:
    sys.path.insert(0, _SRC_DIR)
|
|
|
|
from atocore.memory._dedup_prompt import ( # noqa: E402
|
|
DEDUP_PROMPT_VERSION,
|
|
SYSTEM_PROMPT,
|
|
TIER2_SYSTEM_PROMPT,
|
|
build_tier2_user_message,
|
|
build_user_message,
|
|
normalize_merge_verdict,
|
|
parse_merge_verdict,
|
|
)
|
|
from atocore.memory.similarity import cluster_by_threshold # noqa: E402
|
|
|
|
# Connection / model defaults, all overridable via environment.
DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://127.0.0.1:8100")
DEFAULT_MODEL = os.environ.get("ATOCORE_DEDUP_MODEL", "sonnet")
DEFAULT_TIER2_MODEL = os.environ.get("ATOCORE_DEDUP_TIER2_MODEL", "opus")
# Per-attempt timeout for each claude CLI invocation, in seconds.
DEFAULT_TIMEOUT_S = float(os.environ.get("ATOCORE_DEDUP_TIMEOUT_S", "60"))

# Phase 7A.1 — auto-merge tiering thresholds.
# TIER-1 auto-approve: if sonnet confidence >= this AND min pairwise
# similarity >= AUTO_APPROVE_SIM AND all sources share project+type → merge silently.
AUTO_APPROVE_CONF = float(os.environ.get("ATOCORE_DEDUP_AUTO_APPROVE_CONF", "0.8"))
AUTO_APPROVE_SIM = float(os.environ.get("ATOCORE_DEDUP_AUTO_APPROVE_SIM", "0.92"))
# TIER-2 escalation band: sonnet uncertain but pair is similar enough to be
# worth opus time. Below either bound, the cluster goes straight to human triage.
TIER2_MIN_CONF = float(os.environ.get("ATOCORE_DEDUP_TIER2_MIN_CONF", "0.5"))
TIER2_MIN_SIM = float(os.environ.get("ATOCORE_DEDUP_TIER2_MIN_SIM", "0.85"))
|
|
|
|
# Lazily-created scratch directory shared by all claude CLI subprocess calls.
_sandbox_cwd = None


def get_sandbox_cwd() -> str:
    """Return a process-wide scratch directory for claude CLI invocations.

    Created on first use (prefix ``ato-dedup-``) and reused for the rest
    of the run so the CLI never executes inside the repository tree.
    """
    global _sandbox_cwd
    if _sandbox_cwd is not None:
        return _sandbox_cwd
    _sandbox_cwd = tempfile.mkdtemp(prefix="ato-dedup-")
    return _sandbox_cwd
|
|
|
|
|
|
def api_get(base_url: str, path: str) -> dict:
    """GET ``base_url + path`` and return the decoded JSON response body."""
    request = urllib.request.Request(f"{base_url}{path}")
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read().decode("utf-8")
    return json.loads(payload)
|
|
|
|
|
|
def api_post(base_url: str, path: str, body: dict | None = None) -> dict:
    """POST *body* as JSON to ``base_url + path``; return the decoded JSON reply.

    A ``None`` body is sent as an empty JSON object.
    """
    payload = json.dumps(body if body is not None else {}).encode("utf-8")
    request = urllib.request.Request(
        f"{base_url}{path}",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        return json.loads(response.read().decode("utf-8"))
|
|
|
|
|
|
def call_claude(system_prompt: str, user_message: str, model: str, timeout_s: float) -> tuple[str | None, str | None]:
    """Shared CLI caller with retry + stderr capture (mirrors auto_triage).

    Returns ``(stdout, None)`` on success, or ``(None, error_text)`` after
    three failed attempts with exponential backoff (2s, then 4s).
    """
    if shutil.which("claude") is None:
        return None, "claude CLI not available"
    command = [
        "claude", "-p",
        "--model", model,
        "--append-system-prompt", system_prompt,
        "--disable-slash-commands",
        user_message,
    ]
    last_error = ""
    for attempt in range(3):
        if attempt:
            time.sleep(2 ** attempt)  # back off before each retry
        try:
            proc = subprocess.run(
                command,
                capture_output=True,
                text=True,
                timeout=timeout_s,
                cwd=get_sandbox_cwd(),
                encoding="utf-8",
                errors="replace",
            )
        except subprocess.TimeoutExpired:
            last_error = f"{model} timed out"
            continue
        except Exception as exc:
            last_error = f"subprocess error: {exc}"
            continue
        if proc.returncode == 0:
            return (proc.stdout or "").strip(), None
        # Non-zero exit: keep a short stderr snippet for the caller's log.
        stderr = (proc.stderr or "").strip()[:200]
        if stderr:
            last_error = f"{model} exit {proc.returncode}: {stderr}"
        else:
            last_error = f"{model} exit {proc.returncode}"
    return None, last_error
|
|
|
|
|
|
def fetch_active_memories(base_url: str, project: str | None) -> list[dict]:
|
|
# The /memory endpoint with active_only=true returns active memories.
|
|
# Graduated memories are exempt from dedup — they're frozen pointers
|
|
# to entities. Filter them out on the client side.
|
|
params = "active_only=true&limit=2000"
|
|
if project:
|
|
params += f"&project={urllib.request.quote(project)}"
|
|
try:
|
|
result = api_get(base_url, f"/memory?{params}")
|
|
except Exception as e:
|
|
print(f"ERROR: could not fetch memories: {e}", file=sys.stderr)
|
|
return []
|
|
mems = result.get("memories", [])
|
|
return [m for m in mems if (m.get("status") or "active") == "active"]
|
|
|
|
|
|
def group_memories(mems: list[dict]) -> dict[tuple[str, str], list[dict]]:
|
|
"""Bucket by (project, memory_type). Empty project is its own bucket."""
|
|
buckets: dict[tuple[str, str], list[dict]] = defaultdict(list)
|
|
for m in mems:
|
|
key = ((m.get("project") or "").strip().lower(), (m.get("memory_type") or "").strip().lower())
|
|
buckets[key].append(m)
|
|
return buckets
|
|
|
|
|
|
def draft_merge(sources: list[dict], model: str, timeout_s: float) -> dict[str, Any] | None:
    """Tier-1 draft: cheap sonnet call proposes the unified content.

    Returns the normalized verdict dict, or None when the CLI call fails
    or the verdict cannot be parsed (both paths log a WARN to stderr).
    """
    raw, err = call_claude(SYSTEM_PROMPT, build_user_message(sources), model, timeout_s)
    if err:
        print(f" WARN: claude tier-1 failed: {err}", file=sys.stderr)
        return None
    verdict = parse_merge_verdict(raw or "")
    if verdict is None:
        print(f" WARN: could not parse tier-1 verdict: {(raw or '')[:200]}", file=sys.stderr)
        return None
    return normalize_merge_verdict(verdict)
|
|
|
|
|
|
def tier2_review(
    sources: list[dict],
    tier1_verdict: dict[str, Any],
    model: str,
    timeout_s: float,
) -> dict[str, Any] | None:
    """Tier-2 second opinion: opus confirms or overrides the tier-1 draft.

    Returns the normalized opus verdict, or None when the CLI call fails
    or its output cannot be parsed (both paths log a WARN to stderr).
    """
    message = build_tier2_user_message(sources, tier1_verdict)
    raw, err = call_claude(TIER2_SYSTEM_PROMPT, message, model, timeout_s)
    if err:
        print(f" WARN: claude tier-2 failed: {err}", file=sys.stderr)
        return None
    verdict = parse_merge_verdict(raw or "")
    if verdict is None:
        print(f" WARN: could not parse tier-2 verdict: {(raw or '')[:200]}", file=sys.stderr)
        return None
    return normalize_merge_verdict(verdict)
|
|
|
|
|
|
def min_pairwise_similarity(texts: list[str]) -> float:
    """Return the minimum pairwise cosine similarity across N texts.

    Sanity check for transitive clusters: A~B and B~C do not guarantee
    A~C, so the auto-approve threshold is held against the WEAKEST pair,
    not the strongest. With N=2 this is just the single pairwise
    similarity; fewer than 2 texts yields 0.0.
    """
    n = len(texts)
    if n < 2:
        return 0.0
    # Reuse similarity_matrix rather than computing it ourselves.
    from atocore.memory.similarity import similarity_matrix

    matrix = similarity_matrix(texts)
    return min(matrix[i][j] for i in range(n) for j in range(i + 1, n))
|
|
|
|
|
|
def submit_candidate(
|
|
base_url: str,
|
|
memory_ids: list[str],
|
|
similarity: float,
|
|
verdict: dict[str, Any],
|
|
dry_run: bool,
|
|
) -> str | None:
|
|
body = {
|
|
"memory_ids": memory_ids,
|
|
"similarity": similarity,
|
|
"proposed_content": verdict["content"],
|
|
"proposed_memory_type": verdict["memory_type"],
|
|
"proposed_project": verdict["project"],
|
|
"proposed_tags": verdict["domain_tags"],
|
|
"proposed_confidence": verdict["confidence"],
|
|
"reason": verdict["reason"],
|
|
}
|
|
if dry_run:
|
|
print(f" [dry-run] would POST: {json.dumps(body)[:200]}...")
|
|
return "dry-run"
|
|
try:
|
|
result = api_post(base_url, "/admin/memory/merge-candidates/create", body)
|
|
return result.get("candidate_id")
|
|
except urllib.error.HTTPError as e:
|
|
print(f" ERROR: submit failed: {e.code} {e.read().decode()[:200]}", file=sys.stderr)
|
|
return None
|
|
except Exception as e:
|
|
print(f" ERROR: submit failed: {e}", file=sys.stderr)
|
|
return None
|
|
|
|
|
|
def auto_approve(base_url: str, candidate_id: str, actor: str, dry_run: bool) -> str | None:
|
|
"""POST /admin/memory/merge-candidates/{id}/approve. Returns result_memory_id."""
|
|
if dry_run:
|
|
return "dry-run"
|
|
try:
|
|
result = api_post(
|
|
base_url,
|
|
f"/admin/memory/merge-candidates/{candidate_id}/approve",
|
|
{"actor": actor},
|
|
)
|
|
return result.get("result_memory_id")
|
|
except Exception as e:
|
|
print(f" ERROR: auto-approve failed: {e}", file=sys.stderr)
|
|
return None
|
|
|
|
|
|
def same_bucket(sources: list[dict]) -> bool:
    """True when all sources share the same normalized (project, memory_type).

    Normalization strips whitespace and lower-cases both parts; an empty
    list of sources is never considered a single bucket.
    """
    if not sources:
        return False
    keys = {
        (
            (s.get("project") or "").strip().lower(),
            (s.get("memory_type") or "").strip().lower(),
        )
        for s in sources
    }
    return len(keys) == 1
|
|
|
|
|
|
def main() -> None:
    """CLI entry point: find duplicate clusters and route each through the
    3-tier decision (tier-1 auto-merge / tier-2 opus review / human triage).
    """
    parser = argparse.ArgumentParser(description="Phase 7A semantic dedup detector (tiered)")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--project", default="", help="Only scan this project (empty = all)")
    parser.add_argument("--similarity-threshold", type=float, default=0.88)
    parser.add_argument("--max-batch", type=int, default=50,
                        help="Max clusters to process per run")
    parser.add_argument("--model", default=DEFAULT_MODEL, help="Tier-1 model (default: sonnet)")
    parser.add_argument("--tier2-model", default=DEFAULT_TIER2_MODEL, help="Tier-2 model (default: opus)")
    parser.add_argument("--timeout-s", type=float, default=DEFAULT_TIMEOUT_S)
    parser.add_argument("--no-auto-approve", action="store_true",
                        help="Disable autonomous merging; all merges land in human triage queue")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    base = args.base_url.rstrip("/")
    # Kill switch: --no-auto-approve forces every cluster to human triage.
    autonomous = not args.no_auto_approve

    print(
        f"memory_dedup {DEDUP_PROMPT_VERSION} | threshold={args.similarity_threshold} | "
        f"tier1={args.model} tier2={args.tier2_model} | "
        f"autonomous={autonomous} | "
        f"auto-approve: conf>={AUTO_APPROVE_CONF} sim>={AUTO_APPROVE_SIM}"
    )
    mems = fetch_active_memories(base, args.project or None)
    print(f"fetched {len(mems)} active memories")
    if not mems:
        return

    buckets = group_memories(mems)
    print(f"grouped into {len(buckets)} (project, memory_type) buckets")

    # Run counters — reported in the final summary line.
    clusters_found = 0
    auto_merged_tier1 = 0
    auto_merged_tier2 = 0
    human_candidates = 0
    tier1_rejections = 0
    tier2_overrides = 0  # opus disagreed with sonnet
    skipped_low_sim = 0
    skipped_existing = 0
    processed = 0  # counts tier-1 LLM calls, budgeted by --max-batch

    for (proj, mtype), group in sorted(buckets.items()):
        if len(group) < 2:
            continue
        if processed >= args.max_batch:
            print(f"reached max-batch={args.max_batch}, stopping")
            break

        texts = [(m.get("content") or "") for m in group]
        # Greedy transitive clustering within the bucket; singletons dropped.
        clusters = cluster_by_threshold(texts, args.similarity_threshold)
        clusters = [c for c in clusters if len(c) >= 2]
        if not clusters:
            continue

        print(f"\n[{proj or '(global)'}/{mtype}] {len(group)} mems → {len(clusters)} cluster(s)")
        for cluster in clusters:
            if processed >= args.max_batch:
                break
            clusters_found += 1
            sources = [group[i] for i in cluster]
            ids = [s["id"] for s in sources]
            cluster_texts = [texts[i] for i in cluster]
            # Weakest pair in the cluster — transitive chains can hide
            # dissimilar endpoints, so this is the similarity that counts.
            min_sim = min_pairwise_similarity(cluster_texts)
            print(f" cluster of {len(cluster)} (min_sim={min_sim:.3f}): {[s['id'][:8] for s in sources]}")

            # Tier-1 draft
            tier1 = draft_merge(sources, args.model, args.timeout_s)
            processed += 1
            if tier1 is None:
                # CLI failure or unparseable verdict — already logged.
                continue
            if tier1["action"] == "reject":
                tier1_rejections += 1
                print(f" TIER-1 rejected: {tier1['reason'][:100]}")
                continue

            # --- Tiering decision ---
            bucket_ok = same_bucket(sources)
            tier1_ok = (
                tier1["confidence"] >= AUTO_APPROVE_CONF
                and min_sim >= AUTO_APPROVE_SIM
                and bucket_ok
            )

            if autonomous and tier1_ok:
                # TIER-1 auto-merge: create the candidate then approve it
                # in one pass (actor recorded in the audit log).
                cid = submit_candidate(base, ids, min_sim, tier1, args.dry_run)
                if cid == "dry-run":
                    auto_merged_tier1 += 1
                    print(" [dry-run] would auto-merge (tier-1)")
                elif cid:
                    new_id = auto_approve(base, cid, actor="auto-dedup-tier1", dry_run=args.dry_run)
                    if new_id:
                        auto_merged_tier1 += 1
                        print(f" ✅ auto-merged (tier-1) → {str(new_id)[:8]}")
                    else:
                        # Approve failed: leave the candidate for humans.
                        print(f" ⚠️ tier-1 approve failed; candidate {cid[:8]} left pending")
                        human_candidates += 1
                else:
                    # submit returned None — server-side dedupe or error.
                    skipped_existing += 1
                time.sleep(0.3)
                continue

            # Not tier-1 auto-approve. Decide if it's worth tier-2 escalation.
            tier2_eligible = (
                autonomous
                and min_sim >= TIER2_MIN_SIM
                and tier1["confidence"] >= TIER2_MIN_CONF
                and bucket_ok
            )

            if tier2_eligible:
                print(" → escalating to tier-2 (opus)…")
                tier2 = tier2_review(sources, tier1, args.tier2_model, args.timeout_s)
                if tier2 is None:
                    # Opus errored. Fall back to human triage.
                    cid = submit_candidate(base, ids, min_sim, tier1, args.dry_run)
                    if cid and cid != "dry-run":
                        human_candidates += 1
                        print(f" → candidate {cid[:8]} (tier-2 errored, human review)")
                    time.sleep(0.5)
                    continue

                if tier2["action"] == "reject":
                    # Opus overrides sonnet's merge — skip silently.
                    tier2_overrides += 1
                    print(f" ❌ TIER-2 override (reject): {tier2['reason'][:100]}")
                    time.sleep(0.5)
                    continue

                if tier2["confidence"] >= AUTO_APPROVE_CONF:
                    # Opus confirms. Auto-merge using opus's (possibly refined) content.
                    cid = submit_candidate(base, ids, min_sim, tier2, args.dry_run)
                    if cid == "dry-run":
                        auto_merged_tier2 += 1
                        print(" [dry-run] would auto-merge (tier-2)")
                    elif cid:
                        new_id = auto_approve(base, cid, actor="auto-dedup-tier2", dry_run=args.dry_run)
                        if new_id:
                            auto_merged_tier2 += 1
                            print(f" ✅ auto-merged (tier-2) → {str(new_id)[:8]}")
                        else:
                            human_candidates += 1
                            print(f" ⚠️ tier-2 approve failed; candidate {cid[:8]} left pending")
                    else:
                        skipped_existing += 1
                    time.sleep(0.5)
                    continue

                # Opus confirmed but low confidence → human review with opus's draft
                cid = submit_candidate(base, ids, min_sim, tier2, args.dry_run)
                if cid and cid != "dry-run":
                    human_candidates += 1
                    print(f" → candidate {cid[:8]} (tier-2 low-confidence, human review)")
                time.sleep(0.5)
                continue

            # Below tier-2 eligibility (either non-autonomous mode, or
            # similarity too low / cross-bucket). Always human review.
            if min_sim < TIER2_MIN_SIM or not bucket_ok:
                skipped_low_sim += 1
                # Still create a human candidate so it's visible, but log why
                print(f" → below auto-tier thresholds (min_sim={min_sim:.3f}, bucket_ok={bucket_ok})")

            cid = submit_candidate(base, ids, min_sim, tier1, args.dry_run)
            if cid == "dry-run":
                human_candidates += 1
            elif cid:
                human_candidates += 1
                print(f" → candidate {cid[:8]} (human review)")
            else:
                skipped_existing += 1
            time.sleep(0.3)

    print(
        f"\nsummary: clusters_found={clusters_found} "
        f"auto_merged_tier1={auto_merged_tier1} "
        f"auto_merged_tier2={auto_merged_tier2} "
        f"human_candidates={human_candidates} "
        f"tier1_rejections={tier1_rejections} "
        f"tier2_overrides={tier2_overrides} "
        f"skipped_low_sim={skipped_low_sim} "
        f"skipped_existing={skipped_existing}"
    )


if __name__ == "__main__":
    main()
|