2026-04-18 10:30:49 -04:00
|
|
|
#!/usr/bin/env python3
|
2026-04-21 16:18:00 -04:00
|
|
|
"""Phase 7A — semantic memory dedup detector (stdlib-only host script).
|
2026-04-18 10:30:49 -04:00
|
|
|
|
|
|
|
|
Finds clusters of near-duplicate active memories and writes merge-
|
2026-04-21 16:18:00 -04:00
|
|
|
candidate proposals for human (or autonomous) approval.
|
2026-04-18 10:30:49 -04:00
|
|
|
|
|
|
|
|
Algorithm:
|
2026-04-21 16:18:00 -04:00
|
|
|
1. POST /admin/memory/dedup-cluster on the AtoCore server — it
|
|
|
|
|
computes embeddings + transitive clusters under the (project,
|
|
|
|
|
memory_type) bucket rule (sentence-transformers lives in the
|
|
|
|
|
container, not on the host)
|
|
|
|
|
2. For each returned cluster of size >= 2, ask claude-p (host-side
|
|
|
|
|
CLI) to draft unified content preserving all specifics
|
|
|
|
|
3. Server-side tiering:
|
|
|
|
|
- TIER-1 auto-approve: sonnet confidence >= 0.8 AND min_sim >= 0.92
|
|
|
|
|
AND all sources share project+type → immediately submit and
|
|
|
|
|
approve (actor="auto-dedup-tier1")
|
|
|
|
|
- TIER-2 escalation: opus confirms with conf >= 0.8 → auto-approve
|
|
|
|
|
(actor="auto-dedup-tier2"); opus rejects → skip silently
|
|
|
|
|
- HUMAN: pending proposal lands in /admin/triage
|
|
|
|
|
|
|
|
|
|
Host-only dep: the `claude` CLI. No python packages beyond stdlib.
|
|
|
|
|
Reuses atocore.memory._dedup_prompt (stdlib-only shared prompt).
|
2026-04-18 10:30:49 -04:00
|
|
|
|
|
|
|
|
Usage:
|
|
|
|
|
python3 scripts/memory_dedup.py --base-url http://127.0.0.1:8100 \\
|
|
|
|
|
--similarity-threshold 0.88 --max-batch 50
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
|
import json
|
|
|
|
|
import os
|
|
|
|
|
import shutil
|
|
|
|
|
import subprocess
|
|
|
|
|
import sys
|
|
|
|
|
import tempfile
|
|
|
|
|
import time
|
|
|
|
|
import urllib.error
|
|
|
|
|
import urllib.request
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
2026-04-21 16:18:00 -04:00
|
|
|
# Make src/ importable for the stdlib-only prompt module.
# We DO NOT import anything that pulls in pydantic_settings or
# sentence-transformers; those live on the server side.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_SRC_DIR = os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "src"))
if _SRC_DIR not in sys.path:
    # Prepend so the repo checkout wins over any installed copy of atocore.
    sys.path.insert(0, _SRC_DIR)
|
|
|
|
|
|
|
|
|
|
from atocore.memory._dedup_prompt import ( # noqa: E402
|
|
|
|
|
DEDUP_PROMPT_VERSION,
|
|
|
|
|
SYSTEM_PROMPT,
|
2026-04-18 15:46:26 -04:00
|
|
|
TIER2_SYSTEM_PROMPT,
|
|
|
|
|
build_tier2_user_message,
|
2026-04-18 10:30:49 -04:00
|
|
|
build_user_message,
|
|
|
|
|
normalize_merge_verdict,
|
|
|
|
|
parse_merge_verdict,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# Connection / model defaults, all overridable via environment variables.
DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://127.0.0.1:8100")
DEFAULT_MODEL = os.environ.get("ATOCORE_DEDUP_MODEL", "sonnet")
DEFAULT_TIER2_MODEL = os.environ.get("ATOCORE_DEDUP_TIER2_MODEL", "opus")
DEFAULT_TIMEOUT_S = float(os.environ.get("ATOCORE_DEDUP_TIMEOUT_S", "60"))

# Tier-1 auto-approve gates: model confidence AND cluster min similarity
# must both clear these thresholds.
AUTO_APPROVE_CONF = float(os.environ.get("ATOCORE_DEDUP_AUTO_APPROVE_CONF", "0.8"))
AUTO_APPROVE_SIM = float(os.environ.get("ATOCORE_DEDUP_AUTO_APPROVE_SIM", "0.92"))
# Tier-2 escalation gates: clusters below these floors go straight to humans.
TIER2_MIN_CONF = float(os.environ.get("ATOCORE_DEDUP_TIER2_MIN_CONF", "0.5"))
TIER2_MIN_SIM = float(os.environ.get("ATOCORE_DEDUP_TIER2_MIN_SIM", "0.85"))

# Lazily-created scratch directory used as cwd for claude CLI runs
# (see get_sandbox_cwd); None until first use.
_sandbox_cwd = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_sandbox_cwd() -> str:
    """Return a process-wide scratch directory for claude CLI invocations.

    Created on first call via ``tempfile.mkdtemp``; every later call
    returns the same path.
    """
    global _sandbox_cwd
    if _sandbox_cwd is not None:
        return _sandbox_cwd
    _sandbox_cwd = tempfile.mkdtemp(prefix="ato-dedup-")
    return _sandbox_cwd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def api_get(base_url: str, path: str) -> dict:
    """GET ``base_url + path`` and return the decoded JSON response body."""
    request = urllib.request.Request(f"{base_url}{path}")
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = response.read().decode("utf-8")
    return json.loads(payload)
|
|
|
|
|
|
|
|
|
|
|
2026-04-21 16:18:00 -04:00
|
|
|
def api_post(base_url: str, path: str, body: dict | None = None, timeout: int = 60) -> dict:
    """POST *body* as JSON to ``base_url + path`` and return the decoded reply."""
    payload = json.dumps(body or {}).encode("utf-8")
    request = urllib.request.Request(
        f"{base_url}{path}",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read().decode("utf-8")
    return json.loads(raw)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def call_claude(system_prompt: str, user_message: str, model: str, timeout_s: float) -> tuple[str | None, str | None]:
|
|
|
|
|
if not shutil.which("claude"):
|
|
|
|
|
return None, "claude CLI not available"
|
|
|
|
|
args = [
|
|
|
|
|
"claude", "-p",
|
|
|
|
|
"--model", model,
|
|
|
|
|
"--append-system-prompt", system_prompt,
|
|
|
|
|
"--disable-slash-commands",
|
|
|
|
|
user_message,
|
|
|
|
|
]
|
|
|
|
|
last_error = ""
|
|
|
|
|
for attempt in range(3):
|
|
|
|
|
if attempt > 0:
|
|
|
|
|
time.sleep(2 ** attempt)
|
|
|
|
|
try:
|
|
|
|
|
completed = subprocess.run(
|
|
|
|
|
args, capture_output=True, text=True,
|
|
|
|
|
timeout=timeout_s, cwd=get_sandbox_cwd(),
|
|
|
|
|
encoding="utf-8", errors="replace",
|
|
|
|
|
)
|
|
|
|
|
except subprocess.TimeoutExpired:
|
|
|
|
|
last_error = f"{model} timed out"
|
|
|
|
|
continue
|
|
|
|
|
except Exception as exc:
|
|
|
|
|
last_error = f"subprocess error: {exc}"
|
|
|
|
|
continue
|
|
|
|
|
if completed.returncode == 0:
|
|
|
|
|
return (completed.stdout or "").strip(), None
|
|
|
|
|
stderr = (completed.stderr or "").strip()[:200]
|
2026-04-21 16:18:00 -04:00
|
|
|
last_error = f"{model} exit {completed.returncode}: {stderr}"
|
2026-04-18 10:30:49 -04:00
|
|
|
return None, last_error
|
|
|
|
|
|
|
|
|
|
|
2026-04-21 16:18:00 -04:00
|
|
|
def fetch_clusters(base_url: str, project: str, threshold: float, max_clusters: int) -> list[dict]:
    """Ask the server to compute near-duplicate clusters.

    The server owns sentence-transformers; the host stays lean. On any
    request failure an error is printed to stderr and an empty list is
    returned.
    """
    request_body = {
        "project": project,
        "similarity_threshold": threshold,
        "max_clusters": max_clusters,
    }
    try:
        result = api_post(base_url, "/admin/memory/dedup-cluster", request_body, timeout=120)
    except Exception as e:
        print(f"ERROR: dedup-cluster fetch failed: {e}", file=sys.stderr)
        return []
    clusters = result.get("clusters", [])
    print(
        f"server returned {len(clusters)} clusters "
        f"(total_active={result.get('total_active_scanned')}, "
        f"buckets={result.get('bucket_count')})"
    )
    return clusters
|
2026-04-18 10:30:49 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def draft_merge(sources: list[dict], model: str, timeout_s: float) -> dict[str, Any] | None:
    """Tier-1: ask *model* to draft a unified memory for *sources*.

    Returns a normalized verdict dict, or None when the CLI call fails or
    its output cannot be parsed (a warning is printed either way).
    """
    raw, err = call_claude(SYSTEM_PROMPT, build_user_message(sources), model, timeout_s)
    if err:
        print(f" WARN: claude tier-1 failed: {err}", file=sys.stderr)
        return None
    verdict = parse_merge_verdict(raw or "")
    if verdict is None:
        print(f" WARN: could not parse tier-1 verdict: {(raw or '')[:200]}", file=sys.stderr)
        return None
    return normalize_merge_verdict(verdict)
|
|
|
|
|
|
|
|
|
|
|
2026-04-21 16:18:00 -04:00
|
|
|
def tier2_review(sources: list[dict], tier1_verdict: dict, model: str, timeout_s: float) -> dict | None:
    """Tier-2: ask the stronger *model* to confirm or override a tier-1 draft.

    Returns a normalized verdict dict, or None when the CLI call fails or
    its output cannot be parsed (a warning is printed either way).
    """
    raw, err = call_claude(
        TIER2_SYSTEM_PROMPT,
        build_tier2_user_message(sources, tier1_verdict),
        model,
        timeout_s,
    )
    if err:
        print(f" WARN: claude tier-2 failed: {err}", file=sys.stderr)
        return None
    verdict = parse_merge_verdict(raw or "")
    if verdict is None:
        print(f" WARN: could not parse tier-2 verdict: {(raw or '')[:200]}", file=sys.stderr)
        return None
    return normalize_merge_verdict(verdict)
|
|
|
|
|
|
|
|
|
|
|
2026-04-21 16:18:00 -04:00
|
|
|
def submit_candidate(base_url: str, memory_ids: list[str], similarity: float, verdict: dict, dry_run: bool) -> str | None:
|
2026-04-18 10:30:49 -04:00
|
|
|
body = {
|
|
|
|
|
"memory_ids": memory_ids,
|
|
|
|
|
"similarity": similarity,
|
|
|
|
|
"proposed_content": verdict["content"],
|
|
|
|
|
"proposed_memory_type": verdict["memory_type"],
|
|
|
|
|
"proposed_project": verdict["project"],
|
|
|
|
|
"proposed_tags": verdict["domain_tags"],
|
|
|
|
|
"proposed_confidence": verdict["confidence"],
|
|
|
|
|
"reason": verdict["reason"],
|
|
|
|
|
}
|
|
|
|
|
if dry_run:
|
|
|
|
|
print(f" [dry-run] would POST: {json.dumps(body)[:200]}...")
|
|
|
|
|
return "dry-run"
|
|
|
|
|
try:
|
|
|
|
|
result = api_post(base_url, "/admin/memory/merge-candidates/create", body)
|
|
|
|
|
return result.get("candidate_id")
|
|
|
|
|
except urllib.error.HTTPError as e:
|
|
|
|
|
print(f" ERROR: submit failed: {e.code} {e.read().decode()[:200]}", file=sys.stderr)
|
|
|
|
|
return None
|
|
|
|
|
except Exception as e:
|
|
|
|
|
print(f" ERROR: submit failed: {e}", file=sys.stderr)
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
2026-04-18 15:46:26 -04:00
|
|
|
def auto_approve(base_url: str, candidate_id: str, actor: str, dry_run: bool) -> str | None:
|
|
|
|
|
if dry_run:
|
|
|
|
|
return "dry-run"
|
|
|
|
|
try:
|
|
|
|
|
result = api_post(
|
|
|
|
|
base_url,
|
|
|
|
|
f"/admin/memory/merge-candidates/{candidate_id}/approve",
|
|
|
|
|
{"actor": actor},
|
|
|
|
|
)
|
|
|
|
|
return result.get("result_memory_id")
|
|
|
|
|
except Exception as e:
|
|
|
|
|
print(f" ERROR: auto-approve failed: {e}", file=sys.stderr)
|
|
|
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
2026-04-18 10:30:49 -04:00
|
|
|
def main() -> None:
    """Entry point: fetch dedup clusters and run the tiered merge pipeline.

    For each cluster: tier-1 draft (sonnet) → optional tier-2 review
    (opus) → auto-approve or human-triage submission, then print a
    one-line summary of counters.
    """
    parser = argparse.ArgumentParser(description="Phase 7A semantic dedup detector (tiered, stdlib-only host)")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--project", default="", help="Only scan this project (empty = all)")
    parser.add_argument("--similarity-threshold", type=float, default=0.88)
    parser.add_argument("--max-batch", type=int, default=50,
                        help="Max clusters to process per run")
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--tier2-model", default=DEFAULT_TIER2_MODEL)
    parser.add_argument("--timeout-s", type=float, default=DEFAULT_TIMEOUT_S)
    parser.add_argument("--no-auto-approve", action="store_true",
                        help="Disable autonomous merging; all merges land in human triage queue")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    base = args.base_url.rstrip("/")
    autonomous = not args.no_auto_approve

    # Banner: record thresholds/models so run logs are self-describing.
    print(
        f"memory_dedup {DEDUP_PROMPT_VERSION} | threshold={args.similarity_threshold} | "
        f"tier1={args.model} tier2={args.tier2_model} | "
        f"autonomous={autonomous} | "
        f"auto-approve: conf>={AUTO_APPROVE_CONF} sim>={AUTO_APPROVE_SIM}"
    )

    clusters = fetch_clusters(
        base, args.project, args.similarity_threshold, args.max_batch,
    )
    if not clusters:
        print("no clusters — nothing to dedup")
        return

    # Run counters, reported in the final summary line.
    auto_merged_tier1 = 0
    auto_merged_tier2 = 0
    human_candidates = 0
    tier1_rejections = 0
    tier2_overrides = 0
    skipped_existing = 0
    processed = 0

    for cluster in clusters:
        # Guard again on the client side even though the server already
        # caps max_clusters.
        if processed >= args.max_batch:
            break
        processed += 1
        sources = cluster["sources"]
        ids = cluster["memory_ids"]
        min_sim = float(cluster["min_similarity"])
        proj = cluster.get("project") or "(global)"
        mtype = cluster.get("memory_type") or "?"

        print(f"\n[{proj}/{mtype}] cluster size={cluster['size']} min_sim={min_sim:.3f} "
              f"{[s['id'][:8] for s in sources]}")

        # Tier-1: draft a unified memory; None means CLI/parse failure.
        tier1 = draft_merge(sources, args.model, args.timeout_s)
        if tier1 is None:
            continue
        if tier1["action"] == "reject":
            tier1_rejections += 1
            print(f" TIER-1 rejected: {tier1['reason'][:100]}")
            continue

        # All sources share the bucket by construction from the server
        bucket_ok = True
        tier1_ok = (
            tier1["confidence"] >= AUTO_APPROVE_CONF
            and min_sim >= AUTO_APPROVE_SIM
            and bucket_ok
        )

        # TIER-1 fast path: confident + highly similar → submit & approve.
        if autonomous and tier1_ok:
            cid = submit_candidate(base, ids, min_sim, tier1, args.dry_run)
            if cid == "dry-run":
                auto_merged_tier1 += 1
                print(" [dry-run] would auto-merge (tier-1)")
            elif cid:
                new_id = auto_approve(base, cid, actor="auto-dedup-tier1", dry_run=args.dry_run)
                if new_id:
                    auto_merged_tier1 += 1
                    print(f" ✅ auto-merged (tier-1) → {str(new_id)[:8]}")
                else:
                    # Approval failed; the candidate stays pending for humans.
                    human_candidates += 1
                    print(f" ⚠️ tier-1 approve failed; candidate {cid[:8]} pending")
            else:
                skipped_existing += 1
            time.sleep(0.3)
            continue

        # TIER-2 eligibility: only escalate when the cluster clears the
        # lower similarity/confidence floors.
        tier2_eligible = (
            autonomous
            and min_sim >= TIER2_MIN_SIM
            and tier1["confidence"] >= TIER2_MIN_CONF
        )

        if tier2_eligible:
            print(" → escalating to tier-2 (opus)…")
            tier2 = tier2_review(sources, tier1, args.tier2_model, args.timeout_s)
            if tier2 is None:
                # Tier-2 errored: fall back to human review with the
                # tier-1 draft.
                cid = submit_candidate(base, ids, min_sim, tier1, args.dry_run)
                if cid and cid != "dry-run":
                    human_candidates += 1
                    print(f" → candidate {cid[:8]} (tier-2 errored, human review)")
                time.sleep(0.5)
                continue

            if tier2["action"] == "reject":
                # Opus overrides the tier-1 draft: skip silently.
                tier2_overrides += 1
                print(f" ❌ TIER-2 override (reject): {tier2['reason'][:100]}")
                time.sleep(0.5)
                continue

            # Tier-2 confirmed with high confidence → auto-merge its verdict.
            if tier2["confidence"] >= AUTO_APPROVE_CONF:
                cid = submit_candidate(base, ids, min_sim, tier2, args.dry_run)
                if cid == "dry-run":
                    auto_merged_tier2 += 1
                elif cid:
                    new_id = auto_approve(base, cid, actor="auto-dedup-tier2", dry_run=args.dry_run)
                    if new_id:
                        auto_merged_tier2 += 1
                        print(f" ✅ auto-merged (tier-2) → {str(new_id)[:8]}")
                    else:
                        human_candidates += 1
                else:
                    skipped_existing += 1
                time.sleep(0.5)
                continue

            # Tier-2 confirmed but below auto-approve confidence → human review.
            cid = submit_candidate(base, ids, min_sim, tier2, args.dry_run)
            if cid and cid != "dry-run":
                human_candidates += 1
                print(f" → candidate {cid[:8]} (tier-2 low-conf, human review)")
            time.sleep(0.5)
            continue

        # Below tier-2 thresholds — human review with tier-1 draft
        cid = submit_candidate(base, ids, min_sim, tier1, args.dry_run)
        if cid == "dry-run":
            human_candidates += 1
        elif cid:
            human_candidates += 1
            print(f" → candidate {cid[:8]} (human review)")
        else:
            skipped_existing += 1
        time.sleep(0.3)

    print(
        f"\nsummary: clusters_processed={processed} "
        f"auto_merged_tier1={auto_merged_tier1} "
        f"auto_merged_tier2={auto_merged_tier2} "
        f"human_candidates={human_candidates} "
        f"tier1_rejections={tier1_rejections} "
        f"tier2_overrides={tier2_overrides} "
        f"skipped_existing={skipped_existing}"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point.
if __name__ == "__main__":
    main()
|