fix(7A): host-side memory_dedup.py must stay stdlib-only

Broke the dedup-watcher cron when I wrote memory_dedup.py in session
7A: it imported atocore.memory.similarity, which transitively pulls in
sentence-transformers + pydantic_settings, neither of which the host
Python intentionally carries. Every UI-triggered and cron dedup scan
since the 7A deploy was silently crashing with ModuleNotFoundError
(visible only in /home/papa/atocore-logs/dedup-ondemand-*.log).
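
The failing chain, reconstructed (the entry point and the crash are
from the logs; the exact intermediate imports are an assumption):

    # scripts/memory_dedup.py as shipped in 7A: dies at import time on the host
    from atocore.memory.similarity import cluster_by_threshold
    # -> atocore.memory.similarity pulls in sentence_transformers (and
    #    pydantic_settings via atocore settings), neither installed on the host
    # -> ModuleNotFoundError before the script's main() ever runs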

I even documented this architecture rule in atocore.memory._llm_prompt
('This module MUST stay stdlib-only'), then violated it one session
later. Shame.

Real fix — matches the extractor pattern:
- New endpoint POST /admin/memory/dedup-cluster on the server: takes
  {project, similarity_threshold, max_clusters}, runs the embedding +
  transitive clustering inside the container where sentence-transformers
  lives, and returns the cluster shape.
- scripts/memory_dedup.py now pure stdlib: pulls clusters via HTTP
  (client sketch below), LLM-drafts merges via claude CLI, POSTs
  proposals back. No atocore imports beyond the stdlib-only
  _dedup_prompt shared module.
- Regression test pins the rule: test_memory_dedup_script_is_stdlib_only
  snapshots sys.modules before/after importing the script and asserts
  no disallowed atocore modules were pulled in (test sketch below).
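
For reference, a minimal sketch of the stdlib-only client side. The
endpoint and body shape are this commit's; the base URL, env var, and
function name are illustrative:

    import json
    import os
    import urllib.request

    BASE = os.environ.get("ATOCORE_URL", "http://127.0.0.1:8080")  # illustrative

    def fetch_clusters(project: str = "", threshold: float = 0.88,
                       max_clusters: int = 100) -> dict:
        # POST the same body shape DedupClusterBody expects; stdlib only.
        payload = json.dumps({
            "project": project,
            "similarity_threshold": threshold,
            "max_clusters": max_clusters,
        }).encode("utf-8")
        req = urllib.request.Request(
            f"{BASE}/admin/memory/dedup-cluster",
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=120) as resp:
            return json.load(resp)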
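
And a plausible shape for the regression test; the allowlist and the
script's module path are assumptions:

    import importlib
    import sys

    # Parent packages come in alongside the shared prompt module; allow them too.
    ALLOWED = {"atocore", "atocore.memory", "atocore.memory._dedup_prompt"}

    def test_memory_dedup_script_is_stdlib_only():
        sys.modules.pop("scripts.memory_dedup", None)  # force a fresh import
        before = set(sys.modules)
        importlib.import_module("scripts.memory_dedup")  # assumes scripts/ is a package
        pulled = set(sys.modules) - before
        offenders = {m for m in pulled
                     if (m == "atocore" or m.startswith("atocore.")) and m not in ALLOWED}
        assert not offenders, f"memory_dedup pulled atocore modules: {sorted(offenders)}"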

Also: similarity.py + cluster_by_threshold stay server-side, still
covered by the same tests that used to live in the host tier-helper
section.
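
To pin down what "transitive" means without opening similarity.py: a
plausible pure-Python equivalent of cluster_by_threshold's clustering
step (the real one embeds texts with sentence-transformers first; this
union-find assumes the similarity matrix is already computed):

    def cluster_transitively(matrix: list[list[float]], threshold: float) -> list[list[int]]:
        # i and j share a cluster if a chain of pairs, each >= threshold, links them.
        n = len(matrix)
        parent = list(range(n))

        def find(x: int) -> int:
            while parent[x] != x:
                parent[x] = parent[parent[x]]  # path halving
                x = parent[x]
            return x

        for i in range(n):
            for j in range(i + 1, n):
                if matrix[i][j] >= threshold:
                    parent[find(i)] = find(j)  # union the two roots

        groups: dict[int, list[int]] = {}
        for i in range(n):
            groups.setdefault(find(i), []).append(i)
        return list(groups.values())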

Tests 459 → 458 (-3 obsolete host-tier helper tests removed in the
rewrite, +2 for the new stdlib-only regression + endpoint shape tests).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

@@ -1744,6 +1744,102 @@ class DedupScanRequestBody(BaseModel):
     max_batch: int = 50


class DedupClusterBody(BaseModel):
    project: str = ""
    similarity_threshold: float = 0.88
    max_clusters: int = 100


@router.post("/admin/memory/dedup-cluster")
def api_dedup_cluster(body: DedupClusterBody) -> dict:
    """Server-side near-duplicate clustering for Phase 7A dedup detector.

    Host-side scripts/memory_dedup.py can't import atocore.memory.similarity
    (that would transitively pull sentence-transformers + torch onto the
    host Python, which intentionally stays lean). Instead the host posts
    here; we compute embeddings + transitive clusters server-side and
    return the cluster shape the host needs to draft merges via claude CLI.

    Buckets by (project, memory_type) — cross-bucket merges are deferred
    to the 7B contradiction flow. Active non-graduated memories only.

    Returns up to max_clusters clusters of size >= 2, ordered by min
    intra-cluster similarity descending (strongest candidates first).
    """
    from atocore.memory.service import get_memories
    from atocore.memory.similarity import cluster_by_threshold, similarity_matrix

    project_filter = (body.project or "").strip() or None
    threshold = max(0.5, min(0.99, body.similarity_threshold))
    mems = get_memories(
        project=project_filter,
        active_only=True,
        limit=2000,
    )
    # Drop graduated (frozen entity pointers) — they're exempt from dedup
    mems = [m for m in mems if m.status == "active"]

    # Group by (project, memory_type)
    buckets: dict[tuple[str, str], list] = {}
    for m in mems:
        key = ((m.project or "").lower(), (m.memory_type or "").lower())
        buckets.setdefault(key, []).append(m)

    out_clusters: list[dict] = []
    for (proj, mtype), group in sorted(buckets.items()):
        if len(group) < 2:
            continue
        texts = [m.content or "" for m in group]
        clusters = cluster_by_threshold(texts, threshold)
        clusters = [c for c in clusters if len(c) >= 2]
        if not clusters:
            continue
        # Cache matrix once per bucket so we can report min pairwise sim
        matrix = similarity_matrix(texts)
        for cluster in clusters:
            min_sim = 1.0
            for i in range(len(cluster)):
                for j in range(i + 1, len(cluster)):
                    s = matrix[cluster[i]][cluster[j]]
                    if s < min_sim:
                        min_sim = s
            sources = []
            for idx in cluster:
                m = group[idx]
                sources.append({
                    "id": m.id,
                    "memory_type": m.memory_type,
                    "content": m.content,
                    "project": m.project or "",
                    "confidence": m.confidence,
                    "reference_count": m.reference_count,
                    "domain_tags": m.domain_tags or [],
                    "valid_until": m.valid_until or "",
                })
            out_clusters.append({
                "project": proj,
                "memory_type": mtype,
                "min_similarity": round(min_sim, 4),
                "size": len(cluster),
                "memory_ids": [s["id"] for s in sources],
                "sources": sources,
            })

    # Strongest clusters first
    out_clusters.sort(key=lambda c: -c["min_similarity"])
    out_clusters = out_clusters[:body.max_clusters]
    return {
        "cluster_count": len(out_clusters),
        "threshold": threshold,
        "project_filter": project_filter or "",
        "total_active_scanned": len(mems),
        "bucket_count": sum(1 for g in buckets.values() if len(g) >= 2),
        "clusters": out_clusters,
    }


@router.get("/admin/memory/merge-candidates")
def api_list_merge_candidates(status: str = "pending", limit: int = 100) -> dict:
    """Phase 7A: list merge-candidate proposals for triage UI."""