fix(7A): host-side memory_dedup.py must stay stdlib-only

Broke the dedup-watcher cron when I wrote memory_dedup.py in session
7A: it imported atocore.memory.similarity, which transitively pulls in
sentence-transformers + pydantic_settings, neither of which the host
Python intentionally carries. Every UI-triggered and cron dedup scan
since the 7A deploy was silently crashing with ModuleNotFoundError
(visible only in /home/papa/atocore-logs/dedup-ondemand-*.log).
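
The failing chain, reconstructed (the entry point and the crash are
from the logs; the exact intermediate imports are an assumption):

    # scripts/memory_dedup.py as shipped in 7A: dies at import time on the host
    from atocore.memory.similarity import cluster_by_threshold
    # -> atocore.memory.similarity pulls in sentence_transformers (and
    #    pydantic_settings via atocore settings), neither installed on the host
    # -> ModuleNotFoundError before the script's main() ever runs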

I even documented this architecture rule in atocore.memory._llm_prompt
('This module MUST stay stdlib-only'), then violated it one session
later. Shame.

Real fix — matches the extractor pattern:
- New endpoint POST /admin/memory/dedup-cluster on the server: takes
  {project, similarity_threshold, max_clusters}, runs the embedding +
  transitive clustering inside the container where sentence-transformers
  lives, and returns the cluster shape.
- scripts/memory_dedup.py now pure stdlib: pulls clusters via HTTP
  (client sketch below), LLM-drafts merges via claude CLI, POSTs
  proposals back. No atocore imports beyond the stdlib-only
  _dedup_prompt shared module.
- Regression test pins the rule: test_memory_dedup_script_is_stdlib_only
  snapshots sys.modules before/after importing the script and asserts
  no disallowed atocore modules were pulled in (test sketch below).
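
For reference, a minimal sketch of the stdlib-only client side. The
endpoint and body shape are this commit's; the base URL, env var, and
function name are illustrative:

    import json
    import os
    import urllib.request

    BASE = os.environ.get("ATOCORE_URL", "http://127.0.0.1:8080")  # illustrative

    def fetch_clusters(project: str = "", threshold: float = 0.88,
                       max_clusters: int = 100) -> dict:
        # POST the same body shape DedupClusterBody expects; stdlib only.
        payload = json.dumps({
            "project": project,
            "similarity_threshold": threshold,
            "max_clusters": max_clusters,
        }).encode("utf-8")
        req = urllib.request.Request(
            f"{BASE}/admin/memory/dedup-cluster",
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=120) as resp:
            return json.load(resp)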
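
And a plausible shape for the regression test; the allowlist and the
script's module path are assumptions:

    import importlib
    import sys

    # Parent packages come in alongside the shared prompt module; allow them too.
    ALLOWED = {"atocore", "atocore.memory", "atocore.memory._dedup_prompt"}

    def test_memory_dedup_script_is_stdlib_only():
        sys.modules.pop("scripts.memory_dedup", None)  # force a fresh import
        before = set(sys.modules)
        importlib.import_module("scripts.memory_dedup")  # assumes scripts/ is a package
        pulled = set(sys.modules) - before
        offenders = {m for m in pulled
                     if (m == "atocore" or m.startswith("atocore.")) and m not in ALLOWED}
        assert not offenders, f"memory_dedup pulled atocore modules: {sorted(offenders)}"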

Also: similarity.py + cluster_by_threshold stay server-side, still
covered by the same tests that used to live in the host tier-helper
section.
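
To pin down what "transitive" means without opening similarity.py: a
plausible pure-Python equivalent of cluster_by_threshold's clustering
step (the real one embeds texts with sentence-transformers first; this
union-find assumes the similarity matrix is already computed):

    def cluster_transitively(matrix: list[list[float]], threshold: float) -> list[list[int]]:
        # i and j share a cluster if a chain of pairs, each >= threshold, links them.
        n = len(matrix)
        parent = list(range(n))

        def find(x: int) -> int:
            while parent[x] != x:
                parent[x] = parent[parent[x]]  # path halving
                x = parent[x]
            return x

        for i in range(n):
            for j in range(i + 1, n):
                if matrix[i][j] >= threshold:
                    parent[find(i)] = find(j)  # union the two roots

        groups: dict[int, list[int]] = {}
        for i in range(n):
            groups.setdefault(find(i), []).append(i)
        return list(groups.values())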

Tests 459 → 458 (-3 obsolete host-tier helper tests removed in the
rewrite, +2 for the new stdlib-only regression + endpoint shape tests).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

@@ -1744,6 +1744,102 @@ class DedupScanRequestBody(BaseModel):
     max_batch: int = 50


class DedupClusterBody(BaseModel):
    project: str = ""
    similarity_threshold: float = 0.88
    max_clusters: int = 100


@router.post("/admin/memory/dedup-cluster")
def api_dedup_cluster(body: DedupClusterBody) -> dict:
    """Server-side near-duplicate clustering for Phase 7A dedup detector.

    Host-side scripts/memory_dedup.py can't import atocore.memory.similarity
    (that would transitively pull sentence-transformers + torch onto the
    host Python, which intentionally stays lean). Instead the host posts
    here; we compute embeddings + transitive clusters server-side and
    return the cluster shape the host needs to draft merges via claude CLI.

    Buckets by (project, memory_type) — cross-bucket merges are deferred
    to the 7B contradiction flow. Active non-graduated memories only.

    Returns up to max_clusters clusters of size >= 2, ordered by min
    intra-cluster similarity descending (strongest candidates first).
    """
    from atocore.memory.service import get_memories
    from atocore.memory.similarity import cluster_by_threshold, similarity_matrix

    project_filter = (body.project or "").strip() or None
    threshold = max(0.5, min(0.99, body.similarity_threshold))
    mems = get_memories(
        project=project_filter,
        active_only=True,
        limit=2000,
    )
    # Drop graduated (frozen entity pointers) — they're exempt from dedup
    mems = [m for m in mems if m.status == "active"]

    # Group by (project, memory_type)
    buckets: dict[tuple[str, str], list] = {}
    for m in mems:
        key = ((m.project or "").lower(), (m.memory_type or "").lower())
        buckets.setdefault(key, []).append(m)

    out_clusters: list[dict] = []
    for (proj, mtype), group in sorted(buckets.items()):
        if len(group) < 2:
            continue
        texts = [m.content or "" for m in group]
        clusters = cluster_by_threshold(texts, threshold)
        clusters = [c for c in clusters if len(c) >= 2]
        if not clusters:
            continue
        # Cache matrix once per bucket so we can report min pairwise sim
        matrix = similarity_matrix(texts)
        for cluster in clusters:
            min_sim = 1.0
            for i in range(len(cluster)):
                for j in range(i + 1, len(cluster)):
                    s = matrix[cluster[i]][cluster[j]]
                    if s < min_sim:
                        min_sim = s
            sources = []
            for idx in cluster:
                m = group[idx]
                sources.append({
                    "id": m.id,
                    "memory_type": m.memory_type,
                    "content": m.content,
                    "project": m.project or "",
                    "confidence": m.confidence,
                    "reference_count": m.reference_count,
                    "domain_tags": m.domain_tags or [],
                    "valid_until": m.valid_until or "",
                })
            out_clusters.append({
                "project": proj,
                "memory_type": mtype,
                "min_similarity": round(min_sim, 4),
                "size": len(cluster),
                "memory_ids": [s["id"] for s in sources],
                "sources": sources,
            })

    # Strongest clusters first
    out_clusters.sort(key=lambda c: -c["min_similarity"])
    out_clusters = out_clusters[:body.max_clusters]
    return {
        "cluster_count": len(out_clusters),
        "threshold": threshold,
        "project_filter": project_filter or "",
        "total_active_scanned": len(mems),
        "bucket_count": sum(1 for g in buckets.values() if len(g) >= 2),
        "clusters": out_clusters,
    }


@router.get("/admin/memory/merge-candidates")
def api_list_merge_candidates(status: str = "pending", limit: int = 100) -> dict:
    """Phase 7A: list merge-candidate proposals for triage UI."""