fix(7A): host-side memory_dedup.py must stay stdlib-only
Broke the dedup-watcher cron when I wrote memory_dedup.py in session
7A: imported atocore.memory.similarity, which transitively pulls
sentence-transformers + pydantic_settings onto host Python that
intentionally doesn't have them. Every UI-triggered + cron dedup scan
since 7A deployed was silently crashing with ModuleNotFoundError
(visible only in /home/papa/atocore-logs/dedup-ondemand-*.log).
I even documented this architecture rule in atocore.memory._llm_prompt
('This module MUST stay stdlib-only') then violated it one session
later. Shame.
Real fix — matches the extractor pattern:
- New endpoint POST /admin/memory/dedup-cluster on the server: takes
{project, similarity_threshold, max_clusters}, runs the embedding +
transitive-clustering inside the container where
sentence-transformers lives, returns cluster shape.
- scripts/memory_dedup.py now pure stdlib: pulls clusters via HTTP,
LLM-drafts merges via claude CLI, POSTs proposals back. No atocore
imports beyond the stdlib-only _dedup_prompt shared module.
- Regression test pins the rule: test_memory_dedup_script_is_stdlib_only
snapshots sys.modules before/after importing the script and asserts
no non-allowed atocore modules were pulled.
Also: similarity.py + cluster_by_threshold stay server-side, still
covered by the same tests that used to live in the host tier-helper
section.
Tests 459 → 458 (net -1: the obsolete host-tier helper tests were removed
in the rewrite; +2 for the new stdlib-only regression + endpoint shape tests).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1744,6 +1744,102 @@ class DedupScanRequestBody(BaseModel):
|
||||
max_batch: int = 50
|
||||
|
||||
|
||||
class DedupClusterBody(BaseModel):
    """Request body for POST /admin/memory/dedup-cluster.

    Attributes:
        project: optional project filter; empty string means all projects.
        similarity_threshold: cosine-similarity cutoff for clustering —
            the endpoint clamps it to [0.5, 0.99] server-side.
        max_clusters: upper bound on the number of clusters returned.
    """

    project: str = ""
    similarity_threshold: float = 0.88
    max_clusters: int = 100
@router.post("/admin/memory/dedup-cluster")
def api_dedup_cluster(body: DedupClusterBody) -> dict:
    """Server-side near-duplicate clustering for the Phase 7A dedup detector.

    The host-side scripts/memory_dedup.py is stdlib-only and must not import
    atocore.memory.similarity (that would transitively pull
    sentence-transformers + torch onto the host Python, which intentionally
    stays lean). The host posts here instead: embeddings and transitive
    clustering run inside the container, and only the resulting cluster
    shape goes back over HTTP so the host can draft merges via the claude
    CLI.

    Memories are bucketed by (project, memory_type) — cross-bucket merges
    are deferred to the 7B contradiction flow. Only active, non-graduated
    memories are considered. Returns at most body.max_clusters clusters of
    size >= 2, ordered by minimum intra-cluster similarity descending
    (strongest merge candidates first).
    """
    # Deferred imports: these pull the heavy embedding stack, which only
    # exists inside the container, never on the host.
    from atocore.memory.service import get_memories
    from atocore.memory.similarity import cluster_by_threshold, similarity_matrix

    project_filter = (body.project or "").strip() or None
    # Clamp whatever the caller sent into a sane similarity range.
    threshold = max(0.5, min(0.99, body.similarity_threshold))

    memories = get_memories(
        project=project_filter,
        active_only=True,
        limit=2000,
    )
    # Graduated memories are frozen entity pointers — exempt from dedup.
    memories = [m for m in memories if m.status == "active"]

    # Bucket by (project, memory_type), case-insensitively.
    buckets: dict[tuple[str, str], list] = {}
    for mem in memories:
        bucket_key = ((mem.project or "").lower(), (mem.memory_type or "").lower())
        buckets.setdefault(bucket_key, []).append(mem)

    results: list[dict] = []
    for (proj, mtype), members in sorted(buckets.items()):
        if len(members) < 2:
            # A single memory can't form a duplicate pair.
            continue

        texts = [m.content or "" for m in members]
        candidate_clusters = [
            c for c in cluster_by_threshold(texts, threshold) if len(c) >= 2
        ]
        if not candidate_clusters:
            continue

        # Compute the matrix once per bucket; reused below to report the
        # minimum pairwise similarity of each cluster.
        matrix = similarity_matrix(texts)

        for member_idxs in candidate_clusters:
            # Weakest link inside the cluster, capped at 1.0 (matches the
            # original accumulator's 1.0 starting value).
            min_sim = min(
                1.0,
                min(
                    matrix[member_idxs[i]][member_idxs[j]]
                    for i in range(len(member_idxs))
                    for j in range(i + 1, len(member_idxs))
                ),
            )
            sources = [
                {
                    "id": members[idx].id,
                    "memory_type": members[idx].memory_type,
                    "content": members[idx].content,
                    "project": members[idx].project or "",
                    "confidence": members[idx].confidence,
                    "reference_count": members[idx].reference_count,
                    "domain_tags": members[idx].domain_tags or [],
                    "valid_until": members[idx].valid_until or "",
                }
                for idx in member_idxs
            ]
            results.append({
                "project": proj,
                "memory_type": mtype,
                "min_similarity": round(min_sim, 4),
                "size": len(member_idxs),
                "memory_ids": [s["id"] for s in sources],
                "sources": sources,
            })

    # Strongest candidates first, then cap the response size.
    results.sort(key=lambda c: c["min_similarity"], reverse=True)
    results = results[: body.max_clusters]

    return {
        "cluster_count": len(results),
        "threshold": threshold,
        "project_filter": project_filter or "",
        "total_active_scanned": len(memories),
        "bucket_count": sum(1 for g in buckets.values() if len(g) >= 2),
        "clusters": results,
    }
@router.get("/admin/memory/merge-candidates")
|
||||
def api_list_merge_candidates(status: str = "pending", limit: int = 100) -> dict:
|
||||
"""Phase 7A: list merge-candidate proposals for triage UI."""
|
||||
|
||||
Reference in New Issue
Block a user