fix(7A): host-side memory_dedup.py must stay stdlib-only

I broke the dedup-watcher cron when I wrote memory_dedup.py in session
7A: it imported atocore.memory.similarity, which transitively imports
sentence-transformers + pydantic_settings — packages the host Python
intentionally doesn't have. Every UI-triggered + cron dedup scan
since 7A was deployed has been silently crashing with ModuleNotFoundError
(visible only in /home/papa/atocore-logs/dedup-ondemand-*.log).

I even documented this architecture rule in atocore.memory._llm_prompt
('This module MUST stay stdlib-only') then violated it one session
later. Shame.

Real fix — matches the extractor pattern:
- New endpoint POST /admin/memory/dedup-cluster on the server: takes
  {project, similarity_threshold, max_clusters}, runs the embedding +
  transitive-clustering inside the container where
  sentence-transformers lives, returns cluster shape.
- scripts/memory_dedup.py now pure stdlib: pulls clusters via HTTP,
  LLM-drafts merges via claude CLI, POSTs proposals back. No atocore
  imports beyond the stdlib-only _dedup_prompt shared module.
- Regression test pins the rule: test_memory_dedup_script_is_stdlib_only
  snapshots sys.modules before/after importing the script and asserts
  that no atocore modules outside the allow-list were imported.

Also: similarity.py + cluster_by_threshold stay server-side, still
covered by the same tests that used to live in the host tier-helper
section.

Tests 459 → 458 (net -1: the four obsolete host-tier helper tests are
replaced by three new ones — the stdlib-only regression test, the
server-side similarity test, and the endpoint shape test).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-21 16:18:00 -04:00
parent 6a2471d509
commit 90001c1956
3 changed files with 300 additions and 288 deletions

View File

@@ -162,77 +162,82 @@ def test_build_tier2_user_message_includes_tier1_draft():
assert "verdict" in msg.lower()
# --- Tiering helpers (min_pairwise_similarity, same_bucket) ---
# --- Host script is stdlib-only (Phase 7A architecture rule) ---
def test_same_bucket_true_for_matching():
def test_memory_dedup_script_is_stdlib_only():
"""The host-side scripts/memory_dedup.py must NOT import anything
that pulls pydantic_settings, sentence-transformers, torch, etc.
into the host Python. The only atocore-land module allowed is the
stdlib-only prompt helper at atocore.memory._dedup_prompt.
This regression test prevents re-introducing the bug where the
dedup-watcher on Dalidou host crashed with ModuleNotFoundError
because someone imported atocore.memory.similarity (which pulls
in atocore.retrieval.embeddings → sentence_transformers)."""
import importlib.util
import sys as _sys
before = set(_sys.modules.keys())
spec = importlib.util.spec_from_file_location(
"memory_dedup_for_test",
"scripts/memory_dedup.py",
"memory_dedup_for_test", "scripts/memory_dedup.py",
)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
after = set(_sys.modules.keys())
sources = [
{"memory_type": "knowledge", "project": "p04"},
{"memory_type": "knowledge", "project": "p04"},
]
assert mod.same_bucket(sources) is True
def test_same_bucket_false_for_mixed():
import importlib.util
spec = importlib.util.spec_from_file_location(
"memory_dedup_for_test",
"scripts/memory_dedup.py",
new_atocore = sorted(m for m in (after - before) if m.startswith("atocore"))
# Only the stdlib-only shared prompt module is allowed to load
allowed = {"atocore", "atocore.memory", "atocore.memory._dedup_prompt"}
disallowed = [m for m in new_atocore if m not in allowed]
assert not disallowed, (
f"scripts/memory_dedup.py pulled non-stdlib atocore modules "
f"(will break host Python without ML deps): {disallowed}"
)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
# Different project
assert mod.same_bucket([
{"memory_type": "knowledge", "project": "p04"},
{"memory_type": "knowledge", "project": "p05"},
]) is False
# Different memory_type
assert mod.same_bucket([
{"memory_type": "knowledge", "project": "p04"},
{"memory_type": "project", "project": "p04"},
]) is False
def test_min_pairwise_similarity_identical_texts():
import importlib.util
spec = importlib.util.spec_from_file_location(
"memory_dedup_for_test",
"scripts/memory_dedup.py",
# --- Server-side clustering (still in atocore.memory.similarity) ---
def test_similarity_module_server_side():
"""similarity.py stays server-side for ML deps. These helpers are
only invoked via the /admin/memory/dedup-cluster endpoint."""
from atocore.memory.similarity import cluster_by_threshold
clusters = cluster_by_threshold(
["duplicate fact A", "duplicate fact A slightly reworded",
"totally unrelated fact about firmware"],
threshold=0.7,
)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
# Three identical texts — min should be ~1.0
ms = mod.min_pairwise_similarity(["hello world"] * 3)
assert 0.99 <= ms <= 1.0
multi = [c for c in clusters if len(c) >= 2]
assert multi, "expected at least one multi-member cluster"
def test_min_pairwise_similarity_mixed_cluster():
"""Transitive cluster A~B~C with A and C actually quite different
should expose a low min even though A~B and B~C are high."""
import importlib.util
spec = importlib.util.spec_from_file_location(
"memory_dedup_for_test",
"scripts/memory_dedup.py",
)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
def test_cluster_endpoint_returns_groups(tmp_data_dir):
"""POST /admin/memory/dedup-cluster shape test — we just verify the
service layer produces the expected output. Full HTTP is
integration-tested by the live scan."""
from atocore.models.database import init_db
init_db()
from atocore.memory.service import create_memory, get_memories
create_memory("knowledge", "APM uses NX bridge for DXF to STL conversion",
project="apm")
create_memory("knowledge", "APM uses the NX Python bridge for DXF-to-STL",
project="apm")
create_memory("knowledge", "The polisher firmware requires USB SSD storage",
project="p06-polisher")
ms = mod.min_pairwise_similarity([
"Antoine prefers OAuth over API keys",
"Antoine's OAuth preference",
"USB SSD mandatory for polisher firmware",
])
assert ms < 0.6 # Third is unrelated; min is far below threshold
# Mirror the server code path
from atocore.memory.similarity import cluster_by_threshold
mems = get_memories(project="apm", active_only=True, limit=100)
texts = [m.content for m in mems]
clusters = cluster_by_threshold(texts, threshold=0.7)
multi = [c for c in clusters if len(c) >= 2]
assert multi, "expected the two APM memories to cluster together"
# Unrelated p06 memory should NOT be in that cluster
apm_ids = {mems[i].id for i in multi[0]}
assert len(apm_ids) == 2
all_ids = {m.id for m in mems}
assert apm_ids.issubset(all_ids)
# --- create_merge_candidate idempotency ---