fix(7A): host-side memory_dedup.py must stay stdlib-only
Broke the dedup-watcher cron when I wrote memory_dedup.py in session
7A: imported atocore.memory.similarity, which transitively pulls
sentence-transformers + pydantic_settings onto host Python that
intentionally doesn't have them. Every UI-triggered + cron dedup scan
since 7A deployed was silently crashing with ModuleNotFoundError
(visible only in /home/papa/atocore-logs/dedup-ondemand-*.log).
I even documented this architecture rule in atocore.memory._llm_prompt
('This module MUST stay stdlib-only') then violated it one session
later. Shame.
Real fix — matches the extractor pattern:
- New endpoint POST /admin/memory/dedup-cluster on the server: takes
{project, similarity_threshold, max_clusters}, runs the embedding +
transitive-clustering inside the container where
sentence-transformers lives, returns cluster shape.
- scripts/memory_dedup.py now pure stdlib: pulls clusters via HTTP,
LLM-drafts merges via claude CLI, POSTs proposals back. No atocore
imports beyond the stdlib-only _dedup_prompt shared module.
- Regression test pins the rule: test_memory_dedup_script_is_stdlib_only
snapshots sys.modules before/after importing the script and asserts
no non-allowed atocore modules were pulled.
Also: similarity.py + cluster_by_threshold stay server-side, still
covered by the same tests that used to live in the host tier-helper
section.
Tests 459 → 458 (-1 via rewrite of obsolete host-tier helper tests,
+2 for the new stdlib-only regression + endpoint shape tests).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -162,77 +162,82 @@ def test_build_tier2_user_message_includes_tier1_draft():
|
||||
assert "verdict" in msg.lower()
|
||||
|
||||
|
||||
# --- Tiering helpers (min_pairwise_similarity, same_bucket) ---
|
||||
# --- Host script is stdlib-only (Phase 7A architecture rule) ---
|
||||
|
||||
|
||||
def test_same_bucket_true_for_matching():
|
||||
def test_memory_dedup_script_is_stdlib_only():
|
||||
"""The host-side scripts/memory_dedup.py must NOT import anything
|
||||
that pulls pydantic_settings, sentence-transformers, torch, etc.
|
||||
into the host Python. The only atocore-land module allowed is the
|
||||
stdlib-only prompt helper at atocore.memory._dedup_prompt.
|
||||
|
||||
This regression test prevents re-introducing the bug where the
|
||||
dedup-watcher on Dalidou host crashed with ModuleNotFoundError
|
||||
because someone imported atocore.memory.similarity (which pulls
|
||||
in atocore.retrieval.embeddings → sentence_transformers)."""
|
||||
import importlib.util
|
||||
import sys as _sys
|
||||
|
||||
before = set(_sys.modules.keys())
|
||||
spec = importlib.util.spec_from_file_location(
|
||||
"memory_dedup_for_test",
|
||||
"scripts/memory_dedup.py",
|
||||
"memory_dedup_for_test", "scripts/memory_dedup.py",
|
||||
)
|
||||
mod = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(mod)
|
||||
after = set(_sys.modules.keys())
|
||||
|
||||
sources = [
|
||||
{"memory_type": "knowledge", "project": "p04"},
|
||||
{"memory_type": "knowledge", "project": "p04"},
|
||||
]
|
||||
assert mod.same_bucket(sources) is True
|
||||
|
||||
|
||||
def test_same_bucket_false_for_mixed():
|
||||
import importlib.util
|
||||
spec = importlib.util.spec_from_file_location(
|
||||
"memory_dedup_for_test",
|
||||
"scripts/memory_dedup.py",
|
||||
new_atocore = sorted(m for m in (after - before) if m.startswith("atocore"))
|
||||
# Only the stdlib-only shared prompt module is allowed to load
|
||||
allowed = {"atocore", "atocore.memory", "atocore.memory._dedup_prompt"}
|
||||
disallowed = [m for m in new_atocore if m not in allowed]
|
||||
assert not disallowed, (
|
||||
f"scripts/memory_dedup.py pulled non-stdlib atocore modules "
|
||||
f"(will break host Python without ML deps): {disallowed}"
|
||||
)
|
||||
mod = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(mod)
|
||||
|
||||
# Different project
|
||||
assert mod.same_bucket([
|
||||
{"memory_type": "knowledge", "project": "p04"},
|
||||
{"memory_type": "knowledge", "project": "p05"},
|
||||
]) is False
|
||||
# Different memory_type
|
||||
assert mod.same_bucket([
|
||||
{"memory_type": "knowledge", "project": "p04"},
|
||||
{"memory_type": "project", "project": "p04"},
|
||||
]) is False
|
||||
|
||||
|
||||
def test_min_pairwise_similarity_identical_texts():
|
||||
import importlib.util
|
||||
spec = importlib.util.spec_from_file_location(
|
||||
"memory_dedup_for_test",
|
||||
"scripts/memory_dedup.py",
|
||||
# --- Server-side clustering (still in atocore.memory.similarity) ---
|
||||
|
||||
|
||||
def test_similarity_module_server_side():
|
||||
"""similarity.py stays server-side for ML deps. These helpers are
|
||||
only invoked via the /admin/memory/dedup-cluster endpoint."""
|
||||
from atocore.memory.similarity import cluster_by_threshold
|
||||
clusters = cluster_by_threshold(
|
||||
["duplicate fact A", "duplicate fact A slightly reworded",
|
||||
"totally unrelated fact about firmware"],
|
||||
threshold=0.7,
|
||||
)
|
||||
mod = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(mod)
|
||||
|
||||
# Three identical texts — min should be ~1.0
|
||||
ms = mod.min_pairwise_similarity(["hello world"] * 3)
|
||||
assert 0.99 <= ms <= 1.0
|
||||
multi = [c for c in clusters if len(c) >= 2]
|
||||
assert multi, "expected at least one multi-member cluster"
|
||||
|
||||
|
||||
def test_min_pairwise_similarity_mixed_cluster():
|
||||
"""Transitive cluster A~B~C with A and C actually quite different
|
||||
should expose a low min even though A~B and B~C are high."""
|
||||
import importlib.util
|
||||
spec = importlib.util.spec_from_file_location(
|
||||
"memory_dedup_for_test",
|
||||
"scripts/memory_dedup.py",
|
||||
)
|
||||
mod = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(mod)
|
||||
def test_cluster_endpoint_returns_groups(tmp_data_dir):
|
||||
"""POST /admin/memory/dedup-cluster shape test — we just verify the
|
||||
service layer produces the expected output. Full HTTP is
|
||||
integration-tested by the live scan."""
|
||||
from atocore.models.database import init_db
|
||||
init_db()
|
||||
from atocore.memory.service import create_memory, get_memories
|
||||
create_memory("knowledge", "APM uses NX bridge for DXF to STL conversion",
|
||||
project="apm")
|
||||
create_memory("knowledge", "APM uses the NX Python bridge for DXF-to-STL",
|
||||
project="apm")
|
||||
create_memory("knowledge", "The polisher firmware requires USB SSD storage",
|
||||
project="p06-polisher")
|
||||
|
||||
ms = mod.min_pairwise_similarity([
|
||||
"Antoine prefers OAuth over API keys",
|
||||
"Antoine's OAuth preference",
|
||||
"USB SSD mandatory for polisher firmware",
|
||||
])
|
||||
assert ms < 0.6 # Third is unrelated; min is far below threshold
|
||||
# Mirror the server code path
|
||||
from atocore.memory.similarity import cluster_by_threshold
|
||||
mems = get_memories(project="apm", active_only=True, limit=100)
|
||||
texts = [m.content for m in mems]
|
||||
clusters = cluster_by_threshold(texts, threshold=0.7)
|
||||
multi = [c for c in clusters if len(c) >= 2]
|
||||
assert multi, "expected the two APM memories to cluster together"
|
||||
# Unrelated p06 memory should NOT be in that cluster
|
||||
apm_ids = {mems[i].id for i in multi[0]}
|
||||
assert len(apm_ids) == 2
|
||||
all_ids = {m.id for m in mems}
|
||||
assert apm_ids.issubset(all_ids)
|
||||
|
||||
|
||||
# --- create_merge_candidate idempotency ---
|
||||
|
||||
Reference in New Issue
Block a user