feat: Phase 7A — semantic memory dedup ("sleep cycle" V1)

New table memory_merge_candidates + service functions to cluster near-duplicate active memories within (project, memory_type) buckets, draft a unified content via LLM, and merge on human approval. Source memories become superseded (never deleted); merged memory carries union of tags, max of confidence, sum of reference_count. - schema migration for memory_merge_candidates - atocore.memory.similarity: cosine + transitive clustering - atocore.memory._dedup_prompt: stdlib-only LLM prompt preserving every specific - service: merge_memories / create_merge_candidate / get_merge_candidates / reject_merge_candidate - scripts/memory_dedup.py: host-side detector (HTTP-only, idempotent) - 5 API endpoints under /admin/memory/merge-candidates* + /admin/memory/dedup-scan - triage UI: purple "🔗 Merge Candidates" section + "🔗 Scan for duplicates" bar - batch-extract.sh Step B3 (0.90 daily, 0.85 Sundays) - deploy/dalidou/dedup-watcher.sh for UI-triggered scans - 21 new tests (374 → 395) - docs/PHASE-7-MEMORY-CONSOLIDATION.md covering 7A-7H roadmap Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 10:30:49 -04:00
parent 9f262a21b0
commit 028d4c3594
12 changed files with 1860 additions and 8 deletions
--- a/src/atocore/memory/similarity.py
+++ b/src/atocore/memory/similarity.py
@@ -0,0 +1,88 @@
+"""Phase 7A (Memory Consolidation): semantic similarity helpers.
+
+Thin wrapper over ``atocore.retrieval.embeddings`` that exposes
+pairwise + batch cosine similarity on normalized embeddings. Used by
+the dedup detector to cluster near-duplicate active memories.
+
+Embeddings from ``embed_texts()`` are already L2-normalized, so cosine
+similarity reduces to a dot product — no extra normalization needed.
+"""
+
+from __future__ import annotations
+
+from atocore.retrieval.embeddings import embed_texts
+
+
+def _dot(a: list[float], b: list[float]) -> float:
+    """Return the dot product of ``a`` and ``b``.
+
+    Components are paired positionally with ``zip``, so if the two
+    inputs differ in length the trailing components of the longer one
+    are ignored rather than raising.
+    """
+    products = [left * right for left, right in zip(a, b)]
+    return sum(products)
+
+
+def cosine(a: list[float], b: list[float]) -> float:
+    """Cosine similarity of two already-normalized vectors, in [0, 1].
+
+    No normalization is performed here: the value is the plain dot
+    product of ``a`` and ``b``, which equals the cosine only when both
+    inputs are unit-norm (the module docstring states that
+    ``embed_texts()`` output is).
+
+    The result is clamped so that thresholds never see a value outside
+    [0, 1]: negative dot products become 0.0 and anything above 1.0
+    becomes 1.0. A NaN dot product (e.g. from a corrupt embedding) is
+    reported as 0.0, i.e. "not similar".
+    """
+    dot = _dot(a, b)
+    # ``not dot > 0.0`` is deliberately written this way instead of
+    # ``dot <= 0.0``: NaN compares False against everything, so this
+    # single test catches negatives, zero AND NaN. The naive
+    # ``max(0.0, min(1.0, dot))`` turns NaN into 1.0 because
+    # ``min(1.0, nan)`` keeps its first argument, which would make a
+    # broken vector look like a perfect duplicate.
+    if not dot > 0.0:
+        return 0.0
+    return min(1.0, dot)
+
+
+def compute_memory_similarity(text_a: str, text_b: str) -> float:
+    """Return the cosine similarity of two memory contents.
+
+    Intended for one-off comparisons and tests. Both texts are embedded
+    with a single ``embed_texts()`` call and the two resulting vectors
+    are passed to ``cosine()``. If either text is empty (falsy), 0.0 is
+    returned without embedding anything.
+
+    For batch work such as the dedup detector, embed once with
+    ``embed_texts()`` and build the similarity matrix from those
+    vectors, so shared texts are not re-embedded for every pair.
+    """
+    if text_a and text_b:
+        embedded = embed_texts([text_a, text_b])
+        return cosine(embedded[0], embedded[1])
+    # At least one side has no content to compare.
+    return 0.0
+
+
+def similarity_matrix(texts: list[str]) -> list[list[float]]:
+    """Build the symmetric N×N cosine similarity matrix for ``texts``.
+
+    All texts are embedded with one ``embed_texts()`` call. Entry
+    ``[i][j]`` is ``cosine()`` of the i-th and j-th embeddings; each
+    pair is computed once and mirrored across the diagonal. Diagonal
+    entries are set to 1.0 directly rather than computed. An empty
+    input yields an empty list.
+    """
+    if not texts:
+        return []
+    embedded = embed_texts(texts)
+    size = len(embedded)
+    # Start from the identity matrix so the diagonal is already correct.
+    result = [
+        [1.0 if row == col else 0.0 for col in range(size)]
+        for row in range(size)
+    ]
+    # Fill the upper triangle and mirror each value into the lower one.
+    for row in range(size):
+        for col in range(row + 1, size):
+            result[row][col] = result[col][row] = cosine(
+                embedded[row], embedded[col]
+            )
+    return result
+
+
+def cluster_by_threshold(texts: list[str], threshold: float) -> list[list[int]]:
+    """Group texts into transitive similarity clusters.
+
+    Two texts are linked when their entry in ``similarity_matrix()`` is
+    ``>= threshold``; clusters are the connected components of those
+    links, so A~B and B~C put A, B and C together even if A and C are
+    not directly above the threshold. The dedup detector relies on
+    this to emit one merge proposal for A~B~C instead of three pair
+    proposals.
+
+    Returns a list of clusters, each a list of indices into ``texts``
+    in ascending order. Clusters are ordered by their smallest index,
+    and texts with no link appear as singleton clusters. An empty
+    input yields an empty list.
+    """
+    if not texts:
+        return []
+    sims = similarity_matrix(texts)
+    count = len(texts)
+    # Disjoint-set forest: every index starts as the root of its own set.
+    root_of = list(range(count))
+
+    def root(node: int) -> int:
+        # Walk up to the set's root, pointing each visited node at its
+        # grandparent on the way so later lookups are shorter.
+        while root_of[node] != node:
+            root_of[node] = root_of[root_of[node]]
+            node = root_of[node]
+        return node
+
+    # Link every pair at or above the threshold (upper triangle only,
+    # since the matrix is symmetric).
+    for left in range(count):
+        for right in range(left + 1, count):
+            if sims[left][right] >= threshold:
+                left_root, right_root = root(left), root(right)
+                if left_root != right_root:
+                    root_of[left_root] = right_root
+
+    # Bucket indices by final root; dict insertion order keeps clusters
+    # ordered by first (smallest) member.
+    clusters: dict[int, list[int]] = {}
+    for index in range(count):
+        key = root(index)
+        if key not in clusters:
+            clusters[key] = []
+        clusters[key].append(index)
+    return list(clusters.values())