feat: Phase 7A — semantic memory dedup ("sleep cycle" V1)

New table memory_merge_candidates + service functions to cluster
near-duplicate active memories within (project, memory_type) buckets,
draft a unified content via LLM, and merge on human approval. Source
memories become superseded (never deleted); merged memory carries
union of tags, max of confidence, sum of reference_count.

- schema migration for memory_merge_candidates
- atocore.memory.similarity: cosine + transitive clustering
- atocore.memory._dedup_prompt: stdlib-only LLM prompt preserving every specific
- service: merge_memories / create_merge_candidate / get_merge_candidates / reject_merge_candidate
- scripts/memory_dedup.py: host-side detector (HTTP-only, idempotent)
- 5 API endpoints under /admin/memory/merge-candidates* + /admin/memory/dedup-scan
- triage UI: purple "🔗 Merge Candidates" section + "🔗 Scan for duplicates" bar
- batch-extract.sh Step B3 (0.90 daily, 0.85 Sundays)
- deploy/dalidou/dedup-watcher.sh for UI-triggered scans
- 21 new tests (374 → 395)
- docs/PHASE-7-MEMORY-CONSOLIDATION.md covering 7A-7H roadmap

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-18 10:30:49 -04:00
parent 9f262a21b0
commit 028d4c3594
12 changed files with 1860 additions and 8 deletions

View File

@@ -0,0 +1,88 @@
"""Phase 7A (Memory Consolidation): semantic similarity helpers.
Thin wrapper over ``atocore.retrieval.embeddings`` that exposes
pairwise + batch cosine similarity on normalized embeddings. Used by
the dedup detector to cluster near-duplicate active memories.
Embeddings from ``embed_texts()`` are already L2-normalized, so cosine
similarity reduces to a dot product — no extra normalization needed.
"""
from __future__ import annotations
from atocore.retrieval.embeddings import embed_texts
def _dot(a: list[float], b: list[float]) -> float:
return sum(x * y for x, y in zip(a, b))
def cosine(a: list[float], b: list[float]) -> float:
"""Cosine similarity on already-normalized vectors. Clamped to [0,1]
(embeddings use paraphrase-multilingual-MiniLM which is unit-norm,
and we never want negative values leaking into thresholds)."""
return max(0.0, min(1.0, _dot(a, b)))
def compute_memory_similarity(text_a: str, text_b: str) -> float:
"""Return cosine similarity of two memory contents in [0,1].
Convenience helper for one-off checks + tests. For batch work (the
dedup detector), use ``embed_texts()`` directly and compute the
similarity matrix yourself to avoid re-embedding shared texts.
"""
if not text_a or not text_b:
return 0.0
vecs = embed_texts([text_a, text_b])
return cosine(vecs[0], vecs[1])
def similarity_matrix(texts: list[str]) -> list[list[float]]:
"""N×N cosine similarity matrix. Diagonal is 1.0, symmetric."""
if not texts:
return []
vecs = embed_texts(texts)
n = len(vecs)
matrix = [[0.0] * n for _ in range(n)]
for i in range(n):
matrix[i][i] = 1.0
for j in range(i + 1, n):
s = cosine(vecs[i], vecs[j])
matrix[i][j] = s
matrix[j][i] = s
return matrix
def cluster_by_threshold(texts: list[str], threshold: float) -> list[list[int]]:
"""Greedy transitive clustering: if sim(i,j) >= threshold, merge.
Returns a list of clusters, each a list of indices into ``texts``.
Singletons are included. Used by the dedup detector to collapse
A~B~C into one merge proposal rather than three pair proposals.
"""
if not texts:
return []
matrix = similarity_matrix(texts)
n = len(texts)
parent = list(range(n))
def find(x: int) -> int:
while parent[x] != x:
parent[x] = parent[parent[x]]
x = parent[x]
return x
def union(x: int, y: int) -> None:
rx, ry = find(x), find(y)
if rx != ry:
parent[rx] = ry
for i in range(n):
for j in range(i + 1, n):
if matrix[i][j] >= threshold:
union(i, j)
groups: dict[int, list[int]] = {}
for i in range(n):
groups.setdefault(find(i), []).append(i)
return list(groups.values())