New table memory_merge_candidates + service functions to cluster near-duplicate active memories within (project, memory_type) buckets, draft a unified content via LLM, and merge on human approval. Source memories become superseded (never deleted); merged memory carries union of tags, max of confidence, sum of reference_count. - schema migration for memory_merge_candidates - atocore.memory.similarity: cosine + transitive clustering - atocore.memory._dedup_prompt: stdlib-only LLM prompt preserving every specific - service: merge_memories / create_merge_candidate / get_merge_candidates / reject_merge_candidate - scripts/memory_dedup.py: host-side detector (HTTP-only, idempotent) - 5 API endpoints under /admin/memory/merge-candidates* + /admin/memory/dedup-scan - triage UI: purple "🔗 Merge Candidates" section + "🔗 Scan for duplicates" bar - batch-extract.sh Step B3 (0.90 daily, 0.85 Sundays) - deploy/dalidou/dedup-watcher.sh for UI-triggered scans - 21 new tests (374 → 395) - docs/PHASE-7-MEMORY-CONSOLIDATION.md covering 7A-7H roadmap Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
89 lines
2.8 KiB
Python
89 lines
2.8 KiB
Python
"""Phase 7A (Memory Consolidation): semantic similarity helpers.
|
||
|
||
Thin wrapper over ``atocore.retrieval.embeddings`` that exposes
|
||
pairwise + batch cosine similarity on normalized embeddings. Used by
|
||
the dedup detector to cluster near-duplicate active memories.
|
||
|
||
Embeddings from ``embed_texts()`` are already L2-normalized, so cosine
|
||
similarity reduces to a dot product — no extra normalization needed.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from atocore.retrieval.embeddings import embed_texts
|
||
|
||
|
||
def _dot(a: list[float], b: list[float]) -> float:
    """Return the dot product of ``a`` and ``b``.

    Components are paired with ``zip``, so when the vectors differ in
    length the trailing components of the longer one are ignored.
    """
    return sum(x * y for x, y in zip(a, b))


def cosine(a: list[float], b: list[float]) -> float:
    """Cosine similarity on already-normalized vectors, clamped to [0, 1].

    The inputs are expected to be unit-norm (the module's embeddings are
    documented as L2-normalized), so the similarity is just the dot
    product. Negative values are clamped to 0.0 and values above 1.0 to
    1.0 so that nothing outside [0, 1] leaks into threshold comparisons.

    A NaN dot product (e.g. from a corrupted embedding) yields 0.0. It
    must never be reported as 1.0, because that would make the affected
    vector look like a perfect duplicate of everything it is compared to.

    Args:
        a: First vector.
        b: Second vector.

    Returns:
        The clamped dot product of ``a`` and ``b``.
    """
    raw = _dot(a, b)
    # Every ordered comparison against NaN is false, so NaN skips both
    # branches below and lands on the 0.0 fallback. A nested
    # max(0.0, min(1.0, raw)) would instead turn NaN into 1.0.
    if raw >= 1.0:
        return 1.0
    if raw > 0.0:
        return raw
    return 0.0
|
||
|
||
|
||
def compute_memory_similarity(text_a: str, text_b: str) -> float:
    """Embed two memory contents and return their cosine similarity.

    Intended for one-off comparisons and tests. Batch callers such as
    the dedup detector should call ``embed_texts()`` once over all their
    texts and build the similarity matrix from those vectors, so that
    texts shared between pairs are not embedded repeatedly.

    Args:
        text_a: First memory content.
        text_b: Second memory content.

    Returns:
        ``cosine()`` of the two embeddings, or 0.0 without embedding
        anything when either text is empty.
    """
    # Nothing to compare against: skip the embedding call entirely.
    if not (text_a and text_b):
        return 0.0

    embedded = embed_texts([text_a, text_b])
    first, second = embedded[0], embedded[1]
    return cosine(first, second)
|
||
|
||
|
||
def similarity_matrix(texts: list[str]) -> list[list[float]]:
    """Build the symmetric N×N cosine similarity matrix for ``texts``.

    All texts are embedded with a single ``embed_texts()`` call. Each
    unordered pair is scored once with ``cosine()`` and the score is
    written to both mirrored cells; the diagonal is fixed at 1.0.

    Args:
        texts: The texts to compare against each other.

    Returns:
        A list of N rows of N floats, or ``[]`` when ``texts`` is empty.
    """
    if not texts:
        return []

    embedded = embed_texts(texts)
    size = len(embedded)

    # Start from the identity matrix, then fill in the off-diagonal cells.
    grid = [
        [1.0 if row == col else 0.0 for col in range(size)]
        for row in range(size)
    ]
    for row in range(size):
        for col in range(row + 1, size):
            score = cosine(embedded[row], embedded[col])
            grid[row][col] = score
            grid[col][row] = score
    return grid
|
||
|
||
|
||
def cluster_by_threshold(texts: list[str], threshold: float) -> list[list[int]]:
    """Group texts into transitive similarity clusters.

    Any two texts whose pairwise similarity is at least ``threshold``
    end up in the same cluster, and membership is transitive: if A~B and
    B~C then A, B and C share one cluster even when sim(A, C) is below
    the threshold. The dedup detector relies on this to emit one merge
    proposal for A~B~C instead of three pair proposals.

    Args:
        texts: The texts to cluster.
        threshold: Minimum similarity (inclusive) that links two texts.

    Returns:
        A list of clusters, each a list of indices into ``texts`` in
        ascending order. Clusters are ordered by their smallest index
        and singletons are included. Empty input gives ``[]``.
    """
    if not texts:
        return []

    sims = similarity_matrix(texts)
    count = len(texts)
    # Disjoint-set forest: leader[k] is k's parent; a root is its own parent.
    leader = list(range(count))

    def root_of(node: int) -> int:
        # First pass: climb to the representative of node's set.
        top = node
        while leader[top] != top:
            top = leader[top]
        # Second pass: repoint every node on the path straight at the root.
        while leader[node] != top:
            above = leader[node]
            leader[node] = top
            node = above
        return top

    # Link every pair that meets the threshold (upper triangle only,
    # since the matrix is symmetric).
    for row in range(count):
        for col in range(row + 1, count):
            if sims[row][col] >= threshold:
                row_root = root_of(row)
                col_root = root_of(col)
                if row_root != col_root:
                    leader[row_root] = col_root

    # Bucket indices by root. Dict insertion order keeps clusters sorted
    # by their first (smallest) member.
    clusters: dict[int, list[int]] = {}
    for idx in range(count):
        key = root_of(idx)
        if key not in clusters:
            clusters[key] = []
        clusters[key].append(idx)
    return [*clusters.values()]
|