Files
ATOCore/src/atocore/memory/similarity.py
Anto01 028d4c3594 feat: Phase 7A — semantic memory dedup ("sleep cycle" V1)
New table memory_merge_candidates + service functions to cluster
near-duplicate active memories within (project, memory_type) buckets,
draft a unified content via LLM, and merge on human approval. Source
memories become superseded (never deleted); merged memory carries
union of tags, max of confidence, sum of reference_count.

- schema migration for memory_merge_candidates
- atocore.memory.similarity: cosine + transitive clustering
- atocore.memory._dedup_prompt: stdlib-only LLM prompt preserving every specific detail
- service: merge_memories / create_merge_candidate / get_merge_candidates / reject_merge_candidate
- scripts/memory_dedup.py: host-side detector (HTTP-only, idempotent)
- 5 API endpoints under /admin/memory/merge-candidates* + /admin/memory/dedup-scan
- triage UI: purple "🔗 Merge Candidates" section + "🔗 Scan for duplicates" bar
- batch-extract.sh Step B3 (0.90 daily, 0.85 Sundays)
- deploy/dalidou/dedup-watcher.sh for UI-triggered scans
- 21 new tests (374 → 395)
- docs/PHASE-7-MEMORY-CONSOLIDATION.md covering 7A-7H roadmap

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 10:30:49 -04:00

89 lines
2.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Phase 7A (Memory Consolidation): semantic similarity helpers.
Thin wrapper over ``atocore.retrieval.embeddings`` that exposes
pairwise + batch cosine similarity on normalized embeddings. Used by
the dedup detector to cluster near-duplicate active memories.
Embeddings from ``embed_texts()`` are already L2-normalized, so cosine
similarity reduces to a dot product — no extra normalization needed.
"""
from __future__ import annotations
from atocore.retrieval.embeddings import embed_texts
def _dot(a: list[float], b: list[float]) -> float:
return sum(x * y for x, y in zip(a, b))
def cosine(a: list[float], b: list[float]) -> float:
    """Cosine similarity of two already-normalized vectors, clamped to [0, 1].

    The embeddings are unit-norm (paraphrase-multilingual-MiniLM), so the
    dot product *is* the cosine; clamping keeps float drift and negative
    similarities from leaking into downstream thresholds.
    """
    raw = sum(x * y for x, y in zip(a, b))
    if raw <= 0.0:
        return 0.0
    return raw if raw < 1.0 else 1.0
def compute_memory_similarity(text_a: str, text_b: str) -> float:
    """Return the cosine similarity of two memory contents, in [0, 1].

    One-off convenience for spot checks and tests; an empty string on
    either side short-circuits to 0.0. Batch callers (the dedup detector)
    should embed once via ``embed_texts()`` and build their own similarity
    matrix instead of re-embedding shared texts pairwise.
    """
    if text_a and text_b:
        vec_a, vec_b = embed_texts([text_a, text_b])
        return cosine(vec_a, vec_b)
    return 0.0
def similarity_matrix(texts: list[str]) -> list[list[float]]:
    """Build the symmetric N×N cosine-similarity matrix for ``texts``.

    The diagonal is fixed at 1.0 and only the upper triangle is computed,
    mirrored into the lower triangle. Empty input yields ``[]``.
    """
    if not texts:
        return []
    vecs = embed_texts(texts)
    n = len(vecs)
    # Start from the identity pattern: self-similarity is 1.0 by definition.
    sims = [[1.0 if row == col else 0.0 for col in range(n)] for row in range(n)]
    for i, vec_i in enumerate(vecs):
        for j in range(i + 1, n):
            sims[i][j] = sims[j][i] = cosine(vec_i, vecs[j])
    return sims
def cluster_by_threshold(texts: list[str], threshold: float) -> list[list[int]]:
    """Greedily cluster ``texts`` by transitive similarity.

    Any pair with ``sim(i, j) >= threshold`` lands in the same cluster
    (union-find, so A~B and B~C collapse A, B, C together). The result is
    a list of clusters — including singletons — each a list of indices
    into ``texts``. The dedup detector uses this to emit one merge
    proposal per group instead of one per similar pair.
    """
    if not texts:
        return []
    count = len(texts)
    sims = similarity_matrix(texts)
    parent = list(range(count))

    def find(idx: int) -> int:
        # Path-halving: point every other node at its grandparent on the way up.
        while parent[idx] != idx:
            parent[idx] = parent[parent[idx]]
            idx = parent[idx]
        return idx

    for i in range(count):
        for j in range(i + 1, count):
            if sims[i][j] >= threshold:
                root_i, root_j = find(i), find(j)
                if root_i != root_j:
                    parent[root_i] = root_j

    clusters: dict[int, list[int]] = {}
    for i in range(count):
        clusters.setdefault(find(i), []).append(i)
    return list(clusters.values())