feat: Phase 7A — semantic memory dedup ("sleep cycle" V1)

New table memory_merge_candidates + service functions to cluster
near-duplicate active memories within (project, memory_type) buckets,
draft a unified content via LLM, and merge on human approval. Source
memories become superseded (never deleted); merged memory carries
union of tags, max of confidence, sum of reference_count.

- schema migration for memory_merge_candidates
- atocore.memory.similarity: cosine + transitive clustering
- atocore.memory._dedup_prompt: stdlib-only LLM prompt + verdict parser; the prompt preserves every specific detail from the sources
- service: merge_memories / create_merge_candidate / get_merge_candidates / reject_merge_candidate
- scripts/memory_dedup.py: host-side detector (HTTP-only, idempotent)
- 6 API endpoints: 4 under /admin/memory/merge-candidates* + /admin/memory/dedup-scan + /admin/memory/dedup-status
- triage UI: purple "🔗 Merge Candidates" section + "🔗 Scan for duplicates" bar
- batch-extract.sh Step B3 (0.90 daily, 0.85 Sundays)
- deploy/dalidou/dedup-watcher.sh for UI-triggered scans
- 21 new tests (374 → 395)
- docs/PHASE-7-MEMORY-CONSOLIDATION.md covering 7A-7H roadmap

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-18 10:30:49 -04:00
parent 9f262a21b0
commit 028d4c3594
12 changed files with 1860 additions and 8 deletions

View File

@@ -1530,6 +1530,169 @@ def api_extend_reinforced() -> dict:
return {"extended_count": len(extended), "extensions": extended}
# --- Phase 7A: memory dedup / merge-candidate lifecycle ---
class MergeCandidateCreateBody(BaseModel):
    """Request body for POST /admin/memory/merge-candidates/create.

    Every field is forwarded unchanged to ``create_merge_candidate``.
    """

    memory_ids: list[str]  # ids of the source memories proposed for merging
    similarity: float = 0.0  # similarity score reported by the detector
    proposed_content: str  # drafted unified content
    proposed_memory_type: str = "knowledge"
    proposed_project: str = ""
    proposed_tags: list[str] = []
    proposed_confidence: float = 0.6
    reason: str = ""  # free-text explanation stored with the proposal
class MergeCandidateApproveBody(BaseModel):
    """Request body for approving a merge candidate."""

    actor: str = "human-triage"  # passed to merge_memories as ``actor``
    content: str | None = None  # passed to merge_memories as ``override_content``
    domain_tags: list[str] | None = None  # passed to merge_memories as ``override_tags``
class MergeCandidateRejectBody(BaseModel):
    """Request body for rejecting a merge candidate."""

    actor: str = "human-triage"  # passed to reject_merge_candidate as ``actor``
    note: str = ""  # passed to reject_merge_candidate as ``note``
class DedupScanRequestBody(BaseModel):
    """Request body for POST /admin/memory/dedup-scan."""

    project: str = ""  # stripped by the endpoint before being stored
    similarity_threshold: float = 0.88  # endpoint clamps this to [0.5, 0.99]
    max_batch: int = 50  # endpoint clamps this to [1, 200]
@router.get("/admin/memory/merge-candidates")
def api_list_merge_candidates(status: str = "pending", limit: int = 100) -> dict:
    """Phase 7A: return merge-candidate proposals for the triage UI.

    ``status`` and ``limit`` are handed straight to
    ``get_merge_candidates``; the response wraps the resulting list
    together with its length.
    """
    from atocore.memory.service import get_merge_candidates

    candidates = get_merge_candidates(status=status, limit=limit)
    response = {"candidates": candidates, "count": len(candidates)}
    return response
@router.post("/admin/memory/merge-candidates/create")
def api_create_merge_candidate(body: MergeCandidateCreateBody) -> dict:
    """Phase 7A: host-side dedup detector submits a proposal here.

    Server-side idempotency: ``create_merge_candidate`` returns ``None``
    when it declines to insert because a pending candidate already covers
    the same memory-id set. In that case the response is
    ``{"candidate_id": None, "duplicate": True}`` — the id of the existing
    row is NOT returned. Otherwise the new row id is returned with
    ``duplicate: False``.

    Raises:
        HTTPException: 400 when the service rejects the input with a
            ``ValueError``.
    """
    from atocore.memory.service import create_merge_candidate

    try:
        cid = create_merge_candidate(
            memory_ids=body.memory_ids,
            similarity=body.similarity,
            proposed_content=body.proposed_content,
            proposed_memory_type=body.proposed_memory_type,
            proposed_project=body.proposed_project,
            proposed_tags=body.proposed_tags,
            proposed_confidence=body.proposed_confidence,
            reason=body.reason,
        )
    except ValueError as e:
        # Chain the cause so the original validation error stays in tracebacks.
        raise HTTPException(status_code=400, detail=str(e)) from e
    if cid is None:
        return {"candidate_id": None, "duplicate": True}
    return {"candidate_id": cid, "duplicate": False}
@router.post("/admin/memory/merge-candidates/{candidate_id}/approve")
def api_approve_merge_candidate(candidate_id: str, body: MergeCandidateApproveBody) -> dict:
    """Phase 7A: carry out an approved merge.

    Delegates to ``merge_memories``, forwarding the optional content and
    tag edits from the request body. A ``None`` result from the service is
    reported as HTTP 409; otherwise the id of the merged memory is
    returned.
    """
    from atocore.memory.service import merge_memories

    merged_id = merge_memories(
        candidate_id=candidate_id,
        actor=body.actor,
        override_content=body.content,
        override_tags=body.domain_tags,
    )
    if merged_id is not None:
        return {
            "status": "approved",
            "candidate_id": candidate_id,
            "result_memory_id": merged_id,
        }
    raise HTTPException(
        status_code=409,
        detail="Merge could not execute (candidate not pending, or source memory tampered)",
    )
@router.post("/admin/memory/merge-candidates/{candidate_id}/reject")
def api_reject_merge_candidate(candidate_id: str, body: MergeCandidateRejectBody) -> dict:
    """Phase 7A: dismiss a merge candidate.

    Delegates to ``reject_merge_candidate``; a falsy result is reported
    as HTTP 404.
    """
    from atocore.memory.service import reject_merge_candidate

    rejected = reject_merge_candidate(candidate_id, actor=body.actor, note=body.note)
    if rejected:
        return {"status": "rejected", "candidate_id": candidate_id}
    raise HTTPException(status_code=404, detail="Candidate not found or already resolved")
@router.post("/admin/memory/dedup-scan")
def api_request_dedup_scan(body: DedupScanRequestBody) -> dict:
    """Phase 7A: request a host-side dedup scan.

    Writes a JSON flag into project_state (project "atocore", category
    "config", key "dedup_requested_at") carrying project + threshold +
    max_batch + request timestamp. A host cron watcher picks it up within
    ~2 min and runs scripts/memory_dedup.py. Mirrors
    /admin/graduation/request.

    The threshold is clamped to [0.5, 0.99] and max_batch to [1, 200].
    The response reports these effective values (the same ones stored in
    the flag), not the raw request values, so the caller sees what the
    scan will actually use.
    """
    import json as _json
    from datetime import datetime as _dt, timezone as _tz
    from atocore.context.project_state import set_state

    now = _dt.now(_tz.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # Normalize once so the stored flag and the response cannot disagree.
    project = (body.project or "").strip()
    threshold = max(0.5, min(0.99, body.similarity_threshold))
    max_batch = max(1, min(body.max_batch, 200))

    payload = _json.dumps({
        "project": project,
        "similarity_threshold": threshold,
        "max_batch": max_batch,
        "requested_at": now,
    })
    set_state(
        project_name="atocore",
        category="config",
        key="dedup_requested_at",
        value=payload,
        source="admin ui",
    )
    return {
        "requested_at": now,
        "project": project,
        "similarity_threshold": threshold,
        "max_batch": max_batch,
        "note": "Host watcher picks up within ~2 min. Poll /admin/memory/dedup-status for progress.",
    }
@router.get("/admin/memory/dedup-status")
def api_dedup_status() -> dict:
    """Phase 7A: snapshot of the dedup scan pipeline for UI polling.

    Walks the "atocore" project_state entries in the "config" and
    "status" categories and copies the dedup_* keys into a fixed-shape
    report. Any failure while reading state is swallowed and whatever has
    been collected so far (defaults otherwise) is returned.
    """
    import json as _json
    from atocore.context.project_state import get_state

    report = {
        "requested": None,
        "last_started_at": None,
        "last_finished_at": None,
        "last_result": None,
        "is_running": False,
    }
    # State keys whose value is copied verbatim into the report field.
    verbatim_fields = {
        "dedup_last_started_at": "last_started_at",
        "dedup_last_finished_at": "last_finished_at",
        "dedup_last_result": "last_result",
    }
    try:
        for entry in get_state("atocore"):
            if entry.category not in ("config", "status"):
                continue
            key = entry.key
            if key == "dedup_requested_at":
                # The request flag is stored as JSON; fall back to the raw text.
                try:
                    report["requested"] = _json.loads(entry.value)
                except Exception:
                    report["requested"] = {"raw": entry.value}
            elif key in verbatim_fields:
                report[verbatim_fields[key]] = entry.value
            elif key == "dedup_running":
                report["is_running"] = (entry.value == "1")
    except Exception:
        pass
    return report
@router.get("/admin/graduation/stats")
def api_graduation_stats() -> dict:
"""Phase 5F graduation stats for dashboard."""

View File

@@ -377,6 +377,177 @@ _ENTITY_TRIAGE_CSS = """
"""
# ---------------------------------------------------------------------
# Phase 7A — Merge candidates (semantic dedup)
# ---------------------------------------------------------------------
# Stylesheet for the merge-candidate cards: the purple accent on the card
# border / type badge / approve button, plus the layout of the source list
# and the proposed-merge box.
_MERGE_TRIAGE_CSS = """
<style>
.cand-merge { border-left: 3px solid #8b5cf6; }
.merge-type { background: #8b5cf6; color: white; padding: 0.1rem 0.5rem; border-radius: 3px; font-size: 0.75rem; }
.merge-sources { margin: 0.5rem 0 0.8rem 0; display: flex; flex-direction: column; gap: 0.35rem; }
.merge-source { background: var(--bg); border: 1px dashed var(--border); border-radius: 4px; padding: 0.4rem 0.6rem; font-size: 0.85rem; }
.merge-source-meta { font-family: monospace; font-size: 0.72rem; opacity: 0.7; margin-bottom: 0.2rem; }
.merge-arrow { text-align: center; font-size: 1.1rem; opacity: 0.5; margin: 0.3rem 0; }
.merge-proposed { background: var(--card); border: 1px solid #8b5cf6; border-radius: 4px; padding: 0.5rem; }
.btn-merge-approve { background: #8b5cf6; color: white; border-color: #8b5cf6; }
.btn-merge-approve:hover { background: #7c3aed; }
</style>
"""
def _render_merge_card(cand: dict) -> str:
    """Render one merge candidate as an HTML triage card.

    ``cand`` is a dict carrying ``id``, ``similarity``, ``sources``,
    ``proposed_content``, ``proposed_tags``, ``proposed_project`` and
    ``reason``. Each entry of ``sources`` is a dict with ``id``,
    ``memory_type``, ``project``, ``confidence``, ``reference_count`` and
    ``content``.

    Missing keys AND keys present with a ``None`` value are both treated
    as empty/zero, so a NULL column on a single source row cannot take
    down the whole triage page. Every interpolated text value goes
    through ``_escape``.
    """
    cid = _escape(cand.get("id") or "")
    sim = cand.get("similarity") or 0.0
    sources = cand.get("sources") or []
    proposed_content = cand.get("proposed_content") or ""
    proposed_tags = cand.get("proposed_tags") or []
    proposed_project = cand.get("proposed_project") or ""
    reason = cand.get("reason") or ""

    # `x or default` (rather than .get(key, default)) also covers None values.
    src_html = "".join(
        f"""
<div class="merge-source">
<div class="merge-source-meta">
{_escape((s.get('id') or '')[:8])} · [{_escape(s.get('memory_type') or '')}]
· {_escape(s.get('project') or '(global)')}
· conf {float(s.get('confidence') or 0):.2f}
· refs {int(s.get('reference_count') or 0)}
</div>
<div>{_escape((s.get('content') or '')[:300])}</div>
</div>
"""
        for s in sources
    )
    # str() guards against non-string tags in a stored proposal.
    tags_str = ", ".join(str(t) for t in proposed_tags)
    reason_html = (
        f'<div class="auto-triage-msg" style="margin-top:0.4rem;">💡 {_escape(reason)}</div>'
        if reason
        else ""
    )
    return f"""
<div class="cand cand-merge" id="mcand-{cid}" data-merge-id="{cid}">
<div class="cand-head">
<span class="cand-type merge-type">[merge · {len(sources)} sources]</span>
<span class="cand-project">{_escape(proposed_project or '(global)')}</span>
<span class="cand-meta">sim ≥ {sim:.2f}</span>
</div>
<div class="merge-sources">{src_html}</div>
<div class="merge-arrow">↓ merged into ↓</div>
<div class="merge-proposed">
<textarea class="cand-content" id="mcontent-{cid}">{_escape(proposed_content)}</textarea>
<div class="cand-meta-row">
<label class="cand-field-label">Tags:
<input type="text" class="cand-tags-input" id="mtags-{cid}" value="{_escape(tags_str)}" placeholder="tag1, tag2">
</label>
</div>
{reason_html}
</div>
<div class="cand-actions">
<button class="btn-merge-approve" data-merge-id="{cid}" title="Approve merge">✅ Approve Merge</button>
<button class="btn-reject" data-merge-id="{cid}" data-merge-reject="1" title="Keep separate">❌ Keep Separate</button>
</div>
<div class="cand-status" id="mstatus-{cid}"></div>
</div>
"""
_MERGE_TRIAGE_SCRIPT = """
<script>
async function mergeApprove(id) {
const st = document.getElementById('mstatus-' + id);
st.textContent = 'Merging…';
st.className = 'cand-status ok';
const content = document.getElementById('mcontent-' + id).value;
const tagsRaw = document.getElementById('mtags-' + id).value;
const tags = tagsRaw.split(',').map(t => t.trim()).filter(Boolean);
const r = await fetch('/admin/memory/merge-candidates/' + encodeURIComponent(id) + '/approve', {
method: 'POST',
headers: {'Content-Type': 'application/json'},
body: JSON.stringify({actor: 'human-triage', content: content, domain_tags: tags}),
});
if (r.ok) {
const data = await r.json();
st.textContent = '✅ Merged → ' + (data.result_memory_id || '').slice(0, 8);
setTimeout(() => {
const card = document.getElementById('mcand-' + id);
if (card) { card.style.opacity = '0'; setTimeout(() => card.remove(), 300); }
}, 600);
} else {
const err = await r.text();
st.textContent = '' + r.status + ': ' + err.slice(0, 120);
st.className = 'cand-status err';
}
}
async function mergeReject(id) {
const st = document.getElementById('mstatus-' + id);
st.textContent = 'Rejecting…';
st.className = 'cand-status ok';
const r = await fetch('/admin/memory/merge-candidates/' + encodeURIComponent(id) + '/reject', {
method: 'POST',
headers: {'Content-Type': 'application/json'},
body: JSON.stringify({actor: 'human-triage'}),
});
if (r.ok) {
st.textContent = '❌ Kept separate';
setTimeout(() => {
const card = document.getElementById('mcand-' + id);
if (card) { card.style.opacity = '0'; setTimeout(() => card.remove(), 300); }
}, 400);
} else st.textContent = '' + r.status;
}
document.addEventListener('click', (e) => {
const mid = e.target.dataset?.mergeId;
if (!mid) return;
if (e.target.classList.contains('btn-merge-approve')) mergeApprove(mid);
else if (e.target.dataset?.mergeReject) mergeReject(mid);
});
async function requestDedupScan() {
const btn = document.getElementById('dedup-btn');
const status = document.getElementById('dedup-status');
btn.disabled = true;
btn.textContent = 'Queuing…';
status.textContent = '';
status.className = 'auto-triage-msg';
const threshold = parseFloat(document.getElementById('dedup-threshold').value || '0.88');
const r = await fetch('/admin/memory/dedup-scan', {
method: 'POST',
headers: {'Content-Type': 'application/json'},
body: JSON.stringify({project: '', similarity_threshold: threshold, max_batch: 50}),
});
if (r.ok) {
status.textContent = `✓ Queued dedup scan at threshold ${threshold}. Host watcher runs every 2 min; refresh in ~3 min to see merge candidates.`;
status.className = 'auto-triage-msg ok';
} else {
status.textContent = '' + r.status;
status.className = 'auto-triage-msg err';
}
setTimeout(() => {
btn.disabled = false;
btn.textContent = '🔗 Scan for duplicates';
}, 2000);
}
</script>
"""
def _render_dedup_bar() -> str:
return """
<div class="auto-triage-bar">
<button id="dedup-btn" onclick="requestDedupScan()" title="Run semantic dedup scan on Dalidou host">
🔗 Scan for duplicates
</button>
<label class="cand-field-label" style="margin:0 0.5rem;">
Threshold:
<input id="dedup-threshold" type="number" min="0.70" max="0.99" step="0.01" value="0.88"
style="width:70px; padding:0.25rem; background:var(--bg); color:var(--text); border:1px solid var(--border); border-radius:3px;">
</label>
<span id="dedup-status" class="auto-triage-msg">
Finds semantically near-duplicate active memories and proposes LLM-drafted merges for review. Source memories become <code>superseded</code> on approve; nothing is deleted.
</span>
</div>
"""
def _render_graduation_bar() -> str:
"""The 'Graduate memories → entity candidates' control bar."""
from atocore.projects.registry import load_project_registry
@@ -478,26 +649,51 @@ def render_triage_page(limit: int = 100) -> str:
except Exception as e:
entity_candidates = []
total = len(mem_candidates) + len(entity_candidates)
try:
from atocore.memory.service import get_merge_candidates
merge_candidates = get_merge_candidates(status="pending", limit=limit)
except Exception:
merge_candidates = []
total = len(mem_candidates) + len(entity_candidates) + len(merge_candidates)
graduation_bar = _render_graduation_bar()
dedup_bar = _render_dedup_bar()
if total == 0:
body = _TRIAGE_CSS + _ENTITY_TRIAGE_CSS + f"""
body = _TRIAGE_CSS + _ENTITY_TRIAGE_CSS + _MERGE_TRIAGE_CSS + f"""
<div class="triage-header">
<h1>Triage Queue</h1>
</div>
{graduation_bar}
{dedup_bar}
<div class="empty">
<p>🎉 No candidates to review.</p>
<p>The auto-triage pipeline keeps this queue empty unless something needs your judgment.</p>
<p>Use the 🎓 Graduate memories button above to propose new entity candidates from existing memories.</p>
<p>Use 🎓 Graduate memories to propose entity candidates, or 🔗 Scan for duplicates to find near-duplicate memories to merge.</p>
</div>
""" + _GRADUATION_SCRIPT
""" + _GRADUATION_SCRIPT + _MERGE_TRIAGE_SCRIPT
return render_html("Triage — AtoCore", body, breadcrumbs=[("Wiki", "/wiki"), ("Triage", "")])
# Memory cards
mem_cards = "".join(_render_candidate_card(c) for c in mem_candidates)
# Merge cards (Phase 7A)
merge_cards_html = ""
if merge_candidates:
merge_cards = "".join(_render_merge_card(c) for c in merge_candidates)
merge_cards_html = f"""
<div class="section-break">
<h2>🔗 Merge Candidates ({len(merge_candidates)})</h2>
<p class="auto-triage-msg">
Semantically near-duplicate active memories. Approving merges the sources
into the proposed unified memory; sources become <code>superseded</code>
(not deleted — still queryable). You can edit the draft content and tags
before approving.
</p>
</div>
{merge_cards}
"""
# Entity cards
ent_cards_html = ""
if entity_candidates:
@@ -513,11 +709,12 @@ def render_triage_page(limit: int = 100) -> str:
{ent_cards}
"""
body = _TRIAGE_CSS + _ENTITY_TRIAGE_CSS + f"""
body = _TRIAGE_CSS + _ENTITY_TRIAGE_CSS + _MERGE_TRIAGE_CSS + f"""
<div class="triage-header">
<h1>Triage Queue</h1>
<span class="count">
<span id="cand-count">{len(mem_candidates)}</span> memory ·
{len(merge_candidates)} merge ·
{len(entity_candidates)} entity
</span>
</div>
@@ -536,10 +733,12 @@ def render_triage_page(limit: int = 100) -> str:
</span>
</div>
{graduation_bar}
{dedup_bar}
<h2>📝 Memory Candidates ({len(mem_candidates)})</h2>
{mem_cards}
{merge_cards_html}
{ent_cards_html}
""" + _TRIAGE_SCRIPT + _ENTITY_TRIAGE_SCRIPT + _GRADUATION_SCRIPT
""" + _TRIAGE_SCRIPT + _ENTITY_TRIAGE_SCRIPT + _GRADUATION_SCRIPT + _MERGE_TRIAGE_SCRIPT
return render_html(
"Triage — AtoCore",

View File

@@ -0,0 +1,156 @@
"""Shared LLM prompt + parser for memory dedup (Phase 7A).
Stdlib-only — must be importable from both the in-container service
layer (when a user clicks "scan for duplicates" in the UI) and the
host-side batch script (``scripts/memory_dedup.py``), which runs on
Dalidou where the container's Python deps are not available.
The prompt instructs the model to draft a UNIFIED memory that
preserves every specific detail from the sources. We never want a
merge to lose information — if two memories disagree on a number, the
merged content should surface both with context.
"""
from __future__ import annotations
import json
from typing import Any
# Version tag for this prompt.
DEDUP_PROMPT_VERSION = "dedup-0.1.0"
# Per-source content is cut to this many characters in build_user_message.
MAX_CONTENT_CHARS = 1000
MAX_SOURCES = 8 # cluster size cap — bigger clusters are suspicious
# System prompt sent to the model. It asks for a raw JSON verdict with the
# fields action/content/memory_type/project/domain_tags/confidence/reason.
# NOTE(review): the YOU MUST list asks the model to "return valid_until",
# but the OUTPUT schema below has no valid_until field — confirm whether
# the instruction or the schema is the intended one.
# NOTE(review): the prompt asks for content under 500 characters, while
# normalize_merge_verdict only truncates at 1000 — confirm the intended cap.
SYSTEM_PROMPT = """You consolidate near-duplicate memories for AtoCore, a personal context engine.
Given 2-8 memories that a semantic-similarity scan flagged as likely duplicates, draft a UNIFIED replacement that preserves every specific detail from every source.
CORE PRINCIPLE: information never gets lost. If the sources disagree on a number, date, vendor, or spec, surface BOTH with attribution (e.g., "quoted at $3.2k on 2026-03-01, revised to $3.8k on 2026-04-10"). If one source is more specific than another, keep the specificity. If they say the same thing differently, pick the clearer wording.
YOU MUST:
- Produce content under 500 characters that reads as a single coherent statement
- Keep all project/vendor/person/part names that appear in any source
- Keep all numbers, dates, and identifiers
- Keep the strongest claim wording ("ratified", "decided", "committed") if any source has it
- Propose domain_tags as a UNION of the sources' tags (lowercase, deduped, cap 6)
- Return valid_until = latest non-null valid_until across sources, or null if any source has null (permanent beats transient)
REFUSE TO MERGE (return action="reject") if:
- The memories are actually about DIFFERENT subjects that just share vocabulary (e.g., "p04 mirror" and "p05 mirror" — same project bucket means same project, but different components)
- One memory CONTRADICTS another and you cannot reconcile them — flag for contradiction review instead
- The sources span different time snapshots of a changing state that should stay as a timeline, not be collapsed
OUTPUT — raw JSON, no prose, no markdown fences:
{
"action": "merge" | "reject",
"content": "the unified memory content",
"memory_type": "knowledge|project|preference|adaptation|episodic|identity",
"project": "project-slug or empty",
"domain_tags": ["tag1", "tag2"],
"confidence": 0.5,
"reason": "one sentence explaining the merge (or the rejection)"
}
On action=reject, still fill content with a short explanation and set confidence=0."""
def build_user_message(sources: list[dict[str, Any]]) -> str:
    """Format the source memories for the model to consolidate.

    Each source dict should carry id, content, project, memory_type,
    domain_tags, confidence, valid_until, reference_count. Only the first
    ``MAX_SOURCES`` entries are rendered, and the count announced in the
    header matches what is actually listed. Missing or ``None`` values
    for id / confidence / reference_count fall back to ``?`` / 0 instead
    of raising.
    """
    shown = sources[:MAX_SOURCES]
    lines = [f"You have {len(shown)} source memories in the same (project, memory_type) bucket:\n"]
    for i, src in enumerate(shown, start=1):
        tags = src.get("domain_tags") or []
        if isinstance(tags, str):
            # Tags may arrive as the JSON text stored in the database.
            try:
                tags = json.loads(tags)
            except (ValueError, RecursionError):
                tags = []
        try:
            confidence = float(src.get("confidence") or 0)
        except (TypeError, ValueError):
            confidence = 0.0
        lines.append(
            f"--- Source {i} (id={str(src.get('id') or '?')[:8]}, "
            f"refs={src.get('reference_count') or 0}, "
            f"conf={confidence:.2f}, "
            f"valid_until={src.get('valid_until') or 'permanent'}) ---"
        )
        lines.append(f"project: {src.get('project','')}")
        lines.append(f"type: {src.get('memory_type','')}")
        lines.append(f"tags: {tags}")
        lines.append(f"content: {(src.get('content') or '')[:MAX_CONTENT_CHARS]}")
        lines.append("")
    lines.append("Return the JSON object now.")
    return "\n".join(lines)
def parse_merge_verdict(raw_output: str) -> dict[str, Any] | None:
    """Extract the JSON verdict object from raw model output.

    Tolerates markdown fences and prose before OR after the object: the
    whole text is tried first, then the span from the first ``{`` to the
    last ``}``. The first attempt that decodes to a dict wins.

    Returns:
        The parsed dict, or ``None`` when no attempt yields a JSON object.
    """
    text = (raw_output or "").strip()
    if not text:
        return None

    attempts = [text]
    start = text.find("{")
    end = text.rfind("}")
    if 0 <= start < end:
        snippet = text[start:end + 1]
        if snippet != text:
            attempts.append(snippet)

    for attempt in attempts:
        try:
            parsed = json.loads(attempt)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None
def normalize_merge_verdict(verdict: dict[str, Any]) -> dict[str, Any] | None:
    """Validate and normalize a raw merge verdict.

    Returns ``None`` if the verdict is unusable (unknown action or no
    content). Otherwise returns a dict with:

    - ``action``: "merge" or "reject"
    - ``content``: stripped, cut to 1000 characters
    - ``memory_type``: lowercased, defaulting to "knowledge"
    - ``project``: stripped
    - ``domain_tags``: lowercased, de-duplicated string tags, at most 6
    - ``confidence``: float clamped to [0, 1]; 0.5 when missing,
      unparsable, or NaN
    - ``reason``: stripped, cut to 500 characters
    """
    action = str(verdict.get("action") or "").strip().lower()
    if action not in ("merge", "reject"):
        return None
    content = str(verdict.get("content") or "").strip()
    if not content:
        return None
    memory_type = str(verdict.get("memory_type") or "knowledge").strip().lower()
    project = str(verdict.get("project") or "").strip()

    raw_tags = verdict.get("domain_tags") or []
    if isinstance(raw_tags, str):
        raw_tags = [t.strip() for t in raw_tags.split(",") if t.strip()]
    if not isinstance(raw_tags, list):
        raw_tags = []
    tags: list[str] = []
    # Apply the cap to the cleaned list: slicing the raw list first would let
    # duplicates and non-string entries crowd out valid tags.
    for t in raw_tags:
        if not isinstance(t, str):
            continue
        tt = t.strip().lower()
        if tt and tt not in tags:
            tags.append(tt)
            if len(tags) == 6:
                break

    try:
        confidence = float(verdict.get("confidence", 0.5))
    except (TypeError, ValueError):
        confidence = 0.5
    if confidence != confidence:
        # NaN (json.loads accepts the literal): min/max would turn it into 1.0.
        confidence = 0.5
    confidence = max(0.0, min(1.0, confidence))
    reason = str(verdict.get("reason") or "").strip()[:500]
    return {
        "action": action,
        "content": content[:1000],
        "memory_type": memory_type,
        "project": project,
        "domain_tags": tags,
        "confidence": confidence,
        "reason": reason,
    }

View File

@@ -925,3 +925,327 @@ def _row_to_memory(row) -> Memory:
def _validate_confidence(confidence: float) -> None:
if not 0.0 <= confidence <= 1.0:
raise ValueError("Confidence must be between 0.0 and 1.0")
# ---------------------------------------------------------------------
# Phase 7A — Memory Consolidation: merge-candidate lifecycle
# ---------------------------------------------------------------------
#
# The detector (scripts/memory_dedup.py) writes proposals into
# memory_merge_candidates. The triage UI lists pending rows, a human
# reviews, and on approve we execute the merge here — never at detect
# time. This keeps the audit trail clean: every mutation is a human
# decision.
def create_merge_candidate(
    memory_ids: list[str],
    similarity: float,
    proposed_content: str,
    proposed_memory_type: str,
    proposed_project: str,
    proposed_tags: list[str] | None = None,
    proposed_confidence: float = 0.6,
    reason: str = "",
) -> str | None:
    """Insert a merge-candidate row.

    Returns the new row id, or ``None`` if a pending candidate already
    covers this exact set of memory ids (idempotent scan — re-running the
    detector doesn't double-create).

    Raises:
        ValueError: if fewer than 2 DISTINCT memory ids are supplied. The
            check runs after de-duplication, so ``["a", "a"]`` is rejected
            rather than stored as a one-source "merge".
    """
    import json as _json

    memory_ids_sorted = sorted(set(memory_ids or []))
    if len(memory_ids_sorted) < 2:
        raise ValueError("merge candidate requires at least 2 memory_ids")

    # Normalize once so the stored value and the log line agree, and a None
    # similarity cannot crash the log call after the row is already written.
    similarity_value = float(similarity or 0.0)
    confidence_value = max(0.0, min(1.0, float(proposed_confidence)))

    memory_ids_json = _json.dumps(memory_ids_sorted)
    tags_json = _json.dumps(_normalize_tags(proposed_tags))
    candidate_id = str(uuid.uuid4())
    with get_connection() as conn:
        # Idempotency: same sorted-id set already pending? skip.
        existing = conn.execute(
            "SELECT id FROM memory_merge_candidates "
            "WHERE status = 'pending' AND memory_ids = ?",
            (memory_ids_json,),
        ).fetchone()
        if existing:
            return None
        conn.execute(
            "INSERT INTO memory_merge_candidates "
            "(id, status, memory_ids, similarity, proposed_content, "
            "proposed_memory_type, proposed_project, proposed_tags, "
            "proposed_confidence, reason) "
            "VALUES (?, 'pending', ?, ?, ?, ?, ?, ?, ?, ?)",
            (
                candidate_id, memory_ids_json, similarity_value,
                (proposed_content or "")[:2000],
                (proposed_memory_type or "knowledge")[:50],
                (proposed_project or "")[:100],
                tags_json,
                confidence_value,
                (reason or "")[:500],
            ),
        )
    log.info(
        "merge_candidate_created",
        candidate_id=candidate_id,
        memory_count=len(memory_ids_sorted),
        similarity=round(similarity_value, 4),
    )
    return candidate_id
def get_merge_candidates(status: str = "pending", limit: int = 100) -> list[dict]:
    """Return merge candidates in ``status``, newest first, with sources inlined.

    Each result dict mirrors the candidate row (JSON columns decoded,
    NULL text columns replaced by defaults) plus a ``sources`` list
    holding one dict per referenced memory that still exists. JSON
    columns that fail to decode are treated as empty lists.
    """
    import json as _json

    def _decode_list(raw):
        # Undecodable JSON is treated as "no entries".
        try:
            return _json.loads(raw or "[]")
        except Exception:
            return []

    source_query = (
        "SELECT id, memory_type, content, project, confidence, "
        "status, reference_count, domain_tags, valid_until "
        "FROM memories WHERE id = ?"
    )
    results = []
    with get_connection() as conn:
        candidate_rows = conn.execute(
            "SELECT * FROM memory_merge_candidates "
            "WHERE status = ? ORDER BY created_at DESC LIMIT ?",
            (status, limit),
        ).fetchall()
        for cand in candidate_rows:
            mem_ids = _decode_list(cand["memory_ids"])
            tags = _decode_list(cand["proposed_tags"])
            sources = []
            for mid in mem_ids:
                src = conn.execute(source_query, (mid,)).fetchone()
                if not src:
                    continue
                sources.append({
                    "id": src["id"],
                    "memory_type": src["memory_type"],
                    "content": src["content"],
                    "project": src["project"] or "",
                    "confidence": src["confidence"],
                    "status": src["status"],
                    "reference_count": int(src["reference_count"] or 0),
                    "domain_tags": _decode_list(src["domain_tags"]),
                    "valid_until": src["valid_until"] or "",
                })
            results.append({
                "id": cand["id"],
                "status": cand["status"],
                "memory_ids": mem_ids,
                "similarity": cand["similarity"],
                "proposed_content": cand["proposed_content"] or "",
                "proposed_memory_type": cand["proposed_memory_type"] or "knowledge",
                "proposed_project": cand["proposed_project"] or "",
                "proposed_tags": tags,
                "proposed_confidence": cand["proposed_confidence"],
                "reason": cand["reason"] or "",
                "created_at": cand["created_at"],
                "resolved_at": cand["resolved_at"],
                "resolved_by": cand["resolved_by"],
                "result_memory_id": cand["result_memory_id"],
                "sources": sources,
            })
    return results
def reject_merge_candidate(candidate_id: str, actor: str = "human-triage", note: str = "") -> bool:
    """Flip a pending merge candidate to ``rejected``.

    Only the candidate row is updated; the memories table is not touched.
    Returns ``False`` when no pending row matched ``candidate_id``,
    ``True`` otherwise. ``note`` is only written to the log line (first
    100 characters).
    """
    resolved_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    with get_connection() as conn:
        cursor = conn.execute(
            "UPDATE memory_merge_candidates "
            "SET status = 'rejected', resolved_at = ?, resolved_by = ? "
            "WHERE id = ? AND status = 'pending'",
            (resolved_at, actor, candidate_id),
        )
        matched = cursor.rowcount != 0
    if matched:
        log.info("merge_candidate_rejected", candidate_id=candidate_id, actor=actor, note=note[:100])
    return matched
def merge_memories(
    candidate_id: str,
    actor: str = "human-triage",
    override_content: str | None = None,
    override_tags: list[str] | None = None,
) -> str | None:
    """Execute an approved merge candidate.

    1. Validate all source memories still status=active
    2. Create the new merged memory (status=active)
    3. Mark each source status=superseded
    4. Mark the candidate status=approved, record result_memory_id
    5. Write audit rows: one ``created_via_merge`` on the new memory and
       one ``superseded`` per source pointing at the new merged id

    Merged fields: confidence is the max over sources, reference_count
    the sum, valid_until is permanent (NULL) if any source is permanent,
    otherwise the latest source value.

    ``override_content`` and ``override_tags`` let the UI pass the human's
    edits before clicking approve. An override that is empty or
    whitespace-only falls back to the proposed content rather than
    aborting the merge.

    Returns:
        The new merged memory's id, or ``None`` if the candidate cannot
        be executed (missing or already resolved, fewer than 2 source
        ids, a source that is missing or no longer active, or no usable
        content).
    """
    import json as _json

    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

    with get_connection() as conn:
        row = conn.execute(
            "SELECT * FROM memory_merge_candidates WHERE id = ?",
            (candidate_id,),
        ).fetchone()
        if row is None or row["status"] != "pending":
            log.warning("merge_candidate_not_pending", candidate_id=candidate_id)
            return None

        try:
            mem_ids = _json.loads(row["memory_ids"] or "[]")
        except Exception:
            mem_ids = []
        if not mem_ids or len(mem_ids) < 2:
            log.warning("merge_candidate_invalid_memory_ids", candidate_id=candidate_id)
            return None

        # Snapshot sources + validate all active
        source_rows = []
        for mid in mem_ids:
            srow = conn.execute(
                "SELECT * FROM memories WHERE id = ?", (mid,)
            ).fetchone()
            if srow is None or srow["status"] != "active":
                log.warning(
                    "merge_source_not_active",
                    candidate_id=candidate_id,
                    memory_id=mid,
                    actual_status=(srow["status"] if srow else "missing"),
                )
                return None
            source_rows.append(srow)

        # Build merged memory fields — prefer human overrides, then proposed.
        # Strip each option before choosing so a whitespace-only override is
        # treated like an empty one instead of blanking out the content.
        content = (override_content or "").strip() or (row["proposed_content"] or "").strip()
        if not content:
            log.warning("merge_candidate_empty_content", candidate_id=candidate_id)
            return None

        merged_type = (row["proposed_memory_type"] or source_rows[0]["memory_type"]).lower()
        if merged_type not in MEMORY_TYPES:
            merged_type = source_rows[0]["memory_type"]

        merged_project = row["proposed_project"] or source_rows[0]["project"] or ""
        merged_project = resolve_project_name(merged_project)

        # Tags: override wins, else proposed, else union of sources
        if override_tags is not None:
            merged_tags = _normalize_tags(override_tags)
        else:
            try:
                proposed_tags = _json.loads(row["proposed_tags"] or "[]")
            except Exception:
                proposed_tags = []
            if proposed_tags:
                merged_tags = _normalize_tags(proposed_tags)
            else:
                union: list[str] = []
                for srow in source_rows:
                    try:
                        stags = _json.loads(srow["domain_tags"] or "[]")
                    except Exception:
                        stags = []
                    for t in stags:
                        if isinstance(t, str) and t and t not in union:
                            union.append(t)
                merged_tags = union

        # confidence = max; reference_count = sum. A NULL confidence counts
        # as 0.0 rather than crashing the merge.
        merged_confidence = max(float(s["confidence"] or 0.0) for s in source_rows)
        total_refs = sum(int(s["reference_count"] or 0) for s in source_rows)

        # valid_until: if any source is permanent (None/empty), merged is permanent.
        # Otherwise take the latest (lexical compare on ISO dates works).
        expiries = [(s["valid_until"] or "").strip() for s in source_rows]
        merged_vu: str | None = max(expiries) if all(expiries) else None

        new_id = str(uuid.uuid4())
        tags_json = _json.dumps(merged_tags)

        conn.execute(
            "INSERT INTO memories (id, memory_type, content, project, "
            "source_chunk_id, confidence, status, domain_tags, valid_until, "
            "reference_count, last_referenced_at) "
            "VALUES (?, ?, ?, ?, NULL, ?, 'active', ?, ?, ?, ?)",
            (
                new_id, merged_type, content[:2000], merged_project,
                merged_confidence, tags_json, merged_vu, total_refs, now_str,
            ),
        )

        # Mark sources superseded
        for srow in source_rows:
            conn.execute(
                "UPDATE memories SET status = 'superseded', updated_at = ? "
                "WHERE id = ?",
                (now_str, srow["id"]),
            )

        # Mark candidate approved
        conn.execute(
            "UPDATE memory_merge_candidates SET status = 'approved', "
            "resolved_at = ?, resolved_by = ?, result_memory_id = ? WHERE id = ?",
            (now_str, actor, new_id, candidate_id),
        )

    # Audit rows (out of the transaction; fail-open via _audit_memory)
    _audit_memory(
        memory_id=new_id,
        action="created_via_merge",
        actor=actor,
        after={
            "memory_type": merged_type,
            "content": content,
            "project": merged_project,
            "confidence": merged_confidence,
            "domain_tags": merged_tags,
            "reference_count": total_refs,
            "merged_from": list(mem_ids),
            "merge_candidate_id": candidate_id,
        },
        note=f"merged {len(mem_ids)} sources via candidate {candidate_id[:8]}",
    )
    for srow in source_rows:
        _audit_memory(
            memory_id=srow["id"],
            action="superseded",
            actor=actor,
            before={"status": "active", "content": srow["content"]},
            after={"status": "superseded", "superseded_by": new_id},
            note=f"merged into {new_id}",
        )

    log.info(
        "merge_executed",
        candidate_id=candidate_id,
        result_memory_id=new_id,
        source_count=len(source_rows),
        actor=actor,
    )
    return new_id

View File

@@ -0,0 +1,88 @@
"""Phase 7A (Memory Consolidation): semantic similarity helpers.
Thin wrapper over ``atocore.retrieval.embeddings`` that exposes
pairwise + batch cosine similarity on normalized embeddings. Used by
the dedup detector to cluster near-duplicate active memories.
Embeddings from ``embed_texts()`` are already L2-normalized, so cosine
similarity reduces to a dot product — no extra normalization needed.
"""
from __future__ import annotations
from atocore.retrieval.embeddings import embed_texts
def _dot(a: list[float], b: list[float]) -> float:
return sum(x * y for x, y in zip(a, b))
def cosine(a: list[float], b: list[float]) -> float:
    """Cosine similarity for vectors that are already unit-length.

    The value is the plain dot product, capped at 1.0 and floored at 0.0
    so negative scores never reach threshold comparisons.
    """
    raw_score = _dot(a, b)
    capped = min(1.0, raw_score)
    return max(0.0, capped)
def compute_memory_similarity(text_a: str, text_b: str) -> float:
    """Cosine similarity of two memory contents, in [0, 1].

    Returns 0.0 without embedding anything when either text is empty.
    Meant for one-off checks and tests; batch callers should call
    ``embed_texts()`` once and build the similarity matrix themselves so
    shared texts are not re-embedded.
    """
    if text_a and text_b:
        embedded = embed_texts([text_a, text_b])
        return cosine(embedded[0], embedded[1])
    return 0.0
def similarity_matrix(texts: list[str]) -> list[list[float]]:
    """Build the symmetric N×N cosine matrix for ``texts``.

    The diagonal is set to 1.0; each off-diagonal pair is scored once and
    mirrored. An empty input yields an empty list.
    """
    if not texts:
        return []
    vectors = embed_texts(texts)
    size = len(vectors)
    grid = [[0.0] * size for _ in range(size)]
    for row in range(size):
        grid[row][row] = 1.0
        # Only the upper triangle is computed; the lower one is mirrored.
        for col in range(row + 1, size):
            score = cosine(vectors[row], vectors[col])
            grid[row][col] = score
            grid[col][row] = score
    return grid
def cluster_by_threshold(texts: list[str], threshold: float) -> list[list[int]]:
    """Group texts transitively: any pair scoring >= ``threshold`` is linked.

    Returns one list of indices into ``texts`` per group, singletons
    included. Lets the dedup detector collapse A~B~C into a single merge
    proposal rather than three pair proposals.
    """
    if not texts:
        return []

    scores = similarity_matrix(texts)
    count = len(texts)
    leader = list(range(count))

    def _root(node: int) -> int:
        # Walk up to the representative, shortening the chain on the way.
        while leader[node] != node:
            leader[node] = leader[leader[node]]
            node = leader[node]
        return node

    for left in range(count):
        for right in range(left + 1, count):
            if scores[left][right] < threshold:
                continue
            left_root, right_root = _root(left), _root(right)
            if left_root != right_root:
                leader[left_root] = right_root

    clusters: dict[int, list[int]] = {}
    for index in range(count):
        representative = _root(index)
        if representative not in clusters:
            clusters[representative] = []
        clusters[representative].append(index)
    return list(clusters.values())

View File

@@ -251,6 +251,42 @@ def _apply_migrations(conn: sqlite3.Connection) -> None:
"CREATE INDEX IF NOT EXISTS idx_interactions_created_at ON interactions(created_at)"
)
# Phase 7A (Memory Consolidation — "sleep cycle"): merge candidates.
# When the dedup detector finds a cluster of semantically similar active
# memories within the same (project, memory_type) bucket, it drafts a
# unified content via LLM and writes a proposal here. The triage UI
# surfaces these for human approval. On approve, source memories become
# status=superseded and a new merged memory is created.
# memory_ids is a JSON array (length >= 2) of the source memory ids.
# proposed_* hold the LLM's draft; a human can edit before approve.
# result_memory_id is filled on approve with the new merged memory's id.
conn.execute(
"""
CREATE TABLE IF NOT EXISTS memory_merge_candidates (
id TEXT PRIMARY KEY,
status TEXT DEFAULT 'pending',
memory_ids TEXT NOT NULL,
similarity REAL,
proposed_content TEXT,
proposed_memory_type TEXT,
proposed_project TEXT,
proposed_tags TEXT DEFAULT '[]',
proposed_confidence REAL,
reason TEXT DEFAULT '',
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
resolved_at DATETIME,
resolved_by TEXT,
result_memory_id TEXT
)
"""
)
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_mmc_status ON memory_merge_candidates(status)"
)
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_mmc_created_at ON memory_merge_candidates(created_at)"
)
def _column_exists(conn: sqlite3.Connection, table: str, column: str) -> bool:
rows = conn.execute(f"PRAGMA table_info({table})").fetchall()