feat: Phase 7A — semantic memory dedup ("sleep cycle" V1)

New table memory_merge_candidates + service functions to cluster
near-duplicate active memories within (project, memory_type) buckets,
draft a unified content via LLM, and merge on human approval. Source
memories become superseded (never deleted); merged memory carries
union of tags, max of confidence, sum of reference_count.

- schema migration for memory_merge_candidates
- atocore.memory.similarity: cosine + transitive clustering
- atocore.memory._dedup_prompt: stdlib-only LLM prompt preserving every specific detail
- service: merge_memories / create_merge_candidate / get_merge_candidates / reject_merge_candidate
- scripts/memory_dedup.py: host-side detector (HTTP-only, idempotent)
- 5 API endpoints under /admin/memory/merge-candidates* + /admin/memory/dedup-scan
- triage UI: purple "🔗 Merge Candidates" section + "🔗 Scan for duplicates" bar
- batch-extract.sh Step B3 (0.90 daily, 0.85 Sundays)
- deploy/dalidou/dedup-watcher.sh for UI-triggered scans
- 21 new tests (374 → 395)
- docs/PHASE-7-MEMORY-CONSOLIDATION.md covering 7A-7H roadmap

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-18 10:30:49 -04:00
parent 9f262a21b0
commit 028d4c3594
12 changed files with 1860 additions and 8 deletions

View File

@@ -1530,6 +1530,169 @@ def api_extend_reinforced() -> dict:
return {"extended_count": len(extended), "extensions": extended}
# --- Phase 7A: memory dedup / merge-candidate lifecycle ---
class MergeCandidateCreateBody(BaseModel):
    """Request body for POST /admin/memory/merge-candidates/create.

    Each field is passed through, under the same name, to
    ``atocore.memory.service.create_merge_candidate``.
    """

    # IDs of the memories proposed for merging (required).
    memory_ids: list[str]
    # Similarity score supplied by the submitter; 0.0 when omitted.
    similarity: float = 0.0
    # Draft text for the merged memory (required).
    proposed_content: str
    # Type, project, tags and confidence proposed for the merged memory.
    proposed_memory_type: str = "knowledge"
    proposed_project: str = ""
    proposed_tags: list[str] = []
    proposed_confidence: float = 0.6
    # Free-text explanation accompanying the proposal.
    reason: str = ""
class MergeCandidateApproveBody(BaseModel):
    """Request body for approving a merge candidate.

    ``content`` and ``domain_tags`` are optional edits; the approve endpoint
    hands them to ``merge_memories`` as ``override_content`` and
    ``override_tags``. ``None`` means no override was supplied.
    """

    # Who approved the merge; defaults to the human triage actor.
    actor: str = "human-triage"
    # Optional replacement for the proposed merged content.
    content: str | None = None
    # Optional replacement for the proposed tag list.
    domain_tags: list[str] | None = None
class MergeCandidateRejectBody(BaseModel):
    """Request body for rejecting a merge candidate.

    Both fields are forwarded to ``reject_merge_candidate`` by the reject
    endpoint.
    """

    # Who rejected the candidate; defaults to the human triage actor.
    actor: str = "human-triage"
    # Optional free-text note recorded with the rejection.
    note: str = ""
class DedupScanRequestBody(BaseModel):
    """Request body for POST /admin/memory/dedup-scan.

    The endpoint normalises these values before storing the scan request:
    ``project`` is stripped, ``similarity_threshold`` is clamped to
    [0.5, 0.99] and ``max_batch`` to [1, 200].
    """

    # Project to scan; an empty string is stored as-is (no project filter given).
    project: str = ""
    # Similarity cut-off requested for the scan.
    similarity_threshold: float = 0.88
    # Requested batch-size cap for the scan.
    max_batch: int = 50
@router.get("/admin/memory/merge-candidates")
def api_list_merge_candidates(status: str = "pending", limit: int = 100) -> dict:
    """Phase 7A: return merge-candidate proposals for the triage UI.

    Args:
        status: Status filter, forwarded as-is to ``get_merge_candidates``.
        limit: Row limit, forwarded as-is to ``get_merge_candidates``.

    Returns:
        A dict holding the service result under ``candidates`` and its
        length under ``count``.
    """
    from atocore.memory.service import get_merge_candidates

    proposals = get_merge_candidates(status=status, limit=limit)
    response = {"candidates": proposals}
    response["count"] = len(proposals)
    return response
@router.post("/admin/memory/merge-candidates/create")
def api_create_merge_candidate(body: MergeCandidateCreateBody) -> dict:
    """Phase 7A: host-side dedup detector submits a proposal here.

    The request body is forwarded field-for-field to
    ``atocore.memory.service.create_merge_candidate``.

    Returns:
        ``{"candidate_id": <id>, "duplicate": False}`` when the service
        returns an id, or ``{"candidate_id": None, "duplicate": True}`` when
        it returns ``None``. NOTE(review): ``None`` presumably means a pending
        candidate already exists for the same memory-id set; confirm against
        the service implementation. The existing id is NOT echoed back.

    Raises:
        HTTPException: 400 with the service's message when it raises
            ``ValueError``.
    """
    from atocore.memory.service import create_merge_candidate

    try:
        cid = create_merge_candidate(
            memory_ids=body.memory_ids,
            similarity=body.similarity,
            proposed_content=body.proposed_content,
            proposed_memory_type=body.proposed_memory_type,
            proposed_project=body.proposed_project,
            proposed_tags=body.proposed_tags,
            proposed_confidence=body.proposed_confidence,
            reason=body.reason,
        )
    except ValueError as e:
        # Chain the cause so the original validation error stays in tracebacks.
        raise HTTPException(status_code=400, detail=str(e)) from e

    return {"candidate_id": cid, "duplicate": cid is None}
@router.post("/admin/memory/merge-candidates/{candidate_id}/approve")
def api_approve_merge_candidate(candidate_id: str, body: MergeCandidateApproveBody) -> dict:
    """Phase 7A: execute an approved merge via ``merge_memories``.

    Optional content/tag edits from the request body are passed along as
    overrides.

    Returns:
        A dict with ``status`` ("approved"), the ``candidate_id`` and the
        ``result_memory_id`` returned by the service.

    Raises:
        HTTPException: 409 when ``merge_memories`` returns ``None``.
    """
    from atocore.memory.service import merge_memories

    merged_id = merge_memories(
        candidate_id=candidate_id,
        actor=body.actor,
        override_content=body.content,
        override_tags=body.domain_tags,
    )
    if merged_id is not None:
        return {
            "status": "approved",
            "candidate_id": candidate_id,
            "result_memory_id": merged_id,
        }

    raise HTTPException(
        status_code=409,
        detail="Merge could not execute (candidate not pending, or source memory tampered)",
    )
@router.post("/admin/memory/merge-candidates/{candidate_id}/reject")
def api_reject_merge_candidate(candidate_id: str, body: MergeCandidateRejectBody) -> dict:
    """Phase 7A: dismiss a merge candidate via ``reject_merge_candidate``.

    Returns:
        A dict with ``status`` ("rejected") and the ``candidate_id``.

    Raises:
        HTTPException: 404 when the service reports a falsy result.
    """
    from atocore.memory.service import reject_merge_candidate

    rejected = reject_merge_candidate(candidate_id, actor=body.actor, note=body.note)
    if rejected:
        return {"status": "rejected", "candidate_id": candidate_id}

    raise HTTPException(status_code=404, detail="Candidate not found or already resolved")
@router.post("/admin/memory/dedup-scan")
def api_request_dedup_scan(body: DedupScanRequestBody) -> dict:
    """Phase 7A: request a host-side dedup scan.

    Writes a flag in project_state with project + threshold + max_batch.
    A host cron watcher picks it up within ~2 min and runs
    scripts/memory_dedup.py. Mirrors /admin/graduation/request.

    The request values are normalised before being stored: ``project`` is
    stripped, ``similarity_threshold`` is clamped to [0.5, 0.99] and
    ``max_batch`` to [1, 200].

    Returns:
        A dict echoing the *effective* (normalised) parameters that were
        stored, the ``requested_at`` UTC timestamp, and a polling hint.
    """
    import json as _json
    from datetime import datetime as _dt, timezone as _tz

    from atocore.context.project_state import set_state

    now = _dt.now(_tz.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Normalise once so the stored payload and the response cannot disagree.
    project = (body.project or "").strip()
    similarity_threshold = max(0.5, min(0.99, body.similarity_threshold))
    max_batch = max(1, min(body.max_batch, 200))

    payload = _json.dumps({
        "project": project,
        "similarity_threshold": similarity_threshold,
        "max_batch": max_batch,
        "requested_at": now,
    })
    set_state(
        project_name="atocore",
        category="config",
        key="dedup_requested_at",
        value=payload,
        source="admin ui",
    )
    return {
        "requested_at": now,
        # Report what was stored, not the raw request values.
        "project": project,
        "similarity_threshold": similarity_threshold,
        "max_batch": max_batch,
        "note": "Host watcher picks up within ~2 min. Poll /admin/memory/dedup-status for progress.",
    }
@router.get("/admin/memory/dedup-status")
def api_dedup_status() -> dict:
    """Phase 7A: state of the dedup scan pipeline (UI polling).

    Reads the "atocore" project state and picks out the ``dedup_*`` keys in
    the "config" and "status" categories.

    Returns:
        A dict with ``requested`` (parsed JSON payload, or ``{"raw": value}``
        when the stored value is not valid JSON), ``last_started_at``,
        ``last_finished_at``, ``last_result`` and ``is_running``. Fields keep
        their defaults (``None`` / ``False``) when the key is absent or the
        state store cannot be read. This endpoint is best-effort and does
        not raise on read failures.
    """
    import json as _json
    import logging as _logging

    from atocore.context.project_state import get_state

    out = {
        "requested": None,
        "last_started_at": None,
        "last_finished_at": None,
        "last_result": None,
        "is_running": False,
    }
    # State keys whose value is copied verbatim into the response.
    passthrough = {
        "dedup_last_started_at": "last_started_at",
        "dedup_last_finished_at": "last_finished_at",
        "dedup_last_result": "last_result",
    }
    try:
        for e in get_state("atocore"):
            if e.category not in ("config", "status"):
                continue
            if e.key == "dedup_requested_at":
                try:
                    out["requested"] = _json.loads(e.value)
                except (TypeError, ValueError):
                    # Not JSON (or not a string): surface the raw value instead.
                    out["requested"] = {"raw": e.value}
            elif e.key in passthrough:
                out[passthrough[e.key]] = e.value
            elif e.key == "dedup_running":
                out["is_running"] = (e.value == "1")
    except Exception:
        # Stay best-effort for the polling UI, but leave a trace of the
        # failure instead of swallowing it silently.
        _logging.getLogger(__name__).exception("dedup status: failed to read project state")
    return out
@router.get("/admin/graduation/stats")
def api_graduation_stats() -> dict:
"""Phase 5F graduation stats for dashboard."""