feat: Phase 7A — semantic memory dedup ("sleep cycle" V1)
New table memory_merge_candidates + service functions to cluster near-duplicate active memories within (project, memory_type) buckets, draft unified content via LLM, and merge on human approval. Source memories become superseded (never deleted); merged memory carries union of tags, max of confidence, sum of reference_count. - schema migration for memory_merge_candidates - atocore.memory.similarity: cosine + transitive clustering - atocore.memory._dedup_prompt: stdlib-only LLM prompt preserving every specific detail - service: merge_memories / create_merge_candidate / get_merge_candidates / reject_merge_candidate - scripts/memory_dedup.py: host-side detector (HTTP-only, idempotent) - 5 API endpoints under /admin/memory/merge-candidates* + /admin/memory/dedup-scan - triage UI: purple "🔗 Merge Candidates" section + "🔗 Scan for duplicates" bar - batch-extract.sh Step B3 (0.90 daily, 0.85 Sundays) - deploy/dalidou/dedup-watcher.sh for UI-triggered scans - 21 new tests (374 → 395) - docs/PHASE-7-MEMORY-CONSOLIDATION.md covering 7A-7H roadmap Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1530,6 +1530,169 @@ def api_extend_reinforced() -> dict:
|
||||
return {"extended_count": len(extended), "extensions": extended}
|
||||
|
||||
|
||||
# --- Phase 7A: memory dedup / merge-candidate lifecycle ---
|
||||
|
||||
|
||||
class MergeCandidateCreateBody(BaseModel):
    """Request body for POST /admin/memory/merge-candidates/create.

    Submitted by the host-side dedup detector (scripts/memory_dedup.py)
    to propose merging a set of near-duplicate memories.
    """

    # IDs of the source memories proposed for merging.
    memory_ids: list[str]
    # Similarity score reported by the detector for this cluster.
    similarity: float = 0.0
    # Drafted unified content for the merged memory.
    proposed_content: str
    proposed_memory_type: str = "knowledge"
    proposed_project: str = ""
    # NOTE: a mutable default is safe on a pydantic model — field defaults
    # are copied per instance, unlike plain-function defaults.
    proposed_tags: list[str] = []
    proposed_confidence: float = 0.6
    # Free-text explanation of why the detector proposed this merge.
    reason: str = ""
|
||||
|
||||
|
||||
class MergeCandidateApproveBody(BaseModel):
    """Request body for approving a merge candidate.

    content / domain_tags, when provided, override the stored proposal
    (the triage UI lets a human edit before approving).
    """

    # Who approved the merge; forwarded to the service as `actor`.
    actor: str = "human-triage"
    # Optional human-edited replacement for the proposed merged content.
    content: str | None = None
    # Optional human-edited replacement for the proposed tag set.
    domain_tags: list[str] | None = None
|
||||
|
||||
|
||||
class MergeCandidateRejectBody(BaseModel):
    """Request body for rejecting (dismissing) a merge candidate."""

    # Who rejected the candidate; forwarded to the service as `actor`.
    actor: str = "human-triage"
    # Optional free-text reason for the rejection.
    note: str = ""
|
||||
|
||||
|
||||
class DedupScanRequestBody(BaseModel):
    """Request body for POST /admin/memory/dedup-scan.

    Parameters are persisted to project_state for the host watcher that
    runs scripts/memory_dedup.py.
    """

    # Project to scan; empty presumably means all projects — confirm
    # against the host watcher's handling.
    project: str = ""
    # Similarity cutoff for clustering (clamped to [0.5, 0.99] on use).
    similarity_threshold: float = 0.88
    # Upper bound on batch size per scan (clamped to [1, 200] on use).
    max_batch: int = 50
|
||||
|
||||
|
||||
@router.get("/admin/memory/merge-candidates")
def api_list_merge_candidates(status: str = "pending", limit: int = 100) -> dict:
    """Phase 7A: return merge-candidate proposals for the triage UI.

    status/limit are passed straight through to the service layer; the
    response wraps the rows with a convenience count.
    """
    from atocore.memory.service import get_merge_candidates

    rows = get_merge_candidates(status=status, limit=limit)
    return {
        "candidates": rows,
        "count": len(rows),
    }
|
||||
|
||||
|
||||
@router.post("/admin/memory/merge-candidates/create")
def api_create_merge_candidate(body: MergeCandidateCreateBody) -> dict:
    """Phase 7A: host-side dedup detector submits a proposal here.

    Server-side idempotency: if a pending candidate already exists for
    the same sorted memory_id set, returns the existing id.

    Raises HTTPException(400) when the service rejects the proposal
    (invalid memory ids, bad parameters, ...).
    """
    from atocore.memory.service import create_merge_candidate

    try:
        cid = create_merge_candidate(
            memory_ids=body.memory_ids,
            similarity=body.similarity,
            proposed_content=body.proposed_content,
            proposed_memory_type=body.proposed_memory_type,
            proposed_project=body.proposed_project,
            proposed_tags=body.proposed_tags,
            proposed_confidence=body.proposed_confidence,
            reason=body.reason,
        )
    except ValueError as e:
        # Chain the original ValueError so server logs keep the root cause
        # (ruff B904: use `raise ... from ...` inside an except block).
        raise HTTPException(status_code=400, detail=str(e)) from e
    # NOTE(review): the docstring says an existing pending candidate
    # "returns the existing id", yet a None cid is mapped to duplicate=True
    # with a null id — confirm which contract the service actually implements.
    if cid is None:
        return {"candidate_id": None, "duplicate": True}
    return {"candidate_id": cid, "duplicate": False}
|
||||
|
||||
|
||||
@router.post("/admin/memory/merge-candidates/{candidate_id}/approve")
def api_approve_merge_candidate(candidate_id: str, body: MergeCandidateApproveBody) -> dict:
    """Phase 7A: execute an approved merge. Sources → superseded; new
    merged memory created. UI can pass content/tag edits via body."""
    from atocore.memory.service import merge_memories

    merged_id = merge_memories(
        candidate_id=candidate_id,
        actor=body.actor,
        override_content=body.content,
        override_tags=body.domain_tags,
    )
    # None means the service refused to run the merge (candidate no longer
    # pending, or a source memory changed since the proposal was drafted).
    if merged_id is None:
        raise HTTPException(
            status_code=409,
            detail="Merge could not execute (candidate not pending, or source memory tampered)",
        )
    return {
        "status": "approved",
        "candidate_id": candidate_id,
        "result_memory_id": merged_id,
    }
|
||||
|
||||
|
||||
@router.post("/admin/memory/merge-candidates/{candidate_id}/reject")
def api_reject_merge_candidate(candidate_id: str, body: MergeCandidateRejectBody) -> dict:
    """Phase 7A: dismiss a merge candidate; the source memories are left as-is."""
    from atocore.memory.service import reject_merge_candidate

    resolved = reject_merge_candidate(candidate_id, actor=body.actor, note=body.note)
    if not resolved:
        raise HTTPException(status_code=404, detail="Candidate not found or already resolved")
    return {"status": "rejected", "candidate_id": candidate_id}
|
||||
|
||||
|
||||
@router.post("/admin/memory/dedup-scan")
def api_request_dedup_scan(body: DedupScanRequestBody) -> dict:
    """Phase 7A: request a host-side dedup scan.

    Writes a flag in project_state with project + threshold + max_batch.
    A host cron watcher picks it up within ~2 min and runs
    scripts/memory_dedup.py. Mirrors /admin/graduation/request.

    Returns the *effective* (clamped) parameters that were stored, so the
    UI reflects exactly what the watcher will run with.
    """
    import json as _json
    from datetime import datetime as _dt, timezone as _tz

    from atocore.context.project_state import set_state

    now = _dt.now(_tz.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # Clamp once and reuse: previously the stored payload was clamped but
    # the HTTP response echoed the raw request values, so the UI could show
    # a threshold / batch size the watcher would never actually use.
    project = (body.project or "").strip()
    threshold = max(0.5, min(0.99, body.similarity_threshold))
    batch = max(1, min(body.max_batch, 200))
    payload = _json.dumps({
        "project": project,
        "similarity_threshold": threshold,
        "max_batch": batch,
        "requested_at": now,
    })
    set_state(
        project_name="atocore",
        category="config",
        key="dedup_requested_at",
        value=payload,
        source="admin ui",
    )
    return {
        "requested_at": now,
        "project": project,
        "similarity_threshold": threshold,
        "max_batch": batch,
        "note": "Host watcher picks up within ~2 min. Poll /admin/memory/dedup-status for progress.",
    }
|
||||
|
||||
|
||||
@router.get("/admin/memory/dedup-status")
def api_dedup_status() -> dict:
    """Phase 7A: state of the dedup scan pipeline (UI polling)."""
    import json as _json

    from atocore.context.project_state import get_state

    status = {
        "requested": None,
        "last_started_at": None,
        "last_finished_at": None,
        "last_result": None,
        "is_running": False,
    }
    # Simple pass-through keys map straight onto output fields; the JSON
    # payload and the boolean running flag are handled explicitly below.
    passthrough = {
        "dedup_last_started_at": "last_started_at",
        "dedup_last_finished_at": "last_finished_at",
        "dedup_last_result": "last_result",
    }
    try:
        for entry in get_state("atocore"):
            if entry.category not in ("config", "status"):
                continue
            key = entry.key
            if key == "dedup_requested_at":
                try:
                    status["requested"] = _json.loads(entry.value)
                except Exception:
                    # Surface a malformed payload rather than dropping it.
                    status["requested"] = {"raw": entry.value}
            elif key == "dedup_running":
                status["is_running"] = entry.value == "1"
            elif key in passthrough:
                status[passthrough[key]] = entry.value
    except Exception:
        # Best effort: this polling endpoint must never 500 the UI.
        pass
    return status
|
||||
|
||||
|
||||
@router.get("/admin/graduation/stats")
|
||||
def api_graduation_stats() -> dict:
|
||||
"""Phase 5F graduation stats for dashboard."""
|
||||
|
||||
Reference in New Issue
Block a user