scripts/memory_dedup.py

#!/usr/bin/env python3
"""Phase 7A — semantic memory dedup detector.

Finds clusters of near-duplicate active memories and writes merge-
candidate proposals for human review in the triage UI.

Algorithm:
  1. Fetch active memories via HTTP
  2. Group by (project, memory_type) — cross-bucket merges are deferred
     to Phase 7B contradiction flow
  3. Within each group, embed contents via atocore.retrieval.embeddings
  4. Greedy transitive cluster at similarity >= threshold
  5. For each cluster of size >= 2, ask claude-p to draft unified content
  6. POST the proposal to /admin/memory/merge-candidates/create (server-
     side dedupes by the sorted memory-id set, so re-runs don't double-
     create)

Host-side because claude CLI lives on Dalidou, not the container. Reuses
the same PYTHONPATH=src pattern as scripts/graduate_memories.py for
atocore imports (embeddings, similarity, prompt module).

Usage:
  python3 scripts/memory_dedup.py --base-url http://127.0.0.1:8100 \\
      --similarity-threshold 0.88 --max-batch 50

Threshold conventions (see Phase 7 doc):
  0.88  interactive / default — balanced precision/recall
  0.90  nightly cron — tight, only near-duplicates
  0.85  weekly cron — deeper cleanup
"""

from __future__ import annotations

import argparse
import json
import os
import shutil
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.request
from collections import defaultdict
from typing import Any

# Make src/ importable — same pattern as graduate_memories.py.
# ../src (resolved relative to this file) is placed at the front of sys.path
# so the atocore imports that follow can be resolved when the script is run
# directly; the membership check avoids inserting the same entry twice.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_SRC_DIR = os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "src"))
if _SRC_DIR not in sys.path:
    sys.path.insert(0, _SRC_DIR)

from atocore.memory._dedup_prompt import (  # noqa: E402
    DEDUP_PROMPT_VERSION,
    SYSTEM_PROMPT,
    build_user_message,
    normalize_merge_verdict,
    parse_merge_verdict,
)
from atocore.memory.similarity import cluster_by_threshold  # noqa: E402

# Defaults for the --base-url, --model and --timeout-s flags in main(); each
# can be overridden through the environment variable named here.
DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://127.0.0.1:8100")
DEFAULT_MODEL = os.environ.get("ATOCORE_DEDUP_MODEL", "sonnet")
# Parsed at import time: a non-numeric ATOCORE_DEDUP_TIMEOUT_S raises
# ValueError here, before any argument parsing happens.
DEFAULT_TIMEOUT_S = float(os.environ.get("ATOCORE_DEDUP_TIMEOUT_S", "60"))

# Path of the scratch directory, created on first use by get_sandbox_cwd().
_sandbox_cwd = None


def get_sandbox_cwd() -> str:
    """Return the scratch directory used as cwd for claude CLI subprocesses.

    The directory is created with ``tempfile.mkdtemp`` on the first call and
    remembered in the module-level ``_sandbox_cwd``; later calls return the
    same path. Nothing in this function removes the directory.
    """
    global _sandbox_cwd
    if _sandbox_cwd is not None:
        return _sandbox_cwd
    _sandbox_cwd = tempfile.mkdtemp(prefix="ato-dedup-")
    return _sandbox_cwd


def api_get(base_url: str, path: str) -> dict:
    """GET ``base_url`` + ``path`` and return the response body parsed as JSON.

    The body is decoded as UTF-8. The request uses a fixed 30-second timeout.
    Errors from ``urlopen`` or from JSON decoding are not caught here.
    """
    url = f"{base_url}{path}"
    request = urllib.request.Request(url)
    with urllib.request.urlopen(request, timeout=30) as response:
        raw = response.read()
    return json.loads(raw.decode("utf-8"))


def api_post(base_url: str, path: str, body: dict | None = None) -> dict:
    """POST ``body`` as JSON to ``base_url`` + ``path`` and return the parsed reply.

    A missing or empty ``body`` is sent as ``{}``. The request carries a
    ``Content-Type: application/json`` header and a fixed 30-second timeout.
    Errors from ``urlopen`` (including ``HTTPError``) or from JSON decoding
    are not caught here.
    """
    payload = json.dumps(body or {}).encode("utf-8")
    request = urllib.request.Request(
        f"{base_url}{path}",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        raw = response.read()
    return json.loads(raw.decode("utf-8"))


def call_claude(system_prompt: str, user_message: str, model: str, timeout_s: float) -> tuple[str | None, str | None]:
    """Shared CLI caller with retry + stderr capture (mirrors auto_triage)."""
    if not shutil.which("claude"):
        return None, "claude CLI not available"
    args = [
        "claude", "-p",
        "--model", model,
        "--append-system-prompt", system_prompt,
        "--disable-slash-commands",
        user_message,
    ]
    last_error = ""
    for attempt in range(3):
        if attempt > 0:
            time.sleep(2 ** attempt)
        try:
            completed = subprocess.run(
                args, capture_output=True, text=True,
                timeout=timeout_s, cwd=get_sandbox_cwd(),
                encoding="utf-8", errors="replace",
            )
        except subprocess.TimeoutExpired:
            last_error = f"{model} timed out"
            continue
        except Exception as exc:
            last_error = f"subprocess error: {exc}"
            continue
        if completed.returncode == 0:
            return (completed.stdout or "").strip(), None
        stderr = (completed.stderr or "").strip()[:200]
        last_error = f"{model} exit {completed.returncode}: {stderr}" if stderr else f"{model} exit {completed.returncode}"
    return None, last_error


def fetch_active_memories(base_url: str, project: str | None) -> list[dict]:
    # The /memory endpoint with active_only=true returns active memories.
    # Graduated memories are exempt from dedup — they're frozen pointers
    # to entities. Filter them out on the client side.
    params = "active_only=true&limit=2000"
    if project:
        params += f"&project={urllib.request.quote(project)}"
    try:
        result = api_get(base_url, f"/memory?{params}")
    except Exception as e:
        print(f"ERROR: could not fetch memories: {e}", file=sys.stderr)
        return []
    mems = result.get("memories", [])
    return [m for m in mems if (m.get("status") or "active") == "active"]


def group_memories(mems: list[dict]) -> dict[tuple[str, str], list[dict]]:
    """Group memories under a normalized ``(project, memory_type)`` key.

    Both key parts are stripped and lower-cased, and a missing or ``None``
    value becomes ``""`` — so memories without a project share the ``""``
    project bucket. Input order is preserved within each bucket.
    """
    def _normalize(value) -> str:
        return (value or "").strip().lower()

    grouped: dict[tuple[str, str], list[dict]] = defaultdict(list)
    for mem in mems:
        bucket_key = (_normalize(mem.get("project")), _normalize(mem.get("memory_type")))
        grouped[bucket_key].append(mem)
    return grouped


def draft_merge(sources: list[dict], model: str, timeout_s: float) -> dict[str, Any] | None:
    """Ask the LLM to draft a merge verdict for one cluster of memories.

    Builds the user prompt from ``sources``, calls the claude CLI, then
    parses and normalizes the reply.

    Returns:
        The normalized verdict dict, or ``None`` when the CLI call fails or
        the reply cannot be parsed (a warning is printed to stderr in both
        cases).
    """
    prompt = build_user_message(sources)
    reply, failure = call_claude(SYSTEM_PROMPT, prompt, model, timeout_s)
    if failure:
        print(f"  WARN: claude call failed: {failure}", file=sys.stderr)
        return None

    reply_text = reply or ""
    verdict = parse_merge_verdict(reply_text)
    if verdict is not None:
        return normalize_merge_verdict(verdict)

    # Show the start of the unparseable reply to aid debugging.
    print(f"  WARN: could not parse verdict: {reply_text[:200]}", file=sys.stderr)
    return None


def submit_candidate(
    base_url: str,
    memory_ids: list[str],
    similarity: float,
    verdict: dict[str, Any],
    dry_run: bool,
) -> str | None:
    body = {
        "memory_ids": memory_ids,
        "similarity": similarity,
        "proposed_content": verdict["content"],
        "proposed_memory_type": verdict["memory_type"],
        "proposed_project": verdict["project"],
        "proposed_tags": verdict["domain_tags"],
        "proposed_confidence": verdict["confidence"],
        "reason": verdict["reason"],
    }
    if dry_run:
        print(f"  [dry-run] would POST: {json.dumps(body)[:200]}...")
        return "dry-run"
    try:
        result = api_post(base_url, "/admin/memory/merge-candidates/create", body)
        return result.get("candidate_id")
    except urllib.error.HTTPError as e:
        print(f"  ERROR: submit failed: {e.code} {e.read().decode()[:200]}", file=sys.stderr)
        return None
    except Exception as e:
        print(f"  ERROR: submit failed: {e}", file=sys.stderr)
        return None


def main() -> None:
    """CLI entry point: scan active memories and propose merge candidates.

    Parses the command-line flags, fetches the active memories, buckets them
    by (project, memory_type) and clusters each bucket with
    ``cluster_by_threshold``. For every cluster of two or more memories it
    requests a merge draft and, unless the draft is a rejection, submits it
    as a merge candidate. Progress lines and a final summary go to stdout.

    The ``--max-batch`` cap is compared against the number of candidates
    created (dry-run submissions included), not the number of clusters
    examined. A submission that yields no candidate id — whether because the
    request failed or the response carried none — is tallied under
    ``skipped_existing``.
    """
    cli = argparse.ArgumentParser(description="Phase 7A semantic dedup detector")
    cli.add_argument("--base-url", default=DEFAULT_BASE_URL)
    cli.add_argument("--project", default="", help="Only scan this project (empty = all)")
    cli.add_argument("--similarity-threshold", type=float, default=0.88)
    cli.add_argument("--max-batch", type=int, default=50,
                     help="Max clusters to propose per run")
    cli.add_argument("--model", default=DEFAULT_MODEL)
    cli.add_argument("--timeout-s", type=float, default=DEFAULT_TIMEOUT_S)
    cli.add_argument("--dry-run", action="store_true")
    opts = cli.parse_args()

    threshold = opts.similarity_threshold
    batch_cap = opts.max_batch
    api_root = opts.base_url.rstrip("/")

    print(f"memory_dedup {DEDUP_PROMPT_VERSION} | threshold={threshold} | model={opts.model}")
    memories = fetch_active_memories(api_root, opts.project or None)
    print(f"fetched {len(memories)} active memories")
    if not memories:
        return

    buckets = group_memories(memories)
    print(f"grouped into {len(buckets)} (project, memory_type) buckets")

    n_clusters = n_created = n_skipped = n_rejected = 0

    # Bucket keys are unique tuples, so sorting the keys gives a stable order.
    for bucket_key in sorted(buckets):
        members = buckets[bucket_key]
        if len(members) < 2:
            continue
        if n_created >= batch_cap:
            print(f"reached max-batch={batch_cap}, stopping")
            break

        project_name, memory_type = bucket_key
        contents = [(mem.get("content") or "") for mem in members]
        # Singleton clusters have nothing to merge.
        mergeable = [c for c in cluster_by_threshold(contents, threshold) if len(c) >= 2]
        if not mergeable:
            continue

        print(f"\n[{project_name or '(global)'}/{memory_type}] {len(members)} mems → {len(mergeable)} cluster(s)")
        for member_indexes in mergeable:
            if n_created >= batch_cap:
                break
            n_clusters += 1
            sources = [members[i] for i in member_indexes]
            source_ids = [src["id"] for src in sources]
            print(f"  cluster of {len(member_indexes)}: {[src['id'][:8] for src in sources]}")

            verdict = draft_merge(sources, opts.model, opts.timeout_s)
            if verdict is None:
                continue
            if verdict["action"] == "reject":
                n_rejected += 1
                print(f"    LLM rejected: {verdict['reason'][:100]}")
                continue

            # The similarity reported with the proposal is simply the
            # threshold. With transitive clustering the true minimum pairwise
            # similarity inside a cluster may be lower; it is not computed.
            candidate_id = submit_candidate(api_root, source_ids, threshold, verdict, opts.dry_run)
            if not candidate_id:
                n_skipped += 1
            else:
                n_created += 1
                if candidate_id != "dry-run":
                    print(f"    → candidate {candidate_id[:8]}")

            time.sleep(0.3)  # be kind to claude CLI

    print(
        f"\nsummary: clusters_found={n_clusters} "
        f"candidates_created={n_created} "
        f"llm_rejections={n_rejected} "
        f"skipped_existing={n_skipped}"
    )


# Run the detector only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
feat: Phase 7A — semantic memory dedup ("sleep cycle" V1) New table memory_merge_candidates + service functions to cluster near-duplicate active memories within (project, memory_type) buckets, draft a unified content via LLM, and merge on human approval. Source memories become superseded (never deleted); merged memory carries union of tags, max of confidence, sum of reference_count. - schema migration for memory_merge_candidates - atocore.memory.similarity: cosine + transitive clustering - atocore.memory._dedup_prompt: stdlib-only LLM prompt preserving every specific - service: merge_memories / create_merge_candidate / get_merge_candidates / reject_merge_candidate - scripts/memory_dedup.py: host-side detector (HTTP-only, idempotent) - 5 API endpoints under /admin/memory/merge-candidates* + /admin/memory/dedup-scan - triage UI: purple "🔗 Merge Candidates" section + "🔗 Scan for duplicates" bar - batch-extract.sh Step B3 (0.90 daily, 0.85 Sundays) - deploy/dalidou/dedup-watcher.sh for UI-triggered scans - 21 new tests (374 → 395) - docs/PHASE-7-MEMORY-CONSOLIDATION.md covering 7A-7H roadmap Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-18 10:30:49 -04:00			`#!/usr/bin/env python3`
			`"""Phase 7A — semantic memory dedup detector.`

			`Finds clusters of near-duplicate active memories and writes merge-`
			`candidate proposals for human review in the triage UI.`

			`Algorithm:`
			`1. Fetch active memories via HTTP`
			`2. Group by (project, memory_type) — cross-bucket merges are deferred`
			`to Phase 7B contradiction flow`
			`3. Within each group, embed contents via atocore.retrieval.embeddings`
			`4. Greedy transitive cluster at similarity >= threshold`
			`5. For each cluster of size >= 2, ask claude-p to draft unified content`
			`6. POST the proposal to /admin/memory/merge-candidates/create (server-`
			`side dedupes by the sorted memory-id set, so re-runs don't double-`
			`create)`

			`Host-side because claude CLI lives on Dalidou, not the container. Reuses`
			`the same PYTHONPATH=src pattern as scripts/graduate_memories.py for`
			`atocore imports (embeddings, similarity, prompt module).`

			`Usage:`
			`python3 scripts/memory_dedup.py --base-url http://127.0.0.1:8100 \\`
			`--similarity-threshold 0.88 --max-batch 50`

			`Threshold conventions (see Phase 7 doc):`
			`0.88 interactive / default — balanced precision/recall`
			`0.90 nightly cron — tight, only near-duplicates`
			`0.85 weekly cron — deeper cleanup`
			`"""`

			`from __future__ import annotations`

			`import argparse`
			`import json`
			`import os`
			`import shutil`
			`import subprocess`
			`import sys`
			`import tempfile`
			`import time`
			`import urllib.error`
			`import urllib.request`
			`from collections import defaultdict`
			`from typing import Any`

			`# Make src/ importable — same pattern as graduate_memories.py`
			`_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))`
			`_SRC_DIR = os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "src"))`
			`if _SRC_DIR not in sys.path:`
			`sys.path.insert(0, _SRC_DIR)`

			`from atocore.memory._dedup_prompt import ( # noqa: E402`
			`DEDUP_PROMPT_VERSION,`
			`SYSTEM_PROMPT,`
			`build_user_message,`
			`normalize_merge_verdict,`
			`parse_merge_verdict,`
			`)`
			`from atocore.memory.similarity import cluster_by_threshold # noqa: E402`

			`DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://127.0.0.1:8100")`
			`DEFAULT_MODEL = os.environ.get("ATOCORE_DEDUP_MODEL", "sonnet")`
			`DEFAULT_TIMEOUT_S = float(os.environ.get("ATOCORE_DEDUP_TIMEOUT_S", "60"))`

			`_sandbox_cwd = None`


			`def get_sandbox_cwd() -> str:`
			`global _sandbox_cwd`
			`if _sandbox_cwd is None:`
			`_sandbox_cwd = tempfile.mkdtemp(prefix="ato-dedup-")`
			`return _sandbox_cwd`


			`def api_get(base_url: str, path: str) -> dict:`
			`req = urllib.request.Request(f"{base_url}{path}")`
			`with urllib.request.urlopen(req, timeout=30) as resp:`
			`return json.loads(resp.read().decode("utf-8"))`


			`def api_post(base_url: str, path: str, body: dict \| None = None) -> dict:`
			`data = json.dumps(body or {}).encode("utf-8")`
			`req = urllib.request.Request(`
			`f"{base_url}{path}", method="POST",`
			`headers={"Content-Type": "application/json"}, data=data,`
			`)`
			`with urllib.request.urlopen(req, timeout=30) as resp:`
			`return json.loads(resp.read().decode("utf-8"))`


			`def call_claude(system_prompt: str, user_message: str, model: str, timeout_s: float) -> tuple[str \| None, str \| None]:`
			`"""Shared CLI caller with retry + stderr capture (mirrors auto_triage)."""`
			`if not shutil.which("claude"):`
			`return None, "claude CLI not available"`
			`args = [`
			`"claude", "-p",`
			`"--model", model,`
			`"--append-system-prompt", system_prompt,`
			`"--disable-slash-commands",`
			`user_message,`
			`]`
			`last_error = ""`
			`for attempt in range(3):`
			`if attempt > 0:`
			`time.sleep(2 ** attempt)`
			`try:`
			`completed = subprocess.run(`
			`args, capture_output=True, text=True,`
			`timeout=timeout_s, cwd=get_sandbox_cwd(),`
			`encoding="utf-8", errors="replace",`
			`)`
			`except subprocess.TimeoutExpired:`
			`last_error = f"{model} timed out"`
			`continue`
			`except Exception as exc:`
			`last_error = f"subprocess error: {exc}"`
			`continue`
			`if completed.returncode == 0:`
			`return (completed.stdout or "").strip(), None`
			`stderr = (completed.stderr or "").strip()[:200]`
			`last_error = f"{model} exit {completed.returncode}: {stderr}" if stderr else f"{model} exit {completed.returncode}"`
			`return None, last_error`


			`def fetch_active_memories(base_url: str, project: str \| None) -> list[dict]:`
			`# The /memory endpoint with active_only=true returns active memories.`
			`# Graduated memories are exempt from dedup — they're frozen pointers`
			`# to entities. Filter them out on the client side.`
			`params = "active_only=true&limit=2000"`
			`if project:`
			`params += f"&project={urllib.request.quote(project)}"`
			`try:`
			`result = api_get(base_url, f"/memory?{params}")`
			`except Exception as e:`
			`print(f"ERROR: could not fetch memories: {e}", file=sys.stderr)`
			`return []`
			`mems = result.get("memories", [])`
			`return [m for m in mems if (m.get("status") or "active") == "active"]`


			`def group_memories(mems: list[dict]) -> dict[tuple[str, str], list[dict]]:`
			`"""Bucket by (project, memory_type). Empty project is its own bucket."""`
			`buckets: dict[tuple[str, str], list[dict]] = defaultdict(list)`
			`for m in mems:`
			`key = ((m.get("project") or "").strip().lower(), (m.get("memory_type") or "").strip().lower())`
			`buckets[key].append(m)`
			`return buckets`


			`def draft_merge(sources: list[dict], model: str, timeout_s: float) -> dict[str, Any] \| None:`
			`user_msg = build_user_message(sources)`
			`raw, err = call_claude(SYSTEM_PROMPT, user_msg, model, timeout_s)`
			`if err:`
			`print(f" WARN: claude call failed: {err}", file=sys.stderr)`
			`return None`
			`parsed = parse_merge_verdict(raw or "")`
			`if parsed is None:`
			`print(f" WARN: could not parse verdict: {(raw or '')[:200]}", file=sys.stderr)`
			`return None`
			`return normalize_merge_verdict(parsed)`


			`def submit_candidate(`
			`base_url: str,`
			`memory_ids: list[str],`
			`similarity: float,`
			`verdict: dict[str, Any],`
			`dry_run: bool,`
			`) -> str \| None:`
			`body = {`
			`"memory_ids": memory_ids,`
			`"similarity": similarity,`
			`"proposed_content": verdict["content"],`
			`"proposed_memory_type": verdict["memory_type"],`
			`"proposed_project": verdict["project"],`
			`"proposed_tags": verdict["domain_tags"],`
			`"proposed_confidence": verdict["confidence"],`
			`"reason": verdict["reason"],`
			`}`
			`if dry_run:`
			`print(f" [dry-run] would POST: {json.dumps(body)[:200]}...")`
			`return "dry-run"`
			`try:`
			`result = api_post(base_url, "/admin/memory/merge-candidates/create", body)`
			`return result.get("candidate_id")`
			`except urllib.error.HTTPError as e:`
			`print(f" ERROR: submit failed: {e.code} {e.read().decode()[:200]}", file=sys.stderr)`
			`return None`
			`except Exception as e:`
			`print(f" ERROR: submit failed: {e}", file=sys.stderr)`
			`return None`


			`def main() -> None:`
			`parser = argparse.ArgumentParser(description="Phase 7A semantic dedup detector")`
			`parser.add_argument("--base-url", default=DEFAULT_BASE_URL)`
			`parser.add_argument("--project", default="", help="Only scan this project (empty = all)")`
			`parser.add_argument("--similarity-threshold", type=float, default=0.88)`
			`parser.add_argument("--max-batch", type=int, default=50,`
			`help="Max clusters to propose per run")`
			`parser.add_argument("--model", default=DEFAULT_MODEL)`
			`parser.add_argument("--timeout-s", type=float, default=DEFAULT_TIMEOUT_S)`
			`parser.add_argument("--dry-run", action="store_true")`
			`args = parser.parse_args()`

			`base = args.base_url.rstrip("/")`

			`print(f"memory_dedup {DEDUP_PROMPT_VERSION} \| threshold={args.similarity_threshold} \| model={args.model}")`
			`mems = fetch_active_memories(base, args.project or None)`
			`print(f"fetched {len(mems)} active memories")`
			`if not mems:`
			`return`

			`buckets = group_memories(mems)`
			`print(f"grouped into {len(buckets)} (project, memory_type) buckets")`

			`clusters_found = 0`
			`candidates_created = 0`
			`skipped_existing = 0`
			`llm_rejections = 0`

			`for (proj, mtype), group in sorted(buckets.items()):`
			`if len(group) < 2:`
			`continue`
			`if candidates_created >= args.max_batch:`
			`print(f"reached max-batch={args.max_batch}, stopping")`
			`break`

			`texts = [(m.get("content") or "") for m in group]`
			`clusters = cluster_by_threshold(texts, args.similarity_threshold)`
			`# Keep only non-trivial clusters`
			`clusters = [c for c in clusters if len(c) >= 2]`
			`if not clusters:`
			`continue`

			`print(f"\n[{proj or '(global)'}/{mtype}] {len(group)} mems → {len(clusters)} cluster(s)")`
			`for cluster in clusters:`
			`if candidates_created >= args.max_batch:`
			`break`
			`clusters_found += 1`
			`sources = [group[i] for i in cluster]`
			`ids = [s["id"] for s in sources]`
			`# Approximate cluster similarity = min pairwise within cluster.`
			`# For reporting, just use threshold (we know all pairs >= threshold`
			`# transitively; min may be lower). Keep it simple.`
			`sim = args.similarity_threshold`
			`print(f" cluster of {len(cluster)}: {[s['id'][:8] for s in sources]}")`

			`verdict = draft_merge(sources, args.model, args.timeout_s)`
			`if verdict is None:`
			`continue`
			`if verdict["action"] == "reject":`
			`llm_rejections += 1`
			`print(f" LLM rejected: {verdict['reason'][:100]}")`
			`continue`

			`cid = submit_candidate(base, ids, sim, verdict, args.dry_run)`
			`if cid == "dry-run":`
			`candidates_created += 1`
			`elif cid:`
			`candidates_created += 1`
			`print(f" → candidate {cid[:8]}")`
			`else:`
			`skipped_existing += 1`

			`time.sleep(0.3) # be kind to claude CLI`

			`print(`
			`f"\nsummary: clusters_found={clusters_found} "`
			`f"candidates_created={candidates_created} "`
			`f"llm_rejections={llm_rejections} "`
			`f"skipped_existing={skipped_existing}"`
			`)`


			`if __name__ == "__main__":`
			`main()`