feat: Phase 7A — semantic memory dedup ("sleep cycle" V1)
New table memory_merge_candidates + service functions to cluster near-duplicate active memories within (project, memory_type) buckets, draft a unified content via LLM, and merge on human approval. Source memories become superseded (never deleted); merged memory carries union of tags, max of confidence, sum of reference_count. - schema migration for memory_merge_candidates - atocore.memory.similarity: cosine + transitive clustering - atocore.memory._dedup_prompt: stdlib-only LLM prompt preserving every specific - service: merge_memories / create_merge_candidate / get_merge_candidates / reject_merge_candidate - scripts/memory_dedup.py: host-side detector (HTTP-only, idempotent) - 5 API endpoints under /admin/memory/merge-candidates* + /admin/memory/dedup-scan - triage UI: purple "🔗 Merge Candidates" section + "🔗 Scan for duplicates" bar - batch-extract.sh Step B3 (0.90 daily, 0.85 Sundays) - deploy/dalidou/dedup-watcher.sh for UI-triggered scans - 21 new tests (374 → 395) - docs/PHASE-7-MEMORY-CONSOLIDATION.md covering 7A-7H roadmap Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
110
deploy/dalidou/dedup-watcher.sh
Normal file
110
deploy/dalidou/dedup-watcher.sh
Normal file
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# deploy/dalidou/dedup-watcher.sh
|
||||
# -------------------------------
|
||||
# Host-side watcher for on-demand memory dedup scans (Phase 7A).
|
||||
#
|
||||
# The /admin/triage page has a "🔗 Scan for duplicates" button that POSTs
|
||||
# to /admin/memory/dedup-scan with {project, similarity_threshold, max_batch}.
|
||||
# The container writes this to project_state (atocore/config/dedup_requested_at).
|
||||
#
|
||||
# This script runs on the Dalidou HOST (where claude CLI lives), polls
|
||||
# for the flag, and runs memory_dedup.py when seen.
|
||||
#
|
||||
# Installed via cron every 2 minutes:
|
||||
# */2 * * * * /srv/storage/atocore/app/deploy/dalidou/dedup-watcher.sh \
|
||||
# >> /home/papa/atocore-logs/dedup-watcher.log 2>&1
|
||||
#
|
||||
# Mirrors deploy/dalidou/graduation-watcher.sh exactly.
|
||||
|
||||
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Base URL of the AtoCore API; a value already present in the environment wins.
ATOCORE_URL=${ATOCORE_URL:-http://127.0.0.1:8100}

# Application root: two directory levels above the directory holding this script.
SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")
APP_DIR=$(cd "$SCRIPT_DIR/../.." && pwd)

# Lock file that the flock step further down uses to prevent overlapping runs.
LOCK_FILE=/tmp/atocore-dedup.lock

# Directory that receives the per-run dedup logs; created if it is missing.
LOG_DIR=/home/papa/atocore-logs
mkdir -p "$LOG_DIR"
|
||||
|
||||
# UTC start time of this watcher run. It is published further down as the
# value of the dedup_last_started_at status entry.
TS="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

# log MESSAGE...
# Print MESSAGE on stdout, prefixed with the current UTC time in brackets.
# The time is taken when log is called rather than reusing $TS, so lines
# written after a long scan ("dedup finished", "dedup FAILED") are no longer
# stamped with the moment the run started.
log() { printf '[%s] %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"; }
|
||||
|
||||
# Fetch the "atocore" project state from the API. Any curl failure, including
# the 5-second timeout, is replaced by an empty JSON object.
STATE_JSON=$(curl -sSf --max-time 5 "$ATOCORE_URL/project/state/atocore" 2>/dev/null || echo "{}")

# Python snippet that reads the state JSON on stdin, walks the list found
# under "entries" (or "state" when "entries" is absent) and prints the value
# of the first entry whose category is "config" and key is
# "dedup_requested_at". Every exception is swallowed, so a malformed response
# simply produces no output.
FIND_REQUEST_PY='
import json
import sys

try:
    payload = json.load(sys.stdin)
    for entry in payload.get("entries", payload.get("state", [])):
        if entry.get("category") == "config" and entry.get("key") == "dedup_requested_at":
            print(entry.get("value", ""))
            break
except Exception:
    pass
'
REQUESTED=$(printf '%s\n' "$STATE_JSON" | python3 -c "$FIND_REQUEST_PY" 2>/dev/null || echo "")

# No pending request: nothing to do on this run.
[[ -n "$REQUESTED" ]] || exit 0
|
||||
|
||||
# parse_dedup_request REQUEST_JSON
# Decode the scan request stored in the dedup_requested_at flag and print
# exactly three lines on stdout, in this order:
#   1. similarity threshold (default 0.88)
#   2. max batch            (default 50)
#   3. project              (default empty)
# A field that is missing, null, boolean, or not numeric falls back to its
# default instead of being forwarded verbatim (a JSON null used to reach the
# dedup script as the literal argument "None"). A payload that is not a JSON
# object is treated as an empty request. Only the first line of a multi-line
# project value is kept, because the output is line-oriented.
# Returns python3's exit status; stderr is discarded.
parse_dedup_request() {
  printf '%s' "${1-}" | python3 -c '
import json
import math
import sys

DEFAULT_THRESHOLD = "0.88"
DEFAULT_MAX_BATCH = "50"

try:
    req = json.loads(sys.stdin.read() or "{}")
except ValueError:
    req = {}
if not isinstance(req, dict):
    req = {}


def threshold():
    raw = req.get("similarity_threshold")
    if raw is None or isinstance(raw, bool):
        return DEFAULT_THRESHOLD
    try:
        val = float(raw)
    except (TypeError, ValueError, OverflowError):
        return DEFAULT_THRESHOLD
    # Keep the caller-supplied spelling; only reject NaN / Infinity.
    return str(raw).strip() if math.isfinite(val) else DEFAULT_THRESHOLD


def max_batch():
    raw = req.get("max_batch")
    if raw is None or isinstance(raw, bool):
        return DEFAULT_MAX_BATCH
    try:
        return str(int(raw))
    except (TypeError, ValueError, OverflowError):
        return DEFAULT_MAX_BATCH


def project():
    raw = req.get("project")
    if not isinstance(raw, str):
        return ""
    lines = raw.splitlines()
    return lines[0] if lines else ""


print(threshold())
print(max_batch())
print(project())
' 2>/dev/null
}

# One python3 invocation decodes all three fields. If it fails outright
# (python3 missing, interpreter crash) every field takes its default.
REQUEST_FIELDS=$(parse_dedup_request "$REQUESTED") || REQUEST_FIELDS=""
{
  # read returns non-zero at end of input; the variable is still assigned
  # (possibly empty), so the failure is deliberately ignored under `set -e`.
  IFS= read -r THRESHOLD || true
  IFS= read -r MAX_BATCH || true
  IFS= read -r PROJECT || true
} <<<"$REQUEST_FIELDS"
THRESHOLD=${THRESHOLD:-0.88}
MAX_BATCH=${MAX_BATCH:-50}
|
||||
|
||||
# Acquire lock.
# Open the lock file for writing on file descriptor 9; if that fails, stop
# with exit status 0.
if ! exec 9>"$LOCK_FILE"; then
  exit 0
fi

# Take the lock without blocking. When flock reports failure, log that a
# dedup is already running and stop with exit status 0.
flock -n 9 || {
  log "dedup already running, skipping"
  exit 0
}
|
||||
|
||||
# Mark running.
#
# Once dedup_running is set to "1" below, it must be reset on every exit
# path. Without this trap, any abort under `set -e` after this point (for
# example a failing `cd`, or a failing python3 call while escaping the
# result) would leave the status entry stuck at "1".
#
# reset_dedup_running: best-effort POST of dedup_running="0". The request is
# capped at 5 seconds so a hung API cannot stall script exit, and failures
# are ignored. Posting "0" again after the script's own explicit reset is
# harmless because the write is idempotent.
reset_dedup_running() {
  curl -sSf --max-time 5 -X POST "$ATOCORE_URL/project/state" \
    -H 'Content-Type: application/json' \
    -d '{"project":"atocore","category":"status","key":"dedup_running","value":"0","source":"dedup watcher"}' \
    >/dev/null 2>&1 || true
}
trap reset_dedup_running EXIT

# Publish dedup_running="1"; failures are deliberately ignored (best effort).
curl -sSf -X POST "$ATOCORE_URL/project/state" \
  -H 'Content-Type: application/json' \
  -d "{\"project\":\"atocore\",\"category\":\"status\",\"key\":\"dedup_running\",\"value\":\"1\",\"source\":\"dedup watcher\"}" \
  >/dev/null 2>&1 || true

# Publish the start time of this run ($TS); also best effort.
curl -sSf -X POST "$ATOCORE_URL/project/state" \
  -H 'Content-Type: application/json' \
  -d "{\"project\":\"atocore\",\"category\":\"status\",\"key\":\"dedup_last_started_at\",\"value\":\"$TS\",\"source\":\"dedup watcher\"}" \
  >/dev/null 2>&1 || true
|
||||
|
||||
# Per-run log file, named after the current UTC date and time.
RUN_STAMP=$(date -u +%Y%m%d-%H%M%S)
LOG_FILE="$LOG_DIR/dedup-ondemand-$RUN_STAMP.log"
log "Starting dedup (project='$PROJECT' threshold=$THRESHOLD max_batch=$MAX_BATCH, log: $LOG_FILE)"

# Clear the flag BEFORE running so duplicate clicks queue at most one.
# The DELETE is best effort: its output is discarded and failure is ignored.
CLEAR_FLAG_BODY='{"project":"atocore","category":"config","key":"dedup_requested_at"}'
curl -sSf -X DELETE "$ATOCORE_URL/project/state" \
  -H 'Content-Type: application/json' \
  -d "$CLEAR_FLAG_BODY" \
  >/dev/null 2>&1 || true
|
||||
|
||||
# Run from the application root with the app's src/ directory placed first
# on PYTHONPATH.
cd "$APP_DIR"
export PYTHONPATH="$APP_DIR/src:${PYTHONPATH:-}"

# Arguments for the detector; --project is appended only when a project was
# requested.
ARGS=(
  --base-url "$ATOCORE_URL"
  --similarity-threshold "$THRESHOLD"
  --max-batch "$MAX_BATCH"
)
[[ -z "$PROJECT" ]] || ARGS+=(--project "$PROJECT")

# Run the detector, appending stdout and stderr to the per-run log, and keep
# its exit status without tripping `set -e`.
DEDUP_STATUS=0
python3 scripts/memory_dedup.py "${ARGS[@]}" >> "$LOG_FILE" 2>&1 || DEDUP_STATUS=$?

if (( DEDUP_STATUS != 0 )); then
  RESULT="ERROR — see $LOG_FILE"
  log "dedup FAILED"
else
  # Prefer the last "summary:" line of the log. With pipefail a grep that
  # matches nothing fails the pipeline, which triggers the fallback to the
  # last line of the log.
  RESULT=$(grep "^summary:" "$LOG_FILE" | tail -n 1 || tail -n 1 "$LOG_FILE")
  RESULT="${RESULT:-completed}"
  log "dedup finished: $RESULT"
fi
|
||||
|
||||
# UTC time at which the run finished.
FINISH_TS=$(date -u +%Y-%m-%dT%H:%M:%SZ)

# publish_status KEY VALUE
# Best-effort POST of one "status" entry for project "atocore". VALUE is
# placed into the JSON body verbatim, so the caller must pass text that is
# already safe inside a JSON string. Output is discarded and failures are
# ignored.
publish_status() {
  curl -sSf -X POST "$ATOCORE_URL/project/state" \
    -H 'Content-Type: application/json' \
    -d "{\"project\":\"atocore\",\"category\":\"status\",\"key\":\"$1\",\"value\":\"$2\",\"source\":\"dedup watcher\"}" \
    >/dev/null 2>&1 || true
}

publish_status dedup_running 0
publish_status dedup_last_finished_at "$FINISH_TS"

# JSON-escape the result text: json.dumps produces a quoted string and the
# [1:-1] slice drops the surrounding quotes.
SAFE_RESULT=$(printf '%s' "$RESULT" | python3 -c "import sys,json; print(json.dumps(sys.stdin.read())[1:-1])")
publish_status dedup_last_result "$SAFE_RESULT"
|
||||
Reference in New Issue
Block a user