diff --git a/deploy/dalidou/cron-backup.sh b/deploy/dalidou/cron-backup.sh index fd81bf3..059e104 100644 --- a/deploy/dalidou/cron-backup.sh +++ b/deploy/dalidou/cron-backup.sh @@ -18,13 +18,22 @@ # emails. Check /var/log/atocore-backup.log for diagnostics. # # Environment variables: -# ATOCORE_URL default http://127.0.0.1:8100 -# ATOCORE_BACKUP_CHROMA default false (set to "true" for cold chroma copy) +# ATOCORE_URL default http://127.0.0.1:8100 +# ATOCORE_BACKUP_CHROMA default false (set to "true" for cold chroma copy) +# ATOCORE_BACKUP_DIR default /srv/storage/atocore/backups +# ATOCORE_BACKUP_RSYNC optional rsync destination for off-host copies +# (e.g. papa@laptop:/home/papa/atocore-backups/) +# When set, the local snapshots tree is rsynced to +# the destination after cleanup. Unset = skip. +# SSH key auth must already be configured from this +# host to the destination. set -euo pipefail ATOCORE_URL="${ATOCORE_URL:-http://127.0.0.1:8100}" INCLUDE_CHROMA="${ATOCORE_BACKUP_CHROMA:-false}" +BACKUP_DIR="${ATOCORE_BACKUP_DIR:-/srv/storage/atocore/backups}" +RSYNC_TARGET="${ATOCORE_BACKUP_RSYNC:-}" TIMESTAMP="$(date -u +%Y-%m-%dT%H:%M:%SZ)" log() { printf '[%s] %s\n' "$TIMESTAMP" "$*"; } @@ -53,4 +62,24 @@ CLEANUP_RESULT=$(curl -sf -X POST \ } log "Cleanup result: $CLEANUP_RESULT" +# Step 3: Off-host rsync (optional). Fail-open: log but don't abort +# the cron so a laptop being offline at 03:00 UTC never turns the +# local backup path red. +if [[ -n "$RSYNC_TARGET" ]]; then + log "Step 3: rsyncing snapshots to $RSYNC_TARGET" + if [[ ! 
-d "$BACKUP_DIR/snapshots" ]]; then + log "WARN: $BACKUP_DIR/snapshots does not exist, skipping rsync" + else + RSYNC_OUTPUT=$(rsync -a --delete \ + -e "ssh -o ConnectTimeout=10 -o BatchMode=yes -o StrictHostKeyChecking=accept-new" \ + "$BACKUP_DIR/snapshots/" "$RSYNC_TARGET" 2>&1) && { + log "Rsync complete" + } || { + log "WARN: rsync to $RSYNC_TARGET failed (offline or auth?): $RSYNC_OUTPUT" + } + fi +else + log "Step 3: ATOCORE_BACKUP_RSYNC not set, skipping off-host copy" +fi + log "=== AtoCore daily backup complete ===" diff --git a/docs/next-steps.md b/docs/next-steps.md index a30a5cd..784941d 100644 --- a/docs/next-steps.md +++ b/docs/next-steps.md @@ -159,6 +159,44 @@ The next batch is successful if: - project ingestion remains controlled rather than noisy - the canonical Dalidou instance stays stable +## Retrieval Quality Review — 2026-04-11 + +First sweep with real project-hinted queries on Dalidou. Used +`POST /context/build` against p04, p05, p06 with representative +questions and inspected `formatted_context`. + +Findings: + +- **Trusted Project State is surfacing correctly.** The DECISION and + REQUIREMENT categories appear at the top of the pack and include + the expected key facts (e.g. p04 "Option B conical-back mirror + architecture"). This is the strongest signal in the pack today. +- **Chunk retrieval is relevant on-topic but broad.** Top chunks for + the p04 architecture query are PDR intro, CAD assembly overview, + and the index — all on the right project but none of them directly + answer the "why was Option B chosen" question. The authoritative + answer sits in Project State, not in the chunks. +- **Active memories are NOT reaching the pack.** The context builder + surfaces Trusted Project State and retrieved chunks but does not + include the 21 active project/knowledge memories. 
Reinforcement + (Phase 9 Commit B) bumps memory confidence without the memory ever + being read back into a prompt — the reflection loop has no outlet + on the retrieval side. This is a design gap, not a bug: needs a + decision on whether memories should feed into context assembly, + and if so at what trust level (below project_state, above chunks). +- **Cross-project bleed is low.** The p04 query did pull one p05 + chunk (CGH_Design_Input_for_AOM) as the bottom hit but the top-4 + were all p04. + +Proposed follow-ups (not yet scheduled): + +1. Decide whether memories should be folded into `formatted_context` + and under what section header. Candidate: a "--- Project Memories ---" + band between Trusted Project State and Retrieved Context, filtered + to active memories for the target project plus identity/preference. +2. Re-run the same three queries after any builder change and compare + `formatted_context` diffs. + ## Long-Run Goal The long-run target is: diff --git a/scripts/atocore_client.py b/scripts/atocore_client.py index 16e4768..d6a22c4 100644 --- a/scripts/atocore_client.py +++ b/scripts/atocore_client.py @@ -340,6 +340,22 @@ def build_parser() -> argparse.ArgumentParser: p = sub.add_parser("reject") p.add_argument("memory_id") + # batch-extract: fan out /interactions/{id}/extract?persist=true across + # recent interactions. Idempotent — the extractor create_memory path + # silently skips duplicates, so re-running is safe. + p = sub.add_parser("batch-extract") + p.add_argument("since", nargs="?", default="") + p.add_argument("project", nargs="?", default="") + p.add_argument("limit", nargs="?", type=int, default=100) + p.add_argument("persist", nargs="?", default="true") + + # triage: interactive candidate review loop. Fetches the queue, shows + # each candidate, accepts p/r/s (promote / reject / skip) / q (quit). 
def run_batch_extract(since: str, project: str, limit: int, persist_flag: str) -> dict:
    """Run the extractor over recent interactions and return a summary.

    Lists interactions with ``GET /interactions`` and then issues
    ``POST /interactions/{id}/extract`` once per listed interaction that
    carries a non-empty ``id``. A failure on one interaction is recorded
    in ``errors`` and does not stop the batch.

    NOTE(review): the original author states re-running is safe because
    the server skips duplicate memories; that is server-side behaviour
    and cannot be confirmed from this function.

    Args:
        since: Forwarded as the ``since`` query parameter; an empty
            string omits the filter.
        project: Forwarded as the ``project`` query parameter; an empty
            string omits the filter.
        limit: Forwarded as the ``limit`` query parameter.
        persist_flag: ``"1"``, ``"true"``, ``"yes"`` or ``"y"`` (case
            insensitive) enable persistence; any other value disables it.

    Returns:
        A dict with the keys ``processed``, ``total_candidates``,
        ``total_persisted``, ``persist``, ``errors`` and
        ``interactions_with_candidates``.
    """
    persist = persist_flag.lower() in {"1", "true", "yes", "y"}

    query_parts: list[str] = []
    if project:
        query_parts.append(f"project={urllib.parse.quote(project)}")
    if since:
        query_parts.append(f"since={urllib.parse.quote(since)}")
    query_parts.append(f"limit={int(limit)}")
    query = "?" + "&".join(query_parts)

    listing = request("GET", f"/interactions{query}")
    interactions = listing.get("interactions", []) if isinstance(listing, dict) else []

    processed = 0
    total_candidates = 0
    total_persisted = 0
    errors: list[dict] = []
    per_interaction: list[dict] = []

    for item in interactions:
        iid = item.get("id") or ""
        if not iid:
            continue
        try:
            result = request(
                "POST",
                f"/interactions/{urllib.parse.quote(iid, safe='')}/extract",
                {"persist": persist},
            )
        except Exception as exc:  # pragma: no cover - network errors land here
            errors.append({"interaction_id": iid, "error": str(exc)})
            continue
        # The listing response is type-checked above; apply the same care
        # here so one unexpected payload cannot abort the whole batch.
        if not isinstance(result, dict):
            errors.append(
                {
                    "interaction_id": iid,
                    "error": f"unexpected response type: {type(result).__name__}",
                }
            )
            continue

        processed += 1
        count = int(result.get("candidate_count", 0) or 0)
        persisted_ids = result.get("persisted_ids") or []
        total_candidates += count
        total_persisted += len(persisted_ids)
        if count:
            # Only interactions that produced candidates are itemised.
            per_interaction.append(
                {
                    "interaction_id": iid,
                    "candidate_count": count,
                    "persisted_count": len(persisted_ids),
                    "project": item.get("project") or "",
                }
            )

    return {
        "processed": processed,
        "total_candidates": total_candidates,
        "total_persisted": total_persisted,
        "persist": persist,
        "errors": errors,
        "interactions_with_candidates": per_interaction,
    }


def run_triage(memory_type: str, project: str, limit: int) -> int:
    """Interactively review candidate memories.

    Loads the queue once with ``GET /memory?status=candidate...``, shows
    each entry on stderr and prompts for (p)romote / (r)eject / (s)kip /
    (q)uit. Any answer other than promote, reject or quit counts as a
    skip. EOF on stdin is treated like quit. A JSON summary is emitted
    through ``print_json`` when the loop ends.

    A promote/reject call that raises is recorded in the summary's
    ``errors`` list and the session carries on with the next candidate,
    so one failure does not discard the decisions already made.

    Args:
        memory_type: Forwarded as the ``memory_type`` query parameter;
            an empty string omits the filter.
        project: Forwarded as the ``project`` query parameter; an empty
            string omits the filter.
        limit: Forwarded as the ``limit`` query parameter.

    Returns:
        ``0`` when the session finished without action errors (including
        the empty-queue case), ``1`` when at least one promote/reject
        attempt failed.
    """
    query_parts = ["status=candidate"]
    if memory_type:
        query_parts.append(f"memory_type={urllib.parse.quote(memory_type)}")
    if project:
        query_parts.append(f"project={urllib.parse.quote(project)}")
    query_parts.append(f"limit={int(limit)}")

    listing = request("GET", "/memory?" + "&".join(query_parts))
    memories = listing.get("memories", []) if isinstance(listing, dict) else []

    if not memories:
        print_json({"status": "empty_queue", "count": 0})
        return 0

    total = len(memories)
    promoted = 0
    rejected = 0
    skipped = 0
    stopped_early = False
    errors: list[dict] = []

    print(f"Triage queue: {total} candidate(s)\n", file=sys.stderr)
    for idx, mem in enumerate(memories, 1):
        mid = mem.get("id", "")
        header = (
            f"[{idx}/{total}] {mem.get('memory_type','?')} "
            f"project={mem.get('project','')} conf={mem.get('confidence','?')}"
        )
        print(header, file=sys.stderr)
        print(f"    id: {mid}", file=sys.stderr)
        print(f"    {mem.get('content','')}", file=sys.stderr)

        try:
            choice = input("  (p)romote / (r)eject / (s)kip / (q)uit > ").strip().lower()
        except EOFError:
            stopped_early = True
            break
        if choice in {"q", "quit"}:
            stopped_early = True
            break

        if choice in {"p", "promote"}:
            action = "promote"
        elif choice in {"r", "reject"}:
            action = "reject"
        else:
            skipped += 1
            print("  -> skipped", file=sys.stderr)
            continue

        if not mid:
            # Without an id the URL would collapse to /memory//<action>.
            errors.append({"memory_id": "", "action": action, "error": "candidate has no id"})
            print(f"  -> {action} failed: candidate has no id", file=sys.stderr)
            continue

        try:
            request("POST", f"/memory/{urllib.parse.quote(mid, safe='')}/{action}", {})
        except Exception as exc:  # pragma: no cover - network errors land here
            errors.append({"memory_id": mid, "action": action, "error": str(exc)})
            print(f"  -> {action} failed: {exc}", file=sys.stderr)
            continue

        if action == "promote":
            promoted += 1
            print("  -> promoted", file=sys.stderr)
        else:
            rejected += 1
            print("  -> rejected", file=sys.stderr)

    failed = len(errors)
    reviewed = promoted + rejected + skipped + failed
    print_json(
        {
            "reviewed": reviewed,
            "promoted": promoted,
            "rejected": rejected,
            "skipped": skipped,
            "failed": failed,
            "errors": errors,
            "stopped_early": stopped_early,
            # Loaded candidates that never received a decision. The one
            # on screen when the user quit is still undecided, so it is
            # counted here rather than subtracted.
            "remaining_in_queue": total - reviewed,
        }
    )
    return 1 if errors else 0
tokens echoed +# verbatim. Above this token count the matcher switches to an absolute +# overlap floor plus a softer fraction floor so paragraph-length memories +# still reinforce when the response genuinely uses them. +_LONG_MEMORY_TOKEN_COUNT = 15 +_LONG_MODE_MIN_OVERLAP = 12 +_LONG_MODE_MIN_FRACTION = 0.35 + DEFAULT_CONFIDENCE_DELTA = 0.02 @@ -188,9 +197,14 @@ def _tokenize(text: str) -> set[str]: def _memory_matches(memory_content: str, normalized_response: str) -> bool: """Return True if enough of the memory's tokens appear in the response. - Uses token-overlap: tokenize both sides (lowercase, stem, drop stop - words), then check whether >= 70 % of the memory's content tokens - appear in the response token set. + Dual-mode token overlap: + - Short memories (<= _LONG_MEMORY_TOKEN_COUNT stems): require + >= 70 % of memory tokens echoed. + - Long memories (paragraphs): require an absolute floor of + _LONG_MODE_MIN_OVERLAP distinct stems echoed AND a softer + fraction of _LONG_MODE_MIN_FRACTION, so organic paraphrase + of a real project memory can reinforce without the response + quoting the paragraph verbatim. 
def test_reinforce_long_memory_matches_on_absolute_overlap(tmp_data_dir):
    """Long memory + paraphrasing response -> the memory is reinforced.

    Stores a paragraph-length project memory, feeds in a response that
    restates most of it in different order, and expects the memory's id
    among the reinforcement results.
    """
    init_db()
    memory_text = (
        "Interferometer architecture: a folded-beam configuration with a "
        "fixed horizontal interferometer, a forty-five degree fold mirror, "
        "a six-DOF CGH stage, and the mirror on its own tilting platform. "
        "The fold mirror redirects the beam while the CGH shapes the wavefront."
    )
    response_text = (
        "For the interferometer we keep the folded-beam layout: horizontal "
        "interferometer, fold mirror at forty-five degrees, CGH stage with "
        "six DOF, and the mirror sitting on its tilting platform. The fold "
        "mirror redirects the beam and the CGH shapes the wavefront."
    )
    stored = create_memory(
        memory_type="project",
        content=memory_text,
        project="p05-interferometer",
        confidence=0.5,
    )
    outcome = reinforce_from_interaction(
        _make_interaction(project="p05-interferometer", response=response_text)
    )
    reinforced_ids = [entry.memory_id for entry in outcome]
    assert stored.id in reinforced_ids


def test_reinforce_long_memory_rejects_thin_overlap(tmp_data_dir):
    """Long memory + a response sharing only a few terms -> no reinforcement.

    Guards against a long memory being reinforced by a response that
    merely mentions a handful of its words.
    """
    init_db()
    memory_text = (
        "Polisher control system executes approved controller jobs, "
        "enforces state transitions and interlocks, supports pause "
        "resume and abort, and records auditable run logs while "
        "never reinterpreting metrology or inventing new strategies."
    )
    response_text = (
        "I updated the polisher docs and fixed a typo in the run logs section."
    )
    stored = create_memory(
        memory_type="project",
        content=memory_text,
        project="p06-polisher",
        confidence=0.5,
    )
    outcome = reinforce_from_interaction(
        _make_interaction(project="p06-polisher", response=response_text)
    )
    reinforced_ids = [entry.memory_id for entry in outcome]
    assert stored.id not in reinforced_ids