From 1a8fdf42259ae850e4986af9a3592d98a250c9ac Mon Sep 17 00:00:00 2001 From: Anto01 Date: Thu, 9 Apr 2026 09:13:21 -0400 Subject: [PATCH] fix: chroma restore bind-mount bug + consolidate docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes from the 2026-04-09 first real restore drill on Dalidou, plus the long-overdue doc consolidation I should have done when I added the drill runbook instead of creating a duplicate. ## Chroma restore bind-mount bug (drill finding) src/atocore/ops/backup.py: restore_runtime_backup() used to call shutil.rmtree(dst_chroma) before copying the snapshot back. In the Dockerized Dalidou deployment the chroma dir is a bind-mounted volume — you can't unlink a mount point, rmtree raises OSError [Errno 16] Device or resource busy and the restore silently fails to touch Chroma. This bit the first real drill; the operator worked around it with --no-chroma plus a manual cp -a. Fix: clear the destination's CONTENTS (iterdir + rmtree/unlink per child) and use copytree(dirs_exist_ok=True) so the mount point itself is never touched. Equivalent semantics, bind-mount-safe. Regression test: tests/test_backup.py::test_restore_chroma_does_not_unlink_destination_directory captures Path.stat().st_ino of the dest dir before and after restore and asserts they match. That's the same invariant a bind-mounted chroma dir enforces — if the inode changed, the mount would have failed. 11/11 backup tests now pass. ## Doc consolidation docs/backup-restore-drill.md existed as a duplicate of the authoritative docs/backup-restore-procedure.md. When I added the drill runbook in commit 3362080 I wrote it from scratch instead of updating the existing procedure — bad doc hygiene on a project that's literally about being a context engine. 
- Deleted docs/backup-restore-drill.md - Folded its contents into docs/backup-restore-procedure.md: - Replaced the manual sudo cp restore sequence with the new `python -m atocore.ops.backup restore --confirm-service-stopped` CLI - Added the one-shot docker compose run pattern for running restore inside a container that reuses the live volume mounts - Documented the --no-pre-snapshot / --no-chroma / --chroma flags - New "Chroma restore and bind-mounted volumes" subsection explaining the bug and the regression test that protects the fix - New "Restore drill" subsection with three levels (unit tests, module round-trip, live Dalidou drill) and the cadence list - Failure-mode table gained four entries: restored_integrity_ok, Device-or-resource-busy, drill marker still present, chroma_snapshot_missing - "Open follow-ups" struck the restore_runtime_backup item (done) and added a "Done (historical)" note referencing 2026-04-09 - Quickstart cheat sheet now has a full drill one-liner using memory_type=episodic (the 2026-04-09 drill found the runbook's memory_type=note was invalid — the valid set is identity, preference, project, episodic, knowledge, adaptation) ## Status doc sync Long overdue — I've been landing code without updating the project's narrative state docs. 
docs/current-state.md: - "Reliability Baseline" now reflects: restore_runtime_backup is real with CLI, pre-restore safety snapshot, WAL cleanup, integrity check; live drill on 2026-04-09 surfaced and fixed Chroma bind-mount bug; deploy provenance via /health build_sha; deploy.sh self-update re-exec guard - "Immediate Next Focus" reshuffled: drill re-run (priority 1) and auto-capture (priority 2) are now ahead of retrieval quality work, reflecting the updated unblock sequence docs/next-steps.md: - New item 1: re-run the drill with chroma working end-to-end - New item 2: auto-capture conservative mode (Stop hook) - Old item 7 rewritten as item 9 listing what's DONE (create/list/validate/restore, admin/backup endpoint with include_chroma, /health provenance, self-update guard, procedure doc with failure modes) and what's still pending (retention cleanup, off-Dalidou target, auto-validation) ## Test count 226 passing (was 225 + 1 new inode-stability regression test). Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/backup-restore-drill.md | 296 ------------------------------- docs/backup-restore-procedure.md | 280 ++++++++++++++++++----------- docs/current-state.md | 48 +++-- docs/next-steps.md | 60 +++++-- src/atocore/ops/backup.py | 19 +- tests/test_backup.py | 59 ++++++ 6 files changed, 331 insertions(+), 431 deletions(-) delete mode 100644 docs/backup-restore-drill.md diff --git a/docs/backup-restore-drill.md b/docs/backup-restore-drill.md deleted file mode 100644 index 4a88ab6..0000000 --- a/docs/backup-restore-drill.md +++ /dev/null @@ -1,296 +0,0 @@ -# Backup / Restore Drill - -## Purpose - -Before turning on any automation that writes to AtoCore continuously -(auto-capture of Claude Code sessions, automated source ingestion, -reinforcement sweeps), we need to know — with certainty — that a -backup can actually be restored. A backup you've never restored is -not a backup; it's a file that happens to be named that way. 
- -This runbook walks through the canonical drill: take a snapshot, -mutate live state, stop the service, restore from the snapshot, -start the service, and verify the mutation is reversed. When the -drill passes, the runtime store has a trustworthy rollback. - -## What gets backed up - -`src/atocore/ops/backup.py::create_runtime_backup()` writes the -following into `$ATOCORE_BACKUP_DIR/snapshots//`: - -| Component | How | Hot/Cold | Notes | -|---|---|---|---| -| SQLite (`atocore.db`) | `conn.backup()` online API | **hot** | Safe with service running; self-contained main file, no WAL sidecar. | -| Project registry JSON | file copy | cold | Only if the file exists. | -| Chroma vector store | `shutil.copytree` | **cold** | Only when `include_chroma=True`. Caller must hold `exclusive_ingestion()` so nothing writes during the copy — the `POST /admin/backup?include_chroma=true` route does this automatically. | -| `backup-metadata.json` | JSON blob | — | Records paths, sizes, and whether Chroma was included. Restore reads this to know what to pull back. | - -Things that are **not** in the backup and must be handled separately: - -- The `.env` file under `deploy/dalidou/` — secrets live out of git - and out of the backup on purpose. The operator must re-place it - on any fresh host. -- The source content under `sources/vault` and `sources/drive` — - these are read-only inputs by convention, owned by AtoVault / - AtoDrive, and backed up there. -- Any running transient state (in-flight HTTP requests, ingestion - queues). Stop the service cleanly if you care about those. - -## What restore does - -`restore_runtime_backup(stamp, confirm_service_stopped=True)`: - -1. **Validates** the backup first via `validate_backup()` — - refuses to run on any error (missing metadata, corrupt snapshot - db, etc.). -2. **Takes a pre-restore safety snapshot** of the current state - (SQLite only, not Chroma — to keep it fast) and returns its - stamp. 
This is the reversibility guarantee: if the restore was - the wrong call, you can roll it back by restoring the - pre-restore snapshot. -3. **Forces a WAL checkpoint** on the current db - (`PRAGMA wal_checkpoint(TRUNCATE)`) to flush any lingering - writes and release OS file handles on `-wal`/`-shm`, so the - copy step won't race a half-open sqlite connection. -4. **Removes stale WAL/SHM sidecars** next to the target db. - The snapshot `.db` is a self-contained main-file image with no - WAL of its own; leftover `-wal` from the old live process - would desync against the restored main file. -5. **Copies the snapshot db** over the live db path. -6. **Restores the registry JSON** if the snapshot captured one. -7. **Restores the Chroma tree** if the snapshot captured one and - `include_chroma` resolves to true (defaults to whether the - snapshot has Chroma). -8. **Runs `PRAGMA integrity_check`** on the restored db and - reports the result alongside a summary of what was touched. - -If `confirm_service_stopped` is not passed, the function refuses — -this is deliberate. Hot-restoring into a running service is not -supported and would corrupt state. - -## The drill - -Run this from a Dalidou host with the AtoCore container already -deployed and healthy. The whole drill takes under two minutes. It -does not touch source content or disturb any `.env` secrets. - -### Step 1. Capture a snapshot via the HTTP API - -The running service holds the db; use the admin route so the -Chroma snapshot is taken under `exclusive_ingestion()`. The -endpoint takes a JSON body (not a query string): - -```bash -curl -fsS -X POST 'http://127.0.0.1:8100/admin/backup' \ - -H 'Content-Type: application/json' \ - -d '{"include_chroma": true}' \ - | python3 -m json.tool -``` - -Record the `backup_root` and note the stamp (the last path segment, -e.g. `20260409T012345Z`). That stamp is the input to the restore -step. - -### Step 2. 
Record a known piece of live state - -Pick something small and unambiguous to use as a marker. The -simplest is the current health snapshot plus a memory count: - -```bash -curl -fsS 'http://127.0.0.1:8100/health' | python3 -m json.tool -``` - -Note the `memory_count`, `interaction_count`, and `build_sha`. These -are your pre-drill baseline. - -### Step 3. Mutate live state AFTER the backup - -Write something the restore should reverse. Any write endpoint is -fine — a throwaway test memory is the cleanest. The request body -must include `memory_type` (the AtoCore memory schema requires it): - -```bash -curl -fsS -X POST 'http://127.0.0.1:8100/memory' \ - -H 'Content-Type: application/json' \ - -d '{ - "memory_type": "note", - "content": "DRILL-MARKER: this memory should not survive the restore", - "project": "drill", - "confidence": 1.0 - }' \ - | python3 -m json.tool -``` - -Record the returned `id`. Confirm it's there: - -```bash -curl -fsS 'http://127.0.0.1:8100/health' | python3 -m json.tool -# memory_count should be baseline + 1 - -# And you can list the drill-project memories directly: -curl -fsS 'http://127.0.0.1:8100/memory?project=drill' | python3 -m json.tool -# should return the DRILL-MARKER memory -``` - -### Step 4. Stop the service - -```bash -cd /srv/storage/atocore/app/deploy/dalidou -docker compose down -``` - -Wait for the container to actually exit: - -```bash -docker compose ps -# atocore should be gone or Exited -``` - -### Step 5. Restore from the snapshot - -Run the restore inside a one-shot container that reuses the same -volumes as the live service. This guarantees the paths resolve -identically to the running container's view. 
- -```bash -cd /srv/storage/atocore/app/deploy/dalidou -docker compose run --rm --entrypoint python atocore \ - -m atocore.ops.backup restore \ - \ - --confirm-service-stopped -``` - -The output is JSON; the important fields are: - -- `pre_restore_snapshot`: stamp of the safety snapshot of live - state at the moment of restore. **Write this down.** If the - restore turns out to be the wrong call, this is how you roll - it back. -- `db_restored`: `true` -- `registry_restored`: `true` if the backup had a registry -- `chroma_restored`: `true` if the backup had a chroma snapshot -- `restored_integrity_ok`: **must be `true`** — if this is false, - STOP and do not start the service; investigate the integrity - error first. - -If restoration fails at any step, the function raises a clean -`RuntimeError` and nothing partial is committed past the main file -swap. The pre-restore safety snapshot is your rollback anchor. - -### Step 6. Start the service back up - -```bash -cd /srv/storage/atocore/app/deploy/dalidou -docker compose up -d -``` - -Wait for `/health` to respond: - -```bash -for i in 1 2 3 4 5 6 7 8 9 10; do - curl -fsS 'http://127.0.0.1:8100/health' \ - && break || { echo "not ready ($i/10)"; sleep 3; } -done -``` - -### Step 7. 
Verify the drill marker is gone - -```bash -curl -fsS 'http://127.0.0.1:8100/health' | python3 -m json.tool -# memory_count should equal the Step 2 baseline, NOT baseline + 1 -``` - -You can also list the drill-project memories directly: - -```bash -curl -fsS 'http://127.0.0.1:8100/memory?project=drill' | python3 -m json.tool -# should return an empty list — the DRILL-MARKER memory was rolled back -``` - -For a semantic-retrieval cross-check, issue a query (the `/query` -endpoint takes `prompt`, not `query`): - -```bash -curl -fsS -X POST 'http://127.0.0.1:8100/query' \ - -H 'Content-Type: application/json' \ - -d '{"prompt": "DRILL-MARKER drill marker", "top_k": 5}' \ - | python3 -m json.tool -# should not return the DRILL-MARKER memory in the hits -``` - -If the marker is gone and `memory_count` matches the baseline, the -drill **passed**. The runtime store has a trustworthy rollback. - -### Step 8. (Optional) Clean up the safety snapshot - -If everything went smoothly you can leave the pre-restore safety -snapshot on disk for a few days as a paranoia buffer. There's no -automatic cleanup yet — `list_runtime_backups()` will show it, and -you can remove it by hand once you're confident: - -```bash -rm -rf /srv/storage/atocore/backups/snapshots/ -``` - -## Failure modes and recovery - -### Restore reports `restored_integrity_ok: false` - -The copied db failed `PRAGMA integrity_check`. Do **not** start -the service. This usually means either the source snapshot was -itself corrupt (and `validate_backup` should have caught it — file -a bug if it didn't), or the copy was interrupted. Options: - -1. Validate the source snapshot directly: - `python -m atocore.ops.backup validate ` -2. Pick a different, older snapshot and retry the restore. -3. Roll the db back to your pre-restore safety snapshot. 
- -### The live container won't start after restore - -Check the container logs: - -```bash -cd /srv/storage/atocore/app/deploy/dalidou -docker compose logs --tail=100 atocore -``` - -Common causes: - -- Schema drift between the snapshot and the current code version. - `_apply_migrations` in `src/atocore/models/database.py` is - idempotent and should absorb most forward migrations, but a - backward restore (running new code against an older snapshot) - may hit unexpected state. The migration only ADDs columns, so - the opposite direction is usually safe, but verify. -- Chroma and SQLite disagreeing about what chunks exist. The - backup captures them together to minimize this, but if you - restore SQLite without Chroma (`--no-chroma`), retrieval may - return stale vectors. Re-ingest if this happens. - -### The drill marker is still present after restore - -Something went wrong. Possible causes: - -- You restored a snapshot taken AFTER the drill marker was - written (wrong stamp). -- The service was writing during the drill and committed the - marker before `docker compose down`. Double-check the order. -- The restore silently skipped the db step. Check the restore - output for `db_restored: true` and `restored_integrity_ok: true`. - -Roll back to the pre-restore safety snapshot and retry with the -correct source snapshot. - -## When to run this drill - -- **Before** enabling any new write-path automation (auto-capture, - automated ingestion, reinforcement sweeps, scheduled extraction). -- **After** any change to `src/atocore/ops/backup.py` or the - schema migrations in `src/atocore/models/database.py`. -- **After** a Dalidou OS upgrade or docker version bump. -- **Monthly** as a standing operational check. - -Record each drill run (pass/fail) somewhere durable — even a line -in the project journal is enough. A drill you ran once and never -again is barely more than a drill you never ran. 
diff --git a/docs/backup-restore-procedure.md b/docs/backup-restore-procedure.md index a6e427b..032d5a8 100644 --- a/docs/backup-restore-procedure.md +++ b/docs/backup-restore-procedure.md @@ -146,141 +146,181 @@ of bytes. ## Restore procedure +Since 2026-04-09 the restore is implemented as a proper module +function plus CLI entry point: `restore_runtime_backup()` in +`src/atocore/ops/backup.py`, invoked as +`python -m atocore.ops.backup restore --confirm-service-stopped`. +It automatically takes a pre-restore safety snapshot (your rollback +anchor), handles SQLite WAL/SHM cleanly, restores the registry, and +runs `PRAGMA integrity_check` on the restored db. This replaces the +earlier manual `sudo cp` sequence. + +The function refuses to run without `--confirm-service-stopped`. +This is deliberate: hot-restoring into a running service corrupts +SQLite state. + ### Pre-flight (always) 1. Identify which snapshot you want to restore. List available snapshots and pick by timestamp: ```bash - curl -fsS http://dalidou:8100/admin/backup | jq '.backups[].stamp' + curl -fsS http://127.0.0.1:8100/admin/backup | jq '.backups[].stamp' ``` 2. Validate it. Refuse to restore an invalid backup: ```bash - STAMP=20260407T060000Z - curl -fsS http://dalidou:8100/admin/backup/$STAMP/validate | jq . + STAMP=20260409T060000Z + curl -fsS http://127.0.0.1:8100/admin/backup/$STAMP/validate | jq . ``` 3. **Stop AtoCore.** SQLite cannot be hot-restored under a running process and Chroma will not pick up new files until the process restarts. ```bash - docker compose stop atocore - # or: sudo systemctl stop atocore - ``` -4. **Take a safety snapshot of the current state** before overwriting - it. This is your "if the restore makes things worse, here's the - undo" backup. 
- ```bash - PRESERVE_STAMP=$(date -u +%Y%m%dT%H%M%SZ) - sudo cp /srv/storage/atocore/data/db/atocore.db \ - /srv/storage/atocore/backups/pre-restore-$PRESERVE_STAMP.db - sudo cp /srv/storage/atocore/config/project-registry.json \ - /srv/storage/atocore/backups/pre-restore-$PRESERVE_STAMP.registry.json 2>/dev/null || true + cd /srv/storage/atocore/app/deploy/dalidou + docker compose down + docker compose ps # atocore should be Exited/gone ``` -### Restore the SQLite database +### Run the restore + +Use a one-shot container that reuses the live service's volume +mounts so every path (`db_path`, `chroma_path`, backup dir) resolves +to the same place the main service would see: ```bash -SNAPSHOT_DIR=/srv/storage/atocore/backups/snapshots/$STAMP -sudo cp $SNAPSHOT_DIR/db/atocore.db \ - /srv/storage/atocore/data/db/atocore.db -sudo chown 1000:1000 /srv/storage/atocore/data/db/atocore.db -sudo chmod 600 /srv/storage/atocore/data/db/atocore.db +cd /srv/storage/atocore/app/deploy/dalidou +docker compose run --rm --entrypoint python atocore \ + -m atocore.ops.backup restore \ + $STAMP \ + --confirm-service-stopped ``` -The chown should match the gitea/atocore container user. Verify -by checking the existing perms before overwriting: +Output is a JSON document. The critical fields: -```bash -stat -c '%U:%G %a' /srv/storage/atocore/data/db/atocore.db -``` +- `pre_restore_snapshot`: stamp of the safety snapshot of live + state taken right before the restore. **Write this down.** If + the restore was the wrong call, this is how you roll it back. +- `db_restored`: should be `true` +- `registry_restored`: `true` if the backup captured a registry +- `chroma_restored`: `true` if the backup captured a chroma tree + and include_chroma resolved to true (default) +- `restored_integrity_ok`: **must be `true`** — if this is false, + STOP and do not start the service; investigate the integrity + error first. The restored file is still on disk but untrusted. 
-### Restore the project registry +### Controlling the restore -```bash -if [ -f $SNAPSHOT_DIR/config/project-registry.json ]; then - sudo cp $SNAPSHOT_DIR/config/project-registry.json \ - /srv/storage/atocore/config/project-registry.json - sudo chown 1000:1000 /srv/storage/atocore/config/project-registry.json - sudo chmod 644 /srv/storage/atocore/config/project-registry.json -fi -``` +The CLI supports a few flags for finer control: -If the snapshot does not contain a registry, the current registry is -preserved. The pre-flight safety copy still gives you a recovery path -if you need to roll back. +- `--no-pre-snapshot` skips the pre-restore safety snapshot. Use + this only when you know you have another rollback path. +- `--no-chroma` restores only SQLite + registry, leaving the + current Chroma dir alone. Useful if Chroma is consistent but + SQLite needs a rollback. +- `--chroma` forces Chroma restoration even if the metadata + doesn't clearly indicate the snapshot has it (rare). -### Restore the Chroma vector store (if it was in the snapshot) +### Chroma restore and bind-mounted volumes -```bash -if [ -d $SNAPSHOT_DIR/chroma ]; then - # Move the current chroma dir aside as a safety copy - sudo mv /srv/storage/atocore/data/chroma \ - /srv/storage/atocore/data/chroma.pre-restore-$PRESERVE_STAMP +The Chroma dir on Dalidou is a bind-mounted Docker volume. The +restore cannot `rmtree` the destination (you can't unlink a mount +point — it raises `OSError [Errno 16] Device or resource busy`), +so the function clears the dir's CONTENTS and uses +`copytree(dirs_exist_ok=True)` to copy the snapshot back in. The +regression test `test_restore_chroma_does_not_unlink_destination_directory` +in `tests/test_backup.py` captures the destination inode before +and after restore and asserts it's stable — the same invariant +that protects the bind mount. 
- # Copy the snapshot in - sudo cp -a $SNAPSHOT_DIR/chroma /srv/storage/atocore/data/chroma - sudo chown -R 1000:1000 /srv/storage/atocore/data/chroma -fi -``` - -If the snapshot does NOT contain a Chroma dir but the SQLite -restore would leave the vector store and the SQL store inconsistent -(e.g. SQL has chunks the vector store doesn't), you have two -options: - -- **Option 1: rebuild the vector store from source documents.** Run - ingestion fresh after the SQL restore. This regenerates embeddings - from the actual source files. Slow but produces a perfectly - consistent state. -- **Option 2: accept the inconsistency and live with stale-vector - filtering.** The retriever already drops vector results whose - SQL row no longer exists (`_existing_chunk_ids` filter), so the - inconsistency surfaces as missing results, not bad ones. - -For an unplanned restore, Option 2 is the right immediate move. -Then schedule a fresh ingestion pass to rebuild the vector store -properly. +This was discovered during the first real Dalidou restore drill +on 2026-04-09. If you see a new restore failure with +`Device or resource busy`, something has regressed this fix. ### Restart AtoCore ```bash -docker compose up -d atocore -# or: sudo systemctl start atocore +cd /srv/storage/atocore/app/deploy/dalidou +docker compose up -d +# Wait for /health to come up +for i in 1 2 3 4 5 6 7 8 9 10; do + curl -fsS http://127.0.0.1:8100/health \ + && break || { echo "not ready ($i/10)"; sleep 3; } +done ``` ### Post-restore verification ```bash # 1. Service is healthy -curl -fsS http://dalidou:8100/health | jq . +curl -fsS http://127.0.0.1:8100/health | jq . # 2. Stats look right -curl -fsS http://dalidou:8100/stats | jq . +curl -fsS http://127.0.0.1:8100/stats | jq . # 3. Project registry loads -curl -fsS http://dalidou:8100/projects | jq '.projects | length' +curl -fsS http://127.0.0.1:8100/projects | jq '.projects | length' # 4. 
A known-good context query returns non-empty results -curl -fsS -X POST http://dalidou:8100/context/build \ +curl -fsS -X POST http://127.0.0.1:8100/context/build \ -H "Content-Type: application/json" \ -d '{"prompt": "what is p05 about", "project": "p05-interferometer"}' | jq '.chunks_used' ``` If any of these are wrong, the restore is bad. Roll back using the -pre-restore safety copy: +pre-restore safety snapshot whose stamp you recorded from the +restore output. The rollback is the same procedure — stop the +service and restore that stamp: ```bash -docker compose stop atocore -sudo cp /srv/storage/atocore/backups/pre-restore-$PRESERVE_STAMP.db \ - /srv/storage/atocore/data/db/atocore.db -sudo cp /srv/storage/atocore/backups/pre-restore-$PRESERVE_STAMP.registry.json \ - /srv/storage/atocore/config/project-registry.json 2>/dev/null || true -# If you also restored chroma: -sudo rm -rf /srv/storage/atocore/data/chroma -sudo mv /srv/storage/atocore/data/chroma.pre-restore-$PRESERVE_STAMP \ - /srv/storage/atocore/data/chroma -docker compose up -d atocore +docker compose down +docker compose run --rm --entrypoint python atocore \ + -m atocore.ops.backup restore \ + $PRE_RESTORE_SNAPSHOT_STAMP \ + --confirm-service-stopped \ + --no-pre-snapshot +docker compose up -d ``` +(`--no-pre-snapshot` because the rollback itself doesn't need one; +you already have the original snapshot as a fallback if everything +goes sideways.) + +### Restore drill + +The restore is exercised at three levels: + +1. **Unit tests.** `tests/test_backup.py` has seven restore tests + (refuse-without-confirm, invalid backup, full round-trip, + Chroma round-trip, inode-stability regression, WAL sidecar + cleanup, skip-pre-snapshot). These run in CI on every commit. +2. 
**Module-level round-trip.** + `test_restore_round_trip_reverses_post_backup_mutations` is + the canonical drill in code form: seed baseline, snapshot, + mutate, restore, assert mutation reversed + baseline survived + + pre-restore snapshot captured the mutation. +3. **Live drill on Dalidou.** Periodically run the full procedure + against the real service with a disposable drill-marker + memory (created via `POST /memory` with `memory_type=episodic` + and `project=drill`), following the sequence above and then + verifying the marker is gone afterward via + `GET /memory?project=drill`. The first such drill on + 2026-04-09 surfaced the bind-mount bug; future runs + primarily exist to verify the fix stays fixed. + +Run the live drill: + +- **Before** enabling any new write-path automation (auto-capture, + automated ingestion, reinforcement sweeps). +- **After** any change to `src/atocore/ops/backup.py` or to + schema migrations in `src/atocore/models/database.py`. +- **After** a Dalidou OS upgrade or docker version bump. +- **At least once per quarter** as a standing operational check. +- **After any incident** that touched the storage layer. + +Record each drill run (stamp, pre-restore snapshot stamp, pass/fail, +any surprises) somewhere durable — a line in the project journal +or a git commit message is enough. A drill you ran once and never +again is barely more than a drill you never ran. + ## Retention policy - **Last 7 daily backups**: kept verbatim @@ -296,32 +336,26 @@ A simple cron-based cleanup script is the next step: 0 4 * * * /srv/storage/atocore/scripts/cleanup-old-backups.sh ``` -## Drill schedule - -A backup that has never been restored is theoretical. The schedule: - -- **At least once per quarter**, perform a full restore drill on a - staging environment (or a temporary container with a separate - data dir) and verify the post-restore checks pass. 
-- **After every breaking schema migration**, perform a restore drill - to confirm the migration is reversible. -- **After any incident** that touched the storage layer (the EXDEV - bug from April 2026 is a good example), confirm the next backup - validates clean. - ## Common failure modes and what to do about them | Symptom | Likely cause | Action | |---|---|---| | `db_integrity_check_failed` on validation | SQLite snapshot copied while a write was in progress, or disk corruption | Take a fresh backup and validate again. If it fails twice, suspect the underlying disk. | | `registry_invalid_json` | Registry was being edited at backup time | Take a fresh backup. The registry is small so this is cheap. | -| `chroma_snapshot_missing` after a restore | Snapshot was DB-only and the restore didn't move the existing chroma dir | Either rebuild via fresh ingestion or restore an older snapshot that includes Chroma. | +| Restore: `restored_integrity_ok: false` | Source snapshot was itself corrupt (validation should have caught it — file a bug) or copy was interrupted mid-write | Do NOT start the service. Validate the snapshot directly with `python -m atocore.ops.backup validate `, try a different older snapshot, or roll back to the pre-restore safety snapshot. | +| Restore: `OSError [Errno 16] Device or resource busy` on Chroma | Old code tried to `rmtree` the Chroma mount point. Fixed on 2026-04-09 by clearing the destination's contents and using `copytree(dirs_exist_ok=True)`; the regression test `test_restore_chroma_does_not_unlink_destination_directory` guards the fix | Ensure you're running a build that includes the 2026-04-09 fix; if you need to work around an older build, use `--no-chroma` and restore Chroma contents manually. | +| `chroma_snapshot_missing` after a restore | Snapshot was DB-only | Either rebuild via fresh ingestion or restore an older snapshot that includes Chroma. | | Service won't start after restore | Permissions wrong on the restored files | Re-run `chown 1000:1000` (or whatever the gitea/atocore container user is) on the data dir. 
| | `/stats` returns 0 documents after restore | The SQL store was restored but the source paths in `source_documents` don't match the current Dalidou paths | This means the backup came from a different deployment. Don't trust this restore — it's pulling from the wrong layout. | +| Drill marker still present after restore | Wrong stamp, service still writing during `docker compose down`, or the restore JSON didn't report `db_restored: true` | Roll back via the pre-restore safety snapshot and retry with the correct source snapshot. | ## Open follow-ups (not yet implemented) -1. **Retention cleanup script**: see the cron entry above. +Tracked separately in `docs/next-steps.md` — the list below is the +backup-specific subset. + +1. **Retention cleanup script**: see the cron entry above. The + snapshots directory grows monotonically until this exists. 2. **Off-Dalidou backup target**: currently snapshots live on the same disk as the live data. A real disaster-recovery story needs at least one snapshot on a different physical machine. @@ -338,23 +372,59 @@ A backup that has never been restored is theoretical. The schedule: improvement would be incremental snapshots via filesystem-level snapshotting (LVM, btrfs, ZFS). +**Done** (kept for historical reference): + +- ~~Implement `restore_runtime_backup()` as a proper module + function so the restore isn't a manual `sudo cp` dance~~ — + landed 2026-04-09 in commit 3362080, followed by the + Chroma bind-mount fix from the first real drill. 
+ ## Quickstart cheat sheet ```bash # Daily backup (DB + registry only — fast) -curl -fsS -X POST http://dalidou:8100/admin/backup \ +curl -fsS -X POST http://127.0.0.1:8100/admin/backup \ -H "Content-Type: application/json" -d '{}' # Weekly backup (DB + registry + Chroma — slower, holds ingestion lock) -curl -fsS -X POST http://dalidou:8100/admin/backup \ +curl -fsS -X POST http://127.0.0.1:8100/admin/backup \ -H "Content-Type: application/json" -d '{"include_chroma": true}' # List backups -curl -fsS http://dalidou:8100/admin/backup | jq '.backups[].stamp' +curl -fsS http://127.0.0.1:8100/admin/backup | jq '.backups[].stamp' # Validate the most recent backup -LATEST=$(curl -fsS http://dalidou:8100/admin/backup | jq -r '.backups[-1].stamp') -curl -fsS http://dalidou:8100/admin/backup/$LATEST/validate | jq . +LATEST=$(curl -fsS http://127.0.0.1:8100/admin/backup | jq -r '.backups[-1].stamp') +curl -fsS http://127.0.0.1:8100/admin/backup/$LATEST/validate | jq . -# Full restore — see the "Restore procedure" section above +# Full restore (service must be stopped first) +cd /srv/storage/atocore/app/deploy/dalidou +docker compose down +docker compose run --rm --entrypoint python atocore \ + -m atocore.ops.backup restore $STAMP --confirm-service-stopped +docker compose up -d + +# Live drill: exercise the full create -> mutate -> restore flow +# against the running service. The marker memory uses +# memory_type=episodic (valid types: identity, preference, project, +# episodic, knowledge, adaptation) and project=drill so it's easy +# to find via GET /memory?project=drill before and after. +# +# See the "Restore drill" section above for the full sequence. 
+STAMP=$(curl -fsS -X POST http://127.0.0.1:8100/admin/backup \ + -H 'Content-Type: application/json' \ + -d '{"include_chroma": true}' | jq -r '.backup_root' | awk -F/ '{print $NF}') + +curl -fsS -X POST http://127.0.0.1:8100/memory \ + -H 'Content-Type: application/json' \ + -d '{"memory_type":"episodic","content":"DRILL-MARKER","project":"drill","confidence":1.0}' + +cd /srv/storage/atocore/app/deploy/dalidou +docker compose down +docker compose run --rm --entrypoint python atocore \ + -m atocore.ops.backup restore $STAMP --confirm-service-stopped +docker compose up -d + +# Marker should be gone: +curl -fsS 'http://127.0.0.1:8100/memory?project=drill' | jq . ``` diff --git a/docs/current-state.md b/docs/current-state.md index 6bafb66..61b22fe 100644 --- a/docs/current-state.md +++ b/docs/current-state.md @@ -200,10 +200,30 @@ The runtime has now been hardened in a few practical ways: - SQLite connections use a configurable busy timeout - SQLite uses WAL mode to reduce transient lock pain under normal concurrent use - project registry writes are atomic file replacements rather than in-place rewrites -- a first runtime backup path now exists for: - - SQLite - - project registry +- a full runtime backup and restore path now exists and has been exercised on + live Dalidou: + - SQLite (hot online backup via `conn.backup()`) + - project registry (file copy) + - Chroma vector store (cold directory copy under `exclusive_ingestion()`) - backup metadata + - `restore_runtime_backup()` with CLI entry point + (`python -m atocore.ops.backup restore + --confirm-service-stopped`), pre-restore safety snapshot for + rollback, WAL/SHM sidecar cleanup, `PRAGMA integrity_check` + on the restored file + - the first live drill on 2026-04-09 surfaced and fixed a Chroma + restore bug on Docker bind-mounted volumes (`shutil.rmtree` + on a mount point); a regression test now asserts the + destination inode is stable across restore +- deploy provenance is visible end-to-end: + - `/health` 
reports `build_sha`, `build_time`, `build_branch` + from env vars wired by `deploy.sh` + - `deploy.sh` Step 6 verifies the live `build_sha` matches the + just-built commit (exit code 6 on drift) so "live is current?" + can be answered precisely, not just by `__version__` + - `deploy.sh` Step 1.5 detects that the script itself changed + in the pulled commit and re-execs into the fresh copy, so + the deploy never silently runs the old script against new source This does not eliminate every concurrency edge, but it materially improves the current operational baseline. @@ -224,15 +244,19 @@ This separation is healthy: ## Immediate Next Focus -1. Use the new T420-side organic routing layer in real OpenClaw workflows -2. Tighten retrieval quality for the now fully ingested active project corpora -3. Move to Wave 2 trusted-operational ingestion instead of blindly widening raw corpus further -4. Keep the new engineering-knowledge architecture docs as implementation guidance while avoiding premature schema work -5. Expand the boring operations baseline: - - restore validation - - Chroma rebuild / backup policy - - retention -6. Only later consider write-back, reflection, or deeper autonomous behaviors +1. Re-run the full backup/restore drill on Dalidou with the + Chroma bind-mount fix in place (end-to-end green, not the + partial pass from 2026-04-09) +2. Turn on auto-capture of Claude Code sessions in conservative + mode now that the restore path is trustworthy +3. Use the new T420-side organic routing layer in real OpenClaw workflows +4. Tighten retrieval quality for the now fully ingested active project corpora +5. Move to Wave 2 trusted-operational ingestion instead of blindly widening raw corpus further +6. Keep the new engineering-knowledge architecture docs as implementation guidance while avoiding premature schema work +7. Expand the remaining boring operations baseline: + - retention policy cleanup script + - off-Dalidou backup target (rsync or similar) +8. 
Only later consider write-back, reflection, or deeper autonomous behaviors See also: diff --git a/docs/next-steps.md b/docs/next-steps.md index bc9666c..ac33f1a 100644 --- a/docs/next-steps.md +++ b/docs/next-steps.md @@ -20,45 +20,75 @@ This working list should be read alongside: ## Immediate Next Steps -1. Use the T420 `atocore-context` skill and the new organic routing layer in +1. Re-run the backup/restore drill on Dalidou with the Chroma + bind-mount fix in place + - the 2026-04-09 drill was a PARTIAL PASS: db restore + marker + reversal worked cleanly, but the Chroma step failed with + `OSError [Errno 16] Device or resource busy` because + `shutil.rmtree` cannot unlink a Docker bind-mounted volume + - fix landed immediately after: `restore_runtime_backup()` now + clears the destination's CONTENTS and uses + `copytree(dirs_exist_ok=True)`, and the regression test + `test_restore_chroma_does_not_unlink_destination_directory` + asserts the destination inode is stable + - need a green end-to-end run with `--chroma` actually + working in-container before enabling write-path automation +2. Turn on auto-capture of Claude Code sessions once the drill + re-run is clean + - conservative mode: Stop hook posts to `/interactions`, + no auto-extraction into review queue without review cadence + in place +3. Use the T420 `atocore-context` skill and the new organic routing layer in real OpenClaw workflows - confirm `auto-context` feels natural - confirm project inference is good enough in practice - confirm the fail-open behavior remains acceptable in practice -2. Review retrieval quality after the first real project ingestion batch +4. Review retrieval quality after the first real project ingestion batch - check whether the top hits are useful - check whether trusted project state remains dominant - reduce cross-project competition and prompt ambiguity where needed - use `debug-context` to inspect the exact last AtoCore supplement -3. 
Treat the active-project full markdown/text wave as complete +5. Treat the active-project full markdown/text wave as complete - `p04-gigabit` - `p05-interferometer` - `p06-polisher` -4. Define a cleaner source refresh model +6. Define a cleaner source refresh model - make the difference between source truth, staged inputs, and machine store explicit - move toward a project source registry and refresh workflow - foundation now exists via project registry + per-project refresh API - registration policy + template + proposal + approved registration are now the normal path for new projects -5. Move to Wave 2 trusted-operational ingestion +7. Move to Wave 2 trusted-operational ingestion - curated dashboards - decision logs - milestone/current-status views - operational truth, not just raw project notes -6. Integrate the new engineering architecture docs into active planning, not immediate schema code +8. Integrate the new engineering architecture docs into active planning, not immediate schema code - keep `docs/architecture/engineering-knowledge-hybrid-architecture.md` as the target layer model - keep `docs/architecture/engineering-ontology-v1.md` as the V1 structured-domain target - do not start entity/relationship persistence until the ingestion, retrieval, registry, and backup baseline feels boring and stable -7. Define backup and export procedures for Dalidou - - exercise the new SQLite + registry snapshot path on Dalidou - - Chroma backup or rebuild policy - - retention and restore validation - - admin backup endpoint now supports `include_chroma` cold snapshot - under the ingestion lock and `validate` confirms each snapshot is - openable; remaining work is the operational retention policy -8. Keep deeper automatic runtime integration modest until the organic read-only - model has proven value +9. 
Finish the boring operations baseline around backup + - retention policy cleanup script (snapshots dir grows + monotonically today) + - off-Dalidou backup target (at minimum an rsync to laptop or + another host so a single-disk failure isn't terminal) + - automatic post-backup validation (have `create_runtime_backup` + call `validate_backup` on its own output and refuse to + declare success if validation fails) + - DONE in commits be40994 / 0382238 / 3362080 / this one: + - `create_runtime_backup` + `list_runtime_backups` + + `validate_backup` + `restore_runtime_backup` with CLI + - `POST /admin/backup` with `include_chroma=true` under + the ingestion lock + - `/health` build_sha / build_time / build_branch provenance + - `deploy.sh` self-update re-exec guard + build_sha drift + verification + - live drill procedure in `docs/backup-restore-procedure.md` + with failure-mode table and the memory_type=episodic + marker pattern from the 2026-04-09 drill +10. Keep deeper automatic runtime integration modest until the organic read-only + model has proven value ## Trusted State Status diff --git a/src/atocore/ops/backup.py b/src/atocore/ops/backup.py index 825ae1c..0c2e885 100644 --- a/src/atocore/ops/backup.py +++ b/src/atocore/ops/backup.py @@ -337,9 +337,22 @@ def restore_runtime_backup( src_chroma = Path(chroma_snapshot_path) if src_chroma.exists() and src_chroma.is_dir(): dst_chroma = _config.settings.chroma_path - if dst_chroma.exists(): - shutil.rmtree(dst_chroma) - shutil.copytree(src_chroma, dst_chroma) + # Do NOT rmtree the destination itself: in a Dockerized + # deployment the chroma dir is a bind-mounted volume, and + # unlinking a mount point raises + # OSError [Errno 16] Device or resource busy. + # Instead, clear the directory's CONTENTS and copytree into + # it with dirs_exist_ok=True. This is equivalent to an + # rmtree+copytree for restore purposes but stays inside the + # mount boundary. 
Discovered during the first real restore + # drill on Dalidou (2026-04-09). + dst_chroma.mkdir(parents=True, exist_ok=True) + for item in dst_chroma.iterdir(): + if item.is_dir() and not item.is_symlink(): + shutil.rmtree(item) + else: + item.unlink() + shutil.copytree(src_chroma, dst_chroma, dirs_exist_ok=True) chroma_restored = True restored_integrity_ok = False diff --git a/tests/test_backup.py b/tests/test_backup.py index 7b0ef0f..c617f16 100644 --- a/tests/test_backup.py +++ b/tests/test_backup.py @@ -326,6 +326,65 @@ def test_restore_round_trip_with_chroma(tmp_path, monkeypatch): config.settings = original_settings +def test_restore_chroma_does_not_unlink_destination_directory(tmp_path, monkeypatch): + """Regression: restore must not rmtree the chroma dir itself. + + In a Dockerized deployment the chroma dir is a bind-mounted + volume. Calling shutil.rmtree on a mount point raises + ``OSError [Errno 16] Device or resource busy``, which broke the + first real Dalidou drill on 2026-04-09. The fix clears the + directory's CONTENTS and copytree(dirs_exist_ok=True) into it, + keeping the directory inode (and any bind mount) intact. + + This test captures the inode of the destination directory before + and after restore and asserts they match — that's what a + bind-mounted chroma dir would also see. 
+ """ + monkeypatch.setenv("ATOCORE_DATA_DIR", str(tmp_path / "data")) + monkeypatch.setenv("ATOCORE_BACKUP_DIR", str(tmp_path / "backups")) + monkeypatch.setenv( + "ATOCORE_PROJECT_REGISTRY_PATH", str(tmp_path / "config" / "project-registry.json") + ) + + original_settings = config.settings + try: + config.settings = config.Settings() + init_db() + + chroma_dir = config.settings.chroma_path + (chroma_dir / "coll-a").mkdir(parents=True, exist_ok=True) + (chroma_dir / "coll-a" / "baseline.bin").write_bytes(b"baseline") + + create_runtime_backup( + datetime(2026, 4, 9, 15, 0, 0, tzinfo=UTC), include_chroma=True + ) + + # Capture the destination directory's stat signature before restore. + chroma_stat_before = chroma_dir.stat() + + # Add a file post-backup so restore has work to do. + (chroma_dir / "coll-a" / "post_backup.bin").write_bytes(b"post") + + restore_runtime_backup( + "20260409T150000Z", confirm_service_stopped=True + ) + + # Directory still exists (would have failed on mount point) and + # its st_ino matches — the mount itself wasn't unlinked. + assert chroma_dir.exists() + chroma_stat_after = chroma_dir.stat() + assert chroma_stat_before.st_ino == chroma_stat_after.st_ino, ( + "chroma directory inode changed — restore recreated the " + "directory instead of clearing its contents; this would " + "fail on a Docker bind-mounted volume" + ) + # And the contents did actually get restored. + assert (chroma_dir / "coll-a" / "baseline.bin").exists() + assert not (chroma_dir / "coll-a" / "post_backup.bin").exists() + finally: + config.settings = original_settings + + def test_restore_skips_pre_snapshot_when_requested(tmp_path, monkeypatch): monkeypatch.setenv("ATOCORE_DATA_DIR", str(tmp_path / "data")) monkeypatch.setenv("ATOCORE_BACKUP_DIR", str(tmp_path / "backups"))