From c9757e313ad687877fe2ac415f4da547f86bc13f Mon Sep 17 00:00:00 2001 From: Anto01 Date: Mon, 6 Apr 2026 10:15:00 -0400 Subject: [PATCH] Harden runtime and add backup foundation --- .env.example | 1 + docs/backup-strategy.md | 80 ++++++++++++++++++++++++++++++++ docs/current-state.md | 38 +++++++++++++-- docs/next-steps.md | 7 ++- src/atocore/config.py | 1 + src/atocore/models/database.py | 8 +++- src/atocore/ops/__init__.py | 1 + src/atocore/ops/backup.py | 70 ++++++++++++++++++++++++++++ src/atocore/projects/registry.py | 15 ++++-- tests/test_backup.py | 71 ++++++++++++++++++++++++++++ tests/test_database.py | 49 +++++++++++++++++++ 11 files changed, 331 insertions(+), 10 deletions(-) create mode 100644 docs/backup-strategy.md create mode 100644 src/atocore/ops/__init__.py create mode 100644 src/atocore/ops/backup.py create mode 100644 tests/test_backup.py create mode 100644 tests/test_database.py diff --git a/.env.example b/.env.example index 18ff83d..8c69287 100644 --- a/.env.example +++ b/.env.example @@ -17,6 +17,7 @@ ATOCORE_PROJECT_REGISTRY_DIR=./config ATOCORE_PROJECT_REGISTRY_PATH=./config/project-registry.json ATOCORE_HOST=127.0.0.1 ATOCORE_PORT=8100 +ATOCORE_DB_BUSY_TIMEOUT_MS=5000 ATOCORE_EMBEDDING_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 ATOCORE_CHUNK_MAX_SIZE=800 ATOCORE_CHUNK_OVERLAP=100 diff --git a/docs/backup-strategy.md b/docs/backup-strategy.md new file mode 100644 index 0000000..5c5bd71 --- /dev/null +++ b/docs/backup-strategy.md @@ -0,0 +1,80 @@ +# AtoCore Backup Strategy + +## Purpose + +This document describes the current backup baseline for the Dalidou-hosted +AtoCore machine store. + +The immediate goal is not full disaster-proof automation yet. The goal is to +have one safe, repeatable way to snapshot the most important writable state. 

## Current Backup Baseline

Today, the safest hot-backup target is:

- SQLite machine database
- project registry JSON
- backup metadata describing what was captured

This is now supported by:

- `python -m atocore.ops.backup`

## What The Script Captures

The backup command creates a timestamped snapshot under:

- `ATOCORE_BACKUP_DIR/snapshots/<timestamp>/`

It currently writes:

- `db/atocore.db`
  - created with SQLite's backup API
- `config/project-registry.json`
  - copied if it exists
- `backup-metadata.json`
  - timestamp, paths, and backup notes

## What It Does Not Yet Capture

The current script does not hot-backup Chroma.

That is intentional.

For now, Chroma should be treated as one of:

- rebuildable derived state
- or something that needs a deliberate cold snapshot/export workflow

Until that workflow exists, do not rely on ad hoc live file copies of the
vector store while the service is actively writing.

## Dalidou Use

On Dalidou, the canonical machine paths are:

- DB:
  - `/srv/storage/atocore/data/db/atocore.db`
- registry:
  - `/srv/storage/atocore/config/project-registry.json`
- backups:
  - `/srv/storage/atocore/backups`

So a normal backup run should happen on Dalidou itself, not from another
machine.

## Next Backup Improvements

1. decide Chroma policy clearly
   - rebuild vs cold snapshot vs export
2. add a simple scheduled backup routine on Dalidou
3. add retention policy for old snapshots
4. optionally add a restore validation check

## Healthy Rule

Do not design around syncing the live machine DB/vector store between machines.

Back up the canonical Dalidou state.
Restore from Dalidou state.
Keep OpenClaw as a client of AtoCore, not a storage peer. diff --git a/docs/current-state.md b/docs/current-state.md index 5ac96e8..f1a9d8c 100644 --- a/docs/current-state.md +++ b/docs/current-state.md @@ -39,6 +39,11 @@ now includes a first curated ingestion batch for the active projects. 
- context builder - API routes for query, context, health, and source status - project registry and per-project refresh foundation +- project registration lifecycle: + - template + - proposal preview + - approved registration + - refresh - env-driven storage and deployment paths - Dalidou Docker deployment foundation - initial AtoCore self-knowledge corpus ingested on Dalidou @@ -64,6 +69,11 @@ The service and storage foundation are live on Dalidou. The machine-data host is real and canonical. +The project registry is now also persisted in a canonical mounted config path on +Dalidou: + +- `/srv/storage/atocore/config/project-registry.json` + The content corpus is partially populated now. The Dalidou instance already contains: @@ -88,9 +98,9 @@ The Dalidou instance already contains: Current live stats after the latest documentation sync and active-project ingest passes: -- `source_documents`: 34 -- `source_chunks`: 551 -- `vectors`: 551 +- `source_documents`: 35 +- `source_chunks`: 560 +- `vectors`: 560 The broader long-term corpus is still not fully populated yet. Wider project and vault ingestion remains a deliberate next step rather than something already @@ -149,8 +159,28 @@ The source refresh model now has a concrete foundation in code: - a project registry file defines known project ids, aliases, and ingest roots - the API can list registered projects +- the API can return a registration template +- the API can preview a registration without mutating state +- the API can persist an approved registration - the API can refresh one registered project at a time +This lifecycle is now coherent end to end for normal use. 
+ +## Reliability Baseline + +The runtime has now been hardened in a few practical ways: + +- SQLite connections use a configurable busy timeout +- SQLite uses WAL mode to reduce transient lock pain under normal concurrent use +- project registry writes are atomic file replacements rather than in-place rewrites +- a first runtime backup path now exists for: + - SQLite + - project registry + - backup metadata + +This does not eliminate every concurrency edge, but it materially improves the +current operational baseline. + In `Trusted Project State`: - each active seeded project now has a conservative trusted-state set @@ -167,7 +197,7 @@ This separation is healthy: ## Immediate Next Focus -1. Use the new T420-side AtoCore skill in real OpenClaw workflows +1. Use the new T420-side AtoCore skill and registration flow in real OpenClaw workflows 2. Tighten retrieval quality for the newly seeded active projects 3. Define the first broader AtoVault/AtoDrive ingestion batches 4. Add backup/export strategy for Dalidou machine state diff --git a/docs/next-steps.md b/docs/next-steps.md index 24a1212..5ef68ae 100644 --- a/docs/next-steps.md +++ b/docs/next-steps.md @@ -31,10 +31,12 @@ AtoCore now has: explicit - move toward a project source registry and refresh workflow - foundation now exists via project registry + per-project refresh API - - registration policy + template are now the next normal path for new projects + - registration policy + template + proposal + approved registration are now + the normal path for new projects 5. Define backup and export procedures for Dalidou - - SQLite snapshot/backup strategy + - exercise the new SQLite + registry snapshot path on Dalidou - Chroma backup or rebuild policy + - retention and restore validation 6. 
Keep deeper automatic runtime integration deferred until the read-only model has proven value @@ -101,6 +103,7 @@ P06: The next batch is successful if: - OpenClaw can use AtoCore naturally when context is needed +- OpenClaw can also register a new project cleanly before refreshing it - AtoCore answers correctly for the active project set - retrieval surfaces the seeded project docs instead of mostly AtoCore meta-docs - trusted project state remains concise and high confidence diff --git a/src/atocore/config.py b/src/atocore/config.py index 8f05fe5..a5c2411 100644 --- a/src/atocore/config.py +++ b/src/atocore/config.py @@ -24,6 +24,7 @@ class Settings(BaseSettings): project_registry_path: Path = Path("./config/project-registry.json") host: str = "127.0.0.1" port: int = 8100 + db_busy_timeout_ms: int = 5000 # Embedding embedding_model: str = ( diff --git a/src/atocore/models/database.py b/src/atocore/models/database.py index 618d0fc..4700722 100644 --- a/src/atocore/models/database.py +++ b/src/atocore/models/database.py @@ -100,9 +100,15 @@ def _column_exists(conn: sqlite3.Connection, table: str, column: str) -> bool: def get_connection() -> Generator[sqlite3.Connection, None, None]: """Get a database connection with row factory.""" _ensure_data_dir() - conn = sqlite3.connect(str(_config.settings.db_path)) + conn = sqlite3.connect( + str(_config.settings.db_path), + timeout=_config.settings.db_busy_timeout_ms / 1000, + ) conn.row_factory = sqlite3.Row conn.execute("PRAGMA foreign_keys = ON") + conn.execute(f"PRAGMA busy_timeout = {_config.settings.db_busy_timeout_ms}") + conn.execute("PRAGMA journal_mode = WAL") + conn.execute("PRAGMA synchronous = NORMAL") try: yield conn conn.commit() diff --git a/src/atocore/ops/__init__.py b/src/atocore/ops/__init__.py new file mode 100644 index 0000000..951b98f --- /dev/null +++ b/src/atocore/ops/__init__.py @@ -0,0 +1 @@ +"""Operational utilities for running AtoCore safely.""" diff --git a/src/atocore/ops/backup.py 
b/src/atocore/ops/backup.py new file mode 100644 index 0000000..7a33ee3 --- /dev/null +++ b/src/atocore/ops/backup.py @@ -0,0 +1,70 @@ +"""Create safe runtime backups for the AtoCore machine store.""" + +from __future__ import annotations + +import json +import sqlite3 +from datetime import datetime, UTC +from pathlib import Path + +import atocore.config as _config +from atocore.models.database import init_db +from atocore.observability.logger import get_logger + +log = get_logger("backup") + + +def create_runtime_backup(timestamp: datetime | None = None) -> dict: + """Create a hot backup of the SQLite DB plus registry/config metadata.""" + init_db() + now = timestamp or datetime.now(UTC) + stamp = now.strftime("%Y%m%dT%H%M%SZ") + + backup_root = _config.settings.resolved_backup_dir / "snapshots" / stamp + db_backup_dir = backup_root / "db" + config_backup_dir = backup_root / "config" + metadata_path = backup_root / "backup-metadata.json" + + db_backup_dir.mkdir(parents=True, exist_ok=True) + config_backup_dir.mkdir(parents=True, exist_ok=True) + + db_snapshot_path = db_backup_dir / _config.settings.db_path.name + _backup_sqlite_db(_config.settings.db_path, db_snapshot_path) + + registry_snapshot = None + registry_path = _config.settings.resolved_project_registry_path + if registry_path.exists(): + registry_snapshot = config_backup_dir / registry_path.name + registry_snapshot.write_text(registry_path.read_text(encoding="utf-8"), encoding="utf-8") + + metadata = { + "created_at": now.isoformat(), + "backup_root": str(backup_root), + "db_snapshot_path": str(db_snapshot_path), + "db_size_bytes": db_snapshot_path.stat().st_size, + "registry_snapshot_path": str(registry_snapshot) if registry_snapshot else "", + "vector_store_note": "Chroma hot backup is not included in this script; use a cold snapshot or rebuild/export workflow.", + } + metadata_path.write_text(json.dumps(metadata, indent=2, ensure_ascii=True) + "\n", encoding="utf-8") + + 
log.info("runtime_backup_created", backup_root=str(backup_root), db_snapshot=str(db_snapshot_path)) + return metadata + + +def _backup_sqlite_db(source_path: Path, dest_path: Path) -> None: + source_conn = sqlite3.connect(str(source_path)) + dest_conn = sqlite3.connect(str(dest_path)) + try: + source_conn.backup(dest_conn) + finally: + dest_conn.close() + source_conn.close() + + +def main() -> None: + result = create_runtime_backup() + print(json.dumps(result, indent=2, ensure_ascii=True)) + + +if __name__ == "__main__": + main() diff --git a/src/atocore/projects/registry.py b/src/atocore/projects/registry.py index 9070c66..613b8a1 100644 --- a/src/atocore/projects/registry.py +++ b/src/atocore/projects/registry.py @@ -3,6 +3,7 @@ from __future__ import annotations import json +import tempfile from dataclasses import asdict, dataclass from pathlib import Path @@ -320,7 +321,15 @@ def _load_registry_payload(registry_path: Path) -> dict: def _write_registry_payload(registry_path: Path, payload: dict) -> None: registry_path.parent.mkdir(parents=True, exist_ok=True) - registry_path.write_text( - json.dumps(payload, indent=2, ensure_ascii=True) + "\n", + rendered = json.dumps(payload, indent=2, ensure_ascii=True) + "\n" + with tempfile.NamedTemporaryFile( + mode="w", encoding="utf-8", - ) + dir=registry_path.parent, + prefix=f"{registry_path.stem}.", + suffix=".tmp", + delete=False, + ) as tmp_file: + tmp_file.write(rendered) + temp_path = Path(tmp_file.name) + temp_path.replace(registry_path) diff --git a/tests/test_backup.py b/tests/test_backup.py new file mode 100644 index 0000000..889b0e8 --- /dev/null +++ b/tests/test_backup.py @@ -0,0 +1,71 @@ +"""Tests for runtime backup creation.""" + +import json +import sqlite3 +from datetime import UTC, datetime + +import atocore.config as config +from atocore.models.database import init_db +from atocore.ops.backup import create_runtime_backup + + +def test_create_runtime_backup_copies_db_and_registry(tmp_path, monkeypatch): 
+ monkeypatch.setenv("ATOCORE_DATA_DIR", str(tmp_path / "data")) + monkeypatch.setenv("ATOCORE_BACKUP_DIR", str(tmp_path / "backups")) + monkeypatch.setenv( + "ATOCORE_PROJECT_REGISTRY_PATH", str(tmp_path / "config" / "project-registry.json") + ) + + registry_path = tmp_path / "config" / "project-registry.json" + registry_path.parent.mkdir(parents=True) + registry_path.write_text('{"projects":[{"id":"p01-example","aliases":[],"ingest_roots":[{"source":"vault","subpath":"incoming/projects/p01-example"}]}]}\n', encoding="utf-8") + + original_settings = config.settings + try: + config.settings = config.Settings() + init_db() + with sqlite3.connect(str(config.settings.db_path)) as conn: + conn.execute("INSERT INTO projects (id, name) VALUES (?, ?)", ("p01", "P01 Example")) + conn.commit() + + result = create_runtime_backup(datetime(2026, 4, 6, 18, 0, 0, tzinfo=UTC)) + finally: + config.settings = original_settings + + db_snapshot = tmp_path / "backups" / "snapshots" / "20260406T180000Z" / "db" / "atocore.db" + registry_snapshot = ( + tmp_path / "backups" / "snapshots" / "20260406T180000Z" / "config" / "project-registry.json" + ) + metadata_path = ( + tmp_path / "backups" / "snapshots" / "20260406T180000Z" / "backup-metadata.json" + ) + + assert result["db_snapshot_path"] == str(db_snapshot) + assert db_snapshot.exists() + assert registry_snapshot.exists() + assert metadata_path.exists() + + with sqlite3.connect(str(db_snapshot)) as conn: + row = conn.execute("SELECT name FROM projects WHERE id = ?", ("p01",)).fetchone() + assert row[0] == "P01 Example" + + metadata = json.loads(metadata_path.read_text(encoding="utf-8")) + assert metadata["registry_snapshot_path"] == str(registry_snapshot) + + +def test_create_runtime_backup_handles_missing_registry(tmp_path, monkeypatch): + monkeypatch.setenv("ATOCORE_DATA_DIR", str(tmp_path / "data")) + monkeypatch.setenv("ATOCORE_BACKUP_DIR", str(tmp_path / "backups")) + monkeypatch.setenv( + "ATOCORE_PROJECT_REGISTRY_PATH", 
str(tmp_path / "config" / "project-registry.json") + ) + + original_settings = config.settings + try: + config.settings = config.Settings() + init_db() + result = create_runtime_backup(datetime(2026, 4, 6, 19, 0, 0, tzinfo=UTC)) + finally: + config.settings = original_settings + + assert result["registry_snapshot_path"] == "" diff --git a/tests/test_database.py b/tests/test_database.py new file mode 100644 index 0000000..9e91a45 --- /dev/null +++ b/tests/test_database.py @@ -0,0 +1,49 @@ +"""Tests for SQLite connection pragmas and runtime behavior.""" + +import sqlite3 + +import atocore.config as config +from atocore.models.database import get_connection, init_db + + +def test_get_connection_applies_busy_timeout_and_wal(tmp_path, monkeypatch): + monkeypatch.setenv("ATOCORE_DATA_DIR", str(tmp_path / "data")) + monkeypatch.setenv("ATOCORE_DB_BUSY_TIMEOUT_MS", "7000") + + original_settings = config.settings + try: + config.settings = config.Settings() + init_db() + with get_connection() as conn: + busy_timeout = conn.execute("PRAGMA busy_timeout").fetchone()[0] + journal_mode = conn.execute("PRAGMA journal_mode").fetchone()[0] + foreign_keys = conn.execute("PRAGMA foreign_keys").fetchone()[0] + finally: + config.settings = original_settings + + assert busy_timeout == 7000 + assert str(journal_mode).lower() == "wal" + assert foreign_keys == 1 + + +def test_get_connection_uses_configured_timeout_value(tmp_path, monkeypatch): + monkeypatch.setenv("ATOCORE_DATA_DIR", str(tmp_path / "data")) + monkeypatch.setenv("ATOCORE_DB_BUSY_TIMEOUT_MS", "2500") + + original_settings = config.settings + original_connect = sqlite3.connect + calls = [] + + def fake_connect(*args, **kwargs): + calls.append(kwargs.get("timeout")) + return original_connect(*args, **kwargs) + + try: + config.settings = config.Settings() + monkeypatch.setattr("atocore.models.database.sqlite3.connect", fake_connect) + init_db() + finally: + config.settings = original_settings + + assert calls + assert calls[0] 
== 2.5