From b492f5f7b0a355437f1fef7ab316cb988584af2f Mon Sep 17 00:00:00 2001 From: Anto01 Date: Wed, 8 Apr 2026 19:02:57 -0400 Subject: [PATCH] fix: schema init ordering, deploy.sh default, client BASE_URL docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three issues Dalidou Claude surfaced during the first real deploy of commit e877e5b to the live service (report from 2026-04-08). Bug 1 was the critical one — a schema init ordering bug that would have bitten every future upgrade from a pre-Phase-9 schema — and the other two were usability traps around hostname resolution. Bug 1 (CRITICAL): schema init ordering -------------------------------------- src/atocore/models/database.py SCHEMA_SQL contained CREATE INDEX statements that referenced columns added later by _apply_migrations(): CREATE INDEX IF NOT EXISTS idx_memories_project ON memories(project); CREATE INDEX IF NOT EXISTS idx_interactions_project_name ON interactions(project); CREATE INDEX IF NOT EXISTS idx_interactions_session ON interactions(session_id); On a FRESH install, CREATE TABLE IF NOT EXISTS creates the tables with the Phase 9 shape (columns present), so the CREATE INDEX runs cleanly and _apply_migrations is effectively a no-op. On an UPGRADE from a pre-Phase-9 schema, CREATE TABLE IF NOT EXISTS is a no-op (the tables already exist in the old shape), the columns are NOT added yet, and the CREATE INDEX fails with "OperationalError: no such column: project" before _apply_migrations gets a chance to add the columns. Dalidou Claude hit this exactly when redeploying from 0.1.0 to 0.2.0 — had to manually ALTER TABLE to add the Phase 9 columns before the container could start. The fix is to remove the Phase 9-column indexes from SCHEMA_SQL. They already exist in _apply_migrations() AFTER the corresponding ALTER TABLE, so they still get created on both fresh and upgrade paths — just after the columns exist, not before. Indexes still in SCHEMA_SQL (all safe — reference columns that have existed since the first release): - idx_chunks_document on source_chunks(document_id) - idx_memories_type on memories(memory_type) - idx_memories_status on memories(status) - idx_interactions_project on interactions(project_id) Indexes moved to _apply_migrations (already there — just no longer duplicated in SCHEMA_SQL): - idx_memories_project on memories(project) - idx_interactions_project_name on interactions(project) - idx_interactions_session on interactions(session_id) - idx_interactions_created_at on interactions(created_at) Regression test: tests/test_database.py --------------------------------------- New test_init_db_upgrades_pre_phase9_schema_without_failing: - Seeds the DB with the exact pre-Phase-9 shape (no project / last_referenced_at / reference_count on memories; no project / client / session_id / response / memories_used / chunks_used on interactions) - Calls init_db() — which used to raise OperationalError before the fix - Verifies all Phase 9 columns are present after the call - Verifies the migration indexes exist Before the fix this test would have failed with "OperationalError: no such column: project" on the init_db call. After the fix it passes. This locks the invariant "init_db is safe on any legacy schema shape" so the bug can't silently come back. Full suite: 216 passing (was 215), 1 warning. The +1 is the new regression test. Bug 3 (usability): deploy.sh DNS default ---------------------------------------- deploy/dalidou/deploy.sh ATOCORE_GIT_REMOTE defaulted to http://dalidou:3000/Antoine/ATOCore.git which requires the "dalidou" hostname to resolve. On the Dalidou host itself it didn't (no /etc/hosts entry for localhost alias), so deploy.sh had to be run with the IP as a manual workaround. Fix: default ATOCORE_GIT_REMOTE to http://127.0.0.1:3000/Antoine/ATOCore.git. Loopback always works on the host running the script. Callers from a remote host (e.g. running deploy.sh from a laptop against the Dalidou LAN) set ATOCORE_GIT_REMOTE explicitly. The script header's Environment Variables section documents this with an explicit reference to the 2026-04-08 Dalidou deploy report so the rationale isn't lost. docs/dalidou-deployment.md gets a new "Troubleshooting hostname resolution" subsection and a new example invocation showing how to deploy from a remote host with an explicit ATOCORE_GIT_REMOTE override. Bug 2 (usability): atocore_client.py ATOCORE_BASE_URL documentation ------------------------------------------------------------------- scripts/atocore_client.py Same class of issue as bug 3. BASE_URL defaults to http://dalidou:8100 which resolves fine from a remote caller (laptop, T420/OpenClaw over Tailscale) but NOT from the Dalidou host itself or from inside the atocore container. Dalidou Claude saw the CLI return {"status": "unavailable", "fail_open": true} while direct curl to http://127.0.0.1:8100 worked. The fix here is NOT to change the default (remote callers are the common case and would break) but to DOCUMENT the override clearly so the next operator knows what's happening: - The script module docstring grew a new "Environment variables" section covering ATOCORE_BASE_URL, ATOCORE_TIMEOUT_SECONDS, ATOCORE_REFRESH_TIMEOUT_SECONDS, and ATOCORE_FAIL_OPEN, with the explicit override example for on-host/in-container use - It calls out the exact symptom (fail-open envelope when the base URL doesn't resolve) so the diagnosis is obvious from the error alone - docs/dalidou-deployment.md troubleshooting section mirrors this guidance so there's one place to look regardless of whether the operator starts with the client help or the deploy doc What this commit does NOT do ---------------------------- - Does NOT change the default ATOCORE_BASE_URL. Doing that would break the T420 OpenClaw helper and every remote caller who currently relies on the hostname. Documentation is the right fix for this case. - Does NOT fix /etc/hosts on Dalidou. That's a host-level configuration issue that the user can fix if they prefer having the hostname resolve; the deploy.sh fix makes it unnecessary regardless. - Does NOT re-run the validation on Dalidou. The next step is for the live service to pull this commit via deploy.sh (which should now work without the IP workaround) and re-run the Phase 9 loop test to confirm nothing regressed. --- deploy/dalidou/deploy.sh | 12 ++- docs/dalidou-deployment.md | 28 +++++++ scripts/atocore_client.py | 41 +++++++++- src/atocore/models/database.py | 12 ++- tests/test_database.py | 135 +++++++++++++++++++++++++++++++++ 5 files changed, 219 insertions(+), 9 deletions(-) diff --git a/deploy/dalidou/deploy.sh b/deploy/dalidou/deploy.sh index c2c7551..9e01545 100644 --- a/deploy/dalidou/deploy.sh +++ b/deploy/dalidou/deploy.sh @@ -25,7 +25,15 @@ # --------------------- # # ATOCORE_APP_DIR default /srv/storage/atocore/app -# ATOCORE_GIT_REMOTE default http://dalidou:3000/Antoine/ATOCore.git +# ATOCORE_GIT_REMOTE default http://127.0.0.1:3000/Antoine/ATOCore.git +# This is the local Dalidou gitea, reached +# via loopback. Override only when running +# the deploy from a remote host. The default +# is loopback (not the hostname "dalidou") +# because the hostname doesn't reliably +# resolve on the host itself — Dalidou +# Claude's first deploy had to work around +# exactly this. # ATOCORE_BRANCH default main # ATOCORE_DEPLOY_DRY_RUN if set to 1, report only, no mutations # ATOCORE_HEALTH_URL default http://127.0.0.1:8100/health @@ -60,7 +68,7 @@ set -euo pipefail APP_DIR="${ATOCORE_APP_DIR:-/srv/storage/atocore/app}" -GIT_REMOTE="${ATOCORE_GIT_REMOTE:-http://dalidou:3000/Antoine/ATOCore.git}" +GIT_REMOTE="${ATOCORE_GIT_REMOTE:-http://127.0.0.1:3000/Antoine/ATOCore.git}" BRANCH="${ATOCORE_BRANCH:-main}" HEALTH_URL="${ATOCORE_HEALTH_URL:-http://127.0.0.1:8100/health}" DRY_RUN="${ATOCORE_DEPLOY_DRY_RUN:-0}" diff --git a/docs/dalidou-deployment.md b/docs/dalidou-deployment.md index 8ba3577..af61147 100644 --- a/docs/dalidou-deployment.md +++ b/docs/dalidou-deployment.md @@ -104,6 +104,11 @@ ATOCORE_BRANCH=codex/some-feature \ # Dry-run: show what would happen without touching anything ATOCORE_DEPLOY_DRY_RUN=1 \ bash /srv/storage/atocore/app/deploy/dalidou/deploy.sh + +# Deploy from a remote host (e.g. the laptop) using the Tailscale +# or LAN address instead of loopback +ATOCORE_GIT_REMOTE=http://192.168.86.50:3000/Antoine/ATOCore.git \ + bash /srv/storage/atocore/app/deploy/dalidou/deploy.sh ``` The script is idempotent and safe to re-run. It never touches the @@ -112,6 +117,29 @@ service startup by the lifespan handler in `src/atocore/main.py` which calls `init_db()` (which in turn runs the ALTER TABLE statements in `_apply_migrations`). +### Troubleshooting hostname resolution + +`deploy.sh` defaults `ATOCORE_GIT_REMOTE` to +`http://127.0.0.1:3000/Antoine/ATOCore.git` (loopback) because the +hostname "dalidou" doesn't reliably resolve on the host itself — +the first real Dalidou deploy hit exactly this on 2026-04-08. If +you need to override (e.g. running deploy.sh from a laptop against +the Dalidou LAN), set `ATOCORE_GIT_REMOTE` explicitly. + +The same applies to `scripts/atocore_client.py`: its default +`ATOCORE_BASE_URL` is `http://dalidou:8100` for remote callers, but +when running the client on Dalidou itself (or inside the container +via `docker exec`), override to loopback: + +```bash +ATOCORE_BASE_URL=http://127.0.0.1:8100 \ + python scripts/atocore_client.py health +``` + +If you see `{"status": "unavailable", "fail_open": true}` from the +client, the first thing to check is whether the base URL resolves +from where you're running the client. + ### Deployment drift detection `/health` reports both `version` and `code_version` fields, both set diff --git a/scripts/atocore_client.py b/scripts/atocore_client.py index 1592b28..16e4768 100644 --- a/scripts/atocore_client.py +++ b/scripts/atocore_client.py @@ -1,8 +1,43 @@ """Operator-facing API client for live AtoCore instances. -This script is intentionally external to the app runtime. It is for admins and -operators who want a convenient way to inspect live project state, refresh -projects, audit retrieval quality, and manage trusted project-state entries. +This script is intentionally external to the app runtime. It is for admins +and operators who want a convenient way to inspect live project state, +refresh projects, audit retrieval quality, manage trusted project-state +entries, and drive the Phase 9 reflection loop (capture, extract, queue, +promote, reject). + +Environment variables +--------------------- + +ATOCORE_BASE_URL + Base URL of the AtoCore service (default: ``http://dalidou:8100``). + + When running ON the Dalidou host itself or INSIDE the Dalidou + container, override this with loopback or the real IP:: + + ATOCORE_BASE_URL=http://127.0.0.1:8100 \\ + python scripts/atocore_client.py health + + The default hostname "dalidou" is meant for cases where the + caller is a remote machine (laptop, T420/OpenClaw, etc.) with + "dalidou" in its /etc/hosts or resolvable via Tailscale. It does + NOT reliably resolve on the host itself or inside the container, + and when it fails the client returns + ``{"status": "unavailable", "fail_open": true}`` — the right + diagnosis when that happens is to set ATOCORE_BASE_URL explicitly + to 127.0.0.1:8100 and retry. + +ATOCORE_TIMEOUT_SECONDS + Request timeout for most operations (default: 30). + +ATOCORE_REFRESH_TIMEOUT_SECONDS + Longer timeout for project refresh operations which can be slow + (default: 1800). + +ATOCORE_FAIL_OPEN + When "true" (default), network errors return a small fail-open + envelope instead of raising. Set to "false" for admin operations + where you need the real error. """ from __future__ import annotations diff --git a/src/atocore/models/database.py b/src/atocore/models/database.py index d35dda3..77f50ba 100644 --- a/src/atocore/models/database.py +++ b/src/atocore/models/database.py @@ -71,14 +71,18 @@ CREATE TABLE IF NOT EXISTS interactions ( created_at DATETIME DEFAULT CURRENT_TIMESTAMP ); +-- Indexes that reference columns guaranteed to exist since the first +-- release ship here. Indexes that reference columns added by later +-- migrations (memories.project, interactions.project, +-- interactions.session_id) are created inside _apply_migrations AFTER +-- the corresponding ALTER TABLE, NOT here. Creating them here would +-- fail on upgrade from a pre-migration schema because CREATE TABLE +-- IF NOT EXISTS is a no-op on an existing table, so the new columns +-- wouldn't be added before the CREATE INDEX runs. CREATE INDEX IF NOT EXISTS idx_chunks_document ON source_chunks(document_id); CREATE INDEX IF NOT EXISTS idx_memories_type ON memories(memory_type); -CREATE INDEX IF NOT EXISTS idx_memories_project ON memories(project); CREATE INDEX IF NOT EXISTS idx_memories_status ON memories(status); CREATE INDEX IF NOT EXISTS idx_interactions_project ON interactions(project_id); -CREATE INDEX IF NOT EXISTS idx_interactions_project_name ON interactions(project); -CREATE INDEX IF NOT EXISTS idx_interactions_session ON interactions(session_id); -CREATE INDEX IF NOT EXISTS idx_interactions_created_at ON interactions(created_at); """ diff --git a/tests/test_database.py b/tests/test_database.py index 9e91a45..f49ee9f 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -47,3 +47,138 @@ def test_get_connection_uses_configured_timeout_value(tmp_path, monkeypatch): assert calls assert calls[0] == 2.5 + + +def test_init_db_upgrades_pre_phase9_schema_without_failing(tmp_path, monkeypatch): + """Regression test for the schema init ordering bug caught during + the first real Dalidou deploy (report from 2026-04-08). + + Before the fix, SCHEMA_SQL contained CREATE INDEX statements that + referenced columns (memories.project, interactions.project, + interactions.session_id) added by _apply_migrations later in + init_db. On a fresh install this worked because CREATE TABLE + created the tables with the new columns before the CREATE INDEX + ran, but on UPGRADE from a pre-Phase-9 schema the CREATE TABLE + IF NOT EXISTS was a no-op and the CREATE INDEX hit + OperationalError: no such column. + + This test seeds the tables with the OLD pre-Phase-9 shape then + calls init_db() and verifies that: + + - init_db does not raise + - The new columns were added via _apply_migrations + - The new indexes exist + + If the bug is reintroduced by moving a CREATE INDEX for a + migration column back into SCHEMA_SQL, this test will fail + with OperationalError before reaching the assertions. + """ + monkeypatch.setenv("ATOCORE_DATA_DIR", str(tmp_path / "data")) + original_settings = config.settings + try: + config.settings = config.Settings() + + # Step 1: create the data dir and open a direct connection + config.ensure_runtime_dirs() + db_path = config.settings.db_path + + # Step 2: seed the DB with the old pre-Phase-9 shape. No + # project/last_referenced_at/reference_count on memories; no + # project/client/session_id/response/memories_used/chunks_used + # on interactions. We also need the prerequisite tables + # (projects, source_documents, source_chunks) because the + # memories table has an FK to source_chunks. + with sqlite3.connect(str(db_path)) as conn: + conn.executescript( + """ + CREATE TABLE source_documents ( + id TEXT PRIMARY KEY, + file_path TEXT UNIQUE NOT NULL, + file_hash TEXT NOT NULL, + title TEXT, + doc_type TEXT DEFAULT 'markdown', + tags TEXT DEFAULT '[]', + ingested_at DATETIME DEFAULT CURRENT_TIMESTAMP, + updated_at DATETIME DEFAULT CURRENT_TIMESTAMP + ); + + CREATE TABLE source_chunks ( + id TEXT PRIMARY KEY, + document_id TEXT NOT NULL REFERENCES source_documents(id) ON DELETE CASCADE, + chunk_index INTEGER NOT NULL, + content TEXT NOT NULL, + heading_path TEXT DEFAULT '', + char_count INTEGER NOT NULL, + metadata TEXT DEFAULT '{}', + created_at DATETIME DEFAULT CURRENT_TIMESTAMP + ); + + CREATE TABLE memories ( + id TEXT PRIMARY KEY, + memory_type TEXT NOT NULL, + content TEXT NOT NULL, + source_chunk_id TEXT REFERENCES source_chunks(id), + confidence REAL DEFAULT 1.0, + status TEXT DEFAULT 'active', + created_at DATETIME DEFAULT CURRENT_TIMESTAMP, + updated_at DATETIME DEFAULT CURRENT_TIMESTAMP + ); + + CREATE TABLE projects ( + id TEXT PRIMARY KEY, + name TEXT UNIQUE NOT NULL, + description TEXT DEFAULT '', + status TEXT DEFAULT 'active', + created_at DATETIME DEFAULT CURRENT_TIMESTAMP, + updated_at DATETIME DEFAULT CURRENT_TIMESTAMP + ); + + CREATE TABLE interactions ( + id TEXT PRIMARY KEY, + prompt TEXT NOT NULL, + context_pack TEXT DEFAULT '{}', + response_summary TEXT DEFAULT '', + project_id TEXT REFERENCES projects(id), + created_at DATETIME DEFAULT CURRENT_TIMESTAMP + ); + """ + ) + conn.commit() + + # Step 3: call init_db — this used to raise on the upgrade + # path. After the fix it should succeed. + init_db() + + # Step 4: verify the migrations ran — Phase 9 columns present + with sqlite3.connect(str(db_path)) as conn: + conn.row_factory = sqlite3.Row + memories_cols = { + row["name"] for row in conn.execute("PRAGMA table_info(memories)") + } + interactions_cols = { + row["name"] + for row in conn.execute("PRAGMA table_info(interactions)") + } + + assert "project" in memories_cols + assert "last_referenced_at" in memories_cols + assert "reference_count" in memories_cols + + assert "project" in interactions_cols + assert "client" in interactions_cols + assert "session_id" in interactions_cols + assert "response" in interactions_cols + assert "memories_used" in interactions_cols + assert "chunks_used" in interactions_cols + + # Step 5: verify the indexes on migration columns exist + index_rows = conn.execute( + "SELECT name FROM sqlite_master WHERE type='index' AND tbl_name IN ('memories','interactions')" + ).fetchall() + index_names = {row["name"] for row in index_rows} + + assert "idx_memories_project" in index_names + assert "idx_interactions_project_name" in index_names + assert "idx_interactions_session" in index_names + finally: + config.settings = original_settings