diff --git a/deploy/dalidou/deploy.sh b/deploy/dalidou/deploy.sh index 68ab572..77b493c 100644 --- a/deploy/dalidou/deploy.sh +++ b/deploy/dalidou/deploy.sh @@ -161,18 +161,33 @@ else fi # --------------------------------------------------------------------- -# Step 2: show what we're deploying +# Step 2: capture build provenance to pass to the container # --------------------------------------------------------------------- +# +# We compute the full SHA, the short SHA, the UTC build timestamp, +# and the source branch. These get exported as env vars before +# `docker compose up -d --build` so the running container can read +# them at startup and report them via /health. The post-deploy +# verification step (Step 6) reads /health and compares the +# reported SHA against this value to detect any silent drift. -log "Step 2: deployable commit" +log "Step 2: capturing build provenance" if [ "$DRY_RUN" != "1" ] && [ -d "$APP_DIR/.git" ]; then - ( cd "$APP_DIR" && git log --oneline -1 ) - ( cd "$APP_DIR" && git rev-parse HEAD > /tmp/atocore-deploying-sha.txt ) - DEPLOYING_SHA="$(cat /tmp/atocore-deploying-sha.txt | cut -c1-7)" - log " commit: $DEPLOYING_SHA" + DEPLOYING_SHA_FULL="$(cd "$APP_DIR" && git rev-parse HEAD)" + DEPLOYING_SHA="$(echo "$DEPLOYING_SHA_FULL" | cut -c1-7)" + DEPLOYING_TIME="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + DEPLOYING_BRANCH="$BRANCH" + log " commit: $DEPLOYING_SHA ($DEPLOYING_SHA_FULL)" + log " built at: $DEPLOYING_TIME" + log " branch: $DEPLOYING_BRANCH" + ( cd "$APP_DIR" && git log --oneline -1 ) | sed 's/^/ /' + export ATOCORE_BUILD_SHA="$DEPLOYING_SHA_FULL" + export ATOCORE_BUILD_TIME="$DEPLOYING_TIME" + export ATOCORE_BUILD_BRANCH="$DEPLOYING_BRANCH" else log " [dry-run] would read git log from $APP_DIR" DEPLOYING_SHA="dry-run" + DEPLOYING_SHA_FULL="dry-run" fi # --------------------------------------------------------------------- @@ -222,14 +237,26 @@ if ! 
curl -fsS "$HEALTH_URL" > /tmp/atocore-health.json 2>/dev/null; then fi # --------------------------------------------------------------------- -# Step 6: verify the deployed version matches expectations +# Step 6: verify the deployed build matches what we just shipped # --------------------------------------------------------------------- +# +# Two layers of comparison: +# +# - code_version: matches src/atocore/__init__.py::__version__. +# Coarse: any commit between version bumps reports the same value. +# - build_sha: full git SHA the container was built from. Set as +# an env var by Step 2 above and read by /health from +# ATOCORE_BUILD_SHA. This is the precise drift signal — if the +# live build_sha doesn't match $DEPLOYING_SHA_FULL, the build +# didn't pick up the new source. -log "Step 6: verifying deployed version" +log "Step 6: verifying deployed build" log " /health response:" if command -v jq >/dev/null 2>&1; then jq . < /tmp/atocore-health.json | sed 's/^/ /' REPORTED_VERSION="$(jq -r '.code_version // .version' < /tmp/atocore-health.json)" + REPORTED_SHA="$(jq -r '.build_sha // "unknown"' < /tmp/atocore-health.json)" + REPORTED_BUILD_TIME="$(jq -r '.build_time // "unknown"' < /tmp/atocore-health.json)" else cat /tmp/atocore-health.json | sed 's/^/ /' echo @@ -237,21 +264,47 @@ else if [ -z "$REPORTED_VERSION" ]; then REPORTED_VERSION="$(grep -o '"version":"[^"]*"' /tmp/atocore-health.json | head -1 | cut -d'"' -f4)" fi + REPORTED_SHA="$(grep -o '"build_sha":"[^"]*"' /tmp/atocore-health.json | head -1 | cut -d'"' -f4)" + REPORTED_SHA="${REPORTED_SHA:-unknown}" + REPORTED_BUILD_TIME="$(grep -o '"build_time":"[^"]*"' /tmp/atocore-health.json | head -1 | cut -d'"' -f4)" + REPORTED_BUILD_TIME="${REPORTED_BUILD_TIME:-unknown}" fi EXPECTED_VERSION="$(grep -oE "__version__ = \"[^\"]+\"" "$APP_DIR/src/atocore/__init__.py" | head -1 | cut -d'"' -f2)" -log " expected code_version: $EXPECTED_VERSION (from $APP_DIR/src/atocore/__init__.py)" -log " reported 
code_version: $REPORTED_VERSION (from live /health)" +log " Layer 1 — coarse version:" +log " expected code_version: $EXPECTED_VERSION (from src/atocore/__init__.py)" +log " reported code_version: $REPORTED_VERSION (from live /health)" if [ "$REPORTED_VERSION" != "$EXPECTED_VERSION" ]; then - log "WARNING: deployed version mismatch" - log " the container may not have picked up the new image" - log " try: docker compose down && docker compose up -d --build" + log "FATAL: code_version mismatch" + log " the container may not have picked up the new image" + log " try: docker compose down && docker compose up -d --build" exit 4 fi +log " Layer 2 — precise build SHA:" +log " expected build_sha: $DEPLOYING_SHA_FULL (from this deploy.sh run)" +log " reported build_sha: $REPORTED_SHA (from live /health)" +log " reported build_time: $REPORTED_BUILD_TIME" + +if [ "$REPORTED_SHA" != "$DEPLOYING_SHA_FULL" ]; then + log "FATAL: build_sha mismatch" + log " the live container is reporting a different commit than" + log " the one this deploy.sh run just shipped. Possible causes:" + log " - the container is using a cached image instead of the" + log " freshly-built one (try: docker compose build --no-cache)" + log " - the env vars didn't propagate (check that" + log " deploy/dalidou/docker-compose.yml has the environment" + log " section with ATOCORE_BUILD_SHA)" + log " - another process restarted the container between the" + log " build and the health check" + exit 6 +fi + log "Deploy complete." 
-log " commit: $DEPLOYING_SHA" +log " commit: $DEPLOYING_SHA ($DEPLOYING_SHA_FULL)" log " code_version: $REPORTED_VERSION" +log " build_sha: $REPORTED_SHA" +log " build_time: $REPORTED_BUILD_TIME" log " health: ok" diff --git a/deploy/dalidou/docker-compose.yml b/deploy/dalidou/docker-compose.yml index 58ca585..245755f 100644 --- a/deploy/dalidou/docker-compose.yml +++ b/deploy/dalidou/docker-compose.yml @@ -9,6 +9,15 @@ services: - "${ATOCORE_PORT:-8100}:8100" env_file: - .env + environment: + # Build provenance — set by deploy/dalidou/deploy.sh on each + # rebuild so /health can report exactly which commit is live. + # Defaults to 'unknown' for direct `docker compose up` runs that + # bypass deploy.sh; in that case the operator should run + # deploy.sh instead so the deployed SHA is recorded. + ATOCORE_BUILD_SHA: "${ATOCORE_BUILD_SHA:-unknown}" + ATOCORE_BUILD_TIME: "${ATOCORE_BUILD_TIME:-unknown}" + ATOCORE_BUILD_BRANCH: "${ATOCORE_BUILD_BRANCH:-unknown}" volumes: - ${ATOCORE_DB_DIR}:${ATOCORE_DB_DIR} - ${ATOCORE_CHROMA_DIR}:${ATOCORE_CHROMA_DIR} diff --git a/docs/dalidou-deployment.md b/docs/dalidou-deployment.md index af61147..644fed6 100644 --- a/docs/dalidou-deployment.md +++ b/docs/dalidou-deployment.md @@ -142,19 +142,55 @@ from where you're running the client. ### Deployment drift detection -`/health` reports both `version` and `code_version` fields, both set -from `atocore.__version__` at import time. 
To check whether the -deployed code matches the repo's `main` branch: +`/health` reports drift signals at three increasing levels of +precision: + +| Field | Source | Precision | When to use | +|---|---|---|---| +| `version` / `code_version` | `atocore.__version__` (manual bump) | coarse — same value across many commits | quick smoke check that the right *release* is running | +| `build_sha` | `ATOCORE_BUILD_SHA` env var, set by `deploy.sh` per build | precise — changes per commit | the canonical drift signal | +| `build_time` / `build_branch` | `ATOCORE_BUILD_TIME` / `ATOCORE_BUILD_BRANCH` env vars, set by `deploy.sh` per build | per-build | forensics when multiple branches in flight | + +The **precise** check (run on the laptop or any host that can curl +the live service AND has the source repo at hand): ```bash -# What's running -curl -s http://127.0.0.1:8100/health | grep -o '"code_version":"[^"]*"' +# What's actually running on Dalidou +LIVE_SHA=$(curl -fsS http://dalidou:8100/health | grep -o '"build_sha":"[^"]*"' | cut -d'"' -f4) -# What's in the repo's main branch -grep '__version__' /srv/storage/atocore/app/src/atocore/__init__.py +# What the deployed branch tip should be +EXPECTED_SHA=$(cd /srv/storage/atocore/app && git rev-parse HEAD) + +# Compare +if [ "$LIVE_SHA" = "$EXPECTED_SHA" ]; then + echo "live is current at $LIVE_SHA" +else + echo "DRIFT: live $LIVE_SHA vs expected $EXPECTED_SHA" + echo "run deploy.sh to sync" +fi ``` -If these differ, the deployment is stale. Run `deploy.sh` to sync. +The `deploy.sh` script does exactly this comparison automatically +in its post-deploy verification step (Step 6) and exits non-zero +on mismatch. So the **simplest drift check** is just to run +`deploy.sh` — if there's nothing to deploy, it succeeds quickly; +if the live service is stale, it deploys and verifies. + +If `/health` reports `build_sha: "unknown"`, the running container +was started without `deploy.sh` (probably via `docker compose up` +directly), and the build provenance was never recorded.
Re-run +via `deploy.sh` to fix. + +The coarse `code_version` check is still useful as a quick visual +sanity check — bumping `__version__` from `0.2.0` to `0.3.0` +signals a meaningful release boundary even if the precise +`build_sha` is what tools should compare against: + +```bash +# Quick sanity check (coarse) +curl -s http://127.0.0.1:8100/health | grep -o '"code_version":"[^"]*"' +grep '__version__' /srv/storage/atocore/app/src/atocore/__init__.py +``` ### Schema migrations on redeploy diff --git a/src/atocore/api/routes.py b/src/atocore/api/routes.py index c5b1270..4dfe288 100644 --- a/src/atocore/api/routes.py +++ b/src/atocore/api/routes.py @@ -744,13 +744,32 @@ def api_validate_backup(stamp: str) -> dict: def api_health() -> dict: """Health check. - The ``version`` and ``code_version`` fields both report the value - of ``atocore.__version__`` from the deployed code. Comparing this - to the main branch's ``__version__`` is the fastest way to detect - deployment drift: if they differ, the running service is behind - the repo and needs a redeploy (see - ``docs/dalidou-deployment.md`` and ``deploy/dalidou/deploy.sh``). + Three layers of version reporting, in increasing precision: + + - ``version`` / ``code_version``: ``atocore.__version__`` (e.g. + "0.2.0"). Bumped manually on commits that change the API + surface, schema, or user-visible behavior. Coarse — any + number of commits can land between bumps without changing + this value. + - ``build_sha``: full git SHA of the commit the running + container was built from. Set by ``deploy/dalidou/deploy.sh`` + via the ``ATOCORE_BUILD_SHA`` env var on every rebuild. + Reports ``"unknown"`` for builds that bypass deploy.sh + (direct ``docker compose up`` etc.). This is the precise + drift signal: if the live ``build_sha`` doesn't match the + tip of the deployed branch on Gitea, the service is stale + regardless of what ``code_version`` says. 
+ - ``build_time`` / ``build_branch``: when and from which branch + the live container was built. Useful for forensics when + multiple branches are in flight or when build_sha is no + longer in the branch's history (e.g. after a force-push). + + The deploy.sh post-deploy verification step compares the live + ``build_sha`` to the SHA it just set, and exits non-zero on + mismatch. """ + import os + from atocore import __version__ store = get_vector_store() @@ -759,6 +778,9 @@ def api_health() -> dict: "status": "ok", "version": __version__, "code_version": __version__, + "build_sha": os.environ.get("ATOCORE_BUILD_SHA", "unknown"), + "build_time": os.environ.get("ATOCORE_BUILD_TIME", "unknown"), + "build_branch": os.environ.get("ATOCORE_BUILD_BRANCH", "unknown"), "vectors_count": store.count, "env": _config.settings.env, "machine_paths": { diff --git a/tests/test_api_storage.py b/tests/test_api_storage.py index 925b7ea..41c8713 100644 --- a/tests/test_api_storage.py +++ b/tests/test_api_storage.py @@ -50,6 +50,65 @@ def test_health_endpoint_exposes_machine_paths_and_source_readiness(tmp_data_dir assert "run_dir" in body["machine_paths"] +def test_health_endpoint_reports_code_version_from_module(tmp_data_dir): + """The /health response must include code_version reflecting + atocore.__version__, so deployment drift detection works.""" + from atocore import __version__ + + client = TestClient(app) + response = client.get("/health") + + assert response.status_code == 200 + body = response.json() + assert body["version"] == __version__ + assert body["code_version"] == __version__ + + +def test_health_endpoint_reports_build_metadata_from_env(tmp_data_dir, monkeypatch): + """The /health response must include build_sha, build_time, and + build_branch from the ATOCORE_BUILD_* env vars, so deploy.sh can + detect precise drift via SHA comparison instead of relying on + the coarse code_version field.
+ + Regression test for the codex finding from 2026-04-08: + code_version 0.2.0 is too coarse to trust as a 'live is current' + signal because it only changes on manual bumps. The build_sha + field changes per commit and is set by deploy.sh. + """ + monkeypatch.setenv("ATOCORE_BUILD_SHA", "abc1234567890fedcba0987654321") + monkeypatch.setenv("ATOCORE_BUILD_TIME", "2026-04-09T01:23:45Z") + monkeypatch.setenv("ATOCORE_BUILD_BRANCH", "main") + + client = TestClient(app) + response = client.get("/health") + + assert response.status_code == 200 + body = response.json() + assert body["build_sha"] == "abc1234567890fedcba0987654321" + assert body["build_time"] == "2026-04-09T01:23:45Z" + assert body["build_branch"] == "main" + + +def test_health_endpoint_reports_unknown_when_build_env_unset(tmp_data_dir, monkeypatch): + """When deploy.sh hasn't set the build env vars (e.g. someone + ran `docker compose up` directly), /health reports 'unknown' + for all three build fields. This is a clear signal to the + operator that the deploy provenance is missing and they should + re-run via deploy.sh.""" + monkeypatch.delenv("ATOCORE_BUILD_SHA", raising=False) + monkeypatch.delenv("ATOCORE_BUILD_TIME", raising=False) + monkeypatch.delenv("ATOCORE_BUILD_BRANCH", raising=False) + + client = TestClient(app) + response = client.get("/health") + + assert response.status_code == 200 + body = response.json() + assert body["build_sha"] == "unknown" + assert body["build_time"] == "unknown" + assert body["build_branch"] == "unknown" + + def test_projects_endpoint_reports_registered_projects(tmp_data_dir, monkeypatch): vault_dir = tmp_data_dir / "vault-source" drive_dir = tmp_data_dir / "drive-source"