When deploy.sh itself changes in the commit being pulled, the bash process is still running the OLD script from memory — git reset --hard updated the file on disk but the in-memory instructions are stale. This bit the 2026-04-09 Dalidou deploy: the old pre-build-sha Step 2 ran against fresh source, so the container started with ATOCORE_BUILD_SHA="unknown" instead of the real commit. Manual re-run fixed it, but the class of bug will re-emerge every time deploy.sh itself changes. Fix (Step 1.5): - After git reset --hard, sha1 the running script ($0) and the on-disk copy at $APP_DIR/deploy/dalidou/deploy.sh - If they differ, export ATOCORE_DEPLOY_REEXECED=1 and exec into the fresh copy so Step 2 onward runs under the new script - The sentinel env var prevents recursion - Skipped in dry-run mode, when $0 isn't readable, or when the on-disk script doesn't exist yet Docs (docs/dalidou-deployment.md): - New "The deploy.sh self-update race" troubleshooting section explaining the root cause, the Step 1.5 mechanism, what the log output looks like, and how to opt out Verified syntax and dry-run. 219/219 tests still passing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
350 lines
15 KiB
Bash
#!/usr/bin/env bash
#
# deploy/dalidou/deploy.sh
# -------------------------
# One-shot deploy script for updating the running AtoCore container
# on Dalidou from the current Gitea main branch.
#
# The script is idempotent and safe to re-run. It handles both the
# first-time deploy (where /srv/storage/atocore/app may not yet be
# a git checkout) and the ongoing update case (where it is).
#
# Usage
# -----
#
#   # Normal update from main (most common)
#   bash deploy/dalidou/deploy.sh
#
#   # Deploy a specific branch or tag
#   ATOCORE_BRANCH=codex/some-feature bash deploy/dalidou/deploy.sh
#
#   # Dry-run: show what would happen without touching anything
#   ATOCORE_DEPLOY_DRY_RUN=1 bash deploy/dalidou/deploy.sh
#
# Environment variables
# ---------------------
#
#   ATOCORE_APP_DIR         default /srv/storage/atocore/app
#   ATOCORE_GIT_REMOTE      default http://127.0.0.1:3000/Antoine/ATOCore.git
#                           This is the local Dalidou gitea, reached
#                           via loopback. Override only when running
#                           the deploy from a remote host. The default
#                           is loopback (not the hostname "dalidou")
#                           because the hostname doesn't reliably
#                           resolve on the host itself — Dalidou
#                           Claude's first deploy had to work around
#                           exactly this.
#   ATOCORE_BRANCH          default main
#   ATOCORE_DEPLOY_DRY_RUN  if set to 1, report only, no mutations
#   ATOCORE_HEALTH_URL      default http://127.0.0.1:8100/health
#
# Safety rails
# ------------
#
# - If the app dir exists but is NOT a git repo, the script renames
#   it to <dir>.pre-git-<timestamp> before re-cloning, so you never
#   lose the pre-existing snapshot to a git clobber.
# - If the health check fails after restart, the script exits
#   non-zero and prints the container logs tail for diagnosis.
# - Dry-run mode is the default recommendation for the first deploy
#   on a new environment: it shows the planned git operations and
#   the compose command without actually running them.
#
# What this script does NOT do
# ----------------------------
#
# - Does not manage secrets / .env files. The caller is responsible
#   for placing deploy/dalidou/.env before running.
# - Does not run a backup before deploying. Run the backup endpoint
#   first if you want a pre-deploy snapshot.
# - Does not roll back on health-check failure. If deploy fails,
#   the previous container is already stopped; you need to redeploy
#   a known-good commit to recover.
# - Does not touch the database. The Phase 9 schema migrations in
#   src/atocore/models/database.py::_apply_migrations are idempotent
#   ALTER TABLE ADD COLUMN calls that run at service startup via the
#   lifespan handler. Stale pre-Phase-9 schema is upgraded in place.
|
set -euo pipefail

# Deployment knobs. Each one is overridable through the corresponding
# ATOCORE_* environment variable documented in the header comment.
APP_DIR="${ATOCORE_APP_DIR:-/srv/storage/atocore/app}"
GIT_REMOTE="${ATOCORE_GIT_REMOTE:-http://127.0.0.1:3000/Antoine/ATOCore.git}"
BRANCH="${ATOCORE_BRANCH:-main}"
HEALTH_URL="${ATOCORE_HEALTH_URL:-http://127.0.0.1:8100/health}"
DRY_RUN="${ATOCORE_DEPLOY_DRY_RUN:-0}"
COMPOSE_DIR="$APP_DIR/deploy/dalidou"

# log MESSAGE...
#   Uniform "==> "-prefixed progress line on stdout.
log() {
  printf '==> %s\n' "$*"
}

# run COMMAND-STRING
#   Execute a shell command string, or merely print it when DRY_RUN=1.
#   Callers pass a single pre-quoted string; eval is deliberate here so
#   the string may contain compound commands like `cd … && git …`.
run() {
  case "$DRY_RUN" in
    1) printf '  [dry-run] %s\n' "$*" ;;
    *) eval "$@" ;;
  esac
}

# Announce the effective configuration up front so every deploy log
# starts with a reproducible record of the inputs.
log "AtoCore deploy starting"
log "  app dir:    $APP_DIR"
log "  git remote: $GIT_REMOTE"
log "  branch:     $BRANCH"
log "  health url: $HEALTH_URL"
log "  dry run:    $DRY_RUN"
|
|
|
|
# ---------------------------------------------------------------------
# Step 0: pre-flight permission check
# ---------------------------------------------------------------------
#
# If $APP_DIR exists but the current user cannot write to it (because
# a previous manual deploy left it root-owned, for example), the git
# fetch / reset in step 1 will fail with cryptic errors. Detect this
# up front and give the operator a clean remediation command instead
# of letting git produce half-state on partial failure. This was the
# exact workaround the 2026-04-08 Dalidou redeploy needed — pre-
# existing root ownership from the pre-phase9 manual schema fix.

if [ -d "$APP_DIR" ] && [ "$DRY_RUN" != "1" ]; then
  # Warn when the dir isn't writable, or when a .git exists but is
  # unreadable. A *missing* .git is NOT suspicious — it's the normal
  # first-deploy state that Step 1's else-branch converts — so (unlike
  # the earlier version of this check) it must not trigger the warning.
  if [ ! -w "$APP_DIR" ] || { [ -e "$APP_DIR/.git" ] && [ ! -r "$APP_DIR/.git" ]; }; then
    log "WARNING: app dir exists but may not be writable by current user"
  fi
  # stat -c is GNU coreutils syntax; Dalidou is Linux, so that's fine,
  # and any failure degrades to "unknown" rather than aborting.
  current_owner="$(stat -c '%U:%G' "$APP_DIR" 2>/dev/null || echo unknown)"
  current_user="$(id -un 2>/dev/null || echo unknown)"
  current_uid_gid="$(id -u 2>/dev/null):$(id -g 2>/dev/null)"
  log "Step 0: permission check"
  log "  app dir owner: $current_owner"
  log "  current user:  $current_user ($current_uid_gid)"
  # Try to write a tiny marker file. If it fails, surface a clean
  # remediation message and exit before git produces confusing
  # half-state.
  marker="$APP_DIR/.deploy-permission-check"
  if ! ( : > "$marker" ) 2>/dev/null; then
    log "FATAL: cannot write to $APP_DIR as $current_user"
    log ""
    log "The app dir is owned by $current_owner and the current user"
    log "doesn't have write permission. This usually happens after a"
    log "manual workaround deploy that ran as root."
    log ""
    log "Remediation (pick the one that matches your setup):"
    log ""
    log "  # If you have passwordless sudo and gitea runs as UID 1000:"
    log "  sudo chown -R 1000:1000 $APP_DIR"
    log ""
    log "  # If you're running deploy.sh itself as root:"
    log "  sudo bash $0"
    log ""
    log "  # If neither works, do it via a throwaway container:"
    log "  docker run --rm -v $APP_DIR:/app alpine \\"
    log "    chown -R 1000:1000 /app"
    log ""
    log "Then re-run deploy.sh."
    exit 5
  fi
  rm -f "$marker" 2>/dev/null || true
fi
|
|
|
|
# ---------------------------------------------------------------------
# Step 1: make sure $APP_DIR is a proper git checkout of the branch
# ---------------------------------------------------------------------

if [ ! -d "$APP_DIR/.git" ]; then
  # First-time (or pre-git snapshot) case: move any existing dir out of
  # the way with a timestamped rename — never clobber the snapshot —
  # then clone the requested branch fresh.
  log "Step 1: app dir is NOT a git checkout; converting"
  if [ -d "$APP_DIR" ]; then
    snapshot_backup="${APP_DIR}.pre-git-$(date -u +%Y%m%dT%H%M%SZ)"
    log "  backing up existing snapshot to $snapshot_backup"
    run "mv '$APP_DIR' '$snapshot_backup'"
  fi
  log "  cloning $GIT_REMOTE -> $APP_DIR (branch: $BRANCH)"
  run "git clone --branch '$BRANCH' '$GIT_REMOTE' '$APP_DIR'"
else
  # Ongoing-update case: fetch and hard-reset to the remote branch tip.
  log "Step 1: app dir is already a git checkout; fetching latest"
  run "cd '$APP_DIR' && git fetch origin '$BRANCH'"
  run "cd '$APP_DIR' && git reset --hard 'origin/$BRANCH'"
fi
|
|
|
|
# ---------------------------------------------------------------------
# Step 1.5: self-update re-exec guard
# ---------------------------------------------------------------------
#
# When deploy.sh itself changes in the commit we just pulled, the bash
# process running this script is still executing the OLD deploy.sh
# from memory — git reset --hard updated the file on disk but our
# in-memory instructions are stale. That's exactly how the first
# 2026-04-09 Dalidou deploy silently wrote "unknown" build_sha: old
# Step 2 logic ran against fresh source. Detect the mismatch and
# re-exec into the fresh copy so every post-update run exercises the
# new script.
#
# Guard rails:
# - Only runs when $APP_DIR exists, holds a git checkout, and a
#   deploy.sh exists there (i.e. after Step 1 succeeded).
# - Uses a sentinel env var ATOCORE_DEPLOY_REEXECED=1 to make sure
#   we only re-exec once, never recurse.
# - Skipped in dry-run mode (no mutation).
# - Skipped if $0 isn't a readable file (bash -c pipe inputs, etc.).
# - Skipped if sha1sum isn't installed: otherwise the failing
#   `sha1sum | awk` pipeline would abort the whole deploy under
#   `set -e -o pipefail` instead of just skipping the guard.
#
# NOTE(review): this only detects staleness when the running copy ($0)
# is a DIFFERENT file from $APP_DIR/deploy/dalidou/deploy.sh. If the
# operator invokes the checkout's own copy in place, both paths hash
# identically after the reset while bash may still be executing (and
# lazily re-reading) the mutated file — confirm whether in-place
# invocation needs a copy-to-tmpfile guard as well.

if [ "$DRY_RUN" != "1" ] \
    && [ -z "${ATOCORE_DEPLOY_REEXECED:-}" ] \
    && [ -r "$0" ] \
    && [ -f "$APP_DIR/deploy/dalidou/deploy.sh" ] \
    && command -v sha1sum >/dev/null 2>&1; then
  # `|| true` keeps a read failure from tripping set -e; the empty-hash
  # checks below then make the guard a no-op instead of a crash.
  ON_DISK_HASH="$(sha1sum "$APP_DIR/deploy/dalidou/deploy.sh" 2>/dev/null | awk '{print $1}' || true)"
  RUNNING_HASH="$(sha1sum "$0" 2>/dev/null | awk '{print $1}' || true)"
  if [ -n "$ON_DISK_HASH" ] \
      && [ -n "$RUNNING_HASH" ] \
      && [ "$ON_DISK_HASH" != "$RUNNING_HASH" ]; then
    log "Step 1.5: deploy.sh changed in the pulled commit; re-exec'ing"
    log "  running script hash: $RUNNING_HASH"
    log "  on-disk script hash: $ON_DISK_HASH"
    log "  re-exec -> $APP_DIR/deploy/dalidou/deploy.sh"
    export ATOCORE_DEPLOY_REEXECED=1
    exec bash "$APP_DIR/deploy/dalidou/deploy.sh" "$@"
  fi
fi
|
|
|
|
# ---------------------------------------------------------------------
# Step 2: capture build provenance to pass to the container
# ---------------------------------------------------------------------
#
# We compute the full SHA, the short SHA, the UTC build timestamp,
# and the source branch. These get exported as env vars before
# `docker compose up -d --build` so the running container can read
# them at startup and report them via /health. The post-deploy
# verification step (Step 6) reads /health and compares the
# reported SHA against this value to detect any silent drift.

log "Step 2: capturing build provenance"
if [ "$DRY_RUN" != "1" ] && [ -d "$APP_DIR/.git" ]; then
  DEPLOYING_SHA_FULL="$(cd "$APP_DIR" && git rev-parse HEAD)"
  # Bash substring expansion instead of spawning echo|cut.
  DEPLOYING_SHA="${DEPLOYING_SHA_FULL:0:7}"
  DEPLOYING_TIME="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
  DEPLOYING_BRANCH="$BRANCH"
  log "  commit:   $DEPLOYING_SHA ($DEPLOYING_SHA_FULL)"
  log "  built at: $DEPLOYING_TIME"
  log "  branch:   $DEPLOYING_BRANCH"
  ( cd "$APP_DIR" && git log --oneline -1 ) | sed 's/^/    /'
  export ATOCORE_BUILD_SHA="$DEPLOYING_SHA_FULL"
  export ATOCORE_BUILD_TIME="$DEPLOYING_TIME"
  export ATOCORE_BUILD_BRANCH="$DEPLOYING_BRANCH"
elif [ "$DRY_RUN" = "1" ]; then
  log "  [dry-run] would read git log from $APP_DIR"
  DEPLOYING_SHA="dry-run"
  DEPLOYING_SHA_FULL="dry-run"
else
  # Non-dry-run with no .git should be unreachable after Step 1, but if
  # it ever happens, don't mislabel it as a dry run (the old code
  # printed the "[dry-run]" message and set the "dry-run" sentinel even
  # on a real deploy). The distinct sentinel still fails Step 6's SHA
  # comparison, which is the desired loud outcome.
  log "  WARNING: $APP_DIR/.git missing; build provenance unavailable"
  DEPLOYING_SHA="unavailable"
  DEPLOYING_SHA_FULL="unavailable"
fi
|
|
|
|
# ---------------------------------------------------------------------
# Step 3: preserve the .env file (it's not in git)
# ---------------------------------------------------------------------
#
# deploy/dalidou/.env carries host-specific settings, so it lives only
# on the host. Refuse to continue without it — compose needs it.

ENV_FILE="$COMPOSE_DIR/.env"
if [ "$DRY_RUN" != "1" ]; then
  if [ ! -f "$ENV_FILE" ]; then
    log "Step 3: WARNING — $ENV_FILE does not exist"
    log "  the compose workflow needs this file to map mount points"
    log "  copy deploy/dalidou/.env.example to $ENV_FILE and edit it"
    log "  before re-running this script"
    exit 2
  fi
fi
|
|
|
|
# ---------------------------------------------------------------------
# Step 4: rebuild and restart the container
# ---------------------------------------------------------------------

log "Step 4: rebuilding and restarting the atocore container"
run "cd '$COMPOSE_DIR' && docker compose up -d --build"

# Dry runs stop here: Steps 5 and 6 probe the live service, which a
# dry run must not assume is in any particular state.
case "$DRY_RUN" in
  1)
    log "dry-run complete — no mutations performed"
    exit 0
    ;;
esac
|
|
|
|
# ---------------------------------------------------------------------
# Step 5: wait for the service to come up and pass the health check
# ---------------------------------------------------------------------
#
# Poll /health up to 10 times, 3s apart. Success is tracked with a
# flag instead of re-probing after the loop: the old post-loop curl
# could transiently fail right after a successful probe (false FATAL)
# and it clobbered the /tmp snapshot that Step 6 parses.

log "Step 5: waiting for /health to respond"
HEALTH_OK=0
for i in 1 2 3 4 5 6 7 8 9 10; do
  if curl -fsS "$HEALTH_URL" > /tmp/atocore-health.json 2>/dev/null; then
    log "  service is responding"
    HEALTH_OK=1
    break
  fi
  log "  not ready yet ($i/10); waiting 3s"
  sleep 3
done

if [ "$HEALTH_OK" != "1" ]; then
  log "FATAL: service did not come up within 30 seconds"
  log "  container logs (last 50 lines):"
  # Subshell keeps the cd from leaking into the rest of the script.
  ( cd "$COMPOSE_DIR" && docker compose logs --tail=50 atocore ) || true
  exit 3
fi
|
|
|
|
# ---------------------------------------------------------------------
# Step 6: verify the deployed build matches what we just shipped
# ---------------------------------------------------------------------
#
# Two layers of comparison:
#
# - code_version: matches src/atocore/__init__.py::__version__.
#   Coarse: any commit between version bumps reports the same value.
# - build_sha: full git SHA the container was built from. Set as
#   an env var by Step 2 above and read by /health from
#   ATOCORE_BUILD_SHA. This is the precise drift signal — if the
#   live build_sha doesn't match $DEPLOYING_SHA_FULL, the build
#   didn't pick up the new source.

log "Step 6: verifying deployed build"
log "  /health response:"

# health_field KEY
#   Extract a top-level string field from the /health snapshot without
#   jq. Tolerates optional whitespace around the JSON colon (the old
#   grep required compact `"key":"value"` exactly), and the `|| true`
#   keeps a missing key from failing the pipeline — under
#   `set -e -o pipefail` the old code aborted the script on a sparse
#   response before the :-unknown defaults could apply. Missing keys
#   yield an empty string.
health_field() {
  grep -oE "\"$1\"[[:space:]]*:[[:space:]]*\"[^\"]*\"" /tmp/atocore-health.json \
    | head -1 | cut -d'"' -f4 || true
}

if command -v jq >/dev/null 2>&1; then
  jq . < /tmp/atocore-health.json | sed 's/^/    /'
  REPORTED_VERSION="$(jq -r '.code_version // .version' < /tmp/atocore-health.json)"
  REPORTED_SHA="$(jq -r '.build_sha // "unknown"' < /tmp/atocore-health.json)"
  REPORTED_BUILD_TIME="$(jq -r '.build_time // "unknown"' < /tmp/atocore-health.json)"
else
  # sed reads the file directly (no useless cat).
  sed 's/^/    /' /tmp/atocore-health.json
  echo
  REPORTED_VERSION="$(health_field code_version)"
  if [ -z "$REPORTED_VERSION" ]; then
    REPORTED_VERSION="$(health_field version)"
  fi
  REPORTED_SHA="$(health_field build_sha)"
  REPORTED_SHA="${REPORTED_SHA:-unknown}"
  REPORTED_BUILD_TIME="$(health_field build_time)"
  REPORTED_BUILD_TIME="${REPORTED_BUILD_TIME:-unknown}"
fi
|
|
|
|
# Expected version comes straight from the checked-out source. The
# `|| true` matters: if the grep matches nothing, the pipeline returns
# non-zero and — under `set -e -o pipefail` — the old code aborted the
# script right here with no diagnostic at all. Fail loudly instead.
EXPECTED_VERSION="$(grep -oE "__version__ = \"[^\"]+\"" "$APP_DIR/src/atocore/__init__.py" | head -1 | cut -d'"' -f2 || true)"
if [ -z "$EXPECTED_VERSION" ]; then
  log "FATAL: could not read __version__ from $APP_DIR/src/atocore/__init__.py"
  exit 4
fi

log "  Layer 1 — coarse version:"
log "    expected code_version: $EXPECTED_VERSION (from src/atocore/__init__.py)"
log "    reported code_version: $REPORTED_VERSION (from live /health)"

if [ "$REPORTED_VERSION" != "$EXPECTED_VERSION" ]; then
  log "FATAL: code_version mismatch"
  log "  the container may not have picked up the new image"
  log "  try: docker compose down && docker compose up -d --build"
  exit 4
fi

log "  Layer 2 — precise build SHA:"
log "    expected build_sha: $DEPLOYING_SHA_FULL (from this deploy.sh run)"
log "    reported build_sha: $REPORTED_SHA (from live /health)"
log "    reported build_time: $REPORTED_BUILD_TIME"

if [ "$REPORTED_SHA" != "$DEPLOYING_SHA_FULL" ]; then
  log "FATAL: build_sha mismatch"
  log "  the live container is reporting a different commit than"
  log "  the one this deploy.sh run just shipped. Possible causes:"
  log "  - the container is using a cached image instead of the"
  log "    freshly-built one (try: docker compose build --no-cache)"
  log "  - the env vars didn't propagate (check that"
  log "    deploy/dalidou/docker-compose.yml has the environment"
  log "    section with ATOCORE_BUILD_SHA)"
  log "  - another process restarted the container between the"
  log "    build and the health check"
  exit 6
fi
|
|
|
|
# Final summary — everything an operator needs for the deploy log,
# emitted through log() so each line carries the "==> " prefix.
summary_lines=(
  "Deploy complete."
  "  commit:       $DEPLOYING_SHA ($DEPLOYING_SHA_FULL)"
  "  code_version: $REPORTED_VERSION"
  "  build_sha:    $REPORTED_SHA"
  "  build_time:   $REPORTED_BUILD_TIME"
  "  health:       ok"
)
for summary_line in "${summary_lines[@]}"; do
  log "$summary_line"
done