deploy/dalidou/deploy.sh

#!/usr/bin/env bash
#
# deploy/dalidou/deploy.sh
# -------------------------
# One-shot deploy script for updating the running AtoCore container
# on Dalidou from the current Gitea main branch.
#
# The script is idempotent and safe to re-run. It handles both the
# first-time deploy (where /srv/storage/atocore/app may not yet be
# a git checkout) and the ongoing update case (where it is).
#
# Usage
# -----
#
#   # Normal update from main (most common)
#   bash deploy/dalidou/deploy.sh
#
#   # Deploy a specific branch or tag
#   ATOCORE_BRANCH=codex/some-feature bash deploy/dalidou/deploy.sh
#
#   # Dry-run: show what would happen without touching anything
#   ATOCORE_DEPLOY_DRY_RUN=1 bash deploy/dalidou/deploy.sh
#
# Environment variables
# ---------------------
#
#   ATOCORE_APP_DIR      default /srv/storage/atocore/app
#   ATOCORE_GIT_REMOTE   default http://127.0.0.1:3000/Antoine/ATOCore.git
#                        This is the local Dalidou gitea, reached
#                        via loopback. Override only when running
#                        the deploy from a remote host. The default
#                        is loopback (not the hostname "dalidou")
#                        because the hostname doesn't reliably
#                        resolve on the host itself — Dalidou
#                        Claude's first deploy had to work around
#                        exactly this.
#   ATOCORE_BRANCH       default main
#   ATOCORE_DEPLOY_DRY_RUN  if set to 1, report only, no mutations
#   ATOCORE_HEALTH_URL   default http://127.0.0.1:8100/health
#
# Safety rails
# ------------
#
# - If the app dir exists but is NOT a git repo, the script renames
#   it to <dir>.pre-git-<timestamp> before re-cloning, so you never
#   lose the pre-existing snapshot to a git clobber.
# - If the health check fails after restart, the script exits
#   non-zero and prints the container logs tail for diagnosis.
# - Dry-run mode is the default recommendation for the first deploy
#   on a new environment: it shows the planned git operations and
#   the compose command without actually running them.
#
# What this script does NOT do
# ----------------------------
#
# - Does not manage secrets / .env files. The caller is responsible
#   for placing deploy/dalidou/.env before running.
# - Does not run a backup before deploying. Run the backup endpoint
#   first if you want a pre-deploy snapshot.
# - Does not roll back on health-check failure. If deploy fails,
#   the previous container is already stopped; you need to redeploy
#   a known-good commit to recover.
# - Does not touch the database. The Phase 9 schema migrations in
#   src/atocore/models/database.py::_apply_migrations are idempotent
#   ALTER TABLE ADD COLUMN calls that run at service startup via the
#   lifespan handler. Stale pre-Phase-9 schema is upgraded in place.

# Strict mode: stop on any unhandled command failure, on expansion of
# an unset variable, and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Deployment settings. Each value comes from the matching ATOCORE_*
# environment variable and falls back to the default shown here when
# that variable is unset or empty.
APP_DIR=${ATOCORE_APP_DIR:-/srv/storage/atocore/app}
GIT_REMOTE=${ATOCORE_GIT_REMOTE:-http://127.0.0.1:3000/Antoine/ATOCore.git}
BRANCH=${ATOCORE_BRANCH:-main}
HEALTH_URL=${ATOCORE_HEALTH_URL:-http://127.0.0.1:8100/health}
DRY_RUN=${ATOCORE_DEPLOY_DRY_RUN:-0}

# Directory inside the checkout from which docker compose is run and
# where the .env file is expected.
COMPOSE_DIR="${APP_DIR}/deploy/dalidou"

# Print one progress line on stdout: the arguments joined into a single
# string (first character of IFS as separator, a space by default) and
# prefixed with "==> ".
log() {
    local message="$*"
    printf '==> %s\n' "$message"
}
# Execute a shell command string, or only print it when DRY_RUN is "1".
#
# Arguments: the command, normally passed as ONE pre-quoted string.
# Outputs:   in dry-run mode, the command prefixed with "[dry-run]".
# Returns:   the status of the evaluated command (printf's in dry-run).
#
# The command is eval'ed in the CURRENT shell, so side effects such as
# `cd` persist after run() returns. Callers are responsible for quoting
# anything they interpolate into the string.
run() {
    case "$DRY_RUN" in
        1)
            printf '    [dry-run] %s\n' "$*"
            ;;
        *)
            eval "$@"
            ;;
    esac
}

# Startup banner: echo the effective settings so the deploy log records
# exactly what this run is about to act on.
log "AtoCore deploy starting"
banner_rows=(
    "app dir:    $APP_DIR"
    "git remote: $GIT_REMOTE"
    "branch:     $BRANCH"
    "health url: $HEALTH_URL"
    "dry run:    $DRY_RUN"
)
for banner_row in "${banner_rows[@]}"; do
    log "  $banner_row"
done

# ---------------------------------------------------------------------
# Step 0: pre-flight permission check
# ---------------------------------------------------------------------
#
# If $APP_DIR exists but the current user cannot write to it (because
# a previous manual deploy left it root-owned, for example), the git
# fetch / reset in step 1 will fail with cryptic errors. Detect this
# up front and give the operator a clean remediation command instead
# of letting git produce half-state on partial failure. This was the
# exact workaround the 2026-04-08 Dalidou redeploy needed — pre-
# existing root ownership from the pre-phase9 manual schema fix.

# Runs only when the app dir already exists and we are allowed to
# mutate (not dry-run). Exits 5 with remediation hints when the current
# user cannot create a file inside $APP_DIR.
if [ -d "$APP_DIR" ] && [ "$DRY_RUN" != "1" ]; then
    # Cheap early hint. The .git readability test applies only when .git
    # actually exists: on a first-time conversion the app dir is not a
    # git checkout yet, and a missing .git must not be reported as a
    # permission problem.
    if [ ! -w "$APP_DIR" ] \
        || { [ -e "$APP_DIR/.git" ] && [ ! -r "$APP_DIR/.git" ]; }; then
        log "WARNING: app dir exists but may not be writable by current user"
    fi
    # Ownership details are informational only; fall back to "unknown"
    # rather than aborting when stat/id cannot report them.
    current_owner="$(stat -c '%U:%G' "$APP_DIR" 2>/dev/null || echo unknown)"
    current_user="$(id -un 2>/dev/null || echo unknown)"
    current_uid_gid="$(id -u 2>/dev/null):$(id -g 2>/dev/null)"
    log "Step 0: permission check"
    log "  app dir owner: $current_owner"
    log "  current user:  $current_user ($current_uid_gid)"
    # Try to write a tiny marker file. If it fails, surface a clean
    # remediation message and exit before git produces confusing
    # half-state. The subshell keeps the redirection failure from
    # tripping `set -e` in the main shell.
    marker="$APP_DIR/.deploy-permission-check"
    if ! ( : > "$marker" ) 2>/dev/null; then
        log "FATAL: cannot write to $APP_DIR as $current_user"
        log ""
        log "The app dir is owned by $current_owner and the current user"
        log "doesn't have write permission. This usually happens after a"
        log "manual workaround deploy that ran as root."
        log ""
        log "Remediation (pick the one that matches your setup):"
        log ""
        log "  # If you have passwordless sudo and gitea runs as UID 1000:"
        log "  sudo chown -R 1000:1000 $APP_DIR"
        log ""
        log "  # If you're running deploy.sh itself as root:"
        log "  sudo bash $0"
        log ""
        log "  # If neither works, do it via a throwaway container:"
        log "  docker run --rm -v $APP_DIR:/app alpine \\"
        log "      chown -R 1000:1000 /app"
        log ""
        log "Then re-run deploy.sh."
        exit 5
    fi
    # Best-effort cleanup; a leftover marker is harmless.
    rm -f "$marker" 2>/dev/null || true
fi

# ---------------------------------------------------------------------
# Step 1: make sure $APP_DIR is a proper git checkout of the branch
# ---------------------------------------------------------------------

# Fingerprint the script we are executing BEFORE git touches the
# checkout. It must happen here, not after the pull:
# - when this script is started from inside $APP_DIR, "$0" names the
#   very file that `git reset --hard` is about to replace, and
# - run() evals `cd '$APP_DIR'` in the current shell, so after Step 1 a
#   relative "$0" resolves inside $APP_DIR as well.
# In both cases hashing "$0" after the pull would read the NEW file and
# the Step 1.5 comparison could never detect a change.
#
# RUNNING_HASH stays empty (which disables Step 1.5) in dry-run mode,
# after a re-exec, when $0 is not a readable regular file (bash -c,
# pipe input, ...), or when sha1sum is unavailable. The `|| true` keeps
# `set -e -o pipefail` from aborting the deploy over a missing hash.
RUNNING_HASH=""
if [ "$DRY_RUN" != "1" ] \
    && [ -z "${ATOCORE_DEPLOY_REEXECED:-}" ] \
    && [ -f "$0" ] \
    && [ -r "$0" ]; then
    RUNNING_HASH="$(sha1sum "$0" 2>/dev/null | awk '{print $1}' || true)"
fi

if [ -d "$APP_DIR/.git" ]; then
    log "Step 1: app dir is already a git checkout; fetching latest"
    run "cd '$APP_DIR' && git fetch origin '$BRANCH'"
    # Reset to FETCH_HEAD rather than origin/$BRANCH: FETCH_HEAD is
    # whatever the fetch above just retrieved, so this works for tags
    # too (a tag has no origin/<name> remote-tracking ref).
    run "cd '$APP_DIR' && git reset --hard FETCH_HEAD"
else
    log "Step 1: app dir is NOT a git checkout; converting"
    if [ -d "$APP_DIR" ]; then
        # Keep the pre-existing snapshot under a timestamped name so the
        # clone below never overwrites it.
        BACKUP="${APP_DIR}.pre-git-$(date -u +%Y%m%dT%H%M%SZ)"
        log "       backing up existing snapshot to $BACKUP"
        run "mv '$APP_DIR' '$BACKUP'"
    fi
    log "       cloning $GIT_REMOTE -> $APP_DIR (branch: $BRANCH)"
    run "git clone --branch '$BRANCH' '$GIT_REMOTE' '$APP_DIR'"
fi

# ---------------------------------------------------------------------
# Step 1.5: self-update re-exec guard
# ---------------------------------------------------------------------
#
# When deploy.sh itself changes in the commit we just pulled, the bash
# process running this script is still executing the OLD deploy.sh
# from memory — git reset --hard updated the file on disk but our
# in-memory instructions are stale. That's exactly how the first
# 2026-04-09 Dalidou deploy silently wrote "unknown" build_sha: old
# Step 2 logic ran against fresh source. Detect the mismatch and
# re-exec into the fresh copy so every post-update run exercises the
# new script.
#
# Guard rails:
# - Only runs when a deploy.sh exists in $APP_DIR (i.e. after Step 1
#   succeeded) and a pre-pull hash of the running script was captured.
# - Uses a sentinel env var ATOCORE_DEPLOY_REEXECED=1 to make sure
#   we only re-exec once, never recurse.
# - Skipped in dry-run mode (no mutation).
# - Skipped if $0 isn't a readable file (bash -c pipe inputs, etc.).

FRESH_DEPLOY_SCRIPT="$APP_DIR/deploy/dalidou/deploy.sh"
if [ -n "$RUNNING_HASH" ] && [ -f "$FRESH_DEPLOY_SCRIPT" ]; then
    ON_DISK_HASH="$(sha1sum "$FRESH_DEPLOY_SCRIPT" 2>/dev/null | awk '{print $1}' || true)"
    if [ -n "$ON_DISK_HASH" ] && [ "$ON_DISK_HASH" != "$RUNNING_HASH" ]; then
        log "Step 1.5: deploy.sh changed in the pulled commit; re-exec'ing"
        log "  running script hash: $RUNNING_HASH"
        log "  on-disk script hash: $ON_DISK_HASH"
        log "  re-exec -> $FRESH_DEPLOY_SCRIPT"
        export ATOCORE_DEPLOY_REEXECED=1
        exec bash "$FRESH_DEPLOY_SCRIPT" "$@"
    fi
fi

# ---------------------------------------------------------------------
# Step 2: capture build provenance to pass to the container
# ---------------------------------------------------------------------
#
# We compute the full SHA, the short SHA, the UTC build timestamp,
# and the source branch. These get exported as env vars before
# `docker compose up -d --build` so the running container can read
# them at startup and report them via /health. The post-deploy
# verification step (Step 6) reads /health and compares the
# reported SHA against this value to detect any silent drift.

log "Step 2: capturing build provenance"
if [ "$DRY_RUN" = "1" ] || [ ! -d "$APP_DIR/.git" ]; then
    # Nothing to inspect: use placeholder values. Only the two SHA
    # variables are set on this path.
    log "  [dry-run] would read git log from $APP_DIR"
    DEPLOYING_SHA="dry-run"
    DEPLOYING_SHA_FULL="dry-run"
else
    # Full SHA of the checked-out commit, its 7-character abbreviation,
    # the current UTC time and the requested branch name.
    DEPLOYING_SHA_FULL="$(git -C "$APP_DIR" rev-parse HEAD)"
    DEPLOYING_SHA="${DEPLOYING_SHA_FULL:0:7}"
    DEPLOYING_TIME="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
    DEPLOYING_BRANCH="$BRANCH"
    log "  commit:    $DEPLOYING_SHA ($DEPLOYING_SHA_FULL)"
    log "  built at:  $DEPLOYING_TIME"
    log "  branch:    $DEPLOYING_BRANCH"
    # One-line summary of the commit, indented to line up with the log
    # output above.
    git -C "$APP_DIR" log --oneline -1 | sed 's/^/  /'
    # Exported so the `docker compose` invocation in Step 4 inherits them.
    export ATOCORE_BUILD_SHA="$DEPLOYING_SHA_FULL" \
        ATOCORE_BUILD_TIME="$DEPLOYING_TIME" \
        ATOCORE_BUILD_BRANCH="$DEPLOYING_BRANCH"
fi

# ---------------------------------------------------------------------
# Step 3: preserve the .env file (it's not in git)
# ---------------------------------------------------------------------

# The .env file lives next to the compose file and is not tracked by
# git, so the checkout cannot supply it. Outside dry-run mode its
# absence is fatal (exit 2).
ENV_FILE="${COMPOSE_DIR}/.env"
if ! { [ "$DRY_RUN" = "1" ] || [ -f "$ENV_FILE" ]; }; then
    log "Step 3: WARNING — $ENV_FILE does not exist"
    log "       the compose workflow needs this file to map mount points"
    log "       copy deploy/dalidou/.env.example to $ENV_FILE and edit it"
    log "       before re-running this script"
    exit 2
fi

# ---------------------------------------------------------------------
# Step 4: rebuild and restart the container
# ---------------------------------------------------------------------

log "Step 4: rebuilding and restarting the atocore container"
run "cd '$COMPOSE_DIR' && docker compose up -d --build"

# In dry-run mode run() only printed the command above, so there is
# nothing to health-check: finish successfully here.
case "$DRY_RUN" in
    1)
        log "dry-run complete — no mutations performed"
        exit 0
        ;;
esac

# ---------------------------------------------------------------------
# Step 5: wait for the service to come up and pass the health check
# ---------------------------------------------------------------------

log "Step 5: waiting for /health to respond"

# Fetch /health once and store the body where Step 6 reads it.
# Returns curl's status: non-zero on connection failure or, because of
# -f, on an HTTP error response. curl's own error text is suppressed;
# the retry loop reports progress instead.
probe_health() {
    curl -fsS "$HEALTH_URL" > /tmp/atocore-health.json 2>/dev/null
}

# Up to 10 probes, 3 seconds apart. The first success is remembered in
# health_ok so it is not re-probed afterwards: a second request could
# fail transiently and would also overwrite the response already saved.
health_ok=0
for attempt in 1 2 3 4 5 6 7 8 9 10; do
    if probe_health; then
        log "       service is responding"
        health_ok=1
        break
    fi
    log "       not ready yet ($attempt/10); waiting 3s"
    sleep 3
done

# The loop ends with a sleep, so give the service one last chance at
# the end of the 30-second window before declaring failure.
if [ "$health_ok" != "1" ] && probe_health; then
    log "       service is responding"
    health_ok=1
fi

if [ "$health_ok" != "1" ]; then
    log "FATAL: service did not come up within 30 seconds"
    log "       container logs (last 50 lines):"
    # Best effort: failing to fetch logs must not mask exit code 3.
    ( cd "$COMPOSE_DIR" && docker compose logs --tail=50 atocore ) || true
    exit 3
fi

# ---------------------------------------------------------------------
# Step 6: verify the deployed build matches what we just shipped
# ---------------------------------------------------------------------
#
# Two layers of comparison:
#
# - code_version: matches src/atocore/__init__.py::__version__.
#   Coarse: any commit between version bumps reports the same value.
# - build_sha: full git SHA the container was built from. Set as
#   an env var by Step 2 above and read by /health from
#   ATOCORE_BUILD_SHA. This is the precise drift signal — if the
#   live build_sha doesn't match $DEPLOYING_SHA_FULL, the build
#   didn't pick up the new source.

log "Step 6: verifying deployed build"
log "  /health response:"

# Response body saved by Step 5.
HEALTH_JSON=/tmp/atocore-health.json

# Fallback extractor for hosts without jq: print the value of the first
# "<key>": "<string>" pair found in $HEALTH_JSON, or nothing when the
# key is absent.
# Arguments: $1 - JSON key name
# Outputs:   the string value on stdout (empty when not found)
# Returns:   always 0. Under `set -e -o pipefail` a grep that matches
#            nothing would otherwise abort the whole script from inside
#            the command substitution, before any default is applied.
# Optional whitespace around the colon is accepted so both compact and
# pretty-printed JSON work.
health_string_field() {
    grep -oE "\"$1\"[[:space:]]*:[[:space:]]*\"[^\"]*\"" "$HEALTH_JSON" \
        | head -1 | cut -d'"' -f4 || true
}

if command -v jq >/dev/null 2>&1; then
    jq . < "$HEALTH_JSON" | sed 's/^/    /'
    # Prefer code_version; fall back to the version field.
    REPORTED_VERSION="$(jq -r '.code_version // .version' < "$HEALTH_JSON")"
    REPORTED_SHA="$(jq -r '.build_sha // "unknown"' < "$HEALTH_JSON")"
    REPORTED_BUILD_TIME="$(jq -r '.build_time // "unknown"' < "$HEALTH_JSON")"
else
    sed 's/^/    /' < "$HEALTH_JSON"
    echo
    REPORTED_VERSION="$(health_string_field code_version)"
    if [ -z "$REPORTED_VERSION" ]; then
        REPORTED_VERSION="$(health_string_field version)"
    fi
    REPORTED_SHA="$(health_string_field build_sha)"
    REPORTED_SHA="${REPORTED_SHA:-unknown}"
    REPORTED_BUILD_TIME="$(health_string_field build_time)"
    REPORTED_BUILD_TIME="${REPORTED_BUILD_TIME:-unknown}"
fi

# Expected version comes straight from the checked-out source. The
# `|| true` keeps a missing file or missing __version__ line from
# killing the script silently; the explicit check below reports it.
VERSION_FILE="$APP_DIR/src/atocore/__init__.py"
EXPECTED_VERSION="$(grep -oE "__version__ = \"[^\"]+\"" "$VERSION_FILE" | head -1 | cut -d'"' -f2 || true)"

log "  Layer 1 — coarse version:"
log "    expected code_version: $EXPECTED_VERSION (from src/atocore/__init__.py)"
log "    reported code_version: $REPORTED_VERSION (from live /health)"

if [ -z "$EXPECTED_VERSION" ]; then
    # Without an expected value the comparison is meaningless (and two
    # empty strings would compare equal), so treat it as a failure.
    log "FATAL: could not read __version__ from $VERSION_FILE"
    exit 4
fi

if [ "$REPORTED_VERSION" != "$EXPECTED_VERSION" ]; then
    log "FATAL: code_version mismatch"
    log "       the container may not have picked up the new image"
    log "       try: docker compose down && docker compose up -d --build"
    exit 4
fi

log "  Layer 2 — precise build SHA:"
log "    expected build_sha: $DEPLOYING_SHA_FULL (from this deploy.sh run)"
log "    reported build_sha: $REPORTED_SHA (from live /health)"
log "    reported build_time: $REPORTED_BUILD_TIME"

if [ "$REPORTED_SHA" != "$DEPLOYING_SHA_FULL" ]; then
    log "FATAL: build_sha mismatch"
    log "       the live container is reporting a different commit than"
    log "       the one this deploy.sh run just shipped. Possible causes:"
    log "       - the container is using a cached image instead of the"
    log "         freshly-built one (try: docker compose build --no-cache)"
    log "       - the env vars didn't propagate (check that"
    log "         deploy/dalidou/docker-compose.yml has the environment"
    log "         section with ATOCORE_BUILD_SHA)"
    log "       - another process restarted the container between the"
    log "         build and the health check"
    exit 6
fi

log "Deploy complete."
log "  commit:       $DEPLOYING_SHA ($DEPLOYING_SHA_FULL)"
log "  code_version: $REPORTED_VERSION"
log "  build_sha:    $REPORTED_SHA"
log "  build_time:   $REPORTED_BUILD_TIME"
log "  health:       ok"