#!/usr/bin/env bash
#
# deploy/dalidou/batch-extract.sh
# --------------------------------
# Host-side LLM batch extraction for Dalidou.
#
# The claude CLI is available on the Dalidou HOST but NOT inside the
# Docker container. This script runs on the host, fetches recent
# interactions from the AtoCore API, runs the LLM extractor locally
# (claude -p sonnet), and posts candidates back to the API.
#
# Intended to be called from cron-backup.sh after backup/cleanup/rsync,
# or manually via:
#
#   bash /srv/storage/atocore/app/deploy/dalidou/batch-extract.sh
#
# Environment variables:
#   ATOCORE_URL              default http://127.0.0.1:8100
#   ATOCORE_EXTRACT_LIMIT    default 50

set -euo pipefail

ATOCORE_URL="${ATOCORE_URL:-http://127.0.0.1:8100}"
LIMIT="${ATOCORE_EXTRACT_LIMIT:-50}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
APP_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Run identifier: the UTC start time of this pipeline run (ISO-8601).
# Persisted to the API as 'run_at' / 'pipeline_last_run'.
RUN_TS="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

# Log with the *current* time so long-running steps show real progress.
# (Previously every line reused the start-of-run timestamp.)
log() { printf '[%s] %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"; }

# The Python scripts need the atocore source on PYTHONPATH.
export PYTHONPATH="$APP_DIR/src:${PYTHONPATH:-}"

log "=== AtoCore batch extraction + triage starting ==="
log "URL=$ATOCORE_URL LIMIT=$LIMIT"

# --- Pipeline stats accumulator ---
EXTRACT_OUT=""
TRIAGE_OUT=""
HARNESS_OUT=""

# Step A: Extract candidates from recent interactions
log "Step A: LLM extraction"
EXTRACT_OUT=$(python3 "$APP_DIR/scripts/batch_llm_extract_live.py" \
  --base-url "$ATOCORE_URL" \
  --limit "$LIMIT" \
  2>&1) || {
  log "WARN: batch extraction failed (non-blocking)"
}
printf '%s\n' "$EXTRACT_OUT"

# Step B: Auto-triage candidates in the queue
log "Step B: auto-triage"
TRIAGE_OUT=$(python3 "$APP_DIR/scripts/auto_triage.py" \
  --base-url "$ATOCORE_URL" \
  2>&1) || {
  log "WARN: auto-triage failed (non-blocking)"
}
printf '%s\n' "$TRIAGE_OUT"

# Step B2: Auto-promote reinforced candidates + expire stale ones
log "Step B2: auto-promote + expire"
python3 "$APP_DIR/scripts/auto_promote_reinforced.py" \
  2>&1 || {
  log "WARN: auto-promote/expire failed (non-blocking)"
}

# Step C: Daily project synthesis (keeps wiki/mirror pages fresh)
log "Step C: project synthesis (daily)"
python3 "$APP_DIR/scripts/synthesize_projects.py" \
  --base-url "$ATOCORE_URL" \
  2>&1 || {
  log "WARN: synthesis failed (non-blocking)"
}

# Step D: Weekly lint pass (Sundays only — heavier, not needed daily)
if [[ "$(date -u +%u)" == "7" ]]; then
  log "Step D: weekly lint pass"
  python3 "$APP_DIR/scripts/lint_knowledge_base.py" \
    --base-url "$ATOCORE_URL" \
    2>&1 || true
fi

# Step E: Retrieval harness (daily)
# Capture stdout ONLY: the --json payload must stay parseable, so stderr
# warnings go to the console instead of being mixed into HARNESS_OUT.
log "Step E: retrieval harness"
HARNESS_OUT=$(python3 "$APP_DIR/scripts/retrieval_eval.py" \
  --json \
  --base-url "$ATOCORE_URL") || {
  log "WARN: retrieval harness failed (non-blocking)"
}
printf '%s\n' "$HARNESS_OUT"

# Step F: Persist pipeline summary to project state.
#
# The tool outputs are handed to Python through the ENVIRONMENT, never
# interpolated into the source: the previous '''$HARNESS_OUT''' splice
# broke (or worse, executed) on any quote/backslash in the output.
# The quoted heredoc delimiter keeps the shell out of the Python code.
log "Step F: pipeline summary"
if ! PS_BASE_URL="$ATOCORE_URL" \
     PS_RUN_TS="$RUN_TS" \
     PS_HARNESS_OUT="$HARNESS_OUT" \
     PS_TRIAGE_OUT="$TRIAGE_OUT" \
     python3 - <<'PYEOF'
import json
import os
import re
import sys
import urllib.request

base = os.environ['PS_BASE_URL']
ts = os.environ['PS_RUN_TS']


def post_state(key, value):
    # Best-effort POST of one key/value into the AtoCore project-state
    # store; failures are logged to stderr but never abort the summary.
    body = json.dumps({
        'project': 'atocore',
        'category': 'status',
        'key': key,
        'value': value,
        'source': 'nightly pipeline',
    }).encode()
    req = urllib.request.Request(
        f'{base}/project/state',
        data=body,
        headers={'Content-Type': 'application/json'},
        method='POST',
    )
    try:
        urllib.request.urlopen(req, timeout=10)
    except Exception as e:
        print(f'WARN: failed to persist {key}: {e}', file=sys.stderr)


# Parse harness JSON
harness = {}
try:
    harness = json.loads(os.environ.get('PS_HARNESS_OUT', ''))
    post_state('retrieval_harness_result', json.dumps({
        'passed': harness.get('passed', 0),
        'total': harness.get('total', 0),
        'failures': [f['name'] for f in harness.get('fixtures', [])
                     if not f.get('ok')],
        'run_at': ts,
    }))
    p, t = harness.get('passed', '?'), harness.get('total', '?')
    print(f'Harness: {p}/{t}')
except Exception:
    # Reset so a non-dict parse result (e.g. a JSON list) cannot crash
    # the harness.get(...) calls in the summary below.
    harness = {}
    print('WARN: could not parse harness output')

# Parse triage counts from stdout (loose keyword counting, on purpose)
triage_out = os.environ.get('PS_TRIAGE_OUT', '')
promoted = len(re.findall(r'promoted', triage_out, re.IGNORECASE))
rejected = len(re.findall(r'rejected', triage_out, re.IGNORECASE))
needs_human = len(re.findall(r'needs.human', triage_out, re.IGNORECASE))

# Build summary
summary = {
    'run_at': ts,
    'harness_passed': harness.get('passed', -1),
    'harness_total': harness.get('total', -1),
    'triage_promoted': promoted,
    'triage_rejected': rejected,
    'triage_needs_human': needs_human,
}
post_state('pipeline_last_run', ts)
post_state('pipeline_summary', json.dumps(summary))
print(f'Pipeline summary persisted: {json.dumps(summary)}')
PYEOF
then
  log "WARN: pipeline summary persistence failed (non-blocking)"
fi

log "=== AtoCore batch extraction + triage complete ==="