Files
ATOCore/deploy/dalidou/batch-extract.sh
Anto01 6a2471d509 fix: persist +x bit on deploy scripts + hook scripts
Git on Windows was stripping the executable bit every time a script
got edited, which broke the dedup-watcher cron (~100s of 'Permission
denied' entries in dedup-watcher.log since 7A deploy) and silently
disabled the auto-triage-watcher, batch-extract, graduation-watcher,
and hourly-extract cadences whenever they were touched from Windows.

Used `git update-index --chmod=+x` to store the bits in the index so
subsequent deploys preserve them regardless of the editor platform.

No functional changes; the scripts themselves are unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-21 15:39:26 -04:00

275 lines
9.0 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
#
# deploy/dalidou/batch-extract.sh
# --------------------------------
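# Host-side LLM batch extraction for Dalidou. After extraction this script
# also runs the rest of the nightly pipeline (auto-triage, auto-promote,
# project synthesis, weekly lint, retrieval harness, summary persistence,
# emerging-concepts detection, memory extension/decay, dedup, tag
# canonicalization, integrity check and alerts). Every step is non-blocking.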
#
# The claude CLI is available on the Dalidou HOST but NOT inside the
# Docker container. This script runs on the host, fetches recent
# interactions from the AtoCore API, runs the LLM extractor locally
# (claude -p sonnet), and posts candidates back to the API.
#
# Intended to be called from cron-backup.sh after backup/cleanup/rsync,
# or manually via:
#
# bash /srv/storage/atocore/app/deploy/dalidou/batch-extract.sh
#
# Environment variables:
# ATOCORE_URL default http://127.0.0.1:8100
# ATOCORE_EXTRACT_LIMIT default 50
# Strict mode: abort on unhandled errors, on unset variables, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Print one log line on stdout, prefixed with the run timestamp.
# All arguments are joined with single spaces into one message.
log() {
  printf '[%s] %s\n' "$TIMESTAMP" "$*"
}

# UTC timestamp taken once at startup. Every log line reuses it, so it
# identifies the run rather than the moment a line was written.
TIMESTAMP="$(date -u +%Y-%m-%dT%H:%M:%SZ)"

# Caller-overridable settings (see the header for the defaults).
: "${ATOCORE_URL:=http://127.0.0.1:8100}"
LIMIT="${ATOCORE_EXTRACT_LIMIT:-50}"

# Resolve this script's directory, then the application root two levels up.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
APP_DIR="$(
  cd "$SCRIPT_DIR/../.." && pwd
)"

# The Python scripts import the atocore package from the source tree.
PYTHONPATH="$APP_DIR/src:${PYTHONPATH:-}"
export PYTHONPATH

log "=== AtoCore batch extraction + triage starting ==="
log "URL=$ATOCORE_URL LIMIT=$LIMIT"

# --- Pipeline stats accumulator ---
# Captured step outputs; pre-initialised so they are always set under nounset.
EXTRACT_OUT="" TRIAGE_OUT="" HARNESS_OUT=""
# Step A: Extract candidates from recent interactions.
# The extractor's stdout and stderr are captured together and replayed once
# it has finished; a non-zero exit is logged but does not stop the pipeline.
log "Step A: LLM extraction"
if ! EXTRACT_OUT=$(
  python3 "$APP_DIR/scripts/batch_llm_extract_live.py" \
    --base-url "$ATOCORE_URL" \
    --limit "$LIMIT" \
    2>&1
); then
  log "WARN: batch extraction failed (non-blocking)"
fi
echo "$EXTRACT_OUT"
# Step B: Auto-triage candidates in the queue.
# The combined stdout/stderr is kept in TRIAGE_OUT (Step F counts keywords
# in it) and echoed afterwards; failure is logged and otherwise ignored.
log "Step B: auto-triage"
triage_cmd=(
  python3 "$APP_DIR/scripts/auto_triage.py"
  --base-url "$ATOCORE_URL"
)
TRIAGE_OUT=$("${triage_cmd[@]}" 2>&1) \
  || log "WARN: auto-triage failed (non-blocking)"
echo "$TRIAGE_OUT"
# Step B2: Auto-promote reinforced candidates + expire stale ones.
# Output streams straight to this script's stdout; failure is non-blocking.
# NOTE(review): unlike the other steps, no --base-url is passed here —
# confirm the script does not need it.
log "Step B2: auto-promote + expire"
if ! python3 "$APP_DIR/scripts/auto_promote_reinforced.py" 2>&1; then
  log "WARN: auto-promote/expire failed (non-blocking)"
fi
# Step C: Daily project synthesis (keeps wiki/mirror pages fresh).
# stderr is folded into stdout; a failing run is logged and skipped over.
log "Step C: project synthesis (daily)"
synth_cmd=(
  python3 "$APP_DIR/scripts/synthesize_projects.py"
  --base-url "$ATOCORE_URL"
)
"${synth_cmd[@]}" 2>&1 \
  || log "WARN: synthesis failed (non-blocking)"
# Step D: Weekly lint pass (Sundays only — heavier, not needed daily).
# `date -u +%u` prints the ISO weekday in UTC, where 7 is Sunday.
if [[ "$(date -u +%u)" == "7" ]]; then
  log "Step D: weekly lint pass"
  # Non-blocking like every other step, but a non-zero exit is logged
  # rather than silently discarded so a broken lint run is visible.
  python3 "$APP_DIR/scripts/lint_knowledge_base.py" \
    --base-url "$ATOCORE_URL" \
    2>&1 || {
    log "WARN: weekly lint pass failed (non-blocking)"
  }
fi
# Step E: Retrieval harness (daily).
# HARNESS_OUT must contain ONLY the harness's stdout (the --json document),
# because Step F parses it as JSON. The harness's stderr is therefore routed
# around the capture: fd 3 is a duplicate of this script's stdout, and
# stderr is sent there, so diagnostics still reach the log but can no longer
# corrupt the JSON. Failure is logged and does not stop the pipeline.
log "Step E: retrieval harness"
{
  HARNESS_OUT=$(
    python3 "$APP_DIR/scripts/retrieval_eval.py" \
      --json \
      --base-url "$ATOCORE_URL" \
      2>&3
  )
} 3>&1 || {
  log "WARN: retrieval harness failed (non-blocking)"
}
# printf rather than echo: the captured text is arbitrary data.
printf '%s\n' "$HARNESS_OUT"
# Step F: Persist pipeline summary to project state.
#
# Posts three status entries to $ATOCORE_URL/project/state:
#   retrieval_harness_result  pass/total/failing fixture names from Step E
#   pipeline_last_run         the run timestamp
#   pipeline_summary          harness totals plus triage keyword counts
#
# The captured step outputs are handed to Python through environment
# variables and the program text comes from a quoted heredoc. Nothing is
# spliced into the Python source, so backslashes, quotes or ''' sequences in
# the outputs can neither corrupt the data nor be executed as code.
log "Step F: pipeline summary"
ATOCORE_URL="$ATOCORE_URL" \
PIPELINE_TS="$TIMESTAMP" \
HARNESS_OUT="${HARNESS_OUT:-}" \
TRIAGE_OUT="${TRIAGE_OUT:-}" \
python3 - <<'PY' 2>&1 || {
import json
import os
import re
import sys
import urllib.request

base = os.environ['ATOCORE_URL']
ts = os.environ['PIPELINE_TS']


def post_state(key, value):
    """POST one 'status' entry for project 'atocore'; on error only warn."""
    body = json.dumps({
        'project': 'atocore', 'category': 'status',
        'key': key, 'value': value, 'source': 'nightly pipeline',
    }).encode()
    try:
        req = urllib.request.Request(
            f'{base}/project/state', data=body,
            headers={'Content-Type': 'application/json'}, method='POST',
        )
        # Context manager closes the HTTP response once the call returns.
        with urllib.request.urlopen(req, timeout=10):
            pass
    except Exception as e:
        print(f'WARN: failed to persist {key}: {e}', file=sys.stderr)


# Parse harness JSON
harness = {}
try:
    parsed = json.loads(os.environ.get('HARNESS_OUT', ''))
    # Anything but a JSON object would break the .get() calls below.
    if not isinstance(parsed, dict):
        raise ValueError('harness output is not a JSON object')
    harness = parsed
    post_state('retrieval_harness_result', json.dumps({
        'passed': harness.get('passed', 0),
        'total': harness.get('total', 0),
        'failures': [f['name'] for f in harness.get('fixtures', []) if not f.get('ok')],
        'run_at': ts,
    }))
    p, t = harness.get('passed', '?'), harness.get('total', '?')
    print(f'Harness: {p}/{t}')
except Exception:
    print('WARN: could not parse harness output')

# Parse triage counts from stdout (case-insensitive keyword occurrences)
triage_out = os.environ.get('TRIAGE_OUT', '')
promoted = len(re.findall(r'promoted', triage_out, re.IGNORECASE))
rejected = len(re.findall(r'rejected', triage_out, re.IGNORECASE))
needs_human = len(re.findall(r'needs.human', triage_out, re.IGNORECASE))

# Build summary; -1 marks harness numbers that could not be obtained
summary = {
    'run_at': ts,
    'harness_passed': harness.get('passed', -1),
    'harness_total': harness.get('total', -1),
    'triage_promoted': promoted,
    'triage_rejected': rejected,
    'triage_needs_human': needs_human,
}
post_state('pipeline_last_run', ts)
post_state('pipeline_summary', json.dumps(summary))
print(f'Pipeline summary persisted: {json.dumps(summary)}')
PY
  log "WARN: pipeline summary persistence failed (non-blocking)"
}
# Step F2: Emerging-concepts detector (Phase 6 C.1).
# Runs the detector against the API; a failure is logged and ignored.
log "Step F2: emerging-concepts detector"
if ! python3 "$APP_DIR/scripts/detect_emerging.py" \
  --base-url "$ATOCORE_URL" 2>&1; then
  log "WARN: emerging detector failed (non-blocking)"
fi
# Step F3: Transient-to-durable extension (Phase 6 C.3).
# POSTs to the extend-reinforced admin endpoint and shows at most the last
# five lines of the response. `pipefail` (enabled at the top of the script)
# is what lets a curl failure surface through the `tail` stage.
log "Step F3: transient-to-durable extension"
if ! curl --silent --show-error --fail \
  --request POST "$ATOCORE_URL/admin/memory/extend-reinforced" \
  --header 'Content-Type: application/json' \
  2>&1 | tail -n 5; then
  log "WARN: extend-reinforced failed (non-blocking)"
fi
# Step F4: Confidence decay on unreferenced cold memories (Phase 7D)
# Daily: memories with reference_count=0 AND idle > 30 days → confidence × 0.97.
# Below 0.3 → auto-supersede with audit. Reversible via reinforcement.
# The three parameters below are sent verbatim as the JSON request body.
log "Step F4: confidence decay"
decay_params='{"idle_days_threshold": 30, "daily_decay_factor": 0.97, "supersede_confidence_floor": 0.30}'
if ! curl --silent --show-error --fail \
  --request POST "$ATOCORE_URL/admin/memory/decay-run" \
  --header 'Content-Type: application/json' \
  --data "$decay_params" \
  2>&1 | tail -n 5; then
  log "WARN: decay-run failed (non-blocking)"
fi
# Step B3: Memory dedup scan (Phase 7A)
# Nightly at 0.90 (tight — only near-duplicates). Sundays run a deeper
# pass at 0.85 to catch semantically-similar-but-differently-worded memories.
# The weekday comes from `date -u +%u` (UTC, 7 = Sunday).
case "$(date -u +%u)" in
  7)
    DEDUP_THRESHOLD="0.85"
    DEDUP_BATCH="80"
    log "Step B3: memory dedup (Sunday deep pass, threshold $DEDUP_THRESHOLD)"
    ;;
  *)
    DEDUP_THRESHOLD="0.90"
    DEDUP_BATCH="50"
    log "Step B3: memory dedup (daily, threshold $DEDUP_THRESHOLD)"
    ;;
esac

# Run the scan with the settings chosen above; failure is non-blocking.
if ! python3 "$APP_DIR/scripts/memory_dedup.py" \
  --base-url "$ATOCORE_URL" \
  --similarity-threshold "$DEDUP_THRESHOLD" \
  --max-batch "$DEDUP_BATCH" \
  2>&1; then
  log "WARN: memory dedup failed (non-blocking)"
fi
# Step B4: Tag canonicalization (Phase 7C, weekly Sundays)
# Autonomous: LLM proposes alias→canonical maps, auto-applies confidence >= 0.8.
# Projects tokens are protected (skipped on both sides). Borderline proposals
# land in /admin/tags/aliases for human review.
# Only runs when the UTC ISO weekday is 7 (Sunday); failure is non-blocking.
if [[ "$(date -u +%u)" -eq 7 ]]; then
  log "Step B4: tag canonicalization (Sunday)"
  if ! python3 "$APP_DIR/scripts/canonicalize_tags.py" \
    --base-url "$ATOCORE_URL" 2>&1; then
    log "WARN: tag canonicalization failed (non-blocking)"
  fi
fi
# Step G: Integrity check (Phase 4 V1).
# Runs the checker against the API; a failing check is logged, not fatal.
log "Step G: integrity check"
integrity_cmd=(
  python3 "$APP_DIR/scripts/integrity_check.py"
  --base-url "$ATOCORE_URL"
)
"${integrity_cmd[@]}" 2>&1 \
  || log "WARN: integrity check failed (non-blocking)"
# Step H: Pipeline-level alerts — detect conditions that warrant attention.
#
# Reads the persisted project state and the admin dashboard, then emits a
# warning alert when
#   * the stored retrieval harness pass rate is below 85%, or
#   * more than 200 candidates are pending.
#
# The Python program comes from a quoted heredoc and receives APP_DIR and
# ATOCORE_URL via the environment, so unusual characters in those values
# cannot break or alter the program. A crash is logged instead of being
# silently swallowed; the step stays non-blocking.
log "Step H: pipeline alerts"
APP_DIR="$APP_DIR" ATOCORE_URL="$ATOCORE_URL" python3 - <<'PY' 2>&1 || {
import json
import os
import sys
import urllib.request

sys.path.insert(0, os.environ['APP_DIR'] + '/src')
from atocore.observability.alerts import emit_alert

base = os.environ['ATOCORE_URL']


def fetch_json(path):
    """GET base+path and decode the JSON body; raises on any failure."""
    req = urllib.request.Request(f'{base}{path}')
    with urllib.request.urlopen(req, timeout=10) as resp:
        return json.loads(resp.read())


def get_state(project='atocore'):
    """Return the project's state entries, or [] when unavailable."""
    try:
        return fetch_json(f'/project/state/{project}').get('entries', [])
    except Exception:
        return []


def get_dashboard():
    """Return the admin dashboard document, or {} when unavailable."""
    try:
        return fetch_json('/admin/dashboard')
    except Exception:
        return {}


# Index entries by (category, key); entries that are not objects are skipped.
state = {
    (e.get('category'), e.get('key')): e.get('value')
    for e in get_state() if isinstance(e, dict)
}
dash = get_dashboard()

# Harness regression check
harness_raw = state.get(('status', 'retrieval_harness_result'))
if harness_raw:
    try:
        h = json.loads(harness_raw)
        passed, total = h.get('passed', 0), h.get('total', 0)
        if total > 0:
            rate = passed / total
            if rate < 0.85:
                emit_alert('warning', 'Retrieval harness below 85%',
                    f'Only {passed}/{total} fixtures passing ({rate:.0%}). Failures: {h.get("failures", [])[:5]}',
                    context={'pass_rate': rate})
    except Exception as e:
        # Report malformed stored state instead of hiding it.
        print(f'WARN: harness regression check skipped: {e}', file=sys.stderr)

# Candidate queue pileup
memories = dash.get('memories', {}) if isinstance(dash, dict) else {}
candidates = memories.get('candidates', 0) if isinstance(memories, dict) else 0
if isinstance(candidates, (int, float)) and candidates > 200:
    emit_alert('warning', 'Candidate queue not draining',
        f'{candidates} candidates pending. Auto-triage may be stuck or rate-limited.',
        context={'candidates': candidates})

print('pipeline alerts check complete')
PY
  log "WARN: pipeline alerts check failed (non-blocking)"
}
log "=== AtoCore batch extraction + triage complete ==="