diff --git a/config/project-registry.json b/config/project-registry.json index 5c385c4..9ce40da 100644 --- a/config/project-registry.json +++ b/config/project-registry.json @@ -15,7 +15,7 @@ { "id": "p04-gigabit", "aliases": ["p04", "gigabit", "gigaBIT"], - "description": "Curated staged docs for the P04 GigaBIT mirror architecture and OTA optics project.", + "description": "Active P04 GigaBIT mirror project corpus from PKM plus staged operational docs.", "ingest_roots": [ { "source": "vault", @@ -27,7 +27,7 @@ { "id": "p05-interferometer", "aliases": ["p05", "interferometer"], - "description": "Curated staged docs for the P05 interferometer architecture, vendors, and error-budget project.", + "description": "Active P05 interferometer corpus from PKM plus selected repo context and vendor documentation.", "ingest_roots": [ { "source": "vault", @@ -39,7 +39,7 @@ { "id": "p06-polisher", "aliases": ["p06", "polisher"], - "description": "Curated staged docs for the P06 polisher project.", + "description": "Active P06 polisher corpus from PKM, software-suite notes, and selected repo context.", "ingest_roots": [ { "source": "vault", diff --git a/docs/current-state.md b/docs/current-state.md index 576869c..b83b370 100644 --- a/docs/current-state.md +++ b/docs/current-state.md @@ -52,7 +52,7 @@ now includes a first curated ingestion batch for the active projects. 
- Dalidou Docker deployment foundation - initial AtoCore self-knowledge corpus ingested on Dalidou - T420/OpenClaw read-only AtoCore helper skill -- first curated active-project corpus batch for: +- full active-project markdown/text corpus wave for: - `p04-gigabit` - `p05-interferometer` - `p06-polisher` @@ -87,7 +87,7 @@ The Dalidou instance already contains: - Master Plan V3 - Build Spec V1 - trusted project-state entries for `atocore` -- curated staged project docs for: +- full staged project markdown/text corpora for: - `p04-gigabit` - `p05-interferometer` - `p06-polisher` @@ -99,12 +99,12 @@ The Dalidou instance already contains: - `p05-interferometer` - `p06-polisher` -Current live stats after the latest documentation sync and active-project ingest -passes: +Current live stats after the full active-project wave are now far beyond the +initial seed stage: -- `source_documents`: 36 -- `source_chunks`: 568 -- `vectors`: 568 +- more than `1,100` source documents +- more than `20,000` chunks +- matching vector count The broader long-term corpus is still not fully populated yet. Wider project and vault ingestion remains a deliberate next step rather than something already @@ -115,8 +115,8 @@ primarily visible under: - `/srv/storage/atocore/sources/vault/incoming/projects` -This staged area is now useful for review because it contains the curated -project docs that were actually ingested for the first active-project batch. +This staged area is now useful for review because it contains the markdown/text +project docs that were actually ingested for the full active-project wave. It is important to read this staged area correctly: @@ -166,10 +166,12 @@ These are curated summaries and extracted stable project signals. 
In `source_documents` / retrieval corpus: -- real project documents are now present for the same active project set +- full project markdown/text corpora are now present for the active project set - retrieval is no longer limited to AtoCore self-knowledge only -- the current corpus is still selective rather than exhaustive -- that selectivity is intentional at this stage +- the current corpus is broad enough that ranking quality matters more than + corpus presence alone +- underspecified prompts can still pull in historical or archive material, so + project-aware routing and better ranking remain important The source refresh model now has a concrete foundation in code: @@ -223,8 +225,8 @@ This separation is healthy: ## Immediate Next Focus 1. Use the new T420-side organic routing layer in real OpenClaw workflows -2. Keep tightening retrieval quality for the newly seeded active projects -3. Define the first broader AtoVault/AtoDrive ingestion batches +2. Tighten retrieval quality for the now fully ingested active project corpora +3. Move to Wave 2 trusted-operational ingestion instead of blindly widening raw corpus further 4. Keep the new engineering-knowledge architecture docs as implementation guidance while avoiding premature schema work 5. Expand the boring operations baseline: - restore validation @@ -234,6 +236,7 @@ This separation is healthy: See also: +- [ingestion-waves.md](C:/Users/antoi/ATOCore/docs/ingestion-waves.md) - [master-plan-status.md](C:/Users/antoi/ATOCore/docs/master-plan-status.md) ## Guiding Constraints diff --git a/docs/ingestion-waves.md b/docs/ingestion-waves.md new file mode 100644 index 0000000..3f46b60 --- /dev/null +++ b/docs/ingestion-waves.md @@ -0,0 +1,129 @@ +# AtoCore Ingestion Waves + +## Purpose + +This document tracks how the corpus should grow without losing signal quality. 
+ +The rule is: + +- ingest in waves +- validate retrieval after each wave +- only then widen the source scope + +## Wave 1 - Active Project Full Markdown Corpus + +Status: complete + +Projects: + +- `p04-gigabit` +- `p05-interferometer` +- `p06-polisher` + +What was ingested: + +- the full markdown/text PKM stacks for the three active projects +- selected staged operational docs already under the Dalidou source roots +- selected repo markdown/text context for: + - `Fullum-Interferometer` + - `polisher-sim` + - `Polisher-Toolhead` (when markdown exists) + +What was intentionally excluded: + +- binaries +- images +- PDFs +- generated outputs unless they were plain text reports +- dependency folders +- hidden runtime junk + +Practical result: + +- AtoCore moved from a curated-seed corpus to a real active-project corpus +- the live corpus now contains well over one thousand source documents and over + twenty thousand chunks +- project-specific context building is materially stronger than before + +Main lesson from Wave 1: + +- full project ingestion is valuable +- but broad historical/archive material can dilute retrieval for underspecified + prompts +- context quality now depends more strongly on good project hints and better + ranking than on corpus size alone + +## Wave 2 - Trusted Operational Layer Expansion + +Status: next + +Goal: + +- expand `AtoDrive`-style operational truth for the active projects + +Candidate inputs: + +- current status dashboards +- decision logs +- milestone tracking +- curated requirements baselines +- explicit next-step plans + +Why this matters: + +- this raises the quality of the high-trust layer instead of only widening + general retrieval + +## Wave 3 - Broader Active Engineering References + +Status: planned + +Goal: + +- ingest reusable engineering references that support the active project set + without dumping the entire vault + +Candidate inputs: + +- interferometry reference notes directly tied to `p05` +- polishing physics 
references directly tied to `p06` +- mirror and structural reference material directly tied to `p04` + +Rule: + +- only bring in references with a clear connection to active work + +## Wave 4 - Wider PKM Population + +Status: deferred + +Goal: + +- widen beyond the active projects while preserving retrieval quality + +Preconditions: + +- stronger ranking +- better project-aware routing +- stable operational restore path +- clearer promotion rules for trusted state + +## Validation After Each Wave + +After every ingestion wave, verify: + +- `stats` +- project-specific `query` +- project-specific `context-build` +- `debug-context` +- whether trusted project state still dominates when it should +- whether cross-project bleed is getting worse or better + +## Working Rule + +The next wave should only happen when the current wave is: + +- ingested +- inspected +- retrieval-tested +- operationally stable diff --git a/docs/next-steps.md b/docs/next-steps.md index 6707db6..f984f1a 100644 --- a/docs/next-steps.md +++ b/docs/next-steps.md @@ -29,9 +29,11 @@ This working list should be read alongside: - check whether the top hits are useful - check whether trusted project state remains dominant - reduce cross-project competition and prompt ambiguity where needed -3. Continue controlled project ingestion only where the current corpus is still - thin - - a few additional anchor docs per active project + - use `debug-context` to inspect the exact last AtoCore supplement +3. Treat the active-project full markdown/text wave as complete + - `p04-gigabit` + - `p05-interferometer` + - `p06-polisher` 4. Define a cleaner source refresh model - make the difference between source truth, staged inputs, and machine store explicit @@ -39,15 +41,20 @@ This working list should be read alongside: - foundation now exists via project registry + per-project refresh API - registration policy + template + proposal + approved registration are now the normal path for new projects -5. 
Integrate the new engineering architecture docs into active planning, not immediate schema code +5. Move to Wave 2 trusted-operational ingestion + - curated dashboards + - decision logs + - milestone/current-status views + - operational truth, not just raw project notes +6. Integrate the new engineering architecture docs into active planning, not immediate schema code - keep `docs/architecture/engineering-knowledge-hybrid-architecture.md` as the target layer model - keep `docs/architecture/engineering-ontology-v1.md` as the V1 structured-domain target - do not start entity/relationship persistence until the ingestion, retrieval, registry, and backup baseline feels boring and stable -6. Define backup and export procedures for Dalidou +7. Define backup and export procedures for Dalidou - exercise the new SQLite + registry snapshot path on Dalidou - Chroma backup or rebuild policy - retention and restore validation -7. Keep deeper automatic runtime integration modest until the organic read-only +8. Keep deeper automatic runtime integration modest until the organic read-only model has proven value ## Trusted State Status @@ -69,36 +76,39 @@ This materially improves `context/build` quality for project-hinted prompts. ## Recommended Near-Term Project Work -The first curated batch is already in. +The active-project full markdown/text wave is now in. The near-term work is now: 1. strengthen retrieval quality -2. add a few more anchor docs only where retrieval is still weak +2. promote or refine trusted operational truth where the broad corpus is now too noisy 3. keep trusted project state concise and high-confidence +4. widen only through named ingestion waves -## Recommended Additional Anchor Docs +## Recommended Next Wave Inputs -1. `p04-gigabit` -2. `p05-interferometer` -3. `p06-polisher` +Wave 2 should emphasize trusted operational truth, not bulk historical notes. 
P04: -- 1 to 2 more strong study summaries -- 1 to 2 more meeting notes with actual decisions +- current status dashboard +- current selected design path +- current frame interface truth +- current next-step milestone view P05: -- a couple more architecture docs -- selected vendor-response notes -- possibly one or two NX/WAVE consumer docs +- selected vendor path +- current error-budget baseline +- current architecture freeze or open decisions +- current procurement / next-action view P06: -- more explicit interface/schema docs if needed -- selected operations or UI docs -- a distilled non-empty operational context doc to replace an empty `_context.md` +- current system map +- current shared contracts baseline +- current calibration procedure truth +- current July / proving roadmap view ## Deferred On Purpose @@ -115,6 +125,8 @@ The next batch is successful if: - OpenClaw can use AtoCore naturally when context is needed - OpenClaw can infer registered projects and call AtoCore organically for project-knowledge questions +- the active-project full corpus wave can be inspected and used concretely + through `auto-context`, `context-build`, and `debug-context` - OpenClaw can also register a new project cleanly before refreshing it - existing project registrations can be refined safely before refresh when the staged source set evolves diff --git a/docs/openclaw-integration-contract.md b/docs/openclaw-integration-contract.md index e57f789..f526d23 100644 --- a/docs/openclaw-integration-contract.md +++ b/docs/openclaw-integration-contract.md @@ -82,6 +82,7 @@ The current helper script exposes: - `project-template` - `detect-project <prompt>` - `auto-context <prompt> [budget] [project]` +- `debug-context` - `propose-project ...` - `register-project ...` - `update-project ...` @@ -125,6 +126,8 @@ Recommended first behavior: 1. OpenClaw receives a user request 2. 
If the prompt looks like project knowledge, OpenClaw should try: - `auto-context "<prompt>" 3000` + - optionally `debug-context` immediately after if a human wants to inspect + the exact AtoCore supplement 3. If the prompt is clearly asking for trusted current truth, OpenClaw should prefer: - `project-state <project>` diff --git a/src/atocore/api/routes.py b/src/atocore/api/routes.py index 6d268d1..a80b77b 100644 --- a/src/atocore/api/routes.py +++ b/src/atocore/api/routes.py @@ -18,6 +18,7 @@ from atocore.context.project_state import ( set_state, ) from atocore.ingestion.pipeline import ( + exclusive_ingestion, get_ingestion_stats, get_source_status, ingest_configured_sources, @@ -153,12 +154,13 @@ def api_ingest(req: IngestRequest) -> IngestResponse: """Ingest a markdown file or folder.""" target = Path(req.path) try: - if target.is_file(): - results = [ingest_file(target)] - elif target.is_dir(): - results = ingest_folder(target) - else: - raise HTTPException(status_code=404, detail=f"Path not found: {req.path}") + with exclusive_ingestion(): + if target.is_file(): + results = [ingest_file(target)] + elif target.is_dir(): + results = ingest_folder(target) + else: + raise HTTPException(status_code=404, detail=f"Path not found: {req.path}") except HTTPException: raise except Exception as e: @@ -171,7 +173,8 @@ def api_ingest(req: IngestRequest) -> IngestResponse: def api_ingest_sources() -> IngestSourcesResponse: """Ingest enabled configured source directories.""" try: - results = ingest_configured_sources() + with exclusive_ingestion(): + results = ingest_configured_sources() except Exception as e: log.error("ingest_sources_failed", error=str(e)) raise HTTPException(status_code=500, detail=f"Configured source ingestion failed: {e}") @@ -246,7 +249,8 @@ def api_project_update(project_name: str, req: ProjectUpdateRequest) -> dict: def api_refresh_project(project_name: str, purge_deleted: bool = False) -> ProjectRefreshResponse: """Refresh one registered project from its configured 
ingest roots.""" try: - result = refresh_registered_project(project_name, purge_deleted=purge_deleted) + with exclusive_ingestion(): + result = refresh_registered_project(project_name, purge_deleted=purge_deleted) except ValueError as e: raise HTTPException(status_code=404, detail=str(e)) except Exception as e: diff --git a/src/atocore/ingestion/pipeline.py b/src/atocore/ingestion/pipeline.py index ae2470b..78052ea 100644 --- a/src/atocore/ingestion/pipeline.py +++ b/src/atocore/ingestion/pipeline.py @@ -2,8 +2,10 @@ import hashlib import json +import threading import time import uuid +from contextlib import contextmanager from pathlib import Path import atocore.config as _config @@ -17,6 +19,17 @@ log = get_logger("ingestion") # Encodings to try when reading markdown files _ENCODINGS = ["utf-8", "utf-8-sig", "latin-1", "cp1252"] +_INGESTION_LOCK = threading.Lock() + + +@contextmanager +def exclusive_ingestion(): + """Serialize long-running ingestion operations across API requests.""" + _INGESTION_LOCK.acquire() + try: + yield + finally: + _INGESTION_LOCK.release() def ingest_file(file_path: Path) -> dict: diff --git a/tests/test_api_storage.py b/tests/test_api_storage.py index 25fee45..7eefbc4 100644 --- a/tests/test_api_storage.py +++ b/tests/test_api_storage.py @@ -1,5 +1,7 @@ """Tests for storage-related API readiness endpoints.""" +from contextlib import contextmanager + from fastapi.testclient import TestClient import atocore.config as config @@ -152,6 +154,38 @@ def test_project_refresh_endpoint_uses_registered_roots(tmp_data_dir, monkeypatc assert response.json()["project"] == "p05-interferometer" +def test_project_refresh_endpoint_serializes_ingestion(tmp_data_dir, monkeypatch): + config.settings = config.Settings() + events = [] + + @contextmanager + def fake_lock(): + events.append("enter") + try: + yield + finally: + events.append("exit") + + def fake_refresh_registered_project(project_name, purge_deleted=False): + events.append(("refresh", 
project_name, purge_deleted)) + return { + "project": "p05-interferometer", + "aliases": ["p05"], + "description": "P05 docs", + "purge_deleted": purge_deleted, + "roots": [], + } + + monkeypatch.setattr("atocore.api.routes.exclusive_ingestion", fake_lock) + monkeypatch.setattr("atocore.api.routes.refresh_registered_project", fake_refresh_registered_project) + + client = TestClient(app) + response = client.post("/projects/p05/refresh") + + assert response.status_code == 200 + assert events == ["enter", ("refresh", "p05", False), "exit"] + + def test_projects_template_endpoint_returns_template(tmp_data_dir, monkeypatch): config.settings = config.Settings()