From 05c11fd4fb29e5fb0701952871eb3c13e6d1f63e Mon Sep 17 00:00:00 2001 From: Anto01 Date: Fri, 24 Apr 2026 11:32:46 -0400 Subject: [PATCH] fix(retrieval): fail open on registry resolution errors --- DEV-LEDGER.md | 4 ++- docs/current-state.md | 2 +- docs/master-plan-status.md | 2 +- src/atocore/ingestion/pipeline.py | 7 +++- src/atocore/retrieval/retriever.py | 20 +++++++++-- tests/test_ingestion.py | 39 ++++++++++++++++++++++ tests/test_retrieval.py | 53 ++++++++++++++++++++++++++++++ 7 files changed, 121 insertions(+), 6 deletions(-) diff --git a/DEV-LEDGER.md b/DEV-LEDGER.md index 2ced5c3..a918bca 100644 --- a/DEV-LEDGER.md +++ b/DEV-LEDGER.md @@ -9,7 +9,7 @@ - **live_sha** (Dalidou `/health` build_sha): `f44a211` (verified 2026-04-24T14:48:44Z post audit-improvements deploy; status=ok) - **last_updated**: 2026-04-24 by Codex (retrieval boundary deployed; project_id metadata branch started) - **main_tip**: `f44a211` -- **test_count**: 565 on `codex/project-id-metadata-retrieval` (deployed main baseline: 553) +- **test_count**: 567 on `codex/project-id-metadata-retrieval` (deployed main baseline: 553) - **harness**: `19/20 PASS` on live Dalidou, 0 blocking failures, 1 known content gap (`p04-constraints`) - **vectors**: 33,253 - **active_memories**: 290 (`/admin/dashboard` 2026-04-24; note integrity panel reports a separate active_memory_count=951 and needs reconciliation) @@ -174,6 +174,8 @@ One branch `codex/extractor-eval-loop` for Day 1-5, a second `codex/retrieval-ha - **2026-04-24 Codex (project_id audit response)** Applied independent-audit fixes on `codex/project-id-metadata-retrieval`. Closed the nightly `/ingest/sources` clobber risk by adding registry-level `derive_project_id_for_path()` and making unscoped `ingest_file()` derive ownership from registered ingest roots when possible; `refresh_registered_project()` still passes the canonical project id directly. Changed retrieval so empty `project_id` falls through to legacy path/tag ownership instead of short-circuiting as unowned. Hardened `scripts/backfill_chunk_project_ids.py`: `--apply` now requires `--chroma-snapshot-confirmed`, runs Chroma metadata updates before SQLite writes, batches updates, skips/report missing vectors, skips/report malformed metadata, reports already-tagged rows, and turns missing ingestion tables into a JSON `db_warning` instead of a traceback. Added tests for auto-derive ingestion, empty-project fallback, ingest-root overlap rejection, and backfill dry-run/apply/snapshot/missing-vector/malformed cases. Verified targeted suite (`test_backfill_chunk_project_ids.py`, `test_ingestion.py`, `test_project_registry.py`, `test_retrieval.py`): 45 passed. Verified full suite: 565 passed in 73.16s. Local dry-run on empty/default data returns 0 updates with `db_warning` rather than crashing. Branch still not merged/deployed. +- **2026-04-24 Codex (project_id final hardening before merge)** Applied the final independent-review P2s on `codex/project-id-metadata-retrieval`: `ingest_file()` still fails open when project-id derivation fails, but now emits `project_id_derivation_failed` with file path and error; retrieval now catches registry failures both at project-scope resolution and the soft project-match boost path, logs warnings, and serves unscoped rather than raising. Added regression tests for both fail-open paths. Verified targeted suite (`test_ingestion.py`, `test_retrieval.py`, `test_backfill_chunk_project_ids.py`, `test_project_registry.py`): 47 passed. Verified full suite: 567 passed in 79.66s. Branch still not merged/deployed. + - **2026-04-24 Codex (audit improvements foundation)** Started implementation of the audit recommendations on branch `codex/audit-improvements-foundation` from `origin/main@c53e61e`. First tranche: registry-aware project-scoped retrieval filtering (`ATOCORE_RANK_PROJECT_SCOPE_FILTER`, widened candidate pull before filtering), eval harness known-issue lane, two p05 project-bleed fixtures, `scripts/live_status.py`, README/current-state/master-plan status refresh. Verified `pytest -q`: 550 passed in 67.11s. Live retrieval harness against undeployed production: 20 fixtures, 18 pass, 1 known issue (`p04-constraints` Zerodur/1.2 content gap), 1 blocking guard (`p05-broad-status-no-atomizer`) still failing because production has not yet deployed the retrieval filter and currently pulls `P04-GigaBIT-M1-KB-design` into broad p05 status context. Live dashboard refresh: health ok, build `2b86543`, docs 1748, chunks/vectors 33253, interactions 948, active memories 289, candidates 0, project_state total 128. Noted count discrepancy: dashboard memories.active=289 while integrity active_memory_count=951; schedule reconciliation in a follow-up. - **2026-04-24 Codex (independent-audit hardening)** Applied the Opus independent audit's fast follow-ups before merge/deploy. Closed the two P1s by making project-scope ownership path/tag-based only, adding path-segment/tag-exact matching to avoid short-alias substring collisions, and keeping title/heading text out of provenance decisions. Added regression tests for title poisoning, substring collision, and unknown-project fallback. Added retrieval log fields `raw_results_count`, `post_filter_count`, `post_filter_dropped`, and `underfilled`. Added retrieval-eval run metadata (`generated_at`, `base_url`, `/health`) and `live_status.py` auth-token/status support. README now documents the ranking knobs and clarifies that the hard scope filter and soft project match boost are separate controls. Verified `pytest -q`: 553 passed in 66.07s. Live production remains expected-predeploy: 20 fixtures, 18 pass, 1 known content gap, 1 blocking p05 bleed guard. Latest live dashboard: build `2b86543`, docs 1748, chunks/vectors 33253, interactions 950, active memories 290, candidates 0, project_state total 128. diff --git a/docs/current-state.md b/docs/current-state.md index 00eea48..14dd753 100644 --- a/docs/current-state.md +++ b/docs/current-state.md @@ -2,7 +2,7 @@ Update 2026-04-24: audit-improvements deployed as `f44a211`; live harness is 19/20 with 0 blocking failures and 1 known content gap. Active follow-up branch -`codex/project-id-metadata-retrieval` is at 565 passing tests. +`codex/project-id-metadata-retrieval` is at 567 passing tests. Live deploy: `2b86543` · Dalidou health: ok · Harness: 18/20 with 1 known content gap and 1 current blocking project-bleed guard · Tests: 553 passing. diff --git a/docs/master-plan-status.md b/docs/master-plan-status.md index eefbe39..6a7202b 100644 --- a/docs/master-plan-status.md +++ b/docs/master-plan-status.md @@ -152,7 +152,7 @@ deferred from the shared client until their workflows are exercised. - query-relevance memory ranking with overlap-density scoring - retrieval eval harness: 20 fixtures; current live has 19 pass, 1 known content gap, and 0 blocking failures after the audit-improvements deploy -- 565 tests passing on the active `codex/project-id-metadata-retrieval` branch +- 567 tests passing on the active `codex/project-id-metadata-retrieval` branch - nightly pipeline: backup → cleanup → rsync → OpenClaw import → vault refresh → extract → triage → **auto-promote/expire** → weekly synth/lint → **retrieval harness** → **pipeline summary to project state** - Phase 10 operational: reinforcement-based auto-promotion (ref_count ≥ 3, confidence ≥ 0.7) + stale candidate expiry (14 days unreinforced) - pipeline health visible in dashboard: interaction totals by client, pipeline last_run, harness results, triage stats diff --git a/src/atocore/ingestion/pipeline.py b/src/atocore/ingestion/pipeline.py index ce7ad2e..8aba226 100644 --- a/src/atocore/ingestion/pipeline.py +++ b/src/atocore/ingestion/pipeline.py @@ -42,7 +42,12 @@ def ingest_file(file_path: Path, project_id: str = "") -> dict: from atocore.projects.registry import derive_project_id_for_path project_id = derive_project_id_for_path(file_path) - except Exception: + except Exception as exc: + log.warning( + "project_id_derivation_failed", + file_path=str(file_path), + error=str(exc), + ) project_id = "" if not file_path.exists(): diff --git a/src/atocore/retrieval/retriever.py b/src/atocore/retrieval/retriever.py index 685fbb0..79cda1b 100644 --- a/src/atocore/retrieval/retriever.py +++ b/src/atocore/retrieval/retriever.py @@ -84,7 +84,15 @@ def retrieve( """Retrieve the most relevant chunks for a query.""" top_k = top_k or _config.settings.context_top_k start = time.time() - scoped_project = get_registered_project(project_hint) if project_hint else None + try: + scoped_project = get_registered_project(project_hint) if project_hint else None + except Exception as exc: + log.warning( + "project_scope_resolution_failed", + project_hint=project_hint, + error=str(exc), + ) + scoped_project = None scope_filter_enabled = bool(scoped_project and _config.settings.rank_project_scope_filter) registered_projects = None query_top_k = top_k @@ -292,7 +300,15 @@ def _project_match_boost(project_hint: str, metadata: dict) -> float: if not hint_lower: return 1.0 - project = get_registered_project(project_hint) + try: + project = get_registered_project(project_hint) + except Exception as exc: + log.warning( + "project_match_boost_resolution_failed", + project_hint=project_hint, + error=str(exc), + ) + project = None candidate_names = _project_scope_terms(project) if project is not None else {hint_lower} for candidate in candidate_names: if _metadata_has_term(metadata, candidate): diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index 9ab6277..1026995 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -163,6 +163,45 @@ def test_ingest_file_derives_project_id_from_registry_root(tmp_data_dir, tmp_pat assert all(meta["project_id"] == "p04-gigabit" for meta in fake_store.metadatas) +def test_ingest_file_logs_and_fails_open_when_project_derivation_fails( + tmp_data_dir, + sample_markdown, + monkeypatch, +): + """A broken registry should be visible but should not block ingestion.""" + init_db() + warnings = [] + + class FakeVectorStore: + def __init__(self): + self.metadatas = [] + + def add(self, ids, documents, metadatas): + self.metadatas.extend(metadatas) + + def delete(self, ids): + return None + + fake_store = FakeVectorStore() + monkeypatch.setattr("atocore.ingestion.pipeline.get_vector_store", lambda: fake_store) + monkeypatch.setattr( + "atocore.projects.registry.derive_project_id_for_path", + lambda path: (_ for _ in ()).throw(ValueError("registry broken")), + ) + monkeypatch.setattr( + "atocore.ingestion.pipeline.log.warning", + lambda event, **kwargs: warnings.append((event, kwargs)), + ) + + result = ingest_file(sample_markdown) + + assert result["status"] == "ingested" + assert fake_store.metadatas + assert all(meta["project_id"] == "" for meta in fake_store.metadatas) + assert warnings[0][0] == "project_id_derivation_failed" + assert "registry broken" in warnings[0][1]["error"] + + def test_ingest_project_folder_passes_project_id_to_files(tmp_data_dir, sample_folder, monkeypatch): seen = [] diff --git a/tests/test_retrieval.py b/tests/test_retrieval.py index 5349b16..f108caa 100644 --- a/tests/test_retrieval.py +++ b/tests/test_retrieval.py @@ -566,6 +566,59 @@ def test_retrieve_unknown_project_hint_does_not_widen_or_filter(monkeypatch): assert [r.chunk_id for r in results] == ["chunk-a", "chunk-b"] +def test_retrieve_fails_open_when_project_scope_resolution_fails(monkeypatch): + warnings = [] + + class FakeStore: + def query(self, query_embedding, top_k=10, where=None): + assert top_k == 2 + return { + "ids": [["chunk-a", "chunk-b"]], + "documents": [["doc a", "doc b"]], + "metadatas": [[ + { + "heading_path": "Overview", + "source_file": "p04-gigabit/file.md", + "tags": "[]", + "title": "A", + "document_id": "doc-a", + }, + { + "heading_path": "Overview", + "source_file": "p05-interferometer/file.md", + "tags": "[]", + "title": "B", + "document_id": "doc-b", + }, + ]], + "distances": [[0.2, 0.21]], + } + + monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore()) + monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1]) + monkeypatch.setattr( + "atocore.retrieval.retriever._existing_chunk_ids", + lambda chunk_ids: set(chunk_ids), + ) + monkeypatch.setattr( + "atocore.retrieval.retriever.get_registered_project", + lambda project_name: (_ for _ in ()).throw(ValueError("registry overlap")), + ) + monkeypatch.setattr( + "atocore.retrieval.retriever.log.warning", + lambda event, **kwargs: warnings.append((event, kwargs)), + ) + + results = retrieve("overview", top_k=2, project_hint="p04") + + assert [r.chunk_id for r in results] == ["chunk-a", "chunk-b"] + assert {warning[0] for warning in warnings} == { + "project_scope_resolution_failed", + "project_match_boost_resolution_failed", + } + assert all("registry overlap" in warning[1]["error"] for warning in warnings) + + def test_retrieve_downranks_archive_noise_and_prefers_high_signal_paths(monkeypatch): class FakeStore: def query(self, query_embedding, top_k=10, where=None):