"""Tests for the retrieval system.""" from atocore.ingestion.pipeline import ingest_file from atocore.models.database import get_connection, init_db from atocore.retrieval.retriever import retrieve from atocore.retrieval.vector_store import get_vector_store def test_retrieve_returns_results(tmp_data_dir, sample_markdown): """Test that retrieval returns relevant chunks.""" init_db() ingest_file(sample_markdown) results = retrieve("What are the memory types?", top_k=5) assert len(results) > 0 assert all(r.score > 0 for r in results) assert all(r.content for r in results) def test_retrieve_scores_ranked(tmp_data_dir, sample_markdown): """Test that results are ranked by score.""" init_db() ingest_file(sample_markdown) results = retrieve("architecture layers", top_k=5) if len(results) >= 2: scores = [r.score for r in results] assert scores == sorted(scores, reverse=True) def test_vector_store_count(tmp_data_dir, sample_markdown): """Test that vector store tracks chunk count.""" init_db() # Reset singleton for clean test import atocore.retrieval.vector_store as vs vs._store = None ingest_file(sample_markdown) store = get_vector_store() assert store.count > 0 def test_retrieve_skips_stale_vector_entries(tmp_data_dir, sample_markdown, monkeypatch): """Retriever should ignore vector hits whose chunk rows no longer exist.""" init_db() ingest_file(sample_markdown) with get_connection() as conn: chunk_ids = [row["id"] for row in conn.execute("SELECT id FROM source_chunks").fetchall()] class FakeStore: def query(self, query_embedding, top_k=10, where=None): return { "ids": [[chunk_ids[0], "missing-chunk"]], "documents": [["valid doc", "stale doc"]], "metadatas": [[ {"heading_path": "Overview", "source_file": "valid.md", "tags": "[]", "title": "Valid", "document_id": "doc-1"}, {"heading_path": "Ghost", "source_file": "ghost.md", "tags": "[]", "title": "Ghost", "document_id": "doc-2"}, ]], "distances": [[0.1, 0.2]], } monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore()) monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1]) results = retrieve("overview", top_k=2) assert len(results) == 1 assert results[0].chunk_id == chunk_ids[0] def test_retrieve_project_hint_boosts_matching_chunks(monkeypatch): target_project = type( "Project", (), { "project_id": "p04-gigabit", "aliases": ("p04", "gigabit"), "ingest_roots": (), }, )() other_project = type( "Project", (), { "project_id": "p05-interferometer", "aliases": ("p05", "interferometer"), "ingest_roots": (), }, )() class FakeStore: def query(self, query_embedding, top_k=10, where=None): assert top_k == 8 return { "ids": [["chunk-a", "chunk-b"]], "documents": [["project doc", "other doc"]], "metadatas": [[ { "heading_path": "Overview", "source_file": "p04-gigabit/pkm/_index.md", "tags": '["p04-gigabit"]', "title": "P04", "document_id": "doc-a", }, { "heading_path": "Overview", "source_file": "p05-interferometer/pkm/_index.md", "tags": '["p05-interferometer"]', "title": "P05", "document_id": "doc-b", }, ]], "distances": [[0.3, 0.25]], } monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore()) monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1]) monkeypatch.setattr( "atocore.retrieval.retriever._existing_chunk_ids", lambda chunk_ids: set(chunk_ids), ) monkeypatch.setattr( "atocore.retrieval.retriever.get_registered_project", lambda project_name: target_project, ) monkeypatch.setattr( "atocore.retrieval.retriever.load_project_registry", lambda: [target_project, other_project], ) results = retrieve("mirror architecture", top_k=2, project_hint="p04") assert len(results) == 1 assert results[0].chunk_id == "chunk-a" def test_retrieve_project_scope_allows_unowned_global_chunks(monkeypatch): target_project = type( "Project", (), { "project_id": "p04-gigabit", "aliases": ("p04", "gigabit"), "ingest_roots": (), }, )() class FakeStore: def query(self, query_embedding, top_k=10, where=None): return { "ids": [["chunk-a", "chunk-global"]], "documents": [["project doc", "global doc"]], "metadatas": [[ { "heading_path": "Overview", "source_file": "p04-gigabit/pkm/_index.md", "tags": '["p04-gigabit"]', "title": "P04", "document_id": "doc-a", }, { "heading_path": "Overview", "source_file": "shared/engineering-rules.md", "tags": "[]", "title": "Shared engineering rules", "document_id": "doc-global", }, ]], "distances": [[0.2, 0.21]], } monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore()) monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1]) monkeypatch.setattr( "atocore.retrieval.retriever._existing_chunk_ids", lambda chunk_ids: set(chunk_ids), ) monkeypatch.setattr( "atocore.retrieval.retriever.get_registered_project", lambda project_name: target_project, ) monkeypatch.setattr( "atocore.retrieval.retriever.load_project_registry", lambda: [target_project], ) results = retrieve("mirror architecture", top_k=2, project_hint="p04") assert [r.chunk_id for r in results] == ["chunk-a", "chunk-global"] def test_retrieve_project_scope_filter_can_be_disabled(monkeypatch): target_project = type( "Project", (), { "project_id": "p04-gigabit", "aliases": ("p04", "gigabit"), "ingest_roots": (), }, )() other_project = type( "Project", (), { "project_id": "p05-interferometer", "aliases": ("p05", "interferometer"), "ingest_roots": (), }, )() class FakeStore: def query(self, query_embedding, top_k=10, where=None): assert top_k == 2 return { "ids": [["chunk-a", "chunk-b"]], "documents": [["project doc", "other project doc"]], "metadatas": [[ { "heading_path": "Overview", "source_file": "p04-gigabit/pkm/_index.md", "tags": '["p04-gigabit"]', "title": "P04", "document_id": "doc-a", }, { "heading_path": "Overview", "source_file": "p05-interferometer/pkm/_index.md", "tags": '["p05-interferometer"]', "title": "P05", "document_id": "doc-b", }, ]], "distances": [[0.2, 0.2]], } monkeypatch.setattr("atocore.config.settings.rank_project_scope_filter", False) monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore()) monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1]) monkeypatch.setattr( "atocore.retrieval.retriever._existing_chunk_ids", lambda chunk_ids: set(chunk_ids), ) monkeypatch.setattr( "atocore.retrieval.retriever.get_registered_project", lambda project_name: target_project, ) monkeypatch.setattr( "atocore.retrieval.retriever.load_project_registry", lambda: [target_project, other_project], ) results = retrieve("mirror architecture", top_k=2, project_hint="p04") assert {r.chunk_id for r in results} == {"chunk-a", "chunk-b"} def test_retrieve_project_scope_ignores_title_for_ownership(monkeypatch): target_project = type( "Project", (), { "project_id": "p04-gigabit", "aliases": ("p04", "gigabit"), "ingest_roots": (), }, )() other_project = type( "Project", (), { "project_id": "p06-polisher", "aliases": ("p06", "polisher", "p11"), "ingest_roots": (), }, )() class FakeStore: def query(self, query_embedding, top_k=10, where=None): return { "ids": [["chunk-target", "chunk-poisoned-title"]], "documents": [["p04 doc", "p06 doc"]], "metadatas": [[ { "heading_path": "Overview", "source_file": "p04-gigabit/pkm/_index.md", "tags": '["p04-gigabit"]', "title": "P04", "document_id": "doc-a", }, { "heading_path": "Overview", "source_file": "p06-polisher/pkm/architecture.md", "tags": '["p06-polisher"]', "title": "GigaBIT M1 mirror lessons", "document_id": "doc-b", }, ]], "distances": [[0.2, 0.19]], } monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore()) monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1]) monkeypatch.setattr( "atocore.retrieval.retriever._existing_chunk_ids", lambda chunk_ids: set(chunk_ids), ) monkeypatch.setattr( "atocore.retrieval.retriever.get_registered_project", lambda project_name: target_project, ) monkeypatch.setattr( "atocore.retrieval.retriever.load_project_registry", lambda: [target_project, other_project], ) results = retrieve("mirror architecture", top_k=2, project_hint="p04") assert [r.chunk_id for r in results] == ["chunk-target"] def test_retrieve_project_scope_uses_path_segments_not_substrings(monkeypatch): target_project = type( "Project", (), { "project_id": "p05-interferometer", "aliases": ("p05", "interferometer"), "ingest_roots": (), }, )() abb_project = type( "Project", (), { "project_id": "abb-space", "aliases": ("abb",), "ingest_roots": (), }, )() class FakeStore: def query(self, query_embedding, top_k=10, where=None): return { "ids": [["chunk-target", "chunk-global"]], "documents": [["p05 doc", "global doc"]], "metadatas": [[ { "heading_path": "Overview", "source_file": "p05-interferometer/pkm/_index.md", "tags": '["p05-interferometer"]', "title": "P05", "document_id": "doc-a", }, { "heading_path": "Abbreviation notes", "source_file": "shared/cabbage-abbreviations.md", "tags": "[]", "title": "ABB-style abbreviations", "document_id": "doc-global", }, ]], "distances": [[0.2, 0.21]], } monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore()) monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1]) monkeypatch.setattr( "atocore.retrieval.retriever._existing_chunk_ids", lambda chunk_ids: set(chunk_ids), ) monkeypatch.setattr( "atocore.retrieval.retriever.get_registered_project", lambda project_name: target_project, ) monkeypatch.setattr( "atocore.retrieval.retriever.load_project_registry", lambda: [target_project, abb_project], ) results = retrieve("abbreviations", top_k=2, project_hint="p05") assert [r.chunk_id for r in results] == ["chunk-target", "chunk-global"] def test_retrieve_project_scope_prefers_exact_project_id(monkeypatch): target_project = type( "Project", (), { "project_id": "p04-gigabit", "aliases": ("p04", "gigabit"), "ingest_roots": (), }, )() other_project = type( "Project", (), { "project_id": "p06-polisher", "aliases": ("p06", "polisher"), "ingest_roots": (), }, )() class FakeStore: def query(self, query_embedding, top_k=10, where=None): return { "ids": [["chunk-target", "chunk-other", "chunk-global"]], "documents": [["target doc", "other doc", "global doc"]], "metadatas": [[ { "heading_path": "Overview", "source_file": "legacy/unhelpful-path.md", "tags": "[]", "title": "Target", "project_id": "p04-gigabit", "document_id": "doc-a", }, { "heading_path": "Overview", "source_file": "p04-gigabit/title-poisoned.md", "tags": '["p04-gigabit"]', "title": "Looks target-owned but is explicit p06", "project_id": "p06-polisher", "document_id": "doc-b", }, { "heading_path": "Overview", "source_file": "shared/global.md", "tags": "[]", "title": "Shared", "project_id": "", "document_id": "doc-global", }, ]], "distances": [[0.2, 0.19, 0.21]], } monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore()) monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1]) monkeypatch.setattr( "atocore.retrieval.retriever._existing_chunk_ids", lambda chunk_ids: set(chunk_ids), ) monkeypatch.setattr( "atocore.retrieval.retriever.get_registered_project", lambda project_name: target_project, ) monkeypatch.setattr( "atocore.retrieval.retriever.load_project_registry", lambda: [target_project, other_project], ) results = retrieve("mirror architecture", top_k=3, project_hint="p04") assert [r.chunk_id for r in results] == ["chunk-target", "chunk-global"] def test_retrieve_empty_project_id_falls_back_to_path_ownership(monkeypatch): target_project = type( "Project", (), { "project_id": "p04-gigabit", "aliases": ("p04", "gigabit"), "ingest_roots": (), }, )() other_project = type( "Project", (), { "project_id": "p05-interferometer", "aliases": ("p05", "interferometer"), "ingest_roots": (), }, )() class FakeStore: def query(self, query_embedding, top_k=10, where=None): return { "ids": [["chunk-target", "chunk-other"]], "documents": [["target doc", "other doc"]], "metadatas": [[ { "heading_path": "Overview", "source_file": "p04-gigabit/status.md", "tags": "[]", "title": "Target", "project_id": "", "document_id": "doc-a", }, { "heading_path": "Overview", "source_file": "p05-interferometer/status.md", "tags": "[]", "title": "Other", "project_id": "", "document_id": "doc-b", }, ]], "distances": [[0.2, 0.19]], } monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore()) monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1]) monkeypatch.setattr( "atocore.retrieval.retriever._existing_chunk_ids", lambda chunk_ids: set(chunk_ids), ) monkeypatch.setattr( "atocore.retrieval.retriever.get_registered_project", lambda project_name: target_project, ) monkeypatch.setattr( "atocore.retrieval.retriever.load_project_registry", lambda: [target_project, other_project], ) results = retrieve("mirror architecture", top_k=2, project_hint="p04") assert [r.chunk_id for r in results] == ["chunk-target"] def test_retrieve_unknown_project_hint_does_not_widen_or_filter(monkeypatch): class FakeStore: def query(self, query_embedding, top_k=10, where=None): assert top_k == 2 return { "ids": [["chunk-a", "chunk-b"]], "documents": [["doc a", "doc b"]], "metadatas": [[ { "heading_path": "Overview", "source_file": "project-a/file.md", "tags": "[]", "title": "A", "document_id": "doc-a", }, { "heading_path": "Overview", "source_file": "project-b/file.md", "tags": "[]", "title": "B", "document_id": "doc-b", }, ]], "distances": [[0.2, 0.21]], } monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore()) monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1]) monkeypatch.setattr( "atocore.retrieval.retriever._existing_chunk_ids", lambda chunk_ids: set(chunk_ids), ) monkeypatch.setattr( "atocore.retrieval.retriever.get_registered_project", lambda project_name: None, ) results = retrieve("overview", top_k=2, project_hint="unknown-project") assert [r.chunk_id for r in results] == ["chunk-a", "chunk-b"] def test_retrieve_fails_open_when_project_scope_resolution_fails(monkeypatch): warnings = [] class FakeStore: def query(self, query_embedding, top_k=10, where=None): assert top_k == 2 return { "ids": [["chunk-a", "chunk-b"]], "documents": [["doc a", "doc b"]], "metadatas": [[ { "heading_path": "Overview", "source_file": "p04-gigabit/file.md", "tags": "[]", "title": "A", "document_id": "doc-a", }, { "heading_path": "Overview", "source_file": "p05-interferometer/file.md", "tags": "[]", "title": "B", "document_id": "doc-b", }, ]], "distances": [[0.2, 0.21]], } monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore()) monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1]) monkeypatch.setattr( "atocore.retrieval.retriever._existing_chunk_ids", lambda chunk_ids: set(chunk_ids), ) monkeypatch.setattr( "atocore.retrieval.retriever.get_registered_project", lambda project_name: (_ for _ in ()).throw(ValueError("registry overlap")), ) monkeypatch.setattr( "atocore.retrieval.retriever.log.warning", lambda event, **kwargs: warnings.append((event, kwargs)), ) results = retrieve("overview", top_k=2, project_hint="p04") assert [r.chunk_id for r in results] == ["chunk-a", "chunk-b"] assert {warning[0] for warning in warnings} == { "project_scope_resolution_failed", "project_match_boost_resolution_failed", } assert all("registry overlap" in warning[1]["error"] for warning in warnings) def test_retrieve_downranks_archive_noise_and_prefers_high_signal_paths(monkeypatch): class FakeStore: def query(self, query_embedding, top_k=10, where=None): return { "ids": [["chunk-archive", "chunk-requirements"]], "documents": [["archive doc", "requirements doc"]], "metadatas": [[ { "heading_path": "History", "source_file": "p05-interferometer/pkm/_archive/old/Error-Budget.md", "tags": '["p05-interferometer"]', "title": "Old Error Budget", "document_id": "doc-a", }, { "heading_path": "Overview", "source_file": "p05-interferometer/pkm/Requirements/Error-Budget.md", "tags": '["p05-interferometer"]', "title": "Error Budget", "document_id": "doc-b", }, ]], "distances": [[0.2, 0.24]], } monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore()) monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1]) monkeypatch.setattr( "atocore.retrieval.retriever._existing_chunk_ids", lambda chunk_ids: set(chunk_ids), ) monkeypatch.setattr( "atocore.retrieval.retriever.get_registered_project", lambda project_name: type( "Project", (), { "project_id": "p05-interferometer", "aliases": ("p05", "interferometer"), "ingest_roots": (), }, )(), ) results = retrieve( "interferometer error budget vendor constraints", top_k=2, project_hint="p05-interferometer", ) assert len(results) == 2 assert results[0].chunk_id == "chunk-requirements" assert results[0].score > results[1].score