feat(retrieval): persist explicit chunk project ids

2026-04-24 11:02:30 -04:00
parent f44a211497
commit c03022d864
12 changed files with 332 additions and 24 deletions
--- a/tests/test_ingestion.py
+++ b/tests/test_ingestion.py
@@ -1,8 +1,10 @@
 """Tests for the ingestion pipeline."""

+import json
+
 from atocore.ingestion.parser import parse_markdown
 from atocore.models.database import get_connection, init_db
-from atocore.ingestion.pipeline import ingest_file, ingest_folder
+from atocore.ingestion.pipeline import ingest_file, ingest_folder, ingest_project_folder


 def test_parse_markdown(sample_markdown):
@@ -69,6 +71,54 @@ def test_ingest_updates_changed(tmp_data_dir, sample_markdown):
    assert result["status"] == "ingested"


+def test_ingest_file_records_project_id_metadata(tmp_data_dir, sample_markdown, monkeypatch):
+    """Project-aware ingestion should tag DB and vector metadata exactly.
+
+    Ingests one file with an explicit ``project_id`` and checks that the same
+    id appears in every metadata dict passed to the vector store's ``add`` and
+    in every JSON ``metadata`` value read back from ``source_chunks``.
+    """
+    init_db()
+
+    class FakeVectorStore:
+        # In-memory stand-in: records the metadata given to add() so the test
+        # can inspect it; delete() is a no-op.
+        def __init__(self):
+            self.metadatas = []
+
+        def add(self, ids, documents, metadatas):
+            self.metadatas.extend(metadatas)
+
+        def delete(self, ids):
+            return None
+
+    fake_store = FakeVectorStore()
+    # Make the pipeline's get_vector_store() hand back the recording fake.
+    monkeypatch.setattr("atocore.ingestion.pipeline.get_vector_store", lambda: fake_store)
+
+    result = ingest_file(sample_markdown, project_id="p04-gigabit")
+
+    assert result["status"] == "ingested"
+    # Non-empty check first: all() over an empty list would pass vacuously.
+    assert fake_store.metadatas
+    assert all(meta["project_id"] == "p04-gigabit" for meta in fake_store.metadatas)
+
+    with get_connection() as conn:
+        rows = conn.execute("SELECT metadata FROM source_chunks").fetchall()
+    # Same vacuous-truth guard for the database side.
+    assert rows
+    assert all(
+        json.loads(row["metadata"])["project_id"] == "p04-gigabit"
+        for row in rows
+    )
+
+
+def test_ingest_project_folder_passes_project_id_to_files(tmp_data_dir, sample_folder, monkeypatch):
+    """ingest_project_folder should forward its project_id to each ingest_file call.
+
+    ``ingest_file`` is replaced with a recorder, so no real ingestion happens;
+    the test only inspects the arguments each call received.
+    """
+    seen = []
+
+    def fake_ingest_file(path, project_id=""):
+        # Record the file name and the project_id this call was given.
+        seen.append((path.name, project_id))
+        return {"file": str(path), "status": "ingested"}
+
+    monkeypatch.setattr("atocore.ingestion.pipeline.ingest_file", fake_ingest_file)
+    # Stub the purge helper so it does nothing and reports 0.
+    monkeypatch.setattr("atocore.ingestion.pipeline._purge_deleted_files", lambda *args, **kwargs: 0)
+
+    ingest_project_folder(sample_folder, project_id="p05-interferometer")
+
+    # Non-empty check first: the set comparison below would otherwise compare
+    # against an empty set and fail with a less helpful message.
+    assert seen
+    # Every recorded call must carry exactly the requested project id.
+    assert {project_id for _, project_id in seen} == {"p05-interferometer"}
+
+
 def test_parse_markdown_uses_supplied_text(sample_markdown):
    """Parsing should be able to reuse pre-read content from ingestion."""
    latin_text = """---\ntags: parser\n---\n# Parser Title\n\nBody text."""
--- a/tests/test_project_registry.py
+++ b/tests/test_project_registry.py
@@ -133,8 +133,8 @@ def test_refresh_registered_project_ingests_registered_roots(tmp_path, monkeypat

    calls = []

-    def fake_ingest_folder(path, purge_deleted=True):
-        calls.append((str(path), purge_deleted))
+    def fake_ingest_folder(path, purge_deleted=True, project_id=""):
+        calls.append((str(path), purge_deleted, project_id))
        return [{"file": str(path / "README.md"), "status": "ingested"}]

    monkeypatch.setenv("ATOCORE_VAULT_SOURCE_DIR", str(vault_dir))
@@ -144,7 +144,7 @@ def test_refresh_registered_project_ingests_registered_roots(tmp_path, monkeypat
    original_settings = config.settings
    try:
        config.settings = config.Settings()
-        monkeypatch.setattr("atocore.projects.registry.ingest_folder", fake_ingest_folder)
+        monkeypatch.setattr("atocore.projects.registry.ingest_project_folder", fake_ingest_folder)
        result = refresh_registered_project("polisher")
    finally:
        config.settings = original_settings
@@ -153,6 +153,7 @@ def test_refresh_registered_project_ingests_registered_roots(tmp_path, monkeypat
    assert len(calls) == 1
    assert calls[0][0].endswith("p06-polisher")
    assert calls[0][1] is False
+    assert calls[0][2] == "p06-polisher"
    assert result["roots"][0]["status"] == "ingested"
    assert result["status"] == "ingested"
    assert result["roots_ingested"] == 1
@@ -188,7 +189,7 @@ def test_refresh_registered_project_reports_nothing_to_ingest_when_all_missing(
        encoding="utf-8",
    )

-    def fail_ingest_folder(path, purge_deleted=True):
+    def fail_ingest_folder(path, purge_deleted=True, project_id=""):
        raise AssertionError(f"ingest_folder should not be called for missing root: {path}")

    monkeypatch.setenv("ATOCORE_VAULT_SOURCE_DIR", str(vault_dir))
@@ -198,7 +199,7 @@ def test_refresh_registered_project_reports_nothing_to_ingest_when_all_missing(
    original_settings = config.settings
    try:
        config.settings = config.Settings()
-        monkeypatch.setattr("atocore.projects.registry.ingest_folder", fail_ingest_folder)
+        monkeypatch.setattr("atocore.projects.registry.ingest_project_folder", fail_ingest_folder)
        result = refresh_registered_project("ghost")
    finally:
        config.settings = original_settings
@@ -238,7 +239,7 @@ def test_refresh_registered_project_reports_partial_status(tmp_path, monkeypatch
        encoding="utf-8",
    )

-    def fake_ingest_folder(path, purge_deleted=True):
+    def fake_ingest_folder(path, purge_deleted=True, project_id=""):
        return [{"file": str(path / "README.md"), "status": "ingested"}]

    monkeypatch.setenv("ATOCORE_VAULT_SOURCE_DIR", str(vault_dir))
@@ -248,7 +249,7 @@ def test_refresh_registered_project_reports_partial_status(tmp_path, monkeypatch
    original_settings = config.settings
    try:
        config.settings = config.Settings()
-        monkeypatch.setattr("atocore.projects.registry.ingest_folder", fake_ingest_folder)
+        monkeypatch.setattr("atocore.projects.registry.ingest_project_folder", fake_ingest_folder)
        result = refresh_registered_project("mixed")
    finally:
        config.settings = original_settings
--- a/tests/test_retrieval.py
+++ b/tests/test_retrieval.py
@@ -384,6 +384,80 @@ def test_retrieve_project_scope_uses_path_segments_not_substrings(monkeypatch):
    assert [r.chunk_id for r in results] == ["chunk-target", "chunk-global"]


+def test_retrieve_project_scope_prefers_exact_project_id(monkeypatch):
+    """An explicit ``project_id`` in chunk metadata should decide project scope.
+
+    The fake store returns three candidates: one tagged with the target
+    project, one whose path and tags mention the target but whose
+    ``project_id`` names another project, and one with an empty
+    ``project_id``. With hint ``"p04"`` only the first and third are expected.
+    """
+    # Ad-hoc stand-ins exposing only project_id, aliases and ingest_roots.
+    target_project = type(
+        "Project",
+        (),
+        {
+            "project_id": "p04-gigabit",
+            "aliases": ("p04", "gigabit"),
+            "ingest_roots": (),
+        },
+    )()
+    other_project = type(
+        "Project",
+        (),
+        {
+            "project_id": "p06-polisher",
+            "aliases": ("p06", "polisher"),
+            "ingest_roots": (),
+        },
+    )()
+
+    class FakeStore:
+        # Returns the same three canned candidates for any query.
+        def query(self, query_embedding, top_k=10, where=None):
+            return {
+                "ids": [["chunk-target", "chunk-other", "chunk-global"]],
+                "documents": [["target doc", "other doc", "global doc"]],
+                "metadatas": [[
+                    # Target-owned by explicit project_id, even though the
+                    # source path gives no hint of the project.
+                    {
+                        "heading_path": "Overview",
+                        "source_file": "legacy/unhelpful-path.md",
+                        "tags": "[]",
+                        "title": "Target",
+                        "project_id": "p04-gigabit",
+                        "document_id": "doc-a",
+                    },
+                    # Path and tags mention p04-gigabit, but the explicit
+                    # project_id is p06-polisher; it also has the smallest
+                    # distance of the three.
+                    {
+                        "heading_path": "Overview",
+                        "source_file": "p04-gigabit/title-poisoned.md",
+                        "tags": '["p04-gigabit"]',
+                        "title": "Looks target-owned but is explicit p06",
+                        "project_id": "p06-polisher",
+                        "document_id": "doc-b",
+                    },
+                    # Empty project_id: expected to stay in the results.
+                    {
+                        "heading_path": "Overview",
+                        "source_file": "shared/global.md",
+                        "tags": "[]",
+                        "title": "Shared",
+                        "project_id": "",
+                        "document_id": "doc-global",
+                    },
+                ]],
+                "distances": [[0.2, 0.19, 0.21]],
+            }
+
+    monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore())
+    monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1])
+    # Treat every candidate chunk id as existing.
+    monkeypatch.setattr(
+        "atocore.retrieval.retriever._existing_chunk_ids",
+        lambda chunk_ids: set(chunk_ids),
+    )
+    # Any project lookup resolves to the target project.
+    monkeypatch.setattr(
+        "atocore.retrieval.retriever.get_registered_project",
+        lambda project_name: target_project,
+    )
+    monkeypatch.setattr(
+        "atocore.retrieval.retriever.load_project_registry",
+        lambda: [target_project, other_project],
+    )
+
+    results = retrieve("mirror architecture", top_k=3, project_hint="p04")
+
+    # chunk-other must be excluded despite its target-looking path and tags.
+    assert [r.chunk_id for r in results] == ["chunk-target", "chunk-global"]
+
+
 def test_retrieve_unknown_project_hint_does_not_widen_or_filter(monkeypatch):
    class FakeStore:
        def query(self, query_embedding, top_k=10, where=None):