feat(retrieval): persist explicit chunk project ids

This commit is contained in:
2026-04-24 11:02:30 -04:00
parent f44a211497
commit c03022d864
12 changed files with 332 additions and 24 deletions

View File

@@ -384,6 +384,80 @@ def test_retrieve_project_scope_uses_path_segments_not_substrings(monkeypatch):
assert [r.chunk_id for r in results] == ["chunk-target", "chunk-global"]
def test_retrieve_project_scope_prefers_exact_project_id(monkeypatch):
    """Check that a chunk's explicit ``project_id`` decides project scoping.

    The fake store returns three chunks for a query hinted at ``p04``:

    * ``chunk-target`` has an unhelpful path but ``project_id`` ``p04-gigabit``.
    * ``chunk-other`` has a path and tag that mention ``p04-gigabit`` but
      an explicit ``project_id`` of ``p06-polisher``.
    * ``chunk-global`` has an empty ``project_id``.

    Only ``chunk-target`` and ``chunk-global`` are expected back, in that order.
    """

    def make_project(project_id, aliases):
        # Anonymous stand-in exposing the three attributes set on each project.
        return type(
            "Project",
            (),
            {
                "project_id": project_id,
                "aliases": aliases,
                "ingest_roots": (),
            },
        )()

    target_project = make_project("p04-gigabit", ("p04", "gigabit"))
    other_project = make_project("p06-polisher", ("p06", "polisher"))

    class FakeStore:
        def query(self, query_embedding, top_k=10, where=None):
            # One row per chunk: (id, document, metadata, distance).
            # chunk-other carries the smallest distance of the three.
            rows = [
                (
                    "chunk-target",
                    "target doc",
                    {
                        "heading_path": "Overview",
                        "source_file": "legacy/unhelpful-path.md",
                        "tags": "[]",
                        "title": "Target",
                        "project_id": "p04-gigabit",
                        "document_id": "doc-a",
                    },
                    0.2,
                ),
                (
                    "chunk-other",
                    "other doc",
                    {
                        "heading_path": "Overview",
                        "source_file": "p04-gigabit/title-poisoned.md",
                        "tags": '["p04-gigabit"]',
                        "title": "Looks target-owned but is explicit p06",
                        "project_id": "p06-polisher",
                        "document_id": "doc-b",
                    },
                    0.19,
                ),
                (
                    "chunk-global",
                    "global doc",
                    {
                        "heading_path": "Overview",
                        "source_file": "shared/global.md",
                        "tags": "[]",
                        "title": "Shared",
                        "project_id": "",
                        "document_id": "doc-global",
                    },
                    0.21,
                ),
            ]
            # Transpose the rows into the column-oriented, singly-nested
            # layout this fake returns for a single query.
            ids, documents, metadatas, distances = (
                list(column) for column in zip(*rows)
            )
            return {
                "ids": [ids],
                "documents": [documents],
                "metadatas": [metadatas],
                "distances": [distances],
            }

    # Replace every retriever collaborator with an in-memory stand-in.
    patches = (
        ("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore()),
        ("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1]),
        (
            "atocore.retrieval.retriever._existing_chunk_ids",
            lambda chunk_ids: set(chunk_ids),
        ),
        (
            "atocore.retrieval.retriever.get_registered_project",
            lambda project_name: target_project,
        ),
        (
            "atocore.retrieval.retriever.load_project_registry",
            lambda: [target_project, other_project],
        ),
    )
    for target, replacement in patches:
        monkeypatch.setattr(target, replacement)

    results = retrieve("mirror architecture", top_k=3, project_hint="p04")

    returned_ids = [hit.chunk_id for hit in results]
    assert returned_ids == ["chunk-target", "chunk-global"]
def test_retrieve_unknown_project_hint_does_not_widen_or_filter(monkeypatch):
class FakeStore:
def query(self, query_embedding, top_k=10, where=None):