fix: pass project_hint into retrieve and add path-signal ranking
Two changes that belong together:
1. builder.build_context() now passes project_hint into retrieve(),
   so the project-aware boost actually fires for the retrieval pipeline
   driven by /context/build. Before this, only direct /query callers
   benefited from the registered-project boost.
2. retriever now applies two more ranking signals on every chunk:
   - _query_match_boost: boosts chunks whose source/title/heading
     echo high-signal query tokens (a stop list filters out generic
     words like "the", "project", "system")
   - _path_signal_boost: down-weights archival noise (_archive,
     _history, pre-cleanup, reviews) by 0.72 and up-weights current
     high-signal docs (status, decision, requirements, charter,
     system-map, error-budget, ...) by 1.18
Tests:
- test_context_builder_passes_project_hint_to_retrieval verifies
  the wiring fix
- test_retrieve_downranks_archive_noise_and_prefers_high_signal_paths
  verifies that the new ranking helpers prefer current docs over archived ones
This addresses the cross-project competition and archive bleed
called out in current-state.md after the Wave 1 ingestion.
This commit is contained in:
@@ -118,3 +118,58 @@ def test_retrieve_project_hint_boosts_matching_chunks(monkeypatch):
|
||||
assert len(results) == 2
|
||||
assert results[0].chunk_id == "chunk-a"
|
||||
assert results[0].score > results[1].score
|
||||
|
||||
|
||||
def test_retrieve_downranks_archive_noise_and_prefers_high_signal_paths(monkeypatch):
    """Check that a current requirements doc outranks its archived copy.

    The fake vector store lists the ``_archive`` chunk first and gives it the
    smaller distance (0.2 vs 0.24). The test asserts that ``retrieve`` still
    returns the chunk under ``Requirements/`` first, with a strictly higher
    score than the archived one.
    """

    class FakeStore:
        """Stand-in vector store that always returns the same two chunks."""

        def query(self, query_embedding, top_k=10, where=None):
            # Archived copy of the error budget, stored under ``_archive``.
            archived_meta = {
                "heading_path": "History",
                "source_file": "p05-interferometer/pkm/_archive/old/Error-Budget.md",
                "tags": '["p05-interferometer"]',
                "title": "Old Error Budget",
                "document_id": "doc-a",
            }
            # Current copy of the error budget, stored under ``Requirements``.
            current_meta = {
                "heading_path": "Overview",
                "source_file": "p05-interferometer/pkm/Requirements/Error-Budget.md",
                "tags": '["p05-interferometer"]',
                "title": "Error Budget",
                "document_id": "doc-b",
            }
            payload = {}
            payload["ids"] = [["chunk-archive", "chunk-requirements"]]
            payload["documents"] = [["archive doc", "requirements doc"]]
            payload["metadatas"] = [[archived_meta, current_meta]]
            payload["distances"] = [[0.2, 0.24]]
            return payload

    def fake_embed_query(query):
        # Fixed embedding; the fake store ignores it anyway.
        return [0.0, 0.1]

    def fake_existing_chunk_ids(chunk_ids):
        # Report every requested chunk id as existing.
        return set(chunk_ids)

    def fake_registered_project(project_name):
        # Build a throwaway project object exposing only the three
        # attributes set here, regardless of the requested name.
        project_attrs = {
            "project_id": "p05-interferometer",
            "aliases": ("p05", "interferometer"),
            "ingest_roots": (),
        }
        project_cls = type("Project", (), project_attrs)
        return project_cls()

    # Calling the class itself yields a fresh FakeStore, just like a factory.
    monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", FakeStore)
    monkeypatch.setattr("atocore.retrieval.retriever.embed_query", fake_embed_query)
    monkeypatch.setattr(
        "atocore.retrieval.retriever._existing_chunk_ids",
        fake_existing_chunk_ids,
    )
    monkeypatch.setattr(
        "atocore.retrieval.retriever.get_registered_project",
        fake_registered_project,
    )

    results = retrieve(
        "interferometer error budget vendor constraints",
        top_k=2,
        project_hint="p05-interferometer",
    )

    assert len(results) == 2
    top_hit, runner_up = results[0], results[1]
    assert top_hit.chunk_id == "chunk-requirements"
    assert top_hit.score > runner_up.score
|
||||
|
||||
Reference in New Issue
Block a user