fix(retrieval): enforce project-scoped context boundaries
This commit is contained in:
@@ -46,6 +46,8 @@ def test_settings_keep_legacy_db_path_when_present(tmp_path, monkeypatch):
|
||||
|
||||
def test_ranking_weights_are_tunable_via_env(monkeypatch):
|
||||
monkeypatch.setenv("ATOCORE_RANK_PROJECT_MATCH_BOOST", "3.5")
|
||||
monkeypatch.setenv("ATOCORE_RANK_PROJECT_SCOPE_FILTER", "false")
|
||||
monkeypatch.setenv("ATOCORE_RANK_PROJECT_SCOPE_CANDIDATE_MULTIPLIER", "6")
|
||||
monkeypatch.setenv("ATOCORE_RANK_QUERY_TOKEN_STEP", "0.12")
|
||||
monkeypatch.setenv("ATOCORE_RANK_QUERY_TOKEN_CAP", "1.5")
|
||||
monkeypatch.setenv("ATOCORE_RANK_PATH_HIGH_SIGNAL_BOOST", "1.25")
|
||||
@@ -54,6 +56,8 @@ def test_ranking_weights_are_tunable_via_env(monkeypatch):
|
||||
settings = config.Settings()
|
||||
|
||||
assert settings.rank_project_match_boost == 3.5
|
||||
assert settings.rank_project_scope_filter is False
|
||||
assert settings.rank_project_scope_candidate_multiplier == 6
|
||||
assert settings.rank_query_token_step == 0.12
|
||||
assert settings.rank_query_token_cap == 1.5
|
||||
assert settings.rank_path_high_signal_boost == 1.25
|
||||
|
||||
@@ -70,8 +70,28 @@ def test_retrieve_skips_stale_vector_entries(tmp_data_dir, sample_markdown, monk
|
||||
|
||||
|
||||
def test_retrieve_project_hint_boosts_matching_chunks(monkeypatch):
|
||||
target_project = type(
|
||||
"Project",
|
||||
(),
|
||||
{
|
||||
"project_id": "p04-gigabit",
|
||||
"aliases": ("p04", "gigabit"),
|
||||
"ingest_roots": (),
|
||||
},
|
||||
)()
|
||||
other_project = type(
|
||||
"Project",
|
||||
(),
|
||||
{
|
||||
"project_id": "p05-interferometer",
|
||||
"aliases": ("p05", "interferometer"),
|
||||
"ingest_roots": (),
|
||||
},
|
||||
)()
|
||||
|
||||
class FakeStore:
|
||||
def query(self, query_embedding, top_k=10, where=None):
|
||||
assert top_k == 8
|
||||
return {
|
||||
"ids": [["chunk-a", "chunk-b"]],
|
||||
"documents": [["project doc", "other doc"]],
|
||||
@@ -102,22 +122,308 @@ def test_retrieve_project_hint_boosts_matching_chunks(monkeypatch):
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"atocore.retrieval.retriever.get_registered_project",
|
||||
lambda project_name: type(
|
||||
"Project",
|
||||
(),
|
||||
{
|
||||
"project_id": "p04-gigabit",
|
||||
"aliases": ("p04", "gigabit"),
|
||||
"ingest_roots": (),
|
||||
},
|
||||
)(),
|
||||
lambda project_name: target_project,
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
"atocore.retrieval.retriever.load_project_registry",
|
||||
lambda: [target_project, other_project],
|
||||
)
|
||||
|
||||
results = retrieve("mirror architecture", top_k=2, project_hint="p04")
|
||||
|
||||
assert len(results) == 2
|
||||
assert len(results) == 1
|
||||
assert results[0].chunk_id == "chunk-a"
|
||||
assert results[0].score > results[1].score
|
||||
|
||||
|
||||
def test_retrieve_project_scope_allows_unowned_global_chunks(monkeypatch):
    """A chunk that no registered project owns survives project scoping.

    The fake store returns one chunk under the hinted project's path/tags
    and one under ``shared/`` with no tags; the registry only knows the
    hinted project. Both chunks are expected back, in store order.
    """
    project_attrs = {
        "project_id": "p04-gigabit",
        "aliases": ("p04", "gigabit"),
        "ingest_roots": (),
    }
    target_project = type("Project", (), project_attrs)()

    class FakeStore:
        """Vector-store stand-in that always returns the same two hits."""

        def query(self, query_embedding, top_k=10, where=None):
            # Built fresh on every call so callers never share a result dict.
            owned_meta = {
                "heading_path": "Overview",
                "source_file": "p04-gigabit/pkm/_index.md",
                "tags": '["p04-gigabit"]',
                "title": "P04",
                "document_id": "doc-a",
            }
            unowned_meta = {
                "heading_path": "Overview",
                "source_file": "shared/engineering-rules.md",
                "tags": "[]",
                "title": "Shared engineering rules",
                "document_id": "doc-global",
            }
            return {
                "ids": [["chunk-a", "chunk-global"]],
                "documents": [["project doc", "global doc"]],
                "metadatas": [[owned_meta, unowned_meta]],
                "distances": [[0.2, 0.21]],
            }

    # Applied in insertion order, matching the original sequence of patches.
    patches = {
        "atocore.retrieval.retriever.get_vector_store": lambda: FakeStore(),
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": (
            lambda chunk_ids: set(chunk_ids)
        ),
        "atocore.retrieval.retriever.get_registered_project": (
            lambda project_name: target_project
        ),
        "atocore.retrieval.retriever.load_project_registry": (
            lambda: [target_project]
        ),
    }
    for dotted_target, replacement in patches.items():
        monkeypatch.setattr(dotted_target, replacement)

    hits = retrieve("mirror architecture", top_k=2, project_hint="p04")

    returned_ids = [hit.chunk_id for hit in hits]
    assert returned_ids == ["chunk-a", "chunk-global"]
|
||||
|
||||
|
||||
def test_retrieve_project_scope_filter_can_be_disabled(monkeypatch):
    """With ``rank_project_scope_filter`` off, other-project chunks remain.

    The setting is patched to ``False``. The fake store asserts it is
    queried with ``top_k == 2`` (the value passed to ``retrieve``) and
    returns one chunk per project; both chunk ids are expected in the
    result, in any order.
    """

    def _make_project(project_id, aliases):
        # Anonymous "Project" class instance carrying only the listed attrs.
        return type(
            "Project",
            (),
            {
                "project_id": project_id,
                "aliases": aliases,
                "ingest_roots": (),
            },
        )()

    target_project = _make_project("p04-gigabit", ("p04", "gigabit"))
    other_project = _make_project("p05-interferometer", ("p05", "interferometer"))

    class FakeStore:
        """Vector-store stand-in returning one hit for each project."""

        def query(self, query_embedding, top_k=10, where=None):
            assert top_k == 2
            hinted_meta = {
                "heading_path": "Overview",
                "source_file": "p04-gigabit/pkm/_index.md",
                "tags": '["p04-gigabit"]',
                "title": "P04",
                "document_id": "doc-a",
            }
            foreign_meta = {
                "heading_path": "Overview",
                "source_file": "p05-interferometer/pkm/_index.md",
                "tags": '["p05-interferometer"]',
                "title": "P05",
                "document_id": "doc-b",
            }
            return {
                "ids": [["chunk-a", "chunk-b"]],
                "documents": [["project doc", "other project doc"]],
                "metadatas": [[hinted_meta, foreign_meta]],
                "distances": [[0.2, 0.2]],
            }

    # Applied in insertion order; the settings flag is patched first, as in
    # the original sequence of setattr calls.
    patches = {
        "atocore.config.settings.rank_project_scope_filter": False,
        "atocore.retrieval.retriever.get_vector_store": lambda: FakeStore(),
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": (
            lambda chunk_ids: set(chunk_ids)
        ),
        "atocore.retrieval.retriever.get_registered_project": (
            lambda project_name: target_project
        ),
        "atocore.retrieval.retriever.load_project_registry": (
            lambda: [target_project, other_project]
        ),
    }
    for dotted_target, replacement in patches.items():
        monkeypatch.setattr(dotted_target, replacement)

    hits = retrieve("mirror architecture", top_k=2, project_hint="p04")

    returned_ids = {hit.chunk_id for hit in hits}
    assert returned_ids == {"chunk-a", "chunk-b"}
|
||||
|
||||
|
||||
def test_retrieve_project_scope_ignores_title_for_ownership(monkeypatch):
    """A title mentioning the hinted project does not make a chunk in-scope.

    The second chunk lives under ``p06-polisher/`` with a ``p06-polisher``
    tag, but its title contains "GigaBIT" (an alias of the hinted project)
    and it has the smaller distance. Only the chunk from the hinted
    project's own path is expected back.
    """

    def _make_project(project_id, aliases):
        # Anonymous "Project" class instance carrying only the listed attrs.
        return type(
            "Project",
            (),
            {
                "project_id": project_id,
                "aliases": aliases,
                "ingest_roots": (),
            },
        )()

    target_project = _make_project("p04-gigabit", ("p04", "gigabit"))
    other_project = _make_project("p06-polisher", ("p06", "polisher", "p11"))

    class FakeStore:
        """Vector-store stand-in returning a genuine and a mis-titled hit."""

        def query(self, query_embedding, top_k=10, where=None):
            genuine_meta = {
                "heading_path": "Overview",
                "source_file": "p04-gigabit/pkm/_index.md",
                "tags": '["p04-gigabit"]',
                "title": "P04",
                "document_id": "doc-a",
            }
            # Path and tags say p06; only the title hints at p04.
            poisoned_meta = {
                "heading_path": "Overview",
                "source_file": "p06-polisher/pkm/architecture.md",
                "tags": '["p06-polisher"]',
                "title": "GigaBIT M1 mirror lessons",
                "document_id": "doc-b",
            }
            return {
                "ids": [["chunk-target", "chunk-poisoned-title"]],
                "documents": [["p04 doc", "p06 doc"]],
                "metadatas": [[genuine_meta, poisoned_meta]],
                "distances": [[0.2, 0.19]],
            }

    # Applied in insertion order, matching the original sequence of patches.
    patches = {
        "atocore.retrieval.retriever.get_vector_store": lambda: FakeStore(),
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": (
            lambda chunk_ids: set(chunk_ids)
        ),
        "atocore.retrieval.retriever.get_registered_project": (
            lambda project_name: target_project
        ),
        "atocore.retrieval.retriever.load_project_registry": (
            lambda: [target_project, other_project]
        ),
    }
    for dotted_target, replacement in patches.items():
        monkeypatch.setattr(dotted_target, replacement)

    hits = retrieve("mirror architecture", top_k=2, project_hint="p04")

    returned_ids = [hit.chunk_id for hit in hits]
    assert returned_ids == ["chunk-target"]
|
||||
|
||||
|
||||
def test_retrieve_project_scope_uses_path_segments_not_substrings(monkeypatch):
    """An alias appearing inside a longer word must not claim ownership.

    The registry contains a project aliased ``abb``. The shared chunk's
    file name (``cabbage-abbreviations.md``), heading and title all contain
    that letter sequence, yet the chunk is expected to be returned together
    with the hinted project's chunk.
    """

    def _make_project(project_id, aliases):
        # Anonymous "Project" class instance carrying only the listed attrs.
        return type(
            "Project",
            (),
            {
                "project_id": project_id,
                "aliases": aliases,
                "ingest_roots": (),
            },
        )()

    target_project = _make_project("p05-interferometer", ("p05", "interferometer"))
    abb_project = _make_project("abb-space", ("abb",))

    class FakeStore:
        """Vector-store stand-in returning a project hit and a shared hit."""

        def query(self, query_embedding, top_k=10, where=None):
            hinted_meta = {
                "heading_path": "Overview",
                "source_file": "p05-interferometer/pkm/_index.md",
                "tags": '["p05-interferometer"]',
                "title": "P05",
                "document_id": "doc-a",
            }
            # "abb" occurs only as a substring here, never as a path segment.
            shared_meta = {
                "heading_path": "Abbreviation notes",
                "source_file": "shared/cabbage-abbreviations.md",
                "tags": "[]",
                "title": "ABB-style abbreviations",
                "document_id": "doc-global",
            }
            return {
                "ids": [["chunk-target", "chunk-global"]],
                "documents": [["p05 doc", "global doc"]],
                "metadatas": [[hinted_meta, shared_meta]],
                "distances": [[0.2, 0.21]],
            }

    # Applied in insertion order, matching the original sequence of patches.
    patches = {
        "atocore.retrieval.retriever.get_vector_store": lambda: FakeStore(),
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": (
            lambda chunk_ids: set(chunk_ids)
        ),
        "atocore.retrieval.retriever.get_registered_project": (
            lambda project_name: target_project
        ),
        "atocore.retrieval.retriever.load_project_registry": (
            lambda: [target_project, abb_project]
        ),
    }
    for dotted_target, replacement in patches.items():
        monkeypatch.setattr(dotted_target, replacement)

    hits = retrieve("abbreviations", top_k=2, project_hint="p05")

    returned_ids = [hit.chunk_id for hit in hits]
    assert returned_ids == ["chunk-target", "chunk-global"]
|
||||
|
||||
|
||||
def test_retrieve_unknown_project_hint_does_not_widen_or_filter(monkeypatch):
    """An unregistered project hint leaves retrieval unscoped.

    ``get_registered_project`` is patched to return ``None``. The fake
    store asserts it is queried with ``top_k == 2`` (the value passed to
    ``retrieve``), and both returned chunks are expected back in store
    order.
    """

    class FakeStore:
        """Vector-store stand-in returning two untagged hits."""

        def query(self, query_embedding, top_k=10, where=None):
            assert top_k == 2
            first_meta = {
                "heading_path": "Overview",
                "source_file": "project-a/file.md",
                "tags": "[]",
                "title": "A",
                "document_id": "doc-a",
            }
            second_meta = {
                "heading_path": "Overview",
                "source_file": "project-b/file.md",
                "tags": "[]",
                "title": "B",
                "document_id": "doc-b",
            }
            return {
                "ids": [["chunk-a", "chunk-b"]],
                "documents": [["doc a", "doc b"]],
                "metadatas": [[first_meta, second_meta]],
                "distances": [[0.2, 0.21]],
            }

    # Applied in insertion order, matching the original sequence of patches.
    patches = {
        "atocore.retrieval.retriever.get_vector_store": lambda: FakeStore(),
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": (
            lambda chunk_ids: set(chunk_ids)
        ),
        # Simulates a hint that matches no registered project.
        "atocore.retrieval.retriever.get_registered_project": (
            lambda project_name: None
        ),
    }
    for dotted_target, replacement in patches.items():
        monkeypatch.setattr(dotted_target, replacement)

    hits = retrieve("overview", top_k=2, project_hint="unknown-project")

    returned_ids = [hit.chunk_id for hit in hits]
    assert returned_ids == ["chunk-a", "chunk-b"]
|
||||
|
||||
|
||||
def test_retrieve_downranks_archive_noise_and_prefers_high_signal_paths(monkeypatch):
|
||||
|
||||
Reference in New Issue
Block a user