fix: pass project_hint into retrieve and add path-signal ranking
Two changes that belong together:
1. builder.build_context() now passes project_hint into retrieve(),
so the project-aware boost actually fires for the retrieval pipeline
driven by /context/build. Before this, only direct /query callers
benefited from the registered-project boost.
2. retriever now applies two more ranking signals on every chunk:
- _query_match_boost: boosts chunks whose source/title/heading
echo high-signal query tokens (stop list filters out generic
words like "the", "project", "system")
- _path_signal_boost: down-weights archival noise (_archive,
_history, pre-cleanup, reviews) by 0.72 and up-weights current
high-signal docs (status, decision, requirements, charter,
system-map, error-budget, ...) by 1.18
Tests:
- test_context_builder_passes_project_hint_to_retrieval verifies
the wiring fix
- test_retrieve_downranks_archive_noise_and_prefers_high_signal_paths
verifies the new ranking helpers prefer current docs over archive
This addresses the cross-project competition and archive bleed
called out in current-state.md after the Wave 1 ingestion.
This commit is contained in:
@@ -104,7 +104,15 @@ def build_context(
|
|||||||
retrieval_budget = budget - project_state_chars - memory_chars
|
retrieval_budget = budget - project_state_chars - memory_chars
|
||||||
|
|
||||||
# 4. Retrieve candidates
|
# 4. Retrieve candidates
|
||||||
candidates = retrieve(user_prompt, top_k=_config.settings.context_top_k) if retrieval_budget > 0 else []
|
candidates = (
|
||||||
|
retrieve(
|
||||||
|
user_prompt,
|
||||||
|
top_k=_config.settings.context_top_k,
|
||||||
|
project_hint=project_hint,
|
||||||
|
)
|
||||||
|
if retrieval_budget > 0
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
|
||||||
# 5. Score and rank
|
# 5. Score and rank
|
||||||
scored = _rank_chunks(candidates, project_hint)
|
scored = _rank_chunks(candidates, project_hint)
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
"""Retrieval: query → ranked chunks."""
|
"""Retrieval: query to ranked chunks."""
|
||||||
|
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
@@ -12,6 +13,54 @@ from atocore.retrieval.vector_store import get_vector_store
|
|||||||
|
|
||||||
log = get_logger("retriever")
|
log = get_logger("retriever")
|
||||||
|
|
||||||
|
_STOP_TOKENS = {
|
||||||
|
"about",
|
||||||
|
"and",
|
||||||
|
"current",
|
||||||
|
"for",
|
||||||
|
"from",
|
||||||
|
"into",
|
||||||
|
"like",
|
||||||
|
"project",
|
||||||
|
"shared",
|
||||||
|
"system",
|
||||||
|
"that",
|
||||||
|
"the",
|
||||||
|
"this",
|
||||||
|
"what",
|
||||||
|
"with",
|
||||||
|
}
|
||||||
|
|
||||||
|
_HIGH_SIGNAL_HINTS = (
|
||||||
|
"status",
|
||||||
|
"decision",
|
||||||
|
"requirements",
|
||||||
|
"requirement",
|
||||||
|
"roadmap",
|
||||||
|
"charter",
|
||||||
|
"system-map",
|
||||||
|
"system_map",
|
||||||
|
"contracts",
|
||||||
|
"schema",
|
||||||
|
"architecture",
|
||||||
|
"workflow",
|
||||||
|
"error-budget",
|
||||||
|
"comparison-matrix",
|
||||||
|
"selection-decision",
|
||||||
|
)
|
||||||
|
|
||||||
|
_LOW_SIGNAL_HINTS = (
|
||||||
|
"/_archive/",
|
||||||
|
"\\_archive\\",
|
||||||
|
"/archive/",
|
||||||
|
"\\archive\\",
|
||||||
|
"_history",
|
||||||
|
"history",
|
||||||
|
"pre-cleanup",
|
||||||
|
"pre-migration",
|
||||||
|
"reviews/",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ChunkResult:
|
class ChunkResult:
|
||||||
@@ -38,10 +87,6 @@ def retrieve(
|
|||||||
query_embedding = embed_query(query)
|
query_embedding = embed_query(query)
|
||||||
store = get_vector_store()
|
store = get_vector_store()
|
||||||
|
|
||||||
# Build filter
|
|
||||||
# Tags are stored as JSON strings like '["tag1", "tag2"]'.
|
|
||||||
# We use $contains with quoted tag to avoid substring false positives
|
|
||||||
# (e.g. searching "prod" won't match "production" because we search '"prod"').
|
|
||||||
where = None
|
where = None
|
||||||
if filter_tags:
|
if filter_tags:
|
||||||
if len(filter_tags) == 1:
|
if len(filter_tags) == 1:
|
||||||
@@ -66,13 +111,14 @@ def retrieve(
|
|||||||
for i, chunk_id in enumerate(results["ids"][0]):
|
for i, chunk_id in enumerate(results["ids"][0]):
|
||||||
if chunk_id not in existing_ids:
|
if chunk_id not in existing_ids:
|
||||||
continue
|
continue
|
||||||
# ChromaDB returns distances (lower = more similar for cosine)
|
|
||||||
# Convert to similarity score (1 - distance)
|
|
||||||
distance = results["distances"][0][i] if results["distances"] else 0
|
distance = results["distances"][0][i] if results["distances"] else 0
|
||||||
score = 1.0 - distance
|
score = 1.0 - distance
|
||||||
meta = results["metadatas"][0][i] if results["metadatas"] else {}
|
meta = results["metadatas"][0][i] if results["metadatas"] else {}
|
||||||
content = results["documents"][0][i] if results["documents"] else ""
|
content = results["documents"][0][i] if results["documents"] else ""
|
||||||
|
|
||||||
|
score *= _query_match_boost(query, meta)
|
||||||
|
score *= _path_signal_boost(meta)
|
||||||
if project_hint:
|
if project_hint:
|
||||||
score *= _project_match_boost(project_hint, meta)
|
score *= _project_match_boost(project_hint, meta)
|
||||||
|
|
||||||
@@ -132,6 +178,47 @@ def _project_match_boost(project_hint: str, metadata: dict) -> float:
|
|||||||
return 1.0
|
return 1.0
|
||||||
|
|
||||||
|
|
||||||
|
def _query_match_boost(query: str, metadata: dict) -> float:
    """Boost chunks whose path/title/headings echo the query's high-signal terms.

    Tokens of 3+ chars are pulled from the lowercased query, stop words are
    dropped, and each distinct token found in the chunk's source file, title,
    or heading path adds +0.08 to the multiplier, capped at 1.32. A chunk with
    no matches (or a query with no usable tokens) is left untouched (1.0).
    """
    terms = {
        term
        for term in re.findall(r"[a-z0-9][a-z0-9_-]{2,}", query.lower())
        if term not in _STOP_TOKENS
    }
    if not terms:
        return 1.0

    # Only metadata fields that describe *where* the chunk lives are searched,
    # not the chunk body itself.
    haystack = " ".join(
        str(metadata.get(field, "")).lower()
        for field in ("source_file", "title", "heading_path")
    )
    hit_count = sum(term in haystack for term in terms)
    if not hit_count:
        return 1.0
    return min(1.0 + hit_count * 0.08, 1.32)
|
||||||
|
|
||||||
|
|
||||||
|
def _path_signal_boost(metadata: dict) -> float:
    """Prefer current high-signal docs and gently down-rank archival noise.

    Applies 0.72x when any low-signal hint (archive/history paths) appears in
    the chunk's source file, title, or heading path, and 1.18x when any
    high-signal hint appears. Both can fire on the same chunk, so an archived
    requirements doc nets 0.72 * 1.18.
    """
    haystack = " ".join(
        str(metadata.get(field, "")).lower()
        for field in ("source_file", "title", "heading_path")
    )

    multiplier = 1.0
    # Each hint family contributes its factor at most once, however many
    # individual hints match.
    for hints, factor in ((_LOW_SIGNAL_HINTS, 0.72), (_HIGH_SIGNAL_HINTS, 1.18)):
        if any(hint in haystack for hint in hints):
            multiplier *= factor
    return multiplier
|
||||||
|
|
||||||
|
|
||||||
def _existing_chunk_ids(chunk_ids: list[str]) -> set[str]:
|
def _existing_chunk_ids(chunk_ids: list[str]) -> set[str]:
|
||||||
"""Filter out stale vector entries whose chunk rows no longer exist."""
|
"""Filter out stale vector entries whose chunk rows no longer exist."""
|
||||||
if not chunk_ids:
|
if not chunk_ids:
|
||||||
|
|||||||
@@ -41,6 +41,23 @@ def test_context_with_project_hint(tmp_data_dir, sample_markdown):
|
|||||||
assert pack.total_chars > 0
|
assert pack.total_chars > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_context_builder_passes_project_hint_to_retrieval(monkeypatch):
    """build_context() must forward its project_hint into retrieve()."""
    init_db()
    init_project_state_schema()

    recorded = []

    def fake_retrieve(query, top_k=None, filter_tags=None, project_hint=None):
        # Capture only the wiring under test; return no candidates so the
        # rest of the pipeline is a no-op.
        recorded.append((query, project_hint))
        return []

    monkeypatch.setattr("atocore.context.builder.retrieve", fake_retrieve)

    build_context("architecture", project_hint="p05-interferometer", budget=300)

    assert recorded == [("architecture", "p05-interferometer")]
|
||||||
|
|
||||||
|
|
||||||
def test_last_context_pack_stored(tmp_data_dir, sample_markdown):
|
def test_last_context_pack_stored(tmp_data_dir, sample_markdown):
|
||||||
"""Test that last context pack is stored for debug."""
|
"""Test that last context pack is stored for debug."""
|
||||||
init_db()
|
init_db()
|
||||||
|
|||||||
@@ -118,3 +118,58 @@ def test_retrieve_project_hint_boosts_matching_chunks(monkeypatch):
|
|||||||
assert len(results) == 2
|
assert len(results) == 2
|
||||||
assert results[0].chunk_id == "chunk-a"
|
assert results[0].chunk_id == "chunk-a"
|
||||||
assert results[0].score > results[1].score
|
assert results[0].score > results[1].score
|
||||||
|
|
||||||
|
|
||||||
|
def test_retrieve_downranks_archive_noise_and_prefers_high_signal_paths(monkeypatch):
    """An archive chunk with a *better* raw distance must still rank below a
    current high-signal doc once the path-signal boosts are applied."""
    archive_meta = {
        "heading_path": "History",
        "source_file": "p05-interferometer/pkm/_archive/old/Error-Budget.md",
        "tags": '["p05-interferometer"]',
        "title": "Old Error Budget",
        "document_id": "doc-a",
    }
    current_meta = {
        "heading_path": "Overview",
        "source_file": "p05-interferometer/pkm/Requirements/Error-Budget.md",
        "tags": '["p05-interferometer"]',
        "title": "Error Budget",
        "document_id": "doc-b",
    }

    class FakeStore:
        def query(self, query_embedding, top_k=10, where=None):
            # The archive chunk is returned closer (0.2 < 0.24), so only the
            # ranking helpers can flip the final order.
            return {
                "ids": [["chunk-archive", "chunk-requirements"]],
                "documents": [["archive doc", "requirements doc"]],
                "metadatas": [[archive_meta, current_meta]],
                "distances": [[0.2, 0.24]],
            }

    class FakeProject:
        project_id = "p05-interferometer"
        aliases = ("p05", "interferometer")
        ingest_roots = ()

    monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore())
    monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1])
    monkeypatch.setattr(
        "atocore.retrieval.retriever._existing_chunk_ids",
        lambda chunk_ids: set(chunk_ids),
    )
    monkeypatch.setattr(
        "atocore.retrieval.retriever.get_registered_project",
        lambda project_name: FakeProject(),
    )

    results = retrieve(
        "interferometer error budget vendor constraints",
        top_k=2,
        project_hint="p05-interferometer",
    )

    assert len(results) == 2
    assert results[0].chunk_id == "chunk-requirements"
    assert results[0].score > results[1].score
|
||||||
|
|||||||
Reference in New Issue
Block a user