fix: pass project_hint into retrieve and add path-signal ranking

Two changes that belong together:

1. builder.build_context() now passes project_hint into retrieve(),
   so the project-aware boost actually fires for the retrieval pipeline
   driven by /context/build. Before this, only direct /query callers
   benefited from the registered-project boost.

2. retriever now applies two more ranking signals on every chunk:
   - _query_match_boost: boosts chunks whose source/title/heading
     echo high-signal query tokens (stop list filters out generic
     words like "the", "project", "system")
   - _path_signal_boost: down-weights archival noise (_archive,
     _history, pre-cleanup, reviews) by 0.72 and up-weights current
     high-signal docs (status, decision, requirements, charter,
     system-map, error-budget, ...) by 1.18

Tests:
- test_context_builder_passes_project_hint_to_retrieval verifies
  the wiring fix
- test_retrieve_downranks_archive_noise_and_prefers_high_signal_paths
  verifies the new ranking helpers prefer current docs over archive

This addresses the cross-project competition and archive bleed
called out in current-state.md after the Wave 1 ingestion.
This commit is contained in:
2026-04-06 18:37:07 -04:00
parent bdb42dba05
commit 14ab7c8e9f
4 changed files with 175 additions and 8 deletions

View File

@@ -104,7 +104,15 @@ def build_context(
retrieval_budget = budget - project_state_chars - memory_chars
# 4. Retrieve candidates
candidates = retrieve(user_prompt, top_k=_config.settings.context_top_k) if retrieval_budget > 0 else []
candidates = (
retrieve(
user_prompt,
top_k=_config.settings.context_top_k,
project_hint=project_hint,
)
if retrieval_budget > 0
else []
)
# 5. Score and rank
scored = _rank_chunks(candidates, project_hint)

View File

@@ -1,5 +1,6 @@
"""Retrieval: query ranked chunks."""
"""Retrieval: query to ranked chunks."""
import re
import time
from dataclasses import dataclass
@@ -12,6 +13,54 @@ from atocore.retrieval.vector_store import get_vector_store
log = get_logger("retriever")
_STOP_TOKENS = {
"about",
"and",
"current",
"for",
"from",
"into",
"like",
"project",
"shared",
"system",
"that",
"the",
"this",
"what",
"with",
}
_HIGH_SIGNAL_HINTS = (
"status",
"decision",
"requirements",
"requirement",
"roadmap",
"charter",
"system-map",
"system_map",
"contracts",
"schema",
"architecture",
"workflow",
"error-budget",
"comparison-matrix",
"selection-decision",
)
_LOW_SIGNAL_HINTS = (
"/_archive/",
"\\_archive\\",
"/archive/",
"\\archive\\",
"_history",
"history",
"pre-cleanup",
"pre-migration",
"reviews/",
)
@dataclass
class ChunkResult:
@@ -38,10 +87,6 @@ def retrieve(
query_embedding = embed_query(query)
store = get_vector_store()
# Build filter
# Tags are stored as JSON strings like '["tag1", "tag2"]'.
# We use $contains with quoted tag to avoid substring false positives
# (e.g. searching "prod" won't match "production" because we search '"prod"').
where = None
if filter_tags:
if len(filter_tags) == 1:
@@ -66,13 +111,14 @@ def retrieve(
for i, chunk_id in enumerate(results["ids"][0]):
if chunk_id not in existing_ids:
continue
# ChromaDB returns distances (lower = more similar for cosine)
# Convert to similarity score (1 - distance)
distance = results["distances"][0][i] if results["distances"] else 0
score = 1.0 - distance
meta = results["metadatas"][0][i] if results["metadatas"] else {}
content = results["documents"][0][i] if results["documents"] else ""
score *= _query_match_boost(query, meta)
score *= _path_signal_boost(meta)
if project_hint:
score *= _project_match_boost(project_hint, meta)
@@ -132,6 +178,47 @@ def _project_match_boost(project_hint: str, metadata: dict) -> float:
return 1.0
def _query_match_boost(query: str, metadata: dict) -> float:
"""Boost chunks whose path/title/headings echo the query's high-signal terms."""
tokens = [
token
for token in re.findall(r"[a-z0-9][a-z0-9_-]{2,}", query.lower())
if token not in _STOP_TOKENS
]
if not tokens:
return 1.0
searchable = " ".join(
[
str(metadata.get("source_file", "")).lower(),
str(metadata.get("title", "")).lower(),
str(metadata.get("heading_path", "")).lower(),
]
)
matches = sum(1 for token in set(tokens) if token in searchable)
if matches <= 0:
return 1.0
return min(1.0 + matches * 0.08, 1.32)
def _path_signal_boost(metadata: dict) -> float:
"""Prefer current high-signal docs and gently down-rank archival noise."""
searchable = " ".join(
[
str(metadata.get("source_file", "")).lower(),
str(metadata.get("title", "")).lower(),
str(metadata.get("heading_path", "")).lower(),
]
)
multiplier = 1.0
if any(hint in searchable for hint in _LOW_SIGNAL_HINTS):
multiplier *= 0.72
if any(hint in searchable for hint in _HIGH_SIGNAL_HINTS):
multiplier *= 1.18
return multiplier
def _existing_chunk_ids(chunk_ids: list[str]) -> set[str]:
"""Filter out stale vector entries whose chunk rows no longer exist."""
if not chunk_ids: