fix: pass project_hint into retrieve and add path-signal ranking

Two changes that belong together:

1. builder.build_context() now passes project_hint into retrieve(), so the project-aware boost actually fires for the retrieval pipeline driven by /context/build. Before this, only direct /query callers benefited from the registered-project boost.

2. The retriever now applies two more ranking signals to every chunk:
   - _query_match_boost: boosts chunks whose source file, title, or heading path echoes high-signal query tokens (a stop list filters out generic words like "the", "project", "system").
   - _path_signal_boost: down-weights archival noise (_archive, _history, pre-cleanup, reviews) by multiplying the score by 0.72, and up-weights current high-signal docs (status, decision, requirements, charter, system-map, error-budget, ...) by multiplying the score by 1.18.

Tests:
- test_context_builder_passes_project_hint_to_retrieval verifies the wiring fix.
- test_retrieve_downranks_archive_noise_and_prefers_high_signal_paths verifies that the new ranking helpers prefer current docs over archived ones.

This addresses the cross-project competition and archive bleed called out in current-state.md after the Wave 1 ingestion.
2026-04-06 18:37:07 -04:00
parent bdb42dba05
commit 14ab7c8e9f
4 changed files with 175 additions and 8 deletions
--- a/src/atocore/context/builder.py
+++ b/src/atocore/context/builder.py
@@ -104,7 +104,15 @@ def build_context(
    retrieval_budget = budget - project_state_chars - memory_chars

    # 4. Retrieve candidates
-    candidates = retrieve(user_prompt, top_k=_config.settings.context_top_k) if retrieval_budget > 0 else []
+    candidates = (
+        retrieve(
+            user_prompt,
+            top_k=_config.settings.context_top_k,
+            project_hint=project_hint,
+        )
+        if retrieval_budget > 0
+        else []
+    )

    # 5. Score and rank
    scored = _rank_chunks(candidates, project_hint)
--- a/src/atocore/retrieval/retriever.py
+++ b/src/atocore/retrieval/retriever.py
@@ -1,5 +1,6 @@
-"""Retrieval: query → ranked chunks."""
+"""Retrieval: query to ranked chunks."""

+import re
 import time
 from dataclasses import dataclass

@@ -12,6 +13,54 @@ from atocore.retrieval.vector_store import get_vector_store

 log = get_logger("retriever")

+_STOP_TOKENS = {
+    "about",
+    "and",
+    "current",
+    "for",
+    "from",
+    "into",
+    "like",
+    "project",
+    "shared",
+    "system",
+    "that",
+    "the",
+    "this",
+    "what",
+    "with",
+}
+
+_HIGH_SIGNAL_HINTS = (
+    "status",
+    "decision",
+    "requirements",
+    "requirement",
+    "roadmap",
+    "charter",
+    "system-map",
+    "system_map",
+    "contracts",
+    "schema",
+    "architecture",
+    "workflow",
+    "error-budget",
+    "comparison-matrix",
+    "selection-decision",
+)
+
+_LOW_SIGNAL_HINTS = (
+    "/_archive/",
+    "\\_archive\\",
+    "/archive/",
+    "\\archive\\",
+    "_history",
+    "history",
+    "pre-cleanup",
+    "pre-migration",
+    "reviews/",
+)
+

@dataclass
 class ChunkResult:
@@ -38,10 +87,6 @@ def retrieve(
    query_embedding = embed_query(query)
    store = get_vector_store()

-    # Build filter
-    # Tags are stored as JSON strings like '["tag1", "tag2"]'.
-    # We use $contains with quoted tag to avoid substring false positives
-    # (e.g. searching "prod" won't match "production" because we search '"prod"').
    where = None
    if filter_tags:
        if len(filter_tags) == 1:
@@ -66,13 +111,14 @@ def retrieve(
        for i, chunk_id in enumerate(results["ids"][0]):
            if chunk_id not in existing_ids:
                continue
-            # ChromaDB returns distances (lower = more similar for cosine)
-            # Convert to similarity score (1 - distance)
+
            distance = results["distances"][0][i] if results["distances"] else 0
            score = 1.0 - distance
            meta = results["metadatas"][0][i] if results["metadatas"] else {}
            content = results["documents"][0][i] if results["documents"] else ""

+            score *= _query_match_boost(query, meta)
+            score *= _path_signal_boost(meta)
            if project_hint:
                score *= _project_match_boost(project_hint, meta)

@@ -132,6 +178,47 @@ def _project_match_boost(project_hint: str, metadata: dict) -> float:
    return 1.0


+def _query_match_boost(query: str, metadata: dict) -> float:
+    """Boost chunks whose path/title/headings echo the query's high-signal terms."""
+    tokens = [
+        token
+        for token in re.findall(r"[a-z0-9][a-z0-9_-]{2,}", query.lower())
+        if token not in _STOP_TOKENS
+    ]
+    if not tokens:
+        return 1.0
+
+    searchable = " ".join(
+        [
+            str(metadata.get("source_file", "")).lower(),
+            str(metadata.get("title", "")).lower(),
+            str(metadata.get("heading_path", "")).lower(),
+        ]
+    )
+    matches = sum(1 for token in set(tokens) if token in searchable)
+    if matches <= 0:
+        return 1.0
+    return min(1.0 + matches * 0.08, 1.32)
+
+
+def _path_signal_boost(metadata: dict) -> float:
+    """Prefer current high-signal docs and gently down-rank archival noise."""
+    searchable = " ".join(
+        [
+            str(metadata.get("source_file", "")).lower(),
+            str(metadata.get("title", "")).lower(),
+            str(metadata.get("heading_path", "")).lower(),
+        ]
+    )
+
+    multiplier = 1.0
+    if any(hint in searchable for hint in _LOW_SIGNAL_HINTS):
+        multiplier *= 0.72
+    if any(hint in searchable for hint in _HIGH_SIGNAL_HINTS):
+        multiplier *= 1.18
+    return multiplier
+
+
 def _existing_chunk_ids(chunk_ids: list[str]) -> set[str]:
    """Filter out stale vector entries whose chunk rows no longer exist."""
    if not chunk_ids:
--- a/tests/test_context_builder.py
+++ b/tests/test_context_builder.py
@@ -41,6 +41,23 @@ def test_context_with_project_hint(tmp_data_dir, sample_markdown):
    assert pack.total_chars > 0


+def test_context_builder_passes_project_hint_to_retrieval(monkeypatch):
+    init_db()
+    init_project_state_schema()
+
+    calls = []
+
+    def fake_retrieve(query, top_k=None, filter_tags=None, project_hint=None):
+        calls.append((query, project_hint))
+        return []
+
+    monkeypatch.setattr("atocore.context.builder.retrieve", fake_retrieve)
+
+    build_context("architecture", project_hint="p05-interferometer", budget=300)
+
+    assert calls == [("architecture", "p05-interferometer")]
+
+
 def test_last_context_pack_stored(tmp_data_dir, sample_markdown):
    """Test that last context pack is stored for debug."""
    init_db()
--- a/tests/test_retrieval.py
+++ b/tests/test_retrieval.py
@@ -118,3 +118,58 @@ def test_retrieve_project_hint_boosts_matching_chunks(monkeypatch):
    assert len(results) == 2
    assert results[0].chunk_id == "chunk-a"
    assert results[0].score > results[1].score
+
+
+def test_retrieve_downranks_archive_noise_and_prefers_high_signal_paths(monkeypatch):
+    class FakeStore:
+        def query(self, query_embedding, top_k=10, where=None):
+            return {
+                "ids": [["chunk-archive", "chunk-requirements"]],
+                "documents": [["archive doc", "requirements doc"]],
+                "metadatas": [[
+                    {
+                        "heading_path": "History",
+                        "source_file": "p05-interferometer/pkm/_archive/old/Error-Budget.md",
+                        "tags": '["p05-interferometer"]',
+                        "title": "Old Error Budget",
+                        "document_id": "doc-a",
+                    },
+                    {
+                        "heading_path": "Overview",
+                        "source_file": "p05-interferometer/pkm/Requirements/Error-Budget.md",
+                        "tags": '["p05-interferometer"]',
+                        "title": "Error Budget",
+                        "document_id": "doc-b",
+                    },
+                ]],
+                "distances": [[0.2, 0.24]],
+            }
+
+    monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore())
+    monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1])
+    monkeypatch.setattr(
+        "atocore.retrieval.retriever._existing_chunk_ids",
+        lambda chunk_ids: set(chunk_ids),
+    )
+    monkeypatch.setattr(
+        "atocore.retrieval.retriever.get_registered_project",
+        lambda project_name: type(
+            "Project",
+            (),
+            {
+                "project_id": "p05-interferometer",
+                "aliases": ("p05", "interferometer"),
+                "ingest_roots": (),
+            },
+        )(),
+    )
+
+    results = retrieve(
+        "interferometer error budget vendor constraints",
+        top_k=2,
+        project_hint="p05-interferometer",
+    )
+
+    assert len(results) == 2
+    assert results[0].chunk_id == "chunk-requirements"
+    assert results[0].score > results[1].score