diff --git a/src/atocore/context/builder.py b/src/atocore/context/builder.py
index 8ec41ec..928ea56 100644
--- a/src/atocore/context/builder.py
+++ b/src/atocore/context/builder.py
@@ -104,7 +104,15 @@ def build_context(
     retrieval_budget = budget - project_state_chars - memory_chars
 
     # 4. Retrieve candidates
-    candidates = retrieve(user_prompt, top_k=_config.settings.context_top_k) if retrieval_budget > 0 else []
+    candidates = (
+        retrieve(
+            user_prompt,
+            top_k=_config.settings.context_top_k,
+            project_hint=project_hint,
+        )
+        if retrieval_budget > 0
+        else []
+    )
 
     # 5. Score and rank
     scored = _rank_chunks(candidates, project_hint)
diff --git a/src/atocore/retrieval/retriever.py b/src/atocore/retrieval/retriever.py
index 524a523..462ceb9 100644
--- a/src/atocore/retrieval/retriever.py
+++ b/src/atocore/retrieval/retriever.py
@@ -1,5 +1,6 @@
-"""Retrieval: query → ranked chunks."""
+"""Retrieval: query to ranked chunks."""
 
+import re
 import time
 from dataclasses import dataclass
 
@@ -12,6 +13,54 @@
 from atocore.retrieval.vector_store import get_vector_store
 
 log = get_logger("retriever")
+
+_STOP_TOKENS = {
+    "about",
+    "and",
+    "current",
+    "for",
+    "from",
+    "into",
+    "like",
+    "project",
+    "shared",
+    "system",
+    "that",
+    "the",
+    "this",
+    "what",
+    "with",
+}
+
+_HIGH_SIGNAL_HINTS = (
+    "status",
+    "decision",
+    "requirements",
+    "requirement",
+    "roadmap",
+    "charter",
+    "system-map",
+    "system_map",
+    "contracts",
+    "schema",
+    "architecture",
+    "workflow",
+    "error-budget",
+    "comparison-matrix",
+    "selection-decision",
+)
+
+_LOW_SIGNAL_HINTS = (
+    "/_archive/",
+    "\\_archive\\",
+    "/archive/",
+    "\\archive\\",
+    "_history",
+    "history",
+    "pre-cleanup",
+    "pre-migration",
+    "reviews/",
+)
 
 @dataclass
 class ChunkResult:
@@ -38,10 +87,6 @@ def retrieve(
     query_embedding = embed_query(query)
     store = get_vector_store()
 
-    # Build filter
-    # Tags are stored as JSON strings like '["tag1", "tag2"]'.
-    # We use $contains with quoted tag to avoid substring false positives
-    # (e.g. searching "prod" won't match "production" because we search '"prod"').
     where = None
     if filter_tags:
         if len(filter_tags) == 1:
@@ -66,13 +111,14 @@
 
     for i, chunk_id in enumerate(results["ids"][0]):
         if chunk_id not in existing_ids:
            continue
-        # ChromaDB returns distances (lower = more similar for cosine)
-        # Convert to similarity score (1 - distance)
+
         distance = results["distances"][0][i] if results["distances"] else 0
         score = 1.0 - distance
         meta = results["metadatas"][0][i] if results["metadatas"] else {}
         content = results["documents"][0][i] if results["documents"] else ""
+        score *= _query_match_boost(query, meta)
+        score *= _path_signal_boost(meta)
         if project_hint:
             score *= _project_match_boost(project_hint, meta)
 
@@ -132,6 +178,47 @@ def _project_match_boost(project_hint: str, metadata: dict) -> float:
     return 1.0
 
 
+def _query_match_boost(query: str, metadata: dict) -> float:
+    """Boost chunks whose path/title/headings echo the query's high-signal terms."""
+    tokens = [
+        token
+        for token in re.findall(r"[a-z0-9][a-z0-9_-]{2,}", query.lower())
+        if token not in _STOP_TOKENS
+    ]
+    if not tokens:
+        return 1.0
+
+    searchable = " ".join(
+        [
+            str(metadata.get("source_file", "")).lower(),
+            str(metadata.get("title", "")).lower(),
+            str(metadata.get("heading_path", "")).lower(),
+        ]
+    )
+    matches = sum(1 for token in set(tokens) if token in searchable)
+    if matches <= 0:
+        return 1.0
+    return min(1.0 + matches * 0.08, 1.32)
+
+
+def _path_signal_boost(metadata: dict) -> float:
+    """Prefer current high-signal docs and gently down-rank archival noise."""
+    searchable = " ".join(
+        [
+            str(metadata.get("source_file", "")).lower(),
+            str(metadata.get("title", "")).lower(),
+            str(metadata.get("heading_path", "")).lower(),
+        ]
+    )
+
+    multiplier = 1.0
+    if any(hint in searchable for hint in _LOW_SIGNAL_HINTS):
+        multiplier *= 0.72
+    if any(hint in searchable for hint in _HIGH_SIGNAL_HINTS):
+        multiplier *= 1.18
+    return multiplier
+
+
 def _existing_chunk_ids(chunk_ids: list[str]) -> set[str]:
     """Filter out stale vector entries whose chunk rows no longer exist."""
     if not chunk_ids:
diff --git a/tests/test_context_builder.py b/tests/test_context_builder.py
index 7fea372..fe3c24f 100644
--- a/tests/test_context_builder.py
+++ b/tests/test_context_builder.py
@@ -41,6 +41,23 @@ def test_context_with_project_hint(tmp_data_dir, sample_markdown):
     assert pack.total_chars > 0
 
 
+def test_context_builder_passes_project_hint_to_retrieval(monkeypatch):
+    init_db()
+    init_project_state_schema()
+
+    calls = []
+
+    def fake_retrieve(query, top_k=None, filter_tags=None, project_hint=None):
+        calls.append((query, project_hint))
+        return []
+
+    monkeypatch.setattr("atocore.context.builder.retrieve", fake_retrieve)
+
+    build_context("architecture", project_hint="p05-interferometer", budget=300)
+
+    assert calls == [("architecture", "p05-interferometer")]
+
+
 def test_last_context_pack_stored(tmp_data_dir, sample_markdown):
     """Test that last context pack is stored for debug."""
     init_db()
diff --git a/tests/test_retrieval.py b/tests/test_retrieval.py
index 46ff746..121575d 100644
--- a/tests/test_retrieval.py
+++ b/tests/test_retrieval.py
@@ -118,3 +118,58 @@
     assert len(results) == 2
     assert results[0].chunk_id == "chunk-a"
     assert results[0].score > results[1].score
+
+
+def test_retrieve_downranks_archive_noise_and_prefers_high_signal_paths(monkeypatch):
+    class FakeStore:
+        def query(self, query_embedding, top_k=10, where=None):
+            return {
+                "ids": [["chunk-archive", "chunk-requirements"]],
+                "documents": [["archive doc", "requirements doc"]],
+                "metadatas": [[
+                    {
+                        "heading_path": "History",
+                        "source_file": "p05-interferometer/pkm/_archive/old/Error-Budget.md",
+                        "tags": '["p05-interferometer"]',
+                        "title": "Old Error Budget",
+                        "document_id": "doc-a",
+                    },
+                    {
+                        "heading_path": "Overview",
+                        "source_file": "p05-interferometer/pkm/Requirements/Error-Budget.md",
+                        "tags": '["p05-interferometer"]',
+                        "title": "Error Budget",
+                        "document_id": "doc-b",
+                    },
+                ]],
+                "distances": [[0.2, 0.24]],
+            }
+
+    monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore())
+    monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1])
+    monkeypatch.setattr(
+        "atocore.retrieval.retriever._existing_chunk_ids",
+        lambda chunk_ids: set(chunk_ids),
+    )
+    monkeypatch.setattr(
+        "atocore.retrieval.retriever.get_registered_project",
+        lambda project_name: type(
+            "Project",
+            (),
+            {
+                "project_id": "p05-interferometer",
+                "aliases": ("p05", "interferometer"),
+                "ingest_roots": (),
+            },
+        )(),
+    )
+
+    results = retrieve(
+        "interferometer error budget vendor constraints",
+        top_k=2,
+        project_hint="p05-interferometer",
+    )
+
+    assert len(results) == 2
+    assert results[0].chunk_id == "chunk-requirements"
+    assert results[0].score > results[1].score