"""Retrieval: query to ranked chunks.""" import re import time from dataclasses import dataclass import atocore.config as _config from atocore.models.database import get_connection from atocore.observability.logger import get_logger from atocore.projects.registry import get_registered_project from atocore.retrieval.embeddings import embed_query from atocore.retrieval.vector_store import get_vector_store log = get_logger("retriever") _STOP_TOKENS = { "about", "and", "current", "for", "from", "into", "like", "project", "shared", "system", "that", "the", "this", "what", "with", } _HIGH_SIGNAL_HINTS = ( "status", "decision", "requirements", "requirement", "roadmap", "charter", "system-map", "system_map", "contracts", "schema", "architecture", "workflow", "error-budget", "comparison-matrix", "selection-decision", ) _LOW_SIGNAL_HINTS = ( "/_archive/", "\\_archive\\", "/archive/", "\\archive\\", "_history", "history", "pre-cleanup", "pre-migration", "reviews/", ) @dataclass class ChunkResult: chunk_id: str content: str score: float heading_path: str source_file: str tags: str title: str document_id: str def retrieve( query: str, top_k: int | None = None, filter_tags: list[str] | None = None, project_hint: str | None = None, ) -> list[ChunkResult]: """Retrieve the most relevant chunks for a query.""" top_k = top_k or _config.settings.context_top_k start = time.time() query_embedding = embed_query(query) store = get_vector_store() where = None if filter_tags: if len(filter_tags) == 1: where = {"tags": {"$contains": f'"{filter_tags[0]}"'}} else: where = { "$and": [ {"tags": {"$contains": f'"{tag}"'}} for tag in filter_tags ] } results = store.query( query_embedding=query_embedding, top_k=top_k, where=where, ) chunks = [] if results and results["ids"] and results["ids"][0]: existing_ids = _existing_chunk_ids(results["ids"][0]) for i, chunk_id in enumerate(results["ids"][0]): if chunk_id not in existing_ids: continue distance = results["distances"][0][i] if results["distances"] else 0 score = 1.0 - distance meta = results["metadatas"][0][i] if results["metadatas"] else {} content = results["documents"][0][i] if results["documents"] else "" score *= _query_match_boost(query, meta) score *= _path_signal_boost(meta) if project_hint: score *= _project_match_boost(project_hint, meta) chunks.append( ChunkResult( chunk_id=chunk_id, content=content, score=round(score, 4), heading_path=meta.get("heading_path", ""), source_file=meta.get("source_file", ""), tags=meta.get("tags", "[]"), title=meta.get("title", ""), document_id=meta.get("document_id", ""), ) ) duration_ms = int((time.time() - start) * 1000) chunks.sort(key=lambda chunk: chunk.score, reverse=True) log.info( "retrieval_done", query=query[:100], top_k=top_k, results_count=len(chunks), duration_ms=duration_ms, ) return chunks def _project_match_boost(project_hint: str, metadata: dict) -> float: """Return a project-aware relevance multiplier for raw retrieval.""" hint_lower = project_hint.strip().lower() if not hint_lower: return 1.0 source_file = str(metadata.get("source_file", "")).lower() title = str(metadata.get("title", "")).lower() tags = str(metadata.get("tags", "")).lower() searchable = " ".join([source_file, title, tags]) project = get_registered_project(project_hint) candidate_names = {hint_lower} if project is not None: candidate_names.add(project.project_id.lower()) candidate_names.update(alias.lower() for alias in project.aliases) candidate_names.update( source_ref.subpath.replace("\\", "/").strip("/").split("/")[-1].lower() for source_ref in project.ingest_roots if source_ref.subpath.strip("/\\") ) for candidate in candidate_names: if candidate and candidate in searchable: return _config.settings.rank_project_match_boost return 1.0 def _query_match_boost(query: str, metadata: dict) -> float: """Boost chunks whose path/title/headings echo the query's high-signal terms.""" tokens = [ token for token in re.findall(r"[a-z0-9][a-z0-9_-]{2,}", query.lower()) if token not in _STOP_TOKENS ] if not tokens: return 1.0 searchable = " ".join( [ str(metadata.get("source_file", "")).lower(), str(metadata.get("title", "")).lower(), str(metadata.get("heading_path", "")).lower(), ] ) matches = sum(1 for token in set(tokens) if token in searchable) if matches <= 0: return 1.0 return min( 1.0 + matches * _config.settings.rank_query_token_step, _config.settings.rank_query_token_cap, ) def _path_signal_boost(metadata: dict) -> float: """Prefer current high-signal docs and gently down-rank archival noise.""" searchable = " ".join( [ str(metadata.get("source_file", "")).lower(), str(metadata.get("title", "")).lower(), str(metadata.get("heading_path", "")).lower(), ] ) multiplier = 1.0 if any(hint in searchable for hint in _LOW_SIGNAL_HINTS): multiplier *= _config.settings.rank_path_low_signal_penalty if any(hint in searchable for hint in _HIGH_SIGNAL_HINTS): multiplier *= _config.settings.rank_path_high_signal_boost return multiplier def _existing_chunk_ids(chunk_ids: list[str]) -> set[str]: """Filter out stale vector entries whose chunk rows no longer exist.""" if not chunk_ids: return set() placeholders = ", ".join("?" for _ in chunk_ids) with get_connection() as conn: rows = conn.execute( f"SELECT id FROM source_chunks WHERE id IN ({placeholders})", chunk_ids, ).fetchall() return {row["id"] for row in rows}