# ATOCore/src/atocore/retrieval/retriever.py

"""Retrieval: query to ranked chunks."""

import re
import time
from dataclasses import dataclass

import atocore.config as _config
from atocore.models.database import get_connection
from atocore.observability.logger import get_logger
from atocore.projects.registry import get_registered_project
from atocore.retrieval.embeddings import embed_query
from atocore.retrieval.vector_store import get_vector_store

# Module-level logger; retrieve() reports a "retrieval_done" event through it.
log = get_logger("retriever")

# Lowercase query tokens that _query_match_boost discards before comparing the
# query against chunk metadata, so they never contribute to the match count.
_STOP_TOKENS = {
    "about",
    "and",
    "current",
    "for",
    "from",
    "into",
    "like",
    "project",
    "shared",
    "system",
    "that",
    "the",
    "this",
    "what",
    "with",
}

# Substrings that _path_signal_boost looks for in a chunk's lowercased
# source_file / title / heading_path. If any one is present, the chunk's score
# is multiplied by settings.rank_path_high_signal_boost (applied once, no
# matter how many hints match).
# Matching is plain substring containment, so "requirement" already covers
# "requirements"; both spellings are kept as written.
_HIGH_SIGNAL_HINTS = (
    "status",
    "decision",
    "requirements",
    "requirement",
    "roadmap",
    "charter",
    "system-map",
    "system_map",
    "contracts",
    "schema",
    "architecture",
    "workflow",
    "error-budget",
    "comparison-matrix",
    "selection-decision",
)

# Substrings that _path_signal_boost looks for in a chunk's lowercased
# source_file / title / heading_path. If any one is present, the chunk's score
# is multiplied by settings.rank_path_low_signal_penalty (applied once).
# Archive directories are listed with both "/" and "\" separators.
# Matching is plain substring containment, so the bare "history" entry also
# fires on titles and headings containing that word and makes "_history"
# redundant.
_LOW_SIGNAL_HINTS = (
    "/_archive/",
    "\\_archive\\",
    "/archive/",
    "\\archive\\",
    "_history",
    "history",
    "pre-cleanup",
    "pre-migration",
    "reviews/",
)


@dataclass
class ChunkResult:
    """One retrieved chunk together with its re-ranked relevance score.

    Instances are built by ``retrieve`` from a vector-store hit. Every field
    other than ``chunk_id``, ``content`` and ``score`` is read from the hit's
    metadata, with the defaults noted below when a key is absent.
    """

    # Id of the vector-store hit; retrieve() only keeps ids that still have a
    # row in the ``source_chunks`` table.
    chunk_id: str
    # Document text returned by the vector store for this hit ("" if none).
    content: str
    # ``1.0 - distance`` multiplied by the metadata boosts, rounded to 4 places.
    score: float
    # ``heading_path`` metadata value ("" when missing).
    heading_path: str
    # ``source_file`` metadata value ("" when missing).
    source_file: str
    # ``tags`` metadata value; defaults to the string "[]" when missing
    # (presumably a JSON-encoded list — confirm against the ingest side).
    tags: str
    # ``title`` metadata value ("" when missing).
    title: str
    # ``document_id`` metadata value ("" when missing).
    document_id: str

def retrieve(
    query: str,
    top_k: int | None = None,
    filter_tags: list[str] | None = None,
    project_hint: str | None = None,
) -> list[ChunkResult]:
    """Retrieve the most relevant chunks for a query.

    The query is embedded and sent to the vector store, hits whose chunk row
    no longer exists in the database are dropped, and the surviving hits are
    re-scored with metadata-based multipliers before being sorted.

    Args:
        query: Free-text query. Only its first 100 characters are logged.
        top_k: Number of neighbours requested from the vector store. A falsy
            value (``None`` or ``0``) falls back to
            ``settings.context_top_k``. Stale hits are removed after the
            query, so fewer than ``top_k`` chunks may come back.
        filter_tags: Optional tags. One ``$contains`` clause is built per tag
            against the ``tags`` metadata field; several clauses are joined
            with ``$and``.
        project_hint: Optional project name. When truthy, each hit's score is
            additionally multiplied by ``_project_match_boost``.

    Returns:
        ``ChunkResult`` objects sorted by descending score; an empty list
        when the store returns no ids.
    """
    top_k = top_k or _config.settings.context_top_k
    start = time.time()

    query_embedding = embed_query(query)
    store = get_vector_store()

    where = None
    if filter_tags:
        # Each tag is wrapped in double quotes before matching — presumably
        # because ``tags`` is stored as a JSON-encoded list string, so the
        # quotes anchor the match on whole tags. TODO confirm with ingest.
        tag_clauses = [
            {"tags": {"$contains": f'"{tag}"'}} for tag in filter_tags
        ]
        # A single clause is passed bare; ``$and`` is only used for 2+ tags.
        if len(tag_clauses) == 1:
            where = tag_clauses[0]
        else:
            where = {"$and": tag_clauses}

    results = store.query(
        query_embedding=query_embedding,
        top_k=top_k,
        where=where,
    )

    chunks = []
    if results and results["ids"] and results["ids"][0]:
        # Results are nested per query; index 0 is the only query issued here.
        hit_ids = results["ids"][0]
        existing_ids = _existing_chunk_ids(hit_ids)
        for i, chunk_id in enumerate(hit_ids):
            # Skip vector entries whose chunk row has been deleted.
            if chunk_id not in existing_ids:
                continue

            distance = results["distances"][0][i] if results["distances"] else 0
            # Smaller distance means a higher score.
            score = 1.0 - distance

            # A hit stored without metadata or document text can come back as
            # None in Chroma-style results (NOTE(review): confirm for this
            # store). Normalise so the boost helpers and ChunkResult always
            # receive a dict / str instead of raising AttributeError.
            meta = results["metadatas"][0][i] if results["metadatas"] else None
            if meta is None:
                meta = {}
            content = results["documents"][0][i] if results["documents"] else None
            if content is None:
                content = ""

            score *= _query_match_boost(query, meta)
            score *= _path_signal_boost(meta)
            if project_hint:
                score *= _project_match_boost(project_hint, meta)

            chunks.append(
                ChunkResult(
                    chunk_id=chunk_id,
                    content=content,
                    score=round(score, 4),
                    heading_path=meta.get("heading_path", ""),
                    source_file=meta.get("source_file", ""),
                    tags=meta.get("tags", "[]"),
                    title=meta.get("title", ""),
                    document_id=meta.get("document_id", ""),
                )
            )

    duration_ms = int((time.time() - start) * 1000)
    # The boosts can reorder hits relative to raw distance, so sort last.
    chunks.sort(key=lambda chunk: chunk.score, reverse=True)

    log.info(
        "retrieval_done",
        query=query[:100],
        top_k=top_k,
        results_count=len(chunks),
        duration_ms=duration_ms,
    )

    return chunks


def _project_match_boost(project_hint: str, metadata: dict) -> float:
    """Return a project-aware relevance multiplier for raw retrieval."""
    hint_lower = project_hint.strip().lower()
    if not hint_lower:
        return 1.0

    source_file = str(metadata.get("source_file", "")).lower()
    title = str(metadata.get("title", "")).lower()
    tags = str(metadata.get("tags", "")).lower()
    searchable = " ".join([source_file, title, tags])

    project = get_registered_project(project_hint)
    candidate_names = {hint_lower}
    if project is not None:
        candidate_names.add(project.project_id.lower())
        candidate_names.update(alias.lower() for alias in project.aliases)
        candidate_names.update(
            source_ref.subpath.replace("\\", "/").strip("/").split("/")[-1].lower()
            for source_ref in project.ingest_roots
            if source_ref.subpath.strip("/\\")
        )

    for candidate in candidate_names:
        if candidate and candidate in searchable:
            return _config.settings.rank_project_match_boost

    return 1.0


def _query_match_boost(query: str, metadata: dict) -> float:
    """Boost chunks whose path/title/headings echo the query's high-signal terms."""
    tokens = [
        token
        for token in re.findall(r"[a-z0-9][a-z0-9_-]{2,}", query.lower())
        if token not in _STOP_TOKENS
    ]
    if not tokens:
        return 1.0

    searchable = " ".join(
        [
            str(metadata.get("source_file", "")).lower(),
            str(metadata.get("title", "")).lower(),
            str(metadata.get("heading_path", "")).lower(),
        ]
    )
    matches = sum(1 for token in set(tokens) if token in searchable)
    if matches <= 0:
        return 1.0
    return min(
        1.0 + matches * _config.settings.rank_query_token_step,
        _config.settings.rank_query_token_cap,
    )


def _path_signal_boost(metadata: dict) -> float:
    """Return a multiplier based on hint substrings in the chunk's metadata.

    The chunk's lowercased ``source_file``, ``title`` and ``heading_path``
    are searched for the low-signal and high-signal hint substrings. A
    low-signal match multiplies by ``settings.rank_path_low_signal_penalty``
    and a high-signal match by ``settings.rank_path_high_signal_boost``. Both
    factors apply when both kinds match; with no match the result is ``1.0``.
    """
    haystack = " ".join(
        str(metadata.get(field, "")).lower()
        for field in ("source_file", "title", "heading_path")
    )

    def mentions_any(hints) -> bool:
        # True as soon as one hint occurs anywhere in the searchable text.
        for hint in hints:
            if hint in haystack:
                return True
        return False

    factor = 1.0
    if mentions_any(_LOW_SIGNAL_HINTS):
        factor *= _config.settings.rank_path_low_signal_penalty
    if mentions_any(_HIGH_SIGNAL_HINTS):
        factor *= _config.settings.rank_path_high_signal_boost
    return factor


def _existing_chunk_ids(chunk_ids: list[str]) -> set[str]:
    """Filter out stale vector entries whose chunk rows no longer exist."""
    if not chunk_ids:
        return set()

    placeholders = ", ".join("?" for _ in chunk_ids)
    with get_connection() as conn:
        rows = conn.execute(
            f"SELECT id FROM source_chunks WHERE id IN ({placeholders})",
            chunk_ids,
        ).fetchall()
    return {row["id"] for row in rows}