"""Retrieval: query → ranked chunks."""

import time
from dataclasses import dataclass

import atocore.config as _config
from atocore.models.database import get_connection
from atocore.observability.logger import get_logger
from atocore.projects.registry import get_registered_project
from atocore.retrieval.embeddings import embed_query
from atocore.retrieval.vector_store import get_vector_store

log = get_logger("retriever")


@dataclass
class ChunkResult:
    """One retrieved chunk together with its relevance score and metadata.

    Instances are built by ``retrieve`` from the vector-store query result;
    the metadata-derived fields fall back to the defaults noted below when
    the store's metadata lacks the corresponding key.
    """

    # Chunk identifier as returned by the vector store.
    chunk_id: str
    # Stored document text of the chunk ("" when the store returns no documents).
    content: str
    # Similarity score: 1 - distance, optionally multiplied by the project
    # boost, rounded to 4 decimal places.
    score: float
    # Metadata "heading_path" value ("" when absent).
    heading_path: str
    # Metadata "source_file" value ("" when absent).
    source_file: str
    # Metadata "tags" value; a JSON-encoded list string ("[]" when absent).
    tags: str
    # Metadata "title" value ("" when absent).
    title: str
    # Metadata "document_id" value ("" when absent).
    document_id: str


def retrieve(
    query: str,
    top_k: int | None = None,
    filter_tags: list[str] | None = None,
    project_hint: str | None = None,
) -> list[ChunkResult]:
    """Retrieve the most relevant chunks for a query.

    Args:
        query: Free-text query; it is embedded and searched in the vector store.
        top_k: Maximum number of hits requested from the store. ``None`` (or
            any other falsy value such as ``0``) falls back to
            ``settings.context_top_k``.
        filter_tags: When given, only chunks whose tags contain *all* of these
            tags are considered.
        project_hint: When given, chunks whose metadata matches the project
            get their score multiplied by a project boost.

    Returns:
        Chunks sorted by descending score. Hits whose chunk row no longer
        exists in the database are dropped, so fewer than ``top_k`` results
        may be returned.
    """
    # ``or`` rather than ``is None`` so an explicit 0 also uses the default.
    top_k = top_k or _config.settings.context_top_k
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    start = time.perf_counter()

    query_embedding = embed_query(query)
    store = get_vector_store()

    results = store.query(
        query_embedding=query_embedding,
        top_k=top_k,
        where=_build_tag_filter(filter_tags),
    )

    chunks: list[ChunkResult] = []
    if results and results["ids"] and results["ids"][0]:
        hit_ids = results["ids"][0]
        existing_ids = _existing_chunk_ids(hit_ids)

        # Each result field may be missing as a whole; resolve the per-query
        # lists once instead of re-indexing on every iteration.
        distances = results["distances"][0] if results["distances"] else None
        metadatas = results["metadatas"][0] if results["metadatas"] else None
        documents = results["documents"][0] if results["documents"] else None

        for i, chunk_id in enumerate(hit_ids):
            # Skip stale vector entries whose chunk row was deleted.
            if chunk_id not in existing_ids:
                continue

            # ChromaDB returns distances (lower = more similar for cosine)
            # Convert to similarity score (1 - distance)
            distance = distances[i] if distances is not None else 0
            score = 1.0 - distance

            # Individual entries can be None (e.g. a record stored without
            # metadata); coalesce so the .get() calls below cannot fail and
            # ``content`` is always a string.
            meta = (metadatas[i] if metadatas is not None else {}) or {}
            content = (documents[i] if documents is not None else "") or ""

            if project_hint:
                # NOTE(review): if distance can exceed 1 the score is
                # negative, and multiplying it by a boost > 1 lowers its
                # rank instead of raising it — confirm this is acceptable.
                score *= _project_match_boost(project_hint, meta)

            chunks.append(
                ChunkResult(
                    chunk_id=chunk_id,
                    content=content,
                    score=round(score, 4),
                    heading_path=meta.get("heading_path", ""),
                    source_file=meta.get("source_file", ""),
                    tags=meta.get("tags", "[]"),
                    title=meta.get("title", ""),
                    document_id=meta.get("document_id", ""),
                )
            )

    # The project boost can reorder hits, so re-sort by the final score.
    chunks.sort(key=lambda chunk: chunk.score, reverse=True)
    duration_ms = int((time.perf_counter() - start) * 1000)

    log.info(
        "retrieval_done",
        query=query[:100],
        top_k=top_k,
        results_count=len(chunks),
        duration_ms=duration_ms,
    )

    return chunks


def _build_tag_filter(filter_tags: list[str] | None) -> dict | None:
    """Build the vector-store ``where`` filter requiring every given tag.

    Tags are stored as JSON strings like '["tag1", "tag2"]'. Each tag is
    matched with ``$contains`` on its *quoted* form to avoid substring false
    positives (e.g. searching "prod" won't match "production" because the
    search term is '"prod"').

    Returns ``None`` when no tags are given, a single clause for one tag, and
    an ``$and`` of clauses for several tags.
    """
    if not filter_tags:
        return None

    clauses = [{"tags": {"$contains": f'"{tag}"'}} for tag in filter_tags]
    if len(clauses) == 1:
        return clauses[0]
    return {"$and": clauses}


def _project_match_boost(project_hint: str, metadata: dict) -> float:
    """Return a project-aware relevance multiplier for raw retrieval."""
    hint_lower = project_hint.strip().lower()
    if not hint_lower:
        return 1.0

    source_file = str(metadata.get("source_file", "")).lower()
    title = str(metadata.get("title", "")).lower()
    tags = str(metadata.get("tags", "")).lower()
    searchable = " ".join([source_file, title, tags])

    project = get_registered_project(project_hint)
    candidate_names = {hint_lower}
    if project is not None:
        candidate_names.add(project.project_id.lower())
        candidate_names.update(alias.lower() for alias in project.aliases)
        candidate_names.update(
            source_ref.subpath.replace("\\", "/").strip("/").split("/")[-1].lower()
            for source_ref in project.ingest_roots
            if source_ref.subpath.strip("/\\")
        )

    for candidate in candidate_names:
        if candidate and candidate in searchable:
            return 2.0

    return 1.0


def _existing_chunk_ids(chunk_ids: list[str]) -> set[str]:
    """Filter out stale vector entries whose chunk rows no longer exist."""
    if not chunk_ids:
        return set()

    placeholders = ", ".join("?" for _ in chunk_ids)
    with get_connection() as conn:
        rows = conn.execute(
            f"SELECT id FROM source_chunks WHERE id IN ({placeholders})",
            chunk_ids,
        ).fetchall()
    return {row["id"] for row in rows}