Files
ATOCore/src/atocore/retrieval/retriever.py

147 lines
4.6 KiB
Python
Raw Normal View History

"""Retrieval: query → ranked chunks."""
import time
from dataclasses import dataclass
import atocore.config as _config
from atocore.models.database import get_connection
from atocore.observability.logger import get_logger
2026-04-06 13:32:33 -04:00
from atocore.projects.registry import get_registered_project
from atocore.retrieval.embeddings import embed_query
from atocore.retrieval.vector_store import get_vector_store
log = get_logger("retriever")
@dataclass
class ChunkResult:
    """One retrieved chunk together with its relevance score.

    Field order is part of the public interface (positional construction),
    so it must not be rearranged.
    """

    # Identifier of the chunk as reported by the vector store.
    chunk_id: str
    # Text of the chunk.
    content: str
    # Relevance score; higher means more relevant.
    score: float
    # The remaining fields are copied from the chunk's stored metadata.
    heading_path: str
    source_file: str
    # JSON-encoded list of tags, e.g. '["tag1", "tag2"]'.
    tags: str
    title: str
    document_id: str
def retrieve(
    query: str,
    top_k: int | None = None,
    filter_tags: list[str] | None = None,
    project_hint: str | None = None,
) -> list[ChunkResult]:
    """Retrieve the most relevant chunks for a query.

    The query is embedded, the vector store is searched, hits whose chunk row
    no longer exists are discarded, and the survivors are returned sorted by
    descending score.

    Args:
        query: Free-text query to embed and search for.
        top_k: Number of hits to request from the vector store. ``None`` (or
            ``0``) falls back to ``settings.context_top_k``. Fewer results
            may come back because stale hits are filtered after the search.
        filter_tags: When given, only chunks whose stored tag list contains
            every one of these tags are considered.
        project_hint: When given, each score is multiplied by the factor
            returned by ``_project_match_boost`` for that chunk's metadata.

    Returns:
        ``ChunkResult`` objects, highest score first. Scores are rounded to
        four decimal places.
    """
    top_k = top_k or _config.settings.context_top_k
    start = time.time()

    query_embedding = embed_query(query)
    store = get_vector_store()

    # Build filter.
    # Tags are stored as JSON strings like '["tag1", "tag2"]'.
    # We use $contains with the quoted tag to avoid substring false positives
    # (e.g. searching "prod" won't match "production" because we search '"prod"').
    tag_clauses = [
        {"tags": {"$contains": f'"{tag}"'}} for tag in (filter_tags or [])
    ]
    if not tag_clauses:
        where = None
    elif len(tag_clauses) == 1:
        # A lone clause is passed bare rather than wrapped in "$and".
        where = tag_clauses[0]
    else:
        where = {"$and": tag_clauses}

    results = store.query(
        query_embedding=query_embedding,
        top_k=top_k,
        where=where,
    )

    chunks: list[ChunkResult] = []
    if results and results["ids"] and results["ids"][0]:
        hit_ids = results["ids"][0]
        existing_ids = _existing_chunk_ids(hit_ids)

        # Any of these columns may be absent from the result; fall back to
        # neutral defaults in that case.
        distances = results["distances"][0] if results["distances"] else None
        metadatas = results["metadatas"][0] if results["metadatas"] else None
        documents = results["documents"][0] if results["documents"] else None

        for i, chunk_id in enumerate(hit_ids):
            if chunk_id not in existing_ids:
                # Stale vector entry: the chunk row is gone.
                continue

            # ChromaDB returns distances (lower = more similar for cosine).
            # Convert to similarity score (1 - distance).
            distance = distances[i] if distances is not None else 0
            score = 1.0 - distance

            # Individual entries can themselves be None (e.g. a record stored
            # without metadata or document text), so normalise them before
            # calling .get() / building the result.
            meta = (metadatas[i] if metadatas is not None else None) or {}
            content = (documents[i] if documents is not None else None) or ""

            if project_hint:
                # NOTE(review): the boost is a multiplier, so a negative score
                # (distance > 1) is pushed further down rather than up for a
                # matching project - confirm whether that is intended.
                score *= _project_match_boost(project_hint, meta)

            chunks.append(
                ChunkResult(
                    chunk_id=chunk_id,
                    content=content,
                    score=round(score, 4),
                    heading_path=meta.get("heading_path", ""),
                    source_file=meta.get("source_file", ""),
                    tags=meta.get("tags", "[]"),
                    title=meta.get("title", ""),
                    document_id=meta.get("document_id", ""),
                )
            )

    # Boosting can reorder hits, so rank again on the final scores.
    chunks.sort(key=lambda chunk: chunk.score, reverse=True)

    # Measured after sorting so the logged duration covers the whole call.
    duration_ms = int((time.time() - start) * 1000)
    log.info(
        "retrieval_done",
        query=query[:100],
        top_k=top_k,
        results_count=len(chunks),
        duration_ms=duration_ms,
    )
    return chunks
2026-04-06 13:32:33 -04:00
def _project_match_boost(project_hint: str, metadata: dict) -> float:
"""Return a project-aware relevance multiplier for raw retrieval."""
hint_lower = project_hint.strip().lower()
if not hint_lower:
return 1.0
source_file = str(metadata.get("source_file", "")).lower()
title = str(metadata.get("title", "")).lower()
tags = str(metadata.get("tags", "")).lower()
searchable = " ".join([source_file, title, tags])
project = get_registered_project(project_hint)
candidate_names = {hint_lower}
if project is not None:
candidate_names.add(project.project_id.lower())
candidate_names.update(alias.lower() for alias in project.aliases)
candidate_names.update(
source_ref.subpath.replace("\\", "/").strip("/").split("/")[-1].lower()
for source_ref in project.ingest_roots
if source_ref.subpath.strip("/\\")
)
for candidate in candidate_names:
if candidate and candidate in searchable:
return 2.0
return 1.0
def _existing_chunk_ids(chunk_ids: list[str]) -> set[str]:
"""Filter out stale vector entries whose chunk rows no longer exist."""
if not chunk_ids:
return set()
placeholders = ", ".join("?" for _ in chunk_ids)
with get_connection() as conn:
rows = conn.execute(
f"SELECT id FROM source_chunks WHERE id IN ({placeholders})",
chunk_ids,
).fetchall()
return {row["id"] for row in rows}