Add project-aware boost to raw query

This commit is contained in:
2026-04-06 13:32:33 -04:00
parent 4aa2b696a9
commit 26bfa94c65
4 changed files with 117 additions and 1 deletions

View File

@@ -6,6 +6,7 @@ from dataclasses import dataclass
import atocore.config as _config
from atocore.models.database import get_connection
from atocore.observability.logger import get_logger
from atocore.projects.registry import get_registered_project
from atocore.retrieval.embeddings import embed_query
from atocore.retrieval.vector_store import get_vector_store
@@ -28,6 +29,7 @@ def retrieve(
query: str,
top_k: int | None = None,
filter_tags: list[str] | None = None,
project_hint: str | None = None,
) -> list[ChunkResult]:
"""Retrieve the most relevant chunks for a query."""
top_k = top_k or _config.settings.context_top_k
@@ -71,6 +73,9 @@ def retrieve(
meta = results["metadatas"][0][i] if results["metadatas"] else {}
content = results["documents"][0][i] if results["documents"] else ""
if project_hint:
score *= _project_match_boost(project_hint, meta)
chunks.append(
ChunkResult(
chunk_id=chunk_id,
@@ -85,6 +90,8 @@ def retrieve(
)
duration_ms = int((time.time() - start) * 1000)
chunks.sort(key=lambda chunk: chunk.score, reverse=True)
log.info(
"retrieval_done",
query=query[:100],
@@ -96,6 +103,35 @@ def retrieve(
return chunks
def _project_match_boost(project_hint: str, metadata: dict) -> float:
"""Return a project-aware relevance multiplier for raw retrieval."""
hint_lower = project_hint.strip().lower()
if not hint_lower:
return 1.0
source_file = str(metadata.get("source_file", "")).lower()
title = str(metadata.get("title", "")).lower()
tags = str(metadata.get("tags", "")).lower()
searchable = " ".join([source_file, title, tags])
project = get_registered_project(project_hint)
candidate_names = {hint_lower}
if project is not None:
candidate_names.add(project.project_id.lower())
candidate_names.update(alias.lower() for alias in project.aliases)
candidate_names.update(
source_ref.subpath.replace("\\", "/").strip("/").split("/")[-1].lower()
for source_ref in project.ingest_roots
if source_ref.subpath.strip("/\\")
)
for candidate in candidate_names:
if candidate and candidate in searchable:
return 2.0
return 1.0
def _existing_chunk_ids(chunk_ids: list[str]) -> set[str]:
"""Filter out stale vector entries whose chunk rows no longer exist."""
if not chunk_ids: