Files
ATOCore/src/atocore/retrieval/retriever.py
Anto01 c9b9eede25 feat: tunable ranking, refresh status, chroma backup + admin endpoints
Three small improvements that move the operational baseline forward
without changing the existing trust model.

1. Tunable retrieval ranking weights
   - rank_project_match_boost, rank_query_token_step,
     rank_query_token_cap, rank_path_high_signal_boost,
     rank_path_low_signal_penalty are now Settings fields
   - all overridable via ATOCORE_* env vars
   - retriever no longer hard-codes 2.0 / 1.18 / 0.72 / 0.08 / 1.32
   - lets ranking be tuned per environment as Wave 1 is exercised
     without code changes

2. /projects/{name}/refresh status
   - refresh_registered_project now returns an overall status field
     ("ingested", "partial", "nothing_to_ingest") plus roots_ingested
     and roots_skipped counters
   - ProjectRefreshResponse advertises the new fields so callers can
     rely on them
   - covers the case where every configured root is missing on disk

3. Chroma cold snapshot + admin backup endpoints
   - create_runtime_backup now accepts include_chroma and writes a
     cold directory copy of the chroma persistence path
   - new list_runtime_backups() and validate_backup() helpers
   - new endpoints:
     - POST /admin/backup            create snapshot (optional chroma)
     - GET  /admin/backup            list snapshots
     - GET  /admin/backup/{stamp}/validate  structural validation
   - chroma snapshots are taken under exclusive_ingestion() so a refresh
     or ingest cannot race with the cold copy
   - backup metadata records what was actually included and how big

Tests:
- 8 new tests covering tunable weights, refresh status branches
  (ingested / partial / nothing_to_ingest), chroma snapshot, list,
  validate, and the API endpoints (including the lock-acquisition path)
- existing fake refresh stubs in test_api_storage.py updated for the
  expanded ProjectRefreshResponse model
- full suite: 105 passing (was 97)

next-steps doc updated to reflect that the chroma snapshot + restore
validation gap from current-state.md is now closed in code; only the
operational retention policy remains.
2026-04-06 18:42:19 -04:00

237 lines
6.5 KiB
Python

"""Retrieval: query to ranked chunks."""
import re
import time
from dataclasses import dataclass
import atocore.config as _config
from atocore.models.database import get_connection
from atocore.observability.logger import get_logger
from atocore.projects.registry import get_registered_project
from atocore.retrieval.embeddings import embed_query
from atocore.retrieval.vector_store import get_vector_store
log = get_logger("retriever")
# Query words that are too generic to count as evidence of a match; they are
# discarded before query tokens are compared against chunk metadata.
_STOP_TOKENS = {
    "about", "and", "current", "for", "from",
    "into", "like", "project", "shared", "system",
    "that", "the", "this", "what", "with",
}

# Substrings that, when found in a chunk's source path, title or heading
# path, mark it as high-signal material (the path-signal boost applies).
_HIGH_SIGNAL_HINTS = (
    "status", "decision",
    "requirements", "requirement",
    "roadmap", "charter",
    "system-map", "system_map",
    "contracts", "schema", "architecture", "workflow",
    "error-budget", "comparison-matrix", "selection-decision",
)

# Substrings that mark archival or historical material (the path-signal
# penalty applies). Archive folders are listed with both separator styles.
_LOW_SIGNAL_HINTS = (
    "/_archive/", "\\_archive\\",
    "/archive/", "\\archive\\",
    "_history", "history",
    "pre-cleanup", "pre-migration",
    "reviews/",
)
@dataclass
class ChunkResult:
    """A single retrieved chunk together with its final ranking score.

    The field order defines the positional constructor, so it must not be
    rearranged.
    """

    # Identifier and text of the chunk as returned by the vector store.
    chunk_id: str
    content: str

    # Relevance after all ranking multipliers; retrieve() rounds it to
    # four decimal places.
    score: float

    # Descriptive metadata copied from the vector store entry; retrieve()
    # substitutes "" (or "[]" for tags) when a key is absent.
    heading_path: str
    source_file: str
    tags: str  # serialized tag list, kept as a string
    title: str
    document_id: str
def retrieve(
    query: str,
    top_k: int | None = None,
    filter_tags: list[str] | None = None,
    project_hint: str | None = None,
) -> list[ChunkResult]:
    """Retrieve the most relevant chunks for a query.

    The query is embedded and matched against the vector store. Each hit
    starts from a base score of ``1.0 - distance`` which is then multiplied
    by the query-match boost, the path-signal boost and, when
    ``project_hint`` is given, the project-match boost.

    Args:
        query: Free-text query.
        top_k: Number of hits requested from the vector store. A falsy
            value falls back to ``settings.context_top_k``.
        filter_tags: When given, only chunks whose ``tags`` metadata
            contains every listed tag (as a double-quoted string) are
            requested.
        project_hint: Optional project name used to boost chunks whose
            metadata mentions that project.

    Returns:
        Chunks sorted by descending score. The list can be shorter than
        ``top_k`` because hits without a surviving ``source_chunks`` row
        are dropped.
    """
    top_k = top_k or _config.settings.context_top_k
    # perf_counter is monotonic, so a wall-clock adjustment during the call
    # cannot distort (or negate) the logged duration.
    started = time.perf_counter()

    query_embedding = embed_query(query)
    store = get_vector_store()

    where = None
    if filter_tags:
        clauses = [{"tags": {"$contains": f'"{tag}"'}} for tag in filter_tags]
        # A lone clause is passed bare; several are combined under "$and".
        where = clauses[0] if len(clauses) == 1 else {"$and": clauses}

    results = store.query(
        query_embedding=query_embedding,
        top_k=top_k,
        where=where,
    )

    chunks: list[ChunkResult] = []
    if results and results["ids"] and results["ids"][0]:
        hit_ids = results["ids"][0]
        # Vector entries can outlive their database rows; skip those.
        existing_ids = _existing_chunk_ids(hit_ids)
        distances = results["distances"][0] if results["distances"] else None
        metadatas = results["metadatas"][0] if results["metadatas"] else None
        documents = results["documents"][0] if results["documents"] else None

        for i, chunk_id in enumerate(hit_ids):
            if chunk_id not in existing_ids:
                continue

            distance = distances[i] if distances is not None else 0
            # An individual metadata/document slot may be None; fall back
            # to empty values instead of failing on ``None.get``.
            meta = (metadatas[i] if metadatas is not None else None) or {}
            content = (documents[i] if documents is not None else None) or ""

            # NOTE(review): when distance > 1 the base score is negative and
            # boosts > 1 push it further down - confirm the store's distance
            # range keeps scores non-negative.
            score = 1.0 - distance
            score *= _query_match_boost(query, meta)
            score *= _path_signal_boost(meta)
            if project_hint:
                score *= _project_match_boost(project_hint, meta)

            chunks.append(
                ChunkResult(
                    chunk_id=chunk_id,
                    content=content,
                    score=round(score, 4),
                    heading_path=meta.get("heading_path", ""),
                    source_file=meta.get("source_file", ""),
                    tags=meta.get("tags", "[]"),
                    title=meta.get("title", ""),
                    document_id=meta.get("document_id", ""),
                )
            )

    duration_ms = int((time.perf_counter() - started) * 1000)
    chunks.sort(key=lambda chunk: chunk.score, reverse=True)

    log.info(
        "retrieval_done",
        query=query[:100],
        top_k=top_k,
        results_count=len(chunks),
        duration_ms=duration_ms,
    )
    return chunks
def _project_match_boost(project_hint: str, metadata: dict) -> float:
    """Return the ranking multiplier for a chunk given a project hint.

    The chunk's source file, title and tags are searched (case-insensitive
    substring match) for the hint itself and, when the hint resolves to a
    registered project, for that project's id, its aliases and the last
    path segment of each ingest root.

    Returns ``settings.rank_project_match_boost`` on the first match and
    ``1.0`` when the hint is blank or nothing matches.
    """
    normalized_hint = project_hint.strip().lower()
    if not normalized_hint:
        return 1.0

    haystack = " ".join(
        str(metadata.get(field, "")).lower()
        for field in ("source_file", "title", "tags")
    )

    names = {normalized_hint}
    registered = get_registered_project(project_hint)
    if registered is not None:
        names.add(registered.project_id.lower())
        for alias in registered.aliases:
            names.add(alias.lower())
        for root in registered.ingest_roots:
            # A root without a real subpath contributes no folder name.
            if not root.subpath.strip("/\\"):
                continue
            leaf = root.subpath.replace("\\", "/").strip("/").split("/")[-1]
            names.add(leaf.lower())

    if any(name and name in haystack for name in names):
        return _config.settings.rank_project_match_boost
    return 1.0
def _query_match_boost(query: str, metadata: dict) -> float:
    """Return a multiplier rewarding chunks whose metadata echoes the query.

    Query tokens are lowercase runs of at least three characters (letters,
    digits, ``_`` or ``-``, starting with a letter or digit) that are not
    stop tokens. Each distinct token found as a substring of the chunk's
    source file, title or heading path adds
    ``settings.rank_query_token_step`` to the multiplier, which is capped
    at ``settings.rank_query_token_cap``. Without usable tokens or matches
    the multiplier is ``1.0``.
    """
    words = re.findall(r"[a-z0-9][a-z0-9_-]{2,}", query.lower())
    terms = {word for word in words if word not in _STOP_TOKENS}
    if not terms:
        return 1.0

    haystack = " ".join(
        str(metadata.get(field, "")).lower()
        for field in ("source_file", "title", "heading_path")
    )

    hit_count = 0
    for term in terms:
        if term in haystack:
            hit_count += 1
    if hit_count <= 0:
        return 1.0

    boosted = 1.0 + hit_count * _config.settings.rank_query_token_step
    return min(boosted, _config.settings.rank_query_token_cap)
def _path_signal_boost(metadata: dict) -> float:
    """Return a multiplier based on where a chunk lives and what it is called.

    The lowercased source file, title and heading path are scanned for the
    module's low-signal and high-signal hint substrings. A low-signal hit
    multiplies by ``settings.rank_path_low_signal_penalty``; a high-signal
    hit multiplies by ``settings.rank_path_high_signal_boost``. Both can
    apply to the same chunk; with neither the result is ``1.0``.
    """
    haystack = " ".join(
        str(metadata.get(field, "")).lower()
        for field in ("source_file", "title", "heading_path")
    )

    def mentions_any(hints) -> bool:
        # True as soon as one hint occurs anywhere in the haystack.
        for hint in hints:
            if hint in haystack:
                return True
        return False

    factor = 1.0
    if mentions_any(_LOW_SIGNAL_HINTS):
        factor *= _config.settings.rank_path_low_signal_penalty
    if mentions_any(_HIGH_SIGNAL_HINTS):
        factor *= _config.settings.rank_path_high_signal_boost
    return factor
def _existing_chunk_ids(chunk_ids: list[str]) -> set[str]:
    """Return the subset of ``chunk_ids`` that still has a ``source_chunks`` row.

    Used to discard stale vector-store hits whose chunk rows were removed.
    An empty input returns an empty set without opening a connection.
    """
    if not chunk_ids:
        return set()

    # One "?" per id keeps the statement fully parameterized.
    marks = ", ".join(["?"] * len(chunk_ids))
    sql = f"SELECT id FROM source_chunks WHERE id IN ({marks})"
    with get_connection() as conn:
        found = conn.execute(sql, chunk_ids).fetchall()

    surviving = set()
    for row in found:
        surviving.add(row["id"])
    return surviving