feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation:
- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
83
src/atocore/retrieval/retriever.py
Normal file
83
src/atocore/retrieval/retriever.py
Normal file
@@ -0,0 +1,83 @@
|
||||
"""Retrieval: query → ranked chunks."""
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
|
||||
from atocore.config import settings
|
||||
from atocore.observability.logger import get_logger
|
||||
from atocore.retrieval.embeddings import embed_query
|
||||
from atocore.retrieval.vector_store import get_vector_store
|
||||
|
||||
log = get_logger("retriever")
|
||||
|
||||
|
||||
@dataclass
class ChunkResult:
    """A single retrieved chunk together with its relevance score and metadata."""

    # Identifier of the chunk as returned by the vector store.
    chunk_id: str
    # Text of the chunk.
    content: str
    # Relevance score; retrieve() computes it as 1 - distance.
    score: float
    # Metadata fields copied from the vector store's per-chunk metadata.
    heading_path: str
    source_file: str
    # NOTE(review): appears to be a JSON-encoded list kept as a string
    # (retrieve() defaults it to "[]") - confirm against the ingestion code.
    tags: str
    title: str
    document_id: str
|
||||
|
||||
|
||||
def retrieve(
    query: str,
    top_k: int | None = None,
    filter_tags: list[str] | None = None,
) -> list[ChunkResult]:
    """Retrieve the most relevant chunks for a query.

    Args:
        query: Free-text query; it is embedded with ``embed_query`` before
            being sent to the vector store.
        top_k: Maximum number of chunks to request. ``None`` (or any falsy
            value, including 0) falls back to ``settings.context_top_k``.
        filter_tags: Optional tag filter. Only the FIRST tag is applied;
            additional tags are currently ignored.

    Returns:
        One ``ChunkResult`` per hit, in the order the vector store returned
        them. ``score`` is ``1 - distance`` rounded to 4 decimals.
    """
    top_k = top_k or settings.context_top_k
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # (or go negative) when the wall clock is adjusted mid-request.
    start = time.perf_counter()

    query_embedding = embed_query(query)
    store = get_vector_store()

    # Build filter
    where = None
    if filter_tags:
        # Tags are stored as a JSON string, so this is a substring match on
        # the first requested tag only.
        # NOTE(review): confirm the installed ChromaDB version accepts
        # `$contains` inside a metadata `where` filter.
        where = {"tags": {"$contains": filter_tags[0]}}

    results = store.query(
        query_embedding=query_embedding,
        top_k=top_k,
        where=where,
    )

    chunks: list[ChunkResult] = []
    if results and results["ids"] and results["ids"][0]:
        # Each result field is a list with one entry per query; we issue a
        # single query, hence the [0]. A field may be empty/None if the store
        # did not include it.
        distances = results["distances"][0] if results["distances"] else None
        metadatas = results["metadatas"][0] if results["metadatas"] else None
        documents = results["documents"][0] if results["documents"] else None

        for i, chunk_id in enumerate(results["ids"][0]):
            # ChromaDB returns distances (lower = more similar for cosine);
            # convert to a similarity score (1 - distance).
            distance = distances[i] if distances else 0
            score = 1.0 - distance
            # Individual entries can be None (e.g. a record stored without
            # metadata); coalesce so the .get() calls below cannot fail.
            meta = (metadatas[i] if metadatas else None) or {}
            content = (documents[i] if documents else None) or ""

            chunks.append(
                ChunkResult(
                    chunk_id=chunk_id,
                    content=content,
                    score=round(score, 4),
                    heading_path=meta.get("heading_path", ""),
                    source_file=meta.get("source_file", ""),
                    tags=meta.get("tags", "[]"),
                    title=meta.get("title", ""),
                    document_id=meta.get("document_id", ""),
                )
            )

    duration_ms = int((time.perf_counter() - start) * 1000)
    log.info(
        "retrieval_done",
        query=query[:100],  # truncate so long prompts do not bloat the log
        top_k=top_k,
        results_count=len(chunks),
        duration_ms=duration_ms,
    )

    return chunks
|
||||
Reference in New Issue
Block a user