feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
0
src/atocore/retrieval/__init__.py
Normal file
0
src/atocore/retrieval/__init__.py
Normal file
32
src/atocore/retrieval/embeddings.py
Normal file
32
src/atocore/retrieval/embeddings.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""Embedding model management."""
|
||||
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
from atocore.config import settings
|
||||
from atocore.observability.logger import get_logger
|
||||
|
||||
# Module-level structured logger for embedding events.
log = get_logger("embeddings")

# Process-wide cache for the loaded SentenceTransformer; populated lazily
# by get_model() on first use so importing this module stays cheap.
_model: SentenceTransformer | None = None
|
||||
|
||||
|
||||
def get_model() -> SentenceTransformer:
    """Return the process-wide embedding model, loading it on first call.

    The model named by ``settings.embedding_model`` is instantiated once and
    cached in the module-level ``_model``; subsequent calls return the cached
    instance without touching disk or network.
    """
    global _model
    if _model is not None:
        return _model
    log.info("loading_embedding_model", model=settings.embedding_model)
    _model = SentenceTransformer(settings.embedding_model)
    log.info("embedding_model_loaded", model=settings.embedding_model)
    return _model
|
||||
|
||||
|
||||
def embed_texts(texts: list[str]) -> list[list[float]]:
    """Embed *texts* and return one vector per input text.

    Vectors are unit-normalized (``normalize_embeddings=True``) so cosine
    similarity downstream reduces to a dot product; the numpy result is
    converted to plain nested lists before returning.
    """
    vectors = get_model().encode(
        texts,
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    return vectors.tolist()
|
||||
|
||||
|
||||
def embed_query(query: str) -> list[float]:
    """Embed a single query string via embed_texts and unwrap the result."""
    vectors = embed_texts([query])
    return vectors[0]
|
||||
83
src/atocore/retrieval/retriever.py
Normal file
83
src/atocore/retrieval/retriever.py
Normal file
@@ -0,0 +1,83 @@
|
||||
"""Retrieval: query → ranked chunks."""
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
|
||||
from atocore.config import settings
|
||||
from atocore.observability.logger import get_logger
|
||||
from atocore.retrieval.embeddings import embed_query
|
||||
from atocore.retrieval.vector_store import get_vector_store
|
||||
|
||||
# Module-level structured logger for retrieval events.
log = get_logger("retriever")
|
||||
|
||||
|
||||
@dataclass
class ChunkResult:
    """One retrieved chunk with its similarity score and source metadata."""

    chunk_id: str      # unique ID of the chunk in the vector store
    content: str       # chunk text as stored at ingestion time
    score: float       # cosine similarity (1 - distance), rounded to 4 places
    heading_path: str  # markdown heading trail the chunk came from
    source_file: str   # path of the originating file
    tags: str          # tags serialized as a JSON string (e.g. "[]")
    title: str         # title of the parent document
    document_id: str   # ID of the parent document row
|
||||
|
||||
|
||||
def retrieve(
    query: str,
    top_k: int | None = None,
    filter_tags: list[str] | None = None,
) -> list[ChunkResult]:
    """Retrieve the most relevant chunks for a query.

    Embeds the query, searches the vector store, and converts the raw
    ChromaDB response into ChunkResult objects in the order the store
    returns them (nearest first).

    Args:
        query: Free-text search string.
        top_k: Maximum results; falls back to ``settings.context_top_k``
            when ``None`` (or any falsy value, matching the original ``or``).
        filter_tags: Optional tag filter; only the first tag is applied.

    Returns:
        List of ChunkResult, best match first; empty when nothing matches.
    """
    effective_k = top_k or settings.context_top_k
    started = time.time()

    store = get_vector_store()
    query_vector = embed_query(query)

    # Optional single-tag metadata filter (tags are stored as a JSON string).
    # NOTE(review): "$contains" is documented by ChromaDB for where_document
    # filters; confirm the installed chromadb version accepts it inside a
    # metadata `where` clause.
    where = {"tags": {"$contains": filter_tags[0]}} if filter_tags else None

    results = store.query(
        query_embedding=query_vector,
        top_k=effective_k,
        where=where,
    )

    hits: list[ChunkResult] = []
    if results and results["ids"] and results["ids"][0]:
        ids = results["ids"][0]
        # Each of these parallel lists may be missing/empty in the response;
        # hoist the presence checks out of the per-item loop.
        distances = results["distances"][0] if results["distances"] else None
        metadatas = results["metadatas"][0] if results["metadatas"] else None
        documents = results["documents"][0] if results["documents"] else None
        for idx, chunk_id in enumerate(ids):
            # ChromaDB reports cosine *distance* (lower = more similar);
            # convert to a similarity score via 1 - distance.
            distance = distances[idx] if distances is not None else 0
            meta = metadatas[idx] if metadatas is not None else {}
            hits.append(
                ChunkResult(
                    chunk_id=chunk_id,
                    content=documents[idx] if documents is not None else "",
                    score=round(1.0 - distance, 4),
                    heading_path=meta.get("heading_path", ""),
                    source_file=meta.get("source_file", ""),
                    tags=meta.get("tags", "[]"),
                    title=meta.get("title", ""),
                    document_id=meta.get("document_id", ""),
                )
            )

    log.info(
        "retrieval_done",
        query=query[:100],
        top_k=effective_k,
        results_count=len(hits),
        duration_ms=int((time.time() - started) * 1000),
    )

    return hits
|
||||
77
src/atocore/retrieval/vector_store.py
Normal file
77
src/atocore/retrieval/vector_store.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""ChromaDB vector store wrapper."""
|
||||
|
||||
import chromadb
|
||||
|
||||
from atocore.config import settings
|
||||
from atocore.observability.logger import get_logger
|
||||
from atocore.retrieval.embeddings import embed_texts
|
||||
|
||||
# Module-level structured logger for vector-store events.
log = get_logger("vector_store")

# Name of the single ChromaDB collection holding every ingested chunk.
COLLECTION_NAME = "atocore_chunks"

# Lazily-created singleton; use get_vector_store() rather than instantiating
# VectorStore directly. (Quoted annotation: VectorStore is defined below.)
_store: "VectorStore | None" = None
|
||||
|
||||
|
||||
class VectorStore:
    """Wrapper around ChromaDB for chunk storage and retrieval."""

    def __init__(self) -> None:
        """Open (or create) the persistent collection under settings.chroma_path."""
        chroma_dir = settings.chroma_path
        # Make sure the on-disk directory exists before Chroma opens it.
        chroma_dir.mkdir(parents=True, exist_ok=True)
        self._client = chromadb.PersistentClient(path=str(chroma_dir))
        self._collection = self._client.get_or_create_collection(
            name=COLLECTION_NAME,
            # Cosine distance matches the normalized embeddings produced upstream.
            metadata={"hnsw:space": "cosine"},
        )
        log.info("vector_store_initialized", path=str(chroma_dir))

    def add(
        self,
        ids: list[str],
        documents: list[str],
        metadatas: list[dict],
    ) -> None:
        """Embed *documents* and insert them under *ids* with *metadatas*."""
        self._collection.add(
            ids=ids,
            embeddings=embed_texts(documents),
            documents=documents,
            metadatas=metadatas,
        )
        log.debug("vectors_added", count=len(ids))

    def query(
        self,
        query_embedding: list[float],
        top_k: int = 10,
        where: dict | None = None,
    ) -> dict:
        """Return the *top_k* nearest chunks for *query_embedding*.

        ``where`` (metadata filter) is forwarded only when truthy, since
        ChromaDB rejects empty filter dicts.
        """
        params: dict = dict(
            query_embeddings=[query_embedding],
            n_results=top_k,
            include=["documents", "metadatas", "distances"],
        )
        if where:
            params["where"] = where
        return self._collection.query(**params)

    def delete(self, ids: list[str]) -> None:
        """Remove chunks by ID; a no-op for an empty list."""
        if not ids:
            return
        self._collection.delete(ids=ids)
        log.debug("vectors_deleted", count=len(ids))

    @property
    def count(self) -> int:
        """Number of vectors currently stored in the collection."""
        return self._collection.count()
|
||||
|
||||
|
||||
def get_vector_store() -> VectorStore:
    """Return the process-wide VectorStore, creating it on first call."""
    global _store
    if _store is not None:
        return _store
    _store = VectorStore()
    return _store
|
||||
Reference in New Issue
Block a user