feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)

Complete implementation of the personal context engine foundation:
- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-05 09:21:27 -04:00
parent 32ce409a7b
commit b4afbbb53a
34 changed files with 1756 additions and 0 deletions

View File

View File

@@ -0,0 +1,32 @@
"""Embedding model management."""
from sentence_transformers import SentenceTransformer
from atocore.config import settings
from atocore.observability.logger import get_logger
log = get_logger("embeddings")
_model: SentenceTransformer | None = None
def get_model() -> SentenceTransformer:
    """Return the process-wide embedding model, loading it on first use.

    The loaded SentenceTransformer is cached in the module-level ``_model``
    so every caller shares a single instance.
    """
    global _model
    if _model is not None:
        return _model
    log.info("loading_embedding_model", model=settings.embedding_model)
    _model = SentenceTransformer(settings.embedding_model)
    log.info("embedding_model_loaded", model=settings.embedding_model)
    return _model
def embed_texts(texts: list[str]) -> list[list[float]]:
    """Embed *texts*, returning one unit-normalized vector per input string."""
    vectors = get_model().encode(
        texts,
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    return vectors.tolist()
def embed_query(query: str) -> list[float]:
    """Embed a single *query* string and return its vector."""
    (vector,) = embed_texts([query])
    return vector

View File

@@ -0,0 +1,83 @@
"""Retrieval: query → ranked chunks."""
import json
import time
from dataclasses import dataclass

from atocore.config import settings
from atocore.observability.logger import get_logger
from atocore.retrieval.embeddings import embed_query
from atocore.retrieval.vector_store import get_vector_store
log = get_logger("retriever")
@dataclass
class ChunkResult:
    """One retrieved chunk with its similarity score and source metadata."""

    chunk_id: str  # vector-store ID of the chunk
    content: str  # chunk text as stored in the vector store
    score: float  # cosine similarity (1 - distance), rounded to 4 places
    heading_path: str  # markdown heading path; "" when metadata is absent
    source_file: str  # originating file path; "" when metadata is absent
    tags: str  # JSON-encoded list of tags as a string; "[]" when absent
    title: str  # document title; "" when metadata is absent
    document_id: str  # parent document ID; "" when metadata is absent
def retrieve(
    query: str,
    top_k: int | None = None,
    filter_tags: list[str] | None = None,
) -> list[ChunkResult]:
    """Retrieve the most relevant chunks for a query.

    Args:
        query: Natural-language query text.
        top_k: Number of chunks to return; defaults to ``settings.context_top_k``.
        filter_tags: Optional tag list; when given, only chunks whose stored
            tag list contains at least one of these tags are returned.

    Returns:
        At most ``top_k`` chunks, in descending similarity order.
    """
    top_k = top_k or settings.context_top_k
    start = time.time()
    query_embedding = embed_query(query)
    store = get_vector_store()
    # BUG FIX: ChromaDB metadata `where` filters have no `$contains` operator
    # (that operator exists only for `where_document`), so the previous
    # {"tags": {"$contains": ...}} filter raised at query time. Tags are
    # stored as a JSON string in chunk metadata, so filter client-side
    # instead, over-fetching to compensate for chunks the filter drops.
    fetch_k = top_k * 5 if filter_tags else top_k
    results = store.query(
        query_embedding=query_embedding,
        top_k=fetch_k,
    )
    chunks: list[ChunkResult] = []
    if results and results["ids"] and results["ids"][0]:
        for i, chunk_id in enumerate(results["ids"][0]):
            # ChromaDB returns distances (lower = more similar for cosine);
            # convert to a similarity score (1 - distance).
            distance = results["distances"][0][i] if results["distances"] else 0
            meta = results["metadatas"][0][i] if results["metadatas"] else {}
            if filter_tags:
                try:
                    chunk_tags = set(json.loads(meta.get("tags", "[]")))
                except (TypeError, ValueError):
                    chunk_tags = set()
                # Any-of semantics: keep the chunk if it carries any requested tag.
                if not chunk_tags.intersection(filter_tags):
                    continue
            content = results["documents"][0][i] if results["documents"] else ""
            chunks.append(
                ChunkResult(
                    chunk_id=chunk_id,
                    content=content,
                    score=round(1.0 - distance, 4),
                    heading_path=meta.get("heading_path", ""),
                    source_file=meta.get("source_file", ""),
                    tags=meta.get("tags", "[]"),
                    title=meta.get("title", ""),
                    document_id=meta.get("document_id", ""),
                )
            )
            if len(chunks) >= top_k:
                break
    duration_ms = int((time.time() - start) * 1000)
    log.info(
        "retrieval_done",
        query=query[:100],
        top_k=top_k,
        results_count=len(chunks),
        duration_ms=duration_ms,
    )
    return chunks

View File

@@ -0,0 +1,77 @@
"""ChromaDB vector store wrapper."""
import chromadb
from atocore.config import settings
from atocore.observability.logger import get_logger
from atocore.retrieval.embeddings import embed_texts
log = get_logger("vector_store")
COLLECTION_NAME = "atocore_chunks"
_store: "VectorStore | None" = None
class VectorStore:
    """Wrapper around ChromaDB for chunk storage and retrieval."""

    def __init__(self) -> None:
        """Open (or create) the persistent collection under ``settings.chroma_path``."""
        settings.chroma_path.mkdir(parents=True, exist_ok=True)
        client = chromadb.PersistentClient(path=str(settings.chroma_path))
        self._client = client
        # Cosine space so query distances map cleanly onto similarity scores.
        self._collection = client.get_or_create_collection(
            name=COLLECTION_NAME,
            metadata={"hnsw:space": "cosine"},
        )
        log.info("vector_store_initialized", path=str(settings.chroma_path))

    def add(
        self,
        ids: list[str],
        documents: list[str],
        metadatas: list[dict],
    ) -> None:
        """Embed *documents* and store them under *ids* with their *metadatas*."""
        self._collection.add(
            ids=ids,
            embeddings=embed_texts(documents),
            documents=documents,
            metadatas=metadatas,
        )
        log.debug("vectors_added", count=len(ids))

    def query(
        self,
        query_embedding: list[float],
        top_k: int = 10,
        where: dict | None = None,
    ) -> dict:
        """Return the *top_k* chunks nearest to *query_embedding*.

        An optional *where* dict is forwarded to ChromaDB as a metadata filter.
        """
        params: dict = {
            "query_embeddings": [query_embedding],
            "n_results": top_k,
            "include": ["documents", "metadatas", "distances"],
        }
        if where:
            params["where"] = where
        return self._collection.query(**params)

    def delete(self, ids: list[str]) -> None:
        """Delete chunks by ID; a no-op for an empty list."""
        if not ids:
            return
        self._collection.delete(ids=ids)
        log.debug("vectors_deleted", count=len(ids))

    @property
    def count(self) -> int:
        """Number of chunks currently stored in the collection."""
        return self._collection.count()
def get_vector_store() -> VectorStore:
    """Return the shared VectorStore singleton, constructing it on first call."""
    global _store
    if _store is not None:
        return _store
    _store = VectorStore()
    return _store