feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)

Complete implementation of the personal context engine foundation:
- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-05 09:21:27 -04:00
parent 32ce409a7b
commit b4afbbb53a
34 changed files with 1756 additions and 0 deletions

3
src/atocore/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
"""AtoCore — Personal Context Engine."""
__version__ = "0.1.0"

View File

132
src/atocore/api/routes.py Normal file
View File

@@ -0,0 +1,132 @@
"""FastAPI route definitions."""
from pathlib import Path
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from atocore.context.builder import (
ContextPack,
build_context,
get_last_context_pack,
_pack_to_dict,
)
from atocore.ingestion.pipeline import ingest_file, ingest_folder
from atocore.retrieval.retriever import retrieve
from atocore.retrieval.vector_store import get_vector_store
router = APIRouter()
# --- Request/Response models ---
class IngestRequest(BaseModel):
    """Request body for ``POST /ingest``."""

    path: str  # file or folder path
class IngestResponse(BaseModel):
    """Response body for ``POST /ingest``."""

    # One stats dict per processed file, as produced by the ingestion pipeline.
    results: list[dict]
class QueryRequest(BaseModel):
    """Request body for ``POST /query``."""

    prompt: str  # text to retrieve chunks for
    top_k: int = 10  # number of results requested from the retriever
    filter_tags: list[str] | None = None  # optional tag filter passed to retrieve()
class QueryResponse(BaseModel):
    """Response body for ``POST /query``."""

    # One dict per retrieved chunk (chunk_id, content, score, heading_path,
    # source_file, title).
    results: list[dict]
class ContextBuildRequest(BaseModel):
    """Request body for ``POST /context/build``."""

    prompt: str  # user prompt the context is built for
    project: str | None = None  # forwarded to build_context() as project_hint
    budget: int | None = None  # forwarded to build_context() as budget
class ContextBuildResponse(BaseModel):
    """Response body for ``POST /context/build``.

    The fields mirror the ``ContextPack`` returned by ``build_context``.
    """

    formatted_context: str  # rendered context block
    full_prompt: str  # system prefix + context block + user prompt
    chunks_used: int  # number of chunks included in the pack
    total_chars: int  # summed character count of the included chunks
    budget: int  # character budget that was applied
    budget_remaining: int  # budget minus total_chars
    duration_ms: int  # time spent building the pack
    chunks: list[dict]  # per-chunk summaries taken from _pack_to_dict()
# --- Endpoints ---
@router.post("/ingest", response_model=IngestResponse)
def api_ingest(req: IngestRequest):
    """Ingest a markdown file or a folder of markdown files.

    Args:
        req: Request carrying the file or folder path to ingest.

    Returns:
        An ``IngestResponse`` with one result dict per processed file.

    Raises:
        HTTPException: 404 when ``req.path`` is neither an existing file nor
            a directory; 400 when a single file is rejected by
            ``ingest_file`` with a ``ValueError`` (e.g. not a markdown file).
    """
    target = Path(req.path)
    if target.is_file():
        try:
            results = [ingest_file(target)]
        except ValueError as exc:
            # ingest_file raises ValueError for non-markdown input; report it
            # as a client error instead of letting it surface as a 500.
            raise HTTPException(status_code=400, detail=str(exc)) from exc
    elif target.is_dir():
        results = ingest_folder(target)
    else:
        raise HTTPException(status_code=404, detail=f"Path not found: {req.path}")
    return IngestResponse(results=results)
@router.post("/query", response_model=QueryResponse)
def api_query(req: QueryRequest):
    """Return the chunks retrieved for ``req.prompt``.

    Each result exposes a fixed subset of the retrieved chunk's attributes.
    """
    exposed_fields = (
        "chunk_id",
        "content",
        "score",
        "heading_path",
        "source_file",
        "title",
    )
    hits = retrieve(req.prompt, top_k=req.top_k, filter_tags=req.filter_tags)
    payload = [
        {name: getattr(hit, name) for name in exposed_fields} for hit in hits
    ]
    return QueryResponse(results=payload)
@router.post("/context/build", response_model=ContextBuildResponse)
def api_build_context(req: ContextBuildRequest):
    """Assemble a context pack for ``req.prompt`` and return it as JSON.

    The per-chunk summaries come from ``_pack_to_dict``; all other fields are
    read directly from the pack.
    """
    pack = build_context(
        user_prompt=req.prompt,
        project_hint=req.project,
        budget=req.budget,
    )
    chunk_summaries = _pack_to_dict(pack)["chunks"]
    payload = {
        "formatted_context": pack.formatted_context,
        "full_prompt": pack.full_prompt,
        "chunks_used": len(pack.chunks_used),
        "total_chars": pack.total_chars,
        "budget": pack.budget,
        "budget_remaining": pack.budget_remaining,
        "duration_ms": pack.duration_ms,
        "chunks": chunk_summaries,
    }
    return ContextBuildResponse(**payload)
@router.get("/health")
def api_health():
    """Health check.

    Returns:
        A dict with a fixed ``"ok"`` status, the package version and the
        number of vectors currently held by the vector store.
    """
    # Imported here so the version is read from the package instead of being
    # duplicated as a literal that can drift from atocore.__version__.
    from atocore import __version__

    store = get_vector_store()
    return {
        "status": "ok",
        "version": __version__,
        "vectors_count": store.count,
    }
@router.get("/debug/context")
def api_debug_context():
    """Return the most recently built context pack, or a notice if none exists."""
    latest = get_last_context_pack()
    if latest is not None:
        return _pack_to_dict(latest)
    return {"message": "No context pack built yet."}

39
src/atocore/config.py Normal file
View File

@@ -0,0 +1,39 @@
"""AtoCore configuration via environment variables."""
from pathlib import Path
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    """AtoCore runtime configuration.

    Declared on a pydantic-settings ``BaseSettings`` model configured with the
    ``ATOCORE_`` env prefix (see ``model_config``).
    """

    debug: bool = False
    data_dir: Path = Path("./data")  # parent of the SQLite file and Chroma directory
    host: str = "127.0.0.1"
    port: int = 8100
    # Embedding
    embedding_model: str = (
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )
    # Chunking (sizes are compared against character counts by the chunker)
    chunk_max_size: int = 800
    chunk_overlap: int = 100
    chunk_min_size: int = 50
    # Context (budget is compared against chunk character counts)
    context_budget: int = 3000
    context_top_k: int = 15
    model_config = {"env_prefix": "ATOCORE_"}

    @property
    def db_path(self) -> Path:
        """Path of the SQLite database file inside ``data_dir``."""
        return self.data_dir / "atocore.db"

    @property
    def chroma_path(self) -> Path:
        """Path of the ChromaDB directory inside ``data_dir``."""
        return self.data_dir / "chroma"
# Module-level settings instance, created at import time and shared by the
# other modules via ``from atocore.config import settings``.
settings = Settings()

View File

View File

@@ -0,0 +1,212 @@
"""Context pack assembly: retrieve, rank, budget, format."""
import json
import time
from dataclasses import dataclass, field
from pathlib import Path
from atocore.config import settings
from atocore.observability.logger import get_logger
from atocore.retrieval.retriever import ChunkResult, retrieve
log = get_logger("context_builder")
# Instruction text placed at the very start of every full prompt assembled by
# build_context(), ahead of the context block and the user prompt.
SYSTEM_PREFIX = (
    "You have access to the following personal context from the user's knowledge base.\n"
    "Use it to inform your answer. If the context is not relevant, ignore it.\n"
    "Do not mention the context system unless asked."
)
# Last built context pack for debug inspection; overwritten by every
# build_context() call and read back through get_last_context_pack().
_last_context_pack: "ContextPack | None" = None
@dataclass
class ContextChunk:
    """A chunk that was selected for inclusion in a context pack."""

    content: str  # chunk text
    source_file: str  # shortened source path used for display
    heading_path: str  # heading trail of the chunk
    score: float  # ranking score after boosting
    char_count: int  # len(content), counted against the budget
@dataclass
class ContextPack:
    """Result of one ``build_context`` call."""

    chunks_used: list[ContextChunk] = field(default_factory=list)  # selected chunks
    total_chars: int = 0  # sum of char_count over chunks_used
    budget: int = 0  # character budget that was applied
    budget_remaining: int = 0  # budget minus total_chars
    formatted_context: str = ""  # rendered context block
    full_prompt: str = ""  # system prefix + context block + user prompt
    query: str = ""  # the user prompt the pack was built for
    project_hint: str = ""  # project hint used for boosting ("" when none)
    duration_ms: int = 0  # build time in milliseconds
def build_context(
    user_prompt: str,
    project_hint: str | None = None,
    budget: int | None = None,
) -> ContextPack:
    """Assemble a context pack for ``user_prompt``.

    Pipeline: retrieve candidates, rank them (with project boosting), keep the
    ones that fit the character budget, render the context block and prepend
    the system prefix. The resulting pack is also stored as the module-level
    "last pack" for debug inspection.

    Args:
        user_prompt: Prompt to build context for.
        project_hint: Optional project name used to boost matching chunks.
        budget: Character budget; a falsy value selects
            ``settings.context_budget``.

    Returns:
        The populated ``ContextPack``.
    """
    global _last_context_pack
    started_at = time.time()
    if not budget:
        budget = settings.context_budget

    # Retrieve -> rank -> trim to budget -> render.
    ranked = _rank_chunks(
        retrieve(user_prompt, top_k=settings.context_top_k),
        project_hint,
    )
    picked = _select_within_budget(ranked, budget)
    context_block = _format_context_block(picked)
    prompt_text = f"{SYSTEM_PREFIX}\n\n{context_block}\n\n{user_prompt}"

    chars_used = sum(item.char_count for item in picked)
    chars_left = budget - chars_used
    elapsed_ms = int((time.time() - started_at) * 1000)

    pack = ContextPack(
        chunks_used=picked,
        total_chars=chars_used,
        budget=budget,
        budget_remaining=chars_left,
        formatted_context=context_block,
        full_prompt=prompt_text,
        query=user_prompt,
        project_hint=project_hint or "",
        duration_ms=elapsed_ms,
    )
    _last_context_pack = pack

    log.info(
        "context_built",
        chunks_used=len(picked),
        total_chars=chars_used,
        budget_remaining=chars_left,
        duration_ms=elapsed_ms,
    )
    log.debug("context_pack_detail", pack=_pack_to_dict(pack))
    return pack
def get_last_context_pack() -> ContextPack | None:
    """Return the last built context pack for debug inspection.

    Returns ``None`` until ``build_context`` has stored a pack.
    """
    return _last_context_pack
def _rank_chunks(
candidates: list[ChunkResult],
project_hint: str | None,
) -> list[tuple[float, ChunkResult]]:
"""Rank candidates with boosting for project match."""
scored = []
seen_content: set[str] = set()
for chunk in candidates:
# Deduplicate by content prefix (first 200 chars)
content_key = chunk.content[:200]
if content_key in seen_content:
continue
seen_content.add(content_key)
# Base score from similarity
final_score = chunk.score
# Project boost
if project_hint:
tags_str = chunk.tags.lower() if chunk.tags else ""
source_str = chunk.source_file.lower()
title_str = chunk.title.lower() if chunk.title else ""
hint_lower = project_hint.lower()
if hint_lower in tags_str or hint_lower in source_str or hint_lower in title_str:
final_score += 0.3
scored.append((final_score, chunk))
# Sort by score descending
scored.sort(key=lambda x: x[0], reverse=True)
return scored
def _select_within_budget(
scored: list[tuple[float, ChunkResult]],
budget: int,
) -> list[ContextChunk]:
"""Select top chunks that fit within the character budget."""
selected = []
used = 0
for score, chunk in scored:
chunk_len = len(chunk.content)
if used + chunk_len > budget:
continue
selected.append(
ContextChunk(
content=chunk.content,
source_file=_shorten_path(chunk.source_file),
heading_path=chunk.heading_path,
score=score,
char_count=chunk_len,
)
)
used += chunk_len
return selected
def _format_context_block(chunks: list[ContextChunk]) -> str:
"""Format chunks into the context block string."""
if not chunks:
return "--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---"
lines = ["--- AtoCore Context ---"]
for chunk in chunks:
lines.append(
f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]"
)
lines.append(chunk.content)
lines.append("")
lines.append("--- End Context ---")
return "\n".join(lines)
def _shorten_path(path: str) -> str:
"""Shorten an absolute path to a relative-like display."""
p = Path(path)
parts = p.parts
# Show last 3 parts at most
if len(parts) > 3:
return str(Path(*parts[-3:]))
return str(p)
def _pack_to_dict(pack: ContextPack) -> dict:
"""Convert a context pack to a JSON-serializable dict."""
return {
"query": pack.query,
"project_hint": pack.project_hint,
"chunks_used": len(pack.chunks_used),
"total_chars": pack.total_chars,
"budget": pack.budget,
"budget_remaining": pack.budget_remaining,
"duration_ms": pack.duration_ms,
"chunks": [
{
"source_file": c.source_file,
"heading_path": c.heading_path,
"score": c.score,
"char_count": c.char_count,
"content_preview": c.content[:100],
}
for c in pack.chunks_used
],
}

View File

View File

@@ -0,0 +1,146 @@
"""Heading-aware recursive markdown chunking."""
import re
from dataclasses import dataclass, field
from atocore.config import settings
@dataclass
class Chunk:
    """One chunk produced by ``chunk_markdown``."""

    content: str  # stripped chunk text
    chunk_index: int  # position among the chunks kept for the document
    heading_path: str  # heading trail, e.g. "H2 > H3" when both are present
    char_count: int  # len(content)
    metadata: dict = field(default_factory=dict)  # copy of the caller's base metadata
def chunk_markdown(
    body: str,
    base_metadata: dict | None = None,
    max_size: int | None = None,
    overlap: int | None = None,
    min_size: int | None = None,
) -> list[Chunk]:
    """Split markdown body into chunks using heading-aware strategy.

    1. Split on H2 boundaries
    2. If section > max_size, split on H3
    3. If still > max_size, split on paragraph breaks
    4. If still > max_size, hard split with overlap

    Args:
        body: Markdown text without frontmatter.
        base_metadata: Metadata copied onto every chunk.
        max_size: Maximum chunk length in characters; a falsy value selects
            ``settings.chunk_max_size``.
        overlap: Overlap used by the hard split; ``None`` selects
            ``settings.chunk_overlap``. An explicit ``0`` is honored.
        min_size: Chunks shorter than this (after stripping) are dropped;
            ``None`` selects ``settings.chunk_min_size``. An explicit ``0``
            is honored.

    Returns:
        The chunks in document order, indexed consecutively from 0.
    """
    # A zero max_size cannot produce chunks, so any falsy value means "default".
    max_size = max_size or settings.chunk_max_size
    # 0 is a meaningful value for these two, so only None selects the default.
    if overlap is None:
        overlap = settings.chunk_overlap
    if min_size is None:
        min_size = settings.chunk_min_size
    base_metadata = base_metadata or {}

    raw_chunks: list[tuple[str, str]] = []  # (heading_path, content)
    for heading, content in _split_by_heading(body, level=2):
        if len(content) <= max_size:
            raw_chunks.append((heading, content))
            continue
        # Section too large: try splitting on H3.
        for sub_heading, sub_content in _split_by_heading(content, level=3):
            full_path = (
                f"{heading} > {sub_heading}" if heading and sub_heading else heading or sub_heading
            )
            if len(sub_content) <= max_size:
                raw_chunks.append((full_path, sub_content))
            else:
                # Still too large: split on paragraphs (hard split inside).
                raw_chunks.extend(
                    (full_path, piece)
                    for piece in _split_by_paragraphs(sub_content, max_size, overlap)
                )

    # Build final chunks, filtering out too-small ones.
    chunks = []
    for heading_path, content in raw_chunks:
        content = content.strip()
        if len(content) < min_size:
            continue
        chunks.append(
            Chunk(
                content=content,
                chunk_index=len(chunks),
                heading_path=heading_path,
                char_count=len(content),
                metadata={**base_metadata},
            )
        )
    return chunks
def _split_by_heading(text: str, level: int) -> list[tuple[str, str]]:
"""Split text by heading level. Returns (heading_text, section_content) pairs."""
pattern = rf"^({'#' * level})\s+(.+)$"
parts: list[tuple[str, str]] = []
current_heading = ""
current_lines: list[str] = []
for line in text.split("\n"):
match = re.match(pattern, line)
if match:
# Save previous section
if current_lines:
parts.append((current_heading, "\n".join(current_lines)))
current_heading = match.group(2).strip()
current_lines = []
else:
current_lines.append(line)
# Save last section
if current_lines:
parts.append((current_heading, "\n".join(current_lines)))
return parts
def _split_by_paragraphs(
text: str, max_size: int, overlap: int
) -> list[str]:
"""Split text by paragraph breaks, then hard-split if needed."""
paragraphs = re.split(r"\n\n+", text)
chunks: list[str] = []
current = ""
for para in paragraphs:
para = para.strip()
if not para:
continue
if len(current) + len(para) + 2 <= max_size:
current = f"{current}\n\n{para}" if current else para
else:
if current:
chunks.append(current)
# If single paragraph exceeds max, hard split
if len(para) > max_size:
chunks.extend(_hard_split(para, max_size, overlap))
else:
current = para
continue
current = ""
if current:
chunks.append(current)
return chunks
def _hard_split(text: str, max_size: int, overlap: int) -> list[str]:
"""Hard split text at max_size with overlap."""
chunks = []
start = 0
while start < len(text):
end = start + max_size
chunks.append(text[start:end])
start = end - overlap
return chunks

View File

@@ -0,0 +1,65 @@
"""Markdown file parsing with frontmatter extraction."""
import re
from dataclasses import dataclass, field
from pathlib import Path
import frontmatter
@dataclass
class ParsedDocument:
    """Result of parsing one markdown file."""

    file_path: str  # resolved path of the parsed file
    title: str  # first H1 of the body, or a title derived from the file name
    body: str  # stripped markdown content without frontmatter
    tags: list[str] = field(default_factory=list)  # taken from the frontmatter "tags" key
    frontmatter: dict = field(default_factory=dict)  # frontmatter metadata as a dict
    headings: list[tuple[int, str]] = field(default_factory=list)  # (level, text) pairs
def parse_markdown(file_path: Path) -> ParsedDocument:
    """Read a UTF-8 markdown file and split it into frontmatter and body.

    The title comes from the first H1 (falling back to the file name). Tags
    come from the frontmatter ``tags`` key, where a comma-separated string is
    split into a list.
    """
    post = frontmatter.loads(file_path.read_text(encoding="utf-8"))
    meta = dict(post.metadata) if post.metadata else {}
    body = post.content.strip()

    raw_tags = meta.get("tags", [])
    if isinstance(raw_tags, str):
        # "a, b" style tags: split on commas and drop empty entries.
        raw_tags = [piece.strip() for piece in raw_tags.split(",") if piece.strip()]

    return ParsedDocument(
        file_path=str(file_path.resolve()),
        title=_extract_title(body, file_path),
        body=body,
        tags=raw_tags or [],
        frontmatter=meta,
        headings=_extract_headings(body),
    )
def _extract_title(body: str, file_path: Path) -> str:
"""Get title from first H1 or fallback to filename."""
match = re.search(r"^#\s+(.+)$", body, re.MULTILINE)
if match:
return match.group(1).strip()
return file_path.stem.replace("_", " ").replace("-", " ").title()
def _extract_headings(body: str) -> list[tuple[int, str]]:
"""Extract all headings with their level."""
headings = []
for match in re.finditer(r"^(#{1,4})\s+(.+)$", body, re.MULTILINE):
level = len(match.group(1))
text = match.group(2).strip()
headings.append((level, text))
return headings

View File

@@ -0,0 +1,157 @@
"""Ingestion pipeline: parse → chunk → embed → store."""
import hashlib
import json
import time
import uuid
from pathlib import Path
from atocore.config import settings
from atocore.ingestion.chunker import chunk_markdown
from atocore.ingestion.parser import parse_markdown
from atocore.models.database import get_connection
from atocore.observability.logger import get_logger
from atocore.retrieval.vector_store import get_vector_store
log = get_logger("ingestion")
def ingest_file(file_path: Path) -> dict:
    """Ingest one markdown file: parse, chunk, store rows and vectors.

    A file whose stored hash matches its current content is skipped. When a
    changed file is re-ingested, its old chunk rows and vectors are removed
    before the new ones are written.

    Args:
        file_path: Path to a ``.md`` or ``.markdown`` file.

    Returns:
        A stats dict whose ``status`` is ``"skipped"``, ``"empty"`` or
        ``"ingested"``.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
        ValueError: If the file suffix is not a markdown suffix.
    """
    started_at = time.time()
    file_path = file_path.resolve()
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    if file_path.suffix.lower() not in (".md", ".markdown"):
        raise ValueError(f"Not a markdown file: {file_path}")
    path_str = str(file_path)

    # Hash the current content to detect unchanged files.
    text = file_path.read_text(encoding="utf-8")
    digest = hashlib.sha256(text.encode()).hexdigest()

    with get_connection() as conn:
        previous = conn.execute(
            "SELECT id, file_hash FROM source_documents WHERE file_path = ?",
            (path_str,),
        ).fetchone()

    if previous and previous["file_hash"] == digest:
        log.info("file_skipped_unchanged", file_path=path_str)
        return {"file": path_str, "status": "skipped", "reason": "unchanged"}

    parsed = parse_markdown(file_path)
    chunks = chunk_markdown(
        parsed.body,
        base_metadata={
            "source_file": path_str,
            "tags": parsed.tags,
            "title": parsed.title,
        },
    )
    if not chunks:
        log.warning("no_chunks_created", file_path=path_str)
        return {"file": path_str, "status": "empty", "chunks": 0}

    doc_id = str(uuid.uuid4())
    vector_store = get_vector_store()

    with get_connection() as conn:
        tags_json = json.dumps(parsed.tags)

        if previous:
            # Re-ingestion: reuse the document row, drop its old chunks/vectors.
            doc_id = previous["id"]
            stale_rows = conn.execute(
                "SELECT id FROM source_chunks WHERE document_id = ?",
                (doc_id,),
            ).fetchall()
            stale_ids = [row["id"] for row in stale_rows]
            conn.execute(
                "DELETE FROM source_chunks WHERE document_id = ?", (doc_id,)
            )
            conn.execute(
                "UPDATE source_documents SET file_hash = ?, title = ?, tags = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                (digest, parsed.title, tags_json, doc_id),
            )
            if stale_ids:
                vector_store.delete(stale_ids)
        else:
            conn.execute(
                "INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags) VALUES (?, ?, ?, ?, ?, ?)",
                (doc_id, path_str, digest, parsed.title, "markdown", tags_json),
            )

        # One row per chunk, plus the parallel lists handed to the vector store.
        new_ids = []
        new_texts = []
        new_metas = []
        for piece in chunks:
            piece_id = str(uuid.uuid4())
            new_ids.append(piece_id)
            new_texts.append(piece.content)
            new_metas.append(
                {
                    "document_id": doc_id,
                    "heading_path": piece.heading_path,
                    "source_file": path_str,
                    "tags": tags_json,
                    "title": parsed.title,
                }
            )
            conn.execute(
                "INSERT INTO source_chunks (id, document_id, chunk_index, content, heading_path, char_count, metadata) VALUES (?, ?, ?, ?, ?, ?, ?)",
                (
                    piece_id,
                    doc_id,
                    piece.chunk_index,
                    piece.content,
                    piece.heading_path,
                    piece.char_count,
                    json.dumps(piece.metadata),
                ),
            )

        # Embeddings are stored while the transaction is still open.
        vector_store.add(new_ids, new_texts, new_metas)

    elapsed_ms = int((time.time() - started_at) * 1000)
    log.info(
        "file_ingested",
        file_path=path_str,
        chunks_created=len(chunks),
        duration_ms=elapsed_ms,
    )
    return {
        "file": path_str,
        "status": "ingested",
        "chunks": len(chunks),
        "duration_ms": elapsed_ms,
    }
def ingest_folder(folder_path: Path) -> list[dict]:
    """Ingest all markdown files in a folder recursively.

    Files are selected with the same rule ``ingest_file`` enforces: a
    ``.md`` or ``.markdown`` suffix, compared case-insensitively. A failure
    on one file is logged and reported in the results without stopping the
    batch.

    Args:
        folder_path: Directory to scan.

    Returns:
        One result dict per markdown file, in sorted path order. Failed files
        get ``{"file", "status": "error", "error"}``.

    Raises:
        NotADirectoryError: If the resolved path is not a directory.
    """
    folder_path = folder_path.resolve()
    if not folder_path.is_dir():
        raise NotADirectoryError(f"Not a directory: {folder_path}")

    md_files = sorted(
        candidate
        for candidate in folder_path.rglob("*")
        if candidate.is_file()
        and candidate.suffix.lower() in (".md", ".markdown")
    )
    log.info("ingestion_started", folder=str(folder_path), file_count=len(md_files))

    results = []
    for md_file in md_files:
        try:
            results.append(ingest_file(md_file))
        except Exception as e:
            # Deliberate per-file boundary: record the failure and keep going.
            log.error("ingestion_error", file_path=str(md_file), error=str(e))
            results.append({"file": str(md_file), "status": "error", "error": str(e)})
    return results

33
src/atocore/main.py Normal file
View File

@@ -0,0 +1,33 @@
"""AtoCore — FastAPI application entry point."""
from fastapi import FastAPI
from atocore.api.routes import router
from atocore.config import settings
from atocore.models.database import init_db
from atocore.observability.logger import setup_logging
# ASGI application object; uvicorn loads it as "atocore.main:app".
app = FastAPI(
    title="AtoCore",
    description="Personal Context Engine for LLM interactions",
    version="0.1.0",
)
# Register every endpoint declared in atocore.api.routes.
app.include_router(router)
# NOTE(review): FastAPI documents on_event as deprecated in favor of lifespan
# handlers; confirm against the pinned FastAPI version before migrating.
@app.on_event("startup")
def startup():
    """Configure logging and initialize the database when the app starts."""
    setup_logging()
    init_db()
# Script entry point: serve the app with uvicorn on the configured host/port.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(
        "atocore.main:app",
        host=settings.host,
        port=settings.port,
        reload=True,  # auto-reload is always on when launched this way
    )

View File

View File

@@ -0,0 +1,98 @@
"""SQLite database schema and connection management."""
import sqlite3
from contextlib import contextmanager
from pathlib import Path
from typing import Generator
from atocore.config import settings
from atocore.observability.logger import get_logger
log = get_logger("database")
# Full database schema, executed as one script by init_db(). Every statement
# uses IF NOT EXISTS, so running it against an existing database is safe.
# Tables: source_documents, source_chunks (cascade-deleted with their
# document), memories, projects, interactions.
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS source_documents (
id TEXT PRIMARY KEY,
file_path TEXT UNIQUE NOT NULL,
file_hash TEXT NOT NULL,
title TEXT,
doc_type TEXT DEFAULT 'markdown',
tags TEXT DEFAULT '[]',
ingested_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS source_chunks (
id TEXT PRIMARY KEY,
document_id TEXT NOT NULL REFERENCES source_documents(id) ON DELETE CASCADE,
chunk_index INTEGER NOT NULL,
content TEXT NOT NULL,
heading_path TEXT DEFAULT '',
char_count INTEGER NOT NULL,
metadata TEXT DEFAULT '{}',
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS memories (
id TEXT PRIMARY KEY,
memory_type TEXT NOT NULL,
content TEXT NOT NULL,
source_chunk_id TEXT REFERENCES source_chunks(id),
confidence REAL DEFAULT 1.0,
status TEXT DEFAULT 'active',
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS projects (
id TEXT PRIMARY KEY,
name TEXT UNIQUE NOT NULL,
description TEXT DEFAULT '',
status TEXT DEFAULT 'active',
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS interactions (
id TEXT PRIMARY KEY,
prompt TEXT NOT NULL,
context_pack TEXT DEFAULT '{}',
response_summary TEXT DEFAULT '',
project_id TEXT REFERENCES projects(id),
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX IF NOT EXISTS idx_chunks_document ON source_chunks(document_id);
CREATE INDEX IF NOT EXISTS idx_memories_type ON memories(memory_type);
CREATE INDEX IF NOT EXISTS idx_memories_status ON memories(status);
CREATE INDEX IF NOT EXISTS idx_interactions_project ON interactions(project_id);
"""
def _ensure_data_dir() -> None:
    """Create ``settings.data_dir`` (and missing parents) if it does not exist."""
    settings.data_dir.mkdir(parents=True, exist_ok=True)
def init_db() -> None:
    """Initialize the database with schema.

    Runs ``SCHEMA_SQL`` as a script on a fresh connection and logs the
    database path afterwards.
    """
    _ensure_data_dir()
    with get_connection() as conn:
        conn.executescript(SCHEMA_SQL)
    log.info("database_initialized", path=str(settings.db_path))
@contextmanager
def get_connection() -> Generator[sqlite3.Connection, None, None]:
    """Get a database connection with row factory.

    Opens a new SQLite connection to ``settings.db_path`` with ``sqlite3.Row``
    rows and foreign keys enabled. The transaction is committed when the
    ``with`` body finishes normally and rolled back (re-raising) when it
    raises an ``Exception``. The connection is always closed on exit.
    """
    _ensure_data_dir()
    conn = sqlite3.connect(str(settings.db_path))
    conn.row_factory = sqlite3.Row
    # Foreign-key enforcement is enabled explicitly on every new connection.
    conn.execute("PRAGMA foreign_keys = ON")
    try:
        yield conn
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()

View File

View File

@@ -0,0 +1,41 @@
"""Structured logging for AtoCore."""
import logging
import structlog
from atocore.config import settings
# Maps level names to stdlib logging level constants; setup_logging() uses it
# to pick the minimum level of the filtering logger.
_LOG_LEVELS = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR,
}
def setup_logging() -> None:
    """Configure structlog for the process.

    With ``settings.debug`` set, events at DEBUG and above are rendered with
    the console renderer; otherwise events at INFO and above are rendered as
    JSON.
    """
    level_name = "DEBUG" if settings.debug else "INFO"
    if settings.debug:
        renderer = structlog.dev.ConsoleRenderer()
    else:
        renderer = structlog.processors.JSONRenderer()

    processor_chain = [
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        renderer,
    ]
    filtering_logger = structlog.make_filtering_bound_logger(
        _LOG_LEVELS.get(level_name, logging.INFO)
    )
    structlog.configure(
        processors=processor_chain,
        wrapper_class=filtering_logger,
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )
def get_logger(name: str) -> structlog.BoundLogger:
    """Get a named logger.

    Thin wrapper around ``structlog.get_logger(name)``.
    """
    return structlog.get_logger(name)

View File

View File

@@ -0,0 +1,32 @@
"""Embedding model management."""
from sentence_transformers import SentenceTransformer
from atocore.config import settings
from atocore.observability.logger import get_logger
log = get_logger("embeddings")
# Cached embedding model; populated on the first get_model() call.
_model: SentenceTransformer | None = None
def get_model() -> SentenceTransformer:
    """Return the embedding model, loading it on first use.

    The model named by ``settings.embedding_model`` is instantiated once and
    kept in the module-level cache for later calls.
    """
    global _model
    if _model is not None:
        return _model

    model_name = settings.embedding_model
    log.info("loading_embedding_model", model=model_name)
    _model = SentenceTransformer(model_name)
    log.info("embedding_model_loaded", model=model_name)
    return _model
def embed_texts(texts: list[str]) -> list[list[float]]:
    """Encode ``texts`` with the cached model and return plain Python lists.

    Embeddings are requested with ``normalize_embeddings=True`` and the
    progress bar disabled.
    """
    vectors = get_model().encode(
        texts,
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    return vectors.tolist()
def embed_query(query: str) -> list[float]:
    """Generate embedding for a single query.

    Delegates to ``embed_texts`` with a one-element list and returns its
    only vector.
    """
    return embed_texts([query])[0]

View File

@@ -0,0 +1,83 @@
"""Retrieval: query → ranked chunks."""
import time
from dataclasses import dataclass
from atocore.config import settings
from atocore.observability.logger import get_logger
from atocore.retrieval.embeddings import embed_query
from atocore.retrieval.vector_store import get_vector_store
log = get_logger("retriever")
@dataclass
class ChunkResult:
    """One retrieved chunk with its similarity score and stored metadata."""

    chunk_id: str  # id of the chunk in the vector store
    content: str  # chunk text
    score: float  # 1 - distance, rounded to 4 decimals
    heading_path: str  # from metadata, "" when missing
    source_file: str  # from metadata, "" when missing
    tags: str  # tags as stored in metadata (a JSON string), "[]" when missing
    title: str  # from metadata, "" when missing
    document_id: str  # from metadata, "" when missing
def retrieve(
    query: str,
    top_k: int | None = None,
    filter_tags: list[str] | None = None,
) -> list[ChunkResult]:
    """Return the chunks most similar to ``query``.

    Args:
        query: Text to embed and search for.
        top_k: Number of results to request; a falsy value selects
            ``settings.context_top_k``.
        filter_tags: Optional tag filter. Only the first tag is used.

    Returns:
        ``ChunkResult`` objects in the order returned by the vector store,
        scored as ``1 - distance``.
    """
    if not top_k:
        top_k = settings.context_top_k
    started_at = time.time()
    query_vector = embed_query(query)
    store = get_vector_store()

    # Tags are stored as a JSON string, so the filter is a substring check on
    # the first requested tag only.
    # NOTE(review): confirm the installed ChromaDB accepts "$contains" in a
    # metadata `where` clause; remaining tags in filter_tags are ignored.
    where = {"tags": {"$contains": filter_tags[0]}} if filter_tags else None

    results = store.query(
        query_embedding=query_vector,
        top_k=top_k,
        where=where,
    )

    hits = []
    id_rows = results["ids"] if results else None
    if id_rows and id_rows[0]:
        distances = results["distances"][0] if results["distances"] else None
        metadatas = results["metadatas"][0] if results["metadatas"] else None
        documents = results["documents"][0] if results["documents"] else None
        for pos, chunk_id in enumerate(id_rows[0]):
            # The store returns distances (lower = closer); flip to similarity.
            distance = distances[pos] if distances is not None else 0
            meta = metadatas[pos] if metadatas is not None else {}
            text = documents[pos] if documents is not None else ""
            hits.append(
                ChunkResult(
                    chunk_id=chunk_id,
                    content=text,
                    score=round(1.0 - distance, 4),
                    heading_path=meta.get("heading_path", ""),
                    source_file=meta.get("source_file", ""),
                    tags=meta.get("tags", "[]"),
                    title=meta.get("title", ""),
                    document_id=meta.get("document_id", ""),
                )
            )

    elapsed_ms = int((time.time() - started_at) * 1000)
    log.info(
        "retrieval_done",
        query=query[:100],
        top_k=top_k,
        results_count=len(hits),
        duration_ms=elapsed_ms,
    )
    return hits

View File

@@ -0,0 +1,77 @@
"""ChromaDB vector store wrapper."""
import chromadb
from atocore.config import settings
from atocore.observability.logger import get_logger
from atocore.retrieval.embeddings import embed_texts
log = get_logger("vector_store")
# Name of the ChromaDB collection that holds all chunk vectors.
COLLECTION_NAME = "atocore_chunks"
# Cached VectorStore instance; created on the first get_vector_store() call.
_store: "VectorStore | None" = None
class VectorStore:
    """Wrapper around ChromaDB for chunk storage and retrieval."""

    def __init__(self) -> None:
        """Open (or create) the persistent collection under ``settings.chroma_path``."""
        settings.chroma_path.mkdir(parents=True, exist_ok=True)
        self._client = chromadb.PersistentClient(path=str(settings.chroma_path))
        self._collection = self._client.get_or_create_collection(
            name=COLLECTION_NAME,
            # Cosine space: the retriever converts distance to 1 - distance.
            metadata={"hnsw:space": "cosine"},
        )
        log.info("vector_store_initialized", path=str(settings.chroma_path))

    def add(
        self,
        ids: list[str],
        documents: list[str],
        metadatas: list[dict],
    ) -> None:
        """Add chunks with embeddings to the store.

        Args:
            ids: Chunk ids, parallel to ``documents`` and ``metadatas``.
            documents: Chunk texts; embeddings are computed from these.
            metadatas: One metadata dict per chunk.

        An empty ``ids`` list is a no-op, mirroring ``delete``: nothing is
        embedded and the collection is not touched.
        """
        if not ids:
            return
        embeddings = embed_texts(documents)
        self._collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=documents,
            metadatas=metadatas,
        )
        log.debug("vectors_added", count=len(ids))

    def query(
        self,
        query_embedding: list[float],
        top_k: int = 10,
        where: dict | None = None,
    ) -> dict:
        """Query the store for similar chunks.

        Args:
            query_embedding: Embedding of the query text.
            top_k: Number of results to request.
            where: Optional metadata filter; omitted from the call when falsy.

        Returns:
            The raw ChromaDB query result, requested with documents,
            metadatas and distances included.
        """
        kwargs: dict = {
            "query_embeddings": [query_embedding],
            "n_results": top_k,
            "include": ["documents", "metadatas", "distances"],
        }
        if where:
            kwargs["where"] = where
        return self._collection.query(**kwargs)

    def delete(self, ids: list[str]) -> None:
        """Delete chunks by IDs; an empty list is a no-op."""
        if ids:
            self._collection.delete(ids=ids)
            log.debug("vectors_deleted", count=len(ids))

    @property
    def count(self) -> int:
        """Number of entries currently in the collection."""
        return self._collection.count()
def get_vector_store() -> VectorStore:
    """Return the shared ``VectorStore``, creating it on first use."""
    global _store
    if _store is not None:
        return _store
    _store = VectorStore()
    return _store