feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)

Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:21:27 -04:00
parent 32ce409a7b
commit b4afbbb53a
34 changed files with 1756 additions and 0 deletions
--- a/src/atocore/init.py
+++ b/src/atocore/init.py
@@ -0,0 +1,3 @@
+"""AtoCore — Personal Context Engine."""
+
+__version__ = "0.1.0"
--- a/src/atocore/api/init.py
+++ b/src/atocore/api/init.py
--- a/src/atocore/api/routes.py
+++ b/src/atocore/api/routes.py
@@ -0,0 +1,132 @@
+"""FastAPI route definitions."""
+
+from pathlib import Path
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+
+from atocore.context.builder import (
+    ContextPack,
+    build_context,
+    get_last_context_pack,
+    _pack_to_dict,
+)
+from atocore.ingestion.pipeline import ingest_file, ingest_folder
+from atocore.retrieval.retriever import retrieve
+from atocore.retrieval.vector_store import get_vector_store
+
+router = APIRouter()
+
+
+# --- Request/Response models ---
+
+
+class IngestRequest(BaseModel):
+    path: str  # file or folder path
+
+
+class IngestResponse(BaseModel):
+    results: list[dict]
+
+
+class QueryRequest(BaseModel):
+    prompt: str
+    top_k: int = 10
+    filter_tags: list[str] | None = None
+
+
+class QueryResponse(BaseModel):
+    results: list[dict]
+
+
+class ContextBuildRequest(BaseModel):
+    prompt: str
+    project: str | None = None
+    budget: int | None = None
+
+
+class ContextBuildResponse(BaseModel):
+    formatted_context: str
+    full_prompt: str
+    chunks_used: int
+    total_chars: int
+    budget: int
+    budget_remaining: int
+    duration_ms: int
+    chunks: list[dict]
+
+
+# --- Endpoints ---
+
+
+@router.post("/ingest", response_model=IngestResponse)
+def api_ingest(req: IngestRequest):
+    """Ingest a markdown file or folder."""
+    target = Path(req.path)
+    if target.is_file():
+        results = [ingest_file(target)]
+    elif target.is_dir():
+        results = ingest_folder(target)
+    else:
+        raise HTTPException(status_code=404, detail=f"Path not found: {req.path}")
+    return IngestResponse(results=results)
+
+
+@router.post("/query", response_model=QueryResponse)
+def api_query(req: QueryRequest):
+    """Retrieve relevant chunks for a prompt."""
+    chunks = retrieve(req.prompt, top_k=req.top_k, filter_tags=req.filter_tags)
+    return QueryResponse(
+        results=[
+            {
+                "chunk_id": c.chunk_id,
+                "content": c.content,
+                "score": c.score,
+                "heading_path": c.heading_path,
+                "source_file": c.source_file,
+                "title": c.title,
+            }
+            for c in chunks
+        ]
+    )
+
+
+@router.post("/context/build", response_model=ContextBuildResponse)
+def api_build_context(req: ContextBuildRequest):
+    """Build a full context pack for a prompt."""
+    pack = build_context(
+        user_prompt=req.prompt,
+        project_hint=req.project,
+        budget=req.budget,
+    )
+    pack_dict = _pack_to_dict(pack)
+    return ContextBuildResponse(
+        formatted_context=pack.formatted_context,
+        full_prompt=pack.full_prompt,
+        chunks_used=len(pack.chunks_used),
+        total_chars=pack.total_chars,
+        budget=pack.budget,
+        budget_remaining=pack.budget_remaining,
+        duration_ms=pack.duration_ms,
+        chunks=pack_dict["chunks"],
+    )
+
+
+@router.get("/health")
+def api_health():
+    """Health check."""
+    store = get_vector_store()
+    return {
+        "status": "ok",
+        "version": "0.1.0",
+        "vectors_count": store.count,
+    }
+
+
+@router.get("/debug/context")
+def api_debug_context():
+    """Inspect the last assembled context pack."""
+    pack = get_last_context_pack()
+    if pack is None:
+        return {"message": "No context pack built yet."}
+    return _pack_to_dict(pack)
--- a/src/atocore/config.py
+++ b/src/atocore/config.py
@@ -0,0 +1,39 @@
+"""AtoCore configuration via environment variables."""
+
+from pathlib import Path
+
+from pydantic_settings import BaseSettings
+
+
+class Settings(BaseSettings):
+    debug: bool = False
+    data_dir: Path = Path("./data")
+    host: str = "127.0.0.1"
+    port: int = 8100
+
+    # Embedding
+    embedding_model: str = (
+        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+    )
+
+    # Chunking
+    chunk_max_size: int = 800
+    chunk_overlap: int = 100
+    chunk_min_size: int = 50
+
+    # Context
+    context_budget: int = 3000
+    context_top_k: int = 15
+
+    model_config = {"env_prefix": "ATOCORE_"}
+
+    @property
+    def db_path(self) -> Path:
+        return self.data_dir / "atocore.db"
+
+    @property
+    def chroma_path(self) -> Path:
+        return self.data_dir / "chroma"
+
+
+settings = Settings()
--- a/src/atocore/context/init.py
+++ b/src/atocore/context/init.py
--- a/src/atocore/context/builder.py
+++ b/src/atocore/context/builder.py
@@ -0,0 +1,212 @@
+"""Context pack assembly: retrieve, rank, budget, format."""
+
+import json
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+
+from atocore.config import settings
+from atocore.observability.logger import get_logger
+from atocore.retrieval.retriever import ChunkResult, retrieve
+
+log = get_logger("context_builder")
+
+SYSTEM_PREFIX = (
+    "You have access to the following personal context from the user's knowledge base.\n"
+    "Use it to inform your answer. If the context is not relevant, ignore it.\n"
+    "Do not mention the context system unless asked."
+)
+
+# Last built context pack for debug inspection
+_last_context_pack: "ContextPack | None" = None
+
+
+@dataclass
+class ContextChunk:
+    content: str
+    source_file: str
+    heading_path: str
+    score: float
+    char_count: int
+
+
+@dataclass
+class ContextPack:
+    chunks_used: list[ContextChunk] = field(default_factory=list)
+    total_chars: int = 0
+    budget: int = 0
+    budget_remaining: int = 0
+    formatted_context: str = ""
+    full_prompt: str = ""
+    query: str = ""
+    project_hint: str = ""
+    duration_ms: int = 0
+
+
+def build_context(
+    user_prompt: str,
+    project_hint: str | None = None,
+    budget: int | None = None,
+) -> ContextPack:
+    """Build a context pack for a user prompt."""
+    global _last_context_pack
+    start = time.time()
+    budget = budget or settings.context_budget
+
+    # 1. Retrieve candidates
+    candidates = retrieve(user_prompt, top_k=settings.context_top_k)
+
+    # 2. Score and rank
+    scored = _rank_chunks(candidates, project_hint)
+
+    # 3. Select within budget
+    selected = _select_within_budget(scored, budget)
+
+    # 4. Format
+    formatted = _format_context_block(selected)
+
+    # 5. Build full prompt
+    full_prompt = f"{SYSTEM_PREFIX}\n\n{formatted}\n\n{user_prompt}"
+
+    total_chars = sum(c.char_count for c in selected)
+    duration_ms = int((time.time() - start) * 1000)
+
+    pack = ContextPack(
+        chunks_used=selected,
+        total_chars=total_chars,
+        budget=budget,
+        budget_remaining=budget - total_chars,
+        formatted_context=formatted,
+        full_prompt=full_prompt,
+        query=user_prompt,
+        project_hint=project_hint or "",
+        duration_ms=duration_ms,
+    )
+
+    _last_context_pack = pack
+
+    log.info(
+        "context_built",
+        chunks_used=len(selected),
+        total_chars=total_chars,
+        budget_remaining=budget - total_chars,
+        duration_ms=duration_ms,
+    )
+    log.debug("context_pack_detail", pack=_pack_to_dict(pack))
+
+    return pack
+
+
+def get_last_context_pack() -> ContextPack | None:
+    """Return the last built context pack for debug inspection."""
+    return _last_context_pack
+
+
+def _rank_chunks(
+    candidates: list[ChunkResult],
+    project_hint: str | None,
+) -> list[tuple[float, ChunkResult]]:
+    """Rank candidates with boosting for project match."""
+    scored = []
+    seen_content: set[str] = set()
+
+    for chunk in candidates:
+        # Deduplicate by content prefix (first 200 chars)
+        content_key = chunk.content[:200]
+        if content_key in seen_content:
+            continue
+        seen_content.add(content_key)
+
+        # Base score from similarity
+        final_score = chunk.score
+
+        # Project boost
+        if project_hint:
+            tags_str = chunk.tags.lower() if chunk.tags else ""
+            source_str = chunk.source_file.lower()
+            title_str = chunk.title.lower() if chunk.title else ""
+            hint_lower = project_hint.lower()
+
+            if hint_lower in tags_str or hint_lower in source_str or hint_lower in title_str:
+                final_score += 0.3
+
+        scored.append((final_score, chunk))
+
+    # Sort by score descending
+    scored.sort(key=lambda x: x[0], reverse=True)
+    return scored
+
+
+def _select_within_budget(
+    scored: list[tuple[float, ChunkResult]],
+    budget: int,
+) -> list[ContextChunk]:
+    """Select top chunks that fit within the character budget."""
+    selected = []
+    used = 0
+
+    for score, chunk in scored:
+        chunk_len = len(chunk.content)
+        if used + chunk_len > budget:
+            continue
+        selected.append(
+            ContextChunk(
+                content=chunk.content,
+                source_file=_shorten_path(chunk.source_file),
+                heading_path=chunk.heading_path,
+                score=score,
+                char_count=chunk_len,
+            )
+        )
+        used += chunk_len
+
+    return selected
+
+
+def _format_context_block(chunks: list[ContextChunk]) -> str:
+    """Format chunks into the context block string."""
+    if not chunks:
+        return "--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---"
+
+    lines = ["--- AtoCore Context ---"]
+    for chunk in chunks:
+        lines.append(
+            f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]"
+        )
+        lines.append(chunk.content)
+        lines.append("")
+    lines.append("--- End Context ---")
+    return "\n".join(lines)
+
+
+def _shorten_path(path: str) -> str:
+    """Shorten an absolute path to a relative-like display."""
+    p = Path(path)
+    parts = p.parts
+    # Show last 3 parts at most
+    if len(parts) > 3:
+        return str(Path(*parts[-3:]))
+    return str(p)
+
+
+def _pack_to_dict(pack: ContextPack) -> dict:
+    """Convert a context pack to a JSON-serializable dict."""
+    return {
+        "query": pack.query,
+        "project_hint": pack.project_hint,
+        "chunks_used": len(pack.chunks_used),
+        "total_chars": pack.total_chars,
+        "budget": pack.budget,
+        "budget_remaining": pack.budget_remaining,
+        "duration_ms": pack.duration_ms,
+        "chunks": [
+            {
+                "source_file": c.source_file,
+                "heading_path": c.heading_path,
+                "score": c.score,
+                "char_count": c.char_count,
+                "content_preview": c.content[:100],
+            }
+            for c in pack.chunks_used
+        ],
+    }
--- a/src/atocore/ingestion/init.py
+++ b/src/atocore/ingestion/init.py
--- a/src/atocore/ingestion/chunker.py
+++ b/src/atocore/ingestion/chunker.py
@@ -0,0 +1,146 @@
+"""Heading-aware recursive markdown chunking."""
+
+import re
+from dataclasses import dataclass, field
+
+from atocore.config import settings
+
+
+@dataclass
+class Chunk:
+    content: str
+    chunk_index: int
+    heading_path: str
+    char_count: int
+    metadata: dict = field(default_factory=dict)
+
+
+def chunk_markdown(
+    body: str,
+    base_metadata: dict | None = None,
+    max_size: int | None = None,
+    overlap: int | None = None,
+    min_size: int | None = None,
+) -> list[Chunk]:
+    """Split markdown body into chunks using heading-aware strategy.
+
+    1. Split on H2 boundaries
+    2. If section > max_size, split on H3
+    3. If still > max_size, split on paragraph breaks
+    4. If still > max_size, hard split with overlap
+    """
+    max_size = max_size or settings.chunk_max_size
+    overlap = overlap or settings.chunk_overlap
+    min_size = min_size or settings.chunk_min_size
+    base_metadata = base_metadata or {}
+
+    sections = _split_by_heading(body, level=2)
+    raw_chunks: list[tuple[str, str]] = []  # (heading_path, content)
+
+    for heading, content in sections:
+        if len(content) <= max_size:
+            raw_chunks.append((heading, content))
+        else:
+            # Try splitting on H3
+            subsections = _split_by_heading(content, level=3)
+            for sub_heading, sub_content in subsections:
+                full_path = (
+                    f"{heading} > {sub_heading}" if heading and sub_heading else heading or sub_heading
+                )
+                if len(sub_content) <= max_size:
+                    raw_chunks.append((full_path, sub_content))
+                else:
+                    # Split on paragraphs
+                    para_chunks = _split_by_paragraphs(
+                        sub_content, max_size, overlap
+                    )
+                    for pc in para_chunks:
+                        raw_chunks.append((full_path, pc))
+
+    # Build final chunks, filtering out too-small ones
+    chunks = []
+    idx = 0
+    for heading_path, content in raw_chunks:
+        content = content.strip()
+        if len(content) < min_size:
+            continue
+        chunks.append(
+            Chunk(
+                content=content,
+                chunk_index=idx,
+                heading_path=heading_path,
+                char_count=len(content),
+                metadata={**base_metadata},
+            )
+        )
+        idx += 1
+
+    return chunks
+
+
+def _split_by_heading(text: str, level: int) -> list[tuple[str, str]]:
+    """Split text by heading level. Returns (heading_text, section_content) pairs."""
+    pattern = rf"^({'#' * level})\s+(.+)$"
+    parts: list[tuple[str, str]] = []
+    current_heading = ""
+    current_lines: list[str] = []
+
+    for line in text.split("\n"):
+        match = re.match(pattern, line)
+        if match:
+            # Save previous section
+            if current_lines:
+                parts.append((current_heading, "\n".join(current_lines)))
+            current_heading = match.group(2).strip()
+            current_lines = []
+        else:
+            current_lines.append(line)
+
+    # Save last section
+    if current_lines:
+        parts.append((current_heading, "\n".join(current_lines)))
+
+    return parts
+
+
+def _split_by_paragraphs(
+    text: str, max_size: int, overlap: int
+) -> list[str]:
+    """Split text by paragraph breaks, then hard-split if needed."""
+    paragraphs = re.split(r"\n\n+", text)
+    chunks: list[str] = []
+    current = ""
+
+    for para in paragraphs:
+        para = para.strip()
+        if not para:
+            continue
+
+        if len(current) + len(para) + 2 <= max_size:
+            current = f"{current}\n\n{para}" if current else para
+        else:
+            if current:
+                chunks.append(current)
+            # If single paragraph exceeds max, hard split
+            if len(para) > max_size:
+                chunks.extend(_hard_split(para, max_size, overlap))
+            else:
+                current = para
+                continue
+            current = ""
+
+    if current:
+        chunks.append(current)
+
+    return chunks
+
+
+def _hard_split(text: str, max_size: int, overlap: int) -> list[str]:
+    """Hard split text at max_size with overlap."""
+    chunks = []
+    start = 0
+    while start < len(text):
+        end = start + max_size
+        chunks.append(text[start:end])
+        start = end - overlap
+    return chunks
--- a/src/atocore/ingestion/parser.py
+++ b/src/atocore/ingestion/parser.py
@@ -0,0 +1,65 @@
+"""Markdown file parsing with frontmatter extraction."""
+
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+
+import frontmatter
+
+
+@dataclass
+class ParsedDocument:
+    file_path: str
+    title: str
+    body: str
+    tags: list[str] = field(default_factory=list)
+    frontmatter: dict = field(default_factory=dict)
+    headings: list[tuple[int, str]] = field(default_factory=list)
+
+
+def parse_markdown(file_path: Path) -> ParsedDocument:
+    """Parse a markdown file, extracting frontmatter and structure."""
+    text = file_path.read_text(encoding="utf-8")
+    post = frontmatter.loads(text)
+
+    meta = dict(post.metadata) if post.metadata else {}
+    body = post.content.strip()
+
+    # Extract title: first H1, or filename
+    title = _extract_title(body, file_path)
+
+    # Extract tags from frontmatter
+    tags = meta.get("tags", [])
+    if isinstance(tags, str):
+        tags = [t.strip() for t in tags.split(",") if t.strip()]
+    tags = tags or []
+
+    # Extract heading structure
+    headings = _extract_headings(body)
+
+    return ParsedDocument(
+        file_path=str(file_path.resolve()),
+        title=title,
+        body=body,
+        tags=tags,
+        frontmatter=meta,
+        headings=headings,
+    )
+
+
+def _extract_title(body: str, file_path: Path) -> str:
+    """Get title from first H1 or fallback to filename."""
+    match = re.search(r"^#\s+(.+)$", body, re.MULTILINE)
+    if match:
+        return match.group(1).strip()
+    return file_path.stem.replace("_", " ").replace("-", " ").title()
+
+
+def _extract_headings(body: str) -> list[tuple[int, str]]:
+    """Extract all headings with their level."""
+    headings = []
+    for match in re.finditer(r"^(#{1,4})\s+(.+)$", body, re.MULTILINE):
+        level = len(match.group(1))
+        text = match.group(2).strip()
+        headings.append((level, text))
+    return headings
--- a/src/atocore/ingestion/pipeline.py
+++ b/src/atocore/ingestion/pipeline.py
@@ -0,0 +1,157 @@
+"""Ingestion pipeline: parse → chunk → embed → store."""
+
+import hashlib
+import json
+import time
+import uuid
+from pathlib import Path
+
+from atocore.config import settings
+from atocore.ingestion.chunker import chunk_markdown
+from atocore.ingestion.parser import parse_markdown
+from atocore.models.database import get_connection
+from atocore.observability.logger import get_logger
+from atocore.retrieval.vector_store import get_vector_store
+
+log = get_logger("ingestion")
+
+
+def ingest_file(file_path: Path) -> dict:
+    """Ingest a single markdown file. Returns stats."""
+    start = time.time()
+    file_path = file_path.resolve()
+
+    if not file_path.exists():
+        raise FileNotFoundError(f"File not found: {file_path}")
+    if file_path.suffix.lower() not in (".md", ".markdown"):
+        raise ValueError(f"Not a markdown file: {file_path}")
+
+    # Read and hash
+    raw_content = file_path.read_text(encoding="utf-8")
+    file_hash = hashlib.sha256(raw_content.encode()).hexdigest()
+
+    # Check if already ingested and unchanged
+    with get_connection() as conn:
+        existing = conn.execute(
+            "SELECT id, file_hash FROM source_documents WHERE file_path = ?",
+            (str(file_path),),
+        ).fetchone()
+
+        if existing and existing["file_hash"] == file_hash:
+            log.info("file_skipped_unchanged", file_path=str(file_path))
+            return {"file": str(file_path), "status": "skipped", "reason": "unchanged"}
+
+    # Parse
+    parsed = parse_markdown(file_path)
+
+    # Chunk
+    base_meta = {
+        "source_file": str(file_path),
+        "tags": parsed.tags,
+        "title": parsed.title,
+    }
+    chunks = chunk_markdown(parsed.body, base_metadata=base_meta)
+
+    if not chunks:
+        log.warning("no_chunks_created", file_path=str(file_path))
+        return {"file": str(file_path), "status": "empty", "chunks": 0}
+
+    # Store in DB and vector store
+    doc_id = str(uuid.uuid4())
+    vector_store = get_vector_store()
+
+    with get_connection() as conn:
+        # Remove old data if re-ingesting
+        if existing:
+            doc_id = existing["id"]
+            old_chunk_ids = [
+                row["id"]
+                for row in conn.execute(
+                    "SELECT id FROM source_chunks WHERE document_id = ?",
+                    (doc_id,),
+                ).fetchall()
+            ]
+            conn.execute(
+                "DELETE FROM source_chunks WHERE document_id = ?", (doc_id,)
+            )
+            conn.execute(
+                "UPDATE source_documents SET file_hash = ?, title = ?, tags = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
+                (file_hash, parsed.title, json.dumps(parsed.tags), doc_id),
+            )
+            # Remove old vectors
+            if old_chunk_ids:
+                vector_store.delete(old_chunk_ids)
+        else:
+            conn.execute(
+                "INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags) VALUES (?, ?, ?, ?, ?, ?)",
+                (doc_id, str(file_path), file_hash, parsed.title, "markdown", json.dumps(parsed.tags)),
+            )
+
+        # Insert chunks
+        chunk_ids = []
+        chunk_contents = []
+        chunk_metadatas = []
+
+        for chunk in chunks:
+            chunk_id = str(uuid.uuid4())
+            chunk_ids.append(chunk_id)
+            chunk_contents.append(chunk.content)
+            chunk_metadatas.append({
+                "document_id": doc_id,
+                "heading_path": chunk.heading_path,
+                "source_file": str(file_path),
+                "tags": json.dumps(parsed.tags),
+                "title": parsed.title,
+            })
+
+            conn.execute(
+                "INSERT INTO source_chunks (id, document_id, chunk_index, content, heading_path, char_count, metadata) VALUES (?, ?, ?, ?, ?, ?, ?)",
+                (
+                    chunk_id,
+                    doc_id,
+                    chunk.chunk_index,
+                    chunk.content,
+                    chunk.heading_path,
+                    chunk.char_count,
+                    json.dumps(chunk.metadata),
+                ),
+            )
+
+        # Store embeddings
+        vector_store.add(chunk_ids, chunk_contents, chunk_metadatas)
+
+    duration_ms = int((time.time() - start) * 1000)
+    log.info(
+        "file_ingested",
+        file_path=str(file_path),
+        chunks_created=len(chunks),
+        duration_ms=duration_ms,
+    )
+
+    return {
+        "file": str(file_path),
+        "status": "ingested",
+        "chunks": len(chunks),
+        "duration_ms": duration_ms,
+    }
+
+
+def ingest_folder(folder_path: Path) -> list[dict]:
+    """Ingest all markdown files in a folder recursively."""
+    folder_path = folder_path.resolve()
+    if not folder_path.is_dir():
+        raise NotADirectoryError(f"Not a directory: {folder_path}")
+
+    results = []
+    md_files = sorted(folder_path.rglob("*.md"))
+    log.info("ingestion_started", folder=str(folder_path), file_count=len(md_files))
+
+    for md_file in md_files:
+        try:
+            result = ingest_file(md_file)
+            results.append(result)
+        except Exception as e:
+            log.error("ingestion_error", file_path=str(md_file), error=str(e))
+            results.append({"file": str(md_file), "status": "error", "error": str(e)})
+
+    return results
--- a/src/atocore/main.py
+++ b/src/atocore/main.py
@@ -0,0 +1,33 @@
+"""AtoCore — FastAPI application entry point."""
+
+from fastapi import FastAPI
+
+from atocore.api.routes import router
+from atocore.config import settings
+from atocore.models.database import init_db
+from atocore.observability.logger import setup_logging
+
+app = FastAPI(
+    title="AtoCore",
+    description="Personal Context Engine for LLM interactions",
+    version="0.1.0",
+)
+
+app.include_router(router)
+
+
+@app.on_event("startup")
+def startup():
+    setup_logging()
+    init_db()
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(
+        "atocore.main:app",
+        host=settings.host,
+        port=settings.port,
+        reload=True,
+    )
--- a/src/atocore/models/init.py
+++ b/src/atocore/models/init.py
--- a/src/atocore/models/database.py
+++ b/src/atocore/models/database.py
@@ -0,0 +1,98 @@
+"""SQLite database schema and connection management."""
+
+import sqlite3
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Generator
+
+from atocore.config import settings
+from atocore.observability.logger import get_logger
+
+log = get_logger("database")
+
+SCHEMA_SQL = """
+CREATE TABLE IF NOT EXISTS source_documents (
+    id TEXT PRIMARY KEY,
+    file_path TEXT UNIQUE NOT NULL,
+    file_hash TEXT NOT NULL,
+    title TEXT,
+    doc_type TEXT DEFAULT 'markdown',
+    tags TEXT DEFAULT '[]',
+    ingested_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE TABLE IF NOT EXISTS source_chunks (
+    id TEXT PRIMARY KEY,
+    document_id TEXT NOT NULL REFERENCES source_documents(id) ON DELETE CASCADE,
+    chunk_index INTEGER NOT NULL,
+    content TEXT NOT NULL,
+    heading_path TEXT DEFAULT '',
+    char_count INTEGER NOT NULL,
+    metadata TEXT DEFAULT '{}',
+    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE TABLE IF NOT EXISTS memories (
+    id TEXT PRIMARY KEY,
+    memory_type TEXT NOT NULL,
+    content TEXT NOT NULL,
+    source_chunk_id TEXT REFERENCES source_chunks(id),
+    confidence REAL DEFAULT 1.0,
+    status TEXT DEFAULT 'active',
+    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE TABLE IF NOT EXISTS projects (
+    id TEXT PRIMARY KEY,
+    name TEXT UNIQUE NOT NULL,
+    description TEXT DEFAULT '',
+    status TEXT DEFAULT 'active',
+    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE TABLE IF NOT EXISTS interactions (
+    id TEXT PRIMARY KEY,
+    prompt TEXT NOT NULL,
+    context_pack TEXT DEFAULT '{}',
+    response_summary TEXT DEFAULT '',
+    project_id TEXT REFERENCES projects(id),
+    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE INDEX IF NOT EXISTS idx_chunks_document ON source_chunks(document_id);
+CREATE INDEX IF NOT EXISTS idx_memories_type ON memories(memory_type);
+CREATE INDEX IF NOT EXISTS idx_memories_status ON memories(status);
+CREATE INDEX IF NOT EXISTS idx_interactions_project ON interactions(project_id);
+"""
+
+
+def _ensure_data_dir() -> None:
+    settings.data_dir.mkdir(parents=True, exist_ok=True)
+
+
+def init_db() -> None:
+    """Initialize the database with schema."""
+    _ensure_data_dir()
+    with get_connection() as conn:
+        conn.executescript(SCHEMA_SQL)
+    log.info("database_initialized", path=str(settings.db_path))
+
+
+@contextmanager
+def get_connection() -> Generator[sqlite3.Connection, None, None]:
+    """Get a database connection with row factory."""
+    _ensure_data_dir()
+    conn = sqlite3.connect(str(settings.db_path))
+    conn.row_factory = sqlite3.Row
+    conn.execute("PRAGMA foreign_keys = ON")
+    try:
+        yield conn
+        conn.commit()
+    except Exception:
+        conn.rollback()
+        raise
+    finally:
+        conn.close()
--- a/src/atocore/observability/init.py
+++ b/src/atocore/observability/init.py
--- a/src/atocore/observability/logger.py
+++ b/src/atocore/observability/logger.py
@@ -0,0 +1,41 @@
+"""Structured logging for AtoCore."""
+
+import logging
+
+import structlog
+
+from atocore.config import settings
+
+_LOG_LEVELS = {
+    "DEBUG": logging.DEBUG,
+    "INFO": logging.INFO,
+    "WARNING": logging.WARNING,
+    "ERROR": logging.ERROR,
+}
+
+
+def setup_logging() -> None:
+    """Configure structlog with JSON output."""
+    log_level = "DEBUG" if settings.debug else "INFO"
+
+    structlog.configure(
+        processors=[
+            structlog.contextvars.merge_contextvars,
+            structlog.processors.add_log_level,
+            structlog.processors.TimeStamper(fmt="iso"),
+            structlog.dev.ConsoleRenderer()
+            if settings.debug
+            else structlog.processors.JSONRenderer(),
+        ],
+        wrapper_class=structlog.make_filtering_bound_logger(
+            _LOG_LEVELS.get(log_level, logging.INFO)
+        ),
+        context_class=dict,
+        logger_factory=structlog.PrintLoggerFactory(),
+        cache_logger_on_first_use=True,
+    )
+
+
+def get_logger(name: str) -> structlog.BoundLogger:
+    """Get a named logger."""
+    return structlog.get_logger(name)
--- a/src/atocore/retrieval/init.py
+++ b/src/atocore/retrieval/init.py
--- a/src/atocore/retrieval/embeddings.py
+++ b/src/atocore/retrieval/embeddings.py
@@ -0,0 +1,32 @@
+"""Embedding model management."""
+
+from sentence_transformers import SentenceTransformer
+
+from atocore.config import settings
+from atocore.observability.logger import get_logger
+
+log = get_logger("embeddings")
+
+_model: SentenceTransformer | None = None
+
+
+def get_model() -> SentenceTransformer:
+    """Load and cache the embedding model."""
+    global _model
+    if _model is None:
+        log.info("loading_embedding_model", model=settings.embedding_model)
+        _model = SentenceTransformer(settings.embedding_model)
+        log.info("embedding_model_loaded", model=settings.embedding_model)
+    return _model
+
+
+def embed_texts(texts: list[str]) -> list[list[float]]:
+    """Generate embeddings for a list of texts."""
+    model = get_model()
+    embeddings = model.encode(texts, show_progress_bar=False, normalize_embeddings=True)
+    return embeddings.tolist()
+
+
+def embed_query(query: str) -> list[float]:
+    """Generate embedding for a single query."""
+    return embed_texts([query])[0]
--- a/src/atocore/retrieval/retriever.py
+++ b/src/atocore/retrieval/retriever.py
@@ -0,0 +1,83 @@
+"""Retrieval: query → ranked chunks."""
+
+import time
+from dataclasses import dataclass
+
+from atocore.config import settings
+from atocore.observability.logger import get_logger
+from atocore.retrieval.embeddings import embed_query
+from atocore.retrieval.vector_store import get_vector_store
+
+log = get_logger("retriever")
+
+
+@dataclass
+class ChunkResult:
+    chunk_id: str
+    content: str
+    score: float
+    heading_path: str
+    source_file: str
+    tags: str
+    title: str
+    document_id: str
+
+
+def retrieve(
+    query: str,
+    top_k: int | None = None,
+    filter_tags: list[str] | None = None,
+) -> list[ChunkResult]:
+    """Retrieve the most relevant chunks for a query."""
+    top_k = top_k or settings.context_top_k
+    start = time.time()
+
+    query_embedding = embed_query(query)
+    store = get_vector_store()
+
+    # Build filter
+    where = None
+    if filter_tags:
+        # ChromaDB where filter for tags (stored as JSON string)
+        # Simple contains check — works for single-tag filtering
+        where = {"tags": {"$contains": filter_tags[0]}}
+
+    results = store.query(
+        query_embedding=query_embedding,
+        top_k=top_k,
+        where=where,
+    )
+
+    chunks = []
+    if results and results["ids"] and results["ids"][0]:
+        for i, chunk_id in enumerate(results["ids"][0]):
+            # ChromaDB returns distances (lower = more similar for cosine)
+            # Convert to similarity score (1 - distance)
+            distance = results["distances"][0][i] if results["distances"] else 0
+            score = 1.0 - distance
+            meta = results["metadatas"][0][i] if results["metadatas"] else {}
+            content = results["documents"][0][i] if results["documents"] else ""
+
+            chunks.append(
+                ChunkResult(
+                    chunk_id=chunk_id,
+                    content=content,
+                    score=round(score, 4),
+                    heading_path=meta.get("heading_path", ""),
+                    source_file=meta.get("source_file", ""),
+                    tags=meta.get("tags", "[]"),
+                    title=meta.get("title", ""),
+                    document_id=meta.get("document_id", ""),
+                )
+            )
+
+    duration_ms = int((time.time() - start) * 1000)
+    log.info(
+        "retrieval_done",
+        query=query[:100],
+        top_k=top_k,
+        results_count=len(chunks),
+        duration_ms=duration_ms,
+    )
+
+    return chunks
--- a/src/atocore/retrieval/vector_store.py
+++ b/src/atocore/retrieval/vector_store.py
@@ -0,0 +1,77 @@
+"""ChromaDB vector store wrapper."""
+
+import chromadb
+
+from atocore.config import settings
+from atocore.observability.logger import get_logger
+from atocore.retrieval.embeddings import embed_texts
+
+log = get_logger("vector_store")
+
+COLLECTION_NAME = "atocore_chunks"
+
+_store: "VectorStore | None" = None
+
+
+class VectorStore:
+    """Wrapper around ChromaDB for chunk storage and retrieval."""
+
+    def __init__(self) -> None:
+        settings.chroma_path.mkdir(parents=True, exist_ok=True)
+        self._client = chromadb.PersistentClient(path=str(settings.chroma_path))
+        self._collection = self._client.get_or_create_collection(
+            name=COLLECTION_NAME,
+            metadata={"hnsw:space": "cosine"},
+        )
+        log.info("vector_store_initialized", path=str(settings.chroma_path))
+
+    def add(
+        self,
+        ids: list[str],
+        documents: list[str],
+        metadatas: list[dict],
+    ) -> None:
+        """Add chunks with embeddings to the store."""
+        embeddings = embed_texts(documents)
+        self._collection.add(
+            ids=ids,
+            embeddings=embeddings,
+            documents=documents,
+            metadatas=metadatas,
+        )
+        log.debug("vectors_added", count=len(ids))
+
+    def query(
+        self,
+        query_embedding: list[float],
+        top_k: int = 10,
+        where: dict | None = None,
+    ) -> dict:
+        """Query the store for similar chunks."""
+        kwargs: dict = {
+            "query_embeddings": [query_embedding],
+            "n_results": top_k,
+            "include": ["documents", "metadatas", "distances"],
+        }
+        if where:
+            kwargs["where"] = where
+
+        return self._collection.query(**kwargs)
+
+    def delete(self, ids: list[str]) -> None:
+        """Delete chunks by IDs."""
+        if ids:
+            self._collection.delete(ids=ids)
+            log.debug("vectors_deleted", count=len(ids))
+
+    @property
+    def count(self) -> int:
+        return self._collection.count()
+
+
+def get_vector_store() -> VectorStore:
+    """Get or create the singleton vector store."""
+    global _store
+    if _store is None:
+        _store = VectorStore()
+    return _store