feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
8
.env.example
Normal file
8
.env.example
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
ATOCORE_DEBUG=false
|
||||||
|
ATOCORE_DATA_DIR=./data
|
||||||
|
ATOCORE_HOST=127.0.0.1
|
||||||
|
ATOCORE_PORT=8100
|
||||||
|
ATOCORE_EMBEDDING_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
|
||||||
|
ATOCORE_CHUNK_MAX_SIZE=800
|
||||||
|
ATOCORE_CHUNK_OVERLAP=100
|
||||||
|
ATOCORE_CONTEXT_BUDGET=3000
|
||||||
12
.gitignore
vendored
Normal file
12
.gitignore
vendored
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
data/
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
.env
|
||||||
|
*.egg-info/
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
.pytest_cache/
|
||||||
|
htmlcov/
|
||||||
|
.coverage
|
||||||
|
venv/
|
||||||
|
.venv/
|
||||||
36
pyproject.toml
Normal file
36
pyproject.toml
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=68.0", "wheel"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "atocore"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Personal context engine for LLM interactions"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
dependencies = [
|
||||||
|
"fastapi>=0.110.0",
|
||||||
|
"uvicorn[standard]>=0.27.0",
|
||||||
|
"python-frontmatter>=1.1.0",
|
||||||
|
"chromadb>=0.4.22",
|
||||||
|
"sentence-transformers>=2.5.0",
|
||||||
|
"pydantic>=2.6.0",
|
||||||
|
"pydantic-settings>=2.1.0",
|
||||||
|
"structlog>=24.1.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"pytest>=8.0.0",
|
||||||
|
"pytest-cov>=4.1.0",
|
||||||
|
"httpx>=0.27.0",
|
||||||
|
"pyyaml>=6.0.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
where = ["src"]
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
testpaths = ["tests"]
|
||||||
|
python_files = ["test_*.py"]
|
||||||
|
python_functions = ["test_*"]
|
||||||
|
addopts = "--cov=atocore --cov-report=term-missing -v"
|
||||||
5
requirements-dev.txt
Normal file
5
requirements-dev.txt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
-r requirements.txt
|
||||||
|
pytest>=8.0.0
|
||||||
|
pytest-cov>=4.1.0
|
||||||
|
httpx>=0.27.0
|
||||||
|
pyyaml>=6.0.0
|
||||||
8
requirements.txt
Normal file
8
requirements.txt
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
fastapi>=0.110.0
|
||||||
|
uvicorn[standard]>=0.27.0
|
||||||
|
python-frontmatter>=1.1.0
|
||||||
|
chromadb>=0.4.22
|
||||||
|
sentence-transformers>=2.5.0
|
||||||
|
pydantic>=2.6.0
|
||||||
|
pydantic-settings>=2.1.0
|
||||||
|
structlog>=24.1.0
|
||||||
54
scripts/ingest_folder.py
Normal file
54
scripts/ingest_folder.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
"""CLI script to ingest a folder of markdown files."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add src to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||||
|
|
||||||
|
from atocore.ingestion.pipeline import ingest_folder
|
||||||
|
from atocore.models.database import init_db
|
||||||
|
from atocore.observability.logger import setup_logging
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Ingest the folder given by --path, then print a summary of results.

    Exits with status 1 when the path is not a directory. Per-file errors
    are reported at the end without aborting the run.
    """
    parser = argparse.ArgumentParser(description="Ingest markdown files into AtoCore")
    parser.add_argument("--path", required=True, help="Path to folder with markdown files")
    args = parser.parse_args()

    setup_logging()
    init_db()

    folder = Path(args.path)
    if not folder.is_dir():
        print(f"Error: {folder} is not a directory")
        sys.exit(1)

    results = ingest_folder(folder)

    # Summary counts by per-file status reported by the pipeline.
    ingested = sum(1 for r in results if r["status"] == "ingested")
    skipped = sum(1 for r in results if r["status"] == "skipped")
    errors = sum(1 for r in results if r["status"] == "error")
    total_chunks = sum(r.get("chunks", 0) for r in results)

    print(f"\n{'=' * 50}")
    print("Ingestion complete:")  # was f-string with no placeholders (F541)
    print(f"  Files processed: {len(results)}")
    print(f"  Ingested: {ingested}")
    print(f"  Skipped (unchanged): {skipped}")
    print(f"  Errors: {errors}")
    print(f"  Total chunks created: {total_chunks}")
    print(f"{'=' * 50}")

    if errors:
        print("\nErrors:")
        for r in results:
            if r["status"] == "error":
                print(f"  {r['file']}: {r['error']}")


if __name__ == "__main__":
    main()
|
||||||
76
scripts/query_test.py
Normal file
76
scripts/query_test.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
"""CLI script to run test prompts and compare baseline vs enriched."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Add src to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||||
|
|
||||||
|
from atocore.context.builder import build_context
|
||||||
|
from atocore.models.database import init_db
|
||||||
|
from atocore.observability.logger import setup_logging
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run each test prompt through the context builder and print diagnostics.

    Loads prompts from a YAML file (default: tests/test_prompts/prompts.yaml)
    and, for each one, prints retrieval stats and a preview of the top chunks
    so retrieval quality can be assessed manually.
    """
    parser = argparse.ArgumentParser(description="Run test prompts against AtoCore")
    parser.add_argument(
        "--prompts",
        default=str(Path(__file__).parent.parent / "tests" / "test_prompts" / "prompts.yaml"),
        help="Path to prompts YAML file",
    )
    args = parser.parse_args()

    setup_logging()
    init_db()

    prompts_path = Path(args.prompts)
    if not prompts_path.exists():
        print(f"Error: {prompts_path} not found")
        sys.exit(1)

    with open(prompts_path) as f:
        data = yaml.safe_load(f)

    prompts = data.get("prompts", [])
    print(f"Running {len(prompts)} test prompts...\n")

    for p in prompts:
        prompt_id = p["id"]
        prompt_text = p["prompt"]
        project = p.get("project")
        expected = p.get("expected", "")

        print(f"{'=' * 60}")
        print(f"[{prompt_id}] {prompt_text}")
        print(f"Project: {project or 'none'}")
        print(f"Expected: {expected}")
        print("-" * 60)  # was f"-" * 60: useless f-string (F541)

        pack = build_context(
            user_prompt=prompt_text,
            project_hint=project,
        )

        print(f"Chunks retrieved: {len(pack.chunks_used)}")
        print(f"Total chars: {pack.total_chars} / {pack.budget}")
        print(f"Duration: {pack.duration_ms}ms")
        print()

        # Show the top few retrieved chunks for manual quality review.
        for i, chunk in enumerate(pack.chunks_used[:5]):
            print(f"  [{i+1}] Score: {chunk.score:.2f} | {chunk.source_file}")
            print(f"    Section: {chunk.heading_path}")
            print(f"    Preview: {chunk.content[:120]}...")
            print()

        print(f"Full prompt length: {len(pack.full_prompt)} chars")
        print()

    print(f"{'=' * 60}")
    print("Done. Review output above to assess retrieval quality.")


if __name__ == "__main__":
    main()
|
||||||
3
src/atocore/__init__.py
Normal file
3
src/atocore/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
"""AtoCore — Personal Context Engine."""
|
||||||
|
|
||||||
|
__version__ = "0.1.0"
|
||||||
0
src/atocore/api/__init__.py
Normal file
0
src/atocore/api/__init__.py
Normal file
132
src/atocore/api/routes.py
Normal file
132
src/atocore/api/routes.py
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
"""FastAPI route definitions."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from atocore.context.builder import (
|
||||||
|
ContextPack,
|
||||||
|
build_context,
|
||||||
|
get_last_context_pack,
|
||||||
|
_pack_to_dict,
|
||||||
|
)
|
||||||
|
from atocore.ingestion.pipeline import ingest_file, ingest_folder
|
||||||
|
from atocore.retrieval.retriever import retrieve
|
||||||
|
from atocore.retrieval.vector_store import get_vector_store
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
# --- Request/Response models ---
|
||||||
|
|
||||||
|
|
||||||
|
class IngestRequest(BaseModel):
    """Body for POST /ingest."""

    path: str  # file or folder path


class IngestResponse(BaseModel):
    """Per-file ingestion results, as returned by the ingestion pipeline."""

    results: list[dict]


class QueryRequest(BaseModel):
    """Body for POST /query."""

    prompt: str
    top_k: int = 10
    filter_tags: list[str] | None = None


class QueryResponse(BaseModel):
    """Chunks returned by retrieval for the query prompt."""

    results: list[dict]


class ContextBuildRequest(BaseModel):
    """Body for POST /context/build."""

    prompt: str
    project: str | None = None  # optional project hint used to boost ranking
    budget: int | None = None  # character-budget override for the context pack


class ContextBuildResponse(BaseModel):
    """Summary of the assembled context pack plus the full prompt to send."""

    formatted_context: str
    full_prompt: str
    chunks_used: int
    total_chars: int
    budget: int
    budget_remaining: int
    duration_ms: int
    chunks: list[dict]
|
||||||
|
|
||||||
|
|
||||||
|
# --- Endpoints ---
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/ingest", response_model=IngestResponse)
|
||||||
|
def api_ingest(req: IngestRequest):
|
||||||
|
"""Ingest a markdown file or folder."""
|
||||||
|
target = Path(req.path)
|
||||||
|
if target.is_file():
|
||||||
|
results = [ingest_file(target)]
|
||||||
|
elif target.is_dir():
|
||||||
|
results = ingest_folder(target)
|
||||||
|
else:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Path not found: {req.path}")
|
||||||
|
return IngestResponse(results=results)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/query", response_model=QueryResponse)
|
||||||
|
def api_query(req: QueryRequest):
|
||||||
|
"""Retrieve relevant chunks for a prompt."""
|
||||||
|
chunks = retrieve(req.prompt, top_k=req.top_k, filter_tags=req.filter_tags)
|
||||||
|
return QueryResponse(
|
||||||
|
results=[
|
||||||
|
{
|
||||||
|
"chunk_id": c.chunk_id,
|
||||||
|
"content": c.content,
|
||||||
|
"score": c.score,
|
||||||
|
"heading_path": c.heading_path,
|
||||||
|
"source_file": c.source_file,
|
||||||
|
"title": c.title,
|
||||||
|
}
|
||||||
|
for c in chunks
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/context/build", response_model=ContextBuildResponse)
|
||||||
|
def api_build_context(req: ContextBuildRequest):
|
||||||
|
"""Build a full context pack for a prompt."""
|
||||||
|
pack = build_context(
|
||||||
|
user_prompt=req.prompt,
|
||||||
|
project_hint=req.project,
|
||||||
|
budget=req.budget,
|
||||||
|
)
|
||||||
|
pack_dict = _pack_to_dict(pack)
|
||||||
|
return ContextBuildResponse(
|
||||||
|
formatted_context=pack.formatted_context,
|
||||||
|
full_prompt=pack.full_prompt,
|
||||||
|
chunks_used=len(pack.chunks_used),
|
||||||
|
total_chars=pack.total_chars,
|
||||||
|
budget=pack.budget,
|
||||||
|
budget_remaining=pack.budget_remaining,
|
||||||
|
duration_ms=pack.duration_ms,
|
||||||
|
chunks=pack_dict["chunks"],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/health")
|
||||||
|
def api_health():
|
||||||
|
"""Health check."""
|
||||||
|
store = get_vector_store()
|
||||||
|
return {
|
||||||
|
"status": "ok",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"vectors_count": store.count,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/debug/context")
|
||||||
|
def api_debug_context():
|
||||||
|
"""Inspect the last assembled context pack."""
|
||||||
|
pack = get_last_context_pack()
|
||||||
|
if pack is None:
|
||||||
|
return {"message": "No context pack built yet."}
|
||||||
|
return _pack_to_dict(pack)
|
||||||
39
src/atocore/config.py
Normal file
39
src/atocore/config.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
"""AtoCore configuration via environment variables."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pydantic_settings import BaseSettings
|
||||||
|
|
||||||
|
|
||||||
|
class Settings(BaseSettings):
    """AtoCore runtime settings.

    Every field can be overridden via an ``ATOCORE_``-prefixed environment
    variable (see ``model_config``), e.g. ``ATOCORE_PORT=9000``.
    """

    # Server / storage
    debug: bool = False
    data_dir: Path = Path("./data")  # root for the SQLite DB and Chroma store
    host: str = "127.0.0.1"
    port: int = 8100

    # Embedding
    embedding_model: str = (
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )

    # Chunking
    chunk_max_size: int = 800  # max chunk length in characters
    chunk_overlap: int = 100  # overlap used when hard-splitting oversized text
    chunk_min_size: int = 50  # chunks shorter than this are dropped

    # Context
    context_budget: int = 3000  # character budget for an assembled context pack
    context_top_k: int = 15  # candidate chunks fetched per retrieval

    # pydantic-settings: read environment variables with this prefix.
    model_config = {"env_prefix": "ATOCORE_"}

    @property
    def db_path(self) -> Path:
        """Location of the SQLite database file."""
        return self.data_dir / "atocore.db"

    @property
    def chroma_path(self) -> Path:
        """Location of the ChromaDB persistence directory."""
        return self.data_dir / "chroma"


# Module-level singleton imported throughout the codebase.
settings = Settings()
|
||||||
0
src/atocore/context/__init__.py
Normal file
0
src/atocore/context/__init__.py
Normal file
212
src/atocore/context/builder.py
Normal file
212
src/atocore/context/builder.py
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
"""Context pack assembly: retrieve, rank, budget, format."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
from atocore.observability.logger import get_logger
|
||||||
|
from atocore.retrieval.retriever import ChunkResult, retrieve
|
||||||
|
|
||||||
|
log = get_logger("context_builder")
|
||||||
|
|
||||||
|
SYSTEM_PREFIX = (
|
||||||
|
"You have access to the following personal context from the user's knowledge base.\n"
|
||||||
|
"Use it to inform your answer. If the context is not relevant, ignore it.\n"
|
||||||
|
"Do not mention the context system unless asked."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Last built context pack for debug inspection
|
||||||
|
_last_context_pack: "ContextPack | None" = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ContextChunk:
    """A retrieved chunk selected for inclusion in a context pack."""

    content: str  # chunk text, included verbatim in the formatted context
    source_file: str  # shortened display path of the originating file
    heading_path: str  # heading trail within the source document
    score: float  # final ranking score (similarity plus any project boost)
    char_count: int  # len(content); used for budget accounting
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ContextPack:
    """The assembled result of one build_context() call."""

    chunks_used: list[ContextChunk] = field(default_factory=list)
    total_chars: int = 0  # sum of char_count over chunks_used
    budget: int = 0  # character budget that was applied
    budget_remaining: int = 0  # budget - total_chars
    formatted_context: str = ""  # the "--- AtoCore Context ---" block
    full_prompt: str = ""  # system prefix + formatted context + user prompt
    query: str = ""  # user prompt used for retrieval
    project_hint: str = ""  # project hint, or "" when none was given
    duration_ms: int = 0  # wall-clock build time in milliseconds
|
||||||
|
|
||||||
|
|
||||||
|
def build_context(
    user_prompt: str,
    project_hint: str | None = None,
    budget: int | None = None,
) -> ContextPack:
    """Assemble a context pack: retrieve, rank, budget-select, format.

    The resulting pack is also stashed module-wide for /debug/context.
    """
    global _last_context_pack

    started_at = time.time()
    char_budget = budget or settings.context_budget

    # Retrieve candidates, rank them (with project boosting), then keep
    # as many top chunks as fit within the character budget.
    candidates = retrieve(user_prompt, top_k=settings.context_top_k)
    ranked = _rank_chunks(candidates, project_hint)
    chosen = _select_within_budget(ranked, char_budget)

    # Render the context block and splice it into the final prompt.
    context_block = _format_context_block(chosen)
    assembled_prompt = f"{SYSTEM_PREFIX}\n\n{context_block}\n\n{user_prompt}"

    chars_used = sum(c.char_count for c in chosen)
    elapsed_ms = int((time.time() - started_at) * 1000)

    pack = ContextPack(
        chunks_used=chosen,
        total_chars=chars_used,
        budget=char_budget,
        budget_remaining=char_budget - chars_used,
        formatted_context=context_block,
        full_prompt=assembled_prompt,
        query=user_prompt,
        project_hint=project_hint or "",
        duration_ms=elapsed_ms,
    )
    _last_context_pack = pack

    log.info(
        "context_built",
        chunks_used=len(chosen),
        total_chars=chars_used,
        budget_remaining=char_budget - chars_used,
        duration_ms=elapsed_ms,
    )
    log.debug("context_pack_detail", pack=_pack_to_dict(pack))

    return pack
|
||||||
|
|
||||||
|
|
||||||
|
def get_last_context_pack() -> ContextPack | None:
    """Return the last built context pack for debug inspection.

    Read-only view of the module-level ``_last_context_pack`` set by
    ``build_context``; ``None`` until the first pack has been built.
    """
    return _last_context_pack
|
||||||
|
|
||||||
|
|
||||||
|
def _rank_chunks(
    candidates: list[ChunkResult],
    project_hint: str | None,
) -> list[tuple[float, ChunkResult]]:
    """Rank candidates with boosting for project match.

    Near-duplicates (same first 200 chars) are dropped, keeping the first
    occurrence. Returns (score, chunk) pairs, highest score first.
    """
    ranked: list[tuple[float, ChunkResult]] = []
    seen_prefixes: set[str] = set()

    for candidate in candidates:
        # Deduplicate by content prefix (first 200 chars).
        prefix = candidate.content[:200]
        if prefix in seen_prefixes:
            continue
        seen_prefixes.add(prefix)

        score = candidate.score  # similarity baseline
        if project_hint:
            needle = project_hint.lower()
            haystacks = (
                candidate.tags.lower() if candidate.tags else "",
                candidate.source_file.lower(),
                candidate.title.lower() if candidate.title else "",
            )
            if any(needle in hay for hay in haystacks):
                score += 0.3  # flat boost for an apparent project match

        ranked.append((score, candidate))

    # Highest score first; stable sort keeps retrieval order among ties.
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return ranked
|
||||||
|
|
||||||
|
|
||||||
|
def _select_within_budget(
    scored: list[tuple[float, ChunkResult]],
    budget: int,
) -> list[ContextChunk]:
    """Select top chunks that fit within the character budget."""
    picked: list[ContextChunk] = []
    consumed = 0

    for rank_score, candidate in scored:
        size = len(candidate.content)
        if consumed + size > budget:
            # Skip rather than stop: a smaller chunk later may still fit.
            continue
        consumed += size
        picked.append(
            ContextChunk(
                content=candidate.content,
                source_file=_shorten_path(candidate.source_file),
                heading_path=candidate.heading_path,
                score=rank_score,
                char_count=size,
            )
        )

    return picked
|
||||||
|
|
||||||
|
|
||||||
|
def _format_context_block(chunks: list[ContextChunk]) -> str:
    """Format chunks into the context block string."""
    header = "--- AtoCore Context ---"
    footer = "--- End Context ---"

    if not chunks:
        return f"{header}\nNo relevant context found.\n{footer}"

    parts = [header]
    for chunk in chunks:
        parts.extend(
            (
                f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]",
                chunk.content,
                "",  # blank separator line after each chunk
            )
        )
    parts.append(footer)
    return "\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _shorten_path(path: str) -> str:
    """Shorten an absolute path to a relative-like display.

    Keeps at most the last three path components.
    """
    segments = Path(path).parts
    if len(segments) <= 3:
        return str(Path(path))
    return str(Path(*segments[-3:]))
|
||||||
|
|
||||||
|
|
||||||
|
def _pack_to_dict(pack: ContextPack) -> dict:
    """Convert a context pack to a JSON-serializable dict."""
    chunk_rows = []
    for chunk in pack.chunks_used:
        chunk_rows.append(
            {
                "source_file": chunk.source_file,
                "heading_path": chunk.heading_path,
                "score": chunk.score,
                "char_count": chunk.char_count,
                "content_preview": chunk.content[:100],  # keep debug output small
            }
        )

    return {
        "query": pack.query,
        "project_hint": pack.project_hint,
        "chunks_used": len(pack.chunks_used),
        "total_chars": pack.total_chars,
        "budget": pack.budget,
        "budget_remaining": pack.budget_remaining,
        "duration_ms": pack.duration_ms,
        "chunks": chunk_rows,
    }
|
||||||
0
src/atocore/ingestion/__init__.py
Normal file
0
src/atocore/ingestion/__init__.py
Normal file
146
src/atocore/ingestion/chunker.py
Normal file
146
src/atocore/ingestion/chunker.py
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
"""Heading-aware recursive markdown chunking."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Chunk:
|
||||||
|
content: str
|
||||||
|
chunk_index: int
|
||||||
|
heading_path: str
|
||||||
|
char_count: int
|
||||||
|
metadata: dict = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_markdown(
|
||||||
|
body: str,
|
||||||
|
base_metadata: dict | None = None,
|
||||||
|
max_size: int | None = None,
|
||||||
|
overlap: int | None = None,
|
||||||
|
min_size: int | None = None,
|
||||||
|
) -> list[Chunk]:
|
||||||
|
"""Split markdown body into chunks using heading-aware strategy.
|
||||||
|
|
||||||
|
1. Split on H2 boundaries
|
||||||
|
2. If section > max_size, split on H3
|
||||||
|
3. If still > max_size, split on paragraph breaks
|
||||||
|
4. If still > max_size, hard split with overlap
|
||||||
|
"""
|
||||||
|
max_size = max_size or settings.chunk_max_size
|
||||||
|
overlap = overlap or settings.chunk_overlap
|
||||||
|
min_size = min_size or settings.chunk_min_size
|
||||||
|
base_metadata = base_metadata or {}
|
||||||
|
|
||||||
|
sections = _split_by_heading(body, level=2)
|
||||||
|
raw_chunks: list[tuple[str, str]] = [] # (heading_path, content)
|
||||||
|
|
||||||
|
for heading, content in sections:
|
||||||
|
if len(content) <= max_size:
|
||||||
|
raw_chunks.append((heading, content))
|
||||||
|
else:
|
||||||
|
# Try splitting on H3
|
||||||
|
subsections = _split_by_heading(content, level=3)
|
||||||
|
for sub_heading, sub_content in subsections:
|
||||||
|
full_path = (
|
||||||
|
f"{heading} > {sub_heading}" if heading and sub_heading else heading or sub_heading
|
||||||
|
)
|
||||||
|
if len(sub_content) <= max_size:
|
||||||
|
raw_chunks.append((full_path, sub_content))
|
||||||
|
else:
|
||||||
|
# Split on paragraphs
|
||||||
|
para_chunks = _split_by_paragraphs(
|
||||||
|
sub_content, max_size, overlap
|
||||||
|
)
|
||||||
|
for pc in para_chunks:
|
||||||
|
raw_chunks.append((full_path, pc))
|
||||||
|
|
||||||
|
# Build final chunks, filtering out too-small ones
|
||||||
|
chunks = []
|
||||||
|
idx = 0
|
||||||
|
for heading_path, content in raw_chunks:
|
||||||
|
content = content.strip()
|
||||||
|
if len(content) < min_size:
|
||||||
|
continue
|
||||||
|
chunks.append(
|
||||||
|
Chunk(
|
||||||
|
content=content,
|
||||||
|
chunk_index=idx,
|
||||||
|
heading_path=heading_path,
|
||||||
|
char_count=len(content),
|
||||||
|
metadata={**base_metadata},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
idx += 1
|
||||||
|
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def _split_by_heading(text: str, level: int) -> list[tuple[str, str]]:
|
||||||
|
"""Split text by heading level. Returns (heading_text, section_content) pairs."""
|
||||||
|
pattern = rf"^({'#' * level})\s+(.+)$"
|
||||||
|
parts: list[tuple[str, str]] = []
|
||||||
|
current_heading = ""
|
||||||
|
current_lines: list[str] = []
|
||||||
|
|
||||||
|
for line in text.split("\n"):
|
||||||
|
match = re.match(pattern, line)
|
||||||
|
if match:
|
||||||
|
# Save previous section
|
||||||
|
if current_lines:
|
||||||
|
parts.append((current_heading, "\n".join(current_lines)))
|
||||||
|
current_heading = match.group(2).strip()
|
||||||
|
current_lines = []
|
||||||
|
else:
|
||||||
|
current_lines.append(line)
|
||||||
|
|
||||||
|
# Save last section
|
||||||
|
if current_lines:
|
||||||
|
parts.append((current_heading, "\n".join(current_lines)))
|
||||||
|
|
||||||
|
return parts
|
||||||
|
|
||||||
|
|
||||||
|
def _split_by_paragraphs(
|
||||||
|
text: str, max_size: int, overlap: int
|
||||||
|
) -> list[str]:
|
||||||
|
"""Split text by paragraph breaks, then hard-split if needed."""
|
||||||
|
paragraphs = re.split(r"\n\n+", text)
|
||||||
|
chunks: list[str] = []
|
||||||
|
current = ""
|
||||||
|
|
||||||
|
for para in paragraphs:
|
||||||
|
para = para.strip()
|
||||||
|
if not para:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if len(current) + len(para) + 2 <= max_size:
|
||||||
|
current = f"{current}\n\n{para}" if current else para
|
||||||
|
else:
|
||||||
|
if current:
|
||||||
|
chunks.append(current)
|
||||||
|
# If single paragraph exceeds max, hard split
|
||||||
|
if len(para) > max_size:
|
||||||
|
chunks.extend(_hard_split(para, max_size, overlap))
|
||||||
|
else:
|
||||||
|
current = para
|
||||||
|
continue
|
||||||
|
current = ""
|
||||||
|
|
||||||
|
if current:
|
||||||
|
chunks.append(current)
|
||||||
|
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def _hard_split(text: str, max_size: int, overlap: int) -> list[str]:
|
||||||
|
"""Hard split text at max_size with overlap."""
|
||||||
|
chunks = []
|
||||||
|
start = 0
|
||||||
|
while start < len(text):
|
||||||
|
end = start + max_size
|
||||||
|
chunks.append(text[start:end])
|
||||||
|
start = end - overlap
|
||||||
|
return chunks
|
||||||
65
src/atocore/ingestion/parser.py
Normal file
65
src/atocore/ingestion/parser.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
"""Markdown file parsing with frontmatter extraction."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import frontmatter
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ParsedDocument:
    """Result of parsing one markdown file."""

    file_path: str  # absolute, resolved path to the source file
    title: str  # first H1, or a title-cased form of the filename
    body: str  # markdown content with frontmatter stripped
    tags: list[str] = field(default_factory=list)  # from frontmatter "tags"
    frontmatter: dict = field(default_factory=dict)  # full frontmatter mapping
    headings: list[tuple[int, str]] = field(default_factory=list)  # (level, text)
||||||
|
|
||||||
|
|
||||||
|
def parse_markdown(file_path: Path) -> ParsedDocument:
    """Parse a markdown file, extracting frontmatter and structure."""
    raw = file_path.read_text(encoding="utf-8")
    post = frontmatter.loads(raw)

    metadata = dict(post.metadata) if post.metadata else {}
    content = post.content.strip()

    # Tags may arrive as a YAML list or as a comma-separated string.
    raw_tags = metadata.get("tags", [])
    if isinstance(raw_tags, str):
        raw_tags = [t.strip() for t in raw_tags.split(",") if t.strip()]
    doc_tags = raw_tags or []

    return ParsedDocument(
        file_path=str(file_path.resolve()),
        title=_extract_title(content, file_path),
        body=content,
        tags=doc_tags,
        frontmatter=metadata,
        headings=_extract_headings(content),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_title(body: str, file_path: Path) -> str:
    """Get title from first H1 or fallback to filename."""
    h1 = re.search(r"^#\s+(.+)$", body, re.MULTILINE)
    if h1 is not None:
        return h1.group(1).strip()
    # No H1: derive a readable title from the file stem.
    cleaned = file_path.stem.replace("_", " ").replace("-", " ")
    return cleaned.title()
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_headings(body: str) -> list[tuple[int, str]]:
    """Extract headings of levels 1–4 as (level, text) pairs, in order.

    Note: the regex deliberately stops at ``#{1,4}``, so H5/H6 headings are
    not captured — the previous docstring claimed "all headings", which was
    inaccurate.
    """
    return [
        (len(m.group(1)), m.group(2).strip())
        for m in re.finditer(r"^(#{1,4})\s+(.+)$", body, re.MULTILINE)
    ]
|
||||||
157
src/atocore/ingestion/pipeline.py
Normal file
157
src/atocore/ingestion/pipeline.py
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
"""Ingestion pipeline: parse → chunk → embed → store."""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
from atocore.ingestion.chunker import chunk_markdown
|
||||||
|
from atocore.ingestion.parser import parse_markdown
|
||||||
|
from atocore.models.database import get_connection
|
||||||
|
from atocore.observability.logger import get_logger
|
||||||
|
from atocore.retrieval.vector_store import get_vector_store
|
||||||
|
|
||||||
|
log = get_logger("ingestion")
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_file(file_path: Path) -> dict:
    """Ingest a single markdown file: parse → chunk → store in SQLite → embed.

    Returns a stats dict with a ``status`` of ``"skipped"`` (unchanged hash),
    ``"empty"`` (no chunks produced), or ``"ingested"``.

    Raises:
        FileNotFoundError: if *file_path* does not exist.
        ValueError: if the file does not have a ``.md``/``.markdown`` suffix.
    """
    start = time.time()
    file_path = file_path.resolve()

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    if file_path.suffix.lower() not in (".md", ".markdown"):
        raise ValueError(f"Not a markdown file: {file_path}")

    # Read and hash: the SHA-256 of the raw text is used for change detection.
    raw_content = file_path.read_text(encoding="utf-8")
    file_hash = hashlib.sha256(raw_content.encode()).hexdigest()

    # Check if already ingested and unchanged.
    with get_connection() as conn:
        existing = conn.execute(
            "SELECT id, file_hash FROM source_documents WHERE file_path = ?",
            (str(file_path),),
        ).fetchone()

    if existing and existing["file_hash"] == file_hash:
        log.info("file_skipped_unchanged", file_path=str(file_path))
        return {"file": str(file_path), "status": "skipped", "reason": "unchanged"}

    # Parse frontmatter/body, then chunk the body with source metadata attached.
    parsed = parse_markdown(file_path)

    base_meta = {
        "source_file": str(file_path),
        "tags": parsed.tags,
        "title": parsed.title,
    }
    chunks = chunk_markdown(parsed.body, base_metadata=base_meta)

    if not chunks:
        log.warning("no_chunks_created", file_path=str(file_path))
        return {"file": str(file_path), "status": "empty", "chunks": 0}

    # Store in DB and vector store.
    doc_id = str(uuid.uuid4())
    vector_store = get_vector_store()

    with get_connection() as conn:
        # Remove old data if re-ingesting: keep the existing document id so
        # external references stay valid, replace its chunks wholesale.
        if existing:
            doc_id = existing["id"]
            old_chunk_ids = [
                row["id"]
                for row in conn.execute(
                    "SELECT id FROM source_chunks WHERE document_id = ?",
                    (doc_id,),
                ).fetchall()
            ]
            conn.execute(
                "DELETE FROM source_chunks WHERE document_id = ?", (doc_id,)
            )
            conn.execute(
                "UPDATE source_documents SET file_hash = ?, title = ?, tags = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                (file_hash, parsed.title, json.dumps(parsed.tags), doc_id),
            )
            # Remove old vectors so the vector store mirrors the DB state.
            if old_chunk_ids:
                vector_store.delete(old_chunk_ids)
        else:
            conn.execute(
                "INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags) VALUES (?, ?, ?, ?, ?, ?)",
                (doc_id, str(file_path), file_hash, parsed.title, "markdown", json.dumps(parsed.tags)),
            )

        # Insert chunks, accumulating the parallel lists the vector store needs.
        chunk_ids = []
        chunk_contents = []
        chunk_metadatas = []

        for chunk in chunks:
            chunk_id = str(uuid.uuid4())
            chunk_ids.append(chunk_id)
            chunk_contents.append(chunk.content)
            chunk_metadatas.append({
                "document_id": doc_id,
                "heading_path": chunk.heading_path,
                "source_file": str(file_path),
                # Tags are serialised to a JSON string — vector-store metadata
                # values are presumably scalar-only; verify against the store.
                "tags": json.dumps(parsed.tags),
                "title": parsed.title,
            })

            conn.execute(
                "INSERT INTO source_chunks (id, document_id, chunk_index, content, heading_path, char_count, metadata) VALUES (?, ?, ?, ?, ?, ?, ?)",
                (
                    chunk_id,
                    doc_id,
                    chunk.chunk_index,
                    chunk.content,
                    chunk.heading_path,
                    chunk.char_count,
                    json.dumps(chunk.metadata),
                ),
            )

    # Store embeddings.
    # NOTE(review): this runs after the DB transaction commits — if embedding
    # fails, the DB rows exist without vectors. Confirm this ordering matches
    # the intended failure semantics.
    vector_store.add(chunk_ids, chunk_contents, chunk_metadatas)

    duration_ms = int((time.time() - start) * 1000)
    log.info(
        "file_ingested",
        file_path=str(file_path),
        chunks_created=len(chunks),
        duration_ms=duration_ms,
    )

    return {
        "file": str(file_path),
        "status": "ingested",
        "chunks": len(chunks),
        "duration_ms": duration_ms,
    }
|
|
||||||
|
def ingest_folder(folder_path: Path) -> list[dict]:
    """Recursively ingest every ``*.md`` file under *folder_path*.

    Each file yields one stats dict (see ingest_file); per-file failures are
    logged and recorded as ``{"status": "error"}`` entries instead of
    aborting the whole run.

    Raises:
        NotADirectoryError: if *folder_path* is not a directory.
    """
    folder_path = folder_path.resolve()
    if not folder_path.is_dir():
        raise NotADirectoryError(f"Not a directory: {folder_path}")

    results = []
    md_files = sorted(folder_path.rglob("*.md"))
    log.info("ingestion_started", folder=str(folder_path), file_count=len(md_files))

    for md_file in md_files:
        try:
            outcome = ingest_file(md_file)
        except Exception as e:
            # Best-effort batch: record the failure and keep going.
            log.error("ingestion_error", file_path=str(md_file), error=str(e))
            outcome = {"file": str(md_file), "status": "error", "error": str(e)}
        results.append(outcome)

    return results
33
src/atocore/main.py
Normal file
33
src/atocore/main.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
"""AtoCore — FastAPI application entry point."""
|
||||||
|
|
||||||
|
from fastapi import FastAPI
|
||||||
|
|
||||||
|
from atocore.api.routes import router
|
||||||
|
from atocore.config import settings
|
||||||
|
from atocore.models.database import init_db
|
||||||
|
from atocore.observability.logger import setup_logging
|
||||||
|
|
||||||
|
app = FastAPI(
|
||||||
|
title="AtoCore",
|
||||||
|
description="Personal Context Engine for LLM interactions",
|
||||||
|
version="0.1.0",
|
||||||
|
)
|
||||||
|
|
||||||
|
app.include_router(router)
|
||||||
|
|
||||||
|
|
||||||
|
@app.on_event("startup")
|
||||||
|
def startup():
|
||||||
|
setup_logging()
|
||||||
|
init_db()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import uvicorn
|
||||||
|
|
||||||
|
uvicorn.run(
|
||||||
|
"atocore.main:app",
|
||||||
|
host=settings.host,
|
||||||
|
port=settings.port,
|
||||||
|
reload=True,
|
||||||
|
)
|
||||||
0
src/atocore/models/__init__.py
Normal file
0
src/atocore/models/__init__.py
Normal file
98
src/atocore/models/database.py
Normal file
98
src/atocore/models/database.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
"""SQLite database schema and connection management."""
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Generator
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
from atocore.observability.logger import get_logger
|
||||||
|
|
||||||
|
log = get_logger("database")
|
||||||
|
|
||||||
|
# Full database schema, applied idempotently by init_db() — every statement
# uses IF NOT EXISTS. Tables: source_documents (one row per ingested file),
# source_chunks (chunked content, FK → source_documents with ON DELETE
# CASCADE), memories, projects, interactions; plus lookup indexes.
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS source_documents (
    id TEXT PRIMARY KEY,
    file_path TEXT UNIQUE NOT NULL,
    file_hash TEXT NOT NULL,
    title TEXT,
    doc_type TEXT DEFAULT 'markdown',
    tags TEXT DEFAULT '[]',
    ingested_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS source_chunks (
    id TEXT PRIMARY KEY,
    document_id TEXT NOT NULL REFERENCES source_documents(id) ON DELETE CASCADE,
    chunk_index INTEGER NOT NULL,
    content TEXT NOT NULL,
    heading_path TEXT DEFAULT '',
    char_count INTEGER NOT NULL,
    metadata TEXT DEFAULT '{}',
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS memories (
    id TEXT PRIMARY KEY,
    memory_type TEXT NOT NULL,
    content TEXT NOT NULL,
    source_chunk_id TEXT REFERENCES source_chunks(id),
    confidence REAL DEFAULT 1.0,
    status TEXT DEFAULT 'active',
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS projects (
    id TEXT PRIMARY KEY,
    name TEXT UNIQUE NOT NULL,
    description TEXT DEFAULT '',
    status TEXT DEFAULT 'active',
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS interactions (
    id TEXT PRIMARY KEY,
    prompt TEXT NOT NULL,
    context_pack TEXT DEFAULT '{}',
    response_summary TEXT DEFAULT '',
    project_id TEXT REFERENCES projects(id),
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_chunks_document ON source_chunks(document_id);
CREATE INDEX IF NOT EXISTS idx_memories_type ON memories(memory_type);
CREATE INDEX IF NOT EXISTS idx_memories_status ON memories(status);
CREATE INDEX IF NOT EXISTS idx_interactions_project ON interactions(project_id);
"""
|
|
||||||
|
|
||||||
|
def _ensure_data_dir() -> None:
    """Create the configured data directory (and parents) if it does not exist."""
    settings.data_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
|
def init_db() -> None:
    """Create the data directory and apply the database schema.

    Safe to call repeatedly — the schema statements are all ``IF NOT EXISTS``.
    """
    _ensure_data_dir()
    with get_connection() as db:
        db.executescript(SCHEMA_SQL)
    log.info("database_initialized", path=str(settings.db_path))
|
|
||||||
|
|
||||||
|
@contextmanager
def get_connection() -> Generator[sqlite3.Connection, None, None]:
    """Yield a configured SQLite connection; commit on success, rollback on error.

    The connection uses ``sqlite3.Row`` for name-based column access and has
    foreign-key enforcement switched on. It is always closed on exit.
    """
    _ensure_data_dir()
    db = sqlite3.connect(str(settings.db_path))
    db.row_factory = sqlite3.Row
    db.execute("PRAGMA foreign_keys = ON")
    try:
        # Commit stays inside the try so a failing commit also rolls back.
        yield db
        db.commit()
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()
0
src/atocore/observability/__init__.py
Normal file
0
src/atocore/observability/__init__.py
Normal file
41
src/atocore/observability/logger.py
Normal file
41
src/atocore/observability/logger.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
"""Structured logging for AtoCore."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
|
||||||
|
# Map level names to stdlib logging constants; consumed by setup_logging's
# filtering bound logger.
_LOG_LEVELS = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR,
}
||||||
|
|
||||||
|
|
||||||
|
def setup_logging() -> None:
    """Configure structlog: human-readable console output in debug mode, JSON otherwise."""
    level_name = "DEBUG" if settings.debug else "INFO"
    # Pick the final renderer once instead of inlining the conditional in the
    # processor list.
    renderer = (
        structlog.dev.ConsoleRenderer()
        if settings.debug
        else structlog.processors.JSONRenderer()
    )
    structlog.configure(
        processors=[
            structlog.contextvars.merge_contextvars,
            structlog.processors.add_log_level,
            structlog.processors.TimeStamper(fmt="iso"),
            renderer,
        ],
        wrapper_class=structlog.make_filtering_bound_logger(
            _LOG_LEVELS.get(level_name, logging.INFO)
        ),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )
|
|
||||||
|
|
||||||
|
def get_logger(name: str) -> structlog.BoundLogger:
    """Return a structlog bound logger for the given component *name*."""
    return structlog.get_logger(name)
||||||
0
src/atocore/retrieval/__init__.py
Normal file
0
src/atocore/retrieval/__init__.py
Normal file
32
src/atocore/retrieval/embeddings.py
Normal file
32
src/atocore/retrieval/embeddings.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
"""Embedding model management."""
|
||||||
|
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
from atocore.observability.logger import get_logger
|
||||||
|
|
||||||
|
log = get_logger("embeddings")
|
||||||
|
|
||||||
|
_model: SentenceTransformer | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_model() -> SentenceTransformer:
    """Load and cache the embedding model.

    The SentenceTransformer is loaded lazily on first call and cached in the
    module-level ``_model`` global, so subsequent calls are cheap.
    """
    global _model
    if _model is None:
        # First-use load can be slow (model download/deserialisation), so log
        # around it.
        log.info("loading_embedding_model", model=settings.embedding_model)
        _model = SentenceTransformer(settings.embedding_model)
        log.info("embedding_model_loaded", model=settings.embedding_model)
    return _model
||||||
|
|
||||||
|
|
||||||
|
def embed_texts(texts: list[str]) -> list[list[float]]:
    """Embed *texts* as unit-normalised vectors, returned as lists of floats."""
    vectors = get_model().encode(
        texts, show_progress_bar=False, normalize_embeddings=True
    )
    return vectors.tolist()
||||||
|
|
||||||
|
|
||||||
|
def embed_query(query: str) -> list[float]:
    """Generate the embedding for a single query string (delegates to embed_texts)."""
    return embed_texts([query])[0]
||||||
83
src/atocore/retrieval/retriever.py
Normal file
83
src/atocore/retrieval/retriever.py
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
"""Retrieval: query → ranked chunks."""
|
||||||
|
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
from atocore.observability.logger import get_logger
|
||||||
|
from atocore.retrieval.embeddings import embed_query
|
||||||
|
from atocore.retrieval.vector_store import get_vector_store
|
||||||
|
|
||||||
|
log = get_logger("retriever")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ChunkResult:
    """One retrieved chunk with its similarity score and source metadata."""

    chunk_id: str      # vector-store id (matches source_chunks.id)
    content: str       # chunk text
    score: float       # cosine similarity, rounded to 4 decimals (1 - distance)
    heading_path: str  # markdown heading breadcrumb of the chunk
    source_file: str   # absolute path of the originating markdown file
    tags: str          # JSON-encoded list of tags (string, not parsed)
    title: str         # title of the originating document
    document_id: str   # source_documents.id of the parent document
||||||
|
|
||||||
|
|
||||||
|
def retrieve(
    query: str,
    top_k: int | None = None,
    filter_tags: list[str] | None = None,
) -> list[ChunkResult]:
    """Retrieve the most relevant chunks for a query.

    Embeds *query*, runs a similarity search against the vector store, and
    converts the raw results into ChunkResult objects ordered as returned by
    the store. ``top_k`` falls back to ``settings.context_top_k``; only the
    first entry of ``filter_tags`` is used for filtering.
    """
    top_k = top_k or settings.context_top_k
    start = time.time()

    query_embedding = embed_query(query)
    store = get_vector_store()

    # Build filter
    where = None
    if filter_tags:
        # ChromaDB where filter for tags (stored as JSON string)
        # Simple contains check — works for single-tag filtering
        # NOTE(review): "$contains" is documented for where_document (full-text
        # search), not for metadata `where` filters — confirm the installed
        # ChromaDB version accepts this operator here.
        where = {"tags": {"$contains": filter_tags[0]}}

    results = store.query(
        query_embedding=query_embedding,
        top_k=top_k,
        where=where,
    )

    chunks = []
    # ChromaDB returns per-query nested lists; we sent one query, so index [0].
    if results and results["ids"] and results["ids"][0]:
        for i, chunk_id in enumerate(results["ids"][0]):
            # ChromaDB returns distances (lower = more similar for cosine)
            # Convert to similarity score (1 - distance)
            distance = results["distances"][0][i] if results["distances"] else 0
            score = 1.0 - distance
            meta = results["metadatas"][0][i] if results["metadatas"] else {}
            content = results["documents"][0][i] if results["documents"] else ""

            chunks.append(
                ChunkResult(
                    chunk_id=chunk_id,
                    content=content,
                    score=round(score, 4),
                    heading_path=meta.get("heading_path", ""),
                    source_file=meta.get("source_file", ""),
                    tags=meta.get("tags", "[]"),
                    title=meta.get("title", ""),
                    document_id=meta.get("document_id", ""),
                )
            )

    duration_ms = int((time.time() - start) * 1000)
    log.info(
        "retrieval_done",
        query=query[:100],
        top_k=top_k,
        results_count=len(chunks),
        duration_ms=duration_ms,
    )

    return chunks
||||||
77
src/atocore/retrieval/vector_store.py
Normal file
77
src/atocore/retrieval/vector_store.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
"""ChromaDB vector store wrapper."""
|
||||||
|
|
||||||
|
import chromadb
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
from atocore.observability.logger import get_logger
|
||||||
|
from atocore.retrieval.embeddings import embed_texts
|
||||||
|
|
||||||
|
log = get_logger("vector_store")
|
||||||
|
|
||||||
|
COLLECTION_NAME = "atocore_chunks"
|
||||||
|
|
||||||
|
_store: "VectorStore | None" = None
|
||||||
|
|
||||||
|
|
||||||
|
class VectorStore:
    """Wrapper around ChromaDB for chunk storage and retrieval."""

    def __init__(self) -> None:
        """Open (or create) the persistent collection under ``settings.chroma_path``."""
        settings.chroma_path.mkdir(parents=True, exist_ok=True)
        self._client = chromadb.PersistentClient(path=str(settings.chroma_path))
        # Cosine distance space — the retriever derives similarity as
        # 1 - distance.
        self._collection = self._client.get_or_create_collection(
            name=COLLECTION_NAME,
            metadata={"hnsw:space": "cosine"},
        )
        log.info("vector_store_initialized", path=str(settings.chroma_path))

    def add(
        self,
        ids: list[str],
        documents: list[str],
        metadatas: list[dict],
    ) -> None:
        """Embed *documents* and add them to the collection.

        The three lists are parallel: ids[i] / documents[i] / metadatas[i]
        describe one chunk.
        """
        embeddings = embed_texts(documents)
        self._collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=documents,
            metadatas=metadatas,
        )
        log.debug("vectors_added", count=len(ids))

    def query(
        self,
        query_embedding: list[float],
        top_k: int = 10,
        where: dict | None = None,
    ) -> dict:
        """Return the raw ChromaDB query result for the *top_k* nearest chunks.

        *where* is an optional metadata filter, forwarded as-is only when
        provided.
        """
        kwargs: dict = {
            "query_embeddings": [query_embedding],
            "n_results": top_k,
            "include": ["documents", "metadatas", "distances"],
        }
        if where:
            kwargs["where"] = where

        return self._collection.query(**kwargs)

    def delete(self, ids: list[str]) -> None:
        """Delete chunks by IDs; a no-op for an empty list."""
        if ids:
            self._collection.delete(ids=ids)
            log.debug("vectors_deleted", count=len(ids))

    @property
    def count(self) -> int:
        # Number of vectors currently stored in the collection.
        return self._collection.count()
||||||
|
|
||||||
|
|
||||||
|
def get_vector_store() -> VectorStore:
    """Get or create the singleton vector store.

    The instance is created lazily on first call and cached in the
    module-level ``_store`` global (tests reset ``_store`` to None to force
    re-initialisation against a fresh data directory).
    """
    global _store
    if _store is None:
        _store = VectorStore()
    return _store
||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
114
tests/conftest.py
Normal file
114
tests/conftest.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
"""pytest configuration and shared fixtures."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# Force test data directory
# Set at conftest import time — presumably before any atocore module reads
# these variables into its settings; confirm import ordering if settings ever
# pick up ./data during tests.
os.environ["ATOCORE_DATA_DIR"] = tempfile.mkdtemp(prefix="atocore_test_")
os.environ["ATOCORE_DEBUG"] = "true"
|
|
||||||
|
|
||||||
|
@pytest.fixture
def tmp_data_dir(tmp_path):
    """Provide a temporary data directory for tests."""
    os.environ["ATOCORE_DATA_DIR"] = str(tmp_path)
    # Reset singletons
    # Re-instantiate Settings so the new ATOCORE_DATA_DIR is picked up, and
    # drop the cached VectorStore so the next get_vector_store() call opens a
    # store under the fresh directory.
    from atocore import config
    config.settings = config.Settings()

    import atocore.retrieval.vector_store as vs
    vs._store = None

    return tmp_path
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_markdown(tmp_path) -> Path:
    """Create a sample markdown file for testing.

    The file carries YAML frontmatter (tags, date), an H1 title, and several
    H2 sections — enough structure to exercise the parser, chunker, and
    retrieval tests.
    """
    md_file = tmp_path / "test_note.md"
    md_file.write_text(
        """---
tags:
  - atocore
  - architecture
date: 2026-04-05
---
# AtoCore Architecture

## Overview

AtoCore is a personal context engine that enriches LLM interactions
with durable memory, structured context, and project knowledge.

## Layers

The system has these layers:

1. Main PKM (human, messy, exploratory)
2. AtoVault (system mirror)
3. AtoDrive (trusted project truth)
4. Structured Memory (DB)
5. Semantic Retrieval (vector DB)

## Memory Types

AtoCore supports these memory types:

- Identity
- Preferences
- Project Memory
- Episodic Memory
- Knowledge Objects
- Adaptation Memory
- Trusted Project State

## Trust Precedence

When sources conflict:

1. Trusted Project State wins
2. AtoDrive overrides PKM
3. Most recent confirmed wins
4. Higher confidence wins
5. Equal → flag conflict

No silent merging.
""",
        encoding="utf-8",
    )
    return md_file
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_folder(tmp_path, sample_markdown) -> Path:
    """Create a folder with multiple markdown files.

    Returns *tmp_path*, which holds both ``test_note.md`` (from the
    sample_markdown fixture) and a second note written here.
    """
    # Already has test_note.md from sample_markdown
    second = tmp_path / "second_note.md"
    second.write_text(
        """---
tags:
  - chunking
---
# Chunking Strategy

## Approach

Heading-aware recursive splitting:

1. Split on H2 boundaries first
2. If section > 800 chars, split on H3
3. If still > 800 chars, split on paragraphs
4. Hard split at 800 chars with 100 char overlap

## Parameters

- max_chunk_size: 800 characters
- overlap: 100 characters
- min_chunk_size: 50 characters
""",
        encoding="utf-8",
    )
    return tmp_path
||||||
73
tests/test_chunker.py
Normal file
73
tests/test_chunker.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
"""Tests for the markdown chunker."""
|
||||||
|
|
||||||
|
from atocore.ingestion.chunker import chunk_markdown
|
||||||
|
|
||||||
|
|
||||||
|
def test_basic_chunking():
    """Test that markdown is split into chunks."""
    body = """## Section One

This is the first section with some content that is long enough to pass the minimum chunk size filter applied by the chunker.

## Section Two

This is the second section with different content that is also long enough to pass the minimum chunk size threshold.
"""
    chunks = chunk_markdown(body)
    # Two H2 sections → at least two chunks with sane bookkeeping fields.
    assert len(chunks) >= 2
    assert all(c.char_count > 0 for c in chunks)
    assert all(c.chunk_index >= 0 for c in chunks)


def test_heading_path_preserved():
    """Test that heading paths are captured."""
    body = """## Architecture

### Layers

The system has multiple layers organized in a clear hierarchy for separation of concerns and maintainability.
"""
    chunks = chunk_markdown(body)
    assert len(chunks) >= 1
    # At least one chunk should have heading info
    has_heading = any(c.heading_path for c in chunks)
    assert has_heading


def test_small_chunks_filtered():
    """Test that very small chunks are discarded."""
    body = """## A

Hi

## B

This is a real section with enough content to pass the minimum size threshold.
"""
    chunks = chunk_markdown(body, min_size=50)
    # "Hi" should be filtered out
    for c in chunks:
        assert c.char_count >= 50


def test_large_section_split():
    """Test that large sections are split further."""
    large_content = "Word " * 200  # ~1000 chars
    body = f"## Big Section\n\n{large_content}"
    chunks = chunk_markdown(body, max_size=400)
    # A single ~1000-char section must be split under a 400-char cap.
    assert len(chunks) >= 2


def test_metadata_passed_through():
    """Test that base metadata is included in chunks."""
    body = "## Test\n\nSome content here that is long enough."
    meta = {"source_file": "/test/file.md", "tags": ["test"]}
    chunks = chunk_markdown(body, base_metadata=meta)
    if chunks:
        assert chunks[0].metadata.get("source_file") == "/test/file.md"


def test_empty_body():
    """Test chunking an empty body."""
    chunks = chunk_markdown("")
    assert chunks == []
||||||
60
tests/test_context_builder.py
Normal file
60
tests/test_context_builder.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
"""Tests for the context builder."""
|
||||||
|
|
||||||
|
from atocore.context.builder import build_context, get_last_context_pack
|
||||||
|
from atocore.ingestion.pipeline import ingest_file
|
||||||
|
from atocore.models.database import init_db
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_context_returns_pack(tmp_data_dir, sample_markdown):
    """Test that context builder returns a valid pack."""
    init_db()
    ingest_file(sample_markdown)

    pack = build_context("What is AtoCore?")
    assert pack.total_chars > 0
    assert len(pack.chunks_used) > 0
    assert pack.budget_remaining >= 0
    # The formatted context is wrapped in sentinel markers.
    assert "--- AtoCore Context ---" in pack.formatted_context
    assert "--- End Context ---" in pack.formatted_context


def test_context_respects_budget(tmp_data_dir, sample_markdown):
    """Test that context builder respects character budget."""
    init_db()
    ingest_file(sample_markdown)

    pack = build_context("What is AtoCore?", budget=500)
    assert pack.total_chars <= 500


def test_context_with_project_hint(tmp_data_dir, sample_markdown):
    """Test that project hint boosts relevant chunks."""
    init_db()
    ingest_file(sample_markdown)

    pack = build_context("What is the architecture?", project_hint="atocore")
    assert len(pack.chunks_used) > 0
    # With project hint, we should still get results
    assert pack.total_chars > 0


def test_last_context_pack_stored(tmp_data_dir, sample_markdown):
    """Test that last context pack is stored for debug."""
    init_db()
    ingest_file(sample_markdown)

    build_context("test prompt")
    last = get_last_context_pack()
    assert last is not None
    assert last.query == "test prompt"


def test_full_prompt_structure(tmp_data_dir, sample_markdown):
    """Test that the full prompt has correct structure."""
    init_db()
    ingest_file(sample_markdown)

    pack = build_context("What are memory types?")
    # Full prompt = instruction preamble + context block + the user's query.
    assert "knowledge base" in pack.full_prompt.lower()
    assert "--- AtoCore Context ---" in pack.full_prompt
    assert "What are memory types?" in pack.full_prompt
||||||
71
tests/test_ingestion.py
Normal file
71
tests/test_ingestion.py
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
"""Tests for the ingestion pipeline."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from atocore.ingestion.parser import parse_markdown
|
||||||
|
from atocore.models.database import get_connection, init_db
|
||||||
|
from atocore.ingestion.pipeline import ingest_file
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_markdown(sample_markdown):
    """Test markdown parsing with frontmatter."""
    parsed = parse_markdown(sample_markdown)
    # Title comes from the H1; tags from the YAML frontmatter.
    assert parsed.title == "AtoCore Architecture"
    assert "atocore" in parsed.tags
    assert "architecture" in parsed.tags
    assert len(parsed.body) > 0
    assert len(parsed.headings) > 0


def test_parse_extracts_headings(sample_markdown):
    """Test that headings are extracted correctly."""
    parsed = parse_markdown(sample_markdown)
    # headings are (level, text) tuples — compare on the text component.
    heading_texts = [h[1] for h in parsed.headings]
    assert "AtoCore Architecture" in heading_texts
    assert "Overview" in heading_texts


def test_ingest_file(tmp_data_dir, sample_markdown):
    """Test ingesting a single file."""
    init_db()
    result = ingest_file(sample_markdown)
    assert result["status"] == "ingested"
    assert result["chunks"] > 0

    # Verify the file was stored in DB
    with get_connection() as conn:
        doc = conn.execute(
            "SELECT COUNT(*) as c FROM source_documents WHERE file_path = ?",
            (str(sample_markdown.resolve()),),
        ).fetchone()
        assert doc["c"] == 1

        chunks = conn.execute(
            "SELECT COUNT(*) as c FROM source_chunks sc "
            "JOIN source_documents sd ON sc.document_id = sd.id "
            "WHERE sd.file_path = ?",
            (str(sample_markdown.resolve()),),
        ).fetchone()
        assert chunks["c"] > 0


def test_ingest_skips_unchanged(tmp_data_dir, sample_markdown):
    """Test that re-ingesting unchanged file is skipped."""
    init_db()
    ingest_file(sample_markdown)
    result = ingest_file(sample_markdown)
    assert result["status"] == "skipped"


def test_ingest_updates_changed(tmp_data_dir, sample_markdown):
    """Test that changed files are re-ingested."""
    init_db()
    ingest_file(sample_markdown)

    # Modify the file
    sample_markdown.write_text(
        sample_markdown.read_text(encoding="utf-8") + "\n\n## New Section\n\nNew content added.",
        encoding="utf-8",
    )
    result = ingest_file(sample_markdown)
    assert result["status"] == "ingested"
||||||
40
tests/test_prompts/gigabit_prompts.yaml
Normal file
40
tests/test_prompts/gigabit_prompts.yaml
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
prompts:
  - id: g1
    prompt: "What is the GigaBIT M1 project about?"
    project: gigabit
    expected: "Should mention 1.2m primary mirror, StarSpec, telescope"

  - id: g2
    prompt: "What are the main requirements for the M1 mirror?"
    project: gigabit
    expected: "Should mention optical/mechanical requirements, SOW, diameter, Zerodur"

  - id: g3
    prompt: "What vendors are involved in the project?"
    project: gigabit
    expected: "Should mention Optiques Fullum, StarSpec, Atomaste, or subcontractors"

  - id: g4
    prompt: "What is the status of the CDR?"
    project: gigabit
    expected: "Should mention Critical Design Review status, CBUSH, design completion"

  - id: g5
    prompt: "What are the key design decisions made so far?"
    project: gigabit
    expected: "Should mention design phases, PDR, assumptions, blank order"

  - id: g6
    prompt: "What FEA optimization work has been done?"
    project: gigabit
    expected: "Should mention FEA analysis, optimization approach, WFE, displacement data"

  - id: g7
    prompt: "What is the cost reduction strategy?"
    project: gigabit
    expected: "Should mention cost reduction campaign, trade-off, topology selection"

  - id: g8
    prompt: "What are the mirror blank specifications?"
    project: gigabit
    expected: "Should mention 1200mm diameter, Zerodur, optical specifications"
|
||||||
40
tests/test_prompts/prompts.yaml
Normal file
40
tests/test_prompts/prompts.yaml
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
prompts:
  - id: p1
    prompt: "What is AtoCore's architecture?"
    project: atocore
    expected: "Should mention layered architecture, SQLite, vector DB"

  - id: p2
    prompt: "What chunking strategy does AtoCore use?"
    project: atocore
    expected: "Should mention heading-aware splitting, 800 char max"

  - id: p3
    prompt: "What is the trust precedence order?"
    project: atocore
    expected: "Should list: Trusted Project State > AtoDrive > validated memory"

  - id: p4
    prompt: "How does AtoCore handle conflicts between sources?"
    project: atocore
    expected: "Should mention conflict resolution rules, no silent merging"

  - id: p5
    prompt: "What are the different memory types?"
    project: atocore
    expected: "Should list: Identity, Preferences, Project, Episodic, Knowledge, Adaptation, Trusted Project State"

  - id: p6
    prompt: "What is the context budget allocation?"
    project: atocore
    expected: "Should mention percentages: identity 5%, preferences 5%, project 20%, episodic 10%, retrieval 60%"

  - id: p7
    prompt: "What is a trivial prompt in AtoCore?"
    project: atocore
    expected: "Should mention: no project ref, no proper nouns, no past context dependency"

  - id: p8
    prompt: "What are the success criteria for the first win?"
    project: atocore
    expected: "Should mention: saves >=5 min lookup, >=80-90% accuracy, >=10 test prompts"
|
||||||
41
tests/test_retrieval.py
Normal file
41
tests/test_retrieval.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
"""Tests for the retrieval system."""
|
||||||
|
|
||||||
|
from atocore.ingestion.pipeline import ingest_file
|
||||||
|
from atocore.models.database import init_db
|
||||||
|
from atocore.retrieval.retriever import retrieve
|
||||||
|
from atocore.retrieval.vector_store import get_vector_store
|
||||||
|
|
||||||
|
|
||||||
|
def test_retrieve_returns_results(tmp_data_dir, sample_markdown):
    """Retrieval over an ingested document returns scored, non-empty chunks."""
    init_db()
    ingest_file(sample_markdown)

    hits = retrieve("What are the memory types?", top_k=5)

    assert len(hits) > 0
    for hit in hits:
        assert hit.score > 0
        assert hit.content
|
||||||
|
|
||||||
|
|
||||||
|
def test_retrieve_scores_ranked(tmp_data_dir, sample_markdown):
    """When more than one chunk comes back, scores arrive in descending order."""
    init_db()
    ingest_file(sample_markdown)

    hits = retrieve("architecture layers", top_k=5)

    # Ordering is only observable with two or more results; a single hit
    # (small fixture) is trivially ordered, so there is nothing to check.
    if len(hits) >= 2:
        observed = [hit.score for hit in hits]
        assert observed == sorted(observed, reverse=True)
|
||||||
|
|
||||||
|
|
||||||
|
def test_vector_store_count(tmp_data_dir, sample_markdown):
    """After ingestion the vector store reports a positive chunk count."""
    init_db()

    # The store is a module-level singleton; reset it so this test starts
    # from a clean instance instead of one cached by an earlier test.
    import atocore.retrieval.vector_store as vs

    vs._store = None

    ingest_file(sample_markdown)

    store = get_vector_store()
    assert store.count > 0
|
||||||
Reference in New Issue
Block a user