feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
146
src/atocore/ingestion/chunker.py
Normal file
146
src/atocore/ingestion/chunker.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""Heading-aware recursive markdown chunking."""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from atocore.config import settings
|
||||
|
||||
|
||||
@dataclass
class Chunk:
    """One piece of a chunked markdown document, as produced by chunk_markdown."""

    content: str  # the chunk text (whitespace-stripped by chunk_markdown)
    chunk_index: int  # 0-based position among the chunks *kept* for this document
    heading_path: str  # e.g. "H2 heading > H3 heading"; "" when no heading applies
    char_count: int  # len(content), precomputed by the chunker
    metadata: dict = field(default_factory=dict)  # shallow copy of the caller's base_metadata
|
||||
|
||||
|
||||
def chunk_markdown(
    body: str,
    base_metadata: dict | None = None,
    max_size: int | None = None,
    overlap: int | None = None,
    min_size: int | None = None,
) -> list[Chunk]:
    """Split markdown body into chunks using heading-aware strategy.

    1. Split on H2 boundaries
    2. If section > max_size, split on H3
    3. If still > max_size, split on paragraph breaks
    4. If still > max_size, hard split with overlap

    Any limit left as None falls back to the configured default.
    """
    max_size = max_size or settings.chunk_max_size
    overlap = overlap or settings.chunk_overlap
    min_size = min_size or settings.chunk_min_size
    base_metadata = base_metadata or {}

    # First pass: collect (heading_path, raw_text) pairs, descending one
    # heading level at a time until each piece fits within max_size.
    pieces: list[tuple[str, str]] = []
    for h2, section in _split_by_heading(body, level=2):
        if len(section) <= max_size:
            pieces.append((h2, section))
            continue
        # H2 section too large: try H3 subsections.
        for h3, subsection in _split_by_heading(section, level=3):
            if h2 and h3:
                path = f"{h2} > {h3}"
            else:
                path = h2 or h3
            if len(subsection) <= max_size:
                pieces.append((path, subsection))
            else:
                # Still too large: paragraph (and, inside it, hard) splitting.
                pieces.extend(
                    (path, part)
                    for part in _split_by_paragraphs(subsection, max_size, overlap)
                )

    # Second pass: strip, drop fragments below min_size, number the survivors.
    result: list[Chunk] = []
    for path, text in pieces:
        text = text.strip()
        if len(text) < min_size:
            continue
        result.append(
            Chunk(
                content=text,
                chunk_index=len(result),  # index counts only the kept chunks
                heading_path=path,
                char_count=len(text),
                metadata={**base_metadata},  # shallow copy; chunks don't share the dict
            )
        )
    return result
|
||||
|
||||
|
||||
def _split_by_heading(text: str, level: int) -> list[tuple[str, str]]:
|
||||
"""Split text by heading level. Returns (heading_text, section_content) pairs."""
|
||||
pattern = rf"^({'#' * level})\s+(.+)$"
|
||||
parts: list[tuple[str, str]] = []
|
||||
current_heading = ""
|
||||
current_lines: list[str] = []
|
||||
|
||||
for line in text.split("\n"):
|
||||
match = re.match(pattern, line)
|
||||
if match:
|
||||
# Save previous section
|
||||
if current_lines:
|
||||
parts.append((current_heading, "\n".join(current_lines)))
|
||||
current_heading = match.group(2).strip()
|
||||
current_lines = []
|
||||
else:
|
||||
current_lines.append(line)
|
||||
|
||||
# Save last section
|
||||
if current_lines:
|
||||
parts.append((current_heading, "\n".join(current_lines)))
|
||||
|
||||
return parts
|
||||
|
||||
|
||||
def _split_by_paragraphs(
|
||||
text: str, max_size: int, overlap: int
|
||||
) -> list[str]:
|
||||
"""Split text by paragraph breaks, then hard-split if needed."""
|
||||
paragraphs = re.split(r"\n\n+", text)
|
||||
chunks: list[str] = []
|
||||
current = ""
|
||||
|
||||
for para in paragraphs:
|
||||
para = para.strip()
|
||||
if not para:
|
||||
continue
|
||||
|
||||
if len(current) + len(para) + 2 <= max_size:
|
||||
current = f"{current}\n\n{para}" if current else para
|
||||
else:
|
||||
if current:
|
||||
chunks.append(current)
|
||||
# If single paragraph exceeds max, hard split
|
||||
if len(para) > max_size:
|
||||
chunks.extend(_hard_split(para, max_size, overlap))
|
||||
else:
|
||||
current = para
|
||||
continue
|
||||
current = ""
|
||||
|
||||
if current:
|
||||
chunks.append(current)
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
def _hard_split(text: str, max_size: int, overlap: int) -> list[str]:
|
||||
"""Hard split text at max_size with overlap."""
|
||||
chunks = []
|
||||
start = 0
|
||||
while start < len(text):
|
||||
end = start + max_size
|
||||
chunks.append(text[start:end])
|
||||
start = end - overlap
|
||||
return chunks
|
||||
Reference in New Issue
Block a user