feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)

Complete implementation of the personal context engine foundation:

- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:21:27 -04:00
parent 32ce409a7b
commit b4afbbb53a
34 changed files with 1756 additions and 0 deletions
--- a/src/atocore/ingestion/parser.py
+++ b/src/atocore/ingestion/parser.py
@@ -0,0 +1,65 @@
+"""Markdown file parsing with frontmatter extraction."""
+
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+
+import frontmatter
+
+
+@dataclass
+class ParsedDocument:
+    """Result of parsing one markdown file, as assembled by ``parse_markdown``."""
+
+    # Resolved filesystem path of the source file, stored as a string.
+    file_path: str
+    # Text of the first H1 in the body, or a title derived from the filename.
+    title: str
+    # Markdown content after frontmatter removal, with outer whitespace stripped.
+    body: str
+    # Tags taken from the frontmatter "tags" entry (empty when there are none).
+    tags: list[str] = field(default_factory=list)
+    # Frontmatter metadata copied into a plain dict (empty when absent).
+    frontmatter: dict = field(default_factory=dict)
+    # (level, text) pairs for the headings found in the body, in document order.
+    headings: list[tuple[int, str]] = field(default_factory=list)
+
+
+def parse_markdown(file_path: Path) -> ParsedDocument:
+    """Parse a markdown file, extracting frontmatter and structure.
+
+    Args:
+        file_path: Path of the markdown file to read.
+
+    Returns:
+        A ParsedDocument holding the resolved path, the title (first H1 or a
+        filename-derived fallback), the stripped body without frontmatter,
+        the tags normalized to a list of non-empty strings, the frontmatter
+        metadata as a dict, and the heading structure.
+
+    Raises:
+        OSError: If the file cannot be read.
+        UnicodeDecodeError: If the file content is not valid UTF-8.
+
+    Errors raised by the frontmatter library while parsing the metadata
+    block are not caught here and propagate to the caller.
+    """
+    # "utf-8-sig" decodes plain UTF-8 as well, but drops a leading byte-order
+    # mark. A BOM left in the text would sit in front of the opening "---"
+    # delimiter and can keep the frontmatter block from being recognized.
+    text = file_path.read_text(encoding="utf-8-sig")
+    post = frontmatter.loads(text)
+
+    meta = dict(post.metadata) if post.metadata else {}
+    body = post.content.strip()
+
+    # Extract title: first H1, or filename
+    title = _extract_title(body, file_path)
+
+    # Extract tags from frontmatter and normalize them to list[str]:
+    #   - missing / null        -> []
+    #   - "a, b" (single string) -> ["a", "b"]
+    #   - list / tuple           -> each non-null item stringified
+    #   - any other scalar       -> one-element list
+    raw_tags = meta.get("tags")
+    if raw_tags is None:
+        candidates = []
+    elif isinstance(raw_tags, str):
+        candidates = raw_tags.split(",")
+    elif isinstance(raw_tags, (list, tuple)):
+        candidates = [item for item in raw_tags if item is not None]
+    else:
+        candidates = [raw_tags]
+    # Strip whitespace and drop entries that end up empty.
+    tags = [tag for tag in (str(item).strip() for item in candidates) if tag]
+
+    # Extract heading structure
+    headings = _extract_headings(body)
+
+    return ParsedDocument(
+        file_path=str(file_path.resolve()),
+        title=title,
+        body=body,
+        tags=tags,
+        frontmatter=meta,
+        headings=headings,
+    )
+
+
+def _extract_title(body: str, file_path: Path) -> str:
+    """Get title from first H1 or fallback to filename.
+
+    Args:
+        body: Markdown text with the frontmatter already removed.
+        file_path: Source path; its stem supplies the fallback title.
+
+    Returns:
+        The stripped text of the first ``# `` heading that lies outside a
+        fenced code block. When there is no such heading, the filename stem
+        with ``_`` and ``-`` replaced by spaces and title-cased.
+    """
+    open_fence = ""  # marker of the code fence we are inside; "" when outside
+    for line in body.split("\n"):
+        fence = re.match(r" {0,3}(`{3,}|~{3,})(.*)$", line)
+        if fence:
+            marker, rest = fence.groups()
+            if not open_fence:
+                # A backtick run followed by more backticks on the same line
+                # is inline code, not an opening fence.
+                if marker[0] != "`" or "`" not in rest:
+                    open_fence = marker
+            elif (
+                marker[0] == open_fence[0]
+                and len(marker) >= len(open_fence)
+                and not rest.strip()
+            ):
+                # Closing fence: same character, at least as long, no info string.
+                open_fence = ""
+            continue
+        if open_fence:
+            # "# ..." inside a code block is a comment, not a heading.
+            continue
+        # [ \t]+ rather than \s+: \s also matches a newline, which let a bare
+        # "#" line capture the following line as the title. \S requires the
+        # heading to have visible text, so "#   " falls through to the filename.
+        heading = re.match(r"#[ \t]+(\S.*)$", line)
+        if heading:
+            return heading.group(1).strip()
+    return file_path.stem.replace("_", " ").replace("-", " ").title()
+
+
+def _extract_headings(body: str) -> list[tuple[int, str]]:
+    """Extract H1-H4 headings with their level.
+
+    Args:
+        body: Markdown text with the frontmatter already removed.
+
+    Returns:
+        ``(level, text)`` tuples in document order, where ``level`` is the
+        number of leading ``#`` characters (1-4) and ``text`` is the stripped
+        heading text. Lines with five or more ``#`` and lines inside fenced
+        code blocks are not reported.
+    """
+    headings: list[tuple[int, str]] = []
+    open_fence = ""  # marker of the code fence we are inside; "" when outside
+    for line in body.split("\n"):
+        fence = re.match(r" {0,3}(`{3,}|~{3,})(.*)$", line)
+        if fence:
+            marker, rest = fence.groups()
+            if not open_fence:
+                # A backtick run followed by more backticks on the same line
+                # is inline code, not an opening fence.
+                if marker[0] != "`" or "`" not in rest:
+                    open_fence = marker
+            elif (
+                marker[0] == open_fence[0]
+                and len(marker) >= len(open_fence)
+                and not rest.strip()
+            ):
+                # Closing fence: same character, at least as long, no info string.
+                open_fence = ""
+            continue
+        if open_fence:
+            # "# ..." inside a code block is a comment, not a heading.
+            continue
+        # [ \t]+ rather than \s+: \s also matches a newline, which let a bare
+        # "#" line swallow the following line as its heading text.
+        heading = re.match(r"(#{1,4})[ \t]+(\S.*)$", line)
+        if heading:
+            headings.append((len(heading.group(1)), heading.group(2).strip()))
+    return headings