src/atocore/ingestion/parser.py

"""Markdown file parsing with frontmatter extraction."""

import re
from dataclasses import dataclass, field
from pathlib import Path

import frontmatter


@dataclass
class ParsedDocument:
    """Structured result of parsing a single markdown file.

    Only ``file_path``, ``title`` and ``body`` are required; the remaining
    fields default to fresh empty containers for each instance.
    """

    # Path of the source file, stored as a string.
    file_path: str
    # Document title.
    title: str
    # Markdown text of the document.
    body: str
    # Tag names associated with the document.
    tags: list[str] = field(default_factory=list)
    # Metadata mapping taken from the document's frontmatter.
    frontmatter: dict = field(default_factory=dict)
    # Headings as (level, text) pairs.
    headings: list[tuple[int, str]] = field(default_factory=list)


def parse_markdown(file_path: Path, text: str | None = None) -> ParsedDocument:
    """Parse a markdown file, extracting frontmatter and structure.

    Args:
        file_path: Location of the markdown file. It is read as UTF-8 only
            when ``text`` is not supplied; it is also passed to the title
            extraction and its resolved form is stored on the result.
        text: Optional already-loaded file contents. An empty string is
            honoured as-is rather than triggering a read from disk.

    Returns:
        A ``ParsedDocument`` holding the resolved path, the title, the
        stripped body (frontmatter removed), the normalised tags, a copy of
        the frontmatter metadata and the extracted headings.

    Errors raised while reading the file or parsing the frontmatter are not
    caught here and propagate to the caller.
    """
    raw_text = text if text is not None else file_path.read_text(encoding="utf-8")
    post = frontmatter.loads(raw_text)

    meta = dict(post.metadata) if post.metadata else {}
    body = post.content.strip()

    # Normalise the frontmatter ``tags`` entry into a fresh list of clean
    # strings. YAML lets authors write a comma-separated string, a list
    # (whose items may be ints, dates or nulls), or some other scalar; only
    # the first two shapes carry usable tags, and anything else would break
    # the ``list[str]`` contract of ``ParsedDocument.tags``.
    raw_tags = meta.get("tags")
    if isinstance(raw_tags, str):
        candidates = raw_tags.split(",")
    elif isinstance(raw_tags, (list, tuple)):
        candidates = [str(item) for item in raw_tags if item is not None]
    else:
        candidates = []
    # Building a new list also keeps ``tags`` from aliasing ``meta["tags"]``.
    tags = [tag.strip() for tag in candidates if tag.strip()]

    return ParsedDocument(
        file_path=str(file_path.resolve()),
        # Title: first H1 in the body, or a name derived from the filename.
        title=_extract_title(body, file_path),
        body=body,
        tags=tags,
        frontmatter=meta,
        headings=_extract_headings(body),
    )


def _extract_title(body: str, file_path: Path) -> str:
    """Get title from first H1 or fallback to filename."""
    match = re.search(r"^#\s+(.+)$", body, re.MULTILINE)
    if match:
        return match.group(1).strip()
    return file_path.stem.replace("_", " ").replace("-", " ").title()


def _extract_headings(body: str) -> list[tuple[int, str]]:
    """Extract all headings with their level."""
    headings = []
    for match in re.finditer(r"^(#{1,4})\s+(.+)$", body, re.MULTILINE):
        level = len(match.group(1))
        text = match.group(2).strip()
        headings.append((level, text))
    return headings
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC) Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-05 09:21:27 -04:00			`"""Markdown file parsing with frontmatter extraction."""`

			`import re`
			`from dataclasses import dataclass, field`
			`from pathlib import Path`

			`import frontmatter`


			`@dataclass`
			`class ParsedDocument:`
			`file_path: str`
			`title: str`
			`body: str`
			`tags: list[str] = field(default_factory=list)`
			`frontmatter: dict = field(default_factory=dict)`
			`headings: list[tuple[int, str]] = field(default_factory=list)`


Stabilize core correctness and sync project plan state 2026-04-05 17:53:23 -04:00			`def parse_markdown(file_path: Path, text: str | None = None) -> ParsedDocument:`
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC) Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-05 09:21:27 -04:00			`"""Parse a markdown file, extracting frontmatter and structure."""`
Stabilize core correctness and sync project plan state 2026-04-05 17:53:23 -04:00			`raw_text = text if text is not None else file_path.read_text(encoding="utf-8")`
			`post = frontmatter.loads(raw_text)`
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC) Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-05 09:21:27 -04:00
			`meta = dict(post.metadata) if post.metadata else {}`
			`body = post.content.strip()`

			`# Extract title: first H1, or filename`
			`title = _extract_title(body, file_path)`

			`# Extract tags from frontmatter`
			`tags = meta.get("tags", [])`
			`if isinstance(tags, str):`
			`tags = [t.strip() for t in tags.split(",") if t.strip()]`
			`tags = tags or []`

			`# Extract heading structure`
			`headings = _extract_headings(body)`

			`return ParsedDocument(`
			`file_path=str(file_path.resolve()),`
			`title=title,`
			`body=body,`
			`tags=tags,`
			`frontmatter=meta,`
			`headings=headings,`
			`)`


			`def _extract_title(body: str, file_path: Path) -> str:`
			`"""Get title from first H1 or fallback to filename."""`
			`match = re.search(r"^#\s+(.+)$", body, re.MULTILINE)`
			`if match:`
			`return match.group(1).strip()`
			`return file_path.stem.replace("_", " ").replace("-", " ").title()`


			`def _extract_headings(body: str) -> list[tuple[int, str]]:`
			`"""Extract all headings with their level."""`
			`headings = []`
			`for match in re.finditer(r"^(#{1,4})\s+(.+)$", body, re.MULTILINE):`
			`level = len(match.group(1))`
			`text = match.group(2).strip()`
			`headings.append((level, text))`
			`return headings`