src/atocore/ingestion/chunker.py

"""Heading-aware recursive markdown chunking."""

import re
from dataclasses import dataclass, field

import atocore.config as _config


@dataclass
class Chunk:
    """A single chunk of markdown text together with its position and context."""

    # Chunk text; chunk_markdown stores it with surrounding whitespace stripped.
    content: str
    # Zero-based position among the chunks that were kept for one body.
    chunk_index: int
    # Heading context: "" (no heading), "H2", "H3" or "H2 > H3".
    heading_path: str
    # Length of ``content`` in characters.
    char_count: int
    # Free-form metadata; chunk_markdown fills it with a shallow copy of the
    # caller's base_metadata.
    metadata: dict = field(default_factory=dict)


def chunk_markdown(
    body: str,
    base_metadata: dict | None = None,
    max_size: int | None = None,
    overlap: int | None = None,
    min_size: int | None = None,
) -> list[Chunk]:
    """Split a markdown body into chunks using a heading-aware strategy.

    1. Split on H2 boundaries.
    2. If a section is longer than ``max_size``, split it on H3.
    3. If a subsection is still longer than ``max_size``, split it on
       paragraph breaks.
    4. Paragraphs that are still too long are hard-split with ``overlap``
       (done inside ``_split_by_paragraphs``).

    Args:
        body: Markdown text to chunk.
        base_metadata: Metadata shallow-copied into every chunk; ``None``
            means no metadata.
        max_size: Maximum chunk length in characters. ``None`` (or ``0``,
            which is not a usable size) falls back to
            ``settings.chunk_max_size``.
        overlap: Overlap in characters used by the hard split. ``None``
            falls back to ``settings.chunk_overlap``; an explicit ``0`` is
            honoured and disables overlap.
        min_size: Chunks shorter than this (after stripping) are dropped.
            ``None`` falls back to ``settings.chunk_min_size``; an explicit
            ``0`` is honoured and keeps every non-empty chunk.

    Returns:
        The kept chunks in document order, with consecutive ``chunk_index``
        values starting at 0.
    """
    # Settings are read lazily, at call time, and only for missing arguments.
    max_size = max_size or _config.settings.chunk_max_size
    # ``is None`` rather than ``or``: 0 is a legitimate explicit value here and
    # must not be silently replaced by the configured default.
    if overlap is None:
        overlap = _config.settings.chunk_overlap
    if min_size is None:
        min_size = _config.settings.chunk_min_size
    base_metadata = base_metadata or {}

    raw_chunks: list[tuple[str, str]] = []  # (heading_path, content)

    for heading, content in _split_by_heading(body, level=2):
        if len(content) <= max_size:
            raw_chunks.append((heading, content))
            continue

        # Section too large: try splitting on H3.
        for sub_heading, sub_content in _split_by_heading(content, level=3):
            if heading and sub_heading:
                full_path = f"{heading} > {sub_heading}"
            else:
                full_path = heading or sub_heading

            if len(sub_content) <= max_size:
                raw_chunks.append((full_path, sub_content))
            else:
                # Still too large: fall back to paragraph / hard splitting.
                raw_chunks.extend(
                    (full_path, piece)
                    for piece in _split_by_paragraphs(
                        sub_content, max_size, overlap
                    )
                )

    # Build final chunks, filtering out empty and too-small ones. The index is
    # only advanced for kept chunks so the sequence has no gaps.
    chunks: list[Chunk] = []
    for heading_path, content in raw_chunks:
        content = content.strip()
        # The emptiness check matters when min_size is 0.
        if not content or len(content) < min_size:
            continue
        chunks.append(
            Chunk(
                content=content,
                chunk_index=len(chunks),
                heading_path=heading_path,
                char_count=len(content),
                metadata={**base_metadata},
            )
        )

    return chunks


def _split_by_heading(text: str, level: int) -> list[tuple[str, str]]:
    """Split text by heading level. Returns (heading_text, section_content) pairs."""
    pattern = rf"^({'#' * level})\s+(.+)$"
    parts: list[tuple[str, str]] = []
    current_heading = ""
    current_lines: list[str] = []

    for line in text.split("\n"):
        match = re.match(pattern, line)
        if match:
            # Save previous section
            if current_lines:
                parts.append((current_heading, "\n".join(current_lines)))
            current_heading = match.group(2).strip()
            current_lines = []
        else:
            current_lines.append(line)

    # Save last section
    if current_lines:
        parts.append((current_heading, "\n".join(current_lines)))

    return parts


def _split_by_paragraphs(
    text: str, max_size: int, overlap: int
) -> list[str]:
    """Split text by paragraph breaks, then hard-split if needed."""
    paragraphs = re.split(r"\n\n+", text)
    chunks: list[str] = []
    current = ""

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        if len(current) + len(para) + 2 <= max_size:
            current = f"{current}\n\n{para}" if current else para
        else:
            if current:
                chunks.append(current)
            # If single paragraph exceeds max, hard split
            if len(para) > max_size:
                chunks.extend(_hard_split(para, max_size, overlap))
            else:
                current = para
                continue
            current = ""

    if current:
        chunks.append(current)

    return chunks


def _hard_split(text: str, max_size: int, overlap: int) -> list[str]:
    """Hard split text at max_size with overlap."""
    # Prevent infinite loop: overlap must be less than max_size
    if overlap >= max_size:
        overlap = max_size // 4

    chunks = []
    start = 0
    while start < len(text):
        end = start + max_size
        chunks.append(text[start:end])
        start = end - overlap
    return chunks
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC) Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-05 09:21:27 -04:00			`"""Heading-aware recursive markdown chunking."""`

			`import re`
			`from dataclasses import dataclass, field`

Stabilize core correctness and sync project plan state 2026-04-05 17:53:23 -04:00			`import atocore.config as _config`
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC) Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-05 09:21:27 -04:00

			`@dataclass`
			`class Chunk:`
			`content: str`
			`chunk_index: int`
			`heading_path: str`
			`char_count: int`
			`metadata: dict = field(default_factory=dict)`


			`def chunk_markdown(`
			`body: str,`
			`base_metadata: dict \| None = None,`
			`max_size: int \| None = None,`
			`overlap: int \| None = None,`
			`min_size: int \| None = None,`
			`) -> list[Chunk]:`
			`"""Split markdown body into chunks using heading-aware strategy.`

			`1. Split on H2 boundaries`
			`2. If section > max_size, split on H3`
			`3. If still > max_size, split on paragraph breaks`
			`4. If still > max_size, hard split with overlap`
			`"""`
Stabilize core correctness and sync project plan state 2026-04-05 17:53:23 -04:00			`max_size = max_size or _config.settings.chunk_max_size`
			`overlap = overlap or _config.settings.chunk_overlap`
			`min_size = min_size or _config.settings.chunk_min_size`
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC) Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-05 09:21:27 -04:00			`base_metadata = base_metadata or {}`

			`sections = _split_by_heading(body, level=2)`
			`raw_chunks: list[tuple[str, str]] = [] # (heading_path, content)`

			`for heading, content in sections:`
			`if len(content) <= max_size:`
			`raw_chunks.append((heading, content))`
			`else:`
			`# Try splitting on H3`
			`subsections = _split_by_heading(content, level=3)`
			`for sub_heading, sub_content in subsections:`
			`full_path = (`
			`f"{heading} > {sub_heading}" if heading and sub_heading else heading or sub_heading`
			`)`
			`if len(sub_content) <= max_size:`
			`raw_chunks.append((full_path, sub_content))`
			`else:`
			`# Split on paragraphs`
			`para_chunks = _split_by_paragraphs(`
			`sub_content, max_size, overlap`
			`)`
			`for pc in para_chunks:`
			`raw_chunks.append((full_path, pc))`

			`# Build final chunks, filtering out too-small ones`
			`chunks = []`
			`idx = 0`
			`for heading_path, content in raw_chunks:`
			`content = content.strip()`
			`if len(content) < min_size:`
			`continue`
			`chunks.append(`
			`Chunk(`
			`content=content,`
			`chunk_index=idx,`
			`heading_path=heading_path,`
			`char_count=len(content),`
			`metadata={**base_metadata},`
			`)`
			`)`
			`idx += 1`

			`return chunks`


			`def _split_by_heading(text: str, level: int) -> list[tuple[str, str]]:`
			`"""Split text by heading level. Returns (heading_text, section_content) pairs."""`
			`pattern = rf"^({'#' * level})\s+(.+)$"`
			`parts: list[tuple[str, str]] = []`
			`current_heading = ""`
			`current_lines: list[str] = []`

			`for line in text.split("\n"):`
			`match = re.match(pattern, line)`
			`if match:`
			`# Save previous section`
			`if current_lines:`
			`parts.append((current_heading, "\n".join(current_lines)))`
			`current_heading = match.group(2).strip()`
			`current_lines = []`
			`else:`
			`current_lines.append(line)`

			`# Save last section`
			`if current_lines:`
			`parts.append((current_heading, "\n".join(current_lines)))`

			`return parts`


			`def _split_by_paragraphs(`
			`text: str, max_size: int, overlap: int`
			`) -> list[str]:`
			`"""Split text by paragraph breaks, then hard-split if needed."""`
			`paragraphs = re.split(r"\n\n+", text)`
			`chunks: list[str] = []`
			`current = ""`

			`for para in paragraphs:`
			`para = para.strip()`
			`if not para:`
			`continue`

			`if len(current) + len(para) + 2 <= max_size:`
			`current = f"{current}\n\n{para}" if current else para`
			`else:`
			`if current:`
			`chunks.append(current)`
			`# If single paragraph exceeds max, hard split`
			`if len(para) > max_size:`
			`chunks.extend(_hard_split(para, max_size, overlap))`
			`else:`
			`current = para`
			`continue`
			`current = ""`

			`if current:`
			`chunks.append(current)`

			`return chunks`


			`def _hard_split(text: str, max_size: int, overlap: int) -> list[str]:`
			`"""Hard split text at max_size with overlap."""`
fix: critical bugs and hardening from validation audit - Fix infinite loop in chunker _hard_split when overlap >= max_size - Fix tag filter false positives by quoting tag values in ChromaDB query - Fix score boost semantics (additive → multiplicative) to stay within 0-1 range - Add error handling and type hints to all API routes - Update README with proper project documentation Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-05 09:35:37 -04:00			`# Prevent infinite loop: overlap must be less than max_size`
			`if overlap >= max_size:`
			`overlap = max_size // 4`

feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC) Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-05 09:21:27 -04:00			`chunks = []`
			`start = 0`
			`while start < len(text):`
			`end = start + max_size`
			`chunks.append(text[start:end])`
			`start = end - overlap`
			`return chunks`