feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
146
src/atocore/ingestion/chunker.py
Normal file
146
src/atocore/ingestion/chunker.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""Heading-aware recursive markdown chunking."""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from atocore.config import settings
|
||||
|
||||
|
||||
@dataclass
class Chunk:
    """One piece of a chunked markdown document, as produced by chunk_markdown."""

    content: str  # the chunk text (whitespace-stripped by chunk_markdown)
    chunk_index: int  # 0-based position among the chunks *kept* for this document
    heading_path: str  # e.g. "H2 heading > H3 heading"; "" when no heading applies
    char_count: int  # len(content), precomputed by the chunker
    metadata: dict = field(default_factory=dict)  # shallow copy of the caller's base_metadata
|
||||
|
||||
|
||||
def chunk_markdown(
    body: str,
    base_metadata: dict | None = None,
    max_size: int | None = None,
    overlap: int | None = None,
    min_size: int | None = None,
) -> list[Chunk]:
    """Split markdown body into chunks using heading-aware strategy.

    1. Split on H2 boundaries
    2. If section > max_size, split on H3
    3. If still > max_size, split on paragraph breaks
    4. If still > max_size, hard split with overlap

    Any limit left as None falls back to the configured default.
    """
    max_size = max_size or settings.chunk_max_size
    overlap = overlap or settings.chunk_overlap
    min_size = min_size or settings.chunk_min_size
    base_metadata = base_metadata or {}

    # First pass: collect (heading_path, raw_text) pairs, descending one
    # heading level at a time until each piece fits within max_size.
    pieces: list[tuple[str, str]] = []
    for h2, section in _split_by_heading(body, level=2):
        if len(section) <= max_size:
            pieces.append((h2, section))
            continue
        # H2 section too large: try H3 subsections.
        for h3, subsection in _split_by_heading(section, level=3):
            if h2 and h3:
                path = f"{h2} > {h3}"
            else:
                path = h2 or h3
            if len(subsection) <= max_size:
                pieces.append((path, subsection))
            else:
                # Still too large: paragraph (and, inside it, hard) splitting.
                pieces.extend(
                    (path, part)
                    for part in _split_by_paragraphs(subsection, max_size, overlap)
                )

    # Second pass: strip, drop fragments below min_size, number the survivors.
    result: list[Chunk] = []
    for path, text in pieces:
        text = text.strip()
        if len(text) < min_size:
            continue
        result.append(
            Chunk(
                content=text,
                chunk_index=len(result),  # index counts only the kept chunks
                heading_path=path,
                char_count=len(text),
                metadata={**base_metadata},  # shallow copy; chunks don't share the dict
            )
        )
    return result
|
||||
|
||||
|
||||
def _split_by_heading(text: str, level: int) -> list[tuple[str, str]]:
|
||||
"""Split text by heading level. Returns (heading_text, section_content) pairs."""
|
||||
pattern = rf"^({'#' * level})\s+(.+)$"
|
||||
parts: list[tuple[str, str]] = []
|
||||
current_heading = ""
|
||||
current_lines: list[str] = []
|
||||
|
||||
for line in text.split("\n"):
|
||||
match = re.match(pattern, line)
|
||||
if match:
|
||||
# Save previous section
|
||||
if current_lines:
|
||||
parts.append((current_heading, "\n".join(current_lines)))
|
||||
current_heading = match.group(2).strip()
|
||||
current_lines = []
|
||||
else:
|
||||
current_lines.append(line)
|
||||
|
||||
# Save last section
|
||||
if current_lines:
|
||||
parts.append((current_heading, "\n".join(current_lines)))
|
||||
|
||||
return parts
|
||||
|
||||
|
||||
def _split_by_paragraphs(
|
||||
text: str, max_size: int, overlap: int
|
||||
) -> list[str]:
|
||||
"""Split text by paragraph breaks, then hard-split if needed."""
|
||||
paragraphs = re.split(r"\n\n+", text)
|
||||
chunks: list[str] = []
|
||||
current = ""
|
||||
|
||||
for para in paragraphs:
|
||||
para = para.strip()
|
||||
if not para:
|
||||
continue
|
||||
|
||||
if len(current) + len(para) + 2 <= max_size:
|
||||
current = f"{current}\n\n{para}" if current else para
|
||||
else:
|
||||
if current:
|
||||
chunks.append(current)
|
||||
# If single paragraph exceeds max, hard split
|
||||
if len(para) > max_size:
|
||||
chunks.extend(_hard_split(para, max_size, overlap))
|
||||
else:
|
||||
current = para
|
||||
continue
|
||||
current = ""
|
||||
|
||||
if current:
|
||||
chunks.append(current)
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
def _hard_split(text: str, max_size: int, overlap: int) -> list[str]:
|
||||
"""Hard split text at max_size with overlap."""
|
||||
chunks = []
|
||||
start = 0
|
||||
while start < len(text):
|
||||
end = start + max_size
|
||||
chunks.append(text[start:end])
|
||||
start = end - overlap
|
||||
return chunks
|
||||
Reference in New Issue
Block a user