feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation:

- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
73
tests/test_chunker.py
Normal file
73
tests/test_chunker.py
Normal file
@@ -0,0 +1,73 @@
|
||||
"""Tests for the markdown chunker."""
|
||||
|
||||
from atocore.ingestion.chunker import chunk_markdown
|
||||
|
||||
|
||||
def test_basic_chunking():
    """Check that a two-section document yields at least two valid chunks."""
    # Each section body is deliberately long so that the chunker's
    # minimum-size filter does not discard it.
    document = (
        "## Section One\n"
        "\n"
        "This is the first section with some content that is long enough "
        "to pass the minimum chunk size filter applied by the chunker.\n"
        "\n"
        "## Section Two\n"
        "\n"
        "This is the second section with different content that is also "
        "long enough to pass the minimum chunk size threshold.\n"
    )

    pieces = chunk_markdown(document)

    assert len(pieces) >= 2
    # Every chunk must report a positive size and a non-negative index.
    for piece in pieces:
        assert piece.char_count > 0
    for piece in pieces:
        assert piece.chunk_index >= 0
|
||||
|
||||
|
||||
def test_heading_path_preserved():
    """Check that at least one chunk records a heading path."""
    document = (
        "## Architecture\n"
        "\n"
        "### Layers\n"
        "\n"
        "The system has multiple layers organized in a clear hierarchy "
        "for separation of concerns and maintainability.\n"
    )

    pieces = chunk_markdown(document)

    assert len(pieces) >= 1
    # A truthy heading_path on any chunk is enough; the exact path format
    # is not pinned by this test.
    assert any(piece.heading_path for piece in pieces)
|
||||
|
||||
|
||||
def test_small_chunks_filtered():
    """Test that chunks below ``min_size`` are discarded.

    Section "A" holds only "Hi", far below the 50-character threshold,
    while section "B" is comfortably above it. The result must therefore
    be non-empty and contain no chunk shorter than the threshold.
    """
    body = """## A

Hi

## B

This is a real section with enough content to pass the minimum size threshold.
"""
    chunks = chunk_markdown(body, min_size=50)

    # Guard against a vacuous pass: without this, an empty result would
    # satisfy the per-chunk loop below without checking anything.
    assert chunks, "section B is above min_size and should produce a chunk"

    # "Hi" should be filtered out
    for c in chunks:
        assert c.char_count >= 50
|
||||
|
||||
|
||||
def test_large_section_split():
    """Check that a section longer than ``max_size`` becomes several chunks."""
    # 200 repetitions of a 5-character token: 1000 characters of body text,
    # well above the 400-character limit requested below.
    filler = "Word " * 200
    document = "## Big Section\n\n" + filler

    pieces = chunk_markdown(document, max_size=400)

    assert len(pieces) >= 2
|
||||
|
||||
|
||||
def test_metadata_passed_through():
    """Test that base metadata is included in chunks.

    The body is long enough to survive the chunker's default minimum-size
    filter (comparable to the sections used in ``test_basic_chunking``),
    so the result is asserted to be non-empty. The earlier ``if chunks:``
    guard let the test pass without checking anything whenever the short
    body was filtered out.
    """
    body = (
        "## Test\n\n"
        "Some content here that is long enough to pass the minimum chunk "
        "size filter applied by the chunker."
    )
    meta = {"source_file": "/test/file.md", "tags": ["test"]}

    chunks = chunk_markdown(body, base_metadata=meta)

    # Fail loudly instead of silently skipping the metadata check.
    assert chunks, "expected at least one chunk for a non-trivial body"
    assert chunks[0].metadata.get("source_file") == "/test/file.md"
|
||||
|
||||
|
||||
def test_empty_body():
    """Check that an empty string produces an empty list of chunks."""
    result = chunk_markdown("")
    expected = []
    assert result == expected
|
||||
Reference in New Issue
Block a user