"""Tests for the markdown chunker."""

from atocore.ingestion.chunker import chunk_markdown


def test_basic_chunking():
    """Test that markdown is split into chunks."""
    # Two level-2 sections, each long enough that the test expects the
    # chunker's default settings to keep both.
    body = """## Section One

This is the first section with some content that is long enough to pass the minimum chunk size filter applied by the chunker.

## Section Two

This is the second section with different content that is also long enough to pass the minimum chunk size threshold.
"""
    chunks = chunk_markdown(body)

    assert len(chunks) >= 2
    assert all(c.char_count > 0 for c in chunks)
    assert all(c.chunk_index >= 0 for c in chunks)


def test_heading_path_preserved():
    """Test that heading paths are captured."""
    # A nested heading ("### Layers" under "## Architecture") followed by
    # a single paragraph of content.
    body = """## Architecture

### Layers

The system has multiple layers organized in a clear hierarchy for separation of concerns and maintainability.
"""
    chunks = chunk_markdown(body)

    assert len(chunks) >= 1
    # At least one chunk should have heading info
    assert any(c.heading_path for c in chunks)


def test_small_chunks_filtered():
    """Test that very small chunks are discarded."""
    body = """## A

Hi

## B

This is a real section with enough content to pass the minimum size threshold.
"""
    chunks = chunk_markdown(body, min_size=50)

    # Section "B" is longer than min_size, so an empty result would mean
    # the loop below never ran and the test passed without checking anything.
    assert chunks, "expected the section above min_size to be kept"
    # "Hi" should be filtered out
    for c in chunks:
        assert c.char_count >= 50


def test_large_section_split():
    """Test that large sections are split further."""
    large_content = "Word " * 200  # 1000 chars
    body = f"## Big Section\n\n{large_content}"

    chunks = chunk_markdown(body, max_size=400)

    assert len(chunks) >= 2


def test_metadata_passed_through():
    """Test that base metadata is included in chunks."""
    # The content is as long as the sections in test_basic_chunking, which
    # that test expects to be kept under default settings, so this test does
    # not depend on a short body slipping past the minimum-size filter.
    body = (
        "## Test\n\n"
        "Some content here that is long enough to pass the minimum chunk "
        "size filter applied by the chunker with its default settings."
    )
    meta = {"source_file": "/test/file.md", "tags": ["test"]}

    chunks = chunk_markdown(body, base_metadata=meta)

    # Fail loudly instead of silently skipping the metadata check when no
    # chunk is produced.
    assert chunks, "expected at least one chunk so the metadata check runs"
    assert chunks[0].metadata.get("source_file") == "/test/file.md"


def test_empty_body():
    """Test chunking an empty body."""
    chunks = chunk_markdown("")

    assert chunks == []