Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
74 lines
2.1 KiB
Python
"""Tests for the markdown chunker."""
from atocore.ingestion.chunker import chunk_markdown
def test_basic_chunking():
    """Verify a markdown body with two sections yields multiple valid chunks."""
    body = """## Section One

This is the first section with some content that is long enough to pass the minimum chunk size filter applied by the chunker.

## Section Two

This is the second section with different content that is also long enough to pass the minimum chunk size threshold.
"""
    result = chunk_markdown(body)
    # Two headed sections should produce at least two chunks.
    assert len(result) >= 2
    # Every chunk must carry real content and a non-negative position.
    for chunk in result:
        assert chunk.char_count > 0
        assert chunk.chunk_index >= 0
|
def test_heading_path_preserved():
    """Verify the chunker records heading hierarchy on the chunks it emits."""
    body = """## Architecture

### Layers

The system has multiple layers organized in a clear hierarchy for separation of concerns and maintainability.
"""
    produced = chunk_markdown(body)
    assert produced
    # Heading metadata must survive chunking on at least one chunk.
    assert any(chunk.heading_path for chunk in produced)
|
def test_small_chunks_filtered():
    """Verify chunks below min_size never appear in the output."""
    body = """## A

Hi

## B

This is a real section with enough content to pass the minimum size threshold.
"""
    result = chunk_markdown(body, min_size=50)
    # The two-character "Hi" section must have been dropped by the filter.
    assert all(chunk.char_count >= 50 for chunk in result)
|
def test_large_section_split():
    """Verify a section exceeding max_size is broken into several chunks."""
    filler = "Word " * 200  # roughly 1000 characters of content
    pieces = chunk_markdown(f"## Big Section\n\n{filler}", max_size=400)
    # A ~1000-char section with a 400-char cap cannot fit in one chunk.
    assert len(pieces) >= 2
|
def test_metadata_passed_through():
    """Test that base metadata is included in chunks.

    The previous version guarded the assertion with ``if chunks:``, so the
    test passed vacuously whenever the short body was filtered out by the
    minimum-size threshold. The body is now long enough to guarantee at
    least one chunk, and non-emptiness is asserted explicitly.
    """
    body = (
        "## Test\n\n"
        "Some content here that is long enough to clear the minimum chunk "
        "size filter so the metadata assertion below can never be skipped."
    )
    meta = {"source_file": "/test/file.md", "tags": ["test"]}
    chunks = chunk_markdown(body, base_metadata=meta)
    # Must not pass vacuously: the body above is well over any small min_size.
    assert chunks
    assert chunks[0].metadata.get("source_file") == "/test/file.md"
|
def test_empty_body():
    """Verify an empty input produces no chunks at all."""
    assert chunk_markdown("") == []