Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
72 lines
2.3 KiB
Python
"""Tests for the ingestion pipeline."""
|
|
|
|
from pathlib import Path
|
|
|
|
from atocore.ingestion.parser import parse_markdown
|
|
from atocore.models.database import get_connection, init_db
|
|
from atocore.ingestion.pipeline import ingest_file
|
|
|
|
|
|
def test_parse_markdown(sample_markdown):
    """Parsing a markdown file yields title, tags, body, and headings."""
    doc = parse_markdown(sample_markdown)

    assert doc.title == "AtoCore Architecture"
    # Both frontmatter tags should be present.
    assert "atocore" in doc.tags
    assert "architecture" in doc.tags
    # Body and heading list must be non-empty.
    assert len(doc.body) > 0
    assert len(doc.headings) > 0
|
|
|
|
|
|
def test_parse_extracts_headings(sample_markdown):
    """The parser records each heading's text (element index 1)."""
    doc = parse_markdown(sample_markdown)
    texts = [entry[1] for entry in doc.headings]
    assert "AtoCore Architecture" in texts
    assert "Overview" in texts
|
|
|
|
|
|
def test_ingest_file(tmp_data_dir, sample_markdown):
    """A fresh file is ingested and its document + chunks land in SQLite."""
    init_db()

    outcome = ingest_file(sample_markdown)
    assert outcome["status"] == "ingested"
    assert outcome["chunks"] > 0

    # The pipeline stores the resolved absolute path as the lookup key.
    stored_path = str(sample_markdown.resolve())

    # Verify the file was stored in DB
    with get_connection() as conn:
        doc_row = conn.execute(
            "SELECT COUNT(*) as c FROM source_documents WHERE file_path = ?",
            (stored_path,),
        ).fetchone()
        assert doc_row["c"] == 1

        chunk_row = conn.execute(
            "SELECT COUNT(*) as c FROM source_chunks sc "
            "JOIN source_documents sd ON sc.document_id = sd.id "
            "WHERE sd.file_path = ?",
            (stored_path,),
        ).fetchone()
        assert chunk_row["c"] > 0
|
|
|
|
|
|
def test_ingest_skips_unchanged(tmp_data_dir, sample_markdown):
    """Re-running ingestion on an identical file reports 'skipped'."""
    init_db()

    # First pass ingests; second pass should detect no change.
    ingest_file(sample_markdown)
    second = ingest_file(sample_markdown)
    assert second["status"] == "skipped"
|
|
|
|
|
|
def test_ingest_updates_changed(tmp_data_dir, sample_markdown):
    """A modified file is re-ingested rather than skipped."""
    init_db()
    ingest_file(sample_markdown)

    # Modify the file
    original_text = sample_markdown.read_text(encoding="utf-8")
    updated_text = original_text + "\n\n## New Section\n\nNew content added."
    sample_markdown.write_text(updated_text, encoding="utf-8")

    outcome = ingest_file(sample_markdown)
    assert outcome["status"] == "ingested"
|