72 lines
2.3 KiB
Python
72 lines
2.3 KiB
Python
|
|
"""Tests for the ingestion pipeline."""
|
||
|
|
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
from atocore.ingestion.parser import parse_markdown
|
||
|
|
from atocore.models.database import get_connection, init_db
|
||
|
|
from atocore.ingestion.pipeline import ingest_file
|
||
|
|
|
||
|
|
|
||
|
|
def test_parse_markdown(sample_markdown):
    """Check the title, tags, body and headings parsed from the sample file."""
    document = parse_markdown(sample_markdown)

    assert document.title == "AtoCore Architecture"

    # Both tags must be present; they are checked in a fixed order.
    for expected_tag in ("atocore", "architecture"):
        assert expected_tag in document.tags

    # Body and headings only need to be non-empty here.
    assert len(document.body) > 0
    assert len(document.headings) > 0
|
||
|
|
|
||
|
|
|
||
|
|
def test_parse_extracts_headings(sample_markdown):
    """Check that the expected heading texts are among the parsed headings."""
    document = parse_markdown(sample_markdown)

    # Element 1 of each heading entry is what gets compared to the text.
    found_texts = [entry[1] for entry in document.headings]

    for expected_text in ("AtoCore Architecture", "Overview"):
        assert expected_text in found_texts
|
||
|
|
|
||
|
|
|
||
|
|
def test_ingest_file(tmp_data_dir, sample_markdown):
    """Ingest one file and confirm its document and chunk rows are stored."""
    init_db()
    outcome = ingest_file(sample_markdown)
    assert outcome["status"] == "ingested"
    assert outcome["chunks"] > 0

    # Both lookups below key on the resolved path of the ingested file.
    stored_path = str(sample_markdown.resolve())
    document_count_sql = (
        "SELECT COUNT(*) as c FROM source_documents WHERE file_path = ?"
    )
    chunk_count_sql = (
        "SELECT COUNT(*) as c FROM source_chunks sc "
        "JOIN source_documents sd ON sc.document_id = sd.id "
        "WHERE sd.file_path = ?"
    )

    with get_connection() as conn:
        # Exactly one document row is expected for the file.
        document_row = conn.execute(document_count_sql, (stored_path,)).fetchone()
        assert document_row["c"] == 1

        # At least one chunk row must be joined to that document.
        chunk_row = conn.execute(chunk_count_sql, (stored_path,)).fetchone()
        assert chunk_row["c"] > 0
|
||
|
|
|
||
|
|
|
||
|
|
def test_ingest_skips_unchanged(tmp_data_dir, sample_markdown):
    """Ingest the same file twice and expect the second pass to be skipped."""
    init_db()
    ingest_file(sample_markdown)  # first pass; its result is not inspected

    second_pass = ingest_file(sample_markdown)
    assert second_pass["status"] == "skipped"
|
||
|
|
|
||
|
|
|
||
|
|
def test_ingest_updates_changed(tmp_data_dir, sample_markdown):
    """Append to an already-ingested file and expect it to be ingested again."""
    init_db()
    ingest_file(sample_markdown)

    # Rewrite the file with one extra section appended to its previous text.
    previous_text = sample_markdown.read_text(encoding="utf-8")
    appended_section = "\n\n## New Section\n\nNew content added."
    sample_markdown.write_text(previous_text + appended_section, encoding="utf-8")

    outcome = ingest_file(sample_markdown)
    assert outcome["status"] == "ingested"
|