feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)

Complete implementation of the personal context engine foundation:

- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:21:27 -04:00
parent 32ce409a7b
commit b4afbbb53a
34 changed files with 1756 additions and 0 deletions
--- a/tests/test_ingestion.py
+++ b/tests/test_ingestion.py
@@ -0,0 +1,71 @@
+"""Tests for the ingestion pipeline."""
+
+from pathlib import Path
+
+from atocore.ingestion.parser import parse_markdown
+from atocore.models.database import get_connection, init_db
+from atocore.ingestion.pipeline import ingest_file
+
+
+def test_parse_markdown(sample_markdown):
+    """Parse the sample file and check its title, tags, body and headings."""
+    doc = parse_markdown(sample_markdown)
+    assert doc.title == "AtoCore Architecture"
+    for expected_tag in ("atocore", "architecture"):
+        assert expected_tag in doc.tags
+    assert len(doc.body) > 0
+    assert len(doc.headings) > 0
+
+
+def test_parse_extracts_headings(sample_markdown):
+    """Check that item 1 of each parsed heading includes the expected texts."""
+    headings = parse_markdown(sample_markdown).headings
+    found = [entry[1] for entry in headings]
+    assert "AtoCore Architecture" in found
+    assert "Overview" in found
+
+
+def test_ingest_file(tmp_data_dir, sample_markdown):
+    """Ingest one file, then confirm its document and chunk rows exist."""
+    init_db()
+    outcome = ingest_file(sample_markdown)
+    assert outcome["status"] == "ingested"
+    assert outcome["chunks"] > 0
+
+    # Both queries look the file up by its resolved path.
+    doc_sql = "SELECT COUNT(*) as c FROM source_documents WHERE file_path = ?"
+    chunk_sql = (
+        "SELECT COUNT(*) as c FROM source_chunks sc "
+        "JOIN source_documents sd ON sc.document_id = sd.id "
+        "WHERE sd.file_path = ?"
+    )
+    with get_connection() as db:
+        # Exactly one document row is expected for the ingested file.
+        doc_row = db.execute(doc_sql, (str(sample_markdown.resolve()),)).fetchone()
+        assert doc_row["c"] == 1
+
+        # At least one chunk row must be joined to that document.
+        chunk_row = db.execute(chunk_sql, (str(sample_markdown.resolve()),)).fetchone()
+        assert chunk_row["c"] > 0
+
+
+def test_ingest_skips_unchanged(tmp_data_dir, sample_markdown):
+    """A second ingest of an unmodified file should report "skipped"."""
+    init_db()
+    ingest_file(sample_markdown)
+    second_pass = ingest_file(sample_markdown)
+    assert second_pass["status"] == "skipped"
+
+
+def test_ingest_updates_changed(tmp_data_dir, sample_markdown):
+    """Appending content to an ingested file should yield status "ingested"."""
+    init_db()
+    ingest_file(sample_markdown)
+
+    # Append a new section so the file content differs from the first ingest.
+    appended = "\n\n## New Section\n\nNew content added."
+    previous = sample_markdown.read_text(encoding="utf-8")
+    sample_markdown.write_text(previous + appended, encoding="utf-8")
+
+    reingest = ingest_file(sample_markdown)
+    assert reingest["status"] == "ingested"