feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)

Complete implementation of the personal context engine foundation:
- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-05 09:21:27 -04:00
parent 32ce409a7b
commit b4afbbb53a
34 changed files with 1756 additions and 0 deletions

0
tests/__init__.py Normal file
View File

114
tests/conftest.py Normal file
View File

@@ -0,0 +1,114 @@
"""pytest configuration and shared fixtures."""
import os
import tempfile
from pathlib import Path
import pytest
# Force test data directory
os.environ["ATOCORE_DATA_DIR"] = tempfile.mkdtemp(prefix="atocore_test_")
os.environ["ATOCORE_DEBUG"] = "true"
@pytest.fixture
def tmp_data_dir(tmp_path):
    """Redirect ATOCORE_DATA_DIR to pytest's tmp_path and reset cached state.

    Returns tmp_path so tests can inspect the directory. Statement order is
    significant: the environment variable must be set before Settings is
    rebuilt (Settings reads the environment), and the vector-store singleton
    must be cleared so the next access lazily re-opens under the new path.
    """
    os.environ["ATOCORE_DATA_DIR"] = str(tmp_path)
    # Reset singletons: rebuild settings from the (now updated) environment...
    from atocore import config
    config.settings = config.Settings()
    # ...and drop the module-level vector store so it is recreated on demand.
    import atocore.retrieval.vector_store as vs
    vs._store = None
    return tmp_path
@pytest.fixture
def sample_markdown(tmp_path) -> Path:
    """Write a small markdown note with YAML frontmatter and return its path."""
    content = """---
tags:
  - atocore
  - architecture
date: 2026-04-05
---
# AtoCore Architecture
## Overview
AtoCore is a personal context engine that enriches LLM interactions
with durable memory, structured context, and project knowledge.
## Layers
The system has these layers:
1. Main PKM (human, messy, exploratory)
2. AtoVault (system mirror)
3. AtoDrive (trusted project truth)
4. Structured Memory (DB)
5. Semantic Retrieval (vector DB)
## Memory Types
AtoCore supports these memory types:
- Identity
- Preferences
- Project Memory
- Episodic Memory
- Knowledge Objects
- Adaptation Memory
- Trusted Project State
## Trust Precedence
When sources conflict:
1. Trusted Project State wins
2. AtoDrive overrides PKM
3. Most recent confirmed wins
4. Higher confidence wins
5. Equal → flag conflict
No silent merging.
"""
    note_path = tmp_path / "test_note.md"
    note_path.write_text(content, encoding="utf-8")
    return note_path
@pytest.fixture
def sample_folder(tmp_path, sample_markdown) -> Path:
    """Return a directory holding two markdown notes.

    Requesting sample_markdown guarantees test_note.md already exists in
    tmp_path; this fixture adds second_note.md beside it.
    """
    body = """---
tags:
  - chunking
---
# Chunking Strategy
## Approach
Heading-aware recursive splitting:
1. Split on H2 boundaries first
2. If section > 800 chars, split on H3
3. If still > 800 chars, split on paragraphs
4. Hard split at 800 chars with 100 char overlap
## Parameters
- max_chunk_size: 800 characters
- overlap: 100 characters
- min_chunk_size: 50 characters
"""
    (tmp_path / "second_note.md").write_text(body, encoding="utf-8")
    return tmp_path

73
tests/test_chunker.py Normal file
View File

@@ -0,0 +1,73 @@
"""Tests for the markdown chunker."""
from atocore.ingestion.chunker import chunk_markdown
def test_basic_chunking():
    """Two H2 sections should produce at least two non-empty, indexed chunks."""
    document = """## Section One
This is the first section with some content that is long enough to pass the minimum chunk size filter applied by the chunker.
## Section Two
This is the second section with different content that is also long enough to pass the minimum chunk size threshold.
"""
    chunks = chunk_markdown(document)
    assert len(chunks) >= 2
    for chunk in chunks:
        assert chunk.char_count > 0
        assert chunk.chunk_index >= 0
def test_heading_path_preserved():
    """Chunks should carry the heading hierarchy they were cut from."""
    document = """## Architecture
### Layers
The system has multiple layers organized in a clear hierarchy for separation of concerns and maintainability.
"""
    chunks = chunk_markdown(document)
    assert chunks
    # At least one chunk must record its heading path.
    assert any(chunk.heading_path for chunk in chunks)
def test_small_chunks_filtered():
    """Sections shorter than min_size must be dropped, not emitted.

    Fix: the original `for` loop passed vacuously when chunk_markdown
    returned no chunks at all. Section B is well above the 50-char floor,
    so we can require a non-empty result and then verify every surviving
    chunk meets the minimum size (i.e. the tiny "Hi" section was filtered).
    """
    body = """## A
Hi
## B
This is a real section with enough content to pass the minimum size threshold.
"""
    chunks = chunk_markdown(body, min_size=50)
    # Guard against a vacuously-passing test: the real section must survive.
    assert chunks
    for c in chunks:
        assert c.char_count >= 50
def test_large_section_split():
    """A section exceeding max_size must be broken into several chunks."""
    filler = "Word " * 200  # ~1000 chars, well past the 400-char cap below
    document = f"## Big Section\n\n{filler}"
    result = chunk_markdown(document, max_size=400)
    assert len(result) >= 2
def test_metadata_passed_through():
    """base_metadata must be copied onto every produced chunk.

    Fix: the original guarded its assertion with `if chunks:`, so the test
    passed silently whenever chunking produced nothing (the short body could
    fall under the minimum chunk size). The body is lengthened so at least
    one chunk is guaranteed, the result is asserted non-empty, and every
    chunk — not just the first — is checked.
    """
    body = (
        "## Test\n\nSome content here that is definitely long enough to "
        "survive the minimum chunk size filter applied by the chunker."
    )
    meta = {"source_file": "/test/file.md", "tags": ["test"]}
    chunks = chunk_markdown(body, base_metadata=meta)
    assert chunks, "expected at least one chunk for a non-trivial body"
    for chunk in chunks:
        assert chunk.metadata.get("source_file") == "/test/file.md"
def test_empty_body():
    """An empty document must yield an empty chunk list."""
    assert chunk_markdown("") == []

View File

@@ -0,0 +1,60 @@
"""Tests for the context builder."""
from atocore.context.builder import build_context, get_last_context_pack
from atocore.ingestion.pipeline import ingest_file
from atocore.models.database import init_db
def test_build_context_returns_pack(tmp_data_dir, sample_markdown):
    """After ingesting one file, build_context yields a populated pack."""
    init_db()
    ingest_file(sample_markdown)
    pack = build_context("What is AtoCore?")
    assert pack.total_chars > 0
    assert pack.chunks_used
    assert pack.budget_remaining >= 0
    # Both framing markers must be present in the rendered context.
    for marker in ("--- AtoCore Context ---", "--- End Context ---"):
        assert marker in pack.formatted_context
def test_context_respects_budget(tmp_data_dir, sample_markdown):
    """total_chars never exceeds the requested character budget."""
    init_db()
    ingest_file(sample_markdown)
    limited = build_context("What is AtoCore?", budget=500)
    assert limited.total_chars <= 500
def test_context_with_project_hint(tmp_data_dir, sample_markdown):
    """A project hint must not prevent results from being returned."""
    init_db()
    ingest_file(sample_markdown)
    pack = build_context("What is the architecture?", project_hint="atocore")
    # Boosting should reorder, not eliminate, matching chunks.
    assert pack.chunks_used
    assert pack.total_chars > 0
def test_last_context_pack_stored(tmp_data_dir, sample_markdown):
    """The most recent pack is cached and retrievable for debug inspection."""
    init_db()
    ingest_file(sample_markdown)
    build_context("test prompt")
    cached = get_last_context_pack()
    assert cached is not None
    assert cached.query == "test prompt"
def test_full_prompt_structure(tmp_data_dir, sample_markdown):
    """full_prompt embeds the preamble, the context block, and the query."""
    init_db()
    ingest_file(sample_markdown)
    pack = build_context("What are memory types?")
    prompt = pack.full_prompt
    assert "knowledge base" in prompt.lower()
    assert "--- AtoCore Context ---" in prompt
    assert "What are memory types?" in prompt

71
tests/test_ingestion.py Normal file
View File

@@ -0,0 +1,71 @@
"""Tests for the ingestion pipeline."""
from pathlib import Path
from atocore.ingestion.parser import parse_markdown
from atocore.models.database import get_connection, init_db
from atocore.ingestion.pipeline import ingest_file
def test_parse_markdown(sample_markdown):
    """Frontmatter tags, title, body, and headings come out of the parser."""
    parsed = parse_markdown(sample_markdown)
    assert parsed.title == "AtoCore Architecture"
    for tag in ("atocore", "architecture"):
        assert tag in parsed.tags
    assert parsed.body
    assert parsed.headings
def test_parse_extracts_headings(sample_markdown):
    """Heading text (second tuple element) is captured during parsing."""
    parsed = parse_markdown(sample_markdown)
    titles = {h[1] for h in parsed.headings}
    assert {"AtoCore Architecture", "Overview"} <= titles
def test_ingest_file(tmp_data_dir, sample_markdown):
    """Ingesting a file records the document and its chunks in SQLite."""
    init_db()
    outcome = ingest_file(sample_markdown)
    assert outcome["status"] == "ingested"
    assert outcome["chunks"] > 0
    path_param = (str(sample_markdown.resolve()),)
    with get_connection() as conn:
        # Exactly one source_documents row for this path...
        doc_row = conn.execute(
            "SELECT COUNT(*) as c FROM source_documents WHERE file_path = ?",
            path_param,
        ).fetchone()
        assert doc_row["c"] == 1
        # ...and at least one chunk joined back to that document.
        chunk_row = conn.execute(
            "SELECT COUNT(*) as c FROM source_chunks sc "
            "JOIN source_documents sd ON sc.document_id = sd.id "
            "WHERE sd.file_path = ?",
            path_param,
        ).fetchone()
        assert chunk_row["c"] > 0
def test_ingest_skips_unchanged(tmp_data_dir, sample_markdown):
    """A second ingest of an identical file reports 'skipped'."""
    init_db()
    ingest_file(sample_markdown)
    repeat = ingest_file(sample_markdown)
    assert repeat["status"] == "skipped"
def test_ingest_updates_changed(tmp_data_dir, sample_markdown):
    """Appending content invalidates the stored hash and triggers re-ingest."""
    init_db()
    ingest_file(sample_markdown)
    # Append a section so the file's content hash changes.
    original = sample_markdown.read_text(encoding="utf-8")
    sample_markdown.write_text(
        original + "\n\n## New Section\n\nNew content added.",
        encoding="utf-8",
    )
    rerun = ingest_file(sample_markdown)
    assert rerun["status"] == "ingested"

View File

@@ -0,0 +1,40 @@
# Evaluation prompts for the GigaBIT M1 project corpus.
# Each entry: id (unique key), prompt (question sent to the context engine),
# project (retrieval boost hint), expected (human-readable grading criteria
# describing what a correct answer should mention).
prompts:
  - id: g1
    prompt: "What is the GigaBIT M1 project about?"
    project: gigabit
    expected: "Should mention 1.2m primary mirror, StarSpec, telescope"
  - id: g2
    prompt: "What are the main requirements for the M1 mirror?"
    project: gigabit
    expected: "Should mention optical/mechanical requirements, SOW, diameter, Zerodur"
  - id: g3
    prompt: "What vendors are involved in the project?"
    project: gigabit
    expected: "Should mention Optiques Fullum, StarSpec, Atomaste, or subcontractors"
  - id: g4
    prompt: "What is the status of the CDR?"
    project: gigabit
    expected: "Should mention Critical Design Review status, CBUSH, design completion"
  - id: g5
    prompt: "What are the key design decisions made so far?"
    project: gigabit
    expected: "Should mention design phases, PDR, assumptions, blank order"
  - id: g6
    prompt: "What FEA optimization work has been done?"
    project: gigabit
    expected: "Should mention FEA analysis, optimization approach, WFE, displacement data"
  - id: g7
    prompt: "What is the cost reduction strategy?"
    project: gigabit
    expected: "Should mention cost reduction campaign, trade-off, topology selection"
  - id: g8
    prompt: "What are the mirror blank specifications?"
    project: gigabit
    expected: "Should mention 1200mm diameter, Zerodur, optical specifications"

View File

@@ -0,0 +1,40 @@
# Evaluation prompts for AtoCore's own design documentation.
# Each entry: id (unique key), prompt (question sent to the context engine),
# project (retrieval boost hint), expected (human-readable grading criteria
# describing what a correct answer should mention).
prompts:
  - id: p1
    prompt: "What is AtoCore's architecture?"
    project: atocore
    expected: "Should mention layered architecture, SQLite, vector DB"
  - id: p2
    prompt: "What chunking strategy does AtoCore use?"
    project: atocore
    expected: "Should mention heading-aware splitting, 800 char max"
  - id: p3
    prompt: "What is the trust precedence order?"
    project: atocore
    expected: "Should list: Trusted Project State > AtoDrive > validated memory"
  - id: p4
    prompt: "How does AtoCore handle conflicts between sources?"
    project: atocore
    expected: "Should mention conflict resolution rules, no silent merging"
  - id: p5
    prompt: "What are the different memory types?"
    project: atocore
    expected: "Should list: Identity, Preferences, Project, Episodic, Knowledge, Adaptation, Trusted Project State"
  - id: p6
    prompt: "What is the context budget allocation?"
    project: atocore
    expected: "Should mention percentages: identity 5%, preferences 5%, project 20%, episodic 10%, retrieval 60%"
  - id: p7
    prompt: "What is a trivial prompt in AtoCore?"
    project: atocore
    expected: "Should mention: no project ref, no proper nouns, no past context dependency"
  - id: p8
    prompt: "What are the success criteria for the first win?"
    project: atocore
    expected: "Should mention: saves >=5 min lookup, >=80-90% accuracy, >=10 test prompts"

41
tests/test_retrieval.py Normal file
View File

@@ -0,0 +1,41 @@
"""Tests for the retrieval system."""
from atocore.ingestion.pipeline import ingest_file
from atocore.models.database import init_db
from atocore.retrieval.retriever import retrieve
from atocore.retrieval.vector_store import get_vector_store
def test_retrieve_returns_results(tmp_data_dir, sample_markdown):
    """Retrieval over an ingested file yields scored, non-empty chunks."""
    init_db()
    ingest_file(sample_markdown)
    hits = retrieve("What are the memory types?", top_k=5)
    assert hits
    for hit in hits:
        assert hit.score > 0
        assert hit.content
def test_retrieve_scores_ranked(tmp_data_dir, sample_markdown):
    """Results come back ordered by descending similarity score.

    Fix: the original wrapped the ordering check in `if len(results) >= 2:`,
    so it passed vacuously whenever retrieval returned fewer than two hits.
    The sample note produces several chunks and top_k=5 is requested, so we
    can require at least two results before checking the ordering.
    """
    init_db()
    ingest_file(sample_markdown)
    results = retrieve("architecture layers", top_k=5)
    assert len(results) >= 2, "expected multiple chunks from the sample note"
    scores = [r.score for r in results]
    assert scores == sorted(scores, reverse=True)
def test_vector_store_count(tmp_data_dir, sample_markdown):
    """The vector store reports a positive chunk count after ingestion."""
    init_db()
    # Clear the module-level singleton so this test gets a fresh store.
    import atocore.retrieval.vector_store as vs
    vs._store = None
    ingest_file(sample_markdown)
    store = get_vector_store()
    assert store.count > 0