feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
114
tests/conftest.py
Normal file
114
tests/conftest.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""pytest configuration and shared fixtures."""
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Redirect all AtoCore state into a fresh throwaway directory before any
# atocore module is imported, and turn on debug output for the test run.
os.environ["ATOCORE_DEBUG"] = "true"
os.environ["ATOCORE_DATA_DIR"] = tempfile.mkdtemp(prefix="atocore_test_")
|
||||
|
||||
|
||||
@pytest.fixture
def tmp_data_dir(tmp_path):
    """Point AtoCore at an isolated per-test data directory.

    Redirects ATOCORE_DATA_DIR to pytest's tmp_path, rebuilds the settings
    singleton so it reads the new location, and clears the cached vector
    store so each test starts from a clean slate.
    """
    os.environ["ATOCORE_DATA_DIR"] = str(tmp_path)

    # Rebuild the settings singleton against the updated environment.
    from atocore import config
    config.settings = config.Settings()

    # Drop the cached vector-store instance so the next access re-creates
    # it under the new data directory.
    import atocore.retrieval.vector_store as vs
    vs._store = None

    return tmp_path
|
||||
|
||||
|
||||
@pytest.fixture
def sample_markdown(tmp_path) -> Path:
    """Write a representative markdown note (frontmatter, headings, lists)."""
    note = tmp_path / "test_note.md"
    note.write_text(
        """---
tags:
- atocore
- architecture
date: 2026-04-05
---
# AtoCore Architecture

## Overview

AtoCore is a personal context engine that enriches LLM interactions
with durable memory, structured context, and project knowledge.

## Layers

The system has these layers:

1. Main PKM (human, messy, exploratory)
2. AtoVault (system mirror)
3. AtoDrive (trusted project truth)
4. Structured Memory (DB)
5. Semantic Retrieval (vector DB)

## Memory Types

AtoCore supports these memory types:

- Identity
- Preferences
- Project Memory
- Episodic Memory
- Knowledge Objects
- Adaptation Memory
- Trusted Project State

## Trust Precedence

When sources conflict:

1. Trusted Project State wins
2. AtoDrive overrides PKM
3. Most recent confirmed wins
4. Higher confidence wins
5. Equal → flag conflict

No silent merging.
""",
        encoding="utf-8",
    )
    return note
|
||||
|
||||
|
||||
@pytest.fixture
def sample_folder(tmp_path, sample_markdown) -> Path:
    """Return a folder containing two markdown notes.

    The sample_markdown dependency has already written test_note.md into
    tmp_path; this fixture adds second_note.md alongside it.
    """
    (tmp_path / "second_note.md").write_text(
        """---
tags:
- chunking
---
# Chunking Strategy

## Approach

Heading-aware recursive splitting:

1. Split on H2 boundaries first
2. If section > 800 chars, split on H3
3. If still > 800 chars, split on paragraphs
4. Hard split at 800 chars with 100 char overlap

## Parameters

- max_chunk_size: 800 characters
- overlap: 100 characters
- min_chunk_size: 50 characters
""",
        encoding="utf-8",
    )
    return tmp_path
|
||||
73
tests/test_chunker.py
Normal file
73
tests/test_chunker.py
Normal file
@@ -0,0 +1,73 @@
|
||||
"""Tests for the markdown chunker."""
|
||||
|
||||
from atocore.ingestion.chunker import chunk_markdown
|
||||
|
||||
|
||||
def test_basic_chunking():
    """A document with two H2 sections yields at least two valid chunks."""
    text = """## Section One

This is the first section with some content that is long enough to pass the minimum chunk size filter applied by the chunker.

## Section Two

This is the second section with different content that is also long enough to pass the minimum chunk size threshold.
"""
    chunks = chunk_markdown(text)
    assert len(chunks) >= 2
    for chunk in chunks:
        assert chunk.char_count > 0
        assert chunk.chunk_index >= 0
|
||||
|
||||
|
||||
def test_heading_path_preserved():
    """Chunks should record the heading hierarchy they came from."""
    text = """## Architecture

### Layers

The system has multiple layers organized in a clear hierarchy for separation of concerns and maintainability.
"""
    chunks = chunk_markdown(text)
    assert len(chunks) >= 1
    # Heading information must survive chunking on at least one chunk.
    assert any(chunk.heading_path for chunk in chunks)
|
||||
|
||||
|
||||
def test_small_chunks_filtered():
    """Sections below min_size are dropped from the output."""
    text = """## A

Hi

## B

This is a real section with enough content to pass the minimum size threshold.
"""
    chunks = chunk_markdown(text, min_size=50)
    # The two-character "Hi" section must not survive the size filter.
    assert all(chunk.char_count >= 50 for chunk in chunks)
|
||||
|
||||
|
||||
def test_large_section_split():
    """Oversized sections are recursively split into smaller chunks."""
    filler = "Word " * 200  # ~1000 chars, well past the max_size below
    document = f"## Big Section\n\n{filler}"
    assert len(chunk_markdown(document, max_size=400)) >= 2
|
||||
|
||||
|
||||
def test_metadata_passed_through():
    """Base metadata must be propagated to every produced chunk."""
    body = (
        "## Test\n\n"
        "Some content here that is definitely long enough to pass the "
        "minimum chunk size filter applied by the chunker."
    )
    meta = {"source_file": "/test/file.md", "tags": ["test"]}
    chunks = chunk_markdown(body, base_metadata=meta)
    # The section is well above the 50-char minimum, so at least one chunk
    # must come back -- the old `if chunks:` guard let an empty result pass
    # this test vacuously (the original 38-char body could be filtered out).
    assert chunks
    for chunk in chunks:
        assert chunk.metadata.get("source_file") == "/test/file.md"
|
||||
|
||||
|
||||
def test_empty_body():
    """An empty document produces no chunks at all."""
    assert chunk_markdown("") == []
|
||||
60
tests/test_context_builder.py
Normal file
60
tests/test_context_builder.py
Normal file
@@ -0,0 +1,60 @@
|
||||
"""Tests for the context builder."""
|
||||
|
||||
from atocore.context.builder import build_context, get_last_context_pack
|
||||
from atocore.ingestion.pipeline import ingest_file
|
||||
from atocore.models.database import init_db
|
||||
|
||||
|
||||
def test_build_context_returns_pack(tmp_data_dir, sample_markdown):
    """Building context over an ingested file yields a populated pack."""
    init_db()
    ingest_file(sample_markdown)

    pack = build_context("What is AtoCore?")

    # The pack must carry content, accounting, and both framing markers.
    assert pack.total_chars > 0
    assert pack.chunks_used
    assert pack.budget_remaining >= 0
    for marker in ("--- AtoCore Context ---", "--- End Context ---"):
        assert marker in pack.formatted_context
|
||||
|
||||
|
||||
def test_context_respects_budget(tmp_data_dir, sample_markdown):
    """The assembled context never exceeds the requested character budget."""
    init_db()
    ingest_file(sample_markdown)

    budget = 500
    pack = build_context("What is AtoCore?", budget=budget)
    assert pack.total_chars <= budget
|
||||
|
||||
|
||||
def test_context_with_project_hint(tmp_data_dir, sample_markdown):
    """Passing a project hint still produces a non-empty context pack."""
    init_db()
    ingest_file(sample_markdown)

    pack = build_context("What is the architecture?", project_hint="atocore")

    # Boosting must not filter everything out: results are still returned.
    assert pack.chunks_used
    assert pack.total_chars > 0
|
||||
|
||||
|
||||
def test_last_context_pack_stored(tmp_data_dir, sample_markdown):
    """The builder keeps the most recent pack around for debug inspection."""
    init_db()
    ingest_file(sample_markdown)

    build_context("test prompt")

    stored = get_last_context_pack()
    assert stored is not None
    assert stored.query == "test prompt"
|
||||
|
||||
|
||||
def test_full_prompt_structure(tmp_data_dir, sample_markdown):
    """The full prompt embeds the context block and the user's question."""
    init_db()
    ingest_file(sample_markdown)

    question = "What are memory types?"
    pack = build_context(question)

    prompt = pack.full_prompt
    assert "knowledge base" in prompt.lower()
    assert "--- AtoCore Context ---" in prompt
    assert question in prompt
|
||||
71
tests/test_ingestion.py
Normal file
71
tests/test_ingestion.py
Normal file
@@ -0,0 +1,71 @@
|
||||
"""Tests for the ingestion pipeline."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from atocore.ingestion.parser import parse_markdown
|
||||
from atocore.models.database import get_connection, init_db
|
||||
from atocore.ingestion.pipeline import ingest_file
|
||||
|
||||
|
||||
def test_parse_markdown(sample_markdown):
    """Frontmatter, title, body and headings are all extracted."""
    parsed = parse_markdown(sample_markdown)

    assert parsed.title == "AtoCore Architecture"
    for tag in ("atocore", "architecture"):
        assert tag in parsed.tags
    assert parsed.body
    assert parsed.headings
|
||||
|
||||
|
||||
def test_parse_extracts_headings(sample_markdown):
    """Heading text is captured for both H1 and H2 levels."""
    parsed = parse_markdown(sample_markdown)
    # Each heading entry is a (level, text) pair -- collect the texts.
    texts = {heading[1] for heading in parsed.headings}
    assert "AtoCore Architecture" in texts
    assert "Overview" in texts
|
||||
|
||||
|
||||
def test_ingest_file(tmp_data_dir, sample_markdown):
    """Ingesting a file reports success and persists document + chunks."""
    init_db()

    result = ingest_file(sample_markdown)
    assert result["status"] == "ingested"
    assert result["chunks"] > 0

    # The document row and its chunk rows must both land in SQLite.
    path = str(sample_markdown.resolve())
    with get_connection() as conn:
        doc_row = conn.execute(
            "SELECT COUNT(*) as c FROM source_documents WHERE file_path = ?",
            (path,),
        ).fetchone()
        assert doc_row["c"] == 1

        chunk_row = conn.execute(
            "SELECT COUNT(*) as c FROM source_chunks sc "
            "JOIN source_documents sd ON sc.document_id = sd.id "
            "WHERE sd.file_path = ?",
            (path,),
        ).fetchone()
        assert chunk_row["c"] > 0
|
||||
|
||||
|
||||
def test_ingest_skips_unchanged(tmp_data_dir, sample_markdown):
    """A second ingest of an identical file is a no-op."""
    init_db()
    ingest_file(sample_markdown)

    # Nothing changed on disk, so the pipeline should skip the file.
    assert ingest_file(sample_markdown)["status"] == "skipped"
|
||||
|
||||
|
||||
def test_ingest_updates_changed(tmp_data_dir, sample_markdown):
    """Editing a file on disk forces it through ingestion again."""
    init_db()
    ingest_file(sample_markdown)

    # Append a fresh section so the file's content changes on disk.
    original = sample_markdown.read_text(encoding="utf-8")
    sample_markdown.write_text(
        original + "\n\n## New Section\n\nNew content added.",
        encoding="utf-8",
    )

    assert ingest_file(sample_markdown)["status"] == "ingested"
|
||||
40
tests/test_prompts/gigabit_prompts.yaml
Normal file
40
tests/test_prompts/gigabit_prompts.yaml
Normal file
@@ -0,0 +1,40 @@
|
||||
prompts:
|
||||
- id: g1
|
||||
prompt: "What is the GigaBIT M1 project about?"
|
||||
project: gigabit
|
||||
expected: "Should mention 1.2m primary mirror, StarSpec, telescope"
|
||||
|
||||
- id: g2
|
||||
prompt: "What are the main requirements for the M1 mirror?"
|
||||
project: gigabit
|
||||
expected: "Should mention optical/mechanical requirements, SOW, diameter, Zerodur"
|
||||
|
||||
- id: g3
|
||||
prompt: "What vendors are involved in the project?"
|
||||
project: gigabit
|
||||
expected: "Should mention Optiques Fullum, StarSpec, Atomaste, or subcontractors"
|
||||
|
||||
- id: g4
|
||||
prompt: "What is the status of the CDR?"
|
||||
project: gigabit
|
||||
expected: "Should mention Critical Design Review status, CBUSH, design completion"
|
||||
|
||||
- id: g5
|
||||
prompt: "What are the key design decisions made so far?"
|
||||
project: gigabit
|
||||
expected: "Should mention design phases, PDR, assumptions, blank order"
|
||||
|
||||
- id: g6
|
||||
prompt: "What FEA optimization work has been done?"
|
||||
project: gigabit
|
||||
expected: "Should mention FEA analysis, optimization approach, WFE, displacement data"
|
||||
|
||||
- id: g7
|
||||
prompt: "What is the cost reduction strategy?"
|
||||
project: gigabit
|
||||
expected: "Should mention cost reduction campaign, trade-off, topology selection"
|
||||
|
||||
- id: g8
|
||||
prompt: "What are the mirror blank specifications?"
|
||||
project: gigabit
|
||||
expected: "Should mention 1200mm diameter, Zerodur, optical specifications"
|
||||
40
tests/test_prompts/prompts.yaml
Normal file
40
tests/test_prompts/prompts.yaml
Normal file
@@ -0,0 +1,40 @@
|
||||
prompts:
|
||||
- id: p1
|
||||
prompt: "What is AtoCore's architecture?"
|
||||
project: atocore
|
||||
expected: "Should mention layered architecture, SQLite, vector DB"
|
||||
|
||||
- id: p2
|
||||
prompt: "What chunking strategy does AtoCore use?"
|
||||
project: atocore
|
||||
expected: "Should mention heading-aware splitting, 800 char max"
|
||||
|
||||
- id: p3
|
||||
prompt: "What is the trust precedence order?"
|
||||
project: atocore
|
||||
expected: "Should list: Trusted Project State > AtoDrive > validated memory"
|
||||
|
||||
- id: p4
|
||||
prompt: "How does AtoCore handle conflicts between sources?"
|
||||
project: atocore
|
||||
expected: "Should mention conflict resolution rules, no silent merging"
|
||||
|
||||
- id: p5
|
||||
prompt: "What are the different memory types?"
|
||||
project: atocore
|
||||
expected: "Should list: Identity, Preferences, Project, Episodic, Knowledge, Adaptation, Trusted Project State"
|
||||
|
||||
- id: p6
|
||||
prompt: "What is the context budget allocation?"
|
||||
project: atocore
|
||||
expected: "Should mention percentages: identity 5%, preferences 5%, project 20%, episodic 10%, retrieval 60%"
|
||||
|
||||
- id: p7
|
||||
prompt: "What is a trivial prompt in AtoCore?"
|
||||
project: atocore
|
||||
expected: "Should mention: no project ref, no proper nouns, no past context dependency"
|
||||
|
||||
- id: p8
|
||||
prompt: "What are the success criteria for the first win?"
|
||||
project: atocore
|
||||
expected: "Should mention: saves >=5 min lookup, >=80-90% accuracy, >=10 test prompts"
|
||||
41
tests/test_retrieval.py
Normal file
41
tests/test_retrieval.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""Tests for the retrieval system."""
|
||||
|
||||
from atocore.ingestion.pipeline import ingest_file
|
||||
from atocore.models.database import init_db
|
||||
from atocore.retrieval.retriever import retrieve
|
||||
from atocore.retrieval.vector_store import get_vector_store
|
||||
|
||||
|
||||
def test_retrieve_returns_results(tmp_data_dir, sample_markdown):
    """A query over ingested content returns scored, non-empty chunks."""
    init_db()
    ingest_file(sample_markdown)

    results = retrieve("What are the memory types?", top_k=5)

    assert results
    for hit in results:
        assert hit.score > 0
        assert hit.content
|
||||
|
||||
|
||||
def test_retrieve_scores_ranked(tmp_data_dir, sample_markdown):
    """Results come back ordered by descending similarity score."""
    init_db()
    ingest_file(sample_markdown)

    results = retrieve("architecture layers", top_k=5)

    # The sample document yields several chunks, so a top-5 query must
    # return at least two results; the old `if len(results) >= 2` guard
    # let the ordering check pass vacuously on a sparse result set.
    assert len(results) >= 2
    scores = [hit.score for hit in results]
    assert scores == sorted(scores, reverse=True)
|
||||
|
||||
|
||||
def test_vector_store_count(tmp_data_dir, sample_markdown):
    """Ingestion populates the vector store's chunk count."""
    init_db()

    # Drop the cached store so this test observes a fresh instance.
    import atocore.retrieval.vector_store as vs
    vs._store = None

    ingest_file(sample_markdown)
    assert get_vector_store().count > 0
|
||||
Reference in New Issue
Block a user