feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
8
.env.example
Normal file
8
.env.example
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
ATOCORE_DEBUG=false
|
||||||
|
ATOCORE_DATA_DIR=./data
|
||||||
|
ATOCORE_HOST=127.0.0.1
|
||||||
|
ATOCORE_PORT=8100
|
||||||
|
ATOCORE_EMBEDDING_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
|
||||||
|
ATOCORE_CHUNK_MAX_SIZE=800
|
||||||
|
ATOCORE_CHUNK_OVERLAP=100
|
||||||
|
ATOCORE_CONTEXT_BUDGET=3000
|
||||||
12
.gitignore
vendored
Normal file
12
.gitignore
vendored
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
data/
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
.env
|
||||||
|
*.egg-info/
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
.pytest_cache/
|
||||||
|
htmlcov/
|
||||||
|
.coverage
|
||||||
|
venv/
|
||||||
|
.venv/
|
||||||
36
pyproject.toml
Normal file
36
pyproject.toml
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=68.0", "wheel"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "atocore"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Personal context engine for LLM interactions"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
dependencies = [
|
||||||
|
"fastapi>=0.110.0",
|
||||||
|
"uvicorn[standard]>=0.27.0",
|
||||||
|
"python-frontmatter>=1.1.0",
|
||||||
|
"chromadb>=0.4.22",
|
||||||
|
"sentence-transformers>=2.5.0",
|
||||||
|
"pydantic>=2.6.0",
|
||||||
|
"pydantic-settings>=2.1.0",
|
||||||
|
"structlog>=24.1.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"pytest>=8.0.0",
|
||||||
|
"pytest-cov>=4.1.0",
|
||||||
|
"httpx>=0.27.0",
|
||||||
|
"pyyaml>=6.0.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
where = ["src"]
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
testpaths = ["tests"]
|
||||||
|
python_files = ["test_*.py"]
|
||||||
|
python_functions = ["test_*"]
|
||||||
|
addopts = "--cov=atocore --cov-report=term-missing -v"
|
||||||
5
requirements-dev.txt
Normal file
5
requirements-dev.txt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
-r requirements.txt
|
||||||
|
pytest>=8.0.0
|
||||||
|
pytest-cov>=4.1.0
|
||||||
|
httpx>=0.27.0
|
||||||
|
pyyaml>=6.0.0
|
||||||
8
requirements.txt
Normal file
8
requirements.txt
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
fastapi>=0.110.0
|
||||||
|
uvicorn[standard]>=0.27.0
|
||||||
|
python-frontmatter>=1.1.0
|
||||||
|
chromadb>=0.4.22
|
||||||
|
sentence-transformers>=2.5.0
|
||||||
|
pydantic>=2.6.0
|
||||||
|
pydantic-settings>=2.1.0
|
||||||
|
structlog>=24.1.0
|
||||||
54
scripts/ingest_folder.py
Normal file
54
scripts/ingest_folder.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
"""CLI script to ingest a folder of markdown files."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add src to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||||
|
|
||||||
|
from atocore.ingestion.pipeline import ingest_folder
|
||||||
|
from atocore.models.database import init_db
|
||||||
|
from atocore.observability.logger import setup_logging
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Ingest the folder given by --path, then print a summary of results.

    Exits with status 1 when the path is not a directory. Per-file errors
    are reported at the end without aborting the run.
    """
    parser = argparse.ArgumentParser(description="Ingest markdown files into AtoCore")
    parser.add_argument("--path", required=True, help="Path to folder with markdown files")
    args = parser.parse_args()

    setup_logging()
    init_db()

    folder = Path(args.path)
    if not folder.is_dir():
        print(f"Error: {folder} is not a directory")
        sys.exit(1)

    results = ingest_folder(folder)

    # Summary counts by per-file status reported by the pipeline.
    ingested = sum(1 for r in results if r["status"] == "ingested")
    skipped = sum(1 for r in results if r["status"] == "skipped")
    errors = sum(1 for r in results if r["status"] == "error")
    total_chunks = sum(r.get("chunks", 0) for r in results)

    print(f"\n{'=' * 50}")
    print("Ingestion complete:")  # was f-string with no placeholders (F541)
    print(f"  Files processed: {len(results)}")
    print(f"  Ingested: {ingested}")
    print(f"  Skipped (unchanged): {skipped}")
    print(f"  Errors: {errors}")
    print(f"  Total chunks created: {total_chunks}")
    print(f"{'=' * 50}")

    if errors:
        print("\nErrors:")
        for r in results:
            if r["status"] == "error":
                print(f"  {r['file']}: {r['error']}")


if __name__ == "__main__":
    main()
|
||||||
76
scripts/query_test.py
Normal file
76
scripts/query_test.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
"""CLI script to run test prompts and compare baseline vs enriched."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Add src to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||||
|
|
||||||
|
from atocore.context.builder import build_context
|
||||||
|
from atocore.models.database import init_db
|
||||||
|
from atocore.observability.logger import setup_logging
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run each test prompt through the context builder and print diagnostics.

    Loads prompts from a YAML file (default: tests/test_prompts/prompts.yaml)
    and, for each one, prints retrieval stats and a preview of the top chunks
    so retrieval quality can be assessed manually.
    """
    parser = argparse.ArgumentParser(description="Run test prompts against AtoCore")
    parser.add_argument(
        "--prompts",
        default=str(Path(__file__).parent.parent / "tests" / "test_prompts" / "prompts.yaml"),
        help="Path to prompts YAML file",
    )
    args = parser.parse_args()

    setup_logging()
    init_db()

    prompts_path = Path(args.prompts)
    if not prompts_path.exists():
        print(f"Error: {prompts_path} not found")
        sys.exit(1)

    with open(prompts_path) as f:
        data = yaml.safe_load(f)

    prompts = data.get("prompts", [])
    print(f"Running {len(prompts)} test prompts...\n")

    for p in prompts:
        prompt_id = p["id"]
        prompt_text = p["prompt"]
        project = p.get("project")
        expected = p.get("expected", "")

        print(f"{'=' * 60}")
        print(f"[{prompt_id}] {prompt_text}")
        print(f"Project: {project or 'none'}")
        print(f"Expected: {expected}")
        print("-" * 60)  # was f"-" * 60: useless f-string (F541)

        pack = build_context(
            user_prompt=prompt_text,
            project_hint=project,
        )

        print(f"Chunks retrieved: {len(pack.chunks_used)}")
        print(f"Total chars: {pack.total_chars} / {pack.budget}")
        print(f"Duration: {pack.duration_ms}ms")
        print()

        # Show the top few retrieved chunks for manual quality review.
        for i, chunk in enumerate(pack.chunks_used[:5]):
            print(f"  [{i+1}] Score: {chunk.score:.2f} | {chunk.source_file}")
            print(f"    Section: {chunk.heading_path}")
            print(f"    Preview: {chunk.content[:120]}...")
            print()

        print(f"Full prompt length: {len(pack.full_prompt)} chars")
        print()

    print(f"{'=' * 60}")
    print("Done. Review output above to assess retrieval quality.")


if __name__ == "__main__":
    main()
|
||||||
3
src/atocore/__init__.py
Normal file
3
src/atocore/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
"""AtoCore — Personal Context Engine."""
|
||||||
|
|
||||||
|
__version__ = "0.1.0"
|
||||||
0
src/atocore/api/__init__.py
Normal file
0
src/atocore/api/__init__.py
Normal file
132
src/atocore/api/routes.py
Normal file
132
src/atocore/api/routes.py
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
"""FastAPI route definitions."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from atocore.context.builder import (
|
||||||
|
ContextPack,
|
||||||
|
build_context,
|
||||||
|
get_last_context_pack,
|
||||||
|
_pack_to_dict,
|
||||||
|
)
|
||||||
|
from atocore.ingestion.pipeline import ingest_file, ingest_folder
|
||||||
|
from atocore.retrieval.retriever import retrieve
|
||||||
|
from atocore.retrieval.vector_store import get_vector_store
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
# --- Request/Response models ---
|
||||||
|
|
||||||
|
|
||||||
|
class IngestRequest(BaseModel):
    """Body for POST /ingest."""

    path: str  # file or folder path


class IngestResponse(BaseModel):
    """Per-file ingestion results, as returned by the ingestion pipeline."""

    results: list[dict]


class QueryRequest(BaseModel):
    """Body for POST /query."""

    prompt: str
    top_k: int = 10
    filter_tags: list[str] | None = None


class QueryResponse(BaseModel):
    """Chunks returned by retrieval for the query prompt."""

    results: list[dict]


class ContextBuildRequest(BaseModel):
    """Body for POST /context/build."""

    prompt: str
    project: str | None = None  # optional project hint used to boost ranking
    budget: int | None = None  # character-budget override for the context pack


class ContextBuildResponse(BaseModel):
    """Summary of the assembled context pack plus the full prompt to send."""

    formatted_context: str
    full_prompt: str
    chunks_used: int
    total_chars: int
    budget: int
    budget_remaining: int
    duration_ms: int
    chunks: list[dict]
|
||||||
|
|
||||||
|
|
||||||
|
# --- Endpoints ---
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/ingest", response_model=IngestResponse)
|
||||||
|
def api_ingest(req: IngestRequest):
|
||||||
|
"""Ingest a markdown file or folder."""
|
||||||
|
target = Path(req.path)
|
||||||
|
if target.is_file():
|
||||||
|
results = [ingest_file(target)]
|
||||||
|
elif target.is_dir():
|
||||||
|
results = ingest_folder(target)
|
||||||
|
else:
|
||||||
|
raise HTTPException(status_code=404, detail=f"Path not found: {req.path}")
|
||||||
|
return IngestResponse(results=results)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/query", response_model=QueryResponse)
|
||||||
|
def api_query(req: QueryRequest):
|
||||||
|
"""Retrieve relevant chunks for a prompt."""
|
||||||
|
chunks = retrieve(req.prompt, top_k=req.top_k, filter_tags=req.filter_tags)
|
||||||
|
return QueryResponse(
|
||||||
|
results=[
|
||||||
|
{
|
||||||
|
"chunk_id": c.chunk_id,
|
||||||
|
"content": c.content,
|
||||||
|
"score": c.score,
|
||||||
|
"heading_path": c.heading_path,
|
||||||
|
"source_file": c.source_file,
|
||||||
|
"title": c.title,
|
||||||
|
}
|
||||||
|
for c in chunks
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/context/build", response_model=ContextBuildResponse)
|
||||||
|
def api_build_context(req: ContextBuildRequest):
|
||||||
|
"""Build a full context pack for a prompt."""
|
||||||
|
pack = build_context(
|
||||||
|
user_prompt=req.prompt,
|
||||||
|
project_hint=req.project,
|
||||||
|
budget=req.budget,
|
||||||
|
)
|
||||||
|
pack_dict = _pack_to_dict(pack)
|
||||||
|
return ContextBuildResponse(
|
||||||
|
formatted_context=pack.formatted_context,
|
||||||
|
full_prompt=pack.full_prompt,
|
||||||
|
chunks_used=len(pack.chunks_used),
|
||||||
|
total_chars=pack.total_chars,
|
||||||
|
budget=pack.budget,
|
||||||
|
budget_remaining=pack.budget_remaining,
|
||||||
|
duration_ms=pack.duration_ms,
|
||||||
|
chunks=pack_dict["chunks"],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/health")
|
||||||
|
def api_health():
|
||||||
|
"""Health check."""
|
||||||
|
store = get_vector_store()
|
||||||
|
return {
|
||||||
|
"status": "ok",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"vectors_count": store.count,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/debug/context")
|
||||||
|
def api_debug_context():
|
||||||
|
"""Inspect the last assembled context pack."""
|
||||||
|
pack = get_last_context_pack()
|
||||||
|
if pack is None:
|
||||||
|
return {"message": "No context pack built yet."}
|
||||||
|
return _pack_to_dict(pack)
|
||||||
39
src/atocore/config.py
Normal file
39
src/atocore/config.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
"""AtoCore configuration via environment variables."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pydantic_settings import BaseSettings
|
||||||
|
|
||||||
|
|
||||||
|
class Settings(BaseSettings):
    """AtoCore runtime settings.

    Every field can be overridden via an ``ATOCORE_``-prefixed environment
    variable (see ``model_config``), e.g. ``ATOCORE_PORT=9000``.
    """

    # Server / storage
    debug: bool = False
    data_dir: Path = Path("./data")  # root for the SQLite DB and Chroma store
    host: str = "127.0.0.1"
    port: int = 8100

    # Embedding
    embedding_model: str = (
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )

    # Chunking
    chunk_max_size: int = 800  # max chunk length in characters
    chunk_overlap: int = 100  # overlap used when hard-splitting oversized text
    chunk_min_size: int = 50  # chunks shorter than this are dropped

    # Context
    context_budget: int = 3000  # character budget for an assembled context pack
    context_top_k: int = 15  # candidate chunks fetched per retrieval

    # pydantic-settings: read environment variables with this prefix.
    model_config = {"env_prefix": "ATOCORE_"}

    @property
    def db_path(self) -> Path:
        """Location of the SQLite database file."""
        return self.data_dir / "atocore.db"

    @property
    def chroma_path(self) -> Path:
        """Location of the ChromaDB persistence directory."""
        return self.data_dir / "chroma"


# Module-level singleton imported throughout the codebase.
settings = Settings()
|
||||||
0
src/atocore/context/__init__.py
Normal file
0
src/atocore/context/__init__.py
Normal file
212
src/atocore/context/builder.py
Normal file
212
src/atocore/context/builder.py
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
"""Context pack assembly: retrieve, rank, budget, format."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
from atocore.observability.logger import get_logger
|
||||||
|
from atocore.retrieval.retriever import ChunkResult, retrieve
|
||||||
|
|
||||||
|
log = get_logger("context_builder")
|
||||||
|
|
||||||
|
SYSTEM_PREFIX = (
|
||||||
|
"You have access to the following personal context from the user's knowledge base.\n"
|
||||||
|
"Use it to inform your answer. If the context is not relevant, ignore it.\n"
|
||||||
|
"Do not mention the context system unless asked."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Last built context pack for debug inspection
|
||||||
|
_last_context_pack: "ContextPack | None" = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ContextChunk:
    """A retrieved chunk selected for inclusion in a context pack."""

    content: str  # chunk text, included verbatim in the formatted context
    source_file: str  # shortened display path of the originating file
    heading_path: str  # heading trail within the source document
    score: float  # final ranking score (similarity plus any project boost)
    char_count: int  # len(content); used for budget accounting
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ContextPack:
    """The assembled result of one build_context() call."""

    chunks_used: list[ContextChunk] = field(default_factory=list)
    total_chars: int = 0  # sum of char_count over chunks_used
    budget: int = 0  # character budget that was applied
    budget_remaining: int = 0  # budget - total_chars
    formatted_context: str = ""  # the "--- AtoCore Context ---" block
    full_prompt: str = ""  # system prefix + formatted context + user prompt
    query: str = ""  # user prompt used for retrieval
    project_hint: str = ""  # project hint, or "" when none was given
    duration_ms: int = 0  # wall-clock build time in milliseconds
|
||||||
|
|
||||||
|
|
||||||
|
def build_context(
    user_prompt: str,
    project_hint: str | None = None,
    budget: int | None = None,
) -> ContextPack:
    """Assemble a context pack: retrieve, rank, budget-select, format.

    The resulting pack is also stashed module-wide for /debug/context.
    """
    global _last_context_pack

    started_at = time.time()
    char_budget = budget or settings.context_budget

    # Retrieve candidates, rank them (with project boosting), then keep
    # as many top chunks as fit within the character budget.
    candidates = retrieve(user_prompt, top_k=settings.context_top_k)
    ranked = _rank_chunks(candidates, project_hint)
    chosen = _select_within_budget(ranked, char_budget)

    # Render the context block and splice it into the final prompt.
    context_block = _format_context_block(chosen)
    assembled_prompt = f"{SYSTEM_PREFIX}\n\n{context_block}\n\n{user_prompt}"

    chars_used = sum(c.char_count for c in chosen)
    elapsed_ms = int((time.time() - started_at) * 1000)

    pack = ContextPack(
        chunks_used=chosen,
        total_chars=chars_used,
        budget=char_budget,
        budget_remaining=char_budget - chars_used,
        formatted_context=context_block,
        full_prompt=assembled_prompt,
        query=user_prompt,
        project_hint=project_hint or "",
        duration_ms=elapsed_ms,
    )
    _last_context_pack = pack

    log.info(
        "context_built",
        chunks_used=len(chosen),
        total_chars=chars_used,
        budget_remaining=char_budget - chars_used,
        duration_ms=elapsed_ms,
    )
    log.debug("context_pack_detail", pack=_pack_to_dict(pack))

    return pack
|
||||||
|
|
||||||
|
|
||||||
|
def get_last_context_pack() -> ContextPack | None:
    """Return the last built context pack for debug inspection.

    Read-only view of the module-level ``_last_context_pack`` set by
    ``build_context``; ``None`` until the first pack has been built.
    """
    return _last_context_pack
|
||||||
|
|
||||||
|
|
||||||
|
def _rank_chunks(
    candidates: list[ChunkResult],
    project_hint: str | None,
) -> list[tuple[float, ChunkResult]]:
    """Rank candidates with boosting for project match.

    Near-duplicates (same first 200 chars) are dropped, keeping the first
    occurrence. Returns (score, chunk) pairs, highest score first.
    """
    ranked: list[tuple[float, ChunkResult]] = []
    seen_prefixes: set[str] = set()

    for candidate in candidates:
        # Deduplicate by content prefix (first 200 chars).
        prefix = candidate.content[:200]
        if prefix in seen_prefixes:
            continue
        seen_prefixes.add(prefix)

        score = candidate.score  # similarity baseline
        if project_hint:
            needle = project_hint.lower()
            haystacks = (
                candidate.tags.lower() if candidate.tags else "",
                candidate.source_file.lower(),
                candidate.title.lower() if candidate.title else "",
            )
            if any(needle in hay for hay in haystacks):
                score += 0.3  # flat boost for an apparent project match

        ranked.append((score, candidate))

    # Highest score first; stable sort keeps retrieval order among ties.
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return ranked
|
||||||
|
|
||||||
|
|
||||||
|
def _select_within_budget(
    scored: list[tuple[float, ChunkResult]],
    budget: int,
) -> list[ContextChunk]:
    """Select top chunks that fit within the character budget."""
    picked: list[ContextChunk] = []
    consumed = 0

    for rank_score, candidate in scored:
        size = len(candidate.content)
        if consumed + size > budget:
            # Skip rather than stop: a smaller chunk later may still fit.
            continue
        consumed += size
        picked.append(
            ContextChunk(
                content=candidate.content,
                source_file=_shorten_path(candidate.source_file),
                heading_path=candidate.heading_path,
                score=rank_score,
                char_count=size,
            )
        )

    return picked
|
||||||
|
|
||||||
|
|
||||||
|
def _format_context_block(chunks: list[ContextChunk]) -> str:
    """Format chunks into the context block string."""
    header = "--- AtoCore Context ---"
    footer = "--- End Context ---"

    if not chunks:
        return f"{header}\nNo relevant context found.\n{footer}"

    parts = [header]
    for chunk in chunks:
        parts.extend(
            (
                f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]",
                chunk.content,
                "",  # blank separator line after each chunk
            )
        )
    parts.append(footer)
    return "\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _shorten_path(path: str) -> str:
    """Shorten an absolute path to a relative-like display.

    Keeps at most the last three path components.
    """
    segments = Path(path).parts
    if len(segments) <= 3:
        return str(Path(path))
    return str(Path(*segments[-3:]))
|
||||||
|
|
||||||
|
|
||||||
|
def _pack_to_dict(pack: ContextPack) -> dict:
    """Convert a context pack to a JSON-serializable dict."""
    chunk_rows = []
    for chunk in pack.chunks_used:
        chunk_rows.append(
            {
                "source_file": chunk.source_file,
                "heading_path": chunk.heading_path,
                "score": chunk.score,
                "char_count": chunk.char_count,
                "content_preview": chunk.content[:100],  # keep debug output small
            }
        )

    return {
        "query": pack.query,
        "project_hint": pack.project_hint,
        "chunks_used": len(pack.chunks_used),
        "total_chars": pack.total_chars,
        "budget": pack.budget,
        "budget_remaining": pack.budget_remaining,
        "duration_ms": pack.duration_ms,
        "chunks": chunk_rows,
    }
|
||||||
0
src/atocore/ingestion/__init__.py
Normal file
0
src/atocore/ingestion/__init__.py
Normal file
146
src/atocore/ingestion/chunker.py
Normal file
146
src/atocore/ingestion/chunker.py
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
"""Heading-aware recursive markdown chunking."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Chunk:
|
||||||
|
content: str
|
||||||
|
chunk_index: int
|
||||||
|
heading_path: str
|
||||||
|
char_count: int
|
||||||
|
metadata: dict = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_markdown(
|
||||||
|
body: str,
|
||||||
|
base_metadata: dict | None = None,
|
||||||
|
max_size: int | None = None,
|
||||||
|
overlap: int | None = None,
|
||||||
|
min_size: int | None = None,
|
||||||
|
) -> list[Chunk]:
|
||||||
|
"""Split markdown body into chunks using heading-aware strategy.
|
||||||
|
|
||||||
|
1. Split on H2 boundaries
|
||||||
|
2. If section > max_size, split on H3
|
||||||
|
3. If still > max_size, split on paragraph breaks
|
||||||
|
4. If still > max_size, hard split with overlap
|
||||||
|
"""
|
||||||
|
max_size = max_size or settings.chunk_max_size
|
||||||
|
overlap = overlap or settings.chunk_overlap
|
||||||
|
min_size = min_size or settings.chunk_min_size
|
||||||
|
base_metadata = base_metadata or {}
|
||||||
|
|
||||||
|
sections = _split_by_heading(body, level=2)
|
||||||
|
raw_chunks: list[tuple[str, str]] = [] # (heading_path, content)
|
||||||
|
|
||||||
|
for heading, content in sections:
|
||||||
|
if len(content) <= max_size:
|
||||||
|
raw_chunks.append((heading, content))
|
||||||
|
else:
|
||||||
|
# Try splitting on H3
|
||||||
|
subsections = _split_by_heading(content, level=3)
|
||||||
|
for sub_heading, sub_content in subsections:
|
||||||
|
full_path = (
|
||||||
|
f"{heading} > {sub_heading}" if heading and sub_heading else heading or sub_heading
|
||||||
|
)
|
||||||
|
if len(sub_content) <= max_size:
|
||||||
|
raw_chunks.append((full_path, sub_content))
|
||||||
|
else:
|
||||||
|
# Split on paragraphs
|
||||||
|
para_chunks = _split_by_paragraphs(
|
||||||
|
sub_content, max_size, overlap
|
||||||
|
)
|
||||||
|
for pc in para_chunks:
|
||||||
|
raw_chunks.append((full_path, pc))
|
||||||
|
|
||||||
|
# Build final chunks, filtering out too-small ones
|
||||||
|
chunks = []
|
||||||
|
idx = 0
|
||||||
|
for heading_path, content in raw_chunks:
|
||||||
|
content = content.strip()
|
||||||
|
if len(content) < min_size:
|
||||||
|
continue
|
||||||
|
chunks.append(
|
||||||
|
Chunk(
|
||||||
|
content=content,
|
||||||
|
chunk_index=idx,
|
||||||
|
heading_path=heading_path,
|
||||||
|
char_count=len(content),
|
||||||
|
metadata={**base_metadata},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
idx += 1
|
||||||
|
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def _split_by_heading(text: str, level: int) -> list[tuple[str, str]]:
|
||||||
|
"""Split text by heading level. Returns (heading_text, section_content) pairs."""
|
||||||
|
pattern = rf"^({'#' * level})\s+(.+)$"
|
||||||
|
parts: list[tuple[str, str]] = []
|
||||||
|
current_heading = ""
|
||||||
|
current_lines: list[str] = []
|
||||||
|
|
||||||
|
for line in text.split("\n"):
|
||||||
|
match = re.match(pattern, line)
|
||||||
|
if match:
|
||||||
|
# Save previous section
|
||||||
|
if current_lines:
|
||||||
|
parts.append((current_heading, "\n".join(current_lines)))
|
||||||
|
current_heading = match.group(2).strip()
|
||||||
|
current_lines = []
|
||||||
|
else:
|
||||||
|
current_lines.append(line)
|
||||||
|
|
||||||
|
# Save last section
|
||||||
|
if current_lines:
|
||||||
|
parts.append((current_heading, "\n".join(current_lines)))
|
||||||
|
|
||||||
|
return parts
|
||||||
|
|
||||||
|
|
||||||
|
def _split_by_paragraphs(
|
||||||
|
text: str, max_size: int, overlap: int
|
||||||
|
) -> list[str]:
|
||||||
|
"""Split text by paragraph breaks, then hard-split if needed."""
|
||||||
|
paragraphs = re.split(r"\n\n+", text)
|
||||||
|
chunks: list[str] = []
|
||||||
|
current = ""
|
||||||
|
|
||||||
|
for para in paragraphs:
|
||||||
|
para = para.strip()
|
||||||
|
if not para:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if len(current) + len(para) + 2 <= max_size:
|
||||||
|
current = f"{current}\n\n{para}" if current else para
|
||||||
|
else:
|
||||||
|
if current:
|
||||||
|
chunks.append(current)
|
||||||
|
# If single paragraph exceeds max, hard split
|
||||||
|
if len(para) > max_size:
|
||||||
|
chunks.extend(_hard_split(para, max_size, overlap))
|
||||||
|
else:
|
||||||
|
current = para
|
||||||
|
continue
|
||||||
|
current = ""
|
||||||
|
|
||||||
|
if current:
|
||||||
|
chunks.append(current)
|
||||||
|
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def _hard_split(text: str, max_size: int, overlap: int) -> list[str]:
|
||||||
|
"""Hard split text at max_size with overlap."""
|
||||||
|
chunks = []
|
||||||
|
start = 0
|
||||||
|
while start < len(text):
|
||||||
|
end = start + max_size
|
||||||
|
chunks.append(text[start:end])
|
||||||
|
start = end - overlap
|
||||||
|
return chunks
|
||||||
65
src/atocore/ingestion/parser.py
Normal file
65
src/atocore/ingestion/parser.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
"""Markdown file parsing with frontmatter extraction."""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import frontmatter
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ParsedDocument:
    """Result of parsing one markdown file."""

    file_path: str  # absolute, resolved path to the source file
    title: str  # first H1, or a title-cased form of the filename
    body: str  # markdown content with frontmatter stripped
    tags: list[str] = field(default_factory=list)  # from frontmatter "tags"
    frontmatter: dict = field(default_factory=dict)  # full frontmatter mapping
    headings: list[tuple[int, str]] = field(default_factory=list)  # (level, text)
||||||
|
|
||||||
|
|
||||||
|
def parse_markdown(file_path: Path) -> ParsedDocument:
    """Parse a markdown file, extracting frontmatter and structure."""
    raw = file_path.read_text(encoding="utf-8")
    post = frontmatter.loads(raw)

    metadata = dict(post.metadata) if post.metadata else {}
    content = post.content.strip()

    # Tags may arrive as a YAML list or as a comma-separated string.
    raw_tags = metadata.get("tags", [])
    if isinstance(raw_tags, str):
        raw_tags = [t.strip() for t in raw_tags.split(",") if t.strip()]
    doc_tags = raw_tags or []

    return ParsedDocument(
        file_path=str(file_path.resolve()),
        title=_extract_title(content, file_path),
        body=content,
        tags=doc_tags,
        frontmatter=metadata,
        headings=_extract_headings(content),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_title(body: str, file_path: Path) -> str:
    """Get title from first H1 or fallback to filename."""
    h1 = re.search(r"^#\s+(.+)$", body, re.MULTILINE)
    if h1 is not None:
        return h1.group(1).strip()
    # No H1: derive a readable title from the file stem.
    cleaned = file_path.stem.replace("_", " ").replace("-", " ")
    return cleaned.title()
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_headings(body: str) -> list[tuple[int, str]]:
    """Extract headings of levels 1–4 as (level, text) pairs, in order.

    Note: the regex deliberately stops at ``#{1,4}``, so H5/H6 headings are
    not captured — the previous docstring claimed "all headings", which was
    inaccurate.
    """
    return [
        (len(m.group(1)), m.group(2).strip())
        for m in re.finditer(r"^(#{1,4})\s+(.+)$", body, re.MULTILINE)
    ]
|
||||||
157
src/atocore/ingestion/pipeline.py
Normal file
157
src/atocore/ingestion/pipeline.py
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
"""Ingestion pipeline: parse → chunk → embed → store."""
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
from atocore.ingestion.chunker import chunk_markdown
|
||||||
|
from atocore.ingestion.parser import parse_markdown
|
||||||
|
from atocore.models.database import get_connection
|
||||||
|
from atocore.observability.logger import get_logger
|
||||||
|
from atocore.retrieval.vector_store import get_vector_store
|
||||||
|
|
||||||
|
log = get_logger("ingestion")
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_file(file_path: Path) -> dict:
    """Ingest a single markdown file: parse → chunk → store in SQLite → embed.

    Returns a stats dict with a ``status`` of ``"skipped"`` (unchanged hash),
    ``"empty"`` (no chunks produced), or ``"ingested"``.

    Raises:
        FileNotFoundError: if *file_path* does not exist.
        ValueError: if the file does not have a ``.md``/``.markdown`` suffix.
    """
    start = time.time()
    file_path = file_path.resolve()

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    if file_path.suffix.lower() not in (".md", ".markdown"):
        raise ValueError(f"Not a markdown file: {file_path}")

    # Read and hash: the SHA-256 of the raw text is used for change detection.
    raw_content = file_path.read_text(encoding="utf-8")
    file_hash = hashlib.sha256(raw_content.encode()).hexdigest()

    # Check if already ingested and unchanged.
    with get_connection() as conn:
        existing = conn.execute(
            "SELECT id, file_hash FROM source_documents WHERE file_path = ?",
            (str(file_path),),
        ).fetchone()

    if existing and existing["file_hash"] == file_hash:
        log.info("file_skipped_unchanged", file_path=str(file_path))
        return {"file": str(file_path), "status": "skipped", "reason": "unchanged"}

    # Parse frontmatter/body, then chunk the body with source metadata attached.
    parsed = parse_markdown(file_path)

    base_meta = {
        "source_file": str(file_path),
        "tags": parsed.tags,
        "title": parsed.title,
    }
    chunks = chunk_markdown(parsed.body, base_metadata=base_meta)

    if not chunks:
        log.warning("no_chunks_created", file_path=str(file_path))
        return {"file": str(file_path), "status": "empty", "chunks": 0}

    # Store in DB and vector store.
    doc_id = str(uuid.uuid4())
    vector_store = get_vector_store()

    with get_connection() as conn:
        # Remove old data if re-ingesting: keep the existing document id so
        # external references stay valid, replace its chunks wholesale.
        if existing:
            doc_id = existing["id"]
            old_chunk_ids = [
                row["id"]
                for row in conn.execute(
                    "SELECT id FROM source_chunks WHERE document_id = ?",
                    (doc_id,),
                ).fetchall()
            ]
            conn.execute(
                "DELETE FROM source_chunks WHERE document_id = ?", (doc_id,)
            )
            conn.execute(
                "UPDATE source_documents SET file_hash = ?, title = ?, tags = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                (file_hash, parsed.title, json.dumps(parsed.tags), doc_id),
            )
            # Remove old vectors so the vector store mirrors the DB state.
            if old_chunk_ids:
                vector_store.delete(old_chunk_ids)
        else:
            conn.execute(
                "INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags) VALUES (?, ?, ?, ?, ?, ?)",
                (doc_id, str(file_path), file_hash, parsed.title, "markdown", json.dumps(parsed.tags)),
            )

        # Insert chunks, accumulating the parallel lists the vector store needs.
        chunk_ids = []
        chunk_contents = []
        chunk_metadatas = []

        for chunk in chunks:
            chunk_id = str(uuid.uuid4())
            chunk_ids.append(chunk_id)
            chunk_contents.append(chunk.content)
            chunk_metadatas.append({
                "document_id": doc_id,
                "heading_path": chunk.heading_path,
                "source_file": str(file_path),
                # Tags are serialised to a JSON string — vector-store metadata
                # values are presumably scalar-only; verify against the store.
                "tags": json.dumps(parsed.tags),
                "title": parsed.title,
            })

            conn.execute(
                "INSERT INTO source_chunks (id, document_id, chunk_index, content, heading_path, char_count, metadata) VALUES (?, ?, ?, ?, ?, ?, ?)",
                (
                    chunk_id,
                    doc_id,
                    chunk.chunk_index,
                    chunk.content,
                    chunk.heading_path,
                    chunk.char_count,
                    json.dumps(chunk.metadata),
                ),
            )

    # Store embeddings.
    # NOTE(review): this runs after the DB transaction commits — if embedding
    # fails, the DB rows exist without vectors. Confirm this ordering matches
    # the intended failure semantics.
    vector_store.add(chunk_ids, chunk_contents, chunk_metadatas)

    duration_ms = int((time.time() - start) * 1000)
    log.info(
        "file_ingested",
        file_path=str(file_path),
        chunks_created=len(chunks),
        duration_ms=duration_ms,
    )

    return {
        "file": str(file_path),
        "status": "ingested",
        "chunks": len(chunks),
        "duration_ms": duration_ms,
    }
|
|
||||||
|
def ingest_folder(folder_path: Path) -> list[dict]:
    """Recursively ingest every ``*.md`` file under *folder_path*.

    Each file yields one stats dict (see ingest_file); per-file failures are
    logged and recorded as ``{"status": "error"}`` entries instead of
    aborting the whole run.

    Raises:
        NotADirectoryError: if *folder_path* is not a directory.
    """
    folder_path = folder_path.resolve()
    if not folder_path.is_dir():
        raise NotADirectoryError(f"Not a directory: {folder_path}")

    results = []
    md_files = sorted(folder_path.rglob("*.md"))
    log.info("ingestion_started", folder=str(folder_path), file_count=len(md_files))

    for md_file in md_files:
        try:
            outcome = ingest_file(md_file)
        except Exception as e:
            # Best-effort batch: record the failure and keep going.
            log.error("ingestion_error", file_path=str(md_file), error=str(e))
            outcome = {"file": str(md_file), "status": "error", "error": str(e)}
        results.append(outcome)

    return results
33
src/atocore/main.py
Normal file
33
src/atocore/main.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
"""AtoCore — FastAPI application entry point."""
|
||||||
|
|
||||||
|
from fastapi import FastAPI
|
||||||
|
|
||||||
|
from atocore.api.routes import router
|
||||||
|
from atocore.config import settings
|
||||||
|
from atocore.models.database import init_db
|
||||||
|
from atocore.observability.logger import setup_logging
|
||||||
|
|
||||||
|
app = FastAPI(
|
||||||
|
title="AtoCore",
|
||||||
|
description="Personal Context Engine for LLM interactions",
|
||||||
|
version="0.1.0",
|
||||||
|
)
|
||||||
|
|
||||||
|
app.include_router(router)
|
||||||
|
|
||||||
|
|
||||||
|
@app.on_event("startup")
|
||||||
|
def startup():
|
||||||
|
setup_logging()
|
||||||
|
init_db()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import uvicorn
|
||||||
|
|
||||||
|
uvicorn.run(
|
||||||
|
"atocore.main:app",
|
||||||
|
host=settings.host,
|
||||||
|
port=settings.port,
|
||||||
|
reload=True,
|
||||||
|
)
|
||||||
0
src/atocore/models/__init__.py
Normal file
0
src/atocore/models/__init__.py
Normal file
98
src/atocore/models/database.py
Normal file
98
src/atocore/models/database.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
"""SQLite database schema and connection management."""
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Generator
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
from atocore.observability.logger import get_logger
|
||||||
|
|
||||||
|
log = get_logger("database")
|
||||||
|
|
||||||
|
# Full database schema, applied idempotently by init_db() — every statement
# uses IF NOT EXISTS. Tables: source_documents (one row per ingested file),
# source_chunks (chunked content, FK → source_documents with ON DELETE
# CASCADE), memories, projects, interactions; plus lookup indexes.
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS source_documents (
    id TEXT PRIMARY KEY,
    file_path TEXT UNIQUE NOT NULL,
    file_hash TEXT NOT NULL,
    title TEXT,
    doc_type TEXT DEFAULT 'markdown',
    tags TEXT DEFAULT '[]',
    ingested_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS source_chunks (
    id TEXT PRIMARY KEY,
    document_id TEXT NOT NULL REFERENCES source_documents(id) ON DELETE CASCADE,
    chunk_index INTEGER NOT NULL,
    content TEXT NOT NULL,
    heading_path TEXT DEFAULT '',
    char_count INTEGER NOT NULL,
    metadata TEXT DEFAULT '{}',
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS memories (
    id TEXT PRIMARY KEY,
    memory_type TEXT NOT NULL,
    content TEXT NOT NULL,
    source_chunk_id TEXT REFERENCES source_chunks(id),
    confidence REAL DEFAULT 1.0,
    status TEXT DEFAULT 'active',
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS projects (
    id TEXT PRIMARY KEY,
    name TEXT UNIQUE NOT NULL,
    description TEXT DEFAULT '',
    status TEXT DEFAULT 'active',
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS interactions (
    id TEXT PRIMARY KEY,
    prompt TEXT NOT NULL,
    context_pack TEXT DEFAULT '{}',
    response_summary TEXT DEFAULT '',
    project_id TEXT REFERENCES projects(id),
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_chunks_document ON source_chunks(document_id);
CREATE INDEX IF NOT EXISTS idx_memories_type ON memories(memory_type);
CREATE INDEX IF NOT EXISTS idx_memories_status ON memories(status);
CREATE INDEX IF NOT EXISTS idx_interactions_project ON interactions(project_id);
"""
|
|
||||||
|
|
||||||
|
def _ensure_data_dir() -> None:
    """Create the configured data directory (and parents) if it does not exist."""
    settings.data_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
|
def init_db() -> None:
    """Create the data directory and apply the database schema.

    Safe to call repeatedly — the schema statements are all ``IF NOT EXISTS``.
    """
    _ensure_data_dir()
    with get_connection() as db:
        db.executescript(SCHEMA_SQL)
    log.info("database_initialized", path=str(settings.db_path))
|
|
||||||
|
|
||||||
|
@contextmanager
def get_connection() -> Generator[sqlite3.Connection, None, None]:
    """Yield a configured SQLite connection; commit on success, rollback on error.

    The connection uses ``sqlite3.Row`` for name-based column access and has
    foreign-key enforcement switched on. It is always closed on exit.
    """
    _ensure_data_dir()
    db = sqlite3.connect(str(settings.db_path))
    db.row_factory = sqlite3.Row
    db.execute("PRAGMA foreign_keys = ON")
    try:
        # Commit stays inside the try so a failing commit also rolls back.
        yield db
        db.commit()
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()
0
src/atocore/observability/__init__.py
Normal file
0
src/atocore/observability/__init__.py
Normal file
41
src/atocore/observability/logger.py
Normal file
41
src/atocore/observability/logger.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
"""Structured logging for AtoCore."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import structlog
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
|
||||||
|
# Map level names to stdlib logging constants; consumed by setup_logging's
# filtering bound logger.
_LOG_LEVELS = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR,
}
||||||
|
|
||||||
|
|
||||||
|
def setup_logging() -> None:
    """Configure structlog: human-readable console output in debug mode, JSON otherwise."""
    level_name = "DEBUG" if settings.debug else "INFO"
    # Pick the final renderer once instead of inlining the conditional in the
    # processor list.
    renderer = (
        structlog.dev.ConsoleRenderer()
        if settings.debug
        else structlog.processors.JSONRenderer()
    )
    structlog.configure(
        processors=[
            structlog.contextvars.merge_contextvars,
            structlog.processors.add_log_level,
            structlog.processors.TimeStamper(fmt="iso"),
            renderer,
        ],
        wrapper_class=structlog.make_filtering_bound_logger(
            _LOG_LEVELS.get(level_name, logging.INFO)
        ),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )
|
|
||||||
|
|
||||||
|
def get_logger(name: str) -> structlog.BoundLogger:
    """Return a structlog bound logger for the given component *name*."""
    return structlog.get_logger(name)
||||||
0
src/atocore/retrieval/__init__.py
Normal file
0
src/atocore/retrieval/__init__.py
Normal file
32
src/atocore/retrieval/embeddings.py
Normal file
32
src/atocore/retrieval/embeddings.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
"""Embedding model management."""
|
||||||
|
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
from atocore.observability.logger import get_logger
|
||||||
|
|
||||||
|
log = get_logger("embeddings")
|
||||||
|
|
||||||
|
_model: SentenceTransformer | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_model() -> SentenceTransformer:
    """Load and cache the embedding model.

    The SentenceTransformer is loaded lazily on first call and cached in the
    module-level ``_model`` global, so subsequent calls are cheap.
    """
    global _model
    if _model is None:
        # First-use load can be slow (model download/deserialisation), so log
        # around it.
        log.info("loading_embedding_model", model=settings.embedding_model)
        _model = SentenceTransformer(settings.embedding_model)
        log.info("embedding_model_loaded", model=settings.embedding_model)
    return _model
||||||
|
|
||||||
|
|
||||||
|
def embed_texts(texts: list[str]) -> list[list[float]]:
    """Embed *texts* as unit-normalised vectors, returned as lists of floats."""
    vectors = get_model().encode(
        texts, show_progress_bar=False, normalize_embeddings=True
    )
    return vectors.tolist()
||||||
|
|
||||||
|
|
||||||
|
def embed_query(query: str) -> list[float]:
    """Generate the embedding for a single query string (delegates to embed_texts)."""
    return embed_texts([query])[0]
||||||
83
src/atocore/retrieval/retriever.py
Normal file
83
src/atocore/retrieval/retriever.py
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
"""Retrieval: query → ranked chunks."""
|
||||||
|
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
from atocore.observability.logger import get_logger
|
||||||
|
from atocore.retrieval.embeddings import embed_query
|
||||||
|
from atocore.retrieval.vector_store import get_vector_store
|
||||||
|
|
||||||
|
log = get_logger("retriever")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ChunkResult:
    """One retrieved chunk with its similarity score and source metadata."""

    chunk_id: str      # vector-store id (matches source_chunks.id)
    content: str       # chunk text
    score: float       # cosine similarity, rounded to 4 decimals (1 - distance)
    heading_path: str  # markdown heading breadcrumb of the chunk
    source_file: str   # absolute path of the originating markdown file
    tags: str          # JSON-encoded list of tags (string, not parsed)
    title: str         # title of the originating document
    document_id: str   # source_documents.id of the parent document
||||||
|
|
||||||
|
|
||||||
|
def retrieve(
    query: str,
    top_k: int | None = None,
    filter_tags: list[str] | None = None,
) -> list[ChunkResult]:
    """Retrieve the most relevant chunks for a query.

    Embeds *query*, runs a similarity search against the vector store, and
    converts the raw results into ChunkResult objects ordered as returned by
    the store. ``top_k`` falls back to ``settings.context_top_k``; only the
    first entry of ``filter_tags`` is used for filtering.
    """
    top_k = top_k or settings.context_top_k
    start = time.time()

    query_embedding = embed_query(query)
    store = get_vector_store()

    # Build filter
    where = None
    if filter_tags:
        # ChromaDB where filter for tags (stored as JSON string)
        # Simple contains check — works for single-tag filtering
        # NOTE(review): "$contains" is documented for where_document (full-text
        # search), not for metadata `where` filters — confirm the installed
        # ChromaDB version accepts this operator here.
        where = {"tags": {"$contains": filter_tags[0]}}

    results = store.query(
        query_embedding=query_embedding,
        top_k=top_k,
        where=where,
    )

    chunks = []
    # ChromaDB returns per-query nested lists; we sent one query, so index [0].
    if results and results["ids"] and results["ids"][0]:
        for i, chunk_id in enumerate(results["ids"][0]):
            # ChromaDB returns distances (lower = more similar for cosine)
            # Convert to similarity score (1 - distance)
            distance = results["distances"][0][i] if results["distances"] else 0
            score = 1.0 - distance
            meta = results["metadatas"][0][i] if results["metadatas"] else {}
            content = results["documents"][0][i] if results["documents"] else ""

            chunks.append(
                ChunkResult(
                    chunk_id=chunk_id,
                    content=content,
                    score=round(score, 4),
                    heading_path=meta.get("heading_path", ""),
                    source_file=meta.get("source_file", ""),
                    tags=meta.get("tags", "[]"),
                    title=meta.get("title", ""),
                    document_id=meta.get("document_id", ""),
                )
            )

    duration_ms = int((time.time() - start) * 1000)
    log.info(
        "retrieval_done",
        query=query[:100],
        top_k=top_k,
        results_count=len(chunks),
        duration_ms=duration_ms,
    )

    return chunks
||||||
77
src/atocore/retrieval/vector_store.py
Normal file
77
src/atocore/retrieval/vector_store.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
"""ChromaDB vector store wrapper."""
|
||||||
|
|
||||||
|
import chromadb
|
||||||
|
|
||||||
|
from atocore.config import settings
|
||||||
|
from atocore.observability.logger import get_logger
|
||||||
|
from atocore.retrieval.embeddings import embed_texts
|
||||||
|
|
||||||
|
log = get_logger("vector_store")
|
||||||
|
|
||||||
|
COLLECTION_NAME = "atocore_chunks"
|
||||||
|
|
||||||
|
_store: "VectorStore | None" = None
|
||||||
|
|
||||||
|
|
||||||
|
class VectorStore:
    """Wrapper around ChromaDB for chunk storage and retrieval."""

    def __init__(self) -> None:
        """Open (or create) the persistent collection under ``settings.chroma_path``."""
        settings.chroma_path.mkdir(parents=True, exist_ok=True)
        self._client = chromadb.PersistentClient(path=str(settings.chroma_path))
        # Cosine distance space — the retriever derives similarity as
        # 1 - distance.
        self._collection = self._client.get_or_create_collection(
            name=COLLECTION_NAME,
            metadata={"hnsw:space": "cosine"},
        )
        log.info("vector_store_initialized", path=str(settings.chroma_path))

    def add(
        self,
        ids: list[str],
        documents: list[str],
        metadatas: list[dict],
    ) -> None:
        """Embed *documents* and add them to the collection.

        The three lists are parallel: ids[i] / documents[i] / metadatas[i]
        describe one chunk.
        """
        embeddings = embed_texts(documents)
        self._collection.add(
            ids=ids,
            embeddings=embeddings,
            documents=documents,
            metadatas=metadatas,
        )
        log.debug("vectors_added", count=len(ids))

    def query(
        self,
        query_embedding: list[float],
        top_k: int = 10,
        where: dict | None = None,
    ) -> dict:
        """Return the raw ChromaDB query result for the *top_k* nearest chunks.

        *where* is an optional metadata filter, forwarded as-is only when
        provided.
        """
        kwargs: dict = {
            "query_embeddings": [query_embedding],
            "n_results": top_k,
            "include": ["documents", "metadatas", "distances"],
        }
        if where:
            kwargs["where"] = where

        return self._collection.query(**kwargs)

    def delete(self, ids: list[str]) -> None:
        """Delete chunks by IDs; a no-op for an empty list."""
        if ids:
            self._collection.delete(ids=ids)
            log.debug("vectors_deleted", count=len(ids))

    @property
    def count(self) -> int:
        # Number of vectors currently stored in the collection.
        return self._collection.count()
||||||
|
|
||||||
|
|
||||||
|
def get_vector_store() -> VectorStore:
    """Get or create the singleton vector store.

    The instance is created lazily on first call and cached in the
    module-level ``_store`` global (tests reset ``_store`` to None to force
    re-initialisation against a fresh data directory).
    """
    global _store
    if _store is None:
        _store = VectorStore()
    return _store
||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
114
tests/conftest.py
Normal file
114
tests/conftest.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
"""pytest configuration and shared fixtures."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# Force test data directory
# Set at conftest import time — presumably before any atocore module reads
# these variables into its settings; confirm import ordering if settings ever
# pick up ./data during tests.
os.environ["ATOCORE_DATA_DIR"] = tempfile.mkdtemp(prefix="atocore_test_")
os.environ["ATOCORE_DEBUG"] = "true"
|
|
||||||
|
|
||||||
|
@pytest.fixture
def tmp_data_dir(tmp_path):
    """Provide a temporary data directory for tests."""
    os.environ["ATOCORE_DATA_DIR"] = str(tmp_path)
    # Reset singletons
    # Re-instantiate Settings so the new ATOCORE_DATA_DIR is picked up, and
    # drop the cached VectorStore so the next get_vector_store() call opens a
    # store under the fresh directory.
    from atocore import config
    config.settings = config.Settings()

    import atocore.retrieval.vector_store as vs
    vs._store = None

    return tmp_path
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_markdown(tmp_path) -> Path:
    """Create a sample markdown file for testing.

    The file carries YAML frontmatter (tags, date), an H1 title, and several
    H2 sections — enough structure to exercise the parser, chunker, and
    retrieval tests.
    """
    md_file = tmp_path / "test_note.md"
    md_file.write_text(
        """---
tags:
  - atocore
  - architecture
date: 2026-04-05
---
# AtoCore Architecture

## Overview

AtoCore is a personal context engine that enriches LLM interactions
with durable memory, structured context, and project knowledge.

## Layers

The system has these layers:

1. Main PKM (human, messy, exploratory)
2. AtoVault (system mirror)
3. AtoDrive (trusted project truth)
4. Structured Memory (DB)
5. Semantic Retrieval (vector DB)

## Memory Types

AtoCore supports these memory types:

- Identity
- Preferences
- Project Memory
- Episodic Memory
- Knowledge Objects
- Adaptation Memory
- Trusted Project State

## Trust Precedence

When sources conflict:

1. Trusted Project State wins
2. AtoDrive overrides PKM
3. Most recent confirmed wins
4. Higher confidence wins
5. Equal → flag conflict

No silent merging.
""",
        encoding="utf-8",
    )
    return md_file
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_folder(tmp_path, sample_markdown) -> Path:
    """Create a folder with multiple markdown files.

    Returns *tmp_path*, which holds both ``test_note.md`` (from the
    sample_markdown fixture) and a second note written here.
    """
    # Already has test_note.md from sample_markdown
    second = tmp_path / "second_note.md"
    second.write_text(
        """---
tags:
  - chunking
---
# Chunking Strategy

## Approach

Heading-aware recursive splitting:

1. Split on H2 boundaries first
2. If section > 800 chars, split on H3
3. If still > 800 chars, split on paragraphs
4. Hard split at 800 chars with 100 char overlap

## Parameters

- max_chunk_size: 800 characters
- overlap: 100 characters
- min_chunk_size: 50 characters
""",
        encoding="utf-8",
    )
    return tmp_path
||||||
73
tests/test_chunker.py
Normal file
73
tests/test_chunker.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
"""Tests for the markdown chunker."""
|
||||||
|
|
||||||
|
from atocore.ingestion.chunker import chunk_markdown
|
||||||
|
|
||||||
|
|
||||||
|
def test_basic_chunking():
    """Test that markdown is split into chunks."""
    body = """## Section One

This is the first section with some content that is long enough to pass the minimum chunk size filter applied by the chunker.

## Section Two

This is the second section with different content that is also long enough to pass the minimum chunk size threshold.
"""
    chunks = chunk_markdown(body)
    # Two H2 sections → at least two chunks with sane bookkeeping fields.
    assert len(chunks) >= 2
    assert all(c.char_count > 0 for c in chunks)
    assert all(c.chunk_index >= 0 for c in chunks)


def test_heading_path_preserved():
    """Test that heading paths are captured."""
    body = """## Architecture

### Layers

The system has multiple layers organized in a clear hierarchy for separation of concerns and maintainability.
"""
    chunks = chunk_markdown(body)
    assert len(chunks) >= 1
    # At least one chunk should have heading info
    has_heading = any(c.heading_path for c in chunks)
    assert has_heading


def test_small_chunks_filtered():
    """Test that very small chunks are discarded."""
    body = """## A

Hi

## B

This is a real section with enough content to pass the minimum size threshold.
"""
    chunks = chunk_markdown(body, min_size=50)
    # "Hi" should be filtered out
    for c in chunks:
        assert c.char_count >= 50


def test_large_section_split():
    """Test that large sections are split further."""
    large_content = "Word " * 200  # ~1000 chars
    body = f"## Big Section\n\n{large_content}"
    chunks = chunk_markdown(body, max_size=400)
    # A single ~1000-char section must be split under a 400-char cap.
    assert len(chunks) >= 2


def test_metadata_passed_through():
    """Test that base metadata is included in chunks."""
    body = "## Test\n\nSome content here that is long enough."
    meta = {"source_file": "/test/file.md", "tags": ["test"]}
    chunks = chunk_markdown(body, base_metadata=meta)
    if chunks:
        assert chunks[0].metadata.get("source_file") == "/test/file.md"


def test_empty_body():
    """Test chunking an empty body."""
    chunks = chunk_markdown("")
    assert chunks == []
||||||
60
tests/test_context_builder.py
Normal file
60
tests/test_context_builder.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
"""Tests for the context builder."""
|
||||||
|
|
||||||
|
from atocore.context.builder import build_context, get_last_context_pack
|
||||||
|
from atocore.ingestion.pipeline import ingest_file
|
||||||
|
from atocore.models.database import init_db
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_context_returns_pack(tmp_data_dir, sample_markdown):
    """Test that context builder returns a valid pack."""
    init_db()
    ingest_file(sample_markdown)

    pack = build_context("What is AtoCore?")
    assert pack.total_chars > 0
    assert len(pack.chunks_used) > 0
    assert pack.budget_remaining >= 0
    # The formatted context is wrapped in sentinel markers.
    assert "--- AtoCore Context ---" in pack.formatted_context
    assert "--- End Context ---" in pack.formatted_context


def test_context_respects_budget(tmp_data_dir, sample_markdown):
    """Test that context builder respects character budget."""
    init_db()
    ingest_file(sample_markdown)

    pack = build_context("What is AtoCore?", budget=500)
    assert pack.total_chars <= 500


def test_context_with_project_hint(tmp_data_dir, sample_markdown):
    """Test that project hint boosts relevant chunks."""
    init_db()
    ingest_file(sample_markdown)

    pack = build_context("What is the architecture?", project_hint="atocore")
    assert len(pack.chunks_used) > 0
    # With project hint, we should still get results
    assert pack.total_chars > 0


def test_last_context_pack_stored(tmp_data_dir, sample_markdown):
    """Test that last context pack is stored for debug."""
    init_db()
    ingest_file(sample_markdown)

    build_context("test prompt")
    last = get_last_context_pack()
    assert last is not None
    assert last.query == "test prompt"


def test_full_prompt_structure(tmp_data_dir, sample_markdown):
    """Test that the full prompt has correct structure."""
    init_db()
    ingest_file(sample_markdown)

    pack = build_context("What are memory types?")
    # Full prompt = instruction preamble + context block + the user's query.
    assert "knowledge base" in pack.full_prompt.lower()
    assert "--- AtoCore Context ---" in pack.full_prompt
    assert "What are memory types?" in pack.full_prompt
||||||
71
tests/test_ingestion.py
Normal file
71
tests/test_ingestion.py
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
"""Tests for the ingestion pipeline."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from atocore.ingestion.parser import parse_markdown
|
||||||
|
from atocore.models.database import get_connection, init_db
|
||||||
|
from atocore.ingestion.pipeline import ingest_file
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_markdown(sample_markdown):
    """Test markdown parsing with frontmatter."""
    parsed = parse_markdown(sample_markdown)
    # Title comes from the H1; tags from the YAML frontmatter.
    assert parsed.title == "AtoCore Architecture"
    assert "atocore" in parsed.tags
    assert "architecture" in parsed.tags
    assert len(parsed.body) > 0
    assert len(parsed.headings) > 0


def test_parse_extracts_headings(sample_markdown):
    """Test that headings are extracted correctly."""
    parsed = parse_markdown(sample_markdown)
    # headings are (level, text) tuples — compare on the text component.
    heading_texts = [h[1] for h in parsed.headings]
    assert "AtoCore Architecture" in heading_texts
    assert "Overview" in heading_texts


def test_ingest_file(tmp_data_dir, sample_markdown):
    """Test ingesting a single file."""
    init_db()
    result = ingest_file(sample_markdown)
    assert result["status"] == "ingested"
    assert result["chunks"] > 0

    # Verify the file was stored in DB
    with get_connection() as conn:
        doc = conn.execute(
            "SELECT COUNT(*) as c FROM source_documents WHERE file_path = ?",
            (str(sample_markdown.resolve()),),
        ).fetchone()
        assert doc["c"] == 1

        chunks = conn.execute(
            "SELECT COUNT(*) as c FROM source_chunks sc "
            "JOIN source_documents sd ON sc.document_id = sd.id "
            "WHERE sd.file_path = ?",
            (str(sample_markdown.resolve()),),
        ).fetchone()
        assert chunks["c"] > 0


def test_ingest_skips_unchanged(tmp_data_dir, sample_markdown):
    """Test that re-ingesting unchanged file is skipped."""
    init_db()
    ingest_file(sample_markdown)
    result = ingest_file(sample_markdown)
    assert result["status"] == "skipped"


def test_ingest_updates_changed(tmp_data_dir, sample_markdown):
    """Test that changed files are re-ingested."""
    init_db()
    ingest_file(sample_markdown)

    # Modify the file
    sample_markdown.write_text(
        sample_markdown.read_text(encoding="utf-8") + "\n\n## New Section\n\nNew content added.",
        encoding="utf-8",
    )
    result = ingest_file(sample_markdown)
    assert result["status"] == "ingested"
||||||
40
tests/test_prompts/gigabit_prompts.yaml
Normal file
40
tests/test_prompts/gigabit_prompts.yaml
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
prompts:
  - id: g1
    prompt: "What is the GigaBIT M1 project about?"
    project: gigabit
    expected: "Should mention 1.2m primary mirror, StarSpec, telescope"

  - id: g2
    prompt: "What are the main requirements for the M1 mirror?"
    project: gigabit
    expected: "Should mention optical/mechanical requirements, SOW, diameter, Zerodur"

  - id: g3
    prompt: "What vendors are involved in the project?"
    project: gigabit
    expected: "Should mention Optiques Fullum, StarSpec, Atomaste, or subcontractors"

  - id: g4
    prompt: "What is the status of the CDR?"
    project: gigabit
    expected: "Should mention Critical Design Review status, CBUSH, design completion"

  - id: g5
    prompt: "What are the key design decisions made so far?"
    project: gigabit
    expected: "Should mention design phases, PDR, assumptions, blank order"

  - id: g6
    prompt: "What FEA optimization work has been done?"
    project: gigabit
    expected: "Should mention FEA analysis, optimization approach, WFE, displacement data"

  - id: g7
    prompt: "What is the cost reduction strategy?"
    project: gigabit
    expected: "Should mention cost reduction campaign, trade-off, topology selection"

  - id: g8
    prompt: "What are the mirror blank specifications?"
    project: gigabit
    expected: "Should mention 1200mm diameter, Zerodur, optical specifications"
|
||||||
40
tests/test_prompts/prompts.yaml
Normal file
40
tests/test_prompts/prompts.yaml
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
prompts:
  - id: p1
    prompt: "What is AtoCore's architecture?"
    project: atocore
    expected: "Should mention layered architecture, SQLite, vector DB"

  - id: p2
    prompt: "What chunking strategy does AtoCore use?"
    project: atocore
    expected: "Should mention heading-aware splitting, 800 char max"

  - id: p3
    prompt: "What is the trust precedence order?"
    project: atocore
    expected: "Should list: Trusted Project State > AtoDrive > validated memory"

  - id: p4
    prompt: "How does AtoCore handle conflicts between sources?"
    project: atocore
    expected: "Should mention conflict resolution rules, no silent merging"

  - id: p5
    prompt: "What are the different memory types?"
    project: atocore
    expected: "Should list: Identity, Preferences, Project, Episodic, Knowledge, Adaptation, Trusted Project State"

  - id: p6
    prompt: "What is the context budget allocation?"
    project: atocore
    expected: "Should mention percentages: identity 5%, preferences 5%, project 20%, episodic 10%, retrieval 60%"

  - id: p7
    prompt: "What is a trivial prompt in AtoCore?"
    project: atocore
    expected: "Should mention: no project ref, no proper nouns, no past context dependency"

  - id: p8
    prompt: "What are the success criteria for the first win?"
    project: atocore
    expected: "Should mention: saves >=5 min lookup, >=80-90% accuracy, >=10 test prompts"
|
||||||
41
tests/test_retrieval.py
Normal file
41
tests/test_retrieval.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
"""Tests for the retrieval system."""
|
||||||
|
|
||||||
|
from atocore.ingestion.pipeline import ingest_file
|
||||||
|
from atocore.models.database import init_db
|
||||||
|
from atocore.retrieval.retriever import retrieve
|
||||||
|
from atocore.retrieval.vector_store import get_vector_store
|
||||||
|
|
||||||
|
|
||||||
|
def test_retrieve_returns_results(tmp_data_dir, sample_markdown):
    """Retrieval over an ingested document returns scored, non-empty chunks."""
    init_db()
    ingest_file(sample_markdown)

    hits = retrieve("What are the memory types?", top_k=5)

    assert len(hits) > 0
    for hit in hits:
        assert hit.score > 0
        assert hit.content
|
||||||
|
|
||||||
|
|
||||||
|
def test_retrieve_scores_ranked(tmp_data_dir, sample_markdown):
    """When more than one chunk comes back, scores arrive in descending order."""
    init_db()
    ingest_file(sample_markdown)

    hits = retrieve("architecture layers", top_k=5)

    # Ordering is only observable with two or more results; a single hit
    # (small fixture) is trivially ordered, so there is nothing to check.
    if len(hits) >= 2:
        observed = [hit.score for hit in hits]
        assert observed == sorted(observed, reverse=True)
|
||||||
|
|
||||||
|
|
||||||
|
def test_vector_store_count(tmp_data_dir, sample_markdown):
    """After ingestion the vector store reports a positive chunk count."""
    init_db()

    # The store is a module-level singleton; reset it so this test starts
    # from a clean instance instead of one cached by an earlier test.
    import atocore.retrieval.vector_store as vs

    vs._store = None

    ingest_file(sample_markdown)

    store = get_vector_store()
    assert store.count > 0
|
||||||
Reference in New Issue
Block a user