diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..9cdb6a1 --- /dev/null +++ b/.env.example @@ -0,0 +1,8 @@ +ATOCORE_DEBUG=false +ATOCORE_DATA_DIR=./data +ATOCORE_HOST=127.0.0.1 +ATOCORE_PORT=8100 +ATOCORE_EMBEDDING_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 +ATOCORE_CHUNK_MAX_SIZE=800 +ATOCORE_CHUNK_OVERLAP=100 +ATOCORE_CONTEXT_BUDGET=3000 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..178704b --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +data/ +__pycache__/ +*.pyc +.env +*.egg-info/ +dist/ +build/ +.pytest_cache/ +htmlcov/ +.coverage +venv/ +.venv/ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ffba0d3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,36 @@ +[build-system] +requires = ["setuptools>=68.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "atocore" +version = "0.1.0" +description = "Personal context engine for LLM interactions" +requires-python = ">=3.11" +dependencies = [ + "fastapi>=0.110.0", + "uvicorn[standard]>=0.27.0", + "python-frontmatter>=1.1.0", + "chromadb>=0.4.22", + "sentence-transformers>=2.5.0", + "pydantic>=2.6.0", + "pydantic-settings>=2.1.0", + "structlog>=24.1.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.0.0", + "pytest-cov>=4.1.0", + "httpx>=0.27.0", + "pyyaml>=6.0.0", +] + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_functions = ["test_*"] +addopts = "--cov=atocore --cov-report=term-missing -v" diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..bbd5f6b --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,5 @@ +-r requirements.txt +pytest>=8.0.0 +pytest-cov>=4.1.0 +httpx>=0.27.0 +pyyaml>=6.0.0 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..42fcd7e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ 
def main():
    """Parse CLI args, ingest every markdown file under --path, and print a summary."""
    cli = argparse.ArgumentParser(description="Ingest markdown files into AtoCore")
    cli.add_argument("--path", required=True, help="Path to folder with markdown files")
    opts = cli.parse_args()

    setup_logging()
    init_db()

    folder = Path(opts.path)
    if not folder.is_dir():
        print(f"Error: {folder} is not a directory")
        sys.exit(1)

    results = ingest_folder(folder)

    # Tally outcomes for the summary block below.
    ingested = len([r for r in results if r["status"] == "ingested"])
    skipped = len([r for r in results if r["status"] == "skipped"])
    errors = len([r for r in results if r["status"] == "error"])
    total_chunks = sum(r.get("chunks", 0) for r in results)

    rule = "=" * 50
    print(f"\n{rule}")
    print("Ingestion complete:")
    print(f"  Files processed: {len(results)}")
    print(f"  Ingested: {ingested}")
    print(f"  Skipped (unchanged): {skipped}")
    print(f"  Errors: {errors}")
    print(f"  Total chunks created: {total_chunks}")
    print(rule)

    # List each failed file so the operator can re-run just those.
    failures = [r for r in results if r["status"] == "error"]
    if failures:
        print("\nErrors:")
        for item in failures:
            print(f"  {item['file']}: {item['error']}")


if __name__ == "__main__":
    main()
+"""CLI script to run test prompts and compare baseline vs enriched.""" + +import argparse +import sys +from pathlib import Path + +import yaml + +# Add src to path +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from atocore.context.builder import build_context +from atocore.models.database import init_db +from atocore.observability.logger import setup_logging + + +def main(): + parser = argparse.ArgumentParser(description="Run test prompts against AtoCore") + parser.add_argument( + "--prompts", + default=str(Path(__file__).parent.parent / "tests" / "test_prompts" / "prompts.yaml"), + help="Path to prompts YAML file", + ) + args = parser.parse_args() + + setup_logging() + init_db() + + prompts_path = Path(args.prompts) + if not prompts_path.exists(): + print(f"Error: {prompts_path} not found") + sys.exit(1) + + with open(prompts_path) as f: + data = yaml.safe_load(f) + + prompts = data.get("prompts", []) + print(f"Running {len(prompts)} test prompts...\n") + + for p in prompts: + prompt_id = p["id"] + prompt_text = p["prompt"] + project = p.get("project") + expected = p.get("expected", "") + + print(f"{'='*60}") + print(f"[{prompt_id}] {prompt_text}") + print(f"Project: {project or 'none'}") + print(f"Expected: {expected}") + print(f"-" * 60) + + pack = build_context( + user_prompt=prompt_text, + project_hint=project, + ) + + print(f"Chunks retrieved: {len(pack.chunks_used)}") + print(f"Total chars: {pack.total_chars} / {pack.budget}") + print(f"Duration: {pack.duration_ms}ms") + print() + + for i, chunk in enumerate(pack.chunks_used[:5]): + print(f" [{i+1}] Score: {chunk.score:.2f} | {chunk.source_file}") + print(f" Section: {chunk.heading_path}") + print(f" Preview: {chunk.content[:120]}...") + print() + + print(f"Full prompt length: {len(pack.full_prompt)} chars") + print() + + print(f"{'='*60}") + print("Done. 
Review output above to assess retrieval quality.") + + +if __name__ == "__main__": + main() diff --git a/src/atocore/__init__.py b/src/atocore/__init__.py new file mode 100644 index 0000000..cee2512 --- /dev/null +++ b/src/atocore/__init__.py @@ -0,0 +1,3 @@ +"""AtoCore — Personal Context Engine.""" + +__version__ = "0.1.0" diff --git a/src/atocore/api/__init__.py b/src/atocore/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/atocore/api/routes.py b/src/atocore/api/routes.py new file mode 100644 index 0000000..8f3e59d --- /dev/null +++ b/src/atocore/api/routes.py @@ -0,0 +1,132 @@ +"""FastAPI route definitions.""" + +from pathlib import Path + +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel + +from atocore.context.builder import ( + ContextPack, + build_context, + get_last_context_pack, + _pack_to_dict, +) +from atocore.ingestion.pipeline import ingest_file, ingest_folder +from atocore.retrieval.retriever import retrieve +from atocore.retrieval.vector_store import get_vector_store + +router = APIRouter() + + +# --- Request/Response models --- + + +class IngestRequest(BaseModel): + path: str # file or folder path + + +class IngestResponse(BaseModel): + results: list[dict] + + +class QueryRequest(BaseModel): + prompt: str + top_k: int = 10 + filter_tags: list[str] | None = None + + +class QueryResponse(BaseModel): + results: list[dict] + + +class ContextBuildRequest(BaseModel): + prompt: str + project: str | None = None + budget: int | None = None + + +class ContextBuildResponse(BaseModel): + formatted_context: str + full_prompt: str + chunks_used: int + total_chars: int + budget: int + budget_remaining: int + duration_ms: int + chunks: list[dict] + + +# --- Endpoints --- + + +@router.post("/ingest", response_model=IngestResponse) +def api_ingest(req: IngestRequest): + """Ingest a markdown file or folder.""" + target = Path(req.path) + if target.is_file(): + results = [ingest_file(target)] + elif 
@router.post("/ingest", response_model=IngestResponse)
def api_ingest(req: IngestRequest):
    """Ingest a markdown file or a folder of markdown files.

    Returns per-file ingestion stats; 404 if the path does not exist.
    """
    target = Path(req.path)
    # Directory first, then file — the two cases are mutually exclusive.
    if target.is_dir():
        results = ingest_folder(target)
    elif target.is_file():
        results = [ingest_file(target)]
    else:
        raise HTTPException(status_code=404, detail=f"Path not found: {req.path}")
    return IngestResponse(results=results)


@router.post("/query", response_model=QueryResponse)
def api_query(req: QueryRequest):
    """Retrieve relevant chunks for a prompt."""
    hits = retrieve(req.prompt, top_k=req.top_k, filter_tags=req.filter_tags)
    payload = []
    for hit in hits:
        payload.append(
            {
                "chunk_id": hit.chunk_id,
                "content": hit.content,
                "score": hit.score,
                "heading_path": hit.heading_path,
                "source_file": hit.source_file,
                "title": hit.title,
            }
        )
    return QueryResponse(results=payload)


@router.post("/context/build", response_model=ContextBuildResponse)
def api_build_context(req: ContextBuildRequest):
    """Build a full context pack for a prompt."""
    pack = build_context(
        user_prompt=req.prompt,
        project_hint=req.project,
        budget=req.budget,
    )
    serialized = _pack_to_dict(pack)
    return ContextBuildResponse(
        formatted_context=pack.formatted_context,
        full_prompt=pack.full_prompt,
        chunks_used=len(pack.chunks_used),
        total_chars=pack.total_chars,
        budget=pack.budget,
        budget_remaining=pack.budget_remaining,
        duration_ms=pack.duration_ms,
        chunks=serialized["chunks"],
    )


@router.get("/health")
def api_health():
    """Health check."""
    return {
        "status": "ok",
        "version": "0.1.0",
        "vectors_count": get_vector_store().count,
    }


@router.get("/debug/context")
def api_debug_context():
    """Inspect the last assembled context pack."""
    pack = get_last_context_pack()
    if pack is not None:
        return _pack_to_dict(pack)
    return {"message": "No context pack built yet."}
class Settings(BaseSettings):
    """AtoCore runtime settings.

    Every field can be overridden with an environment variable prefixed
    ``ATOCORE_`` (e.g. ``ATOCORE_PORT=9000``), per ``model_config`` below.
    """

    # Core service settings
    debug: bool = False  # enables verbose/console logging (see logger setup)
    data_dir: Path = Path("./data")  # root for the SQLite DB and Chroma files
    host: str = "127.0.0.1"
    port: int = 8100

    # Embedding
    embedding_model: str = (
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )

    # Chunking (sizes are in characters)
    chunk_max_size: int = 800
    chunk_overlap: int = 100
    chunk_min_size: int = 50

    # Context assembly
    context_budget: int = 3000  # character budget for the context block
    context_top_k: int = 15  # candidates fetched before budget selection

    model_config = {"env_prefix": "ATOCORE_"}

    @property
    def db_path(self) -> Path:
        """Path of the SQLite database file inside data_dir."""
        return self.data_dir / "atocore.db"

    @property
    def chroma_path(self) -> Path:
        """Directory reserved for ChromaDB storage inside data_dir."""
        return self.data_dir / "chroma"


# Singleton settings instance shared across the application.
settings = Settings()
+) + +# Last built context pack for debug inspection +_last_context_pack: "ContextPack | None" = None + + +@dataclass +class ContextChunk: + content: str + source_file: str + heading_path: str + score: float + char_count: int + + +@dataclass +class ContextPack: + chunks_used: list[ContextChunk] = field(default_factory=list) + total_chars: int = 0 + budget: int = 0 + budget_remaining: int = 0 + formatted_context: str = "" + full_prompt: str = "" + query: str = "" + project_hint: str = "" + duration_ms: int = 0 + + +def build_context( + user_prompt: str, + project_hint: str | None = None, + budget: int | None = None, +) -> ContextPack: + """Build a context pack for a user prompt.""" + global _last_context_pack + start = time.time() + budget = budget or settings.context_budget + + # 1. Retrieve candidates + candidates = retrieve(user_prompt, top_k=settings.context_top_k) + + # 2. Score and rank + scored = _rank_chunks(candidates, project_hint) + + # 3. Select within budget + selected = _select_within_budget(scored, budget) + + # 4. Format + formatted = _format_context_block(selected) + + # 5. 
Build full prompt + full_prompt = f"{SYSTEM_PREFIX}\n\n{formatted}\n\n{user_prompt}" + + total_chars = sum(c.char_count for c in selected) + duration_ms = int((time.time() - start) * 1000) + + pack = ContextPack( + chunks_used=selected, + total_chars=total_chars, + budget=budget, + budget_remaining=budget - total_chars, + formatted_context=formatted, + full_prompt=full_prompt, + query=user_prompt, + project_hint=project_hint or "", + duration_ms=duration_ms, + ) + + _last_context_pack = pack + + log.info( + "context_built", + chunks_used=len(selected), + total_chars=total_chars, + budget_remaining=budget - total_chars, + duration_ms=duration_ms, + ) + log.debug("context_pack_detail", pack=_pack_to_dict(pack)) + + return pack + + +def get_last_context_pack() -> ContextPack | None: + """Return the last built context pack for debug inspection.""" + return _last_context_pack + + +def _rank_chunks( + candidates: list[ChunkResult], + project_hint: str | None, +) -> list[tuple[float, ChunkResult]]: + """Rank candidates with boosting for project match.""" + scored = [] + seen_content: set[str] = set() + + for chunk in candidates: + # Deduplicate by content prefix (first 200 chars) + content_key = chunk.content[:200] + if content_key in seen_content: + continue + seen_content.add(content_key) + + # Base score from similarity + final_score = chunk.score + + # Project boost + if project_hint: + tags_str = chunk.tags.lower() if chunk.tags else "" + source_str = chunk.source_file.lower() + title_str = chunk.title.lower() if chunk.title else "" + hint_lower = project_hint.lower() + + if hint_lower in tags_str or hint_lower in source_str or hint_lower in title_str: + final_score += 0.3 + + scored.append((final_score, chunk)) + + # Sort by score descending + scored.sort(key=lambda x: x[0], reverse=True) + return scored + + +def _select_within_budget( + scored: list[tuple[float, ChunkResult]], + budget: int, +) -> list[ContextChunk]: + """Select top chunks that fit within the 
character budget.""" + selected = [] + used = 0 + + for score, chunk in scored: + chunk_len = len(chunk.content) + if used + chunk_len > budget: + continue + selected.append( + ContextChunk( + content=chunk.content, + source_file=_shorten_path(chunk.source_file), + heading_path=chunk.heading_path, + score=score, + char_count=chunk_len, + ) + ) + used += chunk_len + + return selected + + +def _format_context_block(chunks: list[ContextChunk]) -> str: + """Format chunks into the context block string.""" + if not chunks: + return "--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---" + + lines = ["--- AtoCore Context ---"] + for chunk in chunks: + lines.append( + f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]" + ) + lines.append(chunk.content) + lines.append("") + lines.append("--- End Context ---") + return "\n".join(lines) + + +def _shorten_path(path: str) -> str: + """Shorten an absolute path to a relative-like display.""" + p = Path(path) + parts = p.parts + # Show last 3 parts at most + if len(parts) > 3: + return str(Path(*parts[-3:])) + return str(p) + + +def _pack_to_dict(pack: ContextPack) -> dict: + """Convert a context pack to a JSON-serializable dict.""" + return { + "query": pack.query, + "project_hint": pack.project_hint, + "chunks_used": len(pack.chunks_used), + "total_chars": pack.total_chars, + "budget": pack.budget, + "budget_remaining": pack.budget_remaining, + "duration_ms": pack.duration_ms, + "chunks": [ + { + "source_file": c.source_file, + "heading_path": c.heading_path, + "score": c.score, + "char_count": c.char_count, + "content_preview": c.content[:100], + } + for c in pack.chunks_used + ], + } diff --git a/src/atocore/ingestion/__init__.py b/src/atocore/ingestion/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/atocore/ingestion/chunker.py b/src/atocore/ingestion/chunker.py new file mode 100644 index 0000000..90e7e54 --- /dev/null +++ 
b/src/atocore/ingestion/chunker.py @@ -0,0 +1,146 @@ +"""Heading-aware recursive markdown chunking.""" + +import re +from dataclasses import dataclass, field + +from atocore.config import settings + + +@dataclass +class Chunk: + content: str + chunk_index: int + heading_path: str + char_count: int + metadata: dict = field(default_factory=dict) + + +def chunk_markdown( + body: str, + base_metadata: dict | None = None, + max_size: int | None = None, + overlap: int | None = None, + min_size: int | None = None, +) -> list[Chunk]: + """Split markdown body into chunks using heading-aware strategy. + + 1. Split on H2 boundaries + 2. If section > max_size, split on H3 + 3. If still > max_size, split on paragraph breaks + 4. If still > max_size, hard split with overlap + """ + max_size = max_size or settings.chunk_max_size + overlap = overlap or settings.chunk_overlap + min_size = min_size or settings.chunk_min_size + base_metadata = base_metadata or {} + + sections = _split_by_heading(body, level=2) + raw_chunks: list[tuple[str, str]] = [] # (heading_path, content) + + for heading, content in sections: + if len(content) <= max_size: + raw_chunks.append((heading, content)) + else: + # Try splitting on H3 + subsections = _split_by_heading(content, level=3) + for sub_heading, sub_content in subsections: + full_path = ( + f"{heading} > {sub_heading}" if heading and sub_heading else heading or sub_heading + ) + if len(sub_content) <= max_size: + raw_chunks.append((full_path, sub_content)) + else: + # Split on paragraphs + para_chunks = _split_by_paragraphs( + sub_content, max_size, overlap + ) + for pc in para_chunks: + raw_chunks.append((full_path, pc)) + + # Build final chunks, filtering out too-small ones + chunks = [] + idx = 0 + for heading_path, content in raw_chunks: + content = content.strip() + if len(content) < min_size: + continue + chunks.append( + Chunk( + content=content, + chunk_index=idx, + heading_path=heading_path, + char_count=len(content), + 
metadata={**base_metadata}, + ) + ) + idx += 1 + + return chunks + + +def _split_by_heading(text: str, level: int) -> list[tuple[str, str]]: + """Split text by heading level. Returns (heading_text, section_content) pairs.""" + pattern = rf"^({'#' * level})\s+(.+)$" + parts: list[tuple[str, str]] = [] + current_heading = "" + current_lines: list[str] = [] + + for line in text.split("\n"): + match = re.match(pattern, line) + if match: + # Save previous section + if current_lines: + parts.append((current_heading, "\n".join(current_lines))) + current_heading = match.group(2).strip() + current_lines = [] + else: + current_lines.append(line) + + # Save last section + if current_lines: + parts.append((current_heading, "\n".join(current_lines))) + + return parts + + +def _split_by_paragraphs( + text: str, max_size: int, overlap: int +) -> list[str]: + """Split text by paragraph breaks, then hard-split if needed.""" + paragraphs = re.split(r"\n\n+", text) + chunks: list[str] = [] + current = "" + + for para in paragraphs: + para = para.strip() + if not para: + continue + + if len(current) + len(para) + 2 <= max_size: + current = f"{current}\n\n{para}" if current else para + else: + if current: + chunks.append(current) + # If single paragraph exceeds max, hard split + if len(para) > max_size: + chunks.extend(_hard_split(para, max_size, overlap)) + else: + current = para + continue + current = "" + + if current: + chunks.append(current) + + return chunks + + +def _hard_split(text: str, max_size: int, overlap: int) -> list[str]: + """Hard split text at max_size with overlap.""" + chunks = [] + start = 0 + while start < len(text): + end = start + max_size + chunks.append(text[start:end]) + start = end - overlap + return chunks diff --git a/src/atocore/ingestion/parser.py b/src/atocore/ingestion/parser.py new file mode 100644 index 0000000..2684ec0 --- /dev/null +++ b/src/atocore/ingestion/parser.py @@ -0,0 +1,65 @@ +"""Markdown file parsing with frontmatter extraction.""" + 
@dataclass
class ParsedDocument:
    """Parsed markdown file: frontmatter metadata plus document structure."""

    file_path: str
    title: str
    body: str
    tags: list[str] = field(default_factory=list)
    frontmatter: dict = field(default_factory=dict)
    headings: list[tuple[int, str]] = field(default_factory=list)


def parse_markdown(file_path: Path) -> ParsedDocument:
    """Parse a markdown file, extracting frontmatter and structure."""
    raw = file_path.read_text(encoding="utf-8")
    doc = frontmatter.loads(raw)

    fm_meta = dict(doc.metadata) if doc.metadata else {}
    body = doc.content.strip()

    # Tags may arrive as a YAML list or as a comma-separated string.
    raw_tags = fm_meta.get("tags", [])
    if isinstance(raw_tags, str):
        raw_tags = [part.strip() for part in raw_tags.split(",") if part.strip()]

    return ParsedDocument(
        file_path=str(file_path.resolve()),
        title=_extract_title(body, file_path),
        body=body,
        tags=raw_tags or [],
        frontmatter=fm_meta,
        headings=_extract_headings(body),
    )


def _extract_title(body: str, file_path: Path) -> str:
    """Return the first H1 in the body, else a prettified filename."""
    if (h1 := re.search(r"^#\s+(.+)$", body, re.MULTILINE)) is not None:
        return h1.group(1).strip()
    return file_path.stem.replace("_", " ").replace("-", " ").title()


def _extract_headings(body: str) -> list[tuple[int, str]]:
    """Return all H1–H4 headings as (level, text) pairs in document order."""
    return [
        (len(m.group(1)), m.group(2).strip())
        for m in re.finditer(r"^(#{1,4})\s+(.+)$", body, re.MULTILINE)
    ]
def ingest_file(file_path: Path) -> dict:
    """Ingest a single markdown file: parse → chunk → embed → store.

    Skips the file when its content hash is unchanged since the last run.
    On re-ingestion, old chunks and vectors are replaced within a single DB
    transaction.

    Returns:
        Stats dict whose "status" is "ingested", "skipped", or "empty".

    Raises:
        FileNotFoundError: If file_path does not exist.
        ValueError: If file_path is not a markdown file.
    """
    start = time.time()
    file_path = file_path.resolve()

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    if file_path.suffix.lower() not in (".md", ".markdown"):
        raise ValueError(f"Not a markdown file: {file_path}")

    # Hash the content so unchanged files can be skipped cheaply.
    raw_content = file_path.read_text(encoding="utf-8")
    file_hash = hashlib.sha256(raw_content.encode()).hexdigest()

    # Check if already ingested and unchanged.
    with get_connection() as conn:
        existing = conn.execute(
            "SELECT id, file_hash FROM source_documents WHERE file_path = ?",
            (str(file_path),),
        ).fetchone()

    if existing and existing["file_hash"] == file_hash:
        log.info("file_skipped_unchanged", file_path=str(file_path))
        return {"file": str(file_path), "status": "skipped", "reason": "unchanged"}

    # NOTE(review): parse_markdown re-reads the file from disk even though
    # raw_content was already read above for hashing — consider passing it in.
    parsed = parse_markdown(file_path)

    base_meta = {
        "source_file": str(file_path),
        "tags": parsed.tags,
        "title": parsed.title,
    }
    chunks = chunk_markdown(parsed.body, base_metadata=base_meta)

    if not chunks:
        log.warning("no_chunks_created", file_path=str(file_path))
        return {"file": str(file_path), "status": "empty", "chunks": 0}

    doc_id = str(uuid.uuid4())
    vector_store = get_vector_store()

    with get_connection() as conn:
        if existing:
            # Re-ingestion: drop old chunks/vectors, update the document row.
            doc_id = existing["id"]
            old_chunk_ids = [
                row["id"]
                for row in conn.execute(
                    "SELECT id FROM source_chunks WHERE document_id = ?",
                    (doc_id,),
                ).fetchall()
            ]
            conn.execute(
                "DELETE FROM source_chunks WHERE document_id = ?", (doc_id,)
            )
            conn.execute(
                "UPDATE source_documents SET file_hash = ?, title = ?, tags = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                (file_hash, parsed.title, json.dumps(parsed.tags), doc_id),
            )
            if old_chunk_ids:
                vector_store.delete(old_chunk_ids)
        else:
            conn.execute(
                "INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags) VALUES (?, ?, ?, ?, ?, ?)",
                (doc_id, str(file_path), file_hash, parsed.title, "markdown", json.dumps(parsed.tags)),
            )

        # Insert chunks, collecting parallel lists for the vector store.
        chunk_ids = []
        chunk_contents = []
        chunk_metadatas = []

        for chunk in chunks:
            chunk_id = str(uuid.uuid4())
            chunk_ids.append(chunk_id)
            chunk_contents.append(chunk.content)
            chunk_metadatas.append({
                "document_id": doc_id,
                "heading_path": chunk.heading_path,
                "source_file": str(file_path),
                "tags": json.dumps(parsed.tags),
                "title": parsed.title,
            })

            conn.execute(
                "INSERT INTO source_chunks (id, document_id, chunk_index, content, heading_path, char_count, metadata) VALUES (?, ?, ?, ?, ?, ?, ?)",
                (
                    chunk_id,
                    doc_id,
                    chunk.chunk_index,
                    chunk.content,
                    chunk.heading_path,
                    chunk.char_count,
                    json.dumps(chunk.metadata),
                ),
            )

        # Embedding happens inside the transaction so a failure here rolls
        # back the SQLite changes along with it.
        vector_store.add(chunk_ids, chunk_contents, chunk_metadatas)

    duration_ms = int((time.time() - start) * 1000)
    log.info(
        "file_ingested",
        file_path=str(file_path),
        chunks_created=len(chunks),
        duration_ms=duration_ms,
    )

    return {
        "file": str(file_path),
        "status": "ingested",
        "chunks": len(chunks),
        "duration_ms": duration_ms,
    }


def ingest_folder(folder_path: Path) -> list[dict]:
    """Ingest all markdown files in a folder recursively.

    Per-file failures are recorded as "error" result entries rather than
    aborting the whole run.

    Raises:
        NotADirectoryError: If folder_path is not a directory.
    """
    folder_path = folder_path.resolve()
    if not folder_path.is_dir():
        raise NotADirectoryError(f"Not a directory: {folder_path}")

    results = []
    # ingest_file accepts both .md and .markdown, so scan for both — the
    # previous "*.md"-only glob silently skipped .markdown files.
    md_files = sorted(
        p
        for p in folder_path.rglob("*")
        if p.is_file() and p.suffix.lower() in (".md", ".markdown")
    )
    log.info("ingestion_started", folder=str(folder_path), file_count=len(md_files))

    for md_file in md_files:
        try:
            results.append(ingest_file(md_file))
        except Exception as e:
            log.error("ingestion_error", file_path=str(md_file), error=str(e))
            results.append({"file": str(md_file), "status": "error", "error": str(e)})

    return results
    tags TEXT DEFAULT '[]',
    ingested_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS source_chunks (
    id TEXT PRIMARY KEY,
    document_id TEXT NOT NULL REFERENCES source_documents(id) ON DELETE CASCADE,
    chunk_index INTEGER NOT NULL,
    content TEXT NOT NULL,
    heading_path TEXT DEFAULT '',
    char_count INTEGER NOT NULL,
    metadata TEXT DEFAULT '{}',
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS memories (
    id TEXT PRIMARY KEY,
    memory_type TEXT NOT NULL,
    content TEXT NOT NULL,
    source_chunk_id TEXT REFERENCES source_chunks(id),
    confidence REAL DEFAULT 1.0,
    status TEXT DEFAULT 'active',
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS projects (
    id TEXT PRIMARY KEY,
    name TEXT UNIQUE NOT NULL,
    description TEXT DEFAULT '',
    status TEXT DEFAULT 'active',
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS interactions (
    id TEXT PRIMARY KEY,
    prompt TEXT NOT NULL,
    context_pack TEXT DEFAULT '{}',
    response_summary TEXT DEFAULT '',
    project_id TEXT REFERENCES projects(id),
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_chunks_document ON source_chunks(document_id);
CREATE INDEX IF NOT EXISTS idx_memories_type ON memories(memory_type);
CREATE INDEX IF NOT EXISTS idx_memories_status ON memories(status);
CREATE INDEX IF NOT EXISTS idx_interactions_project ON interactions(project_id);
"""


def _ensure_data_dir() -> None:
    # Create the data directory lazily so a fresh checkout works with no setup.
    settings.data_dir.mkdir(parents=True, exist_ok=True)


def init_db() -> None:
    """Initialize the database with schema."""
    # All DDL uses IF NOT EXISTS, so calling this on every startup is safe.
    _ensure_data_dir()
    with get_connection() as conn:
        conn.executescript(SCHEMA_SQL)
    log.info("database_initialized", path=str(settings.db_path))


@contextmanager
def get_connection() -> Generator[sqlite3.Connection, None, None]:
    """Get a database connection with row factory.

    Commits on clean exit of the ``with`` block, rolls back and re-raises on
    any exception, and always closes the connection.
    """
    _ensure_data_dir()
    conn = sqlite3.connect(str(settings.db_path))
    conn.row_factory = sqlite3.Row  # rows addressable by column name
    # SQLite disables foreign-key enforcement by default; enable it per
    # connection so ON DELETE CASCADE on source_chunks actually fires.
    conn.execute("PRAGMA foreign_keys = ON")
    try:
        yield conn
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()


"""Structured logging for AtoCore."""

import logging

import structlog

from atocore.config import settings

# Mapping from level names used in this module to stdlib logging levels.
_LOG_LEVELS = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR,
}


def setup_logging() -> None:
    """Configure structlog with JSON output."""
    # Debug mode raises verbosity and switches to console rendering.
    log_level = "DEBUG" if settings.debug else "INFO"

    structlog.configure(
        processors=[
            structlog.contextvars.merge_contextvars,
            structlog.processors.add_log_level,
            structlog.processors.TimeStamper(fmt="iso"),
            # Human-readable console output in debug, JSON lines otherwise.
            structlog.dev.ConsoleRenderer()
            if settings.debug
            else structlog.processors.JSONRenderer(),
        ],
        wrapper_class=structlog.make_filtering_bound_logger(
            _LOG_LEVELS.get(log_level, logging.INFO)
        ),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )


def get_logger(name: str) -> structlog.BoundLogger:
    """Get a named logger."""
    return structlog.get_logger(name)
def retrieve(
    query: str,
    top_k: int | None = None,
    filter_tags: list[str] | None = None,
) -> list[ChunkResult]:
    """Retrieve the most relevant chunks for a query.

    Args:
        query: Natural-language search query.
        top_k: Maximum number of chunks to return; defaults to
            ``settings.context_top_k``.
        filter_tags: Optional tags — only chunks whose tag metadata mentions
            at least one of them are returned.

    Returns:
        ChunkResult list ordered by similarity (highest score first).
    """
    top_k = top_k or settings.context_top_k
    start = time.time()

    query_embedding = embed_query(query)
    store = get_vector_store()

    # BUG FIX: ChromaDB's metadata `where` filter has no `$contains`
    # operator — that operator only exists for `where_document` (full-text
    # match over stored documents) — so the previous
    # `where={"tags": {"$contains": filter_tags[0]}}` raised at query time.
    # Tags are stored as a JSON-encoded string in chunk metadata, so we
    # over-fetch and post-filter on that string instead.
    fetch_k = top_k * 3 if filter_tags else top_k

    results = store.query(
        query_embedding=query_embedding,
        top_k=fetch_k,
    )

    chunks: list[ChunkResult] = []
    if results and results["ids"] and results["ids"][0]:
        for i, chunk_id in enumerate(results["ids"][0]):
            # ChromaDB returns distances (lower = more similar for cosine);
            # convert to a similarity score (1 - distance).
            distance = results["distances"][0][i] if results["distances"] else 0
            score = 1.0 - distance
            meta = results["metadatas"][0][i] if results["metadatas"] else {}
            content = results["documents"][0][i] if results["documents"] else ""
            tags = meta.get("tags", "[]")

            # Simple substring check against the JSON tag string — covers the
            # single-tag filtering the old (broken) `where` clause intended,
            # generalized to any-of over all requested tags.
            if filter_tags and not any(tag in tags for tag in filter_tags):
                continue

            chunks.append(
                ChunkResult(
                    chunk_id=chunk_id,
                    content=content,
                    score=round(score, 4),
                    heading_path=meta.get("heading_path", ""),
                    source_file=meta.get("source_file", ""),
                    tags=tags,
                    title=meta.get("title", ""),
                    document_id=meta.get("document_id", ""),
                )
            )
            # Stop once the caller's requested count is filled (we may have
            # over-fetched for post-filtering).
            if len(chunks) >= top_k:
                break

    duration_ms = int((time.time() - start) * 1000)
    log.info(
        "retrieval_done",
        query=query[:100],
        top_k=top_k,
        results_count=len(chunks),
        duration_ms=duration_ms,
    )

    return chunks
chromadb.PersistentClient(path=str(settings.chroma_path)) + self._collection = self._client.get_or_create_collection( + name=COLLECTION_NAME, + metadata={"hnsw:space": "cosine"}, + ) + log.info("vector_store_initialized", path=str(settings.chroma_path)) + + def add( + self, + ids: list[str], + documents: list[str], + metadatas: list[dict], + ) -> None: + """Add chunks with embeddings to the store.""" + embeddings = embed_texts(documents) + self._collection.add( + ids=ids, + embeddings=embeddings, + documents=documents, + metadatas=metadatas, + ) + log.debug("vectors_added", count=len(ids)) + + def query( + self, + query_embedding: list[float], + top_k: int = 10, + where: dict | None = None, + ) -> dict: + """Query the store for similar chunks.""" + kwargs: dict = { + "query_embeddings": [query_embedding], + "n_results": top_k, + "include": ["documents", "metadatas", "distances"], + } + if where: + kwargs["where"] = where + + return self._collection.query(**kwargs) + + def delete(self, ids: list[str]) -> None: + """Delete chunks by IDs.""" + if ids: + self._collection.delete(ids=ids) + log.debug("vectors_deleted", count=len(ids)) + + @property + def count(self) -> int: + return self._collection.count() + + +def get_vector_store() -> VectorStore: + """Get or create the singleton vector store.""" + global _store + if _store is None: + _store = VectorStore() + return _store diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..5bbafe0 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,114 @@ +"""pytest configuration and shared fixtures.""" + +import os +import tempfile +from pathlib import Path + +import pytest + +# Force test data directory +os.environ["ATOCORE_DATA_DIR"] = tempfile.mkdtemp(prefix="atocore_test_") +os.environ["ATOCORE_DEBUG"] = "true" + + +@pytest.fixture +def tmp_data_dir(tmp_path): + """Provide a temporary data directory for 
@pytest.fixture
def sample_markdown(tmp_path) -> Path:
    """Create a sample markdown file for testing.

    The note carries YAML frontmatter (tags, date) plus H1/H2 sections.
    Sibling tests assert on its exact title, headings, and tags, so the
    literal content below must stay byte-for-byte stable.
    """
    md_file = tmp_path / "test_note.md"
    # Written verbatim: parser tests rely on the "AtoCore Architecture"
    # title and the "atocore"/"architecture" tags from the frontmatter.
    md_file.write_text(
        """---
tags:
 - atocore
 - architecture
date: 2026-04-05
---
# AtoCore Architecture

## Overview

AtoCore is a personal context engine that enriches LLM interactions
with durable memory, structured context, and project knowledge.

## Layers

The system has these layers:

1. Main PKM (human, messy, exploratory)
2. AtoVault (system mirror)
3. AtoDrive (trusted project truth)
4. Structured Memory (DB)
5. Semantic Retrieval (vector DB)

## Memory Types

AtoCore supports these memory types:

- Identity
- Preferences
- Project Memory
- Episodic Memory
- Knowledge Objects
- Adaptation Memory
- Trusted Project State

## Trust Precedence

When sources conflict:

1. Trusted Project State wins
2. AtoDrive overrides PKM
3. Most recent confirmed wins
4. Higher confidence wins
5. Equal → flag conflict

No silent merging.
""",
        encoding="utf-8",
    )
    return md_file
def test_small_chunks_filtered():
    """Test that very small chunks are discarded.

    Also guards against a vacuous pass: previously an empty chunk list
    satisfied the loop below without verifying anything at all.
    """
    body = """## A

Hi

## B

This is a real section with enough content to pass the minimum size threshold.
"""
    chunks = chunk_markdown(body, min_size=50)
    # The "B" section is well above min_size, so at least one chunk must
    # survive filtering — otherwise the loop below proves nothing.
    assert chunks
    # "Hi" should be filtered out
    for c in chunks:
        assert c.char_count >= 50
def test_full_prompt_structure(tmp_data_dir, sample_markdown):
    """The assembled prompt must contain instructions, context, and query."""
    init_db()
    ingest_file(sample_markdown)

    question = "What are memory types?"
    pack = build_context(question)
    prompt = pack.full_prompt

    # Instruction preamble, delimited context block, and the original
    # question must all appear in the final prompt text.
    assert "knowledge base" in prompt.lower()
    assert "--- AtoCore Context ---" in prompt
    assert question in prompt
def test_ingest_file(tmp_data_dir, sample_markdown):
    """Ingesting a fresh file stores one document row plus its chunks."""
    init_db()
    outcome = ingest_file(sample_markdown)
    assert outcome["status"] == "ingested"
    assert outcome["chunks"] > 0

    resolved_path = str(sample_markdown.resolve())
    # Verify the document and its chunks landed in the database.
    with get_connection() as conn:
        doc_row = conn.execute(
            "SELECT COUNT(*) as c FROM source_documents WHERE file_path = ?",
            (resolved_path,),
        ).fetchone()
        assert doc_row["c"] == 1

        chunk_row = conn.execute(
            "SELECT COUNT(*) as c FROM source_chunks sc "
            "JOIN source_documents sd ON sc.document_id = sd.id "
            "WHERE sd.file_path = ?",
            (resolved_path,),
        ).fetchone()
        assert chunk_row["c"] > 0
+ project: gigabit + expected: "Should mention Optiques Fullum, StarSpec, Atomaste, or subcontractors" + + - id: g4 + prompt: "What is the status of the CDR?" + project: gigabit + expected: "Should mention Critical Design Review status, CBUSH, design completion" + + - id: g5 + prompt: "What are the key design decisions made so far?" + project: gigabit + expected: "Should mention design phases, PDR, assumptions, blank order" + + - id: g6 + prompt: "What FEA optimization work has been done?" + project: gigabit + expected: "Should mention FEA analysis, optimization approach, WFE, displacement data" + + - id: g7 + prompt: "What is the cost reduction strategy?" + project: gigabit + expected: "Should mention cost reduction campaign, trade-off, topology selection" + + - id: g8 + prompt: "What are the mirror blank specifications?" + project: gigabit + expected: "Should mention 1200mm diameter, Zerodur, optical specifications" diff --git a/tests/test_prompts/prompts.yaml b/tests/test_prompts/prompts.yaml new file mode 100644 index 0000000..f2cd912 --- /dev/null +++ b/tests/test_prompts/prompts.yaml @@ -0,0 +1,40 @@ +prompts: + - id: p1 + prompt: "What is AtoCore's architecture?" + project: atocore + expected: "Should mention layered architecture, SQLite, vector DB" + + - id: p2 + prompt: "What chunking strategy does AtoCore use?" + project: atocore + expected: "Should mention heading-aware splitting, 800 char max" + + - id: p3 + prompt: "What is the trust precedence order?" + project: atocore + expected: "Should list: Trusted Project State > AtoDrive > validated memory" + + - id: p4 + prompt: "How does AtoCore handle conflicts between sources?" + project: atocore + expected: "Should mention conflict resolution rules, no silent merging" + + - id: p5 + prompt: "What are the different memory types?" 
def test_vector_store_count(tmp_data_dir, sample_markdown):
    """The vector store reports a positive chunk count after ingestion."""
    init_db()

    # Drop the cached singleton so this test builds a fresh store under
    # the temporary data directory.
    import atocore.retrieval.vector_store as vs
    vs._store = None

    ingest_file(sample_markdown)
    assert get_vector_store().count > 0