feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)

Complete implementation of the personal context engine foundation:

- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:21:27 -04:00
parent 32ce409a7b
commit b4afbbb53a
34 changed files with 1756 additions and 0 deletions
--- a/scripts/ingest_folder.py
+++ b/scripts/ingest_folder.py
@@ -0,0 +1,54 @@
+"""CLI script to ingest a folder of markdown files."""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+# Add src to path
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+
+from atocore.ingestion.pipeline import ingest_folder
+from atocore.models.database import init_db
+from atocore.observability.logger import setup_logging
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Ingest markdown files into AtoCore")
+    parser.add_argument("--path", required=True, help="Path to folder with markdown files")
+    args = parser.parse_args()
+
+    setup_logging()
+    init_db()
+
+    folder = Path(args.path)
+    if not folder.is_dir():
+        print(f"Error: {folder} is not a directory")
+        sys.exit(1)
+
+    results = ingest_folder(folder)
+
+    # Summary
+    ingested = sum(1 for r in results if r["status"] == "ingested")
+    skipped = sum(1 for r in results if r["status"] == "skipped")
+    errors = sum(1 for r in results if r["status"] == "error")
+    total_chunks = sum(r.get("chunks", 0) for r in results)
+
+    print(f"\n{'='*50}")
+    print(f"Ingestion complete:")
+    print(f"  Files processed: {len(results)}")
+    print(f"  Ingested: {ingested}")
+    print(f"  Skipped (unchanged): {skipped}")
+    print(f"  Errors: {errors}")
+    print(f"  Total chunks created: {total_chunks}")
+    print(f"{'='*50}")
+
+    if errors:
+        print("\nErrors:")
+        for r in results:
+            if r["status"] == "error":
+                print(f"  {r['file']}: {r['error']}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/query_test.py
+++ b/scripts/query_test.py
@@ -0,0 +1,76 @@
+"""CLI script to run test prompts and compare baseline vs enriched."""
+
+import argparse
+import sys
+from pathlib import Path
+
+import yaml
+
+# Add src to path
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+
+from atocore.context.builder import build_context
+from atocore.models.database import init_db
+from atocore.observability.logger import setup_logging
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Run test prompts against AtoCore")
+    parser.add_argument(
+        "--prompts",
+        default=str(Path(__file__).parent.parent / "tests" / "test_prompts" / "prompts.yaml"),
+        help="Path to prompts YAML file",
+    )
+    args = parser.parse_args()
+
+    setup_logging()
+    init_db()
+
+    prompts_path = Path(args.prompts)
+    if not prompts_path.exists():
+        print(f"Error: {prompts_path} not found")
+        sys.exit(1)
+
+    with open(prompts_path) as f:
+        data = yaml.safe_load(f)
+
+    prompts = data.get("prompts", [])
+    print(f"Running {len(prompts)} test prompts...\n")
+
+    for p in prompts:
+        prompt_id = p["id"]
+        prompt_text = p["prompt"]
+        project = p.get("project")
+        expected = p.get("expected", "")
+
+        print(f"{'='*60}")
+        print(f"[{prompt_id}] {prompt_text}")
+        print(f"Project: {project or 'none'}")
+        print(f"Expected: {expected}")
+        print(f"-" * 60)
+
+        pack = build_context(
+            user_prompt=prompt_text,
+            project_hint=project,
+        )
+
+        print(f"Chunks retrieved: {len(pack.chunks_used)}")
+        print(f"Total chars: {pack.total_chars} / {pack.budget}")
+        print(f"Duration: {pack.duration_ms}ms")
+        print()
+
+        for i, chunk in enumerate(pack.chunks_used[:5]):
+            print(f"  [{i+1}] Score: {chunk.score:.2f} | {chunk.source_file}")
+            print(f"      Section: {chunk.heading_path}")
+            print(f"      Preview: {chunk.content[:120]}...")
+            print()
+
+        print(f"Full prompt length: {len(pack.full_prompt)} chars")
+        print()
+
+    print(f"{'='*60}")
+    print("Done. Review output above to assess retrieval quality.")
+
+
+if __name__ == "__main__":
+    main()