Files
ATOCore/scripts/ingest_folder.py
Anto01 b4afbbb53a feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation:
- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:21:27 -04:00

55 lines
1.5 KiB
Python

"""CLI script to ingest a folder of markdown files."""
import argparse
import json
import sys
from pathlib import Path
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from atocore.ingestion.pipeline import ingest_folder
from atocore.models.database import init_db
from atocore.observability.logger import setup_logging
def main():
    """Ingest a folder of markdown files and print a summary report.

    Reads the required ``--path`` command-line argument, checks that it names
    an existing directory, then initializes logging and the database and
    passes the folder to ``ingest_folder``. Each result record is read as a
    dict with a ``status`` key (``"ingested"``, ``"skipped"`` or ``"error"``),
    an optional ``chunks`` count and, for errors, ``file`` and ``error`` keys.

    Raises:
        SystemExit: with status 1 when ``--path`` is not a directory (argparse
            also exits on its own when the arguments are malformed).
    """
    parser = argparse.ArgumentParser(description="Ingest markdown files into AtoCore")
    parser.add_argument("--path", required=True, help="Path to folder with markdown files")
    args = parser.parse_args()

    # Validate the argument before touching logging or the database, so that a
    # bad path has no side effects beyond the error message.
    folder = Path(args.path)
    if not folder.is_dir():
        print(f"Error: {folder} is not a directory", file=sys.stderr)
        sys.exit(1)

    setup_logging()
    init_db()

    results = ingest_folder(folder)

    # Summary: collect statuses once; .get() tolerates a record without one.
    statuses = [r.get("status") for r in results]
    ingested = statuses.count("ingested")
    skipped = statuses.count("skipped")
    errors = statuses.count("error")
    total_chunks = sum(r.get("chunks", 0) for r in results)

    separator = "=" * 50
    print(f"\n{separator}")
    print("Ingestion complete:")
    print(f" Files processed: {len(results)}")
    print(f" Ingested: {ingested}")
    print(f" Skipped (unchanged): {skipped}")
    print(f" Errors: {errors}")
    print(f" Total chunks created: {total_chunks}")
    print(separator)

    if errors:
        print("\nErrors:")
        for r in results:
            if r.get("status") == "error":
                # Fall back to placeholders rather than crash mid-report.
                print(f" {r.get('file', '<unknown file>')}: {r.get('error', '<no details>')}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()