# src/atocore/ingestion/pipeline.py

"""Ingestion pipeline: parse → chunk → embed → store."""

import hashlib
import json
import time
import uuid
from pathlib import Path

from atocore.config import settings
from atocore.ingestion.chunker import chunk_markdown
from atocore.ingestion.parser import parse_markdown
from atocore.models.database import get_connection
from atocore.observability.logger import get_logger
from atocore.retrieval.vector_store import get_vector_store

# Logger shared by the ingestion functions in this module, obtained under the name "ingestion".
log = get_logger("ingestion")


def ingest_file(file_path: Path) -> dict:
    """Ingest a single markdown file and return a stats dict.

    The file is read as UTF-8 and hashed (SHA-256 of its text). If a row for the
    same resolved path already exists in ``source_documents`` with the same
    hash, the file is skipped. Otherwise it is parsed, chunked, written to
    ``source_documents`` / ``source_chunks`` and its chunks are added to the
    vector store. When the document already existed, its previous chunk rows
    and vectors are replaced.

    Args:
        file_path: Path to a ``.md`` / ``.markdown`` file (the suffix check is
            case-insensitive). The path is resolved before use and the
            resolved string is what gets stored and returned.

    Returns:
        A dict that always contains ``"file"`` and ``"status"``:

        * ``{"status": "skipped", "reason": "unchanged"}`` when the stored
          hash matches,
        * ``{"status": "empty", "chunks": 0}`` when chunking produced nothing,
        * ``{"status": "ingested", "chunks": <int>, "duration_ms": <int>}``
          on success.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
        ValueError: If the suffix is not ``.md`` or ``.markdown``.
    """
    start = time.time()
    file_path = file_path.resolve()
    path_str = str(file_path)

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    if file_path.suffix.lower() not in (".md", ".markdown"):
        raise ValueError(f"Not a markdown file: {file_path}")

    # Read and hash
    raw_content = file_path.read_text(encoding="utf-8")
    file_hash = hashlib.sha256(raw_content.encode()).hexdigest()

    # Check if already ingested and unchanged
    with get_connection() as conn:
        existing = conn.execute(
            "SELECT id, file_hash FROM source_documents WHERE file_path = ?",
            (path_str,),
        ).fetchone()

    if existing and existing["file_hash"] == file_hash:
        log.info("file_skipped_unchanged", file_path=path_str)
        return {"file": path_str, "status": "skipped", "reason": "unchanged"}

    # Parse
    parsed = parse_markdown(file_path)

    # Chunk
    base_meta = {
        "source_file": path_str,
        "tags": parsed.tags,
        "title": parsed.title,
    }
    chunks = chunk_markdown(parsed.body, base_metadata=base_meta)

    if not chunks:
        # NOTE(review): if this file was ingested before and now yields no
        # chunks, its old rows and vectors are left in place because we return
        # before any write. Confirm whether they should be purged here.
        log.warning("no_chunks_created", file_path=path_str)
        return {"file": path_str, "status": "empty", "chunks": 0}

    # Serialise the tags once; the same JSON string is stored on the document
    # row and in every chunk's vector metadata. Doing it before any write also
    # means a non-serialisable tag fails before the database is touched.
    tags_json = json.dumps(parsed.tags)

    # Re-use the existing document id when re-ingesting so the row is updated
    # in place; otherwise mint a new one.
    doc_id = existing["id"] if existing else str(uuid.uuid4())
    vector_store = get_vector_store()
    old_chunk_ids = []

    with get_connection() as conn:
        if existing:
            # Remember which chunk ids are being replaced so their vectors can
            # be removed once the new data is safely in place.
            old_chunk_ids = [
                row["id"]
                for row in conn.execute(
                    "SELECT id FROM source_chunks WHERE document_id = ?",
                    (doc_id,),
                ).fetchall()
            ]
            conn.execute(
                "DELETE FROM source_chunks WHERE document_id = ?", (doc_id,)
            )
            conn.execute(
                "UPDATE source_documents SET file_hash = ?, title = ?, tags = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                (file_hash, parsed.title, tags_json, doc_id),
            )
        else:
            conn.execute(
                "INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags) VALUES (?, ?, ?, ?, ?, ?)",
                (doc_id, path_str, file_hash, parsed.title, "markdown", tags_json),
            )

        # Insert chunks
        chunk_ids = []
        chunk_contents = []
        chunk_metadatas = []

        for chunk in chunks:
            chunk_id = str(uuid.uuid4())
            chunk_ids.append(chunk_id)
            chunk_contents.append(chunk.content)
            chunk_metadatas.append({
                "document_id": doc_id,
                "heading_path": chunk.heading_path,
                "source_file": path_str,
                "tags": tags_json,
                "title": parsed.title,
            })

            conn.execute(
                "INSERT INTO source_chunks (id, document_id, chunk_index, content, heading_path, char_count, metadata) VALUES (?, ?, ?, ?, ?, ?, ?)",
                (
                    chunk_id,
                    doc_id,
                    chunk.chunk_index,
                    chunk.content,
                    chunk.heading_path,
                    chunk.char_count,
                    json.dumps(chunk.metadata),
                ),
            )

        # Touch the vector store only after every SQL statement has succeeded,
        # and add the new vectors before dropping the old ones. A failure in
        # the inserts or in embedding then leaves the previous vectors intact
        # instead of stripping them up front.
        # NOTE(review): assumes get_connection() rolls back on an exception
        # raised inside the ``with`` block - confirm.
        vector_store.add(chunk_ids, chunk_contents, chunk_metadatas)
        if old_chunk_ids:
            vector_store.delete(old_chunk_ids)

    duration_ms = int((time.time() - start) * 1000)
    log.info(
        "file_ingested",
        file_path=path_str,
        chunks_created=len(chunks),
        duration_ms=duration_ms,
    )

    return {
        "file": path_str,
        "status": "ingested",
        "chunks": len(chunks),
        "duration_ms": duration_ms,
    }


def ingest_folder(folder_path: Path) -> list[dict]:
    """Ingest every markdown file under a folder, recursively.

    Files are selected with the same rule ``ingest_file`` enforces: a suffix of
    ``.md`` or ``.markdown``, compared case-insensitively. Only regular files
    are considered. Files are processed in sorted path order, and a failure on
    one file is logged and recorded without stopping the run.

    Args:
        folder_path: Directory to scan. It is resolved before use.

    Returns:
        One result dict per file, in processing order. Successful entries are
        whatever ``ingest_file`` returned; failures are
        ``{"file": <path>, "status": "error", "error": <message>}``.

    Raises:
        NotADirectoryError: If the resolved path is not a directory.
    """
    folder_path = folder_path.resolve()
    if not folder_path.is_dir():
        raise NotADirectoryError(f"Not a directory: {folder_path}")

    # Filter on the suffix instead of globbing "*.md" so that ".markdown" and
    # upper-case variants, which ingest_file accepts, are not silently skipped.
    # The is_file() check keeps directories named like "notes.md" out of the
    # list.
    md_files = sorted(
        candidate
        for candidate in folder_path.rglob("*")
        if candidate.suffix.lower() in (".md", ".markdown") and candidate.is_file()
    )
    log.info("ingestion_started", folder=str(folder_path), file_count=len(md_files))

    results = []
    for md_file in md_files:
        # Batch boundary: one bad file must not abort the whole folder, so any
        # exception is logged and turned into an "error" result entry.
        try:
            results.append(ingest_file(md_file))
        except Exception as e:
            log.error("ingestion_error", file_path=str(md_file), error=str(e))
            results.append({"file": str(md_file), "status": "error", "error": str(e)})

    return results
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC) Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-05 09:21:27 -04:00			`"""Ingestion pipeline: parse → chunk → embed → store."""`

			`import hashlib`
			`import json`
			`import time`
			`import uuid`
			`from pathlib import Path`

			`from atocore.config import settings`
			`from atocore.ingestion.chunker import chunk_markdown`
			`from atocore.ingestion.parser import parse_markdown`
			`from atocore.models.database import get_connection`
			`from atocore.observability.logger import get_logger`
			`from atocore.retrieval.vector_store import get_vector_store`

			`log = get_logger("ingestion")`


			`def ingest_file(file_path: Path) -> dict:`
			`"""Ingest a single markdown file. Returns stats."""`
			`start = time.time()`
			`file_path = file_path.resolve()`

			`if not file_path.exists():`
			`raise FileNotFoundError(f"File not found: {file_path}")`
			`if file_path.suffix.lower() not in (".md", ".markdown"):`
			`raise ValueError(f"Not a markdown file: {file_path}")`

			`# Read and hash`
			`raw_content = file_path.read_text(encoding="utf-8")`
			`file_hash = hashlib.sha256(raw_content.encode()).hexdigest()`

			`# Check if already ingested and unchanged`
			`with get_connection() as conn:`
			`existing = conn.execute(`
			`"SELECT id, file_hash FROM source_documents WHERE file_path = ?",`
			`(str(file_path),),`
			`).fetchone()`

			`if existing and existing["file_hash"] == file_hash:`
			`log.info("file_skipped_unchanged", file_path=str(file_path))`
			`return {"file": str(file_path), "status": "skipped", "reason": "unchanged"}`

			`# Parse`
			`parsed = parse_markdown(file_path)`

			`# Chunk`
			`base_meta = {`
			`"source_file": str(file_path),`
			`"tags": parsed.tags,`
			`"title": parsed.title,`
			`}`
			`chunks = chunk_markdown(parsed.body, base_metadata=base_meta)`

			`if not chunks:`
			`log.warning("no_chunks_created", file_path=str(file_path))`
			`return {"file": str(file_path), "status": "empty", "chunks": 0}`

			`# Store in DB and vector store`
			`doc_id = str(uuid.uuid4())`
			`vector_store = get_vector_store()`

			`with get_connection() as conn:`
			`# Remove old data if re-ingesting`
			`if existing:`
			`doc_id = existing["id"]`
			`old_chunk_ids = [`
			`row["id"]`
			`for row in conn.execute(`
			`"SELECT id FROM source_chunks WHERE document_id = ?",`
			`(doc_id,),`
			`).fetchall()`
			`]`
			`conn.execute(`
			`"DELETE FROM source_chunks WHERE document_id = ?", (doc_id,)`
			`)`
			`conn.execute(`
			`"UPDATE source_documents SET file_hash = ?, title = ?, tags = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",`
			`(file_hash, parsed.title, json.dumps(parsed.tags), doc_id),`
			`)`
			`# Remove old vectors`
			`if old_chunk_ids:`
			`vector_store.delete(old_chunk_ids)`
			`else:`
			`conn.execute(`
			`"INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags) VALUES (?, ?, ?, ?, ?, ?)",`
			`(doc_id, str(file_path), file_hash, parsed.title, "markdown", json.dumps(parsed.tags)),`
			`)`

			`# Insert chunks`
			`chunk_ids = []`
			`chunk_contents = []`
			`chunk_metadatas = []`

			`for chunk in chunks:`
			`chunk_id = str(uuid.uuid4())`
			`chunk_ids.append(chunk_id)`
			`chunk_contents.append(chunk.content)`
			`chunk_metadatas.append({`
			`"document_id": doc_id,`
			`"heading_path": chunk.heading_path,`
			`"source_file": str(file_path),`
			`"tags": json.dumps(parsed.tags),`
			`"title": parsed.title,`
			`})`

			`conn.execute(`
			`"INSERT INTO source_chunks (id, document_id, chunk_index, content, heading_path, char_count, metadata) VALUES (?, ?, ?, ?, ?, ?, ?)",`
			`(`
			`chunk_id,`
			`doc_id,`
			`chunk.chunk_index,`
			`chunk.content,`
			`chunk.heading_path,`
			`chunk.char_count,`
			`json.dumps(chunk.metadata),`
			`),`
			`)`

			`# Store embeddings`
			`vector_store.add(chunk_ids, chunk_contents, chunk_metadatas)`

			`duration_ms = int((time.time() - start) * 1000)`
			`log.info(`
			`"file_ingested",`
			`file_path=str(file_path),`
			`chunks_created=len(chunks),`
			`duration_ms=duration_ms,`
			`)`

			`return {`
			`"file": str(file_path),`
			`"status": "ingested",`
			`"chunks": len(chunks),`
			`"duration_ms": duration_ms,`
			`}`


			`def ingest_folder(folder_path: Path) -> list[dict]:`
			`"""Ingest all markdown files in a folder recursively."""`
			`folder_path = folder_path.resolve()`
			`if not folder_path.is_dir():`
			`raise NotADirectoryError(f"Not a directory: {folder_path}")`

			`results = []`
			`md_files = sorted(folder_path.rglob("*.md"))`
			`log.info("ingestion_started", folder=str(folder_path), file_count=len(md_files))`

			`for md_file in md_files:`
			`try:`
			`result = ingest_file(md_file)`
			`results.append(result)`
			`except Exception as e:`
			`log.error("ingestion_error", file_path=str(md_file), error=str(e))`
			`results.append({"file": str(md_file), "status": "error", "error": str(e)})`

			`return results`