# src/atocore/ingestion/pipeline.py

"""Ingestion pipeline: parse → chunk → embed → store."""

import hashlib
import json
import threading
import time
import uuid
from contextlib import contextmanager
from pathlib import Path

import atocore.config as _config
from atocore.ingestion.chunker import chunk_markdown
from atocore.ingestion.parser import parse_markdown
from atocore.models.database import get_connection
from atocore.observability.logger import get_logger
from atocore.retrieval.vector_store import get_vector_store

# Logger for this module, requested under the name "ingestion".
log = get_logger("ingestion")

# Encodings to try, in order, when reading markdown files. The first encoding
# that decodes without error wins, so the order is significant:
# - "utf-8-sig" goes first: it decodes plain UTF-8 and additionally strips a
#   leading BOM, which plain "utf-8" would keep in the text as U+FEFF.
# - "cp1252" goes before "latin-1": cp1252 rejects a handful of undefined
#   bytes, whereas latin-1 accepts every byte value and therefore has to be
#   last (nothing listed after it can ever be tried).
_ENCODINGS = ["utf-8-sig", "utf-8", "cp1252", "latin-1"]
# Single module-wide lock; exclusive_ingestion() holds it to serialize runs.
_INGESTION_LOCK = threading.Lock()


@contextmanager
def exclusive_ingestion():
    """Hold the module-wide ingestion lock for the duration of a ``with`` body.

    Entering blocks until the lock is free, so only one caller at a time runs
    the guarded code (intended for long-running ingestion triggered by API
    requests). The lock is released when the body exits, including when it
    exits by raising.
    """
    with _INGESTION_LOCK:
        yield


def ingest_file(file_path: Path) -> dict:
    """Ingest a single markdown file and return a stats dict.

    The file is read, hashed, parsed and chunked; the document row, its chunk
    rows and the chunk vectors are then written. A file whose stored hash
    equals the hash of the current content is skipped without any writes.

    Args:
        file_path: Path to a ``.md`` / ``.markdown`` file (the suffix check is
            case-insensitive). The path is resolved before use.

    Returns:
        For an unchanged file:
        ``{"file", "status": "skipped", "reason": "unchanged"}``.
        Otherwise ``{"file", "status", "chunks", "duration_ms"}`` where
        ``status`` is ``"ingested"``, or ``"empty"`` when chunking produced
        no chunks.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
        ValueError: If the suffix is not ``.md`` or ``.markdown``.
        Exception: Any error raised while writing is re-raised unchanged
            after a best-effort removal of the newly added vectors.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed
    # (or go negative) if the wall clock is adjusted mid-ingest.
    start = time.perf_counter()
    file_path = file_path.resolve()
    path_str = str(file_path)

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    if file_path.suffix.lower() not in (".md", ".markdown"):
        raise ValueError(f"Not a markdown file: {file_path}")

    # Read with encoding fallback; the hash is taken over the decoded text.
    raw_content = _read_file_safe(file_path)
    file_hash = hashlib.sha256(raw_content.encode("utf-8")).hexdigest()

    # Check if already ingested and unchanged
    with get_connection() as conn:
        existing = conn.execute(
            "SELECT id, file_hash FROM source_documents WHERE file_path = ?",
            (path_str,),
        ).fetchone()

        if existing and existing["file_hash"] == file_hash:
            log.info("file_skipped_unchanged", file_path=path_str)
            return {"file": path_str, "status": "skipped", "reason": "unchanged"}

    # Parse
    parsed = parse_markdown(file_path, text=raw_content)
    tags_json = json.dumps(parsed.tags)

    # Chunk
    base_meta = {
        "source_file": path_str,
        "tags": parsed.tags,
        "title": parsed.title,
    }
    chunks = chunk_markdown(parsed.body, base_metadata=base_meta)

    # Store in DB and vector store
    doc_id = str(uuid.uuid4())
    vector_store = get_vector_store()
    old_chunk_ids: list[str] = []
    new_chunk_ids: list[str] = []

    try:
        # NOTE(review): the ordering below assumes get_connection() commits on
        # a clean exit and rolls back when the body raises - confirm.
        with get_connection() as conn:
            if existing:
                # Re-ingest: keep the document id, remember the old chunk ids
                # (their vectors are removed after the commit) and replace
                # the chunk rows.
                doc_id = existing["id"]
                old_chunk_ids = [
                    row["id"]
                    for row in conn.execute(
                        "SELECT id FROM source_chunks WHERE document_id = ?",
                        (doc_id,),
                    ).fetchall()
                ]
                conn.execute(
                    "DELETE FROM source_chunks WHERE document_id = ?", (doc_id,)
                )
                conn.execute(
                    "UPDATE source_documents SET file_hash = ?, title = ?, tags = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                    (file_hash, parsed.title, tags_json, doc_id),
                )
            else:
                conn.execute(
                    "INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags) VALUES (?, ?, ?, ?, ?, ?)",
                    (doc_id, path_str, file_hash, parsed.title, "markdown", tags_json),
                )

            if not chunks:
                log.warning("no_chunks_created", file_path=path_str)
            else:
                # Insert chunk rows and collect the parallel lists that the
                # vector store receives in a single call.
                chunk_contents = []
                chunk_metadatas = []

                for chunk in chunks:
                    chunk_id = str(uuid.uuid4())
                    new_chunk_ids.append(chunk_id)
                    chunk_contents.append(chunk.content)
                    chunk_metadatas.append({
                        "document_id": doc_id,
                        "heading_path": chunk.heading_path,
                        "source_file": path_str,
                        "tags": tags_json,
                        "title": parsed.title,
                    })

                    conn.execute(
                        "INSERT INTO source_chunks (id, document_id, chunk_index, content, heading_path, char_count, metadata) VALUES (?, ?, ?, ?, ?, ?, ?)",
                        (
                            chunk_id,
                            doc_id,
                            chunk.chunk_index,
                            chunk.content,
                            chunk.heading_path,
                            chunk.char_count,
                            json.dumps(chunk.metadata),
                        ),
                    )

                # Add new vectors before commit so DB can still roll back on failure.
                vector_store.add(new_chunk_ids, chunk_contents, chunk_metadatas)
    except Exception:
        # Undo any vectors that may have been added. The cleanup is
        # best-effort: if it fails too, log it and still re-raise the
        # original error so the root cause is what the caller sees.
        if new_chunk_ids:
            try:
                vector_store.delete(new_chunk_ids)
            except Exception as cleanup_error:
                log.error(
                    "vector_cleanup_failed",
                    file_path=path_str,
                    error=str(cleanup_error),
                )
        raise

    # Delete stale vectors only after the DB transaction committed.
    if old_chunk_ids:
        vector_store.delete(old_chunk_ids)

    duration_ms = int((time.perf_counter() - start) * 1000)
    if chunks:
        log.info(
            "file_ingested",
            file_path=path_str,
            chunks_created=len(chunks),
            duration_ms=duration_ms,
        )
    else:
        log.info(
            "file_ingested_empty",
            file_path=path_str,
            duration_ms=duration_ms,
        )

    return {
        "file": path_str,
        "status": "ingested" if chunks else "empty",
        "chunks": len(chunks),
        "duration_ms": duration_ms,
    }


def ingest_folder(folder_path: Path, purge_deleted: bool = True) -> list[dict]:
    """Ingest all markdown files in a folder recursively.

    Files are selected by suffix, case-insensitively (``.md`` / ``.markdown``),
    which is the same rule ``ingest_file`` applies. Using one rule matters for
    purging: a file such as ``NOTES.MD`` must count as present on disk,
    otherwise its stored entry would be purged although the file still exists.

    Args:
        folder_path: Directory to scan for markdown files.
        purge_deleted: If True, remove DB/vector entries for files
                       that no longer exist on disk.

    Returns:
        One result dict per file, in sorted path order: the ``ingest_file``
        result, or ``{"file", "status": "error", "error"}`` when ingesting
        that file raised. If a purge removed anything, a final
        ``{"status": "purged", "deleted_count"}`` entry is appended.

    Raises:
        NotADirectoryError: If the resolved ``folder_path`` is not a directory.
    """
    folder_path = folder_path.resolve()
    if not folder_path.is_dir():
        raise NotADirectoryError(f"Not a directory: {folder_path}")

    markdown_suffixes = (".md", ".markdown")
    results = []
    # A single recursive walk filtered on the lower-cased suffix, so the match
    # does not depend on the platform's glob case sensitivity.
    md_files = sorted(
        candidate
        for candidate in folder_path.rglob("*")
        if candidate.suffix.lower() in markdown_suffixes
    )
    current_paths = {str(f.resolve()) for f in md_files}
    log.info("ingestion_started", folder=str(folder_path), file_count=len(md_files))

    # Ingest new/changed files; one failing file must not abort the batch.
    for md_file in md_files:
        try:
            result = ingest_file(md_file)
            results.append(result)
        except Exception as e:
            log.error("ingestion_error", file_path=str(md_file), error=str(e))
            results.append({"file": str(md_file), "status": "error", "error": str(e)})

    # Purge entries for deleted files
    if purge_deleted:
        deleted = _purge_deleted_files(folder_path, current_paths)
        if deleted:
            log.info("purged_deleted_files", count=deleted)
            results.append({"status": "purged", "deleted_count": deleted})

    return results


def get_source_status() -> list[dict]:
    """Report every configured source directory and its current state.

    Returns one dict per entry of ``settings.source_specs``, in order, with
    the spec's ``name``, ``enabled`` and ``read_only`` values, the path as a
    string, and whether that path currently exists and is a directory.
    """

    def describe(spec) -> dict:
        # Summarize a single source spec.
        source_dir = spec["path"]
        assert isinstance(source_dir, Path)
        return {
            "name": spec["name"],
            "enabled": spec["enabled"],
            "path": str(source_dir),
            "exists": source_dir.exists(),
            "is_dir": source_dir.is_dir(),
            "read_only": spec["read_only"],
        }

    return [describe(spec) for spec in _config.settings.source_specs]


def ingest_configured_sources(purge_deleted: bool = False) -> list[dict]:
    """Run folder ingestion for each enabled, existing source from config.

    Every configured source yields exactly one entry: ``"disabled"`` when it
    is switched off, ``"missing"`` when its path does not exist or is not a
    directory, otherwise ``"ingested"`` together with the per-file results of
    ``ingest_folder``.

    Purging defaults to off here because sources are meant to be read-only
    inputs rather than the primary writable state.
    """
    outcomes = []
    for src in get_source_status():
        name = src["name"]
        location = src["path"]

        if not src["enabled"]:
            outcomes.append({"source": name, "status": "disabled", "path": location})
        elif not (src["exists"] and src["is_dir"]):
            outcomes.append({"source": name, "status": "missing", "path": location})
        else:
            per_file = ingest_folder(Path(location), purge_deleted=purge_deleted)
            outcomes.append(
                {
                    "source": name,
                    "status": "ingested",
                    "path": location,
                    "results": per_file,
                }
            )
    return outcomes


def get_ingestion_stats() -> dict:
    """Summarize what has been ingested so far.

    Returns a dict with the document and chunk row counts, the vector store's
    ``count``, and up to five documents ordered by ``updated_at`` descending,
    each reduced to ``file_path``, ``title`` and ``ingested_at``.
    """
    with get_connection() as conn:
        doc_row = conn.execute("SELECT COUNT(*) as c FROM source_documents").fetchone()
        chunk_row = conn.execute("SELECT COUNT(*) as c FROM source_chunks").fetchone()
        latest_rows = conn.execute(
            "SELECT file_path, title, ingested_at FROM source_documents ORDER BY updated_at DESC LIMIT 5"
        ).fetchall()

    store = get_vector_store()
    fields = ("file_path", "title", "ingested_at")
    latest = [{field: row[field] for field in fields} for row in latest_rows]

    return {
        "total_documents": doc_row["c"],
        "total_chunks": chunk_row["c"],
        "total_vectors": store.count,
        "recent_documents": latest,
    }


def _read_file_safe(file_path: Path) -> str:
    """Decode a file to text, trying each candidate encoding in turn.

    Returns the text produced by the first encoding in ``_ENCODINGS`` that
    decodes the file without error. If every candidate fails, the file is
    decoded as UTF-8 with undecodable bytes replaced.
    """
    for candidate in _ENCODINGS:
        try:
            text = file_path.read_text(encoding=candidate)
        except (UnicodeDecodeError, ValueError):
            # This encoding does not fit the file; move on to the next one.
            continue
        else:
            return text

    # Nothing decoded cleanly: accept replacement characters over failing.
    return file_path.read_text(encoding="utf-8", errors="replace")


def _purge_deleted_files(folder_path: Path, current_paths: set[str]) -> int:
    """Remove DB/vector entries for files under folder_path that no longer exist.

    Args:
        folder_path: Folder whose stored documents are examined; documents
            stored under other folders are left untouched.
        current_paths: Path strings of the files currently on disk. They are
            compared to the stored ``file_path`` values by exact string
            equality.

    Returns:
        The number of documents removed.
    """
    deleted_count = 0
    # Obtained before any DB work, so a vector-store failure here happens
    # before anything has been deleted.
    vector_store = get_vector_store()
    chunk_ids_to_delete: list[str] = []

    with get_connection() as conn:
        rows = conn.execute(
            "SELECT id, file_path FROM source_documents"
        ).fetchall()

        for row in rows:
            stored_path = row["file_path"]
            # Cheap set lookup first: a file that is still present is kept.
            if stored_path in current_paths:
                continue
            # Only documents located under the scanned folder may be purged.
            if not Path(stored_path).is_relative_to(folder_path):
                continue

            doc_id = row["id"]
            chunk_ids_to_delete.extend(
                r["id"]
                for r in conn.execute(
                    "SELECT id FROM source_chunks WHERE document_id = ?",
                    (doc_id,),
                ).fetchall()
            )
            conn.execute("DELETE FROM source_chunks WHERE document_id = ?", (doc_id,))
            conn.execute("DELETE FROM source_documents WHERE id = ?", (doc_id,))
            log.info("purged_deleted_file", file_path=stored_path)
            deleted_count += 1

    # Vectors are removed in one call, after the connection block has exited.
    if chunk_ids_to_delete:
        vector_store.delete(chunk_ids_to_delete)

    return deleted_count