"""Ingestion pipeline: parse → chunk → embed → store."""

import hashlib
import json
import threading
import time
import uuid
from contextlib import contextmanager
from pathlib import Path

import atocore.config as _config
from atocore.ingestion.chunker import chunk_markdown
from atocore.ingestion.parser import parse_markdown
from atocore.models.database import get_connection
from atocore.observability.logger import get_logger
from atocore.retrieval.vector_store import get_vector_store

log = get_logger("ingestion")

# Encodings to try, in order, when reading markdown files.
#
# Order matters:
# * "utf-8-sig" comes first because it decodes plain UTF-8 as well as UTF-8
#   with a byte-order mark, stripping the BOM. Trying plain "utf-8" first
#   would succeed on BOM files and leave a stray U+FEFF at the start of the
#   content.
# * "cp1252" comes before "latin-1" because latin-1 maps every byte and
#   therefore never fails; anything listed after it would be unreachable.
_ENCODINGS = ["utf-8-sig", "cp1252", "latin-1"]

# Lower-case file suffixes treated as markdown. Shared by ingest_file and
# ingest_folder so both apply the same (case-insensitive) rule.
_MARKDOWN_SUFFIXES = (".md", ".markdown")

_INGESTION_LOCK = threading.Lock()


@contextmanager
def exclusive_ingestion():
    """Serialize long-running ingestion operations across API requests.

    Holds the module-level ingestion lock for the duration of the ``with``
    body and releases it on exit, including when the body raises.
    """
    with _INGESTION_LOCK:
        yield


def ingest_file(file_path: Path) -> dict:
    """Ingest a single markdown file. Returns stats.

    Args:
        file_path: Path to a ``.md`` / ``.markdown`` file (suffix compared
            case-insensitively). It is resolved before use.

    Returns:
        ``{"file", "status": "skipped", "reason": "unchanged"}`` when the
        stored hash matches the file's current content; otherwise
        ``{"file", "status", "chunks", "duration_ms"}`` where ``status`` is
        ``"ingested"`` or ``"empty"`` (no chunks produced).

    Raises:
        FileNotFoundError: If the resolved path does not exist.
        ValueError: If the file suffix is not a markdown suffix.
        Exception: Any error raised while writing to the database or the
            vector store is re-raised after best-effort vector cleanup.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed
    # (or go negative) if the wall clock is adjusted mid-ingestion.
    start = time.perf_counter()
    file_path = file_path.resolve()

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    if file_path.suffix.lower() not in _MARKDOWN_SUFFIXES:
        raise ValueError(f"Not a markdown file: {file_path}")

    # Read with encoding fallback
    raw_content = _read_file_safe(file_path)
    file_hash = hashlib.sha256(raw_content.encode("utf-8")).hexdigest()

    # Check if already ingested and unchanged
    with get_connection() as conn:
        existing = conn.execute(
            "SELECT id, file_hash FROM source_documents WHERE file_path = ?",
            (str(file_path),),
        ).fetchone()

    if existing and existing["file_hash"] == file_hash:
        log.info("file_skipped_unchanged", file_path=str(file_path))
        return {"file": str(file_path), "status": "skipped", "reason": "unchanged"}

    # Parse
    parsed = parse_markdown(file_path, text=raw_content)

    # Chunk
    base_meta = {
        "source_file": str(file_path),
        "tags": parsed.tags,
        "title": parsed.title,
    }
    chunks = chunk_markdown(parsed.body, base_metadata=base_meta)

    # Store in DB and vector store
    doc_id = str(uuid.uuid4())
    vector_store = get_vector_store()
    old_chunk_ids: list[str] = []
    new_chunk_ids: list[str] = []

    try:
        with get_connection() as conn:
            # Remove old data if re-ingesting
            if existing:
                # Keep the existing document id so the row is updated in place.
                doc_id = existing["id"]
                old_chunk_ids = [
                    row["id"]
                    for row in conn.execute(
                        "SELECT id FROM source_chunks WHERE document_id = ?",
                        (doc_id,),
                    ).fetchall()
                ]
                conn.execute(
                    "DELETE FROM source_chunks WHERE document_id = ?", (doc_id,)
                )
                conn.execute(
                    "UPDATE source_documents SET file_hash = ?, title = ?, tags = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                    (file_hash, parsed.title, json.dumps(parsed.tags), doc_id),
                )
            else:
                conn.execute(
                    "INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags) VALUES (?, ?, ?, ?, ?, ?)",
                    (doc_id, str(file_path), file_hash, parsed.title, "markdown", json.dumps(parsed.tags)),
                )

            if not chunks:
                log.warning("no_chunks_created", file_path=str(file_path))
            else:
                # Insert chunks
                chunk_contents = []
                chunk_metadatas = []

                for chunk in chunks:
                    chunk_id = str(uuid.uuid4())
                    new_chunk_ids.append(chunk_id)
                    chunk_contents.append(chunk.content)
                    chunk_metadatas.append({
                        "document_id": doc_id,
                        "heading_path": chunk.heading_path,
                        "source_file": str(file_path),
                        "tags": json.dumps(parsed.tags),
                        "title": parsed.title,
                    })

                    conn.execute(
                        "INSERT INTO source_chunks (id, document_id, chunk_index, content, heading_path, char_count, metadata) VALUES (?, ?, ?, ?, ?, ?, ?)",
                        (
                            chunk_id,
                            doc_id,
                            chunk.chunk_index,
                            chunk.content,
                            chunk.heading_path,
                            chunk.char_count,
                            json.dumps(chunk.metadata),
                        ),
                    )

                # Add new vectors before commit so DB can still roll back on failure.
                vector_store.add(new_chunk_ids, chunk_contents, chunk_metadatas)
    except Exception:
        if new_chunk_ids:
            # Best-effort cleanup: a failure here must not replace the
            # original ingestion error that the caller needs to see.
            try:
                vector_store.delete(new_chunk_ids)
            except Exception as cleanup_error:
                log.warning(
                    "vector_cleanup_failed",
                    file_path=str(file_path),
                    error=str(cleanup_error),
                )
        raise

    # Delete stale vectors only after the DB transaction committed.
    if old_chunk_ids:
        vector_store.delete(old_chunk_ids)

    duration_ms = int((time.perf_counter() - start) * 1000)
    if chunks:
        log.info(
            "file_ingested",
            file_path=str(file_path),
            chunks_created=len(chunks),
            duration_ms=duration_ms,
        )
    else:
        log.info(
            "file_ingested_empty",
            file_path=str(file_path),
            duration_ms=duration_ms,
        )

    return {
        "file": str(file_path),
        "status": "ingested" if chunks else "empty",
        "chunks": len(chunks),
        "duration_ms": duration_ms,
    }


def ingest_folder(folder_path: Path, purge_deleted: bool = True) -> list[dict]:
    """Ingest all markdown files in a folder recursively.

    Args:
        folder_path: Directory to scan for markdown files (``.md`` /
            ``.markdown``, suffix compared case-insensitively).
        purge_deleted: If True, remove DB/vector entries for files
            that no longer exist on disk.

    Returns:
        One result dict per discovered file (the ``ingest_file`` result, or
        ``{"file", "status": "error", "error"}`` if ingestion raised),
        followed by a ``{"status": "purged", "deleted_count"}`` entry when
        any stale documents were purged.

    Raises:
        NotADirectoryError: If the resolved ``folder_path`` is not a directory.
    """
    folder_path = folder_path.resolve()
    if not folder_path.is_dir():
        raise NotADirectoryError(f"Not a directory: {folder_path}")

    results = []
    # Match on the lower-cased suffix, exactly as ingest_file does. A
    # case-sensitive glob would miss e.g. "NOTES.MD" on POSIX, and because
    # current_paths drives the purge below, such a file could be purged from
    # the DB even though it still exists on disk.
    md_files = sorted(
        candidate
        for candidate in folder_path.rglob("*")
        if candidate.suffix.lower() in _MARKDOWN_SUFFIXES
    )
    current_paths = {str(f.resolve()) for f in md_files}
    log.info("ingestion_started", folder=str(folder_path), file_count=len(md_files))

    # Ingest new/changed files
    for md_file in md_files:
        try:
            result = ingest_file(md_file)
            results.append(result)
        except Exception as e:
            # One bad file must not abort the whole folder; record and go on.
            log.error("ingestion_error", file_path=str(md_file), error=str(e))
            results.append({"file": str(md_file), "status": "error", "error": str(e)})

    # Purge entries for deleted files
    if purge_deleted:
        deleted = _purge_deleted_files(folder_path, current_paths)
        if deleted:
            log.info("purged_deleted_files", count=deleted)
            results.append({"status": "purged", "deleted_count": deleted})

    return results


def get_source_status() -> list[dict]:
    """Describe configured source directories and their readiness.

    Returns:
        One dict per entry in ``settings.source_specs`` with the keys
        ``name``, ``enabled``, ``path`` (as a string), ``exists``,
        ``is_dir`` and ``read_only``.
    """
    sources = []
    for spec in _config.settings.source_specs:
        path = spec["path"]
        # Type-narrowing check: the filesystem probes below need a Path.
        assert isinstance(path, Path)
        sources.append(
            {
                "name": spec["name"],
                "enabled": spec["enabled"],
                "path": str(path),
                "exists": path.exists(),
                "is_dir": path.is_dir(),
                "read_only": spec["read_only"],
            }
        )
    return sources


def ingest_configured_sources(purge_deleted: bool = False) -> list[dict]:
    """Ingest enabled source directories declared in config.

    Purge is disabled by default here because sources are intended to be
    read-only inputs and should not be treated as the primary writable state.

    Args:
        purge_deleted: Forwarded to ``ingest_folder`` for every source.

    Returns:
        One dict per configured source with ``source``, ``status`` and
        ``path``. ``status`` is ``"disabled"``, ``"missing"`` (path absent
        or not a directory) or ``"ingested"``; ingested sources also carry
        the per-file ``results`` list from ``ingest_folder``.
    """
    results = []
    for source in get_source_status():
        if not source["enabled"]:
            results.append({"source": source["name"], "status": "disabled", "path": source["path"]})
            continue
        if not source["exists"] or not source["is_dir"]:
            results.append({"source": source["name"], "status": "missing", "path": source["path"]})
            continue

        folder_results = ingest_folder(Path(source["path"]), purge_deleted=purge_deleted)
        results.append(
            {
                "source": source["name"],
                "status": "ingested",
                "path": source["path"],
                "results": folder_results,
            }
        )
    return results


def get_ingestion_stats() -> dict:
    """Return ingestion statistics.

    Returns:
        A dict with ``total_documents`` and ``total_chunks`` (row counts),
        ``total_vectors`` (the vector store's ``count``) and
        ``recent_documents``: up to five documents ordered by most recent
        ``updated_at``, each with ``file_path``, ``title`` and
        ``ingested_at``.
    """
    with get_connection() as conn:
        docs = conn.execute("SELECT COUNT(*) as c FROM source_documents").fetchone()
        chunks = conn.execute("SELECT COUNT(*) as c FROM source_chunks").fetchone()
        recent = conn.execute(
            "SELECT file_path, title, ingested_at FROM source_documents "
            "ORDER BY updated_at DESC LIMIT 5"
        ).fetchall()

    vector_store = get_vector_store()
    return {
        "total_documents": docs["c"],
        "total_chunks": chunks["c"],
        "total_vectors": vector_store.count,
        "recent_documents": [
            {"file_path": r["file_path"], "title": r["title"], "ingested_at": r["ingested_at"]}
            for r in recent
        ],
    }


def _read_file_safe(file_path: Path) -> str:
    """Read a file with encoding fallback.

    Tries each entry of ``_ENCODINGS`` in order and returns the first
    successful decode.
    """
    for encoding in _ENCODINGS:
        try:
            return file_path.read_text(encoding=encoding)
        except (UnicodeDecodeError, ValueError):
            continue
    # Last resort: read with errors replaced. Only reachable if _ENCODINGS
    # no longer ends with an encoding that accepts every byte.
    return file_path.read_text(encoding="utf-8", errors="replace")


def _purge_deleted_files(folder_path: Path, current_paths: set[str]) -> int:
    """Remove DB/vector entries for files under folder_path that no longer exist.

    Args:
        folder_path: Folder whose documents are candidates for purging.
            Documents stored under other folders are left untouched.
        current_paths: Path strings of the files currently present; any
            stored document under ``folder_path`` not in this set is purged.

    Returns:
        Number of documents removed.
    """
    deleted_count = 0
    vector_store = get_vector_store()
    chunk_ids_to_delete: list[str] = []

    with get_connection() as conn:
        rows = conn.execute(
            "SELECT id, file_path FROM source_documents"
        ).fetchall()

        for row in rows:
            doc_path = Path(row["file_path"])
            try:
                # Raises ValueError when the document is outside folder_path.
                doc_path.relative_to(folder_path)
            except ValueError:
                continue
            if row["file_path"] not in current_paths:
                doc_id = row["id"]
                chunk_ids_to_delete.extend(
                    r["id"]
                    for r in conn.execute(
                        "SELECT id FROM source_chunks WHERE document_id = ?",
                        (doc_id,),
                    ).fetchall()
                )
                conn.execute("DELETE FROM source_chunks WHERE document_id = ?", (doc_id,))
                conn.execute("DELETE FROM source_documents WHERE id = ?", (doc_id,))
                log.info("purged_deleted_file", file_path=row["file_path"])
                deleted_count += 1

    # Drop the vectors only once the DB transaction block has finished,
    # mirroring the stale-vector handling in ingest_file.
    if chunk_ids_to_delete:
        vector_store.delete(chunk_ids_to_delete)

    return deleted_count