Stabilize core correctness and sync project plan state

2026-04-05 17:53:23 -04:00
parent b48f0c95ab
commit b0889b3925
20 changed files with 551 additions and 168 deletions
--- a/src/atocore/ingestion/chunker.py
+++ b/src/atocore/ingestion/chunker.py
@@ -3,7 +3,7 @@
 import re
 from dataclasses import dataclass, field

-from atocore.config import settings
+import atocore.config as _config


@dataclass
@@ -29,9 +29,9 @@ def chunk_markdown(
    3. If still > max_size, split on paragraph breaks
    4. If still > max_size, hard split with overlap
    """
-    max_size = max_size or settings.chunk_max_size
-    overlap = overlap or settings.chunk_overlap
-    min_size = min_size or settings.chunk_min_size
+    max_size = max_size or _config.settings.chunk_max_size
+    overlap = overlap or _config.settings.chunk_overlap
+    min_size = min_size or _config.settings.chunk_min_size
    base_metadata = base_metadata or {}

    sections = _split_by_heading(body, level=2)
--- a/src/atocore/ingestion/parser.py
+++ b/src/atocore/ingestion/parser.py
@@ -17,10 +17,10 @@ class ParsedDocument:
    headings: list[tuple[int, str]] = field(default_factory=list)


-def parse_markdown(file_path: Path) -> ParsedDocument:
+def parse_markdown(file_path: Path, text: str | None = None) -> ParsedDocument:
    """Parse a markdown file, extracting frontmatter and structure."""
-    text = file_path.read_text(encoding="utf-8")
-    post = frontmatter.loads(text)
+    raw_text = text if text is not None else file_path.read_text(encoding="utf-8")
+    post = frontmatter.loads(raw_text)

    meta = dict(post.metadata) if post.metadata else {}
    body = post.content.strip()
--- a/src/atocore/ingestion/pipeline.py
+++ b/src/atocore/ingestion/pipeline.py
@@ -6,7 +6,6 @@ import time
 import uuid
 from pathlib import Path

-from atocore.config import settings
 from atocore.ingestion.chunker import chunk_markdown
 from atocore.ingestion.parser import parse_markdown
 from atocore.models.database import get_connection
@@ -45,7 +44,7 @@ def ingest_file(file_path: Path) -> dict:
            return {"file": str(file_path), "status": "skipped", "reason": "unchanged"}

    # Parse
-    parsed = parse_markdown(file_path)
+    parsed = parse_markdown(file_path, text=raw_content)

    # Chunk
    base_meta = {
@@ -55,85 +54,98 @@ def ingest_file(file_path: Path) -> dict:
    }
    chunks = chunk_markdown(parsed.body, base_metadata=base_meta)

-    if not chunks:
-        log.warning("no_chunks_created", file_path=str(file_path))
-        return {"file": str(file_path), "status": "empty", "chunks": 0}
-
    # Store in DB and vector store
    doc_id = str(uuid.uuid4())
    vector_store = get_vector_store()
+    old_chunk_ids: list[str] = []
+    new_chunk_ids: list[str] = []

-    with get_connection() as conn:
-        # Remove old data if re-ingesting
-        if existing:
-            doc_id = existing["id"]
-            old_chunk_ids = [
-                row["id"]
-                for row in conn.execute(
-                    "SELECT id FROM source_chunks WHERE document_id = ?",
-                    (doc_id,),
-                ).fetchall()
-            ]
-            conn.execute(
-                "DELETE FROM source_chunks WHERE document_id = ?", (doc_id,)
-            )
-            conn.execute(
-                "UPDATE source_documents SET file_hash = ?, title = ?, tags = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
-                (file_hash, parsed.title, json.dumps(parsed.tags), doc_id),
-            )
-            # Remove old vectors
-            if old_chunk_ids:
-                vector_store.delete(old_chunk_ids)
-        else:
-            conn.execute(
-                "INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags) VALUES (?, ?, ?, ?, ?, ?)",
-                (doc_id, str(file_path), file_hash, parsed.title, "markdown", json.dumps(parsed.tags)),
-            )
+    try:
+        with get_connection() as conn:
+            # Remove old data if re-ingesting
+            if existing:
+                doc_id = existing["id"]
+                old_chunk_ids = [
+                    row["id"]
+                    for row in conn.execute(
+                        "SELECT id FROM source_chunks WHERE document_id = ?",
+                        (doc_id,),
+                    ).fetchall()
+                ]
+                conn.execute(
+                    "DELETE FROM source_chunks WHERE document_id = ?", (doc_id,)
+                )
+                conn.execute(
+                    "UPDATE source_documents SET file_hash = ?, title = ?, tags = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
+                    (file_hash, parsed.title, json.dumps(parsed.tags), doc_id),
+                )
+            else:
+                conn.execute(
+                    "INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags) VALUES (?, ?, ?, ?, ?, ?)",
+                    (doc_id, str(file_path), file_hash, parsed.title, "markdown", json.dumps(parsed.tags)),
+                )

-        # Insert chunks
-        chunk_ids = []
-        chunk_contents = []
-        chunk_metadatas = []
+            if not chunks:
+                log.warning("no_chunks_created", file_path=str(file_path))
+            else:
+                # Insert chunks
+                chunk_contents = []
+                chunk_metadatas = []

-        for chunk in chunks:
-            chunk_id = str(uuid.uuid4())
-            chunk_ids.append(chunk_id)
-            chunk_contents.append(chunk.content)
-            chunk_metadatas.append({
-                "document_id": doc_id,
-                "heading_path": chunk.heading_path,
-                "source_file": str(file_path),
-                "tags": json.dumps(parsed.tags),
-                "title": parsed.title,
-            })
+                for chunk in chunks:
+                    chunk_id = str(uuid.uuid4())
+                    new_chunk_ids.append(chunk_id)
+                    chunk_contents.append(chunk.content)
+                    chunk_metadatas.append({
+                        "document_id": doc_id,
+                        "heading_path": chunk.heading_path,
+                        "source_file": str(file_path),
+                        "tags": json.dumps(parsed.tags),
+                        "title": parsed.title,
+                    })

-            conn.execute(
-                "INSERT INTO source_chunks (id, document_id, chunk_index, content, heading_path, char_count, metadata) VALUES (?, ?, ?, ?, ?, ?, ?)",
-                (
-                    chunk_id,
-                    doc_id,
-                    chunk.chunk_index,
-                    chunk.content,
-                    chunk.heading_path,
-                    chunk.char_count,
-                    json.dumps(chunk.metadata),
-                ),
-            )
+                    conn.execute(
+                        "INSERT INTO source_chunks (id, document_id, chunk_index, content, heading_path, char_count, metadata) VALUES (?, ?, ?, ?, ?, ?, ?)",
+                        (
+                            chunk_id,
+                            doc_id,
+                            chunk.chunk_index,
+                            chunk.content,
+                            chunk.heading_path,
+                            chunk.char_count,
+                            json.dumps(chunk.metadata),
+                        ),
+                    )

-        # Store embeddings
-        vector_store.add(chunk_ids, chunk_contents, chunk_metadatas)
+                # Add new vectors before commit so DB can still roll back on failure.
+                vector_store.add(new_chunk_ids, chunk_contents, chunk_metadatas)
+    except Exception:
+        if new_chunk_ids:
+            vector_store.delete(new_chunk_ids)
+        raise
+
+    # Delete stale vectors only after the DB transaction committed.
+    if old_chunk_ids:
+        vector_store.delete(old_chunk_ids)

    duration_ms = int((time.time() - start) * 1000)
-    log.info(
-        "file_ingested",
-        file_path=str(file_path),
-        chunks_created=len(chunks),
-        duration_ms=duration_ms,
-    )
+    if chunks:
+        log.info(
+            "file_ingested",
+            file_path=str(file_path),
+            chunks_created=len(chunks),
+            duration_ms=duration_ms,
+        )
+    else:
+        log.info(
+            "file_ingested_empty",
+            file_path=str(file_path),
+            duration_ms=duration_ms,
+        )

    return {
        "file": str(file_path),
-        "status": "ingested",
+        "status": "ingested" if chunks else "empty",
        "chunks": len(chunks),
        "duration_ms": duration_ms,
    }
@@ -152,7 +164,9 @@ def ingest_folder(folder_path: Path, purge_deleted: bool = True) -> list[dict]:
        raise NotADirectoryError(f"Not a directory: {folder_path}")

    results = []
-    md_files = sorted(folder_path.rglob("*.md"))
+    md_files = sorted(
+        list(folder_path.rglob("*.md")) + list(folder_path.rglob("*.markdown"))
+    )
    current_paths = {str(f.resolve()) for f in md_files}
    log.info("ingestion_started", folder=str(folder_path), file_count=len(md_files))

@@ -213,32 +227,35 @@ def _purge_deleted_files(folder_path: Path, current_paths: set[str]) -> int:
    folder_str = str(folder_path)
    deleted_count = 0
    vector_store = get_vector_store()
+    chunk_ids_to_delete: list[str] = []

    with get_connection() as conn:
-        # Find documents under this folder
        rows = conn.execute(
-            "SELECT id, file_path FROM source_documents WHERE file_path LIKE ?",
-            (f"{folder_str}%",),
+            "SELECT id, file_path FROM source_documents"
        ).fetchall()

        for row in rows:
+            doc_path = Path(row["file_path"])
+            try:
+                doc_path.relative_to(folder_path)
+            except ValueError:
+                continue
+
            if row["file_path"] not in current_paths:
                doc_id = row["id"]
-                # Get chunk IDs for vector deletion
-                chunk_ids = [
+                chunk_ids_to_delete.extend(
                    r["id"]
                    for r in conn.execute(
                        "SELECT id FROM source_chunks WHERE document_id = ?",
                        (doc_id,),
                    ).fetchall()
-                ]
-                # Delete from DB
+                )
                conn.execute("DELETE FROM source_chunks WHERE document_id = ?", (doc_id,))
                conn.execute("DELETE FROM source_documents WHERE id = ?", (doc_id,))
-                # Delete from vectors
-                if chunk_ids:
-                    vector_store.delete(chunk_ids)
                log.info("purged_deleted_file", file_path=row["file_path"])
                deleted_count += 1

+    if chunk_ids_to_delete:
+        vector_store.delete(chunk_ids_to_delete)
+
    return deleted_count