feat: Phase 1 ingestion hardening + Phase 5 Trusted Project State

Phase 1 - Ingestion hardening:
- Encoding fallback (UTF-8/UTF-8-sig/Latin-1/CP1252)
- Delete detection: purge DB/vector entries for removed files
- Ingestion stats endpoint (GET /stats)

Phase 5 - Trusted Project State:
- project_state table with categories (status, decision, requirement, contact, milestone, fact, config)
- CRUD API: POST/GET/DELETE /project/state
- Upsert semantics, invalidation (supersede) support
- Context builder integrates project state at highest trust precedence
- Project state gets 20% budget allocation, appears first in context
- Trust precedence: Project State > Retrieved Chunks (per Master Plan)

33/33 tests passing. Validated end-to-end with GigaBIT M1 project data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:41:59 -04:00
parent 6081462058
commit 531c560db7
7 changed files with 671 additions and 35 deletions
--- a/src/atocore/ingestion/pipeline.py
+++ b/src/atocore/ingestion/pipeline.py
@@ -15,6 +15,9 @@ from atocore.retrieval.vector_store import get_vector_store

 log = get_logger("ingestion")

+# Encodings to try when reading markdown files
+_ENCODINGS = ["utf-8", "utf-8-sig", "latin-1", "cp1252"]
+

 def ingest_file(file_path: Path) -> dict:
    """Ingest a single markdown file. Returns stats."""
@@ -26,9 +29,9 @@ def ingest_file(file_path: Path) -> dict:
    if file_path.suffix.lower() not in (".md", ".markdown"):
        raise ValueError(f"Not a markdown file: {file_path}")

-    # Read and hash
-    raw_content = file_path.read_text(encoding="utf-8")
-    file_hash = hashlib.sha256(raw_content.encode()).hexdigest()
+    # Read with encoding fallback
+    raw_content = _read_file_safe(file_path)
+    file_hash = hashlib.sha256(raw_content.encode("utf-8")).hexdigest()

    # Check if already ingested and unchanged
    with get_connection() as conn:
@@ -136,16 +139,24 @@ def ingest_file(file_path: Path) -> dict:
    }


-def ingest_folder(folder_path: Path) -> list[dict]:
-    """Ingest all markdown files in a folder recursively."""
+def ingest_folder(folder_path: Path, purge_deleted: bool = True) -> list[dict]:
+    """Ingest all markdown files in a folder recursively.
+
+    Args:
+        folder_path: Directory to scan for .md files.
+        purge_deleted: If True, remove DB/vector entries for files
+                       that no longer exist on disk.
+    """
    folder_path = folder_path.resolve()
    if not folder_path.is_dir():
        raise NotADirectoryError(f"Not a directory: {folder_path}")

    results = []
    md_files = sorted(folder_path.rglob("*.md"))
+    current_paths = {str(f.resolve()) for f in md_files}
    log.info("ingestion_started", folder=str(folder_path), file_count=len(md_files))

+    # Ingest new/changed files
    for md_file in md_files:
        try:
            result = ingest_file(md_file)
@@ -154,4 +165,80 @@ def ingest_folder(folder_path: Path) -> list[dict]:
            log.error("ingestion_error", file_path=str(md_file), error=str(e))
            results.append({"file": str(md_file), "status": "error", "error": str(e)})

+    # Purge entries for deleted files
+    if purge_deleted:
+        deleted = _purge_deleted_files(folder_path, current_paths)
+        if deleted:
+            log.info("purged_deleted_files", count=deleted)
+            results.append({"status": "purged", "deleted_count": deleted})
+
    return results
+
+
+def get_ingestion_stats() -> dict:
+    """Return ingestion statistics."""
+    with get_connection() as conn:
+        docs = conn.execute("SELECT COUNT(*) as c FROM source_documents").fetchone()
+        chunks = conn.execute("SELECT COUNT(*) as c FROM source_chunks").fetchone()
+        recent = conn.execute(
+            "SELECT file_path, title, ingested_at FROM source_documents "
+            "ORDER BY updated_at DESC LIMIT 5"
+        ).fetchall()
+
+    vector_store = get_vector_store()
+    return {
+        "total_documents": docs["c"],
+        "total_chunks": chunks["c"],
+        "total_vectors": vector_store.count,
+        "recent_documents": [
+            {"file_path": r["file_path"], "title": r["title"], "ingested_at": r["ingested_at"]}
+            for r in recent
+        ],
+    }
+
+
+def _read_file_safe(file_path: Path) -> str:
+    """Read a file with encoding fallback."""
+    for encoding in _ENCODINGS:
+        try:
+            return file_path.read_text(encoding=encoding)
+        except (UnicodeDecodeError, ValueError):
+            continue
+    # Last resort: read with errors replaced
+    return file_path.read_text(encoding="utf-8", errors="replace")
+
+
+def _purge_deleted_files(folder_path: Path, current_paths: set[str]) -> int:
+    """Remove DB/vector entries for files under folder_path that no longer exist."""
+    folder_str = str(folder_path)
+    deleted_count = 0
+    vector_store = get_vector_store()
+
+    with get_connection() as conn:
+        # Find documents under this folder
+        rows = conn.execute(
+            "SELECT id, file_path FROM source_documents WHERE file_path LIKE ?",
+            (f"{folder_str}%",),
+        ).fetchall()
+
+        for row in rows:
+            if row["file_path"] not in current_paths:
+                doc_id = row["id"]
+                # Get chunk IDs for vector deletion
+                chunk_ids = [
+                    r["id"]
+                    for r in conn.execute(
+                        "SELECT id FROM source_chunks WHERE document_id = ?",
+                        (doc_id,),
+                    ).fetchall()
+                ]
+                # Delete from DB
+                conn.execute("DELETE FROM source_chunks WHERE document_id = ?", (doc_id,))
+                conn.execute("DELETE FROM source_documents WHERE id = ?", (doc_id,))
+                # Delete from vectors
+                if chunk_ids:
+                    vector_store.delete(chunk_ids)
+                log.info("purged_deleted_file", file_path=row["file_path"])
+                deleted_count += 1
+
+    return deleted_count