Phase 1 - Ingestion hardening: - Encoding fallback (UTF-8/UTF-8-sig/Latin-1/CP1252) - Delete detection: purge DB/vector entries for removed files - Ingestion stats endpoint (GET /stats) Phase 5 - Trusted Project State: - project_state table with categories (status, decision, requirement, contact, milestone, fact, config) - CRUD API: POST/GET/DELETE /project/state - Upsert semantics, invalidation (supersede) support - Context builder integrates project state at highest trust precedence - Project state gets 20% budget allocation, appears first in context - Trust precedence: Project State > Retrieved Chunks (per Master Plan) 33/33 tests passing. Validated end-to-end with GigaBIT M1 project data. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
245 lines
8.4 KiB
Python
245 lines
8.4 KiB
Python
"""Ingestion pipeline: parse → chunk → embed → store."""
|
|
|
|
import hashlib
|
|
import json
|
|
import time
|
|
import uuid
|
|
from pathlib import Path
|
|
|
|
from atocore.config import settings
|
|
from atocore.ingestion.chunker import chunk_markdown
|
|
from atocore.ingestion.parser import parse_markdown
|
|
from atocore.models.database import get_connection
|
|
from atocore.observability.logger import get_logger
|
|
from atocore.retrieval.vector_store import get_vector_store
|
|
|
|
log = get_logger("ingestion")

# Candidate encodings for reading markdown files, tried in order.
#
# "utf-8-sig" must come BEFORE "utf-8": plain utf-8 happily decodes a
# BOM-prefixed file but leaves a stray "\ufeff" at the start of the text
# (which also perturbs the content hash used for change detection).
# "cp1252" must come BEFORE "latin-1": latin-1 maps every possible byte and
# therefore never fails, so anything listed after it is unreachable; placed
# last it serves as the lossless terminal fallback.
_ENCODINGS = ["utf-8-sig", "utf-8", "cp1252", "latin-1"]
|
|
|
|
|
|
def ingest_file(file_path: Path) -> dict:
    """Ingest a single markdown file: parse, chunk, embed, store.

    Unchanged files (same content hash) are skipped. A changed file keeps
    its document id; its old chunks and vectors are replaced.

    Args:
        file_path: Path to a ``.md`` / ``.markdown`` file.

    Returns:
        Stats dict with at least ``file`` and ``status`` keys; ``status``
        is one of ``"skipped"``, ``"empty"``, or ``"ingested"``.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file does not have a markdown extension.
    """
    # perf_counter is monotonic: the reported duration cannot jump or go
    # negative if the wall clock is adjusted mid-ingest (time.time can).
    start = time.perf_counter()
    file_path = file_path.resolve()

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    if file_path.suffix.lower() not in (".md", ".markdown"):
        raise ValueError(f"Not a markdown file: {file_path}")

    # Read with encoding fallback; hash the decoded text re-encoded as
    # UTF-8 so change detection is stable regardless of on-disk encoding.
    raw_content = _read_file_safe(file_path)
    file_hash = hashlib.sha256(raw_content.encode("utf-8")).hexdigest()

    # Check if already ingested and unchanged.
    with get_connection() as conn:
        existing = conn.execute(
            "SELECT id, file_hash FROM source_documents WHERE file_path = ?",
            (str(file_path),),
        ).fetchone()

    if existing and existing["file_hash"] == file_hash:
        log.info("file_skipped_unchanged", file_path=str(file_path))
        return {"file": str(file_path), "status": "skipped", "reason": "unchanged"}

    # NOTE(review): parse_markdown re-reads the file itself — presumably
    # without the encoding fallback applied above; confirm it tolerates
    # non-UTF-8 input, or extend it to accept the already-decoded text.
    parsed = parse_markdown(file_path)

    base_meta = {
        "source_file": str(file_path),
        "tags": parsed.tags,
        "title": parsed.title,
    }
    chunks = chunk_markdown(parsed.body, base_metadata=base_meta)

    if not chunks:
        log.warning("no_chunks_created", file_path=str(file_path))
        return {"file": str(file_path), "status": "empty", "chunks": 0}

    doc_id = str(uuid.uuid4())
    vector_store = get_vector_store()

    with get_connection() as conn:
        if existing:
            # Re-ingest: keep the original document id, drop old chunks
            # and vectors, and refresh the document row's metadata.
            doc_id = existing["id"]
            _replace_existing_document(conn, vector_store, doc_id, file_hash, parsed)
        else:
            conn.execute(
                "INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags) VALUES (?, ?, ?, ?, ?, ?)",
                (doc_id, str(file_path), file_hash, parsed.title, "markdown", json.dumps(parsed.tags)),
            )

        chunk_ids, chunk_contents, chunk_metadatas = _insert_chunks(
            conn, doc_id, chunks, file_path, parsed
        )

        # Store embeddings for the freshly inserted chunks.
        vector_store.add(chunk_ids, chunk_contents, chunk_metadatas)

    duration_ms = int((time.perf_counter() - start) * 1000)
    log.info(
        "file_ingested",
        file_path=str(file_path),
        chunks_created=len(chunks),
        duration_ms=duration_ms,
    )

    return {
        "file": str(file_path),
        "status": "ingested",
        "chunks": len(chunks),
        "duration_ms": duration_ms,
    }


def _replace_existing_document(conn, vector_store, doc_id: str, file_hash: str, parsed) -> None:
    """Delete a document's old chunks/vectors and refresh its metadata row."""
    old_chunk_ids = [
        row["id"]
        for row in conn.execute(
            "SELECT id FROM source_chunks WHERE document_id = ?",
            (doc_id,),
        ).fetchall()
    ]
    conn.execute("DELETE FROM source_chunks WHERE document_id = ?", (doc_id,))
    conn.execute(
        "UPDATE source_documents SET file_hash = ?, title = ?, tags = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
        (file_hash, parsed.title, json.dumps(parsed.tags), doc_id),
    )
    if old_chunk_ids:
        vector_store.delete(old_chunk_ids)


def _insert_chunks(conn, doc_id: str, chunks, file_path: Path, parsed):
    """Insert chunk rows; return (ids, contents, metadatas) for embedding."""
    chunk_ids: list[str] = []
    chunk_contents: list[str] = []
    chunk_metadatas: list[dict] = []

    for chunk in chunks:
        chunk_id = str(uuid.uuid4())
        chunk_ids.append(chunk_id)
        chunk_contents.append(chunk.content)
        chunk_metadatas.append({
            "document_id": doc_id,
            "heading_path": chunk.heading_path,
            "source_file": str(file_path),
            "tags": json.dumps(parsed.tags),
            "title": parsed.title,
        })
        conn.execute(
            "INSERT INTO source_chunks (id, document_id, chunk_index, content, heading_path, char_count, metadata) VALUES (?, ?, ?, ?, ?, ?, ?)",
            (
                chunk_id,
                doc_id,
                chunk.chunk_index,
                chunk.content,
                chunk.heading_path,
                chunk.char_count,
                json.dumps(chunk.metadata),
            ),
        )

    return chunk_ids, chunk_contents, chunk_metadatas
|
|
|
|
|
|
def ingest_folder(folder_path: Path, purge_deleted: bool = True) -> list[dict]:
    """Ingest all markdown files in a folder recursively.

    Args:
        folder_path: Directory to scan for markdown files.
        purge_deleted: If True, remove DB/vector entries for files
            that no longer exist on disk.

    Returns:
        One stats dict per file, plus a trailing purge summary entry
        when any deleted files were reconciled.

    Raises:
        NotADirectoryError: If *folder_path* is not a directory.
    """
    folder_path = folder_path.resolve()
    if not folder_path.is_dir():
        raise NotADirectoryError(f"Not a directory: {folder_path}")

    results = []
    # Discover every file ingest_file accepts (.md/.markdown, any case).
    # A bare rglob("*.md") missed .markdown files — and, worse, purged them
    # as "deleted" because they never appeared in current_paths.
    md_files = sorted(
        p
        for p in folder_path.rglob("*")
        if p.is_file() and p.suffix.lower() in (".md", ".markdown")
    )
    current_paths = {str(f.resolve()) for f in md_files}
    log.info("ingestion_started", folder=str(folder_path), file_count=len(md_files))

    # Ingest new/changed files; one bad file must not abort the batch.
    for md_file in md_files:
        try:
            results.append(ingest_file(md_file))
        except Exception as e:
            log.error("ingestion_error", file_path=str(md_file), error=str(e))
            results.append({"file": str(md_file), "status": "error", "error": str(e)})

    # Purge entries for files that vanished from disk since the last run.
    if purge_deleted:
        deleted = _purge_deleted_files(folder_path, current_paths)
        if deleted:
            log.info("purged_deleted_files", count=deleted)
            results.append({"status": "purged", "deleted_count": deleted})

    return results
|
|
|
|
|
|
def get_ingestion_stats() -> dict:
    """Summarize ingestion state: document/chunk/vector counts and the
    five most recently updated documents."""
    with get_connection() as conn:
        doc_count = conn.execute(
            "SELECT COUNT(*) as c FROM source_documents"
        ).fetchone()["c"]
        chunk_count = conn.execute(
            "SELECT COUNT(*) as c FROM source_chunks"
        ).fetchone()["c"]
        recent_rows = conn.execute(
            "SELECT file_path, title, ingested_at FROM source_documents "
            "ORDER BY updated_at DESC LIMIT 5"
        ).fetchall()

    recent_docs = []
    for row in recent_rows:
        recent_docs.append({
            "file_path": row["file_path"],
            "title": row["title"],
            "ingested_at": row["ingested_at"],
        })

    return {
        "total_documents": doc_count,
        "total_chunks": chunk_count,
        "total_vectors": get_vector_store().count,
        "recent_documents": recent_docs,
    }
|
|
|
|
|
|
def _read_file_safe(file_path: Path) -> str:
    """Read *file_path* as text, trying each encoding in ``_ENCODINGS``.

    Returns the first successful decode; if every candidate fails,
    falls back to UTF-8 with undecodable bytes replaced.
    """
    for candidate in _ENCODINGS:
        try:
            text = file_path.read_text(encoding=candidate)
        except (UnicodeDecodeError, ValueError):
            continue
        return text
    # Defensive last resort: never raise on a decode failure.
    return file_path.read_text(encoding="utf-8", errors="replace")
|
|
|
|
|
|
def _purge_deleted_files(folder_path: Path, current_paths: set[str]) -> int:
    """Remove DB/vector entries for files under *folder_path* that no
    longer exist on disk.

    Args:
        folder_path: Resolved folder whose documents are reconciled.
        current_paths: Resolved string paths of files currently on disk.

    Returns:
        Number of documents purged.
    """
    # Anchor the prefix at a path-separator boundary: the bare
    # f"{folder}%" pattern also matched sibling folders sharing a name
    # prefix (purging "/data/notes" could delete docs under "/data/notes2",
    # which are never in current_paths and so were wrongly purged).
    prefix = str(folder_path) + os.sep
    deleted_count = 0
    vector_store = get_vector_store()

    with get_connection() as conn:
        # Find documents under this folder.
        rows = conn.execute(
            "SELECT id, file_path FROM source_documents WHERE file_path LIKE ?",
            (f"{prefix}%",),
        ).fetchall()

        for row in rows:
            # Re-check in Python: "%" / "_" inside the folder path act as
            # LIKE wildcards, so the SQL filter alone can over-match.
            if not row["file_path"].startswith(prefix):
                continue
            if row["file_path"] in current_paths:
                continue

            doc_id = row["id"]
            # Collect chunk IDs first so their vectors can be deleted too.
            chunk_ids = [
                r["id"]
                for r in conn.execute(
                    "SELECT id FROM source_chunks WHERE document_id = ?",
                    (doc_id,),
                ).fetchall()
            ]
            # Delete from DB.
            conn.execute("DELETE FROM source_chunks WHERE document_id = ?", (doc_id,))
            conn.execute("DELETE FROM source_documents WHERE id = ?", (doc_id,))
            # Delete from vectors.
            if chunk_ids:
                vector_store.delete(chunk_ids)
            log.info("purged_deleted_file", file_path=row["file_path"])
            deleted_count += 1

    return deleted_count
|