feat: Phase 1 ingestion hardening + Phase 5 Trusted Project State

Phase 1 - Ingestion hardening:
- Encoding fallback (UTF-8/UTF-8-sig/Latin-1/CP1252)
- Delete detection: purge DB/vector entries for removed files
- Ingestion stats endpoint (GET /stats)

Phase 5 - Trusted Project State:
- project_state table with categories (status, decision, requirement, contact, milestone, fact, config)
- CRUD API: POST/GET/DELETE /project/state
- Upsert semantics, invalidation (supersede) support
- Context builder integrates project state at highest trust precedence
- Project state gets 20% budget allocation, appears first in context
- Trust precedence: Project State > Retrieved Chunks (per Master Plan)

33/33 tests passing.
Validated end-to-end with GigaBIT M1 project data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:41:59 -04:00
parent 6081462058
commit 531c560db7
7 changed files with 671 additions and 35 deletions
--- a/src/atocore/context/builder.py
+++ b/src/atocore/context/builder.py
@@ -1,11 +1,16 @@
-"""Context pack assembly: retrieve, rank, budget, format."""
+"""Context pack assembly: retrieve, rank, budget, format.
+
+Trust precedence (per Master Plan):
+  1. Trusted Project State → always included first, uses its own budget slice
+  2. Retrieved chunks → ranked, deduplicated, budget-constrained
+"""

-import json
 import time
 from dataclasses import dataclass, field
 from pathlib import Path

 from atocore.config import settings
+from atocore.context.project_state import format_project_state, get_state
 from atocore.observability.logger import get_logger
 from atocore.retrieval.retriever import ChunkResult, retrieve

@@ -14,9 +19,14 @@ log = get_logger("context_builder")
 SYSTEM_PREFIX = (
    "You have access to the following personal context from the user's knowledge base.\n"
    "Use it to inform your answer. If the context is not relevant, ignore it.\n"
-    "Do not mention the context system unless asked."
+    "Do not mention the context system unless asked.\n"
+    "When project state is provided, treat it as the most authoritative source."
 )

+# Budget allocation (per Master Plan section 9)
+# project_state gets up to 20% of budget, retrieval gets the rest
+PROJECT_STATE_BUDGET_RATIO = 0.20
+
 # Last built context pack for debug inspection
 _last_context_pack: "ContextPack | None" = None

@@ -33,6 +43,8 @@ class ContextChunk:
@dataclass
 class ContextPack:
    chunks_used: list[ContextChunk] = field(default_factory=list)
+    project_state_text: str = ""
+    project_state_chars: int = 0
    total_chars: int = 0
    budget: int = 0
    budget_remaining: int = 0
@@ -48,31 +60,61 @@ def build_context(
    project_hint: str | None = None,
    budget: int | None = None,
 ) -> ContextPack:
-    """Build a context pack for a user prompt."""
+    """Build a context pack for a user prompt.
+
+    Trust precedence applied:
+      1. Project state is injected first (highest trust)
+      2. Retrieved chunks fill the remaining budget
+    """
    global _last_context_pack
    start = time.time()
    budget = budget or settings.context_budget

-    # 1. Retrieve candidates
+    # 1. Get Trusted Project State (highest precedence)
+    project_state_text = ""
+    project_state_chars = 0
+    state_budget = int(budget * PROJECT_STATE_BUDGET_RATIO)
+
+    if project_hint:
+        state_entries = get_state(project_hint)
+        if state_entries:
+            project_state_text = format_project_state(state_entries)
+            project_state_chars = len(project_state_text)
+            # If state exceeds its budget, it still gets included (it's highest trust)
+            # but we log it
+            if project_state_chars > state_budget:
+                log.info(
+                    "project_state_exceeds_budget",
+                    state_chars=project_state_chars,
+                    state_budget=state_budget,
+                )
+
+    # 2. Calculate remaining budget for retrieval
+    retrieval_budget = budget - project_state_chars
+
+    # 3. Retrieve candidates
    candidates = retrieve(user_prompt, top_k=settings.context_top_k)

-    # 2. Score and rank
+    # 4. Score and rank
    scored = _rank_chunks(candidates, project_hint)

-    # 3. Select within budget
-    selected = _select_within_budget(scored, budget)
+    # 5. Select within remaining budget
+    selected = _select_within_budget(scored, max(retrieval_budget, 0))

-    # 4. Format
-    formatted = _format_context_block(selected)
+    # 6. Format full context
+    formatted = _format_full_context(project_state_text, selected)

-    # 5. Build full prompt
+    # 7. Build full prompt
    full_prompt = f"{SYSTEM_PREFIX}\n\n{formatted}\n\n{user_prompt}"

-    total_chars = sum(c.char_count for c in selected)
+    retrieval_chars = sum(c.char_count for c in selected)
+    total_chars = project_state_chars + retrieval_chars
    duration_ms = int((time.time() - start) * 1000)

    pack = ContextPack(
        chunks_used=selected,
+        project_state_text=project_state_text,
+        project_state_chars=project_state_chars,
        total_chars=total_chars,
        budget=budget,
        budget_remaining=budget - total_chars,
@@ -88,6 +130,8 @@ def build_context(
    log.info(
        "context_built",
        chunks_used=len(selected),
+        project_state_chars=project_state_chars,
+        retrieval_chars=retrieval_chars,
        total_chars=total_chars,
        budget_remaining=budget - total_chars,
        duration_ms=duration_ms,
@@ -163,27 +207,38 @@ def _select_within_budget(
    return selected


-def _format_context_block(chunks: list[ContextChunk]) -> str:
-    """Format chunks into the context block string."""
-    if not chunks:
-        return "--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---"
+def _format_full_context(
+    project_state_text: str,
+    chunks: list[ContextChunk],
+) -> str:
+    """Format project state + retrieved chunks into full context block."""
+    parts = []

-    lines = ["--- AtoCore Context ---"]
-    for chunk in chunks:
-        lines.append(
-            f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]"
-        )
-        lines.append(chunk.content)
-        lines.append("")
-    lines.append("--- End Context ---")
-    return "\n".join(lines)
+    # Project state first (highest trust)
+    if project_state_text:
+        parts.append(project_state_text)
+        parts.append("")
+
+    # Retrieved chunks
+    if chunks:
+        parts.append("--- AtoCore Retrieved Context ---")
+        for chunk in chunks:
+            parts.append(
+                f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]"
+            )
+            parts.append(chunk.content)
+            parts.append("")
+        parts.append("--- End Context ---")
+    elif not project_state_text:
+        parts.append("--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---")
+
+    return "\n".join(parts)


 def _shorten_path(path: str) -> str:
    """Shorten an absolute path to a relative-like display."""
    p = Path(path)
    parts = p.parts
-    # Show last 3 parts at most
    if len(parts) > 3:
        return str(Path(*parts[-3:]))
    return str(p)
@@ -194,11 +249,13 @@ def _pack_to_dict(pack: ContextPack) -> dict:
    return {
        "query": pack.query,
        "project_hint": pack.project_hint,
+        "project_state_chars": pack.project_state_chars,
        "chunks_used": len(pack.chunks_used),
        "total_chars": pack.total_chars,
        "budget": pack.budget,
        "budget_remaining": pack.budget_remaining,
        "duration_ms": pack.duration_ms,
+        "has_project_state": bool(pack.project_state_text),
        "chunks": [
            {
                "source_file": c.source_file,
--- a/src/atocore/context/project_state.py
+++ b/src/atocore/context/project_state.py
@@ -0,0 +1,231 @@
+"""Trusted Project State — the highest-priority context source.
+
+Per the Master Plan trust precedence:
+  1. Trusted Project State (this module)
+  2. AtoDrive artifacts
+  3. Recent validated memory
+  4. AtoVault summaries
+  5. PKM chunks
+  6. Historical / low-confidence
+
+Project state is manually curated or explicitly confirmed facts about a project.
+It always wins over retrieval-based context when there's a conflict.
+"""
+
+import json
+import time
+import uuid
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+
+from atocore.models.database import get_connection
+from atocore.observability.logger import get_logger
+
+log = get_logger("project_state")
+
+# Idempotent DDL (IF NOT EXISTS) for project_state; run by init_project_state_schema()
+PROJECT_STATE_SCHEMA = """
+CREATE TABLE IF NOT EXISTS project_state (
+    id TEXT PRIMARY KEY,
+    project_id TEXT NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
+    category TEXT NOT NULL,
+    key TEXT NOT NULL,
+    value TEXT NOT NULL,
+    source TEXT DEFAULT '',
+    confidence REAL DEFAULT 1.0,
+    status TEXT DEFAULT 'active',
+    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+    UNIQUE(project_id, category, key)
+);
+
+CREATE INDEX IF NOT EXISTS idx_project_state_project ON project_state(project_id);
+CREATE INDEX IF NOT EXISTS idx_project_state_category ON project_state(category);
+CREATE INDEX IF NOT EXISTS idx_project_state_status ON project_state(status);
+"""
+
+# Allowed `category` values; set_state() rejects anything else with ValueError
+CATEGORIES = [
+    "status",       # current project status, phase, blockers
+    "decision",     # confirmed design/engineering decisions
+    "requirement",  # key requirements and constraints
+    "contact",      # key people, vendors, stakeholders
+    "milestone",    # dates, deadlines, deliverables
+    "fact",         # verified technical facts
+    "config",       # project configuration, parameters
+]
+
+
+@dataclass
+class ProjectStateEntry:  # in-memory copy of one project_state row
+    id: str  # row primary key (set_state generates a UUID4 string)
+    project_id: str  # id of the owning row in the projects table
+    category: str  # set_state only accepts members of CATEGORIES
+    key: str  # entry name; unique per (project_id, category) in the schema
+    value: str  # the entry's text, emitted as "key: value" by format_project_state
+    source: str = ""  # optional provenance; format_project_state prints it when set
+    confidence: float = 1.0  # stored as REAL; same default as the table column
+    status: str = "active"  # 'active', or 'superseded' after invalidate_state
+    created_at: str = ""  # timestamp string; empty unless filled by the caller
+    updated_at: str = ""  # timestamp string; empty unless filled by the caller
+
+
+def init_project_state_schema() -> None:
+    """Apply PROJECT_STATE_SCHEMA so the project_state table and indexes exist."""
+    with get_connection() as db:
+        db.executescript(PROJECT_STATE_SCHEMA)
+    log.info("project_state_schema_initialized")
+
+
+def ensure_project(name: str, description: str = "") -> str:
+    """Return the id of the project called *name*, inserting it first if absent.
+
+    *description* is only used when a new project row has to be created.
+    """
+    lookup_sql = "SELECT id FROM projects WHERE name = ?"
+    insert_sql = "INSERT INTO projects (id, name, description) VALUES (?, ?, ?)"
+    with get_connection() as db:
+        found = db.execute(lookup_sql, (name,)).fetchone()
+        if not found:
+            # No such project yet: mint a fresh UUID4 id and store the new row.
+            new_id = str(uuid.uuid4())
+            db.execute(insert_sql, (new_id, name, description))
+            log.info("project_created", name=name, project_id=new_id)
+            return new_id
+        return found["id"]
+
+
+def set_state(
+    project_name: str,
+    category: str,
+    key: str,
+    value: str,
+    source: str = "",
+    confidence: float = 1.0,
+) -> ProjectStateEntry:
+    """Create or update one project state entry (upsert on project/category/key).
+
+    The project is created if it does not exist, and an update always sets the
+    entry back to 'active'. The returned entry is read back from the database,
+    so created_at/updated_at are the stored values (created_at survives updates).
+
+    Raises:
+        ValueError: if *category* is not one of CATEGORIES.
+    """
+    if category not in CATEGORIES:
+        raise ValueError(f"Invalid category '{category}'. Must be one of: {CATEGORIES}")
+
+    project_id = ensure_project(project_name)
+
+    with get_connection() as conn:
+        existing = conn.execute(
+            "SELECT id FROM project_state WHERE project_id = ? AND category = ? AND key = ?",
+            (project_id, category, key),
+        ).fetchone()
+        if existing:
+            entry_id = existing["id"]
+            conn.execute(
+                "UPDATE project_state SET value = ?, source = ?, confidence = ?, "
+                "status = 'active', updated_at = CURRENT_TIMESTAMP WHERE id = ?",
+                (value, source, confidence, entry_id),
+            )
+            log.info("project_state_updated", project=project_name, category=category, key=key)
+        else:
+            entry_id = str(uuid.uuid4())  # only mint an id when a row is actually inserted
+            conn.execute(
+                "INSERT INTO project_state (id, project_id, category, key, value, source, confidence) "
+                "VALUES (?, ?, ?, ?, ?, ?, ?)",
+                (entry_id, project_id, category, key, value, source, confidence),
+            )
+            log.info("project_state_created", project=project_name, category=category, key=key)
+        # Read the row back so the result carries the DB's own status/timestamps.
+        row = conn.execute("SELECT * FROM project_state WHERE id = ?", (entry_id,)).fetchone()
+
+    return ProjectStateEntry(
+        id=row["id"], project_id=row["project_id"], category=row["category"],
+        key=row["key"], value=row["value"], source=row["source"],
+        confidence=row["confidence"], status=row["status"],
+        created_at=row["created_at"], updated_at=row["updated_at"],
+    )
+
+
+def get_state(
+    project_name: str,
+    category: str | None = None,
+    active_only: bool = True,
+) -> list[ProjectStateEntry]:
+    """List a project's state entries, ordered by category and then key.
+
+    Returns an empty list when no project is named *project_name*. A truthy
+    *category* restricts the result to that category; *active_only* (the
+    default) drops entries whose status is not 'active'.
+    """
+    with get_connection() as db:
+        owner = db.execute(
+            "SELECT id FROM projects WHERE name = ?", (project_name,)
+        ).fetchone()
+        if not owner:
+            return []
+
+        # Assemble the WHERE clause piece by piece; values stay parameterised.
+        clauses = ["project_id = ?"]
+        args: list = [owner["id"]]
+
+        if category:
+            clauses.append("category = ?")
+            args.append(category)
+        if active_only:
+            clauses.append("status = 'active'")
+
+        sql = "SELECT * FROM project_state WHERE " + " AND ".join(clauses)
+        records = db.execute(sql + " ORDER BY category, key", args).fetchall()
+
+    # Each listed column is copied into the ProjectStateEntry field of the same name.
+    columns = (
+        "id", "project_id", "category", "key", "value",
+        "source", "confidence", "status", "created_at", "updated_at",
+    )
+    return [
+        ProjectStateEntry(**{col: rec[col] for col in columns})
+        for rec in records
+    ]
+
+
+def invalidate_state(project_name: str, category: str, key: str) -> bool:
+    """Flip an active entry to 'superseded'; return True only if a row changed."""
+    with get_connection() as db:
+        owner = db.execute(
+            "SELECT id FROM projects WHERE name = ?", (project_name,)
+        ).fetchone()
+        if not owner:
+            return False
+
+        cursor = db.execute(
+            "UPDATE project_state SET status = 'superseded', updated_at = CURRENT_TIMESTAMP "
+            "WHERE project_id = ? AND category = ? AND key = ? AND status = 'active'",
+            (owner["id"], category, key),
+        )
+        changed = cursor.rowcount > 0
+        if changed:
+            log.info("project_state_invalidated", project=project_name, category=category, key=key)
+        return changed
+
+
+def format_project_state(entries: list[ProjectStateEntry]) -> str:
+    """Render entries as the 'Trusted Project State' text block ('' if none).
+
+    A [CATEGORY] heading is emitted each time the category changes between
+    consecutive entries; an entry's source, when set, follows on its own line.
+    """
+    if not entries:
+        return ""
+    out = ["--- Trusted Project State ---"]
+    previous = ""
+    for item in entries:
+        if previous != item.category:
+            out.append(f"\n[{item.category.upper()}]")
+            previous = item.category
+        out.append(f"  {item.key}: {item.value}")
+        if item.source:
+            out.append(f"    (source: {item.source})")
+    return "\n".join([*out, "\n--- End Project State ---"])