feat: Phase 1 ingestion hardening + Phase 5 Trusted Project State

Phase 1 - Ingestion hardening:
- Encoding fallback (UTF-8/UTF-8-sig/Latin-1/CP1252)
- Delete detection: purge DB/vector entries for removed files
- Ingestion stats endpoint (GET /stats)

Phase 5 - Trusted Project State:
- project_state table with categories (status, decision, requirement, contact, milestone, fact, config)
- CRUD API: POST/GET/DELETE /project/state
- Upsert semantics, invalidation (supersede) support
- Context builder integrates project state at highest trust precedence
- Project state gets 20% budget allocation, appears first in context
- Trust precedence: Project State > Retrieved Chunks (per Master Plan)

33/33 tests passing. Validated end-to-end with GigaBIT M1 project data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:41:59 -04:00
parent 6081462058
commit 531c560db7
7 changed files with 671 additions and 35 deletions
--- a/src/atocore/context/builder.py
+++ b/src/atocore/context/builder.py
@@ -1,11 +1,16 @@
-"""Context pack assembly: retrieve, rank, budget, format."""
+"""Context pack assembly: retrieve, rank, budget, format.
+
+Trust precedence (per Master Plan):
+  1. Trusted Project State → always included first, uses its own budget slice
+  2. Retrieved chunks → ranked, deduplicated, budget-constrained
+"""

-import json
 import time
 from dataclasses import dataclass, field
 from pathlib import Path

 from atocore.config import settings
+from atocore.context.project_state import format_project_state, get_state
 from atocore.observability.logger import get_logger
 from atocore.retrieval.retriever import ChunkResult, retrieve

@@ -14,9 +19,14 @@ log = get_logger("context_builder")
 SYSTEM_PREFIX = (
    "You have access to the following personal context from the user's knowledge base.\n"
    "Use it to inform your answer. If the context is not relevant, ignore it.\n"
-    "Do not mention the context system unless asked."
+    "Do not mention the context system unless asked.\n"
+    "When project state is provided, treat it as the most authoritative source."
 )

+# Budget allocation (per Master Plan section 9)
+# project_state targets 20% of budget (soft limit: overflow is logged, not truncated); retrieval gets the rest
+PROJECT_STATE_BUDGET_RATIO = 0.20
+
 # Last built context pack for debug inspection
 _last_context_pack: "ContextPack | None" = None

@@ -33,6 +43,8 @@ class ContextChunk:
@dataclass
 class ContextPack:
    chunks_used: list[ContextChunk] = field(default_factory=list)
+    project_state_text: str = ""
+    project_state_chars: int = 0
    total_chars: int = 0
    budget: int = 0
    budget_remaining: int = 0
@@ -48,31 +60,61 @@ def build_context(
    project_hint: str | None = None,
    budget: int | None = None,
 ) -> ContextPack:
-    """Build a context pack for a user prompt."""
+    """Build a context pack for a user prompt.
+
+    Trust precedence applied:
+      1. Project state is injected first (highest trust)
+      2. Retrieved chunks fill the remaining budget
+    """
    global _last_context_pack
    start = time.time()
    budget = budget or settings.context_budget

-    # 1. Retrieve candidates
+    # 1. Get Trusted Project State (highest precedence)
+    project_state_text = ""
+    project_state_chars = 0
+    state_budget = int(budget * PROJECT_STATE_BUDGET_RATIO)
+
+    if project_hint:
+        state_entries = get_state(project_hint)
+        if state_entries:
+            project_state_text = format_project_state(state_entries)
+            project_state_chars = len(project_state_text)
+            # If state exceeds its budget, it still gets included (it's highest trust)
+            # but we log it
+            if project_state_chars > state_budget:
+                log.info(
+                    "project_state_exceeds_budget",
+                    state_chars=project_state_chars,
+                    state_budget=state_budget,
+                )
+
+    # 2. Calculate remaining budget for retrieval
+    retrieval_budget = budget - project_state_chars
+
+    # 3. Retrieve candidates
    candidates = retrieve(user_prompt, top_k=settings.context_top_k)

-    # 2. Score and rank
+    # 4. Score and rank
    scored = _rank_chunks(candidates, project_hint)

-    # 3. Select within budget
-    selected = _select_within_budget(scored, budget)
+    # 5. Select within remaining budget
+    selected = _select_within_budget(scored, max(retrieval_budget, 0))

-    # 4. Format
-    formatted = _format_context_block(selected)
+    # 6. Format full context
+    formatted = _format_full_context(project_state_text, selected)

-    # 5. Build full prompt
+    # 7. Build full prompt
    full_prompt = f"{SYSTEM_PREFIX}\n\n{formatted}\n\n{user_prompt}"

-    total_chars = sum(c.char_count for c in selected)
+    retrieval_chars = sum(c.char_count for c in selected)
+    total_chars = project_state_chars + retrieval_chars
    duration_ms = int((time.time() - start) * 1000)

    pack = ContextPack(
        chunks_used=selected,
+        project_state_text=project_state_text,
+        project_state_chars=project_state_chars,
        total_chars=total_chars,
        budget=budget,
        budget_remaining=budget - total_chars,
@@ -88,6 +130,8 @@ def build_context(
    log.info(
        "context_built",
        chunks_used=len(selected),
+        project_state_chars=project_state_chars,
+        retrieval_chars=retrieval_chars,
        total_chars=total_chars,
        budget_remaining=budget - total_chars,
        duration_ms=duration_ms,
@@ -163,27 +207,38 @@ def _select_within_budget(
    return selected


-def _format_context_block(chunks: list[ContextChunk]) -> str:
-    """Format chunks into the context block string."""
-    if not chunks:
-        return "--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---"
+def _format_full_context(
+    project_state_text: str,
+    chunks: list[ContextChunk],
+) -> str:
+    """Format project state + retrieved chunks into full context block."""
+    parts = []

-    lines = ["--- AtoCore Context ---"]
-    for chunk in chunks:
-        lines.append(
-            f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]"
-        )
-        lines.append(chunk.content)
-        lines.append("")
-    lines.append("--- End Context ---")
-    return "\n".join(lines)
+    # Project state first (highest trust)
+    if project_state_text:
+        parts.append(project_state_text)
+        parts.append("")
+
+    # Retrieved chunks
+    if chunks:
+        parts.append("--- AtoCore Retrieved Context ---")
+        for chunk in chunks:
+            parts.append(
+                f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]"
+            )
+            parts.append(chunk.content)
+            parts.append("")
+        parts.append("--- End Context ---")
+    elif not project_state_text:
+        parts.append("--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---")
+
+    return "\n".join(parts)


 def _shorten_path(path: str) -> str:
    """Shorten an absolute path to a relative-like display."""
    p = Path(path)
    parts = p.parts
-    # Show last 3 parts at most
    if len(parts) > 3:
        return str(Path(*parts[-3:]))
    return str(p)
@@ -194,11 +249,13 @@ def _pack_to_dict(pack: ContextPack) -> dict:
    return {
        "query": pack.query,
        "project_hint": pack.project_hint,
+        "project_state_chars": pack.project_state_chars,
        "chunks_used": len(pack.chunks_used),
        "total_chars": pack.total_chars,
        "budget": pack.budget,
        "budget_remaining": pack.budget_remaining,
        "duration_ms": pack.duration_ms,
+        "has_project_state": bool(pack.project_state_text),
        "chunks": [
            {
                "source_file": c.source_file,