feat: Phase 1 ingestion hardening + Phase 5 Trusted Project State
Phase 1 - Ingestion hardening:
- Encoding fallback (UTF-8/UTF-8-sig/Latin-1/CP1252)
- Delete detection: purge DB/vector entries for removed files
- Ingestion stats endpoint (GET /stats)

Phase 5 - Trusted Project State:
- project_state table with categories (status, decision, requirement, contact, milestone, fact, config)
- CRUD API: POST/GET/DELETE /project/state
- Upsert semantics, invalidation (supersede) support
- Context builder integrates project state at highest trust precedence
- Project state gets 20% budget allocation, appears first in context
- Trust precedence: Project State > Retrieved Chunks (per Master Plan)

33/33 tests passing. Validated end-to-end with GigaBIT M1 project data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,11 +1,16 @@
|
||||
"""Context pack assembly: retrieve, rank, budget, format."""
|
||||
"""Context pack assembly: retrieve, rank, budget, format.
|
||||
|
||||
Trust precedence (per Master Plan):
|
||||
1. Trusted Project State → always included first, uses its own budget slice
|
||||
2. Retrieved chunks → ranked, deduplicated, budget-constrained
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
from atocore.config import settings
|
||||
from atocore.context.project_state import format_project_state, get_state
|
||||
from atocore.observability.logger import get_logger
|
||||
from atocore.retrieval.retriever import ChunkResult, retrieve
|
||||
|
||||
@@ -14,9 +19,14 @@ log = get_logger("context_builder")
|
||||
# Instruction preamble placed ahead of the formatted context in the prompt.
# Built by implicit string-literal concatenation; every sentence except the
# last ends with an explicit "\n".
SYSTEM_PREFIX = (
    "You have access to the following personal context from the user's knowledge base.\n"
    "Use it to inform your answer. If the context is not relevant, ignore it.\n"
    "Do not mention the context system unless asked.\n"
    "When project state is provided, treat it as the most authoritative source."
)

# Budget allocation (per Master Plan section 9)
# project_state gets up to 20% of budget, retrieval gets the rest
PROJECT_STATE_BUDGET_RATIO = 0.20

# Last built context pack for debug inspection
_last_context_pack: "ContextPack | None" = None
|
||||
|
||||
@@ -33,6 +43,8 @@ class ContextChunk:
|
||||
@dataclass
|
||||
class ContextPack:
|
||||
chunks_used: list[ContextChunk] = field(default_factory=list)
|
||||
project_state_text: str = ""
|
||||
project_state_chars: int = 0
|
||||
total_chars: int = 0
|
||||
budget: int = 0
|
||||
budget_remaining: int = 0
|
||||
@@ -48,31 +60,61 @@ def build_context(
|
||||
project_hint: str | None = None,
|
||||
budget: int | None = None,
|
||||
) -> ContextPack:
|
||||
"""Build a context pack for a user prompt."""
|
||||
"""Build a context pack for a user prompt.
|
||||
|
||||
Trust precedence applied:
|
||||
1. Project state is injected first (highest trust)
|
||||
2. Retrieved chunks fill the remaining budget
|
||||
"""
|
||||
global _last_context_pack
|
||||
start = time.time()
|
||||
budget = budget or settings.context_budget
|
||||
|
||||
# 1. Retrieve candidates
|
||||
# 1. Get Trusted Project State (highest precedence)
|
||||
project_state_text = ""
|
||||
project_state_chars = 0
|
||||
state_budget = int(budget * PROJECT_STATE_BUDGET_RATIO)
|
||||
|
||||
if project_hint:
|
||||
state_entries = get_state(project_hint)
|
||||
if state_entries:
|
||||
project_state_text = format_project_state(state_entries)
|
||||
project_state_chars = len(project_state_text)
|
||||
# If state exceeds its budget, it still gets included (it's highest trust)
|
||||
# but we log it
|
||||
if project_state_chars > state_budget:
|
||||
log.info(
|
||||
"project_state_exceeds_budget",
|
||||
state_chars=project_state_chars,
|
||||
state_budget=state_budget,
|
||||
)
|
||||
|
||||
# 2. Calculate remaining budget for retrieval
|
||||
retrieval_budget = budget - project_state_chars
|
||||
|
||||
# 3. Retrieve candidates
|
||||
candidates = retrieve(user_prompt, top_k=settings.context_top_k)
|
||||
|
||||
# 2. Score and rank
|
||||
# 4. Score and rank
|
||||
scored = _rank_chunks(candidates, project_hint)
|
||||
|
||||
# 3. Select within budget
|
||||
selected = _select_within_budget(scored, budget)
|
||||
# 5. Select within remaining budget
|
||||
selected = _select_within_budget(scored, max(retrieval_budget, 0))
|
||||
|
||||
# 4. Format
|
||||
formatted = _format_context_block(selected)
|
||||
# 6. Format full context
|
||||
formatted = _format_full_context(project_state_text, selected)
|
||||
|
||||
# 5. Build full prompt
|
||||
# 7. Build full prompt
|
||||
full_prompt = f"{SYSTEM_PREFIX}\n\n{formatted}\n\n{user_prompt}"
|
||||
|
||||
total_chars = sum(c.char_count for c in selected)
|
||||
retrieval_chars = sum(c.char_count for c in selected)
|
||||
total_chars = project_state_chars + retrieval_chars
|
||||
duration_ms = int((time.time() - start) * 1000)
|
||||
|
||||
pack = ContextPack(
|
||||
chunks_used=selected,
|
||||
project_state_text=project_state_text,
|
||||
project_state_chars=project_state_chars,
|
||||
total_chars=total_chars,
|
||||
budget=budget,
|
||||
budget_remaining=budget - total_chars,
|
||||
@@ -88,6 +130,8 @@ def build_context(
|
||||
log.info(
|
||||
"context_built",
|
||||
chunks_used=len(selected),
|
||||
project_state_chars=project_state_chars,
|
||||
retrieval_chars=retrieval_chars,
|
||||
total_chars=total_chars,
|
||||
budget_remaining=budget - total_chars,
|
||||
duration_ms=duration_ms,
|
||||
@@ -163,27 +207,38 @@ def _select_within_budget(
|
||||
return selected
|
||||
|
||||
|
||||
def _format_full_context(
    project_state_text: str,
    chunks: "list[ContextChunk]",
) -> str:
    """Format project state + retrieved chunks into full context block.

    Sections are emitted in trust order: project state first, then the
    retrieved chunks.

    Args:
        project_state_text: Pre-formatted project state; an empty string
            means there is none.
        chunks: Selected chunks, rendered in the order given. Each chunk
            contributes a ``[Source | Section | Score]`` header line, its
            content, and a blank separator line.

    Returns:
        The sections joined with newlines. When both inputs are empty a
        fixed "No relevant context found." block is returned instead. When
        only project state is present the result is that text followed by a
        single trailing newline, because the blank separator is appended
        whenever project state is present.
    """
    parts: list[str] = []

    # Project state first (highest trust)
    if project_state_text:
        parts.append(project_state_text)
        parts.append("")

    # Retrieved chunks
    if chunks:
        parts.append("--- AtoCore Retrieved Context ---")
        for chunk in chunks:
            parts.append(
                f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]"
            )
            parts.append(chunk.content)
            parts.append("")
        parts.append("--- End Context ---")
    elif not project_state_text:
        # Nothing at all to show: emit the placeholder block so the prompt
        # still carries an explicit context section.
        parts.append("--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---")

    return "\n".join(parts)
|
||||
|
||||
|
||||
def _shorten_path(path: str) -> str:
    """Shorten an absolute path to a relative-like display.

    Args:
        path: Path string to shorten.

    Returns:
        The last three components of ``path`` joined as a path when it has
        more than three components; otherwise ``path`` as rendered by
        ``pathlib.Path``. Components are counted via ``Path.parts``, so the
        root of an absolute path counts as one of them.
    """
    p = Path(path)
    parts = p.parts
    # Show last 3 parts at most
    if len(parts) > 3:
        return str(Path(*parts[-3:]))
    return str(p)
|
||||
@@ -194,11 +249,13 @@ def _pack_to_dict(pack: ContextPack) -> dict:
|
||||
return {
|
||||
"query": pack.query,
|
||||
"project_hint": pack.project_hint,
|
||||
"project_state_chars": pack.project_state_chars,
|
||||
"chunks_used": len(pack.chunks_used),
|
||||
"total_chars": pack.total_chars,
|
||||
"budget": pack.budget,
|
||||
"budget_remaining": pack.budget_remaining,
|
||||
"duration_ms": pack.duration_ms,
|
||||
"has_project_state": bool(pack.project_state_text),
|
||||
"chunks": [
|
||||
{
|
||||
"source_file": c.source_file,
|
||||
|
||||
Reference in New Issue
Block a user