feat: Phase 1 ingestion hardening + Phase 5 Trusted Project State

Phase 1 - Ingestion hardening:
- Encoding fallback (UTF-8/UTF-8-sig/Latin-1/CP1252)
- Delete detection: purge DB/vector entries for removed files
- Ingestion stats endpoint (GET /stats)

Phase 5 - Trusted Project State:
- project_state table with categories (status, decision, requirement, contact, milestone, fact, config)
- CRUD API: POST/GET/DELETE /project/state
- Upsert semantics, invalidation (supersede) support
- Context builder integrates project state at highest trust precedence
- Project state gets 20% budget allocation, appears first in context
- Trust precedence: Project State > Retrieved Chunks (per Master Plan)

33/33 tests passing. Validated end-to-end with GigaBIT M1 project data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-05 09:41:59 -04:00
parent 6081462058
commit 531c560db7
7 changed files with 671 additions and 35 deletions

View File

@@ -1,11 +1,16 @@
"""Context pack assembly: retrieve, rank, budget, format."""
"""Context pack assembly: retrieve, rank, budget, format.
Trust precedence (per Master Plan):
1. Trusted Project State → always included first, uses its own budget slice
2. Retrieved chunks → ranked, deduplicated, budget-constrained
"""
import json
import time
from dataclasses import dataclass, field
from pathlib import Path
from atocore.config import settings
from atocore.context.project_state import format_project_state, get_state
from atocore.observability.logger import get_logger
from atocore.retrieval.retriever import ChunkResult, retrieve
@@ -14,9 +19,14 @@ log = get_logger("context_builder")
SYSTEM_PREFIX = (
"You have access to the following personal context from the user's knowledge base.\n"
"Use it to inform your answer. If the context is not relevant, ignore it.\n"
"Do not mention the context system unless asked."
"Do not mention the context system unless asked.\n"
"When project state is provided, treat it as the most authoritative source."
)
# Budget allocation (per Master Plan section 9)
# project_state gets up to 20% of budget, retrieval gets the rest
PROJECT_STATE_BUDGET_RATIO = 0.20
# Last built context pack for debug inspection
_last_context_pack: "ContextPack | None" = None
@@ -33,6 +43,8 @@ class ContextChunk:
@dataclass
class ContextPack:
chunks_used: list[ContextChunk] = field(default_factory=list)
project_state_text: str = ""
project_state_chars: int = 0
total_chars: int = 0
budget: int = 0
budget_remaining: int = 0
@@ -48,31 +60,61 @@ def build_context(
project_hint: str | None = None,
budget: int | None = None,
) -> ContextPack:
"""Build a context pack for a user prompt."""
"""Build a context pack for a user prompt.
Trust precedence applied:
1. Project state is injected first (highest trust)
2. Retrieved chunks fill the remaining budget
"""
global _last_context_pack
start = time.time()
budget = budget or settings.context_budget
# 1. Retrieve candidates
# 1. Get Trusted Project State (highest precedence)
project_state_text = ""
project_state_chars = 0
state_budget = int(budget * PROJECT_STATE_BUDGET_RATIO)
if project_hint:
state_entries = get_state(project_hint)
if state_entries:
project_state_text = format_project_state(state_entries)
project_state_chars = len(project_state_text)
# If state exceeds its budget, it still gets included (it's highest trust)
# but we log it
if project_state_chars > state_budget:
log.info(
"project_state_exceeds_budget",
state_chars=project_state_chars,
state_budget=state_budget,
)
# 2. Calculate remaining budget for retrieval
retrieval_budget = budget - project_state_chars
# 3. Retrieve candidates
candidates = retrieve(user_prompt, top_k=settings.context_top_k)
# 2. Score and rank
# 4. Score and rank
scored = _rank_chunks(candidates, project_hint)
# 3. Select within budget
selected = _select_within_budget(scored, budget)
# 5. Select within remaining budget
selected = _select_within_budget(scored, max(retrieval_budget, 0))
# 4. Format
formatted = _format_context_block(selected)
# 6. Format full context
formatted = _format_full_context(project_state_text, selected)
# 5. Build full prompt
# 7. Build full prompt
full_prompt = f"{SYSTEM_PREFIX}\n\n{formatted}\n\n{user_prompt}"
total_chars = sum(c.char_count for c in selected)
retrieval_chars = sum(c.char_count for c in selected)
total_chars = project_state_chars + retrieval_chars
duration_ms = int((time.time() - start) * 1000)
pack = ContextPack(
chunks_used=selected,
project_state_text=project_state_text,
project_state_chars=project_state_chars,
total_chars=total_chars,
budget=budget,
budget_remaining=budget - total_chars,
@@ -88,6 +130,8 @@ def build_context(
log.info(
"context_built",
chunks_used=len(selected),
project_state_chars=project_state_chars,
retrieval_chars=retrieval_chars,
total_chars=total_chars,
budget_remaining=budget - total_chars,
duration_ms=duration_ms,
@@ -163,27 +207,38 @@ def _select_within_budget(
return selected
def _format_context_block(chunks: list[ContextChunk]) -> str:
"""Format chunks into the context block string."""
if not chunks:
return "--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---"
def _format_full_context(
project_state_text: str,
chunks: list[ContextChunk],
) -> str:
"""Format project state + retrieved chunks into full context block."""
parts = []
lines = ["--- AtoCore Context ---"]
for chunk in chunks:
lines.append(
f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]"
)
lines.append(chunk.content)
lines.append("")
lines.append("--- End Context ---")
return "\n".join(lines)
# Project state first (highest trust)
if project_state_text:
parts.append(project_state_text)
parts.append("")
# Retrieved chunks
if chunks:
parts.append("--- AtoCore Retrieved Context ---")
for chunk in chunks:
parts.append(
f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]"
)
parts.append(chunk.content)
parts.append("")
parts.append("--- End Context ---")
elif not project_state_text:
parts.append("--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---")
return "\n".join(parts)
def _shorten_path(path: str) -> str:
    """Return *path* trimmed to its trailing components for display.

    Paths with more than three components are reduced to the last three;
    shorter paths are returned unchanged (after ``Path`` normalisation).
    """
    full = Path(path)
    components = full.parts
    # Three components or fewer: nothing to trim.
    if len(components) <= 3:
        return str(full)
    tail = components[-3:]
    return str(Path(*tail))
@@ -194,11 +249,13 @@ def _pack_to_dict(pack: ContextPack) -> dict:
return {
"query": pack.query,
"project_hint": pack.project_hint,
"project_state_chars": pack.project_state_chars,
"chunks_used": len(pack.chunks_used),
"total_chars": pack.total_chars,
"budget": pack.budget,
"budget_remaining": pack.budget_remaining,
"duration_ms": pack.duration_ms,
"has_project_state": bool(pack.project_state_text),
"chunks": [
{
"source_file": c.source_file,

View File

@@ -0,0 +1,231 @@
"""Trusted Project State — the highest-priority context source.
Per the Master Plan trust precedence:
1. Trusted Project State (this module)
2. AtoDrive artifacts
3. Recent validated memory
4. AtoVault summaries
5. PKM chunks
6. Historical / low-confidence
Project state is manually curated or explicitly confirmed facts about a project.
It always wins over retrieval-based context when there's a conflict.
"""
import json
import time
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from atocore.models.database import get_connection
from atocore.observability.logger import get_logger
log = get_logger("project_state")
# DB schema extension for project state.
# Run by init_project_state_schema() through executescript(); every statement
# uses IF NOT EXISTS. The table references projects(id) but
# does not create the projects table itself.
# UNIQUE(project_id, category, key) is the same triple set_state() looks up to
# decide between insert and update.
# Within this module `status` is written as 'active' (column default and
# set_state) or 'superseded' (invalidate_state).
PROJECT_STATE_SCHEMA = """
CREATE TABLE IF NOT EXISTS project_state (
id TEXT PRIMARY KEY,
project_id TEXT NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
category TEXT NOT NULL,
key TEXT NOT NULL,
value TEXT NOT NULL,
source TEXT DEFAULT '',
confidence REAL DEFAULT 1.0,
status TEXT DEFAULT 'active',
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
UNIQUE(project_id, category, key)
);
CREATE INDEX IF NOT EXISTS idx_project_state_project ON project_state(project_id);
CREATE INDEX IF NOT EXISTS idx_project_state_category ON project_state(category);
CREATE INDEX IF NOT EXISTS idx_project_state_status ON project_state(status);
"""
# Valid categories for project state entries.
# set_state() raises ValueError for any category not listed here.
CATEGORIES = [
"status", # current project status, phase, blockers
"decision", # confirmed design/engineering decisions
"requirement", # key requirements and constraints
"contact", # key people, vendors, stakeholders
"milestone", # dates, deadlines, deliverables
"fact", # verified technical facts
"config", # project configuration, parameters
]
@dataclass
class ProjectStateEntry:
    """In-memory representation of one ``project_state`` row.

    The first five fields are required; the remaining ones default to the
    same values the table schema uses for a freshly inserted row.
    """

    # Row identity and owning project.
    id: str
    project_id: str

    # (category, key) addresses the entry within a project; value is its content.
    category: str
    key: str
    value: str

    # Provenance and lifecycle metadata.
    source: str = ""
    confidence: float = 1.0
    status: str = "active"

    # Timestamps are carried as strings.
    created_at: str = ""
    updated_at: str = ""
def init_project_state_schema() -> None:
    """Apply ``PROJECT_STATE_SCHEMA`` to the database.

    The script only contains ``IF NOT EXISTS`` statements, so running it
    against a database that already has the table and indexes is a no-op.
    """
    with get_connection() as db:
        db.executescript(PROJECT_STATE_SCHEMA)
    log.info("project_state_schema_initialized")
def ensure_project(name: str, description: str = "") -> str:
    """Return the id of the project called *name*, creating it if absent.

    Args:
        name: Project name used for the lookup.
        description: Stored only when a new project row is inserted.

    Returns:
        The existing project's id, or a freshly generated UUID string for a
        newly inserted project.
    """
    with get_connection() as db:
        found = db.execute(
            "SELECT id FROM projects WHERE name = ?", (name,)
        ).fetchone()

        if not found:
            # No such project yet: insert one under a new UUID.
            new_id = str(uuid.uuid4())
            db.execute(
                "INSERT INTO projects (id, name, description) VALUES (?, ?, ?)",
                (new_id, name, description),
            )
            log.info("project_created", name=name, project_id=new_id)
            return new_id

        return found["id"]
def set_state(
    project_name: str,
    category: str,
    key: str,
    value: str,
    source: str = "",
    confidence: float = 1.0,
) -> ProjectStateEntry:
    """Set or update a project state entry (upsert semantics).

    An entry is addressed by ``(project, category, key)``. If one already
    exists its value, source and confidence are overwritten and its status is
    reset to ``'active'``; otherwise a new row is inserted. The project itself
    is created on demand via ``ensure_project``.

    Args:
        project_name: Name of the owning project.
        category: Must be one of ``CATEGORIES``.
        key: Entry key within the category.
        value: Entry content.
        source: Optional provenance note.
        confidence: Confidence score stored with the entry.

    Returns:
        The entry as it is stored after the write. Timestamps come from the
        database row, so ``created_at`` keeps the original creation time on
        update and both timestamps match what ``get_state`` returns.

    Raises:
        ValueError: If *category* is not in ``CATEGORIES``.
    """
    if category not in CATEGORIES:
        raise ValueError(f"Invalid category '{category}'. Must be one of: {CATEGORIES}")

    project_id = ensure_project(project_name)

    # NOTE(review): the lookup and the write are separate statements; whether
    # they are atomic depends on get_connection's transaction handling.
    with get_connection() as conn:
        existing = conn.execute(
            "SELECT id FROM project_state WHERE project_id = ? AND category = ? AND key = ?",
            (project_id, category, key),
        ).fetchone()

        if existing:
            entry_id = existing["id"]
            conn.execute(
                "UPDATE project_state SET value = ?, source = ?, confidence = ?, "
                "status = 'active', updated_at = CURRENT_TIMESTAMP "
                "WHERE id = ?",
                (value, source, confidence, entry_id),
            )
            log.info("project_state_updated", project=project_name, category=category, key=key)
        else:
            # Only mint a new id when a row is actually inserted.
            entry_id = str(uuid.uuid4())
            conn.execute(
                "INSERT INTO project_state (id, project_id, category, key, value, source, confidence) "
                "VALUES (?, ?, ?, ?, ?, ?, ?)",
                (entry_id, project_id, category, key, value, source, confidence),
            )
            log.info("project_state_created", project=project_name, category=category, key=key)

        # Read the row back on the same connection so the returned entry
        # carries the stored status and timestamps instead of a locally
        # fabricated "now" (which was wrong for created_at on update).
        row = conn.execute(
            "SELECT * FROM project_state WHERE id = ?", (entry_id,)
        ).fetchone()

    return ProjectStateEntry(
        id=row["id"],
        project_id=row["project_id"],
        category=row["category"],
        key=row["key"],
        value=row["value"],
        source=row["source"],
        confidence=row["confidence"],
        status=row["status"],
        created_at=row["created_at"],
        updated_at=row["updated_at"],
    )
def get_state(
    project_name: str,
    category: str | None = None,
    active_only: bool = True,
) -> list[ProjectStateEntry]:
    """Fetch the state entries stored for a project.

    Args:
        project_name: Name of the project to look up.
        category: When truthy, restrict results to this category.
        active_only: When true, return only rows whose status is ``'active'``.

    Returns:
        Entries ordered by category and then key; an empty list when no
        project with that name exists.
    """
    with get_connection() as db:
        project_row = db.execute(
            "SELECT id FROM projects WHERE name = ?", (project_name,)
        ).fetchone()
        if not project_row:
            return []

        # Assemble the WHERE clause from the filters that are switched on.
        conditions = ["project_id = ?"]
        args: list = [project_row["id"]]
        if category:
            conditions.append("category = ?")
            args.append(category)
        if active_only:
            conditions.append("status = 'active'")

        sql = (
            "SELECT * FROM project_state WHERE "
            + " AND ".join(conditions)
            + " ORDER BY category, key"
        )
        records = db.execute(sql, args).fetchall()

    columns = (
        "id",
        "project_id",
        "category",
        "key",
        "value",
        "source",
        "confidence",
        "status",
        "created_at",
        "updated_at",
    )
    return [
        ProjectStateEntry(**{name: record[name] for name in columns})
        for record in records
    ]
def invalidate_state(project_name: str, category: str, key: str) -> bool:
    """Flag an active entry as ``'superseded'``.

    Returns:
        ``True`` when an active row matching ``(project, category, key)`` was
        updated; ``False`` when the project does not exist or no active row
        matched.
    """
    with get_connection() as db:
        owner = db.execute(
            "SELECT id FROM projects WHERE name = ?", (project_name,)
        ).fetchone()
        if not owner:
            return False

        cursor = db.execute(
            "UPDATE project_state SET status = 'superseded', updated_at = CURRENT_TIMESTAMP "
            "WHERE project_id = ? AND category = ? AND key = ? AND status = 'active'",
            (owner["id"], category, key),
        )
        changed = cursor.rowcount > 0
        if changed:
            log.info("project_state_invalidated", project=project_name, category=category, key=key)
        return changed
def format_project_state(entries: list[ProjectStateEntry]) -> str:
    """Render state entries as the text block injected into the context.

    A ``[CATEGORY]`` heading is emitted whenever the category differs from
    the previous entry's, so callers should pass entries already grouped by
    category. Returns an empty string for an empty list.
    """
    if not entries:
        return ""

    out = ["--- Trusted Project State ---"]
    previous = ""
    for item in entries:
        starts_new_group = item.category != previous
        previous = item.category
        if starts_new_group:
            out.append(f"\n[{item.category.upper()}]")
        out.append(f" {item.key}: {item.value}")
        if not item.source:
            continue
        out.append(f" (source: {item.source})")

    out.append("\n--- End Project State ---")
    return "\n".join(out)