feat: Phase 1 ingestion hardening + Phase 5 Trusted Project State
Phase 1 - Ingestion hardening:
- Encoding fallback (UTF-8/UTF-8-sig/Latin-1/CP1252)
- Delete detection: purge DB/vector entries for removed files
- Ingestion stats endpoint (GET /stats)

Phase 5 - Trusted Project State:
- project_state table with categories (status, decision, requirement, contact, milestone, fact, config)
- CRUD API: POST/GET/DELETE /project/state
- Upsert semantics, invalidation (supersede) support
- Context builder integrates project state at highest trust precedence
- Project state gets 20% budget allocation, appears first in context
- Trust precedence: Project State > Retrieved Chunks (per Master Plan)

33/33 tests passing. Validated end-to-end with GigaBIT M1 project data.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,11 +1,16 @@
|
||||
"""Context pack assembly: retrieve, rank, budget, format."""
|
||||
"""Context pack assembly: retrieve, rank, budget, format.
|
||||
|
||||
Trust precedence (per Master Plan):
|
||||
1. Trusted Project State → always included first, uses its own budget slice
|
||||
2. Retrieved chunks → ranked, deduplicated, budget-constrained
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
from atocore.config import settings
|
||||
from atocore.context.project_state import format_project_state, get_state
|
||||
from atocore.observability.logger import get_logger
|
||||
from atocore.retrieval.retriever import ChunkResult, retrieve
|
||||
|
||||
@@ -14,9 +19,14 @@ log = get_logger("context_builder")
|
||||
SYSTEM_PREFIX = (
|
||||
"You have access to the following personal context from the user's knowledge base.\n"
|
||||
"Use it to inform your answer. If the context is not relevant, ignore it.\n"
|
||||
"Do not mention the context system unless asked."
|
||||
"Do not mention the context system unless asked.\n"
|
||||
"When project state is provided, treat it as the most authoritative source."
|
||||
)
|
||||
|
||||
# Budget allocation (per Master Plan section 9)
|
||||
# project_state gets up to 20% of budget, retrieval gets the rest
|
||||
PROJECT_STATE_BUDGET_RATIO = 0.20
|
||||
|
||||
# Last built context pack for debug inspection
|
||||
_last_context_pack: "ContextPack | None" = None
|
||||
|
||||
@@ -33,6 +43,8 @@ class ContextChunk:
|
||||
@dataclass
|
||||
class ContextPack:
|
||||
chunks_used: list[ContextChunk] = field(default_factory=list)
|
||||
project_state_text: str = ""
|
||||
project_state_chars: int = 0
|
||||
total_chars: int = 0
|
||||
budget: int = 0
|
||||
budget_remaining: int = 0
|
||||
@@ -48,31 +60,61 @@ def build_context(
|
||||
project_hint: str | None = None,
|
||||
budget: int | None = None,
|
||||
) -> ContextPack:
|
||||
"""Build a context pack for a user prompt."""
|
||||
"""Build a context pack for a user prompt.
|
||||
|
||||
Trust precedence applied:
|
||||
1. Project state is injected first (highest trust)
|
||||
2. Retrieved chunks fill the remaining budget
|
||||
"""
|
||||
global _last_context_pack
|
||||
start = time.time()
|
||||
budget = budget or settings.context_budget
|
||||
|
||||
# 1. Retrieve candidates
|
||||
# 1. Get Trusted Project State (highest precedence)
|
||||
project_state_text = ""
|
||||
project_state_chars = 0
|
||||
state_budget = int(budget * PROJECT_STATE_BUDGET_RATIO)
|
||||
|
||||
if project_hint:
|
||||
state_entries = get_state(project_hint)
|
||||
if state_entries:
|
||||
project_state_text = format_project_state(state_entries)
|
||||
project_state_chars = len(project_state_text)
|
||||
# If state exceeds its budget, it still gets included (it's highest trust)
|
||||
# but we log it
|
||||
if project_state_chars > state_budget:
|
||||
log.info(
|
||||
"project_state_exceeds_budget",
|
||||
state_chars=project_state_chars,
|
||||
state_budget=state_budget,
|
||||
)
|
||||
|
||||
# 2. Calculate remaining budget for retrieval
|
||||
retrieval_budget = budget - project_state_chars
|
||||
|
||||
# 3. Retrieve candidates
|
||||
candidates = retrieve(user_prompt, top_k=settings.context_top_k)
|
||||
|
||||
# 2. Score and rank
|
||||
# 4. Score and rank
|
||||
scored = _rank_chunks(candidates, project_hint)
|
||||
|
||||
# 3. Select within budget
|
||||
selected = _select_within_budget(scored, budget)
|
||||
# 5. Select within remaining budget
|
||||
selected = _select_within_budget(scored, max(retrieval_budget, 0))
|
||||
|
||||
# 4. Format
|
||||
formatted = _format_context_block(selected)
|
||||
# 6. Format full context
|
||||
formatted = _format_full_context(project_state_text, selected)
|
||||
|
||||
# 5. Build full prompt
|
||||
# 7. Build full prompt
|
||||
full_prompt = f"{SYSTEM_PREFIX}\n\n{formatted}\n\n{user_prompt}"
|
||||
|
||||
total_chars = sum(c.char_count for c in selected)
|
||||
retrieval_chars = sum(c.char_count for c in selected)
|
||||
total_chars = project_state_chars + retrieval_chars
|
||||
duration_ms = int((time.time() - start) * 1000)
|
||||
|
||||
pack = ContextPack(
|
||||
chunks_used=selected,
|
||||
project_state_text=project_state_text,
|
||||
project_state_chars=project_state_chars,
|
||||
total_chars=total_chars,
|
||||
budget=budget,
|
||||
budget_remaining=budget - total_chars,
|
||||
@@ -88,6 +130,8 @@ def build_context(
|
||||
log.info(
|
||||
"context_built",
|
||||
chunks_used=len(selected),
|
||||
project_state_chars=project_state_chars,
|
||||
retrieval_chars=retrieval_chars,
|
||||
total_chars=total_chars,
|
||||
budget_remaining=budget - total_chars,
|
||||
duration_ms=duration_ms,
|
||||
@@ -163,27 +207,38 @@ def _select_within_budget(
|
||||
return selected
|
||||
|
||||
|
||||
def _format_context_block(chunks: list[ContextChunk]) -> str:
|
||||
"""Format chunks into the context block string."""
|
||||
if not chunks:
|
||||
return "--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---"
|
||||
def _format_full_context(
|
||||
project_state_text: str,
|
||||
chunks: list[ContextChunk],
|
||||
) -> str:
|
||||
"""Format project state + retrieved chunks into full context block."""
|
||||
parts = []
|
||||
|
||||
lines = ["--- AtoCore Context ---"]
|
||||
for chunk in chunks:
|
||||
lines.append(
|
||||
f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]"
|
||||
)
|
||||
lines.append(chunk.content)
|
||||
lines.append("")
|
||||
lines.append("--- End Context ---")
|
||||
return "\n".join(lines)
|
||||
# Project state first (highest trust)
|
||||
if project_state_text:
|
||||
parts.append(project_state_text)
|
||||
parts.append("")
|
||||
|
||||
# Retrieved chunks
|
||||
if chunks:
|
||||
parts.append("--- AtoCore Retrieved Context ---")
|
||||
for chunk in chunks:
|
||||
parts.append(
|
||||
f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]"
|
||||
)
|
||||
parts.append(chunk.content)
|
||||
parts.append("")
|
||||
parts.append("--- End Context ---")
|
||||
elif not project_state_text:
|
||||
parts.append("--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---")
|
||||
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
def _shorten_path(path: str) -> str:
    """Return a compact display form of *path*.

    Paths with more than three components are reduced to their final three
    components; anything shorter is returned as ``str(Path(path))``.
    """
    full = Path(path)
    components = full.parts
    if len(components) <= 3:
        return str(full)
    # Keep only the trailing three components for display.
    return str(Path(*components[-3:]))
|
||||
@@ -194,11 +249,13 @@ def _pack_to_dict(pack: ContextPack) -> dict:
|
||||
return {
|
||||
"query": pack.query,
|
||||
"project_hint": pack.project_hint,
|
||||
"project_state_chars": pack.project_state_chars,
|
||||
"chunks_used": len(pack.chunks_used),
|
||||
"total_chars": pack.total_chars,
|
||||
"budget": pack.budget,
|
||||
"budget_remaining": pack.budget_remaining,
|
||||
"duration_ms": pack.duration_ms,
|
||||
"has_project_state": bool(pack.project_state_text),
|
||||
"chunks": [
|
||||
{
|
||||
"source_file": c.source_file,
|
||||
|
||||
231
src/atocore/context/project_state.py
Normal file
231
src/atocore/context/project_state.py
Normal file
@@ -0,0 +1,231 @@
|
||||
"""Trusted Project State — the highest-priority context source.
|
||||
|
||||
Per the Master Plan trust precedence:
|
||||
1. Trusted Project State (this module)
|
||||
2. AtoDrive artifacts
|
||||
3. Recent validated memory
|
||||
4. AtoVault summaries
|
||||
5. PKM chunks
|
||||
6. Historical / low-confidence
|
||||
|
||||
Project state is manually curated or explicitly confirmed facts about a project.
|
||||
It always wins over retrieval-based context when there's a conflict.
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from atocore.models.database import get_connection
|
||||
from atocore.observability.logger import get_logger
|
||||
|
||||
log = get_logger("project_state")

# DB schema extension for project state.
# The UNIQUE(project_id, category, key) constraint allows at most one row per
# key within a project category. Rows reference projects(id) with
# ON DELETE CASCADE. Every statement uses IF NOT EXISTS, so the script can be
# executed repeatedly.
PROJECT_STATE_SCHEMA = """
CREATE TABLE IF NOT EXISTS project_state (
    id TEXT PRIMARY KEY,
    project_id TEXT NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
    category TEXT NOT NULL,
    key TEXT NOT NULL,
    value TEXT NOT NULL,
    source TEXT DEFAULT '',
    confidence REAL DEFAULT 1.0,
    status TEXT DEFAULT 'active',
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(project_id, category, key)
);

CREATE INDEX IF NOT EXISTS idx_project_state_project ON project_state(project_id);
CREATE INDEX IF NOT EXISTS idx_project_state_category ON project_state(category);
CREATE INDEX IF NOT EXISTS idx_project_state_status ON project_state(status);
"""

# Valid categories for project state entries.
# set_state() rejects any category that is not listed here.
CATEGORIES = [
    "status",  # current project status, phase, blockers
    "decision",  # confirmed design/engineering decisions
    "requirement",  # key requirements and constraints
    "contact",  # key people, vendors, stakeholders
    "milestone",  # dates, deadlines, deliverables
    "fact",  # verified technical facts
    "config",  # project configuration, parameters
]
|
||||
|
||||
|
||||
@dataclass
class ProjectStateEntry:
    """One project state entry, mirroring a row of the ``project_state`` table."""

    id: str  # primary key; set_state() generates a UUID4 string for new rows
    project_id: str  # id of the owning row in ``projects``
    category: str  # set_state() only accepts values listed in CATEGORIES
    key: str  # entry name; unique per (project_id, category) in the schema
    value: str
    source: str = ""  # optional provenance; printed by format_project_state()
    confidence: float = 1.0
    status: str = "active"  # "active", or "superseded" after invalidate_state()
    created_at: str = ""
    updated_at: str = ""
|
||||
|
||||
|
||||
def init_project_state_schema() -> None:
    """Apply ``PROJECT_STATE_SCHEMA`` to the database.

    Every statement in the script uses ``IF NOT EXISTS``, so running this
    more than once leaves an existing table and its indexes untouched.
    """
    with get_connection() as db:
        db.executescript(PROJECT_STATE_SCHEMA)
    log.info("project_state_schema_initialized")
|
||||
|
||||
|
||||
def ensure_project(name: str, description: str = "") -> str:
    """Look up a project by name, creating it when it does not exist.

    Args:
        name: Project name matched against ``projects.name``.
        description: Stored only when a new project row is inserted; an
            existing project's description is not modified.

    Returns:
        The id of the existing or newly created project.
    """
    with get_connection() as db:
        found = db.execute(
            "SELECT id FROM projects WHERE name = ?", (name,)
        ).fetchone()

        if not found:
            # Unknown project: insert it under a fresh UUID.
            new_id = str(uuid.uuid4())
            db.execute(
                "INSERT INTO projects (id, name, description) VALUES (?, ?, ?)",
                (new_id, name, description),
            )
            log.info("project_created", name=name, project_id=new_id)
            return new_id

        return found["id"]
|
||||
|
||||
|
||||
def set_state(
    project_name: str,
    category: str,
    key: str,
    value: str,
    source: str = "",
    confidence: float = 1.0,
) -> ProjectStateEntry:
    """Create or update one project state entry (upsert semantics).

    The project is created on demand via ``ensure_project``. If an entry with
    the same (project, category, key) already exists, its value, source and
    confidence are overwritten and its status is reset to ``'active'``, which
    also revives an entry that was previously superseded. Otherwise a new row
    is inserted.

    Args:
        project_name: Name of the owning project.
        category: Must be one of ``CATEGORIES``.
        key: Entry name within the category.
        value: Entry value.
        source: Optional provenance note.
        confidence: Confidence stored with the entry.

    Returns:
        The entry exactly as stored in the database, including the stored
        ``created_at`` / ``updated_at`` values.

    Raises:
        ValueError: If ``category`` is not in ``CATEGORIES``.
    """
    if category not in CATEGORIES:
        raise ValueError(f"Invalid category '{category}'. Must be one of: {CATEGORIES}")

    project_id = ensure_project(project_name)

    with get_connection() as conn:
        # Check if entry exists
        existing = conn.execute(
            "SELECT id FROM project_state WHERE project_id = ? AND category = ? AND key = ?",
            (project_id, category, key),
        ).fetchone()

        if existing:
            entry_id = existing["id"]
            conn.execute(
                "UPDATE project_state SET value = ?, source = ?, confidence = ?, "
                "status = 'active', updated_at = CURRENT_TIMESTAMP "
                "WHERE id = ?",
                (value, source, confidence, entry_id),
            )
            log.info("project_state_updated", project=project_name, category=category, key=key)
        else:
            # Only mint a new id when a row is actually being created.
            entry_id = str(uuid.uuid4())
            conn.execute(
                "INSERT INTO project_state (id, project_id, category, key, value, source, confidence) "
                "VALUES (?, ?, ?, ?, ?, ?, ?)",
                (entry_id, project_id, category, key, value, source, confidence),
            )
            log.info("project_state_created", project=project_name, category=category, key=key)

        # Read the row back so the returned entry carries the timestamps the
        # database actually holds: the original created_at on update, and the
        # same CURRENT_TIMESTAMP text format that get_state() returns.
        row = conn.execute(
            "SELECT * FROM project_state WHERE id = ?", (entry_id,)
        ).fetchone()

    return ProjectStateEntry(
        id=row["id"],
        project_id=row["project_id"],
        category=row["category"],
        key=row["key"],
        value=row["value"],
        source=row["source"],
        confidence=row["confidence"],
        status=row["status"],
        created_at=row["created_at"],
        updated_at=row["updated_at"],
    )
|
||||
|
||||
|
||||
def get_state(
    project_name: str,
    category: str | None = None,
    active_only: bool = True,
) -> list[ProjectStateEntry]:
    """Fetch the state entries stored for a project.

    Args:
        project_name: Name of the project to look up.
        category: When given (and non-empty), only entries of this category
            are returned.
        active_only: When true, only entries whose status is ``'active'``
            are returned.

    Returns:
        Matching entries ordered by category, then key. An empty list is
        returned when no project with that name exists.
    """
    with get_connection() as db:
        owner = db.execute(
            "SELECT id FROM projects WHERE name = ?", (project_name,)
        ).fetchone()
        if not owner:
            return []

        # Assemble the statement from fixed fragments; values go in as parameters.
        fragments = ["SELECT * FROM project_state WHERE project_id = ?"]
        args: list = [owner["id"]]
        if category:
            fragments.append(" AND category = ?")
            args.append(category)
        if active_only:
            fragments.append(" AND status = 'active'")
        fragments.append(" ORDER BY category, key")

        records = db.execute("".join(fragments), args).fetchall()

    columns = (
        "id",
        "project_id",
        "category",
        "key",
        "value",
        "source",
        "confidence",
        "status",
        "created_at",
        "updated_at",
    )
    return [
        ProjectStateEntry(**{column: record[column] for column in columns})
        for record in records
    ]
|
||||
|
||||
|
||||
def invalidate_state(project_name: str, category: str, key: str) -> bool:
    """Flag an active project state entry as ``'superseded'``.

    Args:
        project_name: Name of the owning project.
        category: Category of the entry.
        key: Key of the entry.

    Returns:
        True when an active entry was found and marked superseded; False when
        the project does not exist or no active entry matched.
    """
    with get_connection() as db:
        owner = db.execute(
            "SELECT id FROM projects WHERE name = ?", (project_name,)
        ).fetchone()
        if not owner:
            return False

        cursor = db.execute(
            "UPDATE project_state SET status = 'superseded', updated_at = CURRENT_TIMESTAMP "
            "WHERE project_id = ? AND category = ? AND key = ? AND status = 'active'",
            (owner["id"], category, key),
        )
        # rowcount says whether an active row was actually flipped.
        changed = cursor.rowcount > 0
        if changed:
            log.info("project_state_invalidated", project=project_name, category=category, key=key)
        return changed
|
||||
|
||||
|
||||
def format_project_state(entries: list["ProjectStateEntry"]) -> str:
    """Render project state entries as a text block for context injection.

    Entries are grouped under one ``[CATEGORY]`` header per category, in the
    order each category first appears. Within a category, entries keep their
    input order. For input already sorted by category the output is the same
    as emitting a header on every category change, but unsorted input no
    longer produces repeated headers.

    Args:
        entries: Entries to render; each needs ``category``, ``key``,
            ``value`` and ``source`` attributes.

    Returns:
        The formatted block, or an empty string when ``entries`` is empty.
    """
    if not entries:
        return ""

    # dicts preserve insertion order, so categories come out in first-seen order.
    grouped: dict[str, list] = {}
    for entry in entries:
        grouped.setdefault(entry.category, []).append(entry)

    lines = ["--- Trusted Project State ---"]
    for category, members in grouped.items():
        lines.append(f"\n[{category.upper()}]")
        for entry in members:
            lines.append(f" {entry.key}: {entry.value}")
            if entry.source:
                lines.append(f" (source: {entry.source})")

    lines.append("\n--- End Project State ---")
    return "\n".join(lines)
|
||||
Reference in New Issue
Block a user