"""Context pack assembly: retrieve, rank, budget, format. Trust precedence (per Master Plan): 1. Trusted Project State → always included first, uses its own budget slice 2. Retrieved chunks → ranked, deduplicated, budget-constrained """ import time from dataclasses import dataclass, field from pathlib import Path from atocore.config import settings from atocore.context.project_state import format_project_state, get_state from atocore.observability.logger import get_logger from atocore.retrieval.retriever import ChunkResult, retrieve log = get_logger("context_builder") SYSTEM_PREFIX = ( "You have access to the following personal context from the user's knowledge base.\n" "Use it to inform your answer. If the context is not relevant, ignore it.\n" "Do not mention the context system unless asked.\n" "When project state is provided, treat it as the most authoritative source." ) # Budget allocation (per Master Plan section 9) # project_state gets up to 20% of budget, retrieval gets the rest PROJECT_STATE_BUDGET_RATIO = 0.20 # Last built context pack for debug inspection _last_context_pack: "ContextPack | None" = None @dataclass class ContextChunk: content: str source_file: str heading_path: str score: float char_count: int @dataclass class ContextPack: chunks_used: list[ContextChunk] = field(default_factory=list) project_state_text: str = "" project_state_chars: int = 0 total_chars: int = 0 budget: int = 0 budget_remaining: int = 0 formatted_context: str = "" full_prompt: str = "" query: str = "" project_hint: str = "" duration_ms: int = 0 def build_context( user_prompt: str, project_hint: str | None = None, budget: int | None = None, ) -> ContextPack: """Build a context pack for a user prompt. Trust precedence applied: 1. Project state is injected first (highest trust) 2. Retrieved chunks fill the remaining budget """ global _last_context_pack start = time.time() budget = budget or settings.context_budget # 1. Get Trusted Project State (highest precedence) project_state_text = "" project_state_chars = 0 state_budget = int(budget * PROJECT_STATE_BUDGET_RATIO) if project_hint: state_entries = get_state(project_hint) if state_entries: project_state_text = format_project_state(state_entries) project_state_chars = len(project_state_text) # If state exceeds its budget, it still gets included (it's highest trust) # but we log it if project_state_chars > state_budget: log.info( "project_state_exceeds_budget", state_chars=project_state_chars, state_budget=state_budget, ) # 2. Calculate remaining budget for retrieval retrieval_budget = budget - project_state_chars # 3. Retrieve candidates candidates = retrieve(user_prompt, top_k=settings.context_top_k) # 4. Score and rank scored = _rank_chunks(candidates, project_hint) # 5. Select within remaining budget selected = _select_within_budget(scored, max(retrieval_budget, 0)) # 6. Format full context formatted = _format_full_context(project_state_text, selected) # 7. Build full prompt full_prompt = f"{SYSTEM_PREFIX}\n\n{formatted}\n\n{user_prompt}" retrieval_chars = sum(c.char_count for c in selected) total_chars = project_state_chars + retrieval_chars duration_ms = int((time.time() - start) * 1000) pack = ContextPack( chunks_used=selected, project_state_text=project_state_text, project_state_chars=project_state_chars, total_chars=total_chars, budget=budget, budget_remaining=budget - total_chars, formatted_context=formatted, full_prompt=full_prompt, query=user_prompt, project_hint=project_hint or "", duration_ms=duration_ms, ) _last_context_pack = pack log.info( "context_built", chunks_used=len(selected), project_state_chars=project_state_chars, retrieval_chars=retrieval_chars, total_chars=total_chars, budget_remaining=budget - total_chars, duration_ms=duration_ms, ) log.debug("context_pack_detail", pack=_pack_to_dict(pack)) return pack def get_last_context_pack() -> ContextPack | None: """Return the last built context pack for debug inspection.""" return _last_context_pack def _rank_chunks( candidates: list[ChunkResult], project_hint: str | None, ) -> list[tuple[float, ChunkResult]]: """Rank candidates with boosting for project match.""" scored = [] seen_content: set[str] = set() for chunk in candidates: # Deduplicate by content prefix (first 200 chars) content_key = chunk.content[:200] if content_key in seen_content: continue seen_content.add(content_key) # Base score from similarity final_score = chunk.score # Project boost if project_hint: tags_str = chunk.tags.lower() if chunk.tags else "" source_str = chunk.source_file.lower() title_str = chunk.title.lower() if chunk.title else "" hint_lower = project_hint.lower() if hint_lower in tags_str or hint_lower in source_str or hint_lower in title_str: final_score *= 1.3 scored.append((final_score, chunk)) # Sort by score descending scored.sort(key=lambda x: x[0], reverse=True) return scored def _select_within_budget( scored: list[tuple[float, ChunkResult]], budget: int, ) -> list[ContextChunk]: """Select top chunks that fit within the character budget.""" selected = [] used = 0 for score, chunk in scored: chunk_len = len(chunk.content) if used + chunk_len > budget: continue selected.append( ContextChunk( content=chunk.content, source_file=_shorten_path(chunk.source_file), heading_path=chunk.heading_path, score=score, char_count=chunk_len, ) ) used += chunk_len return selected def _format_full_context( project_state_text: str, chunks: list[ContextChunk], ) -> str: """Format project state + retrieved chunks into full context block.""" parts = [] # Project state first (highest trust) if project_state_text: parts.append(project_state_text) parts.append("") # Retrieved chunks if chunks: parts.append("--- AtoCore Retrieved Context ---") for chunk in chunks: parts.append( f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]" ) parts.append(chunk.content) parts.append("") parts.append("--- End Context ---") elif not project_state_text: parts.append("--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---") return "\n".join(parts) def _shorten_path(path: str) -> str: """Shorten an absolute path to a relative-like display.""" p = Path(path) parts = p.parts if len(parts) > 3: return str(Path(*parts[-3:])) return str(p) def _pack_to_dict(pack: ContextPack) -> dict: """Convert a context pack to a JSON-serializable dict.""" return { "query": pack.query, "project_hint": pack.project_hint, "project_state_chars": pack.project_state_chars, "chunks_used": len(pack.chunks_used), "total_chars": pack.total_chars, "budget": pack.budget, "budget_remaining": pack.budget_remaining, "duration_ms": pack.duration_ms, "has_project_state": bool(pack.project_state_text), "chunks": [ { "source_file": c.source_file, "heading_path": c.heading_path, "score": c.score, "char_count": c.char_count, "content_preview": c.content[:100], } for c in pack.chunks_used ], }