src/atocore/context/builder.py

"""Context pack assembly: retrieve, rank, budget, format."""

import json
import time
from dataclasses import dataclass, field
from pathlib import Path

from atocore.config import settings
from atocore.observability.logger import get_logger
from atocore.retrieval.retriever import ChunkResult, retrieve

log = get_logger("context_builder")

SYSTEM_PREFIX = (
    "You have access to the following personal context from the user's knowledge base.\n"
    "Use it to inform your answer. If the context is not relevant, ignore it.\n"
    "Do not mention the context system unless asked."
)

# Last built context pack for debug inspection
_last_context_pack: "ContextPack | None" = None


@dataclass
class ContextChunk:
    content: str
    source_file: str
    heading_path: str
    score: float
    char_count: int


@dataclass
class ContextPack:
    chunks_used: list[ContextChunk] = field(default_factory=list)
    total_chars: int = 0
    budget: int = 0
    budget_remaining: int = 0
    formatted_context: str = ""
    full_prompt: str = ""
    query: str = ""
    project_hint: str = ""
    duration_ms: int = 0


def build_context(
    user_prompt: str,
    project_hint: str | None = None,
    budget: int | None = None,
) -> ContextPack:
    """Build a context pack for a user prompt."""
    global _last_context_pack
    start = time.time()
    budget = budget or settings.context_budget

    # 1. Retrieve candidates
    candidates = retrieve(user_prompt, top_k=settings.context_top_k)

    # 2. Score and rank
    scored = _rank_chunks(candidates, project_hint)

    # 3. Select within budget
    selected = _select_within_budget(scored, budget)

    # 4. Format
    formatted = _format_context_block(selected)

    # 5. Build full prompt
    full_prompt = f"{SYSTEM_PREFIX}\n\n{formatted}\n\n{user_prompt}"

    total_chars = sum(c.char_count for c in selected)
    duration_ms = int((time.time() - start) * 1000)

    pack = ContextPack(
        chunks_used=selected,
        total_chars=total_chars,
        budget=budget,
        budget_remaining=budget - total_chars,
        formatted_context=formatted,
        full_prompt=full_prompt,
        query=user_prompt,
        project_hint=project_hint or "",
        duration_ms=duration_ms,
    )

    _last_context_pack = pack

    log.info(
        "context_built",
        chunks_used=len(selected),
        total_chars=total_chars,
        budget_remaining=budget - total_chars,
        duration_ms=duration_ms,
    )
    log.debug("context_pack_detail", pack=_pack_to_dict(pack))

    return pack


def get_last_context_pack() -> ContextPack | None:
    """Return the last built context pack for debug inspection."""
    return _last_context_pack


def _rank_chunks(
    candidates: list[ChunkResult],
    project_hint: str | None,
) -> list[tuple[float, ChunkResult]]:
    """Rank candidates with boosting for project match."""
    scored = []
    seen_content: set[str] = set()

    for chunk in candidates:
        # Deduplicate by content prefix (first 200 chars)
        content_key = chunk.content[:200]
        if content_key in seen_content:
            continue
        seen_content.add(content_key)

        # Base score from similarity
        final_score = chunk.score

        # Project boost
        if project_hint:
            tags_str = chunk.tags.lower() if chunk.tags else ""
            source_str = chunk.source_file.lower()
            title_str = chunk.title.lower() if chunk.title else ""
            hint_lower = project_hint.lower()

            if hint_lower in tags_str or hint_lower in source_str or hint_lower in title_str:
                final_score += 0.3

        scored.append((final_score, chunk))

    # Sort by score descending
    scored.sort(key=lambda x: x[0], reverse=True)
    return scored


def _select_within_budget(
    scored: list[tuple[float, ChunkResult]],
    budget: int,
) -> list[ContextChunk]:
    """Select top chunks that fit within the character budget."""
    selected = []
    used = 0

    for score, chunk in scored:
        chunk_len = len(chunk.content)
        if used + chunk_len > budget:
            continue
        selected.append(
            ContextChunk(
                content=chunk.content,
                source_file=_shorten_path(chunk.source_file),
                heading_path=chunk.heading_path,
                score=score,
                char_count=chunk_len,
            )
        )
        used += chunk_len

    return selected


def _format_context_block(chunks: list[ContextChunk]) -> str:
    """Format chunks into the context block string."""
    if not chunks:
        return "--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---"

    lines = ["--- AtoCore Context ---"]
    for chunk in chunks:
        lines.append(
            f"[Source: {chunk.source_file} | Section: {chunk.heading_path} | Score: {chunk.score:.2f}]"
        )
        lines.append(chunk.content)
        lines.append("")
    lines.append("--- End Context ---")
    return "\n".join(lines)


def _shorten_path(path: str) -> str:
    """Shorten an absolute path to a relative-like display."""
    p = Path(path)
    parts = p.parts
    # Show last 3 parts at most
    if len(parts) > 3:
        return str(Path(*parts[-3:]))
    return str(p)


def _pack_to_dict(pack: ContextPack) -> dict:
    """Convert a context pack to a JSON-serializable dict."""
    return {
        "query": pack.query,
        "project_hint": pack.project_hint,
        "chunks_used": len(pack.chunks_used),
        "total_chars": pack.total_chars,
        "budget": pack.budget,
        "budget_remaining": pack.budget_remaining,
        "duration_ms": pack.duration_ms,
        "chunks": [
            {
                "source_file": c.source_file,
                "heading_path": c.heading_path,
                "score": c.score,
                "char_count": c.char_count,
                "content_preview": c.content[:100],
            }
            for c in pack.chunks_used
        ],
    }
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC) Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (documents, chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-05 09:21:27 -04:00			`"""Context pack assembly: retrieve, rank, budget, format."""`

			`import json`
			`import time`
			`from dataclasses import dataclass, field`
			`from pathlib import Path`

			`from atocore.config import settings`
			`from atocore.observability.logger import get_logger`
			`from atocore.retrieval.retriever import ChunkResult, retrieve`

			`log = get_logger("context_builder")`

			`SYSTEM_PREFIX = (`
			`"You have access to the following personal context from the user's knowledge base.\n"`
			`"Use it to inform your answer. If the context is not relevant, ignore it.\n"`
			`"Do not mention the context system unless asked."`
			`)`

			`# Last built context pack for debug inspection`
			`_last_context_pack: "ContextPack \| None" = None`


			`@dataclass`
			`class ContextChunk:`
			`content: str`
			`source_file: str`
			`heading_path: str`
			`score: float`
			`char_count: int`


			`@dataclass`
			`class ContextPack:`
			`chunks_used: list[ContextChunk] = field(default_factory=list)`
			`total_chars: int = 0`
			`budget: int = 0`
			`budget_remaining: int = 0`
			`formatted_context: str = ""`
			`full_prompt: str = ""`
			`query: str = ""`
			`project_hint: str = ""`
			`duration_ms: int = 0`


			`def build_context(`
			`user_prompt: str,`
			`project_hint: str \| None = None,`
			`budget: int \| None = None,`
			`) -> ContextPack:`
			`"""Build a context pack for a user prompt."""`
			`global _last_context_pack`
			`start = time.time()`
			`budget = budget or settings.context_budget`

			`# 1. Retrieve candidates`
			`candidates = retrieve(user_prompt, top_k=settings.context_top_k)`

			`# 2. Score and rank`
			`scored = _rank_chunks(candidates, project_hint)`

			`# 3. Select within budget`
			`selected = _select_within_budget(scored, budget)`

			`# 4. Format`
			`formatted = _format_context_block(selected)`

			`# 5. Build full prompt`
			`full_prompt = f"{SYSTEM_PREFIX}\n\n{formatted}\n\n{user_prompt}"`

			`total_chars = sum(c.char_count for c in selected)`
			`duration_ms = int((time.time() - start) * 1000)`

			`pack = ContextPack(`
			`chunks_used=selected,`
			`total_chars=total_chars,`
			`budget=budget,`
			`budget_remaining=budget - total_chars,`
			`formatted_context=formatted,`
			`full_prompt=full_prompt,`
			`query=user_prompt,`
			`project_hint=project_hint or "",`
			`duration_ms=duration_ms,`
			`)`

			`_last_context_pack = pack`

			`log.info(`
			`"context_built",`
			`chunks_used=len(selected),`
			`total_chars=total_chars,`
			`budget_remaining=budget - total_chars,`
			`duration_ms=duration_ms,`
			`)`
			`log.debug("context_pack_detail", pack=_pack_to_dict(pack))`

			`return pack`


			`def get_last_context_pack() -> ContextPack \| None:`
			`"""Return the last built context pack for debug inspection."""`
			`return _last_context_pack`


			`def _rank_chunks(`
			`candidates: list[ChunkResult],`
			`project_hint: str \| None,`
			`) -> list[tuple[float, ChunkResult]]:`
			`"""Rank candidates with boosting for project match."""`
			`scored = []`
			`seen_content: set[str] = set()`

			`for chunk in candidates:`
			`# Deduplicate by content prefix (first 200 chars)`
			`content_key = chunk.content[:200]`
			`if content_key in seen_content:`
			`continue`
			`seen_content.add(content_key)`

			`# Base score from similarity`
			`final_score = chunk.score`

			`# Project boost`
			`if project_hint:`
			`tags_str = chunk.tags.lower() if chunk.tags else ""`
			`source_str = chunk.source_file.lower()`
			`title_str = chunk.title.lower() if chunk.title else ""`
			`hint_lower = project_hint.lower()`

			`if hint_lower in tags_str or hint_lower in source_str or hint_lower in title_str:`
			`final_score += 0.3`

			`scored.append((final_score, chunk))`

			`# Sort by score descending`
			`scored.sort(key=lambda x: x[0], reverse=True)`
			`return scored`


			`def _select_within_budget(`
			`scored: list[tuple[float, ChunkResult]],`
			`budget: int,`
			`) -> list[ContextChunk]:`
			`"""Select top chunks that fit within the character budget."""`
			`selected = []`
			`used = 0`

			`for score, chunk in scored:`
			`chunk_len = len(chunk.content)`
			`if used + chunk_len > budget:`
			`continue`
			`selected.append(`
			`ContextChunk(`
			`content=chunk.content,`
			`source_file=_shorten_path(chunk.source_file),`
			`heading_path=chunk.heading_path,`
			`score=score,`
			`char_count=chunk_len,`
			`)`
			`)`
			`used += chunk_len`

			`return selected`


			`def _format_context_block(chunks: list[ContextChunk]) -> str:`
			`"""Format chunks into the context block string."""`
			`if not chunks:`
			`return "--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---"`

			`lines = ["--- AtoCore Context ---"]`
			`for chunk in chunks:`
			`lines.append(`
			`f"[Source: {chunk.source_file} \| Section: {chunk.heading_path} \| Score: {chunk.score:.2f}]"`
			`)`
			`lines.append(chunk.content)`
			`lines.append("")`
			`lines.append("--- End Context ---")`
			`return "\n".join(lines)`


			`def _shorten_path(path: str) -> str:`
			`"""Shorten an absolute path to a relative-like display."""`
			`p = Path(path)`
			`parts = p.parts`
			`# Show last 3 parts at most`
			`if len(parts) > 3:`
			`return str(Path(*parts[-3:]))`
			`return str(p)`


			`def _pack_to_dict(pack: ContextPack) -> dict:`
			`"""Convert a context pack to a JSON-serializable dict."""`
			`return {`
			`"query": pack.query,`
			`"project_hint": pack.project_hint,`
			`"chunks_used": len(pack.chunks_used),`
			`"total_chars": pack.total_chars,`
			`"budget": pack.budget,`
			`"budget_remaining": pack.budget_remaining,`
			`"duration_ms": pack.duration_ms,`
			`"chunks": [`
			`{`
			`"source_file": c.source_file,`
			`"heading_path": c.heading_path,`
			`"score": c.score,`
			`"char_count": c.char_count,`
			`"content_preview": c.content[:100],`
			`}`
			`for c in pack.chunks_used`
			`],`
			`}`