# ATOCore/src/atocore/context/builder.py

"""Context pack assembly: retrieve, rank, budget, format.

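Trust precedence (per Master Plan):
  1. Trusted Project State → always included first, highest authority
  2. Identity + Preference memories → included next
  3. Retrieved chunks → ranked, deduplicated, budget-constrained

Between tiers 2 and 3 the assembled pack also carries project-scoped
memories, domain knowledge, and engineering context, in that order.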
"""

import time
from dataclasses import dataclass, field
from pathlib import Path

import atocore.config as _config
from atocore.context.project_state import format_project_state, get_state
from atocore.memory.service import get_memories_for_context
from atocore.observability.logger import get_logger
from atocore.engineering.service import get_entities, get_entity_with_context
from atocore.projects.registry import resolve_project_name
from atocore.retrieval.retriever import ChunkResult, retrieve

# Module-level logger; build_context() emits its events through it.
log = get_logger("context_builder")

# Instruction preamble that build_context() places ahead of the formatted
# context block in every full prompt.
SYSTEM_PREFIX = (
    "You have access to the following personal context from the user's knowledge base.\n"
    "Use it to inform your answer. If the context is not relevant, ignore it.\n"
    "Do not mention the context system unless asked.\n"
    "When project state is provided, treat it as the most authoritative source."
)

# Budget allocation (per Master Plan section 9):
#   identity: 5%, preferences: 5%, project state: 20%, retrieval: 60%+
# NOTE(review): the ratios defined below no longer match that split.
# Identity + preference share a single 5% cap, and the per-tier caps add
# up to 70% of the budget, so retrieval nominally receives the remaining
# ~30% when every tier fills its cap. Confirm against the Master Plan.
PROJECT_STATE_BUDGET_RATIO = 0.20
MEMORY_BUDGET_RATIO = 0.05  # identity + preference; lowered from 0.10 to avoid squeezing project memories and chunks
# Project-scoped memories (project/knowledge/episodic) are the outlet
# for the Phase 9 reflection loop on the retrieval side. Budget sits
# between identity/preference and retrieved chunks so a reinforced
# memory can actually reach the model.
PROJECT_MEMORY_BUDGET_RATIO = 0.25
PROJECT_MEMORY_TYPES = ["project", "knowledge", "episodic"]
# General domain knowledge — unscoped memories (project="") that surface
# in every context pack regardless of project hint. These are earned
# engineering insights that apply across projects (e.g., "Preston removal
# model breaks down below 5N because the contact assumption fails").
DOMAIN_KNOWLEDGE_BUDGET_RATIO = 0.10
DOMAIN_KNOWLEDGE_TYPES = ["knowledge"]
# Cap for the engineering-context band produced by
# _build_engineering_context(); only requested when a project is in scope.
ENGINEERING_CONTEXT_BUDGET_RATIO = 0.10

# Last built context pack for debug inspection; overwritten by every
# build_context() call and read back through get_last_context_pack().
_last_context_pack: "ContextPack | None" = None


@dataclass
class ContextChunk:
    """A retrieved chunk that was selected for inclusion in a context pack."""

    content: str  # chunk text, emitted verbatim into the formatted context
    source_file: str  # display path; the selector stores the shortened form
    heading_path: str  # section label shown in the chunk's "[Source: ...]" header
    score: float  # ranking score, including any project boost
    char_count: int  # len(content) measured when the chunk was selected


@dataclass
class ContextPack:
    """Result of one build_context() call: tier texts, sizes, and the final prompt."""

    # Retrieved chunks that made it into the pack (after selection/trim).
    chunks_used: list[ContextChunk] = field(default_factory=list)
    # Trusted Project State tier and its length in characters.
    project_state_text: str = ""
    project_state_chars: int = 0
    # Identity + preference memories tier.
    memory_text: str = ""
    memory_chars: int = 0
    # Project-scoped memories tier (only filled when a project is in scope).
    project_memory_text: str = ""
    project_memory_chars: int = 0
    # Cross-project domain knowledge tier.
    domain_knowledge_text: str = ""
    domain_knowledge_chars: int = 0
    # Engineering entity/relationship band.
    engineering_context_text: str = ""
    engineering_context_chars: int = 0
    # len(formatted_context).
    total_chars: int = 0
    # Effective character budget used for this build.
    budget: int = 0
    # budget - total_chars.
    budget_remaining: int = 0
    # All tiers rendered into one block.
    formatted_context: str = ""
    # SYSTEM_PREFIX, formatted_context and the user prompt, separated by blank lines.
    full_prompt: str = ""
    # The user prompt the pack was built for.
    query: str = ""
    # Project hint exactly as the caller passed it ("" when none was given).
    project_hint: str = ""
    # Wall-clock build time in milliseconds.
    duration_ms: int = 0


def build_context(
    user_prompt: str,
    project_hint: str | None = None,
    budget: int | None = None,
) -> ContextPack:
    """Assemble the context pack (and full prompt) for ``user_prompt``.

    Tiers are filled in trust order, each one capped by its budget ratio
    and by whatever the earlier tiers left over:

      1. Trusted Project State (only when a project hint resolves)
      2. Identity + preference memories
      3. Project-scoped memories (only when a project is in scope)
      4. Unscoped domain knowledge
      5. Engineering context (only when a project is in scope)
      6. Retrieved chunks, which receive the remaining budget

    Args:
        user_prompt: The prompt to build context for; also used as the
            query for memory lookup and retrieval.
        project_hint: Optional project name or alias. It is canonicalized
            through the registry for state/memory lookups, while retrieval
            and ranking receive the hint as given.
        budget: Character budget. ``None`` uses the configured default;
            an explicit value is clamped to be non-negative.

    Returns:
        The assembled ``ContextPack``. The same object is also remembered
        as the module's last-built pack for debug inspection.
    """
    global _last_context_pack
    started_at = time.time()
    if budget is None:
        budget = _config.settings.context_budget
    else:
        budget = max(budget, 0)

    def _tier_cap(ratio: float, already_spent: int) -> int:
        # A tier may take its ratio of the budget, but never more than
        # what the tiers before it left unspent.
        return min(int(budget * ratio), max(budget - already_spent, 0))

    # Aliases (`p05`, `gigabit`) are resolved to the canonical project id
    # so trusted state and memories stored under that id are found. A
    # missing hint means no project is in scope at all.
    canonical_project = resolve_project_name(project_hint) if project_hint else ""

    # --- Tier 1: Trusted Project State -----------------------------------
    state_cap = min(
        budget,
        max(0, int(budget * PROJECT_STATE_BUDGET_RATIO)),
    )
    project_state_text, project_state_chars = "", 0
    state_entries = get_state(canonical_project) if canonical_project else None
    if state_entries:
        # A zero cap (tiny budgets) falls back to the whole budget.
        project_state_text, project_state_chars = _truncate_text_block(
            format_project_state(state_entries),
            state_cap or budget,
        )
    spent = project_state_chars

    # --- Tier 2: identity + preference memories ---------------------------
    memory_text, memory_chars = get_memories_for_context(
        memory_types=["identity", "preference"],
        budget=_tier_cap(MEMORY_BUDGET_RATIO, spent),
        query=user_prompt,
    )
    spent += memory_chars

    # --- Tier 2b: project-scoped memories ---------------------------------
    # Skipped without a canonical project so memories from other projects
    # cannot bleed into the pack.
    project_memory_text, project_memory_chars = "", 0
    if canonical_project:
        project_memory_text, project_memory_chars = get_memories_for_context(
            memory_types=PROJECT_MEMORY_TYPES,
            project=canonical_project,
            budget=_tier_cap(PROJECT_MEMORY_BUDGET_RATIO, spent),
            header="--- Project Memories ---",
            footer="--- End Project Memories ---",
            query=user_prompt,
        )
    spent += project_memory_chars

    # --- Tier 2c: unscoped domain knowledge (project="") ------------------
    domain_knowledge_text, domain_knowledge_chars = "", 0
    domain_budget = _tier_cap(DOMAIN_KNOWLEDGE_BUDGET_RATIO, spent)
    if domain_budget > 0:
        domain_knowledge_text, domain_knowledge_chars = get_memories_for_context(
            memory_types=DOMAIN_KNOWLEDGE_TYPES,
            project="",
            budget=domain_budget,
            header="--- Domain Knowledge ---",
            footer="--- End Domain Knowledge ---",
            query=user_prompt,
        )
    spent += domain_knowledge_chars

    # --- Tier 2d: engineering context -------------------------------------
    engineering_context_text = ""
    if canonical_project:
        eng_budget = _tier_cap(ENGINEERING_CONTEXT_BUDGET_RATIO, spent)
        if eng_budget > 0:
            engineering_context_text = _build_engineering_context(
                user_prompt, canonical_project, eng_budget,
            )
    spent += len(engineering_context_text)

    # --- Tier 3: retrieval gets whatever is left --------------------------
    retrieval_budget = budget - spent
    if retrieval_budget > 0:
        candidates = retrieve(
            user_prompt,
            top_k=_config.settings.context_top_k,
            project_hint=project_hint,
        )
    else:
        candidates = []
    ranked = _rank_chunks(candidates, project_hint)
    selected = _select_within_budget(ranked, max(retrieval_budget, 0))

    # --- Render, then trim if headers/separators pushed us over -----------
    tier_texts = (
        project_state_text,
        memory_text,
        project_memory_text,
        domain_knowledge_text,
        engineering_context_text,
    )
    formatted = _format_full_context(*tier_texts, selected)
    if len(formatted) > budget:
        formatted, selected = _trim_context_to_budget(*tier_texts, selected, budget)

    full_prompt = f"{SYSTEM_PREFIX}\n\n{formatted}\n\n{user_prompt}"

    # NOTE(review): the trim step hands back only the rendered text and the
    # surviving chunks, so the per-tier texts/counts stored below describe
    # the tiers as gathered, not necessarily what survived a trim.
    retrieval_chars = sum(c.char_count for c in selected)
    total_chars = len(formatted)
    duration_ms = int((time.time() - started_at) * 1000)

    pack = ContextPack(
        chunks_used=selected,
        project_state_text=project_state_text,
        project_state_chars=len(project_state_text),
        memory_text=memory_text,
        memory_chars=len(memory_text),
        project_memory_text=project_memory_text,
        project_memory_chars=len(project_memory_text),
        domain_knowledge_text=domain_knowledge_text,
        domain_knowledge_chars=len(domain_knowledge_text),
        engineering_context_text=engineering_context_text,
        engineering_context_chars=len(engineering_context_text),
        total_chars=total_chars,
        budget=budget,
        budget_remaining=budget - total_chars,
        formatted_context=formatted,
        full_prompt=full_prompt,
        query=user_prompt,
        project_hint=project_hint or "",
        duration_ms=duration_ms,
    )

    _last_context_pack = pack

    log.info(
        "context_built",
        chunks_used=len(selected),
        project_state_chars=pack.project_state_chars,
        memory_chars=pack.memory_chars,
        project_memory_chars=pack.project_memory_chars,
        domain_knowledge_chars=pack.domain_knowledge_chars,
        engineering_context_chars=pack.engineering_context_chars,
        retrieval_chars=retrieval_chars,
        total_chars=total_chars,
        budget_remaining=budget - total_chars,
        duration_ms=duration_ms,
    )
    log.debug("context_pack_detail", pack=_pack_to_dict(pack))

    return pack


def get_last_context_pack() -> ContextPack | None:
    """Expose the pack most recently produced by ``build_context``.

    Returns ``None`` when no pack has been built yet. Intended for debug
    inspection only.
    """
    latest = _last_context_pack
    return latest


def _rank_chunks(
    candidates: list[ChunkResult],
    project_hint: str | None,
) -> list[tuple[float, ChunkResult]]:
    """Deduplicate candidates and order them best-first.

    Two candidates count as duplicates when their first 200 characters of
    content are identical; only the first one seen is kept. Each kept
    candidate starts from its similarity score, multiplied by 1.3 when
    ``project_hint`` occurs (case-insensitively) in its tags, source file,
    or title.

    Returns:
        ``(final_score, chunk)`` pairs sorted by descending score.
    """
    needle = project_hint.lower() if project_hint else ""
    seen_prefixes: set[str] = set()
    ranked = []

    for candidate in candidates:
        prefix = candidate.content[:200]
        if prefix in seen_prefixes:
            continue
        seen_prefixes.add(prefix)

        weight = candidate.score
        if needle:
            # All haystacks are built up front; tags/title may be missing.
            haystacks = (
                candidate.tags.lower() if candidate.tags else "",
                candidate.source_file.lower(),
                candidate.title.lower() if candidate.title else "",
            )
            if any(needle in haystack for haystack in haystacks):
                weight *= 1.3

        ranked.append((weight, candidate))

    # Stable sort: equal scores keep their retrieval order.
    return sorted(ranked, key=lambda pair: pair[0], reverse=True)


def _select_within_budget(
    scored: list[tuple[float, ChunkResult]],
    budget: int,
) -> list[ContextChunk]:
    """Greedily pick ranked chunks whose content fits the character budget.

    Chunks are visited in the given (best-first) order. A chunk that would
    overflow the remaining budget is skipped rather than ending the scan,
    so a smaller, lower-ranked chunk can still be taken. Only the raw
    content length is counted here, not the headers added when rendering.
    """
    picked: list[ContextChunk] = []
    remaining = budget

    for rank_score, candidate in scored:
        size = len(candidate.content)
        if size > remaining:
            continue
        remaining -= size
        picked.append(
            ContextChunk(
                content=candidate.content,
                source_file=_shorten_path(candidate.source_file),
                heading_path=candidate.heading_path,
                score=rank_score,
                char_count=size,
            )
        )

    return picked


def _format_full_context(
    project_state_text: str,
    memory_text: str,
    project_memory_text: str,
    domain_knowledge_text: str,
    engineering_context_text: str = "",
    chunks: list[ContextChunk] | None = None,
) -> str:
    """Render every tier into one newline-joined context block.

    Non-empty tiers are emitted in trust order (project state, identity and
    preference memories, project memories, domain knowledge, engineering
    context), each followed by a blank line. Retrieved chunks come last
    inside their own delimited section. When nothing at all is available a
    short "no relevant context" placeholder is returned instead.
    """
    trusted_tiers = (
        project_state_text,
        memory_text,
        project_memory_text,
        domain_knowledge_text,
        engineering_context_text,
    )

    sections: list[str] = []
    for tier_text in trusted_tiers:
        if tier_text:
            sections.extend((tier_text, ""))

    if chunks:
        # Retrieved chunks are the lowest-trust tier.
        sections.append("--- AtoCore Retrieved Context ---")
        if project_state_text:
            sections.append("If retrieved context conflicts with Trusted Project State above, trust the Trusted Project State.")
        for item in chunks:
            sections.extend(
                (
                    f"[Source: {item.source_file} | Section: {item.heading_path} | Score: {item.score:.2f}]",
                    item.content,
                    "",
                )
            )
        sections.append("--- End Context ---")
    elif not any(trusted_tiers):
        sections.append("--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---")

    return "\n".join(sections)


def _shorten_path(path: str) -> str:
    """Reduce a path to at most its last three components for display.

    Paths with three or fewer components are returned as-is (after
    ``Path`` normalization).
    """
    full = Path(path)
    if len(full.parts) <= 3:
        return str(full)
    tail = full.parts[-3:]
    return str(Path(*tail))


def _pack_to_dict(pack: ContextPack) -> dict:
    """Summarize a context pack as a plain dict for debug logging.

    The summary carries sizes, flags and per-chunk metadata rather than
    the full tier texts; each chunk contributes only a 100-character
    content preview.

    Args:
        pack: The pack to summarize.

    Returns:
        A dict of scalars plus a ``"chunks"`` list with one dict per chunk.
    """
    return {
        "query": pack.query,
        "project_hint": pack.project_hint,
        "project_state_chars": pack.project_state_chars,
        "memory_chars": pack.memory_chars,
        "project_memory_chars": pack.project_memory_chars,
        "domain_knowledge_chars": pack.domain_knowledge_chars,
        # Previously missing: every other tier reported its size here.
        "engineering_context_chars": pack.engineering_context_chars,
        "chunks_used": len(pack.chunks_used),
        "total_chars": pack.total_chars,
        "budget": pack.budget,
        "budget_remaining": pack.budget_remaining,
        "duration_ms": pack.duration_ms,
        "has_project_state": bool(pack.project_state_text),
        "has_memories": bool(pack.memory_text),
        "has_project_memories": bool(pack.project_memory_text),
        "has_domain_knowledge": bool(pack.domain_knowledge_text),
        "has_engineering_context": bool(pack.engineering_context_text),
        "chunks": [
            {
                "source_file": c.source_file,
                "heading_path": c.heading_path,
                "score": c.score,
                "char_count": c.char_count,
                "content_preview": c.content[:100],
            }
            for c in pack.chunks_used
        ],
    }


def _build_engineering_context(
    query: str,
    project: str,
    budget: int,
) -> str:
    """Find the entity that best matches the query and format its context.

    Matching is simple token overlap between the query and each entity's
    name and description. The best match is rendered as a compact text
    band: the entity itself, up to eight of its relationships, and a
    one-line gaps summary for the project when any gap count is non-zero.

    This is a best-effort tier. Any failure while querying entities
    returns ``""``, and a failure while computing gaps just omits the
    summary line. Failures are logged instead of being silently dropped.

    Args:
        query: User prompt to match against entity names/descriptions.
        project: Canonical project whose entities are searched.
        budget: Maximum characters for the band; below 100 nothing is built.

    Returns:
        The formatted band (truncated with ``...`` when over budget), or
        ``""`` when there is no usable match.
    """
    if budget < 100:
        return ""

    from atocore.memory.reinforcement import _normalize, _tokenize

    query_tokens = _tokenize(_normalize(query))
    if not query_tokens:
        return ""

    try:
        entities = get_entities(project=project, limit=100)
    except Exception as exc:
        log.warning(
            "engineering_context_entities_failed",
            project=project,
            error=str(exc),
        )
        return ""

    if not entities:
        return ""

    scored: list[tuple[int, "Entity"]] = []
    for ent in entities:
        name_tokens = _tokenize(_normalize(ent.name))
        # The description is treated as optional further down, so guard
        # against a missing value before normalizing it.
        desc_tokens = _tokenize(_normalize(ent.description or ""))
        overlap = len(query_tokens & (name_tokens | desc_tokens))
        if overlap > 0:
            scored.append((overlap, ent))

    if not scored:
        return ""

    # max() returns the first maximal item, so ties resolve to the entity
    # listed first, just as a stable descending sort would.
    best_entity = max(scored, key=lambda pair: pair[0])[1]

    try:
        ctx = get_entity_with_context(best_entity.id)
    except Exception as exc:
        log.warning(
            "engineering_context_lookup_failed",
            project=project,
            entity_id=best_entity.id,
            error=str(exc),
        )
        return ""

    if ctx is None:
        return ""

    lines = ["--- Engineering Context ---"]
    lines.append(f"[{best_entity.entity_type}] {best_entity.name}")
    if best_entity.description:
        lines.append(f"  {best_entity.description[:150]}")

    for rel in ctx["relationships"][:8]:
        outgoing = rel.source_entity_id == best_entity.id
        other_id = rel.target_entity_id if outgoing else rel.source_entity_id
        other = ctx["related_entities"].get(other_id)
        if other:
            direction = "->" if outgoing else "<-"
            lines.append(
                f"  {direction} {rel.relationship_type} [{other.entity_type}] {other.name}"
            )

    # Phase 5H: append a compact gaps summary so the LLM sees "what we're
    # currently missing" (orphan requirements, risky decisions,
    # unsupported claims) alongside the entity neighborhood.
    try:
        from atocore.engineering.queries import all_gaps as _all_gaps
        gaps = _all_gaps(project)
        orphan_n = gaps["orphan_requirements"]["count"]
        risky_n = gaps["risky_decisions"]["count"]
        unsup_n = gaps["unsupported_claims"]["count"]
        if orphan_n or risky_n or unsup_n:
            lines.append("")
            lines.append(f"Gaps: {orphan_n} orphan reqs, {risky_n} risky decisions, {unsup_n} unsupported claims")
    except Exception as exc:
        # Still best-effort: the band is useful without the gaps line.
        log.warning(
            "engineering_context_gaps_failed",
            project=project,
            error=str(exc),
        )

    lines.append("--- End Engineering Context ---")
    text = "\n".join(lines)

    # Same "cut, strip, add ellipsis" rule as the other tiers.
    text, _ = _truncate_text_block(text, budget)
    return text


def _truncate_text_block(text: str, budget: int) -> tuple[str, int]:
    """Cut ``text`` down to at most ``budget`` characters.

    Returns the (possibly shortened) text together with its length. Text
    that has to be cut ends in ``...`` whenever the budget leaves room for
    the ellipsis (more than three characters); trailing whitespace before
    the ellipsis is stripped. An empty text or non-positive budget yields
    ``("", 0)``.
    """
    if not text or budget <= 0:
        return "", 0

    if len(text) > budget:
        if budget > 3:
            head = text[: budget - 3].rstrip()
            text = head + "..."
        else:
            # No room for an ellipsis: hard cut.
            text = text[:budget]

    return text, len(text)


def _trim_context_to_budget(
    project_state_text: str,
    memory_text: str,
    project_memory_text: str,
    domain_knowledge_text: str,
    engineering_context_text: str,
    chunks: list[ContextChunk],
    budget: int,
) -> tuple[str, list[ContextChunk]]:
    """Shrink an over-budget pack, giving up the least-trusted tiers first.

    Trim order: retrieval -> engineering -> domain -> project memories ->
    identity -> state. After every step the pack is re-rendered and
    returned as soon as it fits.

    Args:
        project_state_text: Trusted Project State tier.
        memory_text: Identity + preference memories tier.
        project_memory_text: Project-scoped memories tier.
        domain_knowledge_text: Cross-project domain knowledge tier.
        engineering_context_text: Engineering entity band.
        chunks: Selected retrieved chunks, best first.
        budget: Maximum length of the rendered context in characters.

    Returns:
        ``(formatted, kept_chunks)``: the rendered context, no longer than
        ``budget``, and the chunks that remain in it.
    """
    # Step 1: drop retrieved chunks from the tail (lowest ranked first).
    kept_chunks = list(chunks)
    formatted = _format_full_context(
        project_state_text, memory_text, project_memory_text,
        domain_knowledge_text, engineering_context_text, kept_chunks,
    )
    while len(formatted) > budget and kept_chunks:
        kept_chunks.pop()
        formatted = _format_full_context(
            project_state_text, memory_text, project_memory_text,
            domain_knowledge_text, engineering_context_text, kept_chunks,
        )

    if len(formatted) <= budget:
        return formatted, kept_chunks

    # From here on kept_chunks is empty: the loop above only stops early
    # when the pack fits, and that case has just returned.

    # Step 2: drop engineering context entirely.
    engineering_context_text = ""
    formatted = _format_full_context(
        project_state_text, memory_text, project_memory_text,
        domain_knowledge_text, engineering_context_text, kept_chunks,
    )
    if len(formatted) <= budget:
        return formatted, kept_chunks

    # Step 3: drop domain knowledge entirely.
    domain_knowledge_text = ""
    formatted = _format_full_context(
        project_state_text, memory_text, project_memory_text,
        domain_knowledge_text, engineering_context_text, kept_chunks,
    )
    if len(formatted) <= budget:
        return formatted, kept_chunks

    # Step 4: truncate project memories to what state + identity leave.
    # NOTE(review): the allowance ignores the separator lines the renderer
    # adds, so this step and the next can still end up over budget and
    # fall through to the final state-only fallback.
    project_memory_text, _ = _truncate_text_block(
        project_memory_text,
        max(budget - len(project_state_text) - len(memory_text), 0),
    )
    formatted = _format_full_context(
        project_state_text, memory_text, project_memory_text,
        domain_knowledge_text, engineering_context_text, kept_chunks,
    )
    if len(formatted) <= budget:
        return formatted, kept_chunks

    # Step 5: truncate identity/preference memories to what state leaves.
    memory_text, _ = _truncate_text_block(memory_text, max(budget - len(project_state_text), 0))
    formatted = _format_full_context(
        project_state_text, memory_text, project_memory_text,
        domain_knowledge_text, engineering_context_text, kept_chunks,
    )
    if len(formatted) <= budget:
        return formatted, kept_chunks

    # Step 6: last resort, keep only (truncated) project state. Each
    # argument now sits in its own slot; the previous call passed the empty
    # chunk list in the engineering-context position.
    project_state_text, _ = _truncate_text_block(project_state_text, budget)
    formatted = _format_full_context(project_state_text, "", "", "", "", [])
    if len(formatted) > budget:
        # The renderer's trailing separator can still tip it over.
        formatted, _ = _truncate_text_block(formatted, budget)
    return formatted, []