feat: dual-layer knowledge extraction + domain knowledge band

The extraction system now produces two kinds of candidates from
the same conversation:

A. PROJECT-SPECIFIC: applied facts scoped to a named project
   (unchanged behavior)
B. DOMAIN KNOWLEDGE: generalizable engineering insight earned
   through project work, tagged with a domain (physics, materials,
   optics, mechanics, manufacturing, metrology, controls, software,
   math, finance) and stored with project="" so it surfaces across
   all projects.

Critical quality bar enforced in the system prompt: "Would a
competent engineer need experience to know this, or could they
find it in 30 seconds on Google?" Textbook values, definitions,
and obvious facts are explicitly excluded. Only hard-won insight
qualifies — the kind that takes weeks of FEA or real machining
experience to discover.

Domain tags are embedded in the content as a prefix ("[physics]",
"[materials]") so they survive without a schema migration. A future
column can parse them out.

Context builder gains a new tier between project memories and
retrieved chunks:

  Tier 1: Trusted Project State     (project-specific)
  Tier 2: Identity / Preferences    (global)
  Tier 3: Project Memories          (project-specific)
  Tier 4: Domain Knowledge (NEW)    (cross-project, 10% budget)
  Tier 5: Retrieved Chunks          (project-boosted)

Trim order: chunks -> domain knowledge -> project memories ->
identity/preference -> project state.

Host-side extraction script updated with the same prompt and
domain-tag handling.

LLM_EXTRACTOR_VERSION bumped to llm-0.3.0.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-13 09:04:04 -04:00
parent db89978871
commit 9118f824fa
3 changed files with 142 additions and 38 deletions

View File

@@ -36,6 +36,12 @@ MEMORY_BUDGET_RATIO = 0.05 # identity + preference; lowered from 0.10 to avoid
# memory can actually reach the model.
PROJECT_MEMORY_BUDGET_RATIO = 0.25
PROJECT_MEMORY_TYPES = ["project", "knowledge", "episodic"]
# General domain knowledge — unscoped memories (project="") that surface
# in every context pack regardless of project hint. These are earned
# engineering insights that apply across projects (e.g., "Preston removal
# model breaks down below 5N because the contact assumption fails").
DOMAIN_KNOWLEDGE_BUDGET_RATIO = 0.10
DOMAIN_KNOWLEDGE_TYPES = ["knowledge"]
# Last built context pack for debug inspection
_last_context_pack: "ContextPack | None" = None
@@ -59,6 +65,8 @@ class ContextPack:
memory_chars: int = 0
project_memory_text: str = ""
project_memory_chars: int = 0
domain_knowledge_text: str = ""
domain_knowledge_chars: int = 0
total_chars: int = 0
budget: int = 0
budget_remaining: int = 0
@@ -139,8 +147,29 @@ def build_context(
query=user_prompt,
)
# 2c. Domain knowledge — cross-project earned insight with project=""
# that surfaces regardless of which project the query is about.
domain_knowledge_text = ""
domain_knowledge_chars = 0
domain_budget = min(
int(budget * DOMAIN_KNOWLEDGE_BUDGET_RATIO),
max(budget - project_state_chars - memory_chars - project_memory_chars, 0),
)
if domain_budget > 0:
domain_knowledge_text, domain_knowledge_chars = get_memories_for_context(
memory_types=DOMAIN_KNOWLEDGE_TYPES,
project="",
budget=domain_budget,
header="--- Domain Knowledge ---",
footer="--- End Domain Knowledge ---",
query=user_prompt,
)
# 3. Calculate remaining budget for retrieval
retrieval_budget = budget - project_state_chars - memory_chars - project_memory_chars
retrieval_budget = (
budget - project_state_chars - memory_chars
- project_memory_chars - domain_knowledge_chars
)
# 4. Retrieve candidates
candidates = (
@@ -161,13 +190,15 @@ def build_context(
# 7. Format full context
formatted = _format_full_context(
project_state_text, memory_text, project_memory_text, selected
project_state_text, memory_text, project_memory_text,
domain_knowledge_text, selected,
)
if len(formatted) > budget:
formatted, selected = _trim_context_to_budget(
project_state_text,
memory_text,
project_memory_text,
domain_knowledge_text,
selected,
budget,
)
@@ -178,6 +209,7 @@ def build_context(
project_state_chars = len(project_state_text)
memory_chars = len(memory_text)
project_memory_chars = len(project_memory_text)
domain_knowledge_chars = len(domain_knowledge_text)
retrieval_chars = sum(c.char_count for c in selected)
total_chars = len(formatted)
duration_ms = int((time.time() - start) * 1000)
@@ -190,6 +222,8 @@ def build_context(
memory_chars=memory_chars,
project_memory_text=project_memory_text,
project_memory_chars=project_memory_chars,
domain_knowledge_text=domain_knowledge_text,
domain_knowledge_chars=domain_knowledge_chars,
total_chars=total_chars,
budget=budget,
budget_remaining=budget - total_chars,
@@ -208,6 +242,7 @@ def build_context(
project_state_chars=project_state_chars,
memory_chars=memory_chars,
project_memory_chars=project_memory_chars,
domain_knowledge_chars=domain_knowledge_chars,
retrieval_chars=retrieval_chars,
total_chars=total_chars,
budget_remaining=budget - total_chars,
@@ -288,6 +323,7 @@ def _format_full_context(
project_state_text: str,
memory_text: str,
project_memory_text: str,
domain_knowledge_text: str,
chunks: list[ContextChunk],
) -> str:
"""Format project state + memories + retrieved chunks into full context block."""
@@ -308,7 +344,12 @@ def _format_full_context(
parts.append(project_memory_text)
parts.append("")
# 4. Retrieved chunks (lowest trust)
# 4. Domain knowledge (cross-project earned insight)
if domain_knowledge_text:
parts.append(domain_knowledge_text)
parts.append("")
# 5. Retrieved chunks (lowest trust)
if chunks:
parts.append("--- AtoCore Retrieved Context ---")
if project_state_text:
@@ -320,7 +361,7 @@ def _format_full_context(
parts.append(chunk.content)
parts.append("")
parts.append("--- End Context ---")
elif not project_state_text and not memory_text and not project_memory_text:
elif not project_state_text and not memory_text and not project_memory_text and not domain_knowledge_text:
parts.append("--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---")
return "\n".join(parts)
@@ -343,6 +384,7 @@ def _pack_to_dict(pack: ContextPack) -> dict:
"project_state_chars": pack.project_state_chars,
"memory_chars": pack.memory_chars,
"project_memory_chars": pack.project_memory_chars,
"domain_knowledge_chars": pack.domain_knowledge_chars,
"chunks_used": len(pack.chunks_used),
"total_chars": pack.total_chars,
"budget": pack.budget,
@@ -351,6 +393,7 @@ def _pack_to_dict(pack: ContextPack) -> dict:
"has_project_state": bool(pack.project_state_text),
"has_memories": bool(pack.memory_text),
"has_project_memories": bool(pack.project_memory_text),
"has_domain_knowledge": bool(pack.domain_knowledge_text),
"chunks": [
{
"source_file": c.source_file,
@@ -381,44 +424,56 @@ def _trim_context_to_budget(
project_state_text: str,
memory_text: str,
project_memory_text: str,
domain_knowledge_text: str,
chunks: list[ContextChunk],
budget: int,
) -> tuple[str, list[ContextChunk]]:
"""Trim retrieval project memories identity/preference project state."""
"""Trim retrieval -> domain knowledge -> project memories -> identity/preference -> project state."""
kept_chunks = list(chunks)
formatted = _format_full_context(
project_state_text, memory_text, project_memory_text, kept_chunks
project_state_text, memory_text, project_memory_text,
domain_knowledge_text, kept_chunks,
)
while len(formatted) > budget and kept_chunks:
kept_chunks.pop()
formatted = _format_full_context(
project_state_text, memory_text, project_memory_text, kept_chunks
project_state_text, memory_text, project_memory_text,
domain_knowledge_text, kept_chunks,
)
if len(formatted) <= budget:
return formatted, kept_chunks
# Drop project memories next (they were the most recently added
# tier and carry less trust than identity/preference).
# Drop domain knowledge first (lowest trust of the memory tiers).
domain_knowledge_text, _ = _truncate_text_block(domain_knowledge_text, 0)
formatted = _format_full_context(
project_state_text, memory_text, project_memory_text,
domain_knowledge_text, kept_chunks,
)
if len(formatted) <= budget:
return formatted, kept_chunks
project_memory_text, _ = _truncate_text_block(
project_memory_text,
max(budget - len(project_state_text) - len(memory_text), 0),
)
formatted = _format_full_context(
project_state_text, memory_text, project_memory_text, kept_chunks
project_state_text, memory_text, project_memory_text,
domain_knowledge_text, kept_chunks,
)
if len(formatted) <= budget:
return formatted, kept_chunks
memory_text, _ = _truncate_text_block(memory_text, max(budget - len(project_state_text), 0))
formatted = _format_full_context(
project_state_text, memory_text, project_memory_text, kept_chunks
project_state_text, memory_text, project_memory_text,
domain_knowledge_text, kept_chunks,
)
if len(formatted) <= budget:
return formatted, kept_chunks
project_state_text, _ = _truncate_text_block(project_state_text, budget)
formatted = _format_full_context(project_state_text, "", "", [])
formatted = _format_full_context(project_state_text, "", "", "", [])
if len(formatted) > budget:
formatted, _ = _truncate_text_block(formatted, budget)
return formatted, []