feat: dual-layer knowledge extraction + domain knowledge band
The extraction system now produces two kinds of candidates from
the same conversation:
A. PROJECT-SPECIFIC: applied facts scoped to a named project
(unchanged behavior)
B. DOMAIN KNOWLEDGE: generalizable engineering insight earned
through project work, tagged with a domain (physics, materials,
optics, mechanics, manufacturing, metrology, controls, software,
math, finance) and stored with project="" so it surfaces across
all projects.
Critical quality bar enforced in the system prompt: "Would a
competent engineer need experience to know this, or could they
find it in 30 seconds on Google?" Textbook values, definitions,
and obvious facts are explicitly excluded. Only hard-won insight
qualifies — the kind that takes weeks of FEA or real machining
experience to discover.
Domain tags are embedded in the content as a prefix ("[physics]",
"[materials]") so they survive without a schema migration. A future
migration can parse them out into a dedicated column.
Context builder gains a new tier between project memories and
retrieved chunks:
Tier 1: Trusted Project State (project-specific)
Tier 2: Identity / Preferences (global)
Tier 3: Project Memories (project-specific)
Tier 4: Domain Knowledge (NEW) (cross-project, 10% budget)
Tier 5: Retrieved Chunks (project-boosted)
Trim order: chunks -> domain knowledge -> project memories ->
identity/preference -> project state.
Host-side extraction script updated with the same prompt and
domain-tag handling.
LLM_EXTRACTOR_VERSION bumped to llm-0.3.0.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -36,6 +36,12 @@ MEMORY_BUDGET_RATIO = 0.05 # identity + preference; lowered from 0.10 to avoid
|
||||
# memory can actually reach the model.
|
||||
PROJECT_MEMORY_BUDGET_RATIO = 0.25
|
||||
PROJECT_MEMORY_TYPES = ["project", "knowledge", "episodic"]
|
||||
# General domain knowledge — unscoped memories (project="") that surface
|
||||
# in every context pack regardless of project hint. These are earned
|
||||
# engineering insights that apply across projects (e.g., "Preston removal
|
||||
# model breaks down below 5N because the contact assumption fails").
|
||||
DOMAIN_KNOWLEDGE_BUDGET_RATIO = 0.10
|
||||
DOMAIN_KNOWLEDGE_TYPES = ["knowledge"]
|
||||
|
||||
# Last built context pack for debug inspection
|
||||
_last_context_pack: "ContextPack | None" = None
|
||||
@@ -59,6 +65,8 @@ class ContextPack:
|
||||
memory_chars: int = 0
|
||||
project_memory_text: str = ""
|
||||
project_memory_chars: int = 0
|
||||
domain_knowledge_text: str = ""
|
||||
domain_knowledge_chars: int = 0
|
||||
total_chars: int = 0
|
||||
budget: int = 0
|
||||
budget_remaining: int = 0
|
||||
@@ -139,8 +147,29 @@ def build_context(
|
||||
query=user_prompt,
|
||||
)
|
||||
|
||||
# 2c. Domain knowledge — cross-project earned insight with project=""
|
||||
# that surfaces regardless of which project the query is about.
|
||||
domain_knowledge_text = ""
|
||||
domain_knowledge_chars = 0
|
||||
domain_budget = min(
|
||||
int(budget * DOMAIN_KNOWLEDGE_BUDGET_RATIO),
|
||||
max(budget - project_state_chars - memory_chars - project_memory_chars, 0),
|
||||
)
|
||||
if domain_budget > 0:
|
||||
domain_knowledge_text, domain_knowledge_chars = get_memories_for_context(
|
||||
memory_types=DOMAIN_KNOWLEDGE_TYPES,
|
||||
project="",
|
||||
budget=domain_budget,
|
||||
header="--- Domain Knowledge ---",
|
||||
footer="--- End Domain Knowledge ---",
|
||||
query=user_prompt,
|
||||
)
|
||||
|
||||
# 3. Calculate remaining budget for retrieval
|
||||
retrieval_budget = budget - project_state_chars - memory_chars - project_memory_chars
|
||||
retrieval_budget = (
|
||||
budget - project_state_chars - memory_chars
|
||||
- project_memory_chars - domain_knowledge_chars
|
||||
)
|
||||
|
||||
# 4. Retrieve candidates
|
||||
candidates = (
|
||||
@@ -161,13 +190,15 @@ def build_context(
|
||||
|
||||
# 7. Format full context
|
||||
formatted = _format_full_context(
|
||||
project_state_text, memory_text, project_memory_text, selected
|
||||
project_state_text, memory_text, project_memory_text,
|
||||
domain_knowledge_text, selected,
|
||||
)
|
||||
if len(formatted) > budget:
|
||||
formatted, selected = _trim_context_to_budget(
|
||||
project_state_text,
|
||||
memory_text,
|
||||
project_memory_text,
|
||||
domain_knowledge_text,
|
||||
selected,
|
||||
budget,
|
||||
)
|
||||
@@ -178,6 +209,7 @@ def build_context(
|
||||
project_state_chars = len(project_state_text)
|
||||
memory_chars = len(memory_text)
|
||||
project_memory_chars = len(project_memory_text)
|
||||
domain_knowledge_chars = len(domain_knowledge_text)
|
||||
retrieval_chars = sum(c.char_count for c in selected)
|
||||
total_chars = len(formatted)
|
||||
duration_ms = int((time.time() - start) * 1000)
|
||||
@@ -190,6 +222,8 @@ def build_context(
|
||||
memory_chars=memory_chars,
|
||||
project_memory_text=project_memory_text,
|
||||
project_memory_chars=project_memory_chars,
|
||||
domain_knowledge_text=domain_knowledge_text,
|
||||
domain_knowledge_chars=domain_knowledge_chars,
|
||||
total_chars=total_chars,
|
||||
budget=budget,
|
||||
budget_remaining=budget - total_chars,
|
||||
@@ -208,6 +242,7 @@ def build_context(
|
||||
project_state_chars=project_state_chars,
|
||||
memory_chars=memory_chars,
|
||||
project_memory_chars=project_memory_chars,
|
||||
domain_knowledge_chars=domain_knowledge_chars,
|
||||
retrieval_chars=retrieval_chars,
|
||||
total_chars=total_chars,
|
||||
budget_remaining=budget - total_chars,
|
||||
@@ -288,6 +323,7 @@ def _format_full_context(
|
||||
project_state_text: str,
|
||||
memory_text: str,
|
||||
project_memory_text: str,
|
||||
domain_knowledge_text: str,
|
||||
chunks: list[ContextChunk],
|
||||
) -> str:
|
||||
"""Format project state + memories + retrieved chunks into full context block."""
|
||||
@@ -308,7 +344,12 @@ def _format_full_context(
|
||||
parts.append(project_memory_text)
|
||||
parts.append("")
|
||||
|
||||
# 4. Retrieved chunks (lowest trust)
|
||||
# 4. Domain knowledge (cross-project earned insight)
|
||||
if domain_knowledge_text:
|
||||
parts.append(domain_knowledge_text)
|
||||
parts.append("")
|
||||
|
||||
# 5. Retrieved chunks (lowest trust)
|
||||
if chunks:
|
||||
parts.append("--- AtoCore Retrieved Context ---")
|
||||
if project_state_text:
|
||||
@@ -320,7 +361,7 @@ def _format_full_context(
|
||||
parts.append(chunk.content)
|
||||
parts.append("")
|
||||
parts.append("--- End Context ---")
|
||||
elif not project_state_text and not memory_text and not project_memory_text:
|
||||
elif not project_state_text and not memory_text and not project_memory_text and not domain_knowledge_text:
|
||||
parts.append("--- AtoCore Context ---\nNo relevant context found.\n--- End Context ---")
|
||||
|
||||
return "\n".join(parts)
|
||||
@@ -343,6 +384,7 @@ def _pack_to_dict(pack: ContextPack) -> dict:
|
||||
"project_state_chars": pack.project_state_chars,
|
||||
"memory_chars": pack.memory_chars,
|
||||
"project_memory_chars": pack.project_memory_chars,
|
||||
"domain_knowledge_chars": pack.domain_knowledge_chars,
|
||||
"chunks_used": len(pack.chunks_used),
|
||||
"total_chars": pack.total_chars,
|
||||
"budget": pack.budget,
|
||||
@@ -351,6 +393,7 @@ def _pack_to_dict(pack: ContextPack) -> dict:
|
||||
"has_project_state": bool(pack.project_state_text),
|
||||
"has_memories": bool(pack.memory_text),
|
||||
"has_project_memories": bool(pack.project_memory_text),
|
||||
"has_domain_knowledge": bool(pack.domain_knowledge_text),
|
||||
"chunks": [
|
||||
{
|
||||
"source_file": c.source_file,
|
||||
@@ -381,44 +424,56 @@ def _trim_context_to_budget(
|
||||
project_state_text: str,
|
||||
memory_text: str,
|
||||
project_memory_text: str,
|
||||
domain_knowledge_text: str,
|
||||
chunks: list[ContextChunk],
|
||||
budget: int,
|
||||
) -> tuple[str, list[ContextChunk]]:
|
||||
"""Trim retrieval → project memories → identity/preference → project state."""
|
||||
"""Trim retrieval -> domain knowledge -> project memories -> identity/preference -> project state."""
|
||||
kept_chunks = list(chunks)
|
||||
formatted = _format_full_context(
|
||||
project_state_text, memory_text, project_memory_text, kept_chunks
|
||||
project_state_text, memory_text, project_memory_text,
|
||||
domain_knowledge_text, kept_chunks,
|
||||
)
|
||||
while len(formatted) > budget and kept_chunks:
|
||||
kept_chunks.pop()
|
||||
formatted = _format_full_context(
|
||||
project_state_text, memory_text, project_memory_text, kept_chunks
|
||||
project_state_text, memory_text, project_memory_text,
|
||||
domain_knowledge_text, kept_chunks,
|
||||
)
|
||||
|
||||
if len(formatted) <= budget:
|
||||
return formatted, kept_chunks
|
||||
|
||||
# Drop project memories next (they were the most recently added
|
||||
# tier and carry less trust than identity/preference).
|
||||
# Drop domain knowledge next, once all chunks are gone (lowest trust of the memory tiers).
|
||||
domain_knowledge_text, _ = _truncate_text_block(domain_knowledge_text, 0)
|
||||
formatted = _format_full_context(
|
||||
project_state_text, memory_text, project_memory_text,
|
||||
domain_knowledge_text, kept_chunks,
|
||||
)
|
||||
if len(formatted) <= budget:
|
||||
return formatted, kept_chunks
|
||||
|
||||
project_memory_text, _ = _truncate_text_block(
|
||||
project_memory_text,
|
||||
max(budget - len(project_state_text) - len(memory_text), 0),
|
||||
)
|
||||
formatted = _format_full_context(
|
||||
project_state_text, memory_text, project_memory_text, kept_chunks
|
||||
project_state_text, memory_text, project_memory_text,
|
||||
domain_knowledge_text, kept_chunks,
|
||||
)
|
||||
if len(formatted) <= budget:
|
||||
return formatted, kept_chunks
|
||||
|
||||
memory_text, _ = _truncate_text_block(memory_text, max(budget - len(project_state_text), 0))
|
||||
formatted = _format_full_context(
|
||||
project_state_text, memory_text, project_memory_text, kept_chunks
|
||||
project_state_text, memory_text, project_memory_text,
|
||||
domain_knowledge_text, kept_chunks,
|
||||
)
|
||||
if len(formatted) <= budget:
|
||||
return formatted, kept_chunks
|
||||
|
||||
project_state_text, _ = _truncate_text_block(project_state_text, budget)
|
||||
formatted = _format_full_context(project_state_text, "", "", [])
|
||||
formatted = _format_full_context(project_state_text, "", "", "", [])
|
||||
if len(formatted) > budget:
|
||||
formatted, _ = _truncate_text_block(formatted, budget)
|
||||
return formatted, []
|
||||
|
||||
Reference in New Issue
Block a user