fix(extraction): R11 container 503 + R12 shared prompt module

R11: POST /admin/extract-batch with mode=llm now returns 503 when the
claude CLI is unavailable (was silently returning success with 0
candidates), with a message pointing at the host-side script. +2 tests.

R12: extracted SYSTEM_PROMPT + parse_llm_json_array +
normalize_candidate_item + build_user_message into stdlib-only
src/atocore/memory/_llm_prompt.py. Both the container extractor and
scripts/batch_llm_extract_live.py now import from it, eliminating the
prompt/parser drift risk.

Tests 297 -> 299.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-15 10:47:01 -04:00
parent dc9fdd3a38
commit c2e7064238
6 changed files with 310 additions and 302 deletions

View File

@@ -55,6 +55,7 @@ from atocore.memory.extractor import (
)
from atocore.memory.extractor_llm import (
LLM_EXTRACTOR_VERSION,
_cli_available as _llm_cli_available,
extract_candidates_llm,
)
from atocore.memory.reinforcement import reinforce_from_interaction
@@ -832,6 +833,18 @@ def api_extract_batch(req: ExtractBatchRequest | None = None) -> dict:
invoke this endpoint explicitly (cron, manual curl, CLI).
"""
payload = req or ExtractBatchRequest()
if payload.mode == "llm" and not _llm_cli_available():
raise HTTPException(
status_code=503,
detail=(
"LLM extraction unavailable in this runtime: the `claude` CLI "
"is not on PATH. Run host-side via "
"`scripts/batch_llm_extract_live.py` instead, or call this "
"endpoint with mode=\"rule\"."
),
)
since = payload.since
if not since:

View File

@@ -0,0 +1,183 @@
"""Shared LLM-extractor prompt + parser (stdlib-only).
R12: single source of truth for the system prompt, memory type set,
size limits, and raw JSON parsing used by both paths that shell out
to ``claude -p``:
- ``atocore.memory.extractor_llm`` (in-container extractor, wraps the
parsed dicts in ``MemoryCandidate`` with registry-checked project
attribution)
- ``scripts/batch_llm_extract_live.py`` (host-side extractor, can't
import the full atocore package because Dalidou's host Python lacks
the container's deps; imports this module via ``sys.path``)
This module MUST stay stdlib-only. No ``atocore`` imports, no third-
party packages. Callers apply their own project-attribution policy on
top of the normalized dicts this module emits.
"""
from __future__ import annotations
import json
import math
from typing import Any
# Version tag for this prompt/parser pair; the in-container extractor
# stamps it on each candidate as ``extractor_version``.
LLM_EXTRACTOR_VERSION = "llm-0.4.0"
# Character caps applied by build_user_message() when excerpting the
# assistant response and the user prompt, respectively.
MAX_RESPONSE_CHARS = 8000
MAX_PROMPT_CHARS = 2000
# Candidate types accepted by normalize_candidate_item(); an item whose
# "type" is not in this set is dropped.
# NOTE(review): "identity" is accepted here but is not among the types
# listed in SYSTEM_PROMPT — confirm whether that is intentional.
MEMORY_TYPES = {"identity", "preference", "project", "episodic", "knowledge", "adaptation"}
SYSTEM_PROMPT = """You extract memory candidates from LLM conversation turns for a personal context engine called AtoCore.
AtoCore is the brain for Atomaste's engineering work. Known projects:
p04-gigabit, p05-interferometer, p06-polisher, atomizer-v2, atocore,
abb-space. Unknown project names — still tag them, the system auto-detects.
Your job is to emit SIGNALS that matter for future context. Be aggressive:
err on the side of capturing useful signal. Triage filters noise downstream.
WHAT TO EMIT (in order of importance):
1. PROJECT ACTIVITY — any mention of a project with context worth remembering:
- "Schott quote received for ABB-Space" (event + project)
- "Cédric asked about p06 firmware timing" (stakeholder event)
- "Still waiting on Zygo lead-time from Nabeel" (blocker status)
- "p05 vendor decision needs to happen this week" (action item)
2. DECISIONS AND CHOICES — anything that commits to a direction:
- "Going with Zygo Verifire SV for p05" (decision)
- "Dropping stitching from primary workflow" (design choice)
- "USB SSD mandatory, not SD card" (architectural commitment)
3. DURABLE ENGINEERING INSIGHT — earned knowledge that generalizes:
- "CTE gradient dominates WFE at F/1.2" (materials insight)
- "Preston model breaks below 5N because contact assumption fails"
- "m=1 coma NOT correctable by force modulation" (controls insight)
Test: would a competent engineer NEED experience to know this?
If it's textbook/google-findable, skip it.
4. STAKEHOLDER AND VENDOR EVENTS:
- "Email sent to Nabeel 2026-04-13 asking for lead time"
- "Meeting with Jason on Table 7 next Tuesday"
- "Starspec wants updated CAD by Friday"
5. PREFERENCES AND ADAPTATIONS that shape how Antoine works:
- "Antoine prefers OAuth over API keys"
- "Extraction stays off the capture hot path"
WHAT TO SKIP:
- Pure conversational filler ("ok thanks", "let me check")
- Instructional help content ("run this command", "here's how to...")
- Obvious textbook facts anyone can google in 30 seconds
- Session meta-chatter ("let me commit this", "deploy running")
- Transient system state snapshots ("36 active memories right now")
CANDIDATE TYPES — choose the best fit:
- project — a fact, decision, or event specific to one named project
- knowledge — durable engineering insight (use domain, not project)
- preference — how Antoine works / wants things done
- adaptation — a standing rule or adjustment to behavior
- episodic — a stakeholder event or milestone worth remembering
DOMAINS for knowledge candidates (required when type=knowledge and project is empty):
physics, materials, optics, mechanics, manufacturing, metrology,
controls, software, math, finance, business
TRUST HIERARCHY:
- project-specific: set project to the project id, leave domain empty
- domain knowledge: set domain, leave project empty
- events/activity: use project, type=project or episodic
- one conversation can produce MULTIPLE candidates — emit them all
OUTPUT RULES:
- Each candidate content under 250 characters, stands alone
- Default confidence 0.5. Raise to 0.7 only for ratified/committed claims.
- Raw JSON array, no prose, no markdown fences
- Empty array [] is fine when the conversation has no durable signal
Each element:
{"type": "project|knowledge|preference|adaptation|episodic", "content": "...", "project": "...", "domain": "", "confidence": 0.5}"""
def build_user_message(prompt: str, response: str, project_hint: str) -> str:
    """Assemble the user-turn message sent alongside ``SYSTEM_PROMPT``.

    Args:
        prompt: The user's prompt; falsy values are treated as empty and
            the text is cut to ``MAX_PROMPT_CHARS`` characters.
        response: The assistant's response; falsy values are treated as
            empty and the text is cut to ``MAX_RESPONSE_CHARS`` characters.
        project_hint: Project the interaction is scoped to, or empty.

    Returns:
        A single string with the project hint, the prompt excerpt, the
        response excerpt, and a closing instruction to return the array.
    """
    hint = project_hint or ""
    user_text = prompt[:MAX_PROMPT_CHARS] if prompt else ""
    assistant_text = response[:MAX_RESPONSE_CHARS] if response else ""
    sections = [
        f"PROJECT HINT (may be empty): {hint}\n\n",
        f"USER PROMPT:\n{user_text}\n\n",
        f"ASSISTANT RESPONSE:\n{assistant_text}\n\n",
        "Return the JSON array now.",
    ]
    return "".join(sections)
def parse_llm_json_array(raw_output: str) -> list[dict[str, Any]]:
    """Extract the JSON array of candidate dicts from raw model output.

    Tolerates the usual model glitches: surrounding whitespace, a
    markdown code fence (with or without a language tag), and prose
    before or after the array. Decoding starts at the first ``[`` and
    stops at the end of that JSON value, so trailing text (for example a
    closing fence followed by a remark) no longer discards a valid array.

    Args:
        raw_output: Raw text returned by the model. ``None`` or an empty
            string is treated as "no output".

    Returns:
        The ``dict`` elements of the decoded array, in order; non-dict
        elements are dropped. An empty list is returned when no array is
        present or it cannot be decoded — callers decide whether to log.
    """
    text = (raw_output or "").strip()
    if text.startswith("```"):
        # Remove the fence backticks from both ends, then the remainder
        # of the opening fence line (typically a tag such as "json").
        text = text.strip("`")
        _tag, newline, body = text.partition("\n")
        if newline:
            text = body
        text = text.strip()
    if not text or text == "[]":
        return []

    start = text.find("[")
    if start < 0:
        # No "[" means no array; any other JSON value is not a
        # candidate list.
        return []
    try:
        # raw_decode parses exactly one JSON value beginning at ``start``
        # and ignores whatever follows it. Because the value begins with
        # "[", a successful decode is always a list.
        parsed, _end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        return []
    return [item for item in parsed if isinstance(item, dict)]
def normalize_candidate_item(item: dict[str, Any]) -> dict[str, Any] | None:
    """Validate and normalize one raw model item into a candidate dict.

    Does NOT apply project-attribution policy — that is the caller's
    job, since the registry check differs between the in-container path
    and the host path.

    Args:
        item: One dict element of the array the model returned.

    Returns:
        ``None`` when the item's type is not in ``MEMORY_TYPES`` or its
        content is empty. Otherwise a dict with keys ``type``,
        ``content`` (at most 1000 characters), ``project`` (the raw model
        value), ``domain`` and ``confidence`` (a float within
        ``[0.0, 1.0]``).
    """
    mem_type = str(item.get("type") or "").strip().lower()
    content = str(item.get("content") or "").strip()
    if mem_type not in MEMORY_TYPES or not content:
        return None
    model_project = str(item.get("project") or "").strip()
    domain = str(item.get("domain") or "").strip().lower()
    try:
        confidence = float(item.get("confidence", 0.5))
    except (TypeError, ValueError, OverflowError):
        # OverflowError: json.loads yields arbitrary-size ints, and
        # float() raises on ones too large to represent.
        confidence = 0.5
    if math.isnan(confidence):
        # json.loads accepts the NaN literal; without this guard the
        # clamp below would turn NaN into 1.0 (maximum confidence).
        confidence = 0.5
    confidence = max(0.0, min(1.0, confidence))
    # Domain knowledge carries its domain as a content prefix when the
    # model supplied no project.
    if domain and not model_project:
        content = f"[{domain}] {content}"
    return {
        "type": mem_type,
        "content": content[:1000],
        "project": model_project,
        "domain": domain,
        "confidence": confidence,
    }

View File

@@ -49,7 +49,6 @@ Implementation notes:
from __future__ import annotations
import json
import os
import shutil
import subprocess
@@ -58,92 +57,21 @@ from dataclasses import dataclass
from functools import lru_cache
from atocore.interactions.service import Interaction
from atocore.memory._llm_prompt import (
LLM_EXTRACTOR_VERSION,
SYSTEM_PROMPT as _SYSTEM_PROMPT,
build_user_message,
normalize_candidate_item,
parse_llm_json_array,
)
from atocore.memory.extractor import MemoryCandidate
from atocore.memory.service import MEMORY_TYPES
from atocore.observability.logger import get_logger
log = get_logger("extractor_llm")
LLM_EXTRACTOR_VERSION = "llm-0.4.0"
DEFAULT_MODEL = os.environ.get("ATOCORE_LLM_EXTRACTOR_MODEL", "sonnet")
DEFAULT_TIMEOUT_S = float(os.environ.get("ATOCORE_LLM_EXTRACTOR_TIMEOUT_S", "90"))
MAX_RESPONSE_CHARS = 8000
MAX_PROMPT_CHARS = 2000
_SYSTEM_PROMPT = """You extract memory candidates from LLM conversation turns for a personal context engine called AtoCore.
AtoCore is the brain for Atomaste's engineering work. Known projects:
p04-gigabit, p05-interferometer, p06-polisher, atomizer-v2, atocore,
abb-space. Unknown project names — still tag them, the system auto-detects.
Your job is to emit SIGNALS that matter for future context. Be aggressive:
err on the side of capturing useful signal. Triage filters noise downstream.
WHAT TO EMIT (in order of importance):
1. PROJECT ACTIVITY — any mention of a project with context worth remembering:
- "Schott quote received for ABB-Space" (event + project)
- "Cédric asked about p06 firmware timing" (stakeholder event)
- "Still waiting on Zygo lead-time from Nabeel" (blocker status)
- "p05 vendor decision needs to happen this week" (action item)
2. DECISIONS AND CHOICES — anything that commits to a direction:
- "Going with Zygo Verifire SV for p05" (decision)
- "Dropping stitching from primary workflow" (design choice)
- "USB SSD mandatory, not SD card" (architectural commitment)
3. DURABLE ENGINEERING INSIGHT — earned knowledge that generalizes:
- "CTE gradient dominates WFE at F/1.2" (materials insight)
- "Preston model breaks below 5N because contact assumption fails"
- "m=1 coma NOT correctable by force modulation" (controls insight)
Test: would a competent engineer NEED experience to know this?
If it's textbook/google-findable, skip it.
4. STAKEHOLDER AND VENDOR EVENTS:
- "Email sent to Nabeel 2026-04-13 asking for lead time"
- "Meeting with Jason on Table 7 next Tuesday"
- "Starspec wants updated CAD by Friday"
5. PREFERENCES AND ADAPTATIONS that shape how Antoine works:
- "Antoine prefers OAuth over API keys"
- "Extraction stays off the capture hot path"
WHAT TO SKIP:
- Pure conversational filler ("ok thanks", "let me check")
- Instructional help content ("run this command", "here's how to...")
- Obvious textbook facts anyone can google in 30 seconds
- Session meta-chatter ("let me commit this", "deploy running")
- Transient system state snapshots ("36 active memories right now")
CANDIDATE TYPES — choose the best fit:
- project — a fact, decision, or event specific to one named project
- knowledge — durable engineering insight (use domain, not project)
- preference — how Antoine works / wants things done
- adaptation — a standing rule or adjustment to behavior
- episodic — a stakeholder event or milestone worth remembering
DOMAINS for knowledge candidates (required when type=knowledge and project is empty):
physics, materials, optics, mechanics, manufacturing, metrology,
controls, software, math, finance, business
TRUST HIERARCHY:
- project-specific: set project to the project id, leave domain empty
- domain knowledge: set domain, leave project empty
- events/activity: use project, type=project or episodic
- one conversation can produce MULTIPLE candidates — emit them all
OUTPUT RULES:
- Each candidate content under 250 characters, stands alone
- Default confidence 0.5. Raise to 0.7 only for ratified/committed claims.
- Raw JSON array, no prose, no markdown fences
- Empty array [] is fine when the conversation has no durable signal
Each element:
{"type": "project|knowledge|preference|adaptation|episodic", "content": "...", "project": "...", "domain": "", "confidence": 0.5}"""
@dataclass
@@ -206,13 +134,10 @@ def extract_candidates_llm_verbose(
if not response_text:
return LLMExtractionResult(candidates=[], raw_output="", error="empty_response")
prompt_excerpt = (interaction.prompt or "")[:MAX_PROMPT_CHARS]
response_excerpt = response_text[:MAX_RESPONSE_CHARS]
user_message = (
f"PROJECT HINT (may be empty): {interaction.project or ''}\n\n"
f"USER PROMPT:\n{prompt_excerpt}\n\n"
f"ASSISTANT RESPONSE:\n{response_excerpt}\n\n"
"Return the JSON array now."
user_message = build_user_message(
interaction.prompt or "",
response_text,
interaction.project or "",
)
args = [
@@ -270,50 +195,25 @@ def extract_candidates_llm_verbose(
def _parse_candidates(raw_output: str, interaction: Interaction) -> list[MemoryCandidate]:
"""Parse the model's JSON output into MemoryCandidate objects.
Tolerates common model glitches: surrounding whitespace, stray
markdown fences, leading/trailing prose. Silently drops malformed
array elements rather than raising.
Shared stripping + per-item validation live in
``atocore.memory._llm_prompt``. This function adds the container-
only R9 project attribution: registry-check model_project and fall
back to the interaction scope when set.
"""
text = raw_output.strip()
if text.startswith("```"):
text = text.strip("`")
first_newline = text.find("\n")
if first_newline >= 0:
text = text[first_newline + 1 :]
if text.endswith("```"):
text = text[:-3]
text = text.strip()
if not text or text == "[]":
return []
if not text.lstrip().startswith("["):
start = text.find("[")
end = text.rfind("]")
if start >= 0 and end > start:
text = text[start : end + 1]
try:
parsed = json.loads(text)
except json.JSONDecodeError as exc:
log.error("llm_extractor_parse_failed", error=str(exc), raw_prefix=raw_output[:120])
return []
if not isinstance(parsed, list):
return []
raw_items = parse_llm_json_array(raw_output)
if not raw_items and raw_output.strip() not in ("", "[]"):
log.error("llm_extractor_parse_failed", raw_prefix=raw_output[:120])
results: list[MemoryCandidate] = []
for item in parsed:
if not isinstance(item, dict):
for raw_item in raw_items:
normalized = normalize_candidate_item(raw_item)
if normalized is None:
continue
mem_type = str(item.get("type") or "").strip().lower()
content = str(item.get("content") or "").strip()
model_project = str(item.get("project") or "").strip()
# R9 trust hierarchy for project attribution:
# 1. Interaction scope always wins when set (strongest signal)
# 2. Model project used only when interaction is unscoped
# AND model project resolves to a registered project
# 3. Empty string when both are empty/unregistered
model_project = normalized["project"]
# R9 trust hierarchy: interaction scope wins; else registry-
# resolve the model's tag; else keep the model's tag so auto-
# triage can surface unregistered projects.
if interaction.project:
project = interaction.project
elif model_project:
@@ -328,9 +228,6 @@ def _parse_candidates(raw_output: str, interaction: Interaction) -> list[MemoryC
if resolved in registered_ids:
project = resolved
else:
# Unregistered project — keep the model's tag so
# auto-triage / the operator can see it and decide
# whether to register it as a new project or lead.
project = model_project
log.info(
"unregistered_project_detected",
@@ -338,34 +235,19 @@ def _parse_candidates(raw_output: str, interaction: Interaction) -> list[MemoryC
interaction_id=interaction.id,
)
except Exception:
project = model_project if model_project else ""
project = model_project
else:
project = ""
domain = str(item.get("domain") or "").strip().lower()
confidence_raw = item.get("confidence", 0.5)
if mem_type not in MEMORY_TYPES:
continue
if not content:
continue
# Domain knowledge: embed the domain tag in the content so it
# survives without a schema migration. The context builder
# can match on it via query-relevance ranking, and a future
# migration can parse it into a proper column.
if domain and not project:
content = f"[{domain}] {content}"
try:
confidence = float(confidence_raw)
except (TypeError, ValueError):
confidence = 0.5
confidence = max(0.0, min(1.0, confidence))
content = normalized["content"]
results.append(
MemoryCandidate(
memory_type=mem_type,
content=content[:1000],
memory_type=normalized["type"],
content=content,
rule="llm_extraction",
source_span=content[:200],
project=project,
confidence=confidence,
confidence=normalized["confidence"],
source_interaction_id=interaction.id,
extractor_version=LLM_EXTRACTOR_VERSION,
)