From c2e70642386acf7414e9709be098ce8391451dc2 Mon Sep 17 00:00:00 2001 From: Anto01 Date: Wed, 15 Apr 2026 10:47:01 -0400 Subject: [PATCH] fix(extraction): R11 container 503 + R12 shared prompt module R11: POST /admin/extract-batch with mode=llm now returns 503 when the claude CLI is unavailable (was silently returning success with 0 candidates), with a message pointing at the host-side script. +2 tests. R12: extracted SYSTEM_PROMPT + parse_llm_json_array + normalize_candidate_item + build_user_message into stdlib-only src/atocore/memory/_llm_prompt.py. Both the container extractor and scripts/batch_llm_extract_live.py now import from it, eliminating the prompt/parser drift risk. Tests 297 -> 299. Co-Authored-By: Claude Opus 4.6 (1M context) --- DEV-LEDGER.md | 16 +-- scripts/batch_llm_extract_live.py | 183 ++++++---------------------- src/atocore/api/routes.py | 13 ++ src/atocore/memory/_llm_prompt.py | 183 ++++++++++++++++++++++++++++ src/atocore/memory/extractor_llm.py | 182 +++++---------------------- tests/test_extraction_pipeline.py | 35 ++++++ 6 files changed, 310 insertions(+), 302 deletions(-) create mode 100644 src/atocore/memory/_llm_prompt.py diff --git a/DEV-LEDGER.md b/DEV-LEDGER.md index e45eb3b..b93ac8f 100644 --- a/DEV-LEDGER.md +++ b/DEV-LEDGER.md @@ -6,10 +6,10 @@ ## Orientation -- **live_sha** (Dalidou `/health` build_sha): `3f23ca1` (signal-aggressive extractor live; fix needs redeploy) -- **last_updated**: 2026-04-14 by Claude (OpenClaw importer live, Karpathy upgrades shipped) -- **main_tip**: `58ea21d` -- **test_count**: 297 passing (+7 engineering layer tests) +- **live_sha** (Dalidou `/health` build_sha): `58ea21d` (verified 2026-04-14 via /health) +- **last_updated**: 2026-04-14 by Claude (R11+R12 closed, R3 declined) +- **main_tip**: `dc9fdd3` (pre-R11/R12 commit; new commit pending for this session) +- **test_count**: 299 passing (+2 R11 api-503 tests) - **harness**: `17/18 PASS` (only p06-tailscale — chunk bleed) - **vectors**: 33,253 - **active_memories**: 84 (31 project, 23 knowledge, 10 episodic, 8 adaptation, 7 preference, 5 identity) @@ -131,7 +131,7 @@ One branch `codex/extractor-eval-loop` for Day 1-5, a second `codex/retrieval-ha |-----|--------|----------|------------------------------------|-------------------------------------------------------------------------|--------------|--------|------------|-------------| | R1 | Codex | P1 | deploy/hooks/capture_stop.py:76-85 | Live Claude capture still omits `extract`, so "loop closed both sides" remains overstated in practice even though the API supports it | fixed | Claude | 2026-04-11 | c67bec0 | | R2 | Codex | P1 | src/atocore/context/builder.py | Project memories excluded from pack | fixed | Claude | 2026-04-11 | 8ea53f4 | -| R3 | Claude | P2 | src/atocore/memory/extractor.py | Rule cues (`## Decision:`) never fire on conversational LLM text | open | Claude | 2026-04-11 | | +| R3 | Claude | P2 | src/atocore/memory/extractor.py | Rule cues (`## Decision:`) never fire on conversational LLM text | declined | Claude | 2026-04-11 | see 2026-04-14 session log | | R4 | Codex | P2 | DEV-LEDGER.md:11 | Orientation `main_tip` was stale versus `HEAD` / `origin/main` | fixed | Codex | 2026-04-11 | 81307ce | | R5 | Codex | P1 | src/atocore/interactions/service.py:157-174 | The deployed extraction path still calls only the rule extractor; the new LLM extractor is eval/script-only, so Day 4 "gate cleared" is true as a benchmark result but not as an operational extraction path | fixed | Claude | 2026-04-12 | c67bec0 | | R6 | Codex | P1 | src/atocore/memory/extractor_llm.py:258-276 | LLM extraction accepts model-supplied `project` verbatim with no fallback to `interaction.project`; live triage promoted a clearly p06 memory (offline/network rule) as project=`""`, which explains the p06-offline-design harness miss and falsifies the current "all 3 failures are budget-contention" claim | fixed | Claude | 2026-04-12 | 39d73e9 | @@ -139,8 +139,8 @@ One branch `codex/extractor-eval-loop` for Day 1-5, a second `codex/retrieval-ha | R8 | Codex | P2 | tests/test_extractor_llm.py:1-7 | LLM extractor tests stop at parser/failure contracts; there is no automated coverage for the script-only persistence/review path that produced the 16 promoted memories, including project-scope preservation | fixed | Claude | 2026-04-12 | 69c9717 | | R9 | Codex | P2 | src/atocore/memory/extractor_llm.py:258-259 | The R6 fallback only repairs empty project output. A wrong non-empty model project still overrides the interaction's known scope, so project attribution is improved but not yet trust-preserving. | fixed | Claude | 2026-04-12 | e5e9a99 | | R10 | Codex | P2 | docs/master-plan-status.md:31-33 | "Phase 8 - OpenClaw Integration" is fair as a baseline milestone, but not as a "primary" integration claim. `t420-openclaw/atocore.py` currently covers a narrow read-oriented subset (13 request shapes vs 32 API routes) plus fail-open health, while memory/interactions/admin write paths remain out of surface. | open | Claude | 2026-04-12 | | -| R11 | Codex | P2 | src/atocore/api/routes.py:773-845 | `POST /admin/extract-batch` still accepts `mode="llm"` inside the container and returns a successful 0-candidate result instead of surfacing that host-only LLM extraction is unavailable from this runtime. That is a misleading API contract for operators. | open | Claude | 2026-04-12 | | -| R12 | Codex | P2 | scripts/batch_llm_extract_live.py:39-190 | The host-side extractor duplicates the LLM system prompt and JSON parsing logic from `src/atocore/memory/extractor_llm.py`. It works today, but this is now a prompt/parser drift risk across the container and host implementations. | open | Claude | 2026-04-12 | | +| R11 | Codex | P2 | src/atocore/api/routes.py:773-845 | `POST /admin/extract-batch` still accepts `mode="llm"` inside the container and returns a successful 0-candidate result instead of surfacing that host-only LLM extraction is unavailable from this runtime. That is a misleading API contract for operators. | fixed | Claude | 2026-04-12 | (pending) | +| R12 | Codex | P2 | scripts/batch_llm_extract_live.py:39-190 | The host-side extractor duplicates the LLM system prompt and JSON parsing logic from `src/atocore/memory/extractor_llm.py`. It works today, but this is now a prompt/parser drift risk across the container and host implementations. | fixed | Claude | 2026-04-12 | (pending) | | R13 | Codex | P2 | DEV-LEDGER.md:12 | The new `286 passing` test-count claim is not reproducibly auditable from the current audit environments: neither Dalidou nor the clean worktree has `pytest` available. The claim may be true in Claude's dev shell, but it remains unverified in this audit. | open | Claude | 2026-04-12 | | ## Recent Decisions @@ -159,6 +159,8 @@ One branch `codex/extractor-eval-loop` for Day 1-5, a second `codex/retrieval-ha ## Session Log +- **2026-04-14 Claude (pm)** Closed R11+R12, declined R3. **R11 (fixed):** `POST /admin/extract-batch` with `mode="llm"` now returns 503 when the `claude` CLI is not on PATH, with a message pointing at the host-side script. Previously it silently returned a success-0 payload, masking host-vs-container truth. 2 new tests in `test_extraction_pipeline.py` cover the 503 path and the rule-mode-still-works path. **R12 (fixed):** extracted shared `SYSTEM_PROMPT` + `parse_llm_json_array` + `normalize_candidate_item` + `build_user_message` into stdlib-only `src/atocore/memory/_llm_prompt.py`. Both `src/atocore/memory/extractor_llm.py` (container) and `scripts/batch_llm_extract_live.py` (host) now import from it. The host script uses `sys.path` to reach the stdlib-only module without needing the full atocore package. Project-attribution policy stays path-specific (container uses registry-check; host defers to server). **R3 (declined):** rule cues not firing on conversational LLM text is by design now — the LLM extractor (llm-0.4.0) is the production path for conversational content as of the Day 4 gate (2026-04-12). Expanding rules to match conversational prose risks the FP blowup Day 2 already showed. Rule extractor stays narrow for structural PKM text. Tests 297 → 299. Live `/health` still `58ea21d`; this session's changes need deploy. + - **2026-04-14 Claude** MAJOR session: Engineering knowledge layer V1 (Layer 2) built — entity + relationship tables, 15 types, 12 relationship kinds, 35 bootstrapped entities across p04/p05/p06. Human Mirror (Layer 3) — GET /projects/{name}/mirror.html + navigable wiki at /wiki with search. Karpathy-inspired upgrades: contradiction detection in triage, weekly lint pass, weekly synthesis pass producing "current state" paragraphs at top of project pages. Auto-detection of new projects from extraction. Registry persistence fix (ATOCORE_PROJECT_REGISTRY_DIR env var). abb-space/p08 aliases added, atomizer-v2 ingested (568 docs, +12,472 vectors). Identity/preference seed (6 new), signal-aggressive extractor rewrite (llm-0.4.0), auto vault refresh in cron. **OpenClaw one-way pull importer** built per codex proposal — reads /home/papa/clawd SOUL.md, USER.md, MEMORY.md, MODEL-ROUTING.md, memory/*.md via SSH, hash-delta import, pipeline triages. First import: 10 candidates → 10 promoted with lenient triage rule. Active memories 47→84. State entries 61→78. Tests 290→297. Dashboard at /admin/dashboard. Wiki at /wiki. diff --git a/scripts/batch_llm_extract_live.py b/scripts/batch_llm_extract_live.py index c129d73..9680faa 100644 --- a/scripts/batch_llm_extract_live.py +++ b/scripts/batch_llm_extract_live.py @@ -1,12 +1,15 @@ -"""Host-side LLM batch extraction — pure HTTP client, no atocore imports. +"""Host-side LLM batch extraction — HTTP client + shared prompt module. Fetches interactions from the AtoCore API, runs ``claude -p`` locally -for each, and POSTs candidates back. Zero dependency on atocore source -or Python packages — only uses stdlib + the ``claude`` CLI on PATH. +for each, and POSTs candidates back. Uses stdlib + the ``claude`` CLI +on PATH, plus the stdlib-only shared prompt/parser module at +``atocore.memory._llm_prompt`` to eliminate prompt/parser drift +against the in-container extractor (R12). This is necessary because the ``claude`` CLI is on the Dalidou HOST but not inside the Docker container, and the host's Python doesn't -have the container's dependencies (pydantic_settings, etc.). +have the container's dependencies (pydantic_settings, etc.) — so we +only import the one stdlib-only module, not the full atocore package. """ from __future__ import annotations @@ -23,88 +26,26 @@ import urllib.parse import urllib.request from datetime import datetime, timezone +# R12: share the prompt + parser with the in-container extractor so +# the two paths can't drift. The imported module is stdlib-only by +# design; see src/atocore/memory/_llm_prompt.py. +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_SRC_DIR = os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "src")) +if _SRC_DIR not in sys.path: + sys.path.insert(0, _SRC_DIR) + +from atocore.memory._llm_prompt import ( # noqa: E402 + MEMORY_TYPES, + SYSTEM_PROMPT, + build_user_message, + normalize_candidate_item, + parse_llm_json_array, +) + DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://localhost:8100") DEFAULT_MODEL = os.environ.get("ATOCORE_LLM_EXTRACTOR_MODEL", "sonnet") DEFAULT_TIMEOUT_S = float(os.environ.get("ATOCORE_LLM_EXTRACTOR_TIMEOUT_S", "90")) -MAX_RESPONSE_CHARS = 8000 -MAX_PROMPT_CHARS = 2000 -MEMORY_TYPES = {"identity", "preference", "project", "episodic", "knowledge", "adaptation"} - -SYSTEM_PROMPT = """You extract memory candidates from LLM conversation turns for a personal context engine called AtoCore. - -AtoCore is the brain for Atomaste's engineering work. Known projects: -p04-gigabit, p05-interferometer, p06-polisher, atomizer-v2, atocore, -abb-space. Unknown project names — still tag them, the system auto-detects. - -Your job is to emit SIGNALS that matter for future context. Be aggressive: -err on the side of capturing useful signal. Triage filters noise downstream. - -WHAT TO EMIT (in order of importance): - -1. PROJECT ACTIVITY — any mention of a project with context worth remembering: - - "Schott quote received for ABB-Space" (event + project) - - "Cédric asked about p06 firmware timing" (stakeholder event) - - "Still waiting on Zygo lead-time from Nabeel" (blocker status) - - "p05 vendor decision needs to happen this week" (action item) - -2. DECISIONS AND CHOICES — anything that commits to a direction: - - "Going with Zygo Verifire SV for p05" (decision) - - "Dropping stitching from primary workflow" (design choice) - - "USB SSD mandatory, not SD card" (architectural commitment) - -3. DURABLE ENGINEERING INSIGHT — earned knowledge that generalizes: - - "CTE gradient dominates WFE at F/1.2" (materials insight) - - "Preston model breaks below 5N because contact assumption fails" - - "m=1 coma NOT correctable by force modulation" (controls insight) - Test: would a competent engineer NEED experience to know this? - If it's textbook/google-findable, skip it. - -4. STAKEHOLDER AND VENDOR EVENTS: - - "Email sent to Nabeel 2026-04-13 asking for lead time" - - "Meeting with Jason on Table 7 next Tuesday" - - "Starspec wants updated CAD by Friday" - -5. PREFERENCES AND ADAPTATIONS that shape how Antoine works: - - "Antoine prefers OAuth over API keys" - - "Extraction stays off the capture hot path" - -WHAT TO SKIP: - -- Pure conversational filler ("ok thanks", "let me check") -- Instructional help content ("run this command", "here's how to...") -- Obvious textbook facts anyone can google in 30 seconds -- Session meta-chatter ("let me commit this", "deploy running") -- Transient system state snapshots ("36 active memories right now") - -CANDIDATE TYPES — choose the best fit: - -- project — a fact, decision, or event specific to one named project -- knowledge — durable engineering insight (use domain, not project) -- preference — how Antoine works / wants things done -- adaptation — a standing rule or adjustment to behavior -- episodic — a stakeholder event or milestone worth remembering - -DOMAINS for knowledge candidates (required when type=knowledge and project is empty): -physics, materials, optics, mechanics, manufacturing, metrology, -controls, software, math, finance, business - -TRUST HIERARCHY: - -- project-specific: set project to the project id, leave domain empty -- domain knowledge: set domain, leave project empty -- events/activity: use project, type=project or episodic -- one conversation can produce MULTIPLE candidates — emit them all - -OUTPUT RULES: - -- Each candidate content under 250 characters, stands alone -- Default confidence 0.5. Raise to 0.7 only for ratified/committed claims. -- Raw JSON array, no prose, no markdown fences -- Empty array [] is fine when the conversation has no durable signal - -Each element: -{"type": "project|knowledge|preference|adaptation|episodic", "content": "...", "project": "...", "domain": "", "confidence": 0.5}""" _sandbox_cwd = None @@ -175,14 +116,7 @@ def extract_one(prompt, response, project, model, timeout_s): if not shutil.which("claude"): return [], "claude_cli_missing" - prompt_excerpt = prompt[:MAX_PROMPT_CHARS] - response_excerpt = response[:MAX_RESPONSE_CHARS] - user_message = ( - f"PROJECT HINT (may be empty): {project}\n\n" - f"USER PROMPT:\n{prompt_excerpt}\n\n" - f"ASSISTANT RESPONSE:\n{response_excerpt}\n\n" - "Return the JSON array now." - ) + user_message = build_user_message(prompt, response, project) args = [ "claude", "-p", @@ -211,66 +145,25 @@ def extract_one(prompt, response, project, model, timeout_s): def parse_candidates(raw, interaction_project): - """Parse model JSON output into candidate dicts.""" - text = raw.strip() - if text.startswith("```"): - text = text.strip("`") - nl = text.find("\n") - if nl >= 0: - text = text[nl + 1:] - if text.endswith("```"): - text = text[:-3] - text = text.strip() - - if not text or text == "[]": - return [] - - if not text.lstrip().startswith("["): - start = text.find("[") - end = text.rfind("]") - if start >= 0 and end > start: - text = text[start:end + 1] - - try: - parsed = json.loads(text) - except json.JSONDecodeError: - return [] - - if not isinstance(parsed, list): - return [] + """Parse model JSON output into candidate dicts. + Stripping + per-item normalization come from the shared + ``_llm_prompt`` module. Host-side project attribution: interaction + scope wins, otherwise keep the model's tag (the API's own R9 + registry-check will happen server-side in the container on write; + here we preserve the signal instead of dropping it). + """ results = [] - for item in parsed: - if not isinstance(item, dict): + for item in parse_llm_json_array(raw): + normalized = normalize_candidate_item(item) + if normalized is None: continue - mem_type = str(item.get("type") or "").strip().lower() - content = str(item.get("content") or "").strip() - model_project = str(item.get("project") or "").strip() - domain = str(item.get("domain") or "").strip().lower() - # R9 trust hierarchy: interaction scope always wins when set. - # For unscoped interactions, keep model's project tag even if - # unregistered — the system will detect new projects/leads. - if interaction_project: - project = interaction_project - elif model_project: - project = model_project - else: - project = "" - # Domain knowledge: embed tag in content for cross-project retrieval - if domain and not project: - content = f"[{domain}] {content}" - conf = item.get("confidence", 0.5) - if mem_type not in MEMORY_TYPES or not content: - continue - try: - conf = max(0.0, min(1.0, float(conf))) - except (TypeError, ValueError): - conf = 0.5 + project = interaction_project or normalized["project"] or "" results.append({ - "memory_type": mem_type, - "content": content[:1000], + "memory_type": normalized["type"], + "content": normalized["content"], "project": project, - "confidence": conf, + "confidence": normalized["confidence"], }) return results diff --git a/src/atocore/api/routes.py b/src/atocore/api/routes.py index 445ac08..f8e98d8 100644 --- a/src/atocore/api/routes.py +++ b/src/atocore/api/routes.py @@ -55,6 +55,7 @@ from atocore.memory.extractor import ( ) from atocore.memory.extractor_llm import ( LLM_EXTRACTOR_VERSION, + _cli_available as _llm_cli_available, extract_candidates_llm, ) from atocore.memory.reinforcement import reinforce_from_interaction @@ -832,6 +833,18 @@ def api_extract_batch(req: ExtractBatchRequest | None = None) -> dict: invoke this endpoint explicitly (cron, manual curl, CLI). """ payload = req or ExtractBatchRequest() + + if payload.mode == "llm" and not _llm_cli_available(): + raise HTTPException( + status_code=503, + detail=( + "LLM extraction unavailable in this runtime: the `claude` CLI " + "is not on PATH. Run host-side via " + "`scripts/batch_llm_extract_live.py` instead, or call this " + "endpoint with mode=\"rule\"." + ), + ) + since = payload.since if not since: diff --git a/src/atocore/memory/_llm_prompt.py b/src/atocore/memory/_llm_prompt.py new file mode 100644 index 0000000..2d0c1b0 --- /dev/null +++ b/src/atocore/memory/_llm_prompt.py @@ -0,0 +1,183 @@ +"""Shared LLM-extractor prompt + parser (stdlib-only). + +R12: single source of truth for the system prompt, memory type set, +size limits, and raw JSON parsing used by both paths that shell out +to ``claude -p``: + +- ``atocore.memory.extractor_llm`` (in-container extractor, wraps the + parsed dicts in ``MemoryCandidate`` with registry-checked project + attribution) +- ``scripts/batch_llm_extract_live.py`` (host-side extractor, can't + import the full atocore package because Dalidou's host Python lacks + the container's deps; imports this module via ``sys.path``) + +This module MUST stay stdlib-only. No ``atocore`` imports, no third- +party packages. Callers apply their own project-attribution policy on +top of the normalized dicts this module emits. +""" + +from __future__ import annotations + +import json +from typing import Any + +LLM_EXTRACTOR_VERSION = "llm-0.4.0" +MAX_RESPONSE_CHARS = 8000 +MAX_PROMPT_CHARS = 2000 +MEMORY_TYPES = {"identity", "preference", "project", "episodic", "knowledge", "adaptation"} + +SYSTEM_PROMPT = """You extract memory candidates from LLM conversation turns for a personal context engine called AtoCore. + +AtoCore is the brain for Atomaste's engineering work. Known projects: +p04-gigabit, p05-interferometer, p06-polisher, atomizer-v2, atocore, +abb-space. Unknown project names — still tag them, the system auto-detects. + +Your job is to emit SIGNALS that matter for future context. Be aggressive: +err on the side of capturing useful signal. Triage filters noise downstream. + +WHAT TO EMIT (in order of importance): + +1. PROJECT ACTIVITY — any mention of a project with context worth remembering: + - "Schott quote received for ABB-Space" (event + project) + - "Cédric asked about p06 firmware timing" (stakeholder event) + - "Still waiting on Zygo lead-time from Nabeel" (blocker status) + - "p05 vendor decision needs to happen this week" (action item) + +2. DECISIONS AND CHOICES — anything that commits to a direction: + - "Going with Zygo Verifire SV for p05" (decision) + - "Dropping stitching from primary workflow" (design choice) + - "USB SSD mandatory, not SD card" (architectural commitment) + +3. DURABLE ENGINEERING INSIGHT — earned knowledge that generalizes: + - "CTE gradient dominates WFE at F/1.2" (materials insight) + - "Preston model breaks below 5N because contact assumption fails" + - "m=1 coma NOT correctable by force modulation" (controls insight) + Test: would a competent engineer NEED experience to know this? + If it's textbook/google-findable, skip it. + +4. STAKEHOLDER AND VENDOR EVENTS: + - "Email sent to Nabeel 2026-04-13 asking for lead time" + - "Meeting with Jason on Table 7 next Tuesday" + - "Starspec wants updated CAD by Friday" + +5. PREFERENCES AND ADAPTATIONS that shape how Antoine works: + - "Antoine prefers OAuth over API keys" + - "Extraction stays off the capture hot path" + +WHAT TO SKIP: + +- Pure conversational filler ("ok thanks", "let me check") +- Instructional help content ("run this command", "here's how to...") +- Obvious textbook facts anyone can google in 30 seconds +- Session meta-chatter ("let me commit this", "deploy running") +- Transient system state snapshots ("36 active memories right now") + +CANDIDATE TYPES — choose the best fit: + +- project — a fact, decision, or event specific to one named project +- knowledge — durable engineering insight (use domain, not project) +- preference — how Antoine works / wants things done +- adaptation — a standing rule or adjustment to behavior +- episodic — a stakeholder event or milestone worth remembering + +DOMAINS for knowledge candidates (required when type=knowledge and project is empty): +physics, materials, optics, mechanics, manufacturing, metrology, +controls, software, math, finance, business + +TRUST HIERARCHY: + +- project-specific: set project to the project id, leave domain empty +- domain knowledge: set domain, leave project empty +- events/activity: use project, type=project or episodic +- one conversation can produce MULTIPLE candidates — emit them all + +OUTPUT RULES: + +- Each candidate content under 250 characters, stands alone +- Default confidence 0.5. Raise to 0.7 only for ratified/committed claims. +- Raw JSON array, no prose, no markdown fences +- Empty array [] is fine when the conversation has no durable signal + +Each element: +{"type": "project|knowledge|preference|adaptation|episodic", "content": "...", "project": "...", "domain": "", "confidence": 0.5}""" + + +def build_user_message(prompt: str, response: str, project_hint: str) -> str: + prompt_excerpt = (prompt or "")[:MAX_PROMPT_CHARS] + response_excerpt = (response or "")[:MAX_RESPONSE_CHARS] + return ( + f"PROJECT HINT (may be empty): {project_hint or ''}\n\n" + f"USER PROMPT:\n{prompt_excerpt}\n\n" + f"ASSISTANT RESPONSE:\n{response_excerpt}\n\n" + "Return the JSON array now." + ) + + +def parse_llm_json_array(raw_output: str) -> list[dict[str, Any]]: + """Strip markdown fences / leading prose and return the parsed JSON + array as a list of raw dicts. Returns an empty list on any parse + failure — callers decide whether to log.""" + text = (raw_output or "").strip() + if text.startswith("```"): + text = text.strip("`") + nl = text.find("\n") + if nl >= 0: + text = text[nl + 1:] + if text.endswith("```"): + text = text[:-3] + text = text.strip() + + if not text or text == "[]": + return [] + + if not text.lstrip().startswith("["): + start = text.find("[") + end = text.rfind("]") + if start >= 0 and end > start: + text = text[start:end + 1] + + try: + parsed = json.loads(text) + except json.JSONDecodeError: + return [] + + if not isinstance(parsed, list): + return [] + return [item for item in parsed if isinstance(item, dict)] + + +def normalize_candidate_item(item: dict[str, Any]) -> dict[str, Any] | None: + """Validate and normalize one raw model item into a candidate dict. + + Returns None if the item fails basic validation (unknown type, + empty content). Does NOT apply project-attribution policy — that's + the caller's job, since the registry-check differs between the + in-container path and the host path. + + Output keys: type, content, project (raw model value), domain, + confidence. + """ + mem_type = str(item.get("type") or "").strip().lower() + content = str(item.get("content") or "").strip() + if mem_type not in MEMORY_TYPES or not content: + return None + + model_project = str(item.get("project") or "").strip() + domain = str(item.get("domain") or "").strip().lower() + + try: + confidence = float(item.get("confidence", 0.5)) + except (TypeError, ValueError): + confidence = 0.5 + confidence = max(0.0, min(1.0, confidence)) + + if domain and not model_project: + content = f"[{domain}] {content}" + + return { + "type": mem_type, + "content": content[:1000], + "project": model_project, + "domain": domain, + "confidence": confidence, + } diff --git a/src/atocore/memory/extractor_llm.py b/src/atocore/memory/extractor_llm.py index acbb3a6..de2f523 100644 --- a/src/atocore/memory/extractor_llm.py +++ b/src/atocore/memory/extractor_llm.py @@ -49,7 +49,6 @@ Implementation notes: from __future__ import annotations -import json import os import shutil import subprocess @@ -58,92 +57,21 @@ from dataclasses import dataclass from functools import lru_cache from atocore.interactions.service import Interaction +from atocore.memory._llm_prompt import ( + LLM_EXTRACTOR_VERSION, + SYSTEM_PROMPT as _SYSTEM_PROMPT, + build_user_message, + normalize_candidate_item, + parse_llm_json_array, +) from atocore.memory.extractor import MemoryCandidate -from atocore.memory.service import MEMORY_TYPES from atocore.observability.logger import get_logger log = get_logger("extractor_llm") -LLM_EXTRACTOR_VERSION = "llm-0.4.0" DEFAULT_MODEL = os.environ.get("ATOCORE_LLM_EXTRACTOR_MODEL", "sonnet") DEFAULT_TIMEOUT_S = float(os.environ.get("ATOCORE_LLM_EXTRACTOR_TIMEOUT_S", "90")) -MAX_RESPONSE_CHARS = 8000 -MAX_PROMPT_CHARS = 2000 -_SYSTEM_PROMPT = """You extract memory candidates from LLM conversation turns for a personal context engine called AtoCore. - -AtoCore is the brain for Atomaste's engineering work. Known projects: -p04-gigabit, p05-interferometer, p06-polisher, atomizer-v2, atocore, -abb-space. Unknown project names — still tag them, the system auto-detects. - -Your job is to emit SIGNALS that matter for future context. Be aggressive: -err on the side of capturing useful signal. Triage filters noise downstream. - -WHAT TO EMIT (in order of importance): - -1. PROJECT ACTIVITY — any mention of a project with context worth remembering: - - "Schott quote received for ABB-Space" (event + project) - - "Cédric asked about p06 firmware timing" (stakeholder event) - - "Still waiting on Zygo lead-time from Nabeel" (blocker status) - - "p05 vendor decision needs to happen this week" (action item) - -2. DECISIONS AND CHOICES — anything that commits to a direction: - - "Going with Zygo Verifire SV for p05" (decision) - - "Dropping stitching from primary workflow" (design choice) - - "USB SSD mandatory, not SD card" (architectural commitment) - -3. DURABLE ENGINEERING INSIGHT — earned knowledge that generalizes: - - "CTE gradient dominates WFE at F/1.2" (materials insight) - - "Preston model breaks below 5N because contact assumption fails" - - "m=1 coma NOT correctable by force modulation" (controls insight) - Test: would a competent engineer NEED experience to know this? - If it's textbook/google-findable, skip it. - -4. STAKEHOLDER AND VENDOR EVENTS: - - "Email sent to Nabeel 2026-04-13 asking for lead time" - - "Meeting with Jason on Table 7 next Tuesday" - - "Starspec wants updated CAD by Friday" - -5. PREFERENCES AND ADAPTATIONS that shape how Antoine works: - - "Antoine prefers OAuth over API keys" - - "Extraction stays off the capture hot path" - -WHAT TO SKIP: - -- Pure conversational filler ("ok thanks", "let me check") -- Instructional help content ("run this command", "here's how to...") -- Obvious textbook facts anyone can google in 30 seconds -- Session meta-chatter ("let me commit this", "deploy running") -- Transient system state snapshots ("36 active memories right now") - -CANDIDATE TYPES — choose the best fit: - -- project — a fact, decision, or event specific to one named project -- knowledge — durable engineering insight (use domain, not project) -- preference — how Antoine works / wants things done -- adaptation — a standing rule or adjustment to behavior -- episodic — a stakeholder event or milestone worth remembering - -DOMAINS for knowledge candidates (required when type=knowledge and project is empty): -physics, materials, optics, mechanics, manufacturing, metrology, -controls, software, math, finance, business - -TRUST HIERARCHY: - -- project-specific: set project to the project id, leave domain empty -- domain knowledge: set domain, leave project empty -- events/activity: use project, type=project or episodic -- one conversation can produce MULTIPLE candidates — emit them all - -OUTPUT RULES: - -- Each candidate content under 250 characters, stands alone -- Default confidence 0.5. Raise to 0.7 only for ratified/committed claims. -- Raw JSON array, no prose, no markdown fences -- Empty array [] is fine when the conversation has no durable signal - -Each element: -{"type": "project|knowledge|preference|adaptation|episodic", "content": "...", "project": "...", "domain": "", "confidence": 0.5}""" @dataclass @@ -206,13 +134,10 @@ def extract_candidates_llm_verbose( if not response_text: return LLMExtractionResult(candidates=[], raw_output="", error="empty_response") - prompt_excerpt = (interaction.prompt or "")[:MAX_PROMPT_CHARS] - response_excerpt = response_text[:MAX_RESPONSE_CHARS] - user_message = ( - f"PROJECT HINT (may be empty): {interaction.project or ''}\n\n" - f"USER PROMPT:\n{prompt_excerpt}\n\n" - f"ASSISTANT RESPONSE:\n{response_excerpt}\n\n" - "Return the JSON array now." + user_message = build_user_message( + interaction.prompt or "", + response_text, + interaction.project or "", ) args = [ @@ -270,50 +195,25 @@ def extract_candidates_llm_verbose( def _parse_candidates(raw_output: str, interaction: Interaction) -> list[MemoryCandidate]: """Parse the model's JSON output into MemoryCandidate objects. - Tolerates common model glitches: surrounding whitespace, stray - markdown fences, leading/trailing prose. Silently drops malformed - array elements rather than raising. + Shared stripping + per-item validation live in + ``atocore.memory._llm_prompt``. This function adds the container- + only R9 project attribution: registry-check model_project and fall + back to the interaction scope when set. """ - text = raw_output.strip() - if text.startswith("```"): - text = text.strip("`") - first_newline = text.find("\n") - if first_newline >= 0: - text = text[first_newline + 1 :] - if text.endswith("```"): - text = text[:-3] - text = text.strip() - - if not text or text == "[]": - return [] - - if not text.lstrip().startswith("["): - start = text.find("[") - end = text.rfind("]") - if start >= 0 and end > start: - text = text[start : end + 1] - - try: - parsed = json.loads(text) - except json.JSONDecodeError as exc: - log.error("llm_extractor_parse_failed", error=str(exc), raw_prefix=raw_output[:120]) - return [] - - if not isinstance(parsed, list): - return [] + raw_items = parse_llm_json_array(raw_output) + if not raw_items and raw_output.strip() not in ("", "[]"): + log.error("llm_extractor_parse_failed", raw_prefix=raw_output[:120]) results: list[MemoryCandidate] = [] - for item in parsed: - if not isinstance(item, dict): + for raw_item in raw_items: + normalized = normalize_candidate_item(raw_item) + if normalized is None: continue - mem_type = str(item.get("type") or "").strip().lower() - content = str(item.get("content") or "").strip() - model_project = str(item.get("project") or "").strip() - # R9 trust hierarchy for project attribution: - # 1. Interaction scope always wins when set (strongest signal) - # 2. Model project used only when interaction is unscoped - # AND model project resolves to a registered project - # 3. Empty string when both are empty/unregistered + + model_project = normalized["project"] + # R9 trust hierarchy: interaction scope wins; else registry- + # resolve the model's tag; else keep the model's tag so auto- + # triage can surface unregistered projects. if interaction.project: project = interaction.project elif model_project: @@ -328,9 +228,6 @@ def _parse_candidates(raw_output: str, interaction: Interaction) -> list[MemoryC if resolved in registered_ids: project = resolved else: - # Unregistered project — keep the model's tag so - # auto-triage / the operator can see it and decide - # whether to register it as a new project or lead. project = model_project log.info( "unregistered_project_detected", @@ -338,34 +235,19 @@ def _parse_candidates(raw_output: str, interaction: Interaction) -> list[MemoryC interaction_id=interaction.id, ) except Exception: - project = model_project if model_project else "" + project = model_project else: project = "" - domain = str(item.get("domain") or "").strip().lower() - confidence_raw = item.get("confidence", 0.5) - if mem_type not in MEMORY_TYPES: - continue - if not content: - continue - # Domain knowledge: embed the domain tag in the content so it - # survives without a schema migration. The context builder - # can match on it via query-relevance ranking, and a future - # migration can parse it into a proper column. - if domain and not project: - content = f"[{domain}] {content}" - try: - confidence = float(confidence_raw) - except (TypeError, ValueError): - confidence = 0.5 - confidence = max(0.0, min(1.0, confidence)) + + content = normalized["content"] results.append( MemoryCandidate( - memory_type=mem_type, - content=content[:1000], + memory_type=normalized["type"], + content=content, rule="llm_extraction", source_span=content[:200], project=project, - confidence=confidence, + confidence=normalized["confidence"], source_interaction_id=interaction.id, extractor_version=LLM_EXTRACTOR_VERSION, ) diff --git a/tests/test_extraction_pipeline.py b/tests/test_extraction_pipeline.py index 7c28b43..bee9b89 100644 --- a/tests/test_extraction_pipeline.py +++ b/tests/test_extraction_pipeline.py @@ -171,3 +171,38 @@ def test_llm_extraction_failure_returns_empty(tmp_data_dir, monkeypatch): # Nothing in the candidate queue queue = get_memories(status="candidate", limit=10) assert len(queue) == 0 + + +def test_extract_batch_api_503_when_cli_missing(tmp_data_dir, monkeypatch): + """R11: POST /admin/extract-batch with mode=llm must fail loud when + the `claude` CLI is unavailable, instead of silently returning a + success-with-0-candidates payload (which masked host-vs-container + truth for operators).""" + from fastapi.testclient import TestClient + from atocore.main import app + import atocore.api.routes as routes + + init_db() + monkeypatch.setattr(routes, "_llm_cli_available", lambda: False) + + client = TestClient(app) + response = client.post("/admin/extract-batch", json={"mode": "llm"}) + + assert response.status_code == 503 + assert "claude" in response.json()["detail"].lower() + + +def test_extract_batch_api_rule_mode_ok_without_cli(tmp_data_dir, monkeypatch): + """Rule mode must still work when the LLM CLI is missing — R11 only + affects mode=llm.""" + from fastapi.testclient import TestClient + from atocore.main import app + import atocore.api.routes as routes + + init_db() + monkeypatch.setattr(routes, "_llm_cli_available", lambda: False) + + client = TestClient(app) + response = client.post("/admin/extract-batch", json={"mode": "rule"}) + + assert response.status_code == 200