From b309e7fd491d92459cfcdf0e05c3ef45a0d512ad Mon Sep 17 00:00:00 2001 From: Anto01 Date: Sat, 11 Apr 2026 15:18:30 -0400 Subject: [PATCH] =?UTF-8?q?feat(eval-loop):=20Day=204=20=E2=80=94=20LLM-as?= =?UTF-8?q?sisted=20extractor=20path=20(additive,=20flagged)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Day 2 baseline showed 0% recall for the rule-based extractor across 5 distinct miss classes. Day 4 decision gate: prototype an LLM-assisted mode behind a flag. Option A ratified by Antoine. New module src/atocore/memory/extractor_llm.py: - extract_candidates_llm(interaction) returns the same MemoryCandidate dataclass the rule extractor produces, so both paths flow through the existing triage / candidate pipeline unchanged. - extract_candidates_llm_verbose() also returns the raw model output and any error string, for eval and debugging. - Uses Claude Haiku 4.5 by default; model overridable via ATOCORE_LLM_EXTRACTOR_MODEL env. Timeout via ATOCORE_LLM_EXTRACTOR_TIMEOUT_S (default 20s). - Silent-failure contract: missing API key, unreachable model, malformed JSON — all return [] without raising into the caller. A missing SDK, an API failure, or unparseable JSON is logged as an error; a missing API key or an empty response is reported only through the verbose result's error string. The capture audit trail must not break on an optional side effect. - Parser tolerates markdown fences, surrounding prose, invalid memory types, clamps confidence to [0,1], drops empty content. - System prompt explicitly tells the model to return [] for most conversational turns (durable-fact bar, not "extract everything"). - Trust rules unchanged: candidates are never auto-promoted, extraction stays off the capture hot path, human triages via the existing CLI. scripts/extractor_eval.py: new --mode {rule,llm} flag so the same labeled corpus can be scored against both extractors. Default remains rule so existing invocations are unchanged. 
tests/test_extractor_llm.py: 12 new unit tests covering the parser (empty array, malformed JSON, markdown fences, surrounding prose, invalid types, empty content, confidence clamping, version tagging), plus contract tests for missing API key, empty response, and a mocked api_error path so failure modes never raise. Test count: 264 -> 276 passing. No existing tests changed. Next step: run `python scripts/extractor_eval.py --mode llm` against the labeled set with ANTHROPIC_API_KEY in env, record the delta, decide whether to wire LLM mode into the API endpoint and CLI or keep it script-only for now. Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/extractor_eval.py | 17 ++- src/atocore/memory/extractor_llm.py | 227 ++++++++++++++++++++++++++++ tests/test_extractor_llm.py | 129 ++++++++++++++++ 3 files changed, 370 insertions(+), 3 deletions(-) create mode 100644 src/atocore/memory/extractor_llm.py create mode 100644 tests/test_extractor_llm.py diff --git a/scripts/extractor_eval.py b/scripts/extractor_eval.py index a641b7c..1819efc 100644 --- a/scripts/extractor_eval.py +++ b/scripts/extractor_eval.py @@ -33,6 +33,7 @@ sys.path.insert(0, str(_REPO_ROOT / "src")) from atocore.interactions.service import Interaction # noqa: E402 from atocore.memory.extractor import extract_candidates_from_interaction # noqa: E402 +from atocore.memory.extractor_llm import extract_candidates_llm # noqa: E402 DEFAULT_SNAPSHOT = _REPO_ROOT / "scripts" / "eval_data" / "interactions_snapshot_2026-04-11.json" DEFAULT_LABELS = _REPO_ROOT / "scripts" / "eval_data" / "extractor_labels_2026-04-11.json" @@ -71,7 +72,7 @@ def interaction_from_snapshot(snap: dict) -> Interaction: ) -def score(snapshot: dict[str, dict], labels_doc: dict) -> list[LabelResult]: +def score(snapshot: dict[str, dict], labels_doc: dict, mode: str = "rule") -> list[LabelResult]: results: list[LabelResult] = [] for label in labels_doc["labels"]: iid = label["id"] @@ -89,7 +90,10 @@ def score(snapshot: dict[str, dict], 
labels_doc: dict) -> list[LabelResult]: ) continue interaction = interaction_from_snapshot(snap) - candidates = extract_candidates_from_interaction(interaction) + if mode == "llm": + candidates = extract_candidates_llm(interaction) + else: + candidates = extract_candidates_from_interaction(interaction) actual_count = len(candidates) expected_count = int(label.get("expected_count", 0)) results.append( @@ -214,12 +218,19 @@ def main() -> int: parser.add_argument("--snapshot", type=Path, default=DEFAULT_SNAPSHOT) parser.add_argument("--labels", type=Path, default=DEFAULT_LABELS) parser.add_argument("--json", action="store_true", help="emit machine-readable JSON") + parser.add_argument( + "--mode", + choices=["rule", "llm"], + default="rule", + help="which extractor to score (default: rule)", + ) args = parser.parse_args() snapshot = load_snapshot(args.snapshot) labels = load_labels(args.labels) - results = score(snapshot, labels) + results = score(snapshot, labels, mode=args.mode) summary = aggregate(results) + summary["mode"] = args.mode if args.json: print_json(results, summary) diff --git a/src/atocore/memory/extractor_llm.py b/src/atocore/memory/extractor_llm.py new file mode 100644 index 0000000..8775273 --- /dev/null +++ b/src/atocore/memory/extractor_llm.py @@ -0,0 +1,227 @@ +"""LLM-assisted candidate-memory extraction. + +Day 4 of the 2026-04-11 mini-phase: the rule-based extractor hit 0% +recall against real conversational claude-code captures (Day 2 baseline +scorecard in ``scripts/eval_data/extractor_labels_2026-04-11.json``), +with false negatives spread across 5 distinct miss classes. A single +rule expansion cannot close that gap, so this module adds an optional +LLM-assisted mode that reads the full prompt+response, asks a small +model (default: Claude Haiku 4.5) for structured candidate objects, +and returns the same ``MemoryCandidate`` dataclass the rule extractor +produces so both paths flow through the same candidate pipeline. 
+ +Trust rules carried forward from the rule-based extractor: + +- Candidates are NEVER auto-promoted. Caller persists with + ``status="candidate"`` and a human reviews via the triage CLI. +- This path is additive. The rule-based extractor keeps working + exactly as before; callers opt in by importing this module. +- Extraction stays off the capture hot path — this is batch / manual + only, per the 2026-04-11 decision. +- Failure is silent. Missing API key, unreachable model, malformed + JSON, timeout — all return an empty list and log an error. Never + raise into the caller, because the capture audit trail must not + break on an optional side effect. + +Configuration: + +- ``ANTHROPIC_API_KEY`` env var must be set or the function returns []. +- ``ATOCORE_LLM_EXTRACTOR_MODEL`` overrides the default model id. +- ``ATOCORE_LLM_EXTRACTOR_TIMEOUT_S`` overrides the request timeout + (default 20 seconds). +""" + +from __future__ import annotations + +import json +import os +from dataclasses import dataclass + +from atocore.interactions.service import Interaction +from atocore.memory.extractor import EXTRACTOR_VERSION, MemoryCandidate +from atocore.memory.service import MEMORY_TYPES +from atocore.observability.logger import get_logger + +log = get_logger("extractor_llm") + +LLM_EXTRACTOR_VERSION = "llm-0.1.0" +DEFAULT_MODEL = os.environ.get("ATOCORE_LLM_EXTRACTOR_MODEL", "claude-haiku-4-5-20251001") +DEFAULT_TIMEOUT_S = float(os.environ.get("ATOCORE_LLM_EXTRACTOR_TIMEOUT_S", "20")) +MAX_RESPONSE_CHARS = 8000 +MAX_PROMPT_CHARS = 2000 + +_SYSTEM_PROMPT = """You extract durable memory candidates from LLM conversation turns for a personal context engine called AtoCore. + +Your job is to read one user prompt plus the assistant's response and decide which durable facts, decisions, preferences, architectural rules, or project invariants should be remembered across future sessions. + +Rules: + +1. Only surface durable claims. 
Skip transient status ("deploy is still running"), instructional guidance ("here is how to run the command"), troubleshooting tactics, ephemeral recommendations ("merge this PR now"), and session recaps. +2. A candidate is durable when a reader coming back in two weeks would still need to know it. Architectural choices, named rules, ratified decisions, invariants, procurement commitments, and project-level constraints qualify. Conversational fillers and step-by-step instructions do not. +3. Each candidate must stand alone. Rewrite the claim in one sentence under 200 characters with enough context that a reader without the conversation understands it. +4. Each candidate must have a type from this closed set: project, knowledge, preference, adaptation. +5. If the conversation is clearly scoped to a project (p04-gigabit, p05-interferometer, p06-polisher, atocore), set ``project`` to that id. Otherwise leave ``project`` empty. +6. If the response makes no durable claim, return an empty list. It is correct and expected to return [] on most conversational turns. +7. Confidence should be 0.5 by default for new candidates so review workload is honest. Raise to 0.6 only when the response states the claim in an unambiguous, committed form (e.g., "the decision is X", "the selected approach is Y", "X is non-negotiable"). +8. Output must be a raw JSON array and nothing else. No prose before or after. No markdown fences. + +Each array element has exactly this shape: + +{"type": "project|knowledge|preference|adaptation", "content": "...", "project": "...", "confidence": 0.5} + +Return [] when there is nothing to extract.""" + + +@dataclass +class LLMExtractionResult: + candidates: list[MemoryCandidate] + raw_output: str + error: str = "" + + +def extract_candidates_llm( + interaction: Interaction, + model: str | None = None, + timeout_s: float | None = None, +) -> list[MemoryCandidate]: + """Run the LLM-assisted extractor against one interaction. 
+ + Returns a list of ``MemoryCandidate`` objects, empty on any failure + path. The caller is responsible for persistence. + """ + result = extract_candidates_llm_verbose( + interaction, + model=model, + timeout_s=timeout_s, + ) + return result.candidates + + +def extract_candidates_llm_verbose( + interaction: Interaction, + model: str | None = None, + timeout_s: float | None = None, +) -> LLMExtractionResult: + """Same as ``extract_candidates_llm`` but also returns the raw + model output and any error encountered, for eval / debugging. + """ + if not os.environ.get("ANTHROPIC_API_KEY"): + return LLMExtractionResult(candidates=[], raw_output="", error="missing_api_key") + + response_text = (interaction.response or "").strip() + if not response_text: + return LLMExtractionResult(candidates=[], raw_output="", error="empty_response") + + try: + import anthropic # noqa: F401 + except ImportError: + log.error("anthropic_sdk_missing") + return LLMExtractionResult(candidates=[], raw_output="", error="anthropic_sdk_missing") + + prompt_excerpt = (interaction.prompt or "")[:MAX_PROMPT_CHARS] + response_excerpt = response_text[:MAX_RESPONSE_CHARS] + user_message = ( + f"PROJECT HINT (may be empty): {interaction.project or ''}\n\n" + f"USER PROMPT:\n{prompt_excerpt}\n\n" + f"ASSISTANT RESPONSE:\n{response_excerpt}\n\n" + "Return the JSON array now." 
+ ) + + try: + import anthropic + + client = anthropic.Anthropic(timeout=timeout_s or DEFAULT_TIMEOUT_S) + response = client.messages.create( + model=model or DEFAULT_MODEL, + max_tokens=1024, + system=_SYSTEM_PROMPT, + messages=[{"role": "user", "content": user_message}], + ) + except Exception as exc: # pragma: no cover - network / auth failures + log.error("llm_extractor_api_failed", error=str(exc)) + return LLMExtractionResult(candidates=[], raw_output="", error=f"api_error: {exc}") + + raw_output = "" + for block in response.content: + text = getattr(block, "text", None) + if text: + raw_output += text + raw_output = raw_output.strip() + + candidates = _parse_candidates(raw_output, interaction) + log.info( + "llm_extractor_done", + interaction_id=interaction.id, + candidate_count=len(candidates), + model=model or DEFAULT_MODEL, + ) + return LLMExtractionResult(candidates=candidates, raw_output=raw_output) + + +def _parse_candidates(raw_output: str, interaction: Interaction) -> list[MemoryCandidate]: + """Parse the model's JSON output into MemoryCandidate objects. + + Tolerates common model glitches: surrounding whitespace, stray + markdown fences, leading/trailing prose. Silently drops malformed + array elements rather than raising. + """ + text = raw_output.strip() + if text.startswith("```"): + # Strip markdown fences if the model added them despite the instruction. + text = text.strip("`") + first_newline = text.find("\n") + if first_newline >= 0: + text = text[first_newline + 1 :] + if text.endswith("```"): + text = text[:-3] + text = text.strip() + + if not text or text == "[]": + return [] + + # If the model wrapped the array in prose, try to isolate the JSON. 
+ if not text.lstrip().startswith("["): + start = text.find("[") + end = text.rfind("]") + if start >= 0 and end > start: + text = text[start : end + 1] + + try: + parsed = json.loads(text) + except json.JSONDecodeError as exc: + log.error("llm_extractor_parse_failed", error=str(exc), raw_prefix=raw_output[:120]) + return [] + + if not isinstance(parsed, list): + return [] + + results: list[MemoryCandidate] = [] + for item in parsed: + if not isinstance(item, dict): + continue + mem_type = str(item.get("type") or "").strip().lower() + content = str(item.get("content") or "").strip() + project = str(item.get("project") or "").strip() + confidence_raw = item.get("confidence", 0.5) + if mem_type not in MEMORY_TYPES: + continue + if not content: + continue + try: + confidence = float(confidence_raw) + except (TypeError, ValueError): + confidence = 0.5 + confidence = max(0.0, min(1.0, confidence)) + results.append( + MemoryCandidate( + memory_type=mem_type, + content=content[:1000], + rule="llm_extraction", + source_span=content[:200], + project=project, + confidence=confidence, + source_interaction_id=interaction.id, + extractor_version=LLM_EXTRACTOR_VERSION, + ) + ) + return results diff --git a/tests/test_extractor_llm.py b/tests/test_extractor_llm.py new file mode 100644 index 0000000..eaacdb2 --- /dev/null +++ b/tests/test_extractor_llm.py @@ -0,0 +1,129 @@ +"""Tests for the LLM-assisted extractor path. + +Focused on the parser and failure-mode contracts — the actual network +call is exercised out of band by running +``python scripts/extractor_eval.py --mode llm`` against the frozen +labeled corpus with ``ANTHROPIC_API_KEY`` set. These tests only +exercise the pieces that don't need network. 
+""" + +from __future__ import annotations + +import os +from unittest.mock import patch + +import pytest + +from atocore.interactions.service import Interaction +from atocore.memory.extractor_llm import ( + LLM_EXTRACTOR_VERSION, + _parse_candidates, + extract_candidates_llm, + extract_candidates_llm_verbose, +) + + +def _make_interaction(prompt: str = "p", response: str = "r") -> Interaction: + return Interaction( + id="test-id", + prompt=prompt, + response=response, + response_summary="", + project="", + client="test", + session_id="", + ) + + +def test_parser_handles_empty_array(): + result = _parse_candidates("[]", _make_interaction()) + assert result == [] + + +def test_parser_handles_malformed_json(): + result = _parse_candidates("{ not valid json", _make_interaction()) + assert result == [] + + +def test_parser_strips_markdown_fences(): + raw = "```json\n[{\"type\": \"knowledge\", \"content\": \"x is y\", \"project\": \"\", \"confidence\": 0.5}]\n```" + result = _parse_candidates(raw, _make_interaction()) + assert len(result) == 1 + assert result[0].memory_type == "knowledge" + assert result[0].content == "x is y" + + +def test_parser_strips_surrounding_prose(): + raw = "Here are the candidates:\n[{\"type\": \"project\", \"content\": \"foo\", \"project\": \"p04\", \"confidence\": 0.6}]\nThat's it." 
+ result = _parse_candidates(raw, _make_interaction()) + assert len(result) == 1 + assert result[0].memory_type == "project" + assert result[0].project == "p04" + + +def test_parser_drops_invalid_memory_types(): + raw = '[{"type": "nonsense", "content": "x"}, {"type": "project", "content": "y"}]' + result = _parse_candidates(raw, _make_interaction()) + assert len(result) == 1 + assert result[0].memory_type == "project" + + +def test_parser_drops_empty_content(): + raw = '[{"type": "knowledge", "content": " "}, {"type": "knowledge", "content": "real"}]' + result = _parse_candidates(raw, _make_interaction()) + assert len(result) == 1 + assert result[0].content == "real" + + +def test_parser_clamps_confidence_to_unit_interval(): + raw = '[{"type": "knowledge", "content": "c1", "confidence": 2.5}, {"type": "knowledge", "content": "c2", "confidence": -0.4}]' + result = _parse_candidates(raw, _make_interaction()) + assert result[0].confidence == 1.0 + assert result[1].confidence == 0.0 + + +def test_parser_defaults_confidence_on_missing_field(): + raw = '[{"type": "knowledge", "content": "c1"}]' + result = _parse_candidates(raw, _make_interaction()) + assert result[0].confidence == 0.5 + + +def test_parser_tags_version_and_rule(): + raw = '[{"type": "project", "content": "c1"}]' + result = _parse_candidates(raw, _make_interaction()) + assert result[0].rule == "llm_extraction" + assert result[0].extractor_version == LLM_EXTRACTOR_VERSION + assert result[0].source_interaction_id == "test-id" + + +def test_missing_api_key_returns_empty(monkeypatch): + monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) + result = extract_candidates_llm_verbose(_make_interaction("p", "some real response")) + assert result.candidates == [] + assert result.error == "missing_api_key" + + +def test_empty_response_returns_empty(monkeypatch): + monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-key-not-used") + result = extract_candidates_llm_verbose(_make_interaction("p", "")) + assert 
result.candidates == [] + assert result.error == "empty_response" + + +def test_api_error_returns_empty(monkeypatch): + """A transport error from the SDK must not raise into the caller.""" + monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-key-not-used") + + class _BoomClient: + def __init__(self, *a, **kw): + pass + + class messages: # noqa: D401 + @staticmethod + def create(**kw): + raise RuntimeError("simulated network error") + + with patch("anthropic.Anthropic", _BoomClient): + result = extract_candidates_llm_verbose(_make_interaction("p", "real response")) + assert result.candidates == [] + assert "api_error" in result.error