Batch 3, Days 1-3.

The core R9 failure was Case F: when the model returned a registered project DIFFERENT from the interaction's known scope, the old code trusted the model because the project was registered. A p06-polisher interaction could silently produce a p04-gigabit candidate.

New rule (trust hierarchy):
1. Interaction scope always wins when set (cases A, C, E, F).
2. Model project is used only for unscoped interactions AND only when it resolves to a registered project (cases D, G).
3. Empty string when both are empty or unregistered (case B).

The rule is: interaction.project is the strongest signal because it comes from the capture hook's project detection, which runs before the LLM ever sees the content. The model's project guess is only useful when the capture hook had no project context.

7 case tests (A-G) cover every combination of model/interaction project state. Pre-existing tests updated for the new behavior.

Host-side script mirrors the same hierarchy using _known_projects fetched from GET /projects at startup.

Test count: 286 -> 290 (+4 net, 7 new R9 cases, 3 old tests consolidated).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
244 lines
9.2 KiB
Python
244 lines
9.2 KiB
Python
"""Tests for the LLM-assisted extractor path.

Focused on the parser and failure-mode contracts — the actual network
call is exercised out of band by running
``python scripts/extractor_eval.py --mode llm`` against the frozen
labeled corpus with ``ANTHROPIC_API_KEY`` set. These tests only
exercise the pieces that don't need network.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
from atocore.interactions.service import Interaction
|
|
from atocore.memory.extractor_llm import (
|
|
LLM_EXTRACTOR_VERSION,
|
|
_parse_candidates,
|
|
extract_candidates_llm,
|
|
extract_candidates_llm_verbose,
|
|
)
|
|
import atocore.memory.extractor_llm as extractor_llm
|
|
|
|
|
|
def _make_interaction(prompt: str = "p", response: str = "r") -> Interaction:
    """Build a minimal ``Interaction`` for the tests in this module.

    Only ``prompt`` and ``response`` are configurable. The id is fixed to
    ``"test-id"`` and the client to ``"test"``; the project, session id and
    response summary are empty strings.
    """
    fields = dict(
        id="test-id",
        prompt=prompt,
        response=response,
        response_summary="",
        project="",
        client="test",
        session_id="",
    )
    return Interaction(**fields)
|
|
|
|
|
|
def test_parser_handles_empty_array():
    """An empty JSON array parses to an empty candidate list."""
    assert _parse_candidates("[]", _make_interaction()) == []
|
|
|
|
|
|
def test_parser_handles_malformed_json():
    """Text that is not valid JSON parses to an empty candidate list."""
    interaction = _make_interaction()
    candidates = _parse_candidates("{ not valid json", interaction)
    assert candidates == []
|
|
|
|
|
|
def test_parser_strips_markdown_fences():
    """A JSON array wrapped in a fenced code block still yields its candidate."""
    fenced = "```json\n[{\"type\": \"knowledge\", \"content\": \"x is y\", \"project\": \"\", \"confidence\": 0.5}]\n```"
    candidates = _parse_candidates(fenced, _make_interaction())
    assert len(candidates) == 1
    only = candidates[0]
    assert only.memory_type == "knowledge"
    assert only.content == "x is y"
|
|
|
|
|
|
def test_parser_strips_surrounding_prose():
    """Prose before and after the JSON array does not hide the candidate."""
    wrapped = "Here are the candidates:\n[{\"type\": \"project\", \"content\": \"foo\", \"project\": \"p04\", \"confidence\": 0.6}]\nThat's it."
    candidates = _parse_candidates(wrapped, _make_interaction())
    assert len(candidates) == 1
    assert candidates[0].memory_type == "project"
    # The model returned "p04" and the interaction carries no scope, so the
    # unscoped path applies: the name resolves via the registry if one is
    # available, otherwise it stays as-is. The resulting project is
    # deliberately not asserted here.
|
|
|
|
|
|
def test_parser_drops_invalid_memory_types():
    """An item with an unrecognised ``type`` is dropped; the valid one is kept."""
    payload = '[{"type": "nonsense", "content": "x"}, {"type": "project", "content": "y"}]'
    candidates = _parse_candidates(payload, _make_interaction())
    assert len(candidates) == 1
    survivor = candidates[0]
    assert survivor.memory_type == "project"
|
|
|
|
|
|
def test_parser_drops_empty_content():
    """An item whose content is only whitespace is dropped."""
    payload = '[{"type": "knowledge", "content": "   "}, {"type": "knowledge", "content": "real"}]'
    candidates = _parse_candidates(payload, _make_interaction())
    assert len(candidates) == 1
    survivor = candidates[0]
    assert survivor.content == "real"
|
|
|
|
|
|
def test_parser_clamps_confidence_to_unit_interval():
    """Confidence above 1 becomes 1.0 and confidence below 0 becomes 0.0."""
    payload = '[{"type": "knowledge", "content": "c1", "confidence": 2.5}, {"type": "knowledge", "content": "c2", "confidence": -0.4}]'
    candidates = _parse_candidates(payload, _make_interaction())
    too_high = candidates[0]
    assert too_high.confidence == 1.0
    too_low = candidates[1]
    assert too_low.confidence == 0.0
|
|
|
|
|
|
def test_parser_defaults_confidence_on_missing_field():
    """An item without a ``confidence`` field gets a confidence of 0.5."""
    payload = '[{"type": "knowledge", "content": "c1"}]'
    first = _parse_candidates(payload, _make_interaction())[0]
    assert first.confidence == 0.5
|
|
|
|
|
|
def test_parser_tags_version_and_rule():
    """Candidates carry the rule name, extractor version and source interaction id."""
    payload = '[{"type": "project", "content": "c1"}]'
    candidates = _parse_candidates(payload, _make_interaction())
    tagged = candidates[0]
    assert tagged.rule == "llm_extraction"
    assert tagged.extractor_version == LLM_EXTRACTOR_VERSION
    # "test-id" is the id that _make_interaction() assigns.
    assert tagged.source_interaction_id == "test-id"
|
|
|
|
|
|
def test_case_a_empty_model_scoped_interaction():
    """Case A: the model gives no project and the interaction is scoped.

    The candidate takes the interaction's project.
    """
    scoped = _make_interaction()
    scoped.project = "p06-polisher"
    payload = '[{"type": "project", "content": "machine works offline"}]'
    candidates = _parse_candidates(payload, scoped)
    assert candidates[0].project == "p06-polisher"
|
|
|
|
|
|
def test_case_b_empty_model_unscoped_interaction():
    """Case B: neither the model nor the interaction names a project.

    The candidate's project stays an empty string.
    """
    unscoped = _make_interaction()
    unscoped.project = ""
    payload = '[{"type": "project", "content": "generic fact"}]'
    candidates = _parse_candidates(payload, unscoped)
    assert candidates[0].project == ""
|
|
|
|
|
|
def test_case_c_unregistered_model_scoped_interaction(tmp_data_dir, project_registry):
    """Case C: the model names an unregistered project, the interaction is scoped.

    The candidate takes the interaction's project.
    """
    from atocore.models.database import init_db

    init_db()
    project_registry(("p06-polisher", ["p06"]))

    scoped = _make_interaction()
    scoped.project = "p06-polisher"
    payload = '[{"type": "project", "content": "x", "project": "fake-project-99"}]'
    candidates = _parse_candidates(payload, scoped)
    assert candidates[0].project == "p06-polisher"
|
|
|
|
|
|
def test_case_d_unregistered_model_unscoped_interaction(tmp_data_dir, project_registry):
    """Case D: the model names an unregistered project, the interaction is unscoped.

    The candidate's project is an empty string, not the model's invented name.
    """
    from atocore.models.database import init_db

    init_db()
    project_registry(("p06-polisher", ["p06"]))

    unscoped = _make_interaction()
    unscoped.project = ""
    payload = '[{"type": "project", "content": "x", "project": "fake-project-99"}]'
    candidates = _parse_candidates(payload, unscoped)
    assert candidates[0].project == ""
|
|
|
|
|
|
def test_case_e_matching_model_and_interaction(tmp_data_dir, project_registry):
    """Case E: the model and the interaction name the same project.

    The candidate carries that shared project.
    """
    from atocore.models.database import init_db

    init_db()
    project_registry(("p06-polisher", ["p06"]))

    scoped = _make_interaction()
    scoped.project = "p06-polisher"
    payload = '[{"type": "project", "content": "x", "project": "p06-polisher"}]'
    candidates = _parse_candidates(payload, scoped)
    assert candidates[0].project == "p06-polisher"
|
|
|
|
|
|
def test_case_f_wrong_registered_model_scoped_interaction(tmp_data_dir, project_registry):
    """Case F, the core R9 failure.

    The model names a registered project that DIFFERS from the interaction's
    scope. The candidate takes the interaction's project. This is the case
    that was broken before the R9 fix.
    """
    from atocore.models.database import init_db

    init_db()
    # Both projects are registered, so the model's answer is a valid name.
    project_registry(("p04-gigabit", ["p04"]), ("p06-polisher", ["p06"]))

    scoped = _make_interaction()
    scoped.project = "p06-polisher"
    payload = '[{"type": "project", "content": "x", "project": "p04-gigabit"}]'
    candidates = _parse_candidates(payload, scoped)
    assert candidates[0].project == "p06-polisher"
|
|
|
|
|
|
def test_case_g_registered_model_unscoped_interaction(tmp_data_dir, project_registry):
    """Case G: the model names a registered project, the interaction is unscoped.

    The model's project is accepted; for an unscoped capture it is the only
    available source of a project.
    """
    from atocore.models.database import init_db

    init_db()
    project_registry(("p04-gigabit", ["p04"]))

    unscoped = _make_interaction()
    unscoped.project = ""
    payload = '[{"type": "project", "content": "x", "project": "p04-gigabit"}]'
    candidates = _parse_candidates(payload, unscoped)
    assert candidates[0].project == "p04-gigabit"
|
|
|
|
|
|
def test_missing_cli_returns_empty(monkeypatch):
    """With ``_cli_available`` reporting False, the extractor returns no
    candidates and the error ``"claude_cli_missing"`` rather than raising."""
    monkeypatch.setattr(extractor_llm, "_cli_available", lambda: False)
    interaction = _make_interaction("p", "some real response")
    outcome = extract_candidates_llm_verbose(interaction)
    assert outcome.candidates == []
    assert outcome.error == "claude_cli_missing"
|
|
|
|
|
|
def test_empty_response_returns_empty(monkeypatch):
    """An interaction with an empty response yields no candidates and the
    error ``"empty_response"``."""
    monkeypatch.setattr(extractor_llm, "_cli_available", lambda: True)
    interaction = _make_interaction("p", "")
    outcome = extract_candidates_llm_verbose(interaction)
    assert outcome.candidates == []
    assert outcome.error == "empty_response"
|
|
|
|
|
|
def test_subprocess_timeout_returns_empty(monkeypatch):
    """A ``TimeoutExpired`` from ``subprocess.run`` must not reach the caller;
    it is reported as the error ``"timeout"``."""
    import subprocess as _sp

    def _raise_timeout(*args, **kwargs):
        # Stand-in for subprocess.run that always times out.
        command = args[0] if args else "claude"
        raise _sp.TimeoutExpired(cmd=command, timeout=1)

    monkeypatch.setattr(extractor_llm, "_cli_available", lambda: True)
    monkeypatch.setattr(extractor_llm.subprocess, "run", _raise_timeout)

    interaction = _make_interaction("p", "real response")
    outcome = extract_candidates_llm_verbose(interaction)
    assert outcome.candidates == []
    assert outcome.error == "timeout"
|
|
|
|
|
|
def test_subprocess_nonzero_exit_returns_empty(monkeypatch):
    """A non-zero CLI exit (auth failure, etc.) must not raise; it is reported
    as the error ``"exit_1"``."""

    class _FailedRun:
        # Stand-in for the object subprocess.run returns.
        returncode = 1
        stdout = ""
        stderr = "auth failed"

    def _fake_run(*args, **kwargs):
        return _FailedRun()

    monkeypatch.setattr(extractor_llm, "_cli_available", lambda: True)
    monkeypatch.setattr(extractor_llm.subprocess, "run", _fake_run)

    interaction = _make_interaction("p", "real response")
    outcome = extract_candidates_llm_verbose(interaction)
    assert outcome.candidates == []
    assert outcome.error == "exit_1"
|
|
|
|
|
|
def test_happy_path_parses_stdout(monkeypatch):
    """A zero exit code with a JSON array on stdout yields the parsed candidate."""

    class _SuccessfulRun:
        # Stand-in for the object subprocess.run returns.
        returncode = 0
        stdout = '[{"type": "project", "content": "p04 selected Option B", "project": "p04-gigabit", "confidence": 0.6}]'
        stderr = ""

    def _fake_run(*args, **kwargs):
        return _SuccessfulRun()

    monkeypatch.setattr(extractor_llm, "_cli_available", lambda: True)
    monkeypatch.setattr(extractor_llm.subprocess, "run", _fake_run)

    outcome = extract_candidates_llm_verbose(_make_interaction("p", "r"))
    assert len(outcome.candidates) == 1
    candidate = outcome.candidates[0]
    assert candidate.memory_type == "project"
    assert candidate.project == "p04-gigabit"
    # Confidence is a float, so compare with a tolerance.
    assert abs(candidate.confidence - 0.6) < 1e-9
|