130 lines
4.3 KiB
Python
130 lines
4.3 KiB
Python
|
|
"""Tests for the LLM-assisted extractor path.
|
||
|
|
|
||
|
|
Focused on the parser and failure-mode contracts — the actual network
call is exercised out of band by running
``python scripts/extractor_eval.py --mode llm`` against the frozen
labeled corpus with ``ANTHROPIC_API_KEY`` set. These tests only
exercise the pieces that don't need network access.
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import os
|
||
|
|
from unittest.mock import patch
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
from atocore.interactions.service import Interaction
|
||
|
|
from atocore.memory.extractor_llm import (
|
||
|
|
LLM_EXTRACTOR_VERSION,
|
||
|
|
_parse_candidates,
|
||
|
|
extract_candidates_llm,
|
||
|
|
extract_candidates_llm_verbose,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
def _make_interaction(prompt: str = "p", response: str = "r") -> Interaction:
    """Build a minimal ``Interaction`` fixture for the tests in this module.

    Only ``prompt`` and ``response`` vary between callers. The id is fixed
    to ``"test-id"``, the client to ``"test"``, and the summary, project and
    session id are empty strings.
    """
    fields = {
        "id": "test-id",
        "prompt": prompt,
        "response": response,
        "response_summary": "",
        "project": "",
        "client": "test",
        "session_id": "",
    }
    return Interaction(**fields)
|
||
|
|
|
||
|
|
|
||
|
|
def test_parser_handles_empty_array():
    """An empty JSON array parses to an empty candidate list."""
    interaction = _make_interaction()
    assert _parse_candidates("[]", interaction) == []
|
||
|
|
|
||
|
|
|
||
|
|
def test_parser_handles_malformed_json():
    """Unparseable JSON yields an empty list rather than an exception."""
    broken_payload = "{ not valid json"
    parsed = _parse_candidates(broken_payload, _make_interaction())
    assert parsed == []
|
||
|
|
|
||
|
|
|
||
|
|
def test_parser_strips_markdown_fences():
    """A JSON array wrapped in a markdown ``json`` code fence still parses."""
    fenced = "```json\n[{\"type\": \"knowledge\", \"content\": \"x is y\", \"project\": \"\", \"confidence\": 0.5}]\n```"
    candidates = _parse_candidates(fenced, _make_interaction())
    assert len(candidates) == 1
    only = candidates[0]
    assert only.memory_type == "knowledge"
    assert only.content == "x is y"
|
||
|
|
|
||
|
|
|
||
|
|
def test_parser_strips_surrounding_prose():
    """Chatter before and after the JSON array does not prevent parsing."""
    chatty = "Here are the candidates:\n[{\"type\": \"project\", \"content\": \"foo\", \"project\": \"p04\", \"confidence\": 0.6}]\nThat's it."
    candidates = _parse_candidates(chatty, _make_interaction())
    assert len(candidates) == 1
    only = candidates[0]
    assert only.memory_type == "project"
    assert only.project == "p04"
|
||
|
|
|
||
|
|
|
||
|
|
def test_parser_drops_invalid_memory_types():
    """Entries with an unrecognised ``type`` are discarded; valid ones survive."""
    payload = '[{"type": "nonsense", "content": "x"}, {"type": "project", "content": "y"}]'
    kept = _parse_candidates(payload, _make_interaction())
    # One comparison covers both the count and the surviving entry's type.
    assert [candidate.memory_type for candidate in kept] == ["project"]
|
||
|
|
|
||
|
|
|
||
|
|
def test_parser_drops_empty_content():
    """Entries whose ``content`` is only whitespace are discarded."""
    payload = '[{"type": "knowledge", "content": "   "}, {"type": "knowledge", "content": "real"}]'
    kept = _parse_candidates(payload, _make_interaction())
    # One comparison covers both the count and the surviving entry's content.
    assert [candidate.content for candidate in kept] == ["real"]
|
||
|
|
|
||
|
|
|
||
|
|
def test_parser_clamps_confidence_to_unit_interval():
    """Out-of-range confidences are clamped: 2.5 becomes 1.0, -0.4 becomes 0.0."""
    raw = '[{"type": "knowledge", "content": "c1", "confidence": 2.5}, {"type": "knowledge", "content": "c2", "confidence": -0.4}]'
    result = _parse_candidates(raw, _make_interaction())
    # Check the count before indexing, as the other parser tests do, so a
    # dropped candidate fails as a readable assertion, not an IndexError.
    assert len(result) == 2
    assert result[0].confidence == 1.0
    assert result[1].confidence == 0.0
|
||
|
|
|
||
|
|
|
||
|
|
def test_parser_defaults_confidence_on_missing_field():
    """An entry without a ``confidence`` key gets a confidence of 0.5."""
    raw = '[{"type": "knowledge", "content": "c1"}]'
    result = _parse_candidates(raw, _make_interaction())
    # Check the count before indexing, as the other parser tests do, so a
    # dropped candidate fails as a readable assertion, not an IndexError.
    assert len(result) == 1
    assert result[0].confidence == 0.5
|
||
|
|
|
||
|
|
|
||
|
|
def test_parser_tags_version_and_rule():
    """Parsed candidates carry the rule name, extractor version and source id."""
    raw = '[{"type": "project", "content": "c1"}]'
    result = _parse_candidates(raw, _make_interaction())
    # Check the count before indexing, as the other parser tests do, so a
    # dropped candidate fails as a readable assertion, not an IndexError.
    assert len(result) == 1
    candidate = result[0]
    assert candidate.rule == "llm_extraction"
    assert candidate.extractor_version == LLM_EXTRACTOR_VERSION
    # "test-id" is the fixed id that _make_interaction() assigns.
    assert candidate.source_interaction_id == "test-id"
|
||
|
|
|
||
|
|
|
||
|
|
def test_missing_api_key_returns_empty(monkeypatch):
    """With ``ANTHROPIC_API_KEY`` unset, the result is empty with ``missing_api_key``."""
    # raising=False: the variable may already be absent from the environment.
    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
    interaction = _make_interaction("p", "some real response")
    outcome = extract_candidates_llm_verbose(interaction)
    assert outcome.candidates == []
    assert outcome.error == "missing_api_key"
|
||
|
|
|
||
|
|
|
||
|
|
def test_empty_response_returns_empty(monkeypatch):
    """An interaction with an empty response is reported as ``empty_response``."""
    monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-key-not-used")
    interaction = _make_interaction("p", "")
    outcome = extract_candidates_llm_verbose(interaction)
    assert outcome.candidates == []
    assert outcome.error == "empty_response"
|
||
|
|
|
||
|
|
|
||
|
|
def test_api_error_returns_empty(monkeypatch):
    """A transport error from the SDK must not raise into the caller."""
    monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-key-not-used")

    def _explode(**_kwargs):
        # Stand-in for the SDK call: always fails like a broken transport.
        raise RuntimeError("simulated network error")

    class _BoomClient:
        """Fake client whose ``messages.create`` always raises."""

        def __init__(self, *_args, **_kwargs):
            pass

        class messages:  # noqa: D401
            create = staticmethod(_explode)

    interaction = _make_interaction("p", "real response")
    with patch("anthropic.Anthropic", _BoomClient):
        outcome = extract_candidates_llm_verbose(interaction)

    assert outcome.candidates == []
    assert "api_error" in outcome.error
|