Three changes: 1. ABB-Space registered as a lead project with stage=lead in Trusted Project State. Projects now have lifecycle awareness (lead/proposition vs active contract vs completed). 2. Extraction no longer drops unregistered project tags. When the LLM extractor sees a conversation about a project not in the registry, it keeps the model's tag on the candidate instead of falling back to empty. This enables auto-detection of new projects/leads from organic conversations. The nightly pipeline surfaces these candidates for triage, where the operator sees "hey, there's a new project called X" and can decide whether to register it. 3. Extraction prompt updated to tell the model: "If the conversation discusses a project NOT in the known list, still tag it — the system will auto-detect it." This removes the artificial ceiling that prevented new project discovery. Updated Case D test: unregistered + unscoped now keeps the model's tag instead of dropping to empty. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
244 lines
9.3 KiB
Python
244 lines
9.3 KiB
Python
"""Tests for the LLM-assisted extractor path.

Focused on the parser and failure-mode contracts — the actual network
call is exercised out of band by running
``python scripts/extractor_eval.py --mode llm`` against the frozen
labeled corpus with ``ANTHROPIC_API_KEY`` set. These tests only
exercise the pieces that don't need network.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
from atocore.interactions.service import Interaction
|
|
from atocore.memory.extractor_llm import (
|
|
LLM_EXTRACTOR_VERSION,
|
|
_parse_candidates,
|
|
extract_candidates_llm,
|
|
extract_candidates_llm_verbose,
|
|
)
|
|
import atocore.memory.extractor_llm as extractor_llm
|
|
|
|
|
|
def _make_interaction(prompt: str = "p", response: str = "r") -> Interaction:
    """Build a minimal ``Interaction`` to feed the extractor in tests.

    Only ``prompt`` and ``response`` are configurable. The id is fixed to
    ``"test-id"``, the client to ``"test"``, and the response summary,
    project and session id are empty strings.
    """
    fields = {
        "id": "test-id",
        "prompt": prompt,
        "response": response,
        "response_summary": "",
        "project": "",
        "client": "test",
        "session_id": "",
    }
    return Interaction(**fields)
|
|
|
|
|
|
def test_parser_handles_empty_array():
    """An empty JSON array parses to an empty candidate list."""
    interaction = _make_interaction()
    parsed = _parse_candidates("[]", interaction)
    assert parsed == []
|
|
|
|
|
|
def test_parser_handles_malformed_json():
    """Unparseable model output yields an empty candidate list."""
    broken_payload = "{ not valid json"
    parsed = _parse_candidates(broken_payload, _make_interaction())
    assert parsed == []
|
|
|
|
|
|
def test_parser_strips_markdown_fences():
    """A JSON array wrapped in a fenced ``json`` code block is still parsed."""
    fenced_payload = "```json\n[{\"type\": \"knowledge\", \"content\": \"x is y\", \"project\": \"\", \"confidence\": 0.5}]\n```"
    interaction = _make_interaction()

    candidates = _parse_candidates(fenced_payload, interaction)

    assert len(candidates) == 1
    only = candidates[0]
    assert only.memory_type == "knowledge"
    assert only.content == "x is y"
|
|
|
|
|
|
def test_parser_strips_surrounding_prose():
    """Text before and after the JSON array does not prevent parsing."""
    chatty_payload = "Here are the candidates:\n[{\"type\": \"project\", \"content\": \"foo\", \"project\": \"p04\", \"confidence\": 0.6}]\nThat's it."
    interaction = _make_interaction()

    candidates = _parse_candidates(chatty_payload, interaction)

    assert len(candidates) == 1
    assert candidates[0].memory_type == "project"
    # The model tagged "p04" while the interaction itself carries no project.
    # On that unscoped path the tag is resolved through the registry when one
    # is available and otherwise left as-is, so the resolved project is
    # deliberately not asserted here.
|
|
|
|
|
|
def test_parser_drops_invalid_memory_types():
    """Entries with an unrecognised ``type`` are discarded; valid ones remain."""
    mixed_payload = '[{"type": "nonsense", "content": "x"}, {"type": "project", "content": "y"}]'
    interaction = _make_interaction()

    survivors = _parse_candidates(mixed_payload, interaction)

    assert len(survivors) == 1
    assert survivors[0].memory_type == "project"
|
|
|
|
|
|
def test_parser_drops_empty_content():
    """Entries whose content is only whitespace are discarded."""
    mixed_payload = '[{"type": "knowledge", "content": "   "}, {"type": "knowledge", "content": "real"}]'
    interaction = _make_interaction()

    survivors = _parse_candidates(mixed_payload, interaction)

    assert len(survivors) == 1
    assert survivors[0].content == "real"
|
|
|
|
|
|
def test_parser_clamps_confidence_to_unit_interval():
    """Out-of-range confidences are clamped into ``[0.0, 1.0]``."""
    out_of_range = '[{"type": "knowledge", "content": "c1", "confidence": 2.5}, {"type": "knowledge", "content": "c2", "confidence": -0.4}]'
    interaction = _make_interaction()

    parsed = _parse_candidates(out_of_range, interaction)

    # 2.5 is pulled down to the upper bound, -0.4 up to the lower bound.
    assert parsed[0].confidence == 1.0
    assert parsed[1].confidence == 0.0
|
|
|
|
|
|
def test_parser_defaults_confidence_on_missing_field():
    """A candidate without a ``confidence`` key gets a confidence of 0.5."""
    no_confidence = '[{"type": "knowledge", "content": "c1"}]'
    parsed = _parse_candidates(no_confidence, _make_interaction())
    first = parsed[0]
    assert first.confidence == 0.5
|
|
|
|
|
|
def test_parser_tags_version_and_rule():
    """Parsed candidates carry the rule name, extractor version and source id."""
    payload = '[{"type": "project", "content": "c1"}]'
    interaction = _make_interaction()

    candidate = _parse_candidates(payload, interaction)[0]

    assert candidate.rule == "llm_extraction"
    assert candidate.extractor_version == LLM_EXTRACTOR_VERSION
    # "test-id" is the id that _make_interaction assigns.
    assert candidate.source_interaction_id == "test-id"
|
|
|
|
|
|
def test_case_a_empty_model_scoped_interaction():
    """Case A: the model gives no project but the interaction is scoped.

    The interaction's project is used for the candidate.
    """
    scoped = _make_interaction()
    scoped.project = "p06-polisher"
    payload = '[{"type": "project", "content": "machine works offline"}]'

    candidates = _parse_candidates(payload, scoped)

    assert candidates[0].project == "p06-polisher"
|
|
|
|
|
|
def test_case_b_empty_model_unscoped_interaction():
    """Case B: neither the model nor the interaction names a project.

    The candidate's project stays empty.
    """
    unscoped = _make_interaction()
    # Spelled out so the unscoped precondition is visible in the test itself.
    unscoped.project = ""
    payload = '[{"type": "project", "content": "generic fact"}]'

    candidates = _parse_candidates(payload, unscoped)

    assert candidates[0].project == ""
|
|
|
|
|
|
def test_case_c_unregistered_model_scoped_interaction(tmp_data_dir, project_registry):
    """Case C: the model names an unregistered project; the interaction is scoped.

    The interaction's project overrides the model's tag.
    """
    from atocore.models.database import init_db

    init_db()
    project_registry(("p06-polisher", ["p06"]))

    scoped = _make_interaction()
    scoped.project = "p06-polisher"
    payload = '[{"type": "project", "content": "x", "project": "fake-project-99"}]'

    candidates = _parse_candidates(payload, scoped)

    assert candidates[0].project == "p06-polisher"
|
|
|
|
|
|
def test_case_d_unregistered_model_unscoped_keeps_tag(tmp_data_dir, project_registry):
    """Case D: the model names an unregistered project; the interaction is unscoped.

    The model's tag is kept so new projects can be auto-detected (new behavior).
    """
    from atocore.models.database import init_db

    init_db()
    project_registry(("p06-polisher", ["p06"]))

    unscoped = _make_interaction()
    unscoped.project = ""
    payload = '[{"type": "project", "content": "x", "project": "new-lead-project"}]'

    candidates = _parse_candidates(payload, unscoped)

    assert candidates[0].project == "new-lead-project"
|
|
|
|
|
|
def test_case_e_matching_model_and_interaction(tmp_data_dir, project_registry):
    """Case E: the model and the interaction name the same project.

    That project is the one set on the candidate.
    """
    from atocore.models.database import init_db

    init_db()
    project_registry(("p06-polisher", ["p06"]))

    scoped = _make_interaction()
    scoped.project = "p06-polisher"
    payload = '[{"type": "project", "content": "x", "project": "p06-polisher"}]'

    candidates = _parse_candidates(payload, scoped)

    assert candidates[0].project == "p06-polisher"
|
|
|
|
|
|
def test_case_f_wrong_registered_model_scoped_interaction(tmp_data_dir, project_registry):
    """Case F — the R9 core failure.

    The model names a registered project that DIFFERS from the interaction's
    known scope. The interaction's project wins. This is the case that was
    broken before the R9 fix.
    """
    from atocore.models.database import init_db

    init_db()
    # Both projects are registered, so the model's tag is a valid but wrong one.
    project_registry(("p04-gigabit", ["p04"]), ("p06-polisher", ["p06"]))

    scoped = _make_interaction()
    scoped.project = "p06-polisher"
    payload = '[{"type": "project", "content": "x", "project": "p04-gigabit"}]'

    candidates = _parse_candidates(payload, scoped)

    assert candidates[0].project == "p06-polisher"
|
|
|
|
|
|
def test_case_g_registered_model_unscoped_interaction(tmp_data_dir, project_registry):
    """Case G: the model names a registered project; the interaction is unscoped.

    The model's project is accepted — it is the only way an unscoped capture
    can get a project.
    """
    from atocore.models.database import init_db

    init_db()
    project_registry(("p04-gigabit", ["p04"]))

    unscoped = _make_interaction()
    unscoped.project = ""
    payload = '[{"type": "project", "content": "x", "project": "p04-gigabit"}]'

    candidates = _parse_candidates(payload, unscoped)

    assert candidates[0].project == "p04-gigabit"
|
|
|
|
|
|
def test_missing_cli_returns_empty(monkeypatch):
    """If ``claude`` is not on PATH the extractor returns empty, never raises."""

    def _cli_absent():
        return False

    monkeypatch.setattr(extractor_llm, "_cli_available", _cli_absent)
    interaction = _make_interaction("p", "some real response")

    outcome = extract_candidates_llm_verbose(interaction)

    assert outcome.candidates == []
    assert outcome.error == "claude_cli_missing"
|
|
|
|
|
|
def test_empty_response_returns_empty(monkeypatch):
    """An interaction with an empty response yields no candidates and ``"empty_response"``."""

    def _cli_present():
        return True

    monkeypatch.setattr(extractor_llm, "_cli_available", _cli_present)
    blank_interaction = _make_interaction("p", "")

    outcome = extract_candidates_llm_verbose(blank_interaction)

    assert outcome.candidates == []
    assert outcome.error == "empty_response"
|
|
|
|
|
|
def test_subprocess_timeout_returns_empty(monkeypatch):
    """A subprocess timeout must not raise into the caller."""
    import subprocess as _sp

    def _raise_timeout(*args, **kwargs):
        # Echo back the command that was passed in, if any.
        command = args[0] if args else "claude"
        raise _sp.TimeoutExpired(cmd=command, timeout=1)

    monkeypatch.setattr(extractor_llm, "_cli_available", lambda: True)
    monkeypatch.setattr(extractor_llm.subprocess, "run", _raise_timeout)
    interaction = _make_interaction("p", "real response")

    outcome = extract_candidates_llm_verbose(interaction)

    assert outcome.candidates == []
    assert outcome.error == "timeout"
|
|
|
|
|
|
def test_subprocess_nonzero_exit_returns_empty(monkeypatch):
    """A non-zero CLI exit (auth failure, etc.) must not raise."""

    class _FailedRun:
        # Stand-in for the object returned by ``subprocess.run``.
        returncode = 1
        stdout = ""
        stderr = "auth failed"

    def _fake_run(*args, **kwargs):
        return _FailedRun()

    monkeypatch.setattr(extractor_llm, "_cli_available", lambda: True)
    monkeypatch.setattr(extractor_llm.subprocess, "run", _fake_run)
    interaction = _make_interaction("p", "real response")

    outcome = extract_candidates_llm_verbose(interaction)

    assert outcome.candidates == []
    assert outcome.error == "exit_1"
|
|
|
|
|
|
def test_happy_path_parses_stdout(monkeypatch):
    """On a zero exit code, the JSON on stdout is parsed into candidates."""

    class _SuccessfulRun:
        # Stand-in for the object returned by ``subprocess.run``.
        returncode = 0
        stdout = '[{"type": "project", "content": "p04 selected Option B", "project": "p04-gigabit", "confidence": 0.6}]'
        stderr = ""

    def _fake_run(*args, **kwargs):
        return _SuccessfulRun()

    monkeypatch.setattr(extractor_llm, "_cli_available", lambda: True)
    monkeypatch.setattr(extractor_llm.subprocess, "run", _fake_run)

    outcome = extract_candidates_llm_verbose(_make_interaction("p", "r"))

    assert len(outcome.candidates) == 1
    candidate = outcome.candidates[0]
    assert candidate.memory_type == "project"
    assert candidate.project == "p04-gigabit"
    # Compare the float with a tolerance rather than exact equality.
    assert abs(candidate.confidence - 0.6) < 1e-9
|