2026-04-11 15:11:33 -04:00
|
|
|
"""Extractor eval runner — scores the rule-based extractor against a
|
|
|
|
|
labeled interaction corpus.
|
|
|
|
|
|
|
|
|
|
Pulls full interaction content from a frozen snapshot, runs each through
|
|
|
|
|
``extract_candidates_from_interaction``, and compares the output to the
|
|
|
|
|
expected counts from a labels file. Produces a per-label scorecard plus
|
|
|
|
|
aggregate precision / recall / yield numbers.
|
|
|
|
|
|
|
|
|
|
This harness deliberately stays file-based: snapshot + labels + this
|
|
|
|
|
runner. No Dalidou HTTP dependency once the snapshot is frozen, so the
|
|
|
|
|
eval is reproducible run-to-run even as live captures drift.
|
|
|
|
|
|
|
|
|
|
Usage:
|
|
|
|
|
|
|
|
|
|
python scripts/extractor_eval.py # human report
|
|
|
|
|
python scripts/extractor_eval.py --json # machine-readable
|
|
|
|
|
python scripts/extractor_eval.py \\
|
|
|
|
|
--snapshot scripts/eval_data/interactions_snapshot_2026-04-11.json \\
|
|
|
|
|
--labels scripts/eval_data/extractor_labels_2026-04-11.json
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
|
import json
|
|
|
|
|
import sys
|
|
|
|
|
from dataclasses import dataclass, field
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
# Make src/ importable without requiring an install.
|
|
|
|
|
_REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
|
|
|
sys.path.insert(0, str(_REPO_ROOT / "src"))
|
|
|
|
|
|
|
|
|
|
from atocore.interactions.service import Interaction # noqa: E402
|
|
|
|
|
from atocore.memory.extractor import extract_candidates_from_interaction # noqa: E402
|
feat(eval-loop): Day 4 — LLM-assisted extractor path (additive, flagged)
Day 2 baseline showed 0% recall for the rule-based extractor across
5 distinct miss classes. Day 4 decision gate: prototype an
LLM-assisted mode behind a flag. Option A ratified by Antoine.
New module src/atocore/memory/extractor_llm.py:
- extract_candidates_llm(interaction) returns the same MemoryCandidate
dataclass the rule extractor produces, so both paths flow through
the existing triage / candidate pipeline unchanged.
- extract_candidates_llm_verbose() also returns the raw model output
and any error string, for eval and debugging.
- Uses Claude Haiku 4.5 by default; model overridable via
ATOCORE_LLM_EXTRACTOR_MODEL env. Timeout via
ATOCORE_LLM_EXTRACTOR_TIMEOUT_S (default 20s).
- Silent-failure contract: missing API key, unreachable model,
malformed JSON — all return [] and log an error. Never raises
into the caller. The capture audit trail must not break on an
optional side effect.
- Parser tolerates markdown fences, surrounding prose, invalid
memory types, clamps confidence to [0,1], drops empty content.
- System prompt explicitly tells the model to return [] for most
conversational turns (durable-fact bar, not "extract everything").
- Trust rules unchanged: candidates are never auto-promoted,
extraction stays off the capture hot path, human triages via the
existing CLI.
scripts/extractor_eval.py: new --mode {rule,llm} flag so the same
labeled corpus can be scored against both extractors. Default
remains rule so existing invocations are unchanged.
tests/test_extractor_llm.py: 12 new unit tests covering the parser
(empty array, malformed JSON, markdown fences, surrounding prose,
invalid types, empty content, confidence clamping, version tagging),
plus contract tests for missing API key, empty response, and a
mocked api_error path so failure modes never raise.
Test count: 264 -> 276 passing. No existing tests changed.
Next step: run `python scripts/extractor_eval.py --mode llm` against
the labeled set with ANTHROPIC_API_KEY in env, record the delta,
decide whether to wire LLM mode into the API endpoint and CLI or
keep it script-only for now.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 15:18:30 -04:00
|
|
|
from atocore.memory.extractor_llm import extract_candidates_llm # noqa: E402
|
2026-04-11 15:11:33 -04:00
|
|
|
|
|
|
|
|
# Frozen eval fixtures checked into the repo: a snapshot of interaction
# content and the hand-labeled expected extraction counts for the same
# capture date. Overridable via --snapshot / --labels on the CLI.
DEFAULT_SNAPSHOT = _REPO_ROOT / "scripts" / "eval_data" / "interactions_snapshot_2026-04-11.json"
DEFAULT_LABELS = _REPO_ROOT / "scripts" / "eval_data" / "extractor_labels_2026-04-11.json"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class LabelResult:
    """Scoring outcome for one labeled interaction: expected vs. actual counts."""

    id: str  # interaction id from the labels file
    expected_count: int  # labeled number of memory candidates
    actual_count: int  # extractor output count; -1 when the id is missing from the snapshot
    ok: bool  # True when actual_count == expected_count exactly
    miss_class: str  # labeler-assigned failure bucket ("not_in_snapshot" when the id is absent)
    notes: str  # free-form labeler notes, carried through to reports
    actual_candidates: list[dict] = field(default_factory=list)  # serialized candidates for report previews
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_snapshot(path: Path) -> dict[str, dict]:
    """Read a frozen snapshot file and index its interaction records by id."""
    doc = json.loads(path.read_text(encoding="utf-8"))
    by_id: dict[str, dict] = {}
    for record in doc.get("interactions", []):
        by_id[record["id"]] = record
    return by_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_labels(path: Path) -> dict:
    """Parse the labels document (expected counts, miss classes, notes) from disk."""
    raw = path.read_text(encoding="utf-8")
    return json.loads(raw)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def interaction_from_snapshot(snap: dict) -> Interaction:
    """Rehydrate a minimal Interaction from one snapshot record.

    Missing or null text fields collapse to "" so the extractor sees the
    same shapes it would on a live capture. response_summary is not part
    of the snapshot and is always blank here.
    """

    def _text(key: str) -> str:
        # Snapshot values may be absent or explicitly null; both become "".
        return snap.get(key, "") or ""

    return Interaction(
        id=snap["id"],
        prompt=_text("prompt"),
        response=_text("response"),
        response_summary="",
        project=_text("project"),
        client=_text("client"),
        session_id=_text("session_id"),
        created_at=_text("created_at"),
    )
|
|
|
|
|
|
|
|
|
|
|
feat(eval-loop): Day 4 — LLM-assisted extractor path (additive, flagged)
Day 2 baseline showed 0% recall for the rule-based extractor across
5 distinct miss classes. Day 4 decision gate: prototype an
LLM-assisted mode behind a flag. Option A ratified by Antoine.
New module src/atocore/memory/extractor_llm.py:
- extract_candidates_llm(interaction) returns the same MemoryCandidate
dataclass the rule extractor produces, so both paths flow through
the existing triage / candidate pipeline unchanged.
- extract_candidates_llm_verbose() also returns the raw model output
and any error string, for eval and debugging.
- Uses Claude Haiku 4.5 by default; model overridable via
ATOCORE_LLM_EXTRACTOR_MODEL env. Timeout via
ATOCORE_LLM_EXTRACTOR_TIMEOUT_S (default 20s).
- Silent-failure contract: missing API key, unreachable model,
malformed JSON — all return [] and log an error. Never raises
into the caller. The capture audit trail must not break on an
optional side effect.
- Parser tolerates markdown fences, surrounding prose, invalid
memory types, clamps confidence to [0,1], drops empty content.
- System prompt explicitly tells the model to return [] for most
conversational turns (durable-fact bar, not "extract everything").
- Trust rules unchanged: candidates are never auto-promoted,
extraction stays off the capture hot path, human triages via the
existing CLI.
scripts/extractor_eval.py: new --mode {rule,llm} flag so the same
labeled corpus can be scored against both extractors. Default
remains rule so existing invocations are unchanged.
tests/test_extractor_llm.py: 12 new unit tests covering the parser
(empty array, malformed JSON, markdown fences, surrounding prose,
invalid types, empty content, confidence clamping, version tagging),
plus contract tests for missing API key, empty response, and a
mocked api_error path so failure modes never raise.
Test count: 264 -> 276 passing. No existing tests changed.
Next step: run `python scripts/extractor_eval.py --mode llm` against
the labeled set with ANTHROPIC_API_KEY in env, record the delta,
decide whether to wire LLM mode into the API endpoint and CLI or
keep it script-only for now.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-11 15:18:30 -04:00
|
|
|
def score(snapshot: dict[str, dict], labels_doc: dict, mode: str = "rule") -> list[LabelResult]:
    """Run the selected extractor over every labeled interaction and record outcomes.

    mode is "rule" (default) or "llm"; any value other than "llm" falls
    through to the rule-based extractor, matching the CLI's choices.
    Labels whose id is absent from the snapshot are kept visible with a
    sentinel actual_count of -1 rather than dropped.
    """
    scored: list[LabelResult] = []
    for label in labels_doc["labels"]:
        iid = label["id"]
        expected = int(label.get("expected_count", 0))
        notes = label.get("notes", "")
        record = snapshot.get(iid)

        if record is None:
            # Label references an interaction the frozen snapshot lacks.
            scored.append(
                LabelResult(
                    id=iid,
                    expected_count=expected,
                    actual_count=-1,
                    ok=False,
                    miss_class="not_in_snapshot",
                    notes=notes,
                )
            )
            continue

        interaction = interaction_from_snapshot(record)
        extractor = extract_candidates_llm if mode == "llm" else extract_candidates_from_interaction
        candidates = extractor(interaction)

        scored.append(
            LabelResult(
                id=iid,
                expected_count=expected,
                actual_count=len(candidates),
                ok=len(candidates) == expected,
                miss_class=label.get("miss_class", "n/a"),
                notes=notes,
                actual_candidates=[
                    {
                        "memory_type": c.memory_type,
                        "content": c.content,
                        "project": c.project,
                        "rule": c.rule,
                    }
                    for c in candidates
                ],
            )
        )
    return scored
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def aggregate(results: list[LabelResult]) -> dict:
    """Roll per-label results up into scorecard metrics.

    Precision and recall are computed at interaction granularity (did the
    extractor fire at all on an interaction that should / should not have
    candidates), not per candidate. Yield is candidates produced per
    labeled interaction. A sentinel actual_count of -1 (snapshot miss)
    contributes zero produced candidates.
    """

    def _ratio(num: int, den: int) -> float:
        # Avoid ZeroDivisionError on empty / all-negative corpora.
        return num / den if den else 0.0

    total = len(results)
    exact_match = sum(r.ok for r in results)
    tp = sum(1 for r in results if r.expected_count > 0 and r.actual_count > 0)
    fp = sum(1 for r in results if r.expected_count == 0 and r.actual_count > 0)
    fn = sum(1 for r in results if r.expected_count > 0 and r.actual_count == 0)
    positive_expected = sum(1 for r in results if r.expected_count > 0)
    total_expected_candidates = sum(r.expected_count for r in results)
    # Clamp the -1 sentinel to zero produced candidates.
    total_actual_candidates = sum(r.actual_count for r in results if r.actual_count > 0)

    # False-negative breakdown keyed by the labeler's miss class.
    miss_classes: dict[str, int] = {}
    for r in results:
        if r.expected_count > 0 and r.actual_count == 0:
            bucket = r.miss_class or "unlabeled"
            miss_classes[bucket] = miss_classes.get(bucket, 0) + 1

    return {
        "total": total,
        "exact_match": exact_match,
        "positive_expected": positive_expected,
        "total_expected_candidates": total_expected_candidates,
        "total_actual_candidates": total_actual_candidates,
        "yield_rate": round(_ratio(total_actual_candidates, total), 3),
        "recall": round(_ratio(tp, positive_expected), 3),
        "precision": round(_ratio(tp, tp + fp), 3),
        "false_positive_interactions": fp,
        "false_negative_interactions": fn,
        "miss_classes": miss_classes,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def print_human(results: list[LabelResult], summary: dict) -> None:
    """Render the scorecard as a human-readable report on stdout."""
    s = summary
    print("=== Extractor eval ===")
    print(f"labeled={s['total']} exact_match={s['exact_match']} positive_expected={s['positive_expected']}")
    print(f"yield={s['yield_rate']} recall={s['recall']} precision={s['precision']}")
    print(f"false_positives={s['false_positive_interactions']} false_negatives={s['false_negative_interactions']}")
    print()
    print("miss class breakdown (FN):")
    breakdown = s["miss_classes"]
    if breakdown:
        # Largest miss class first; ties keep insertion order (stable sort).
        for name, count in sorted(breakdown.items(), key=lambda kv: kv[1], reverse=True):
            print(f"  {count:3d} {name}")
    else:
        print("  (none)")
    print()
    print("per-interaction:")
    for res in results:
        status = "OK " if res.ok else "MISS"
        print(f"  {status} {res.id[:8]} expected={res.expected_count} actual={res.actual_count} class={res.miss_class}")
        # Preview each produced candidate; empty list prints nothing.
        for cand in res.actual_candidates:
            print(f"      [{cand['memory_type']}] {(cand['content'] or '')[:80]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def print_json(results: list[LabelResult], summary: dict) -> None:
    """Emit the scorecard and per-label rows as indented JSON on stdout."""
    rows = []
    for r in results:
        rows.append(
            {
                "id": r.id,
                "expected_count": r.expected_count,
                "actual_count": r.actual_count,
                "ok": r.ok,
                "miss_class": r.miss_class,
                "notes": r.notes,
                "actual_candidates": r.actual_candidates,
            }
        )
    json.dump({"summary": summary, "results": rows}, sys.stdout, indent=2)
    # json.dump does not emit a trailing newline; keep output line-terminated.
    sys.stdout.write("\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> int:
    """CLI entry point.

    Returns 0 only on a perfect scorecard — zero false-positive and zero
    false-negative interactions — so CI can treat any regression as a
    nonzero exit.
    """
    ap = argparse.ArgumentParser(description="AtoCore extractor eval")
    ap.add_argument("--snapshot", type=Path, default=DEFAULT_SNAPSHOT)
    ap.add_argument("--labels", type=Path, default=DEFAULT_LABELS)
    ap.add_argument("--json", action="store_true", help="emit machine-readable JSON")
    ap.add_argument(
        "--mode",
        choices=["rule", "llm"],
        default="rule",
        help="which extractor to score (default: rule)",
    )
    opts = ap.parse_args()

    results = score(load_snapshot(opts.snapshot), load_labels(opts.labels), mode=opts.mode)
    summary = aggregate(results)
    # Record which extractor produced these numbers in the report itself.
    summary["mode"] = opts.mode

    reporter = print_json if opts.json else print_human
    reporter(results, summary)

    clean = (
        summary["false_negative_interactions"] == 0
        and summary["false_positive_interactions"] == 0
    )
    return 0 if clean else 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Surface main()'s status code as the process exit code.
    sys.exit(main())
|