Second pass on the LLM-assisted extractor after Antoine's explicit
rule: no API key, ever. Refactored src/atocore/memory/extractor_llm.py
to shell out to the Claude Code 'claude -p' CLI via subprocess instead
of the anthropic SDK, so extraction reuses the user's existing Claude.ai
OAuth credentials and needs zero secret management.
Implementation:
- subprocess.run(["claude", "-p", "--model", "haiku",
"--append-system-prompt", <instructions>,
"--no-session-persistence", "--disable-slash-commands",
user_message], ...)
- cwd is a cached tempfile.mkdtemp() so every invocation starts with
a clean context instead of auto-discovering CLAUDE.md / AGENTS.md /
DEV-LEDGER.md from the repo root. We cannot use --bare because it
forces API-key auth, which defeats the purpose; the temp-cwd trick
is the lightest way to keep OAuth auth while skipping project
context loading.
- Silent-failure contract unchanged: missing CLI, non-zero exit,
timeout, malformed JSON — all return [] and log an error. The
capture audit trail must not break on an optional side effect.
- Default timeout bumped from 20s to 90s: Haiku + Node.js startup
+ OAuth check is ~20-40s per call in practice, plus real responses
up to 8KB take longer. 45s hit 2 timeouts on the first live run.
- tests/test_extractor_llm.py refactored: the API-key / anthropic SDK
tests are replaced by subprocess-mocking tests covering missing
CLI, timeout, non-zero exit, and a happy-path stdout parse. 14
tests, all green.
scripts/extractor_eval.py:
- New --output <path> flag writes the JSON result directly to a file,
bypassing stdout/log interleaving (structlog sends INFO to stdout
via PrintLoggerFactory, so a naive '> out.json' pollutes the file).
- Forces UTF-8 on stdout so real LLM output with em-dashes / arrows /
CJK doesn't crash the human report on Windows cp1252 consoles.
First live baseline run against the 20-interaction labeled corpus
(scripts/eval_data/extractor_llm_baseline_2026-04-11.json):
mode=llm labeled=20 recall=1.0 precision=0.357 yield_rate=2.55
total_actual_candidates=51 total_expected_candidates=7
false_negative_interactions=0 false_positive_interactions=9
Recall 0% -> 100% vs rule baseline — every human-labeled positive is
caught. Precision reads low (0.357) but inspection shows the "false
positives" are real candidates the human labels under-counted. For
example interaction a6b0d279 was labeled at 2 expected candidates,
the model caught all 6 polisher architectural facts; interaction
52c8c0f3 was labeled at 1, the model caught all 5 infra commitments.
The labels are the bottleneck, not the model.
Day 4 gate against Codex's criteria:
- candidate yield: 255% vs ≥15-25% target
- FP rate tolerable for manual triage: 51 candidates reviewable in
~10 minutes via the triage CLI
- ≥2 real non-synthetic candidates worth review: 20+ obvious wins
(polisher architecture set, p05 infra set, DEV-LEDGER protocol set)
Gate cleared. LLM-assisted extraction is the path forward for
conversational captures. Rule-based extractor stays as-is for
structured-cue inputs and remains the default mode. The next step
(Day 5 stabilize / document) will wire LLM mode behind a flag in
the public extraction endpoint and document scope.
Test count: 276 -> 278 passing. No existing tests changed.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
275 lines
9.9 KiB
Python
275 lines
9.9 KiB
Python
"""Extractor eval runner — scores the rule-based extractor against a
|
|
labeled interaction corpus.
|
|
|
|
Pulls full interaction content from a frozen snapshot, runs each through
|
|
``extract_candidates_from_interaction``, and compares the output to the
|
|
expected counts from a labels file. Produces a per-label scorecard plus
|
|
aggregate precision / recall / yield numbers.
|
|
|
|
This harness deliberately stays file-based: snapshot + labels + this
|
|
runner. No Dalidou HTTP dependency once the snapshot is frozen, so the
|
|
eval is reproducible run-to-run even as live captures drift.
|
|
|
|
Usage:
|
|
|
|
python scripts/extractor_eval.py # human report
|
|
python scripts/extractor_eval.py --json # machine-readable
|
|
python scripts/extractor_eval.py \\
|
|
--snapshot scripts/eval_data/interactions_snapshot_2026-04-11.json \\
|
|
--labels scripts/eval_data/extractor_labels_2026-04-11.json
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import io
|
|
import json
|
|
import sys
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
|
|
# Force UTF-8 on stdout so real LLM output (arrows, em-dashes, CJK)
|
|
# doesn't crash the human report on Windows cp1252 consoles.
|
|
if hasattr(sys.stdout, "buffer"):
|
|
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace", line_buffering=True)
|
|
|
|
# Make src/ importable without requiring an install.
|
|
_REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
sys.path.insert(0, str(_REPO_ROOT / "src"))
|
|
|
|
from atocore.interactions.service import Interaction # noqa: E402
|
|
from atocore.memory.extractor import extract_candidates_from_interaction # noqa: E402
|
|
from atocore.memory.extractor_llm import extract_candidates_llm # noqa: E402
|
|
|
|
DEFAULT_SNAPSHOT = _REPO_ROOT / "scripts" / "eval_data" / "interactions_snapshot_2026-04-11.json"
|
|
DEFAULT_LABELS = _REPO_ROOT / "scripts" / "eval_data" / "extractor_labels_2026-04-11.json"
|
|
|
|
|
|
@dataclass
|
|
class LabelResult:
|
|
id: str
|
|
expected_count: int
|
|
actual_count: int
|
|
ok: bool
|
|
miss_class: str
|
|
notes: str
|
|
actual_candidates: list[dict] = field(default_factory=list)
|
|
|
|
|
|
def load_snapshot(path: Path) -> dict[str, dict]:
|
|
data = json.loads(path.read_text(encoding="utf-8"))
|
|
return {item["id"]: item for item in data.get("interactions", [])}
|
|
|
|
|
|
def load_labels(path: Path) -> dict:
|
|
return json.loads(path.read_text(encoding="utf-8"))
|
|
|
|
|
|
def interaction_from_snapshot(snap: dict) -> Interaction:
|
|
return Interaction(
|
|
id=snap["id"],
|
|
prompt=snap.get("prompt", "") or "",
|
|
response=snap.get("response", "") or "",
|
|
response_summary="",
|
|
project=snap.get("project", "") or "",
|
|
client=snap.get("client", "") or "",
|
|
session_id=snap.get("session_id", "") or "",
|
|
created_at=snap.get("created_at", "") or "",
|
|
)
|
|
|
|
|
|
def score(snapshot: dict[str, dict], labels_doc: dict, mode: str = "rule") -> list[LabelResult]:
|
|
results: list[LabelResult] = []
|
|
for label in labels_doc["labels"]:
|
|
iid = label["id"]
|
|
snap = snapshot.get(iid)
|
|
if snap is None:
|
|
results.append(
|
|
LabelResult(
|
|
id=iid,
|
|
expected_count=int(label.get("expected_count", 0)),
|
|
actual_count=-1,
|
|
ok=False,
|
|
miss_class="not_in_snapshot",
|
|
notes=label.get("notes", ""),
|
|
)
|
|
)
|
|
continue
|
|
interaction = interaction_from_snapshot(snap)
|
|
if mode == "llm":
|
|
candidates = extract_candidates_llm(interaction)
|
|
else:
|
|
candidates = extract_candidates_from_interaction(interaction)
|
|
actual_count = len(candidates)
|
|
expected_count = int(label.get("expected_count", 0))
|
|
results.append(
|
|
LabelResult(
|
|
id=iid,
|
|
expected_count=expected_count,
|
|
actual_count=actual_count,
|
|
ok=(actual_count == expected_count),
|
|
miss_class=label.get("miss_class", "n/a"),
|
|
notes=label.get("notes", ""),
|
|
actual_candidates=[
|
|
{
|
|
"memory_type": c.memory_type,
|
|
"content": c.content,
|
|
"project": c.project,
|
|
"rule": c.rule,
|
|
}
|
|
for c in candidates
|
|
],
|
|
)
|
|
)
|
|
return results
|
|
|
|
|
|
def aggregate(results: list[LabelResult]) -> dict:
|
|
total = len(results)
|
|
exact_match = sum(1 for r in results if r.ok)
|
|
true_positive = sum(1 for r in results if r.expected_count > 0 and r.actual_count > 0)
|
|
false_positive_interactions = sum(
|
|
1 for r in results if r.expected_count == 0 and r.actual_count > 0
|
|
)
|
|
false_negative_interactions = sum(
|
|
1 for r in results if r.expected_count > 0 and r.actual_count == 0
|
|
)
|
|
positive_expected = sum(1 for r in results if r.expected_count > 0)
|
|
total_expected_candidates = sum(r.expected_count for r in results)
|
|
total_actual_candidates = sum(max(r.actual_count, 0) for r in results)
|
|
yield_rate = total_actual_candidates / total if total else 0.0
|
|
# Recall over interaction count that had at least one expected candidate:
|
|
recall = true_positive / positive_expected if positive_expected else 0.0
|
|
# Precision over interaction count that produced any candidate:
|
|
precision_denom = true_positive + false_positive_interactions
|
|
precision = true_positive / precision_denom if precision_denom else 0.0
|
|
# Miss class breakdown
|
|
miss_classes: dict[str, int] = {}
|
|
for r in results:
|
|
if r.expected_count > 0 and r.actual_count == 0:
|
|
key = r.miss_class or "unlabeled"
|
|
miss_classes[key] = miss_classes.get(key, 0) + 1
|
|
return {
|
|
"total": total,
|
|
"exact_match": exact_match,
|
|
"positive_expected": positive_expected,
|
|
"total_expected_candidates": total_expected_candidates,
|
|
"total_actual_candidates": total_actual_candidates,
|
|
"yield_rate": round(yield_rate, 3),
|
|
"recall": round(recall, 3),
|
|
"precision": round(precision, 3),
|
|
"false_positive_interactions": false_positive_interactions,
|
|
"false_negative_interactions": false_negative_interactions,
|
|
"miss_classes": miss_classes,
|
|
}
|
|
|
|
|
|
def print_human(results: list[LabelResult], summary: dict) -> None:
|
|
print("=== Extractor eval ===")
|
|
print(
|
|
f"labeled={summary['total']} "
|
|
f"exact_match={summary['exact_match']} "
|
|
f"positive_expected={summary['positive_expected']}"
|
|
)
|
|
print(
|
|
f"yield={summary['yield_rate']} "
|
|
f"recall={summary['recall']} "
|
|
f"precision={summary['precision']}"
|
|
)
|
|
print(
|
|
f"false_positives={summary['false_positive_interactions']} "
|
|
f"false_negatives={summary['false_negative_interactions']}"
|
|
)
|
|
print()
|
|
print("miss class breakdown (FN):")
|
|
if summary["miss_classes"]:
|
|
for k, v in sorted(summary["miss_classes"].items(), key=lambda kv: -kv[1]):
|
|
print(f" {v:3d} {k}")
|
|
else:
|
|
print(" (none)")
|
|
print()
|
|
print("per-interaction:")
|
|
for r in results:
|
|
marker = "OK " if r.ok else "MISS"
|
|
iid_short = r.id[:8]
|
|
print(f" {marker} {iid_short} expected={r.expected_count} actual={r.actual_count} class={r.miss_class}")
|
|
if r.actual_candidates:
|
|
for c in r.actual_candidates:
|
|
preview = (c["content"] or "")[:80]
|
|
print(f" [{c['memory_type']}] {preview}")
|
|
|
|
|
|
def print_json(results: list[LabelResult], summary: dict) -> None:
|
|
payload = {
|
|
"summary": summary,
|
|
"results": [
|
|
{
|
|
"id": r.id,
|
|
"expected_count": r.expected_count,
|
|
"actual_count": r.actual_count,
|
|
"ok": r.ok,
|
|
"miss_class": r.miss_class,
|
|
"notes": r.notes,
|
|
"actual_candidates": r.actual_candidates,
|
|
}
|
|
for r in results
|
|
],
|
|
}
|
|
json.dump(payload, sys.stdout, indent=2)
|
|
sys.stdout.write("\n")
|
|
|
|
|
|
def main() -> int:
|
|
parser = argparse.ArgumentParser(description="AtoCore extractor eval")
|
|
parser.add_argument("--snapshot", type=Path, default=DEFAULT_SNAPSHOT)
|
|
parser.add_argument("--labels", type=Path, default=DEFAULT_LABELS)
|
|
parser.add_argument("--json", action="store_true", help="emit machine-readable JSON")
|
|
parser.add_argument(
|
|
"--output",
|
|
type=Path,
|
|
default=None,
|
|
help="write JSON result to this file (bypasses log/stdout interleaving)",
|
|
)
|
|
parser.add_argument(
|
|
"--mode",
|
|
choices=["rule", "llm"],
|
|
default="rule",
|
|
help="which extractor to score (default: rule)",
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
snapshot = load_snapshot(args.snapshot)
|
|
labels = load_labels(args.labels)
|
|
results = score(snapshot, labels, mode=args.mode)
|
|
summary = aggregate(results)
|
|
summary["mode"] = args.mode
|
|
|
|
if args.output is not None:
|
|
payload = {
|
|
"summary": summary,
|
|
"results": [
|
|
{
|
|
"id": r.id,
|
|
"expected_count": r.expected_count,
|
|
"actual_count": r.actual_count,
|
|
"ok": r.ok,
|
|
"miss_class": r.miss_class,
|
|
"notes": r.notes,
|
|
"actual_candidates": r.actual_candidates,
|
|
}
|
|
for r in results
|
|
],
|
|
}
|
|
args.output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
|
|
print(f"wrote {args.output} ({summary['mode']}: recall={summary['recall']} precision={summary['precision']})")
|
|
elif args.json:
|
|
print_json(results, summary)
|
|
else:
|
|
print_human(results, summary)
|
|
|
|
return 0 if summary["false_negative_interactions"] == 0 and summary["false_positive_interactions"] == 0 else 1
|
|
|
|
|
|
if __name__ == "__main__":
|
|
raise SystemExit(main())
|