scripts/extractor_eval.py

"""Extractor eval runner — scores the rule-based extractor against a
labeled interaction corpus.

Pulls full interaction content from a frozen snapshot, runs each through
``extract_candidates_from_interaction``, and compares the output to the
expected counts from a labels file. Produces a per-label scorecard plus
aggregate precision / recall / yield numbers.

This harness deliberately stays file-based: snapshot + labels + this
runner. No Dalidou HTTP dependency once the snapshot is frozen, so the
eval is reproducible run-to-run even as live captures drift.

Usage:

    python scripts/extractor_eval.py                # human report
    python scripts/extractor_eval.py --json         # machine-readable
    python scripts/extractor_eval.py \\
        --snapshot scripts/eval_data/interactions_snapshot_2026-04-11.json \\
        --labels   scripts/eval_data/extractor_labels_2026-04-11.json
"""

from __future__ import annotations

import argparse
import json
import sys
from dataclasses import dataclass, field
from pathlib import Path

# Make src/ importable without requiring an install.
_REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(_REPO_ROOT / "src"))

from atocore.interactions.service import Interaction  # noqa: E402
from atocore.memory.extractor import extract_candidates_from_interaction  # noqa: E402

# Default input files, resolved relative to the repository root. They are used
# when --snapshot / --labels are not supplied on the command line.
DEFAULT_SNAPSHOT = _REPO_ROOT / "scripts" / "eval_data" / "interactions_snapshot_2026-04-11.json"
DEFAULT_LABELS = _REPO_ROOT / "scripts" / "eval_data" / "extractor_labels_2026-04-11.json"


@dataclass
class LabelResult:
    """Outcome of scoring one labeled interaction against the extractor.

    ``actual_count`` is ``-1`` when the labeled interaction id was not found
    in the snapshot; in that case no extraction was attempted and
    ``miss_class`` is set to ``"not_in_snapshot"``.
    """

    id: str  # interaction id copied from the label entry
    expected_count: int  # number of candidates the label says should be extracted
    actual_count: int  # number of candidates the extractor produced (-1 = not scored)
    ok: bool  # True when actual_count equals expected_count
    miss_class: str  # miss category from the label ("n/a" when the label has none)
    notes: str  # free-form notes copied from the label entry
    # One dict per extracted candidate with the keys
    # "memory_type", "content", "project" and "rule".
    actual_candidates: list[dict] = field(default_factory=list)


def load_snapshot(path: Path) -> dict[str, dict]:
    """Read a snapshot JSON file and index its interactions by id.

    The file must contain a JSON object. Entries are taken from its
    ``"interactions"`` list (treated as empty when the key is absent) and
    each entry must carry an ``"id"`` key. If two entries share an id, the
    later entry replaces the earlier one.
    """
    document = json.loads(path.read_text(encoding="utf-8"))
    by_id: dict[str, dict] = {}
    for entry in document.get("interactions", []):
        by_id[entry["id"]] = entry
    return by_id


def load_labels(path: Path) -> dict:
    """Parse the UTF-8 encoded labels JSON file at *path* and return it as-is."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


def interaction_from_snapshot(snap: dict) -> Interaction:
    """Build an ``Interaction`` from one snapshot entry.

    ``snap["id"]` is required. The remaining text fields are optional: a
    missing, ``None`` or otherwise falsy value is replaced by an empty
    string. ``response_summary`` is always passed as an empty string.
    """
    optional_text_fields = (
        "prompt",
        "response",
        "project",
        "client",
        "session_id",
        "created_at",
    )
    text_values = {name: snap.get(name, "") or "" for name in optional_text_fields}
    return Interaction(id=snap["id"], response_summary="", **text_values)


def score(snapshot: dict[str, dict], labels_doc: dict) -> list[LabelResult]:
    """Run the extractor on every labeled interaction and record the outcome.

    ``labels_doc["labels"]`` is iterated in order and each label must carry
    an ``"id"``. ``expected_count`` defaults to 0 and is coerced with
    ``int()``; ``notes`` defaults to ``""`` and ``miss_class`` to ``"n/a"``.

    A label whose id is not a key of *snapshot* is not extracted at all: it
    is recorded with ``actual_count=-1``, ``ok=False`` and
    ``miss_class="not_in_snapshot"``.

    Returns one ``LabelResult`` per label, in label order.
    """
    scorecard: list[LabelResult] = []
    for entry in labels_doc["labels"]:
        interaction_id = entry["id"]
        frozen = snapshot.get(interaction_id)

        if frozen is None:
            # Nothing to run the extractor on; -1 marks "never scored".
            unscored = LabelResult(
                id=interaction_id,
                expected_count=int(entry.get("expected_count", 0)),
                actual_count=-1,
                ok=False,
                miss_class="not_in_snapshot",
                notes=entry.get("notes", ""),
            )
            scorecard.append(unscored)
            continue

        extracted = extract_candidates_from_interaction(
            interaction_from_snapshot(frozen)
        )
        found = len(extracted)
        wanted = int(entry.get("expected_count", 0))

        # Flatten candidates to plain dicts so the result is JSON-serializable.
        flattened = []
        for candidate in extracted:
            flattened.append(
                {
                    "memory_type": candidate.memory_type,
                    "content": candidate.content,
                    "project": candidate.project,
                    "rule": candidate.rule,
                }
            )

        scorecard.append(
            LabelResult(
                id=interaction_id,
                expected_count=wanted,
                actual_count=found,
                ok=(found == wanted),
                miss_class=entry.get("miss_class", "n/a"),
                notes=entry.get("notes", ""),
                actual_candidates=flattened,
            )
        )
    return scorecard


def aggregate(results: list[LabelResult]) -> dict:
    """Roll per-label results up into corpus-level counts and rates.

    All precision / recall figures are interaction-level, not
    candidate-level. An interaction is "positive" when its label expects at
    least one candidate:

    * true positive  - expected > 0 and the extractor produced > 0
    * false positive - expected == 0 and the extractor produced > 0
    * false negative - expected > 0 and the extractor produced 0

    Results with a negative ``actual_count`` were never run through the
    extractor (``score`` uses ``-1`` for labels missing from the snapshot).
    They are excluded from the denominators of ``yield_rate`` and
    ``recall``, because a data gap is not an extractor miss, and are
    reported separately as ``missing_from_snapshot``. When every label was
    scored, the figures are the same as if no exclusion were applied.

    Returns a dict with the keys ``total``, ``scored``,
    ``missing_from_snapshot``, ``exact_match``, ``positive_expected``,
    ``total_expected_candidates``, ``total_actual_candidates``,
    ``yield_rate``, ``recall``, ``precision``,
    ``false_positive_interactions``, ``false_negative_interactions`` and
    ``miss_classes`` (false-negative count per miss class). Rates are
    rounded to three decimals and are 0.0 when their denominator is zero.
    """
    total = len(results)
    # Only results that actually went through the extractor can say anything
    # about extractor quality.
    scored = [r for r in results if r.actual_count >= 0]
    missing_from_snapshot = total - len(scored)

    exact_match = sum(1 for r in results if r.ok)
    true_positive = sum(
        1 for r in scored if r.expected_count > 0 and r.actual_count > 0
    )
    false_positive_interactions = sum(
        1 for r in scored if r.expected_count == 0 and r.actual_count > 0
    )
    false_negatives = [
        r for r in scored if r.expected_count > 0 and r.actual_count == 0
    ]
    false_negative_interactions = len(false_negatives)

    # Corpus-level label statistics cover every label, scored or not.
    positive_expected = sum(1 for r in results if r.expected_count > 0)
    total_expected_candidates = sum(r.expected_count for r in results)
    total_actual_candidates = sum(r.actual_count for r in scored)

    # Candidates produced per scored interaction.
    yield_rate = total_actual_candidates / len(scored) if scored else 0.0
    # Recall over scored interactions that had at least one expected
    # candidate (equals true_positive + false_negative_interactions).
    recall_denom = true_positive + false_negative_interactions
    recall = true_positive / recall_denom if recall_denom else 0.0
    # Precision over interactions that produced any candidate.
    precision_denom = true_positive + false_positive_interactions
    precision = true_positive / precision_denom if precision_denom else 0.0

    # Miss class breakdown of the false negatives.
    miss_classes: dict[str, int] = {}
    for r in false_negatives:
        key = r.miss_class or "unlabeled"
        miss_classes[key] = miss_classes.get(key, 0) + 1

    return {
        "total": total,
        "exact_match": exact_match,
        "positive_expected": positive_expected,
        "total_expected_candidates": total_expected_candidates,
        "total_actual_candidates": total_actual_candidates,
        "yield_rate": round(yield_rate, 3),
        "recall": round(recall, 3),
        "precision": round(precision, 3),
        "false_positive_interactions": false_positive_interactions,
        "false_negative_interactions": false_negative_interactions,
        "miss_classes": miss_classes,
        "scored": len(scored),
        "missing_from_snapshot": missing_from_snapshot,
    }


def print_human(results: list[LabelResult], summary: dict) -> None:
    """Print the plain-text scorecard to stdout.

    Layout: three headline lines built from *summary*, the false-negative
    miss-class breakdown ordered by descending count, then one line per
    result followed by an 80-character preview of each extracted candidate.
    """
    print("=== Extractor eval ===")
    headline_rows = (
        (
            ("labeled", "total"),
            ("exact_match", "exact_match"),
            ("positive_expected", "positive_expected"),
        ),
        (
            ("yield", "yield_rate"),
            ("recall", "recall"),
            ("precision", "precision"),
        ),
        (
            ("false_positives", "false_positive_interactions"),
            ("false_negatives", "false_negative_interactions"),
        ),
    )
    for row in headline_rows:
        print("  ".join(f"{caption}={summary[key]}" for caption, key in row))
    print()

    print("miss class breakdown (FN):")
    breakdown = summary["miss_classes"]
    if not breakdown:
        print("  (none)")
    else:
        # Stable descending sort: classes with equal counts keep dict order.
        ranked = sorted(breakdown.items(), key=lambda pair: pair[1], reverse=True)
        for miss_class, count in ranked:
            print(f"  {count:3d}  {miss_class}")
    print()

    print("per-interaction:")
    for result in results:
        status = "MISS" if not result.ok else "OK  "
        print(
            f"  {status}  {result.id[:8]}  expected={result.expected_count}  "
            f"actual={result.actual_count}  class={result.miss_class}"
        )
        for candidate in result.actual_candidates or ():
            snippet = (candidate["content"] or "")[:80]
            print(f"         [{candidate['memory_type']}] {snippet}")


def print_json(results: list[LabelResult], summary: dict) -> None:
    """Write the scorecard to stdout as one indented JSON document.

    The document has two keys: ``"summary"`` (passed through unchanged) and
    ``"results"`` (one object per result). A trailing newline is written
    after the JSON text.
    """
    exported_fields = (
        "id",
        "expected_count",
        "actual_count",
        "ok",
        "miss_class",
        "notes",
        "actual_candidates",
    )
    rows = []
    for result in results:
        rows.append({name: getattr(result, name) for name in exported_fields})
    document = {"summary": summary, "results": rows}
    sys.stdout.write(json.dumps(document, indent=2) + "\n")


def main() -> int:
    """CLI entry point: score the labeled corpus and print a report.

    Options:
        --snapshot  path to the frozen interactions snapshot (JSON)
        --labels    path to the labels file (JSON)
        --json      emit the machine-readable report instead of the text one

    Returns the process exit status: 0 only when there are no false-positive
    interactions, no false-negative interactions, and every labeled
    interaction was found in the snapshot; 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="AtoCore extractor eval")
    parser.add_argument("--snapshot", type=Path, default=DEFAULT_SNAPSHOT)
    parser.add_argument("--labels", type=Path, default=DEFAULT_LABELS)
    parser.add_argument("--json", action="store_true", help="emit machine-readable JSON")
    args = parser.parse_args()

    snapshot = load_snapshot(args.snapshot)
    labels = load_labels(args.labels)
    results = score(snapshot, labels)
    summary = aggregate(results)

    if args.json:
        print_json(results, summary)
    else:
        print_human(results, summary)

    # score() records actual_count=-1 for labels whose interaction is absent
    # from the snapshot. Those labels were never evaluated, so they count as
    # neither false positives nor false negatives; without this check a
    # snapshot/labels mismatch would let the eval pass silently.
    unscored_ids = [r.id for r in results if r.actual_count < 0]
    if unscored_ids:
        # stderr keeps the --json document on stdout parseable.
        print(
            f"warning: {len(unscored_ids)} labeled interaction(s) not found in "
            f"snapshot: {', '.join(unscored_ids)}",
            file=sys.stderr,
        )

    clean = (
        summary["false_negative_interactions"] == 0
        and summary["false_positive_interactions"] == 0
        and not unscored_ids
    )
    return 0 if clean else 1


if __name__ == "__main__":
    raise SystemExit(main())
feat(eval-loop): Day 1+2 — labeled extractor corpus + baseline scorecard

Day 1 (labeled corpus):
- scripts/eval_data/interactions_snapshot_2026-04-11.json — frozen snapshot of 64 real claude-code interactions pulled from live Dalidou (test-client captures filtered out). This is the stable corpus the whole mini-phase labels against, independent of future captures.
- scripts/eval_data/extractor_labels_2026-04-11.json — 20 hand-labeled interactions drawn by length-stratified random sample. Positives: 5/20 = ~25%, total expected candidates: 7. Plan deviation: Codex's plan asked for 30 (10/10/10 buckets); the real corpus is heavily skewed toward instructional/status content, so honest labeling of 20 already crosses the fail-early threshold of "at least 5 plausible positives" without padding.

Day 2 (baseline measurement):
- scripts/extractor_eval.py — file-based eval runner that loads the snapshot + labels, runs extract_candidates_from_interaction on each, and reports yield / recall / precision / miss-class breakdown. Returns exit 1 on any false positive or false negative.

Current rule extractor against the labeled set:

    labeled=20  exact_match=15  positive_expected=5
    yield=0.0  recall=0.0  precision=0.0
    false_negatives=5  false_positives=0
    miss_classes:
      recommendation_prose
      architectural_change_summary
      spec_update_announcement
      layered_recommendation
      alignment_assertion

Interpretation: the rule-based extractor matches exactly zero of the 5 plausible positive interactions in the labeled set, and the misses are spread across 5 distinct cue classes with no single dominant pattern. This is the Day 4 hard-stop signal landing on Day 2 — a single rule expansion cannot close a 5-way miss, and widening rules blindly will collapse precision. The right move is to go straight to the Day 4 decision gate and consider LLM-assisted extraction.

Escalating to DEV-LEDGER.md as R5 for human ratification before continuing. Not skipping Day 3 silently.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-04-11 15:11:33 -04:00			`"""Extractor eval runner — scores the rule-based extractor against a`
			`labeled interaction corpus.`

			`Pulls full interaction content from a frozen snapshot, runs each through`
			``extract_candidates_from_interaction``, and compares the output to the
			`expected counts from a labels file. Produces a per-label scorecard plus`
			`aggregate precision / recall / yield numbers.`

			`This harness deliberately stays file-based: snapshot + labels + this`
			`runner. No Dalidou HTTP dependency once the snapshot is frozen, so the`
			`eval is reproducible run-to-run even as live captures drift.`

			`Usage:`

			`python scripts/extractor_eval.py # human report`
			`python scripts/extractor_eval.py --json # machine-readable`
			`python scripts/extractor_eval.py \\`
			`--snapshot scripts/eval_data/interactions_snapshot_2026-04-11.json \\`
			`--labels scripts/eval_data/extractor_labels_2026-04-11.json`
			`"""`

			`from __future__ import annotations`

			`import argparse`
			`import json`
			`import sys`
			`from dataclasses import dataclass, field`
			`from pathlib import Path`

			`# Make src/ importable without requiring an install.`
			`_REPO_ROOT = Path(__file__).resolve().parent.parent`
			`sys.path.insert(0, str(_REPO_ROOT / "src"))`

			`from atocore.interactions.service import Interaction # noqa: E402`
			`from atocore.memory.extractor import extract_candidates_from_interaction # noqa: E402`

			`DEFAULT_SNAPSHOT = _REPO_ROOT / "scripts" / "eval_data" / "interactions_snapshot_2026-04-11.json"`
			`DEFAULT_LABELS = _REPO_ROOT / "scripts" / "eval_data" / "extractor_labels_2026-04-11.json"`


			`@dataclass`
			`class LabelResult:`
			`id: str`
			`expected_count: int`
			`actual_count: int`
			`ok: bool`
			`miss_class: str`
			`notes: str`
			`actual_candidates: list[dict] = field(default_factory=list)`


			`def load_snapshot(path: Path) -> dict[str, dict]:`
			`data = json.loads(path.read_text(encoding="utf-8"))`
			`return {item["id"]: item for item in data.get("interactions", [])}`


			`def load_labels(path: Path) -> dict:`
			`return json.loads(path.read_text(encoding="utf-8"))`


			`def interaction_from_snapshot(snap: dict) -> Interaction:`
			`return Interaction(`
			`id=snap["id"],`
			`prompt=snap.get("prompt", "") or "",`
			`response=snap.get("response", "") or "",`
			`response_summary="",`
			`project=snap.get("project", "") or "",`
			`client=snap.get("client", "") or "",`
			`session_id=snap.get("session_id", "") or "",`
			`created_at=snap.get("created_at", "") or "",`
			`)`


			`def score(snapshot: dict[str, dict], labels_doc: dict) -> list[LabelResult]:`
			`results: list[LabelResult] = []`
			`for label in labels_doc["labels"]:`
			`iid = label["id"]`
			`snap = snapshot.get(iid)`
			`if snap is None:`
			`results.append(`
			`LabelResult(`
			`id=iid,`
			`expected_count=int(label.get("expected_count", 0)),`
			`actual_count=-1,`
			`ok=False,`
			`miss_class="not_in_snapshot",`
			`notes=label.get("notes", ""),`
			`)`
			`)`
			`continue`
			`interaction = interaction_from_snapshot(snap)`
			`candidates = extract_candidates_from_interaction(interaction)`
			`actual_count = len(candidates)`
			`expected_count = int(label.get("expected_count", 0))`
			`results.append(`
			`LabelResult(`
			`id=iid,`
			`expected_count=expected_count,`
			`actual_count=actual_count,`
			`ok=(actual_count == expected_count),`
			`miss_class=label.get("miss_class", "n/a"),`
			`notes=label.get("notes", ""),`
			`actual_candidates=[`
			`{`
			`"memory_type": c.memory_type,`
			`"content": c.content,`
			`"project": c.project,`
			`"rule": c.rule,`
			`}`
			`for c in candidates`
			`],`
			`)`
			`)`
			`return results`


			`def aggregate(results: list[LabelResult]) -> dict:`
			`total = len(results)`
			`exact_match = sum(1 for r in results if r.ok)`
			`true_positive = sum(1 for r in results if r.expected_count > 0 and r.actual_count > 0)`
			`false_positive_interactions = sum(`
			`1 for r in results if r.expected_count == 0 and r.actual_count > 0`
			`)`
			`false_negative_interactions = sum(`
			`1 for r in results if r.expected_count > 0 and r.actual_count == 0`
			`)`
			`positive_expected = sum(1 for r in results if r.expected_count > 0)`
			`total_expected_candidates = sum(r.expected_count for r in results)`
			`total_actual_candidates = sum(max(r.actual_count, 0) for r in results)`
			`yield_rate = total_actual_candidates / total if total else 0.0`
			`# Recall over interaction count that had at least one expected candidate:`
			`recall = true_positive / positive_expected if positive_expected else 0.0`
			`# Precision over interaction count that produced any candidate:`
			`precision_denom = true_positive + false_positive_interactions`
			`precision = true_positive / precision_denom if precision_denom else 0.0`
			`# Miss class breakdown`
			`miss_classes: dict[str, int] = {}`
			`for r in results:`
			`if r.expected_count > 0 and r.actual_count == 0:`
			`key = r.miss_class or "unlabeled"`
			`miss_classes[key] = miss_classes.get(key, 0) + 1`
			`return {`
			`"total": total,`
			`"exact_match": exact_match,`
			`"positive_expected": positive_expected,`
			`"total_expected_candidates": total_expected_candidates,`
			`"total_actual_candidates": total_actual_candidates,`
			`"yield_rate": round(yield_rate, 3),`
			`"recall": round(recall, 3),`
			`"precision": round(precision, 3),`
			`"false_positive_interactions": false_positive_interactions,`
			`"false_negative_interactions": false_negative_interactions,`
			`"miss_classes": miss_classes,`
			`}`


			`def print_human(results: list[LabelResult], summary: dict) -> None:`
			`print("=== Extractor eval ===")`
			`print(`
			`f"labeled={summary['total']} "`
			`f"exact_match={summary['exact_match']} "`
			`f"positive_expected={summary['positive_expected']}"`
			`)`
			`print(`
			`f"yield={summary['yield_rate']} "`
			`f"recall={summary['recall']} "`
			`f"precision={summary['precision']}"`
			`)`
			`print(`
			`f"false_positives={summary['false_positive_interactions']} "`
			`f"false_negatives={summary['false_negative_interactions']}"`
			`)`
			`print()`
			`print("miss class breakdown (FN):")`
			`if summary["miss_classes"]:`
			`for k, v in sorted(summary["miss_classes"].items(), key=lambda kv: -kv[1]):`
			`print(f" {v:3d} {k}")`
			`else:`
			`print(" (none)")`
			`print()`
			`print("per-interaction:")`
			`for r in results:`
			`marker = "OK " if r.ok else "MISS"`
			`iid_short = r.id[:8]`
			`print(f" {marker} {iid_short} expected={r.expected_count} actual={r.actual_count} class={r.miss_class}")`
			`if r.actual_candidates:`
			`for c in r.actual_candidates:`
			`preview = (c["content"] or "")[:80]`
			`print(f" [{c['memory_type']}] {preview}")`


			`def print_json(results: list[LabelResult], summary: dict) -> None:`
			`payload = {`
			`"summary": summary,`
			`"results": [`
			`{`
			`"id": r.id,`
			`"expected_count": r.expected_count,`
			`"actual_count": r.actual_count,`
			`"ok": r.ok,`
			`"miss_class": r.miss_class,`
			`"notes": r.notes,`
			`"actual_candidates": r.actual_candidates,`
			`}`
			`for r in results`
			`],`
			`}`
			`json.dump(payload, sys.stdout, indent=2)`
			`sys.stdout.write("\n")`


			`def main() -> int:`
			`parser = argparse.ArgumentParser(description="AtoCore extractor eval")`
			`parser.add_argument("--snapshot", type=Path, default=DEFAULT_SNAPSHOT)`
			`parser.add_argument("--labels", type=Path, default=DEFAULT_LABELS)`
			`parser.add_argument("--json", action="store_true", help="emit machine-readable JSON")`
			`args = parser.parse_args()`

			`snapshot = load_snapshot(args.snapshot)`
			`labels = load_labels(args.labels)`
			`results = score(snapshot, labels)`
			`summary = aggregate(results)`

			`if args.json:`
			`print_json(results, summary)`
			`else:`
			`print_human(results, summary)`

			`return 0 if summary["false_negative_interactions"] == 0 and summary["false_positive_interactions"] == 0 else 1`


			`if __name__ == "__main__":`
			`raise SystemExit(main())`