feat(eval-loop): Day 1+2 — labeled extractor corpus + baseline scorecard
Day 1 (labeled corpus):
- scripts/eval_data/interactions_snapshot_2026-04-11.json — frozen
snapshot of 64 real claude-code interactions pulled from live
Dalidou (test-client captures filtered out). This is the stable
corpus the whole mini-phase labels against, independent of future
captures.
- scripts/eval_data/extractor_labels_2026-04-11.json — 20 hand-labeled
interactions drawn by length-stratified random sample. Positives:
5/20 = ~25%, total expected candidates: 7. Plan deviation: Codex's
plan asked for 30 (10/10/10 buckets); the real corpus is heavily
skewed toward instructional/status content, so honest labeling of
20 already crosses the fail-early threshold of "at least 5 plausible
positives" without padding.
Day 2 (baseline measurement):
- scripts/extractor_eval.py — file-based eval runner that loads the
snapshot + labels, runs extract_candidates_from_interaction on each,
and reports yield / recall / precision / miss-class breakdown.
Returns exit 1 on any false positive or false negative.
Current rule extractor against the labeled set:
labeled=20 exact_match=15 positive_expected=5
yield=0.0 recall=0.0 precision=0.0
false_negatives=5 false_positives=0
miss_classes:
recommendation_prose
architectural_change_summary
spec_update_announcement
layered_recommendation
alignment_assertion
Interpretation: the rule-based extractor matches exactly zero of the
5 plausible positive interactions in the labeled set, and the misses
are spread across 5 distinct cue classes with no single dominant
pattern. This is the Day 4 hard-stop signal landing on Day 2 — a
single rule expansion cannot close a 5-way miss, and widening rules
blindly will collapse precision. The right move is to go straight to
the Day 4 decision gate and consider LLM-assisted extraction.
Escalating to DEV-LEDGER.md as R5 for human ratification before
continuing. Not skipping Day 3 silently.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
145
scripts/eval_data/extractor_labels_2026-04-11.json
Normal file
145
scripts/eval_data/extractor_labels_2026-04-11.json
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
{
|
||||||
|
"version": "0.1",
|
||||||
|
"frozen_at": "2026-04-11",
|
||||||
|
"snapshot_file": "scripts/eval_data/interactions_snapshot_2026-04-11.json",
|
||||||
|
"labeled_count": 20,
|
||||||
|
"plan_deviation": "Codex's plan called for 30 labeled interactions (10 zero / 10 plausible / 10 ambiguous). Actual corpus is heavily skewed toward instructional/status content; after reading 20 drawn by length-stratified random sample, the honest positive rate is ~25% (5/20). Labeling more would mostly add zeros; the Day 2 measurement is not bottlenecked on sample size.",
|
||||||
|
"positive_count": 5,
|
||||||
|
"labels": [
|
||||||
|
{
|
||||||
|
"id": "ab239158-d6ac-4c51-b6e4-dd4ccea384a2",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "Instructional deploy guidance. No durable claim."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "da153f2a-b20a-4dee-8c72-431ebb71f08c",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "'Deploy still in progress.' Pure status."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "7d8371ee-c6d3-4dfe-a7b0-2d091f075c15",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "Git command walkthrough. No durable claim."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "14bf3f90-e318-466e-81ac-d35522741ba5",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "Ledger status update. Transient fact, not a durable memory candidate."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "8f855235-c38d-4c27-9f2b-8530ebe1a2d8",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "Short-term recommendation ('merge to main and deploy'), not a standing decision."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "04a96eb5-cd00-4e9f-9252-b2cc919000a4",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "Dev server config table. Operational detail, not a memory."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "79d606ed-8981-454a-83af-c25226b1b65c",
|
||||||
|
"expected_count": 1,
|
||||||
|
"expected_type": "adaptation",
|
||||||
|
"expected_project": "",
|
||||||
|
"expected_snippet": "shared DEV-LEDGER as operating memory",
|
||||||
|
"miss_class": "recommendation_prose",
|
||||||
|
"notes": "A recommendation that later became a ratified decision. Rule extractor would need a 'simplest version that could work today' / 'I'd start with' cue class."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "a6b0d279-c564-4bce-a703-e476f4a148ad",
|
||||||
|
"expected_count": 2,
|
||||||
|
"expected_type": "project",
|
||||||
|
"expected_project": "p06-polisher",
|
||||||
|
"expected_snippet": "z_engaged bool; cam amplitude set mechanically and read by encoders",
|
||||||
|
"miss_class": "architectural_change_summary",
|
||||||
|
"notes": "Two durable architectural facts about the polisher machine (Z-axis is engage/retract, cam is read-only). Extractor would need to recognize 'A is now B' / 'X removed, Y added' patterns."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "4e00e398-2e89-4653-8ee5-3f65c7f4d2d3",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "Clarification question to user."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "a6a7816a-7590-4616-84f4-49d9054c2a91",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "Instructional response offering two next moves."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "03527502-316a-4a3e-989c-00719392c7d1",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "Troubleshooting a paste failure. Ephemeral."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "1fff59fc-545f-42df-9dd1-a0e6dec1b7ee",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "Agreement + follow-up question. No durable claim."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "eb65dc18-0030-4720-ace7-f55af9df719d",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "Explanation of how the capture hook works. Instructional."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "52c8c0f3-32fb-4b48-9065-73c778a08417",
|
||||||
|
"expected_count": 1,
|
||||||
|
"expected_type": "project",
|
||||||
|
"expected_project": "p06-polisher",
|
||||||
|
"expected_snippet": "USB SSD mandatory on RPi; Tailscale for remote access",
|
||||||
|
"miss_class": "spec_update_announcement",
|
||||||
|
"notes": "Concrete architectural commitments just added to the polisher spec. Phrased as '§17.1 Local Storage - USB SSD mandatory, not SD card.' The '§' section markers could be a new cue."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "32d40414-15af-47ee-944b-2cceae9574b8",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "Session recap. Historical summary, not a durable memory."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "b6d2cdfc-37fb-459a-96bd-caefb9beaab4",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "Deployment prompt for Dalidou. Operational, not a memory."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "ee03d823-931b-4d4e-9258-88b4ed5eeb07",
|
||||||
|
"expected_count": 2,
|
||||||
|
"expected_type": "knowledge",
|
||||||
|
"expected_project": "p06-polisher",
|
||||||
|
"expected_snippet": "USB SSD is non-negotiable for local storage; Tailscale mesh for SSH/file transfer",
|
||||||
|
"miss_class": "layered_recommendation",
|
||||||
|
"notes": "Layered infra recommendation with 'non-negotiable' / 'strongly recommended' strength markers. The 'non-negotiable' token could be a new cue class."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "dd234d9f-0d1c-47e8-b01c-eebcb568c7e7",
|
||||||
|
"expected_count": 1,
|
||||||
|
"expected_type": "project",
|
||||||
|
"expected_project": "p06-polisher",
|
||||||
|
"expected_snippet": "interface contract is identical regardless of who generates the programs; machine is a standalone box",
|
||||||
|
"miss_class": "alignment_assertion",
|
||||||
|
"notes": "Architectural invariant assertion. '**Alignment verified**' / 'nothing changes for X' style. Likely too subtle for rule matching without LLM assistance."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "1f95891a-cf37-400e-9d68-4fad8e04dcbb",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "Huge session handoff prompt. Informational only."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "5580950f-d010-4544-be4b-b3071271a698",
|
||||||
|
"expected_count": 0,
|
||||||
|
"miss_class": "n/a",
|
||||||
|
"notes": "Ledger schema sketch. Structural design proposal, later ratified — but the same idea was already captured as a ratified decision in the recent decisions section, so not worth re-extracting from this conversational form."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
1
scripts/eval_data/interactions_snapshot_2026-04-11.json
Normal file
1
scripts/eval_data/interactions_snapshot_2026-04-11.json
Normal file
File diff suppressed because one or more lines are too long
233
scripts/extractor_eval.py
Normal file
233
scripts/extractor_eval.py
Normal file
@@ -0,0 +1,233 @@
|
|||||||
|
"""Extractor eval runner — scores the rule-based extractor against a
|
||||||
|
labeled interaction corpus.
|
||||||
|
|
||||||
|
Pulls full interaction content from a frozen snapshot, runs each through
|
||||||
|
``extract_candidates_from_interaction``, and compares the output to the
|
||||||
|
expected counts from a labels file. Produces a per-label scorecard plus
|
||||||
|
aggregate precision / recall / yield numbers.
|
||||||
|
|
||||||
|
This harness deliberately stays file-based: snapshot + labels + this
|
||||||
|
runner. No Dalidou HTTP dependency once the snapshot is frozen, so the
|
||||||
|
eval is reproducible run-to-run even as live captures drift.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
|
||||||
|
python scripts/extractor_eval.py # human report
|
||||||
|
python scripts/extractor_eval.py --json # machine-readable
|
||||||
|
python scripts/extractor_eval.py \\
|
||||||
|
--snapshot scripts/eval_data/interactions_snapshot_2026-04-11.json \\
|
||||||
|
--labels scripts/eval_data/extractor_labels_2026-04-11.json
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Make src/ importable without requiring an install.
|
||||||
|
_REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||||
|
sys.path.insert(0, str(_REPO_ROOT / "src"))
|
||||||
|
|
||||||
|
from atocore.interactions.service import Interaction # noqa: E402
|
||||||
|
from atocore.memory.extractor import extract_candidates_from_interaction # noqa: E402
|
||||||
|
|
||||||
|
DEFAULT_SNAPSHOT = _REPO_ROOT / "scripts" / "eval_data" / "interactions_snapshot_2026-04-11.json"
|
||||||
|
DEFAULT_LABELS = _REPO_ROOT / "scripts" / "eval_data" / "extractor_labels_2026-04-11.json"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class LabelResult:
    """Per-interaction scoring outcome: expected vs. actual candidate counts."""

    id: str  # interaction id, taken from the labels file
    expected_count: int  # hand-labeled number of expected memory candidates
    actual_count: int  # extractor output count; -1 when the id is absent from the snapshot
    ok: bool  # True when actual_count exactly equals expected_count
    miss_class: str  # labeled cue class for misses ("n/a" on zero-expected rows, "not_in_snapshot" when unresolvable)
    notes: str  # free-form labeling notes, echoed into reports
    actual_candidates: list[dict] = field(default_factory=list)  # extractor output details (memory_type/content/project/rule)
|
||||||
|
|
||||||
|
|
||||||
|
def load_snapshot(path: Path) -> dict[str, dict]:
    """Read the frozen snapshot file and index its interactions by id."""
    document = json.loads(path.read_text(encoding="utf-8"))
    indexed: dict[str, dict] = {}
    for record in document.get("interactions", []):
        indexed[record["id"]] = record
    return indexed
|
||||||
|
|
||||||
|
|
||||||
|
def load_labels(path: Path) -> dict:
    """Parse the labels document and return it verbatim."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
||||||
|
|
||||||
|
|
||||||
|
def interaction_from_snapshot(snap: dict) -> Interaction:
    """Rehydrate an Interaction object from a single snapshot record.

    Missing or null snapshot fields are normalized to empty strings;
    response_summary is intentionally left blank (not captured in the
    snapshot).
    """

    def text(key: str) -> str:
        # Treat both absent keys and explicit nulls as empty strings.
        return snap.get(key, "") or ""

    return Interaction(
        id=snap["id"],
        prompt=text("prompt"),
        response=text("response"),
        response_summary="",
        project=text("project"),
        client=text("client"),
        session_id=text("session_id"),
        created_at=text("created_at"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def score(snapshot: dict[str, dict], labels_doc: dict) -> list[LabelResult]:
    """Run the extractor over every labeled interaction and record outcomes.

    Labels whose id is absent from the snapshot are recorded with
    actual_count == -1 and miss_class "not_in_snapshot" rather than
    raising, so one stale label does not abort the whole eval.
    """
    results: list[LabelResult] = []
    for label in labels_doc["labels"]:
        iid = label["id"]
        expected = int(label.get("expected_count", 0))
        notes = label.get("notes", "")

        record = snapshot.get(iid)
        if record is None:
            # Label references an interaction the frozen snapshot lacks.
            results.append(
                LabelResult(
                    id=iid,
                    expected_count=expected,
                    actual_count=-1,
                    ok=False,
                    miss_class="not_in_snapshot",
                    notes=notes,
                )
            )
            continue

        candidates = extract_candidates_from_interaction(
            interaction_from_snapshot(record)
        )
        produced = len(candidates)
        results.append(
            LabelResult(
                id=iid,
                expected_count=expected,
                actual_count=produced,
                ok=produced == expected,
                miss_class=label.get("miss_class", "n/a"),
                notes=notes,
                actual_candidates=[
                    {
                        "memory_type": c.memory_type,
                        "content": c.content,
                        "project": c.project,
                        "rule": c.rule,
                    }
                    for c in candidates
                ],
            )
        )
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def aggregate(results: list[LabelResult]) -> dict:
    """Roll per-label results up into the scorecard summary dict.

    All rates are computed at interaction granularity, not candidate
    granularity:

    - true positive:  expected > 0 and the extractor produced anything
    - false positive: expected == 0 but the extractor produced something
    - false negative: expected > 0 and the extractor produced nothing,
      INCLUDING labels whose interaction was missing from the snapshot
      (actual_count == -1) — previously the `== 0` test let those slip
      past the exit-code gate when the snapshot was stale or broken.

    Returns a plain dict of ints/floats ready for JSON serialization.
    """
    total = len(results)
    exact_match = sum(1 for r in results if r.ok)
    true_positive = sum(1 for r in results if r.expected_count > 0 and r.actual_count > 0)
    false_positive_interactions = sum(
        1 for r in results if r.expected_count == 0 and r.actual_count > 0
    )
    # <= 0 (not == 0) so not-in-snapshot positives (actual_count == -1)
    # still count as misses and flip the exit code.
    false_negative_interactions = sum(
        1 for r in results if r.expected_count > 0 and r.actual_count <= 0
    )
    positive_expected = sum(1 for r in results if r.expected_count > 0)
    total_expected_candidates = sum(r.expected_count for r in results)
    # Clamp the -1 sentinel to 0 so it cannot deflate the yield total.
    total_actual_candidates = sum(max(r.actual_count, 0) for r in results)
    yield_rate = total_actual_candidates / total if total else 0.0
    # Recall over interactions that had at least one expected candidate:
    recall = true_positive / positive_expected if positive_expected else 0.0
    # Precision over interactions that produced any candidate:
    precision_denom = true_positive + false_positive_interactions
    precision = true_positive / precision_denom if precision_denom else 0.0
    # Miss-class breakdown over false negatives (same <= 0 rule as above;
    # not-in-snapshot misses surface under their "not_in_snapshot" class).
    miss_classes: dict[str, int] = {}
    for r in results:
        if r.expected_count > 0 and r.actual_count <= 0:
            key = r.miss_class or "unlabeled"
            miss_classes[key] = miss_classes.get(key, 0) + 1
    return {
        "total": total,
        "exact_match": exact_match,
        "positive_expected": positive_expected,
        "total_expected_candidates": total_expected_candidates,
        "total_actual_candidates": total_actual_candidates,
        "yield_rate": round(yield_rate, 3),
        "recall": round(recall, 3),
        "precision": round(precision, 3),
        "false_positive_interactions": false_positive_interactions,
        "false_negative_interactions": false_negative_interactions,
        "miss_classes": miss_classes,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def print_human(results: list[LabelResult], summary: dict) -> None:
    """Render the scorecard as a human-readable report on stdout."""
    print("=== Extractor eval ===")
    print(
        f"labeled={summary['total']} "
        f"exact_match={summary['exact_match']} "
        f"positive_expected={summary['positive_expected']}"
    )
    print(
        f"yield={summary['yield_rate']} "
        f"recall={summary['recall']} "
        f"precision={summary['precision']}"
    )
    print(
        f"false_positives={summary['false_positive_interactions']} "
        f"false_negatives={summary['false_negative_interactions']}"
    )
    print()
    print("miss class breakdown (FN):")
    breakdown = summary["miss_classes"]
    if not breakdown:
        print(" (none)")
    else:
        # Most frequent class first; ties keep insertion order (stable sort).
        for name, count in sorted(breakdown.items(), key=lambda item: item[1], reverse=True):
            print(f" {count:3d} {name}")
    print()
    print("per-interaction:")
    for entry in results:
        status = "OK " if entry.ok else "MISS"
        print(
            f" {status} {entry.id[:8]} expected={entry.expected_count}"
            f" actual={entry.actual_count} class={entry.miss_class}"
        )
        # Empty candidate lists simply produce no detail lines.
        for cand in entry.actual_candidates:
            preview = (cand["content"] or "")[:80]
            print(f" [{cand['memory_type']}] {preview}")
|
||||||
|
|
||||||
|
|
||||||
|
def print_json(results: list[LabelResult], summary: dict) -> None:
    """Emit the full scorecard as pretty-printed JSON on stdout."""
    rows = []
    for entry in results:
        rows.append(
            {
                "id": entry.id,
                "expected_count": entry.expected_count,
                "actual_count": entry.actual_count,
                "ok": entry.ok,
                "miss_class": entry.miss_class,
                "notes": entry.notes,
                "actual_candidates": entry.actual_candidates,
            }
        )
    json.dump({"summary": summary, "results": rows}, sys.stdout, indent=2)
    sys.stdout.write("\n")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """CLI entry point for the extractor eval.

    Returns 0 only when the extractor shows neither false positives nor
    false negatives against the labeled corpus; 1 otherwise, so CI can
    gate directly on the scorecard.
    """
    parser = argparse.ArgumentParser(description="AtoCore extractor eval")
    parser.add_argument("--snapshot", type=Path, default=DEFAULT_SNAPSHOT)
    parser.add_argument("--labels", type=Path, default=DEFAULT_LABELS)
    parser.add_argument("--json", action="store_true", help="emit machine-readable JSON")
    args = parser.parse_args()

    results = score(load_snapshot(args.snapshot), load_labels(args.labels))
    summary = aggregate(results)

    emit = print_json if args.json else print_human
    emit(results, summary)

    clean = (
        summary["false_negative_interactions"] == 0
        and summary["false_positive_interactions"] == 0
    )
    return 0 if clean else 1


if __name__ == "__main__":
    raise SystemExit(main())
|
||||||
Reference in New Issue
Block a user