"""Retrieval quality eval harness. Runs a fixed set of project-hinted questions against ``POST /context/build`` on a live AtoCore instance and scores the resulting ``formatted_context`` against per-question expectations. The goal is a diffable scorecard that tells you, run-to-run, whether a retrieval / builder / ingestion change moved the needle. Design notes ------------ - Fixtures live in ``scripts/retrieval_eval_fixtures.json`` so new questions can be added without touching Python. Each fixture names the project, the prompt, and a checklist of substrings that MUST appear in ``formatted_context`` (``expect_present``) and substrings that MUST NOT appear (``expect_absent``). The absent list catches cross-project bleed and stale content. - The checklist is deliberately substring-based (not regex, not embedding-similarity) so a failure is always a trivially reproducible "this string is not in that string". Richer scoring can come later once we know the harness is useful. - The harness is external to the app runtime and talks to AtoCore over HTTP, so it works against dev, staging, or prod. It follows the same environment-variable contract as ``atocore_client.py`` (``ATOCORE_BASE_URL``, ``ATOCORE_TIMEOUT_SECONDS``). - Exit code 0 on all-pass, 1 on any fixture failure. Intended for manual runs today; a future cron / CI hook can consume the JSON output via ``--json``. Usage ----- python scripts/retrieval_eval.py # human-readable report python scripts/retrieval_eval.py --json # machine-readable python scripts/retrieval_eval.py --fixtures path/to/custom.json """ from __future__ import annotations import argparse import json import os import sys import urllib.error import urllib.parse import urllib.request from dataclasses import dataclass, field from pathlib import Path DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://dalidou:8100") DEFAULT_TIMEOUT = int(os.environ.get("ATOCORE_TIMEOUT_SECONDS", "30")) DEFAULT_BUDGET = 3000 DEFAULT_FIXTURES = Path(__file__).parent / "retrieval_eval_fixtures.json" @dataclass class Fixture: name: str project: str prompt: str budget: int = DEFAULT_BUDGET expect_present: list[str] = field(default_factory=list) expect_absent: list[str] = field(default_factory=list) notes: str = "" @dataclass class FixtureResult: fixture: Fixture ok: bool missing_present: list[str] unexpected_absent: list[str] total_chars: int error: str = "" def load_fixtures(path: Path) -> list[Fixture]: data = json.loads(path.read_text(encoding="utf-8")) if not isinstance(data, list): raise ValueError(f"{path} must contain a JSON array of fixtures") fixtures: list[Fixture] = [] for i, raw in enumerate(data): if not isinstance(raw, dict): raise ValueError(f"fixture {i} is not an object") fixtures.append( Fixture( name=raw["name"], project=raw.get("project", ""), prompt=raw["prompt"], budget=int(raw.get("budget", DEFAULT_BUDGET)), expect_present=list(raw.get("expect_present", [])), expect_absent=list(raw.get("expect_absent", [])), notes=raw.get("notes", ""), ) ) return fixtures def run_fixture(fixture: Fixture, base_url: str, timeout: int) -> FixtureResult: payload = { "prompt": fixture.prompt, "project": fixture.project or None, "budget": fixture.budget, } req = urllib.request.Request( url=f"{base_url}/context/build", method="POST", headers={"Content-Type": "application/json"}, data=json.dumps(payload).encode("utf-8"), ) try: with urllib.request.urlopen(req, timeout=timeout) as resp: body = json.loads(resp.read().decode("utf-8")) except urllib.error.URLError as exc: return FixtureResult( fixture=fixture, ok=False, missing_present=list(fixture.expect_present), unexpected_absent=[], total_chars=0, error=f"http_error: {exc}", ) formatted = body.get("formatted_context") or "" missing = [s for s in fixture.expect_present if s not in formatted] unexpected = [s for s in fixture.expect_absent if s in formatted] return FixtureResult( fixture=fixture, ok=not missing and not unexpected, missing_present=missing, unexpected_absent=unexpected, total_chars=len(formatted), ) def print_human_report(results: list[FixtureResult]) -> None: total = len(results) passed = sum(1 for r in results if r.ok) print(f"Retrieval eval: {passed}/{total} fixtures passed") print() for r in results: marker = "PASS" if r.ok else "FAIL" print(f"[{marker}] {r.fixture.name} project={r.fixture.project} chars={r.total_chars}") if r.error: print(f" error: {r.error}") for miss in r.missing_present: print(f" missing expected: {miss!r}") for bleed in r.unexpected_absent: print(f" unexpected present: {bleed!r}") if r.fixture.notes and not r.ok: print(f" notes: {r.fixture.notes}") def print_json_report(results: list[FixtureResult]) -> None: payload = { "total": len(results), "passed": sum(1 for r in results if r.ok), "fixtures": [ { "name": r.fixture.name, "project": r.fixture.project, "ok": r.ok, "total_chars": r.total_chars, "missing_present": r.missing_present, "unexpected_absent": r.unexpected_absent, "error": r.error, } for r in results ], } json.dump(payload, sys.stdout, indent=2) sys.stdout.write("\n") def main() -> int: parser = argparse.ArgumentParser(description="AtoCore retrieval quality eval harness") parser.add_argument("--base-url", default=DEFAULT_BASE_URL) parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT) parser.add_argument("--fixtures", type=Path, default=DEFAULT_FIXTURES) parser.add_argument("--json", action="store_true", help="emit machine-readable JSON") args = parser.parse_args() fixtures = load_fixtures(args.fixtures) results = [run_fixture(f, args.base_url, args.timeout) for f in fixtures] if args.json: print_json_report(results) else: print_human_report(results) return 0 if all(r.ok for r in results) else 1 if __name__ == "__main__": raise SystemExit(main())