2026-04-11 12:39:03 -04:00
|
|
|
"""Retrieval quality eval harness.
|
|
|
|
|
|
|
|
|
|
Runs a fixed set of project-hinted questions against
|
|
|
|
|
``POST /context/build`` on a live AtoCore instance and scores the
|
|
|
|
|
resulting ``formatted_context`` against per-question expectations.
|
|
|
|
|
The goal is a diffable scorecard that tells you, run-to-run,
|
|
|
|
|
whether a retrieval / builder / ingestion change moved the needle.
|
|
|
|
|
|
|
|
|
|
Design notes
|
|
|
|
|
------------
|
|
|
|
|
- Fixtures live in ``scripts/retrieval_eval_fixtures.json`` so new
|
|
|
|
|
questions can be added without touching Python. Each fixture
|
|
|
|
|
names the project, the prompt, and a checklist of substrings that
|
|
|
|
|
MUST appear in ``formatted_context`` (``expect_present``) and
|
|
|
|
|
substrings that MUST NOT appear (``expect_absent``). The absent
|
|
|
|
|
list catches cross-project bleed and stale content.
|
|
|
|
|
- The checklist is deliberately substring-based (not regex, not
|
|
|
|
|
embedding-similarity) so a failure is always a trivially
|
|
|
|
|
reproducible "this string is not in that string". Richer scoring
|
|
|
|
|
can come later once we know the harness is useful.
|
|
|
|
|
- The harness is external to the app runtime and talks to AtoCore
|
|
|
|
|
over HTTP, so it works against dev, staging, or prod. It follows
|
|
|
|
|
the same environment-variable contract as ``atocore_client.py``
|
|
|
|
|
(``ATOCORE_BASE_URL``, ``ATOCORE_TIMEOUT_SECONDS``).
|
|
|
|
|
- Exit code 0 on all-pass, 1 on any fixture failure. Intended for
|
|
|
|
|
manual runs today; a future cron / CI hook can consume the
|
|
|
|
|
JSON output via ``--json``.
|
|
|
|
|
|
|
|
|
|
Usage
|
|
|
|
|
-----
|
|
|
|
|
|
|
|
|
|
python scripts/retrieval_eval.py # human-readable report
|
|
|
|
|
python scripts/retrieval_eval.py --json # machine-readable
|
|
|
|
|
python scripts/retrieval_eval.py --fixtures path/to/custom.json
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
|
import json
|
|
|
|
|
import os
|
|
|
|
|
import sys
|
|
|
|
|
import urllib.error
|
|
|
|
|
import urllib.parse
|
|
|
|
|
import urllib.request
|
|
|
|
|
from dataclasses import dataclass, field
|
2026-04-24 10:46:56 -04:00
|
|
|
from datetime import datetime, timezone
|
2026-04-11 12:39:03 -04:00
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
# Connection defaults come from the environment (ATOCORE_BASE_URL,
# ATOCORE_TIMEOUT_SECONDS) and fall back to the literals below.
DEFAULT_BASE_URL = os.getenv("ATOCORE_BASE_URL", "http://dalidou:8100")
DEFAULT_TIMEOUT = int(os.getenv("ATOCORE_TIMEOUT_SECONDS", "30"))

# Budget used for any fixture that does not specify its own.
DEFAULT_BUDGET = 3000

# The default fixture file sits next to this script.
DEFAULT_FIXTURES = Path(__file__).with_name("retrieval_eval_fixtures.json")
|
|
|
|
|
|
|
|
|
|
|
2026-04-24 10:46:56 -04:00
|
|
|
def request_json(base_url: str, path: str, timeout: int) -> dict:
    """GET ``base_url`` + ``path`` and return the JSON-decoded response body.

    A response whose body is empty or whitespace-only yields ``{}``. Errors
    raised by ``urllib`` or by JSON decoding are not handled here and
    propagate to the caller.
    """
    url = f"{base_url}{path}"
    request = urllib.request.Request(url, method="GET")
    with urllib.request.urlopen(request, timeout=timeout) as response:
        text = response.read().decode("utf-8")
    if not text.strip():
        return {}
    return json.loads(text)
|
|
|
|
|
|
|
|
|
|
|
2026-04-11 12:39:03 -04:00
|
|
|
@dataclass
class Fixture:
    """A single eval question together with the expectations for its context."""

    # Label printed in reports.
    name: str
    # Project hint; an empty string is sent to the server as no project.
    project: str
    # Question text submitted to the context builder.
    prompt: str
    # Budget value forwarded with the request.
    budget: int = field(default=DEFAULT_BUDGET)
    # Substrings that must occur in the returned formatted context.
    expect_present: list[str] = field(default_factory=list)
    # Substrings that must not occur in the returned formatted context.
    expect_absent: list[str] = field(default_factory=list)
    # When set, a failure of this fixture is reported but is not blocking.
    known_issue: bool = field(default=False)
    # Free-form remark shown in the human report when the fixture fails.
    notes: str = field(default="")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class FixtureResult:
    """Outcome of checking one fixture against a built context."""

    # The fixture this result belongs to.
    fixture: Fixture
    # True when no expected substring is missing and no forbidden one is present.
    ok: bool
    # Expected substrings that were not found.
    missing_present: list[str]
    # Forbidden substrings that were found.
    unexpected_absent: list[str]
    # Length of the formatted context that was checked.
    total_chars: int
    # Copied from the fixture so reports can tell known failures apart.
    known_issue: bool = field(default=False)
    # Description of a request-level failure; empty when the request succeeded.
    error: str = field(default="")

    @property
    def blocking_failure(self) -> bool:
        """Return True for a failure that is not flagged as a known issue."""
        return not (self.ok or self.known_issue)
|
|
|
|
|
|
2026-04-11 12:39:03 -04:00
|
|
|
|
|
|
|
|
def load_fixtures(path: Path) -> list[Fixture]:
    """Read the fixture file at ``path`` and build one ``Fixture`` per entry.

    The file must hold a JSON array of objects. ``name`` and ``prompt`` are
    required; ``project`` defaults to ``""``, ``budget`` to
    ``DEFAULT_BUDGET``, both expectation lists to empty, ``known_issue`` to
    ``False`` and ``notes`` to ``""``.

    Raises:
        ValueError: if the top level is not an array, an entry is not an
            object, or ``expect_present`` / ``expect_absent`` is not a list
            of strings.
        KeyError: if an entry lacks ``name`` or ``prompt``.

    Errors from reading the file or decoding its JSON propagate unchanged.
    """

    def string_list(raw: dict, key: str, index: int) -> list[str]:
        """Return a copy of ``raw[key]`` after checking it is a list of strings."""
        value = raw.get(key, [])
        # A bare string must be rejected explicitly: list("foo") would
        # silently become ["f", "o", "o"] and the fixture would then check
        # single characters instead of the intended substring.
        if not isinstance(value, list) or not all(isinstance(s, str) for s in value):
            raise ValueError(f"fixture {index}: {key!r} must be a list of strings")
        return list(value)

    data = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError(f"{path} must contain a JSON array of fixtures")

    fixtures: list[Fixture] = []
    for i, raw in enumerate(data):
        if not isinstance(raw, dict):
            raise ValueError(f"fixture {i} is not an object")
        fixtures.append(
            Fixture(
                name=raw["name"],
                project=raw.get("project", ""),
                prompt=raw["prompt"],
                budget=int(raw.get("budget", DEFAULT_BUDGET)),
                expect_present=string_list(raw, "expect_present", i),
                expect_absent=string_list(raw, "expect_absent", i),
                known_issue=bool(raw.get("known_issue", False)),
                notes=raw.get("notes", ""),
            )
        )
    return fixtures
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_fixture(fixture: Fixture, base_url: str, timeout: int) -> FixtureResult:
    """Evaluate one fixture against ``POST {base_url}/context/build``.

    The fixture's prompt, project (``None`` when empty) and budget are sent
    as a JSON body. The ``formatted_context`` field of the response is then
    checked for every ``expect_present`` substring and against every
    ``expect_absent`` substring.

    This function does not raise for transport or response problems: any
    network error, timeout, undecodable body, or response that is not a JSON
    object is reported as a failed ``FixtureResult`` with ``error`` set, so
    one bad request cannot abort the rest of the run.
    """

    def failure(error: str) -> FixtureResult:
        """Build the result for a request that produced no usable context."""
        return FixtureResult(
            fixture=fixture,
            ok=False,
            missing_present=list(fixture.expect_present),
            unexpected_absent=[],
            total_chars=0,
            known_issue=fixture.known_issue,
            error=error,
        )

    payload = {
        "prompt": fixture.prompt,
        "project": fixture.project or None,
        "budget": fixture.budget,
    }
    req = urllib.request.Request(
        url=f"{base_url}/context/build",
        method="POST",
        headers={"Content-Type": "application/json"},
        data=json.dumps(payload).encode("utf-8"),
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body = json.loads(resp.read().decode("utf-8"))
    except OSError as exc:
        # URLError/HTTPError, TimeoutError (e.g. a timeout while reading the
        # body) and connection resets are all OSError subclasses.
        return failure(f"http_error: {exc}")
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        return failure(f"invalid_response: {exc}")

    if not isinstance(body, dict):
        return failure(
            f"invalid_response: expected a JSON object, got {type(body).__name__}"
        )

    formatted = body.get("formatted_context") or ""
    if not isinstance(formatted, str):
        return failure(
            "invalid_response: formatted_context is "
            f"{type(formatted).__name__}, expected a string"
        )

    missing = [s for s in fixture.expect_present if s not in formatted]
    unexpected = [s for s in fixture.expect_absent if s in formatted]
    return FixtureResult(
        fixture=fixture,
        ok=not missing and not unexpected,
        missing_present=missing,
        unexpected_absent=unexpected,
        total_chars=len(formatted),
        known_issue=fixture.known_issue,
    )
|
|
|
|
|
|
|
|
|
|
|
2026-04-24 10:46:56 -04:00
|
|
|
def print_human_report(results: list[FixtureResult], metadata: dict) -> None:
    """Print a plain-text scorecard for ``results`` to stdout.

    The header shows the pass count and the target (``base_url`` and the
    ``build_sha`` found under ``health`` in ``metadata``, each falling back
    to ``unknown``). A summary of blocking failures and known issues is
    printed only when at least one of them is non-zero. Each fixture then
    gets a status line tagged PASS, KNOWN or FAIL, followed by its error,
    missing and unexpected substrings, and (for failures only) its notes.
    """
    pass_count = len([res for res in results if res.ok])
    known_count = len([res for res in results if not res.ok and res.known_issue])
    blocking_count = len([res for res in results if res.blocking_failure])

    print(f"Retrieval eval: {pass_count}/{len(results)} fixtures passed")
    target = metadata.get("base_url", "unknown")
    build = metadata.get("health", {}).get("build_sha", "unknown")
    print(f"Target: {target} build={build}")
    if known_count or blocking_count:
        print(f"Blocking failures: {blocking_count} Known issues: {known_count}")
    print()

    for res in results:
        if res.ok:
            status = "PASS"
        elif res.known_issue:
            status = "KNOWN"
        else:
            status = "FAIL"
        fx = res.fixture
        print(f"[{status}] {fx.name} project={fx.project} chars={res.total_chars}")
        if res.error:
            print(f" error: {res.error}")
        for absent in res.missing_present:
            print(f" missing expected: {absent!r}")
        for leaked in res.unexpected_absent:
            print(f" unexpected present: {leaked!r}")
        # Notes are only useful as context for a failure.
        if fx.notes and not res.ok:
            print(f" notes: {fx.notes}")
|
|
|
|
|
|
|
|
|
|
|
2026-04-24 10:46:56 -04:00
|
|
|
def print_json_report(results: list[FixtureResult], metadata: dict) -> None:
    """Write the scorecard for ``results`` to stdout as indented JSON.

    The document carries ``generated_at``, ``base_url`` and ``health`` from
    ``metadata``, the total / passed / known-issue / blocking-failure counts,
    and one entry per fixture. Output ends with a single newline.
    """

    def as_entry(res: FixtureResult) -> dict:
        """Flatten one result into its JSON representation."""
        return {
            "name": res.fixture.name,
            "project": res.fixture.project,
            "ok": res.ok,
            "known_issue": res.known_issue,
            "total_chars": res.total_chars,
            "missing_present": res.missing_present,
            "unexpected_absent": res.unexpected_absent,
            "error": res.error,
        }

    report = {
        "generated_at": metadata.get("generated_at"),
        "base_url": metadata.get("base_url"),
        "health": metadata.get("health", {}),
        "total": len(results),
        "passed": len([res for res in results if res.ok]),
        "known_issues": len([res for res in results if not res.ok and res.known_issue]),
        "blocking_failures": len([res for res in results if res.blocking_failure]),
        "fixtures": [as_entry(res) for res in results],
    }
    json.dump(report, sys.stdout, indent=2)
    print()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> int:
    """Parse the command line, run every fixture, and print the scorecard.

    The target's ``/health`` endpoint is queried first on a best-effort
    basis; if that request fails, the error text is recorded in the report
    metadata instead of aborting. Returns 0 when no result is a blocking
    failure and 1 otherwise.
    """
    cli = argparse.ArgumentParser(description="AtoCore retrieval quality eval harness")
    cli.add_argument("--base-url", default=DEFAULT_BASE_URL)
    cli.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT)
    cli.add_argument("--fixtures", type=Path, default=DEFAULT_FIXTURES)
    cli.add_argument("--json", action="store_true", help="emit machine-readable JSON")
    opts = cli.parse_args()

    # Drop trailing slashes so joining with "/path" never yields "//".
    target = opts.base_url.rstrip("/")
    try:
        health = request_json(target, "/health", opts.timeout)
    except (urllib.error.URLError, TimeoutError, OSError, json.JSONDecodeError) as exc:
        health = {"error": str(exc)}
    metadata = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "base_url": target,
        "health": health,
    }

    loaded = load_fixtures(opts.fixtures)
    results = []
    for fixture in loaded:
        results.append(run_fixture(fixture, target, opts.timeout))

    emit = print_json_report if opts.json else print_human_report
    emit(results, metadata)

    if any(res.blocking_failure for res in results):
        return 1
    return 0
|
2026-04-11 12:39:03 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|