scripts/retrieval_eval.py

"""Retrieval quality eval harness.

Runs a fixed set of project-hinted questions against
``POST /context/build`` on a live AtoCore instance and scores the
resulting ``formatted_context`` against per-question expectations.
The goal is a diffable scorecard that tells you, run-to-run,
whether a retrieval / builder / ingestion change moved the needle.

Design notes
------------
- Fixtures live in ``scripts/retrieval_eval_fixtures.json`` so new
  questions can be added without touching Python. Each fixture
  names the project, the prompt, and a checklist of substrings that
  MUST appear in ``formatted_context`` (``expect_present``) and
  substrings that MUST NOT appear (``expect_absent``). The absent
  list catches cross-project bleed and stale content.
- The checklist is deliberately substring-based (not regex, not
  embedding-similarity) so a failure is always a trivially
  reproducible "this string is not in that string". Richer scoring
  can come later once we know the harness is useful.
- The harness is external to the app runtime and talks to AtoCore
  over HTTP, so it works against dev, staging, or prod. It follows
  the same environment-variable contract as ``atocore_client.py``
  (``ATOCORE_BASE_URL``, ``ATOCORE_TIMEOUT_SECONDS``).
- Exit code 0 on all-pass, 1 on any fixture failure. Intended for
  manual runs today; a future cron / CI hook can consume the
  JSON output via ``--json``.

Usage
-----

    python scripts/retrieval_eval.py            # human-readable report
    python scripts/retrieval_eval.py --json     # machine-readable
    python scripts/retrieval_eval.py --fixtures path/to/custom.json
"""

from __future__ import annotations

import argparse
import json
import os
import sys
import urllib.error
import urllib.parse
import urllib.request
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path

DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://dalidou:8100")
DEFAULT_TIMEOUT = int(os.environ.get("ATOCORE_TIMEOUT_SECONDS", "30"))
DEFAULT_BUDGET = 3000
DEFAULT_FIXTURES = Path(__file__).parent / "retrieval_eval_fixtures.json"


def request_json(base_url: str, path: str, timeout: int) -> dict:
    req = urllib.request.Request(f"{base_url}{path}", method="GET")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        body = resp.read().decode("utf-8")
    return json.loads(body) if body.strip() else {}


@dataclass
class Fixture:
    name: str
    project: str
    prompt: str
    budget: int = DEFAULT_BUDGET
    expect_present: list[str] = field(default_factory=list)
    expect_absent: list[str] = field(default_factory=list)
    known_issue: bool = False
    notes: str = ""


@dataclass
class FixtureResult:
    fixture: Fixture
    ok: bool
    missing_present: list[str]
    unexpected_absent: list[str]
    total_chars: int
    known_issue: bool = False
    error: str = ""

    @property
    def blocking_failure(self) -> bool:
        return not self.ok and not self.known_issue


def load_fixtures(path: Path) -> list[Fixture]:
    data = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError(f"{path} must contain a JSON array of fixtures")
    fixtures: list[Fixture] = []
    for i, raw in enumerate(data):
        if not isinstance(raw, dict):
            raise ValueError(f"fixture {i} is not an object")
        fixtures.append(
            Fixture(
                name=raw["name"],
                project=raw.get("project", ""),
                prompt=raw["prompt"],
                budget=int(raw.get("budget", DEFAULT_BUDGET)),
                expect_present=list(raw.get("expect_present", [])),
                expect_absent=list(raw.get("expect_absent", [])),
                known_issue=bool(raw.get("known_issue", False)),
                notes=raw.get("notes", ""),
            )
        )
    return fixtures


def run_fixture(fixture: Fixture, base_url: str, timeout: int) -> FixtureResult:
    payload = {
        "prompt": fixture.prompt,
        "project": fixture.project or None,
        "budget": fixture.budget,
    }
    req = urllib.request.Request(
        url=f"{base_url}/context/build",
        method="POST",
        headers={"Content-Type": "application/json"},
        data=json.dumps(payload).encode("utf-8"),
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body = json.loads(resp.read().decode("utf-8"))
    except urllib.error.URLError as exc:
        return FixtureResult(
            fixture=fixture,
            ok=False,
            missing_present=list(fixture.expect_present),
            unexpected_absent=[],
            total_chars=0,
            known_issue=fixture.known_issue,
            error=f"http_error: {exc}",
        )

    formatted = body.get("formatted_context") or ""
    missing = [s for s in fixture.expect_present if s not in formatted]
    unexpected = [s for s in fixture.expect_absent if s in formatted]
    return FixtureResult(
        fixture=fixture,
        ok=not missing and not unexpected,
        missing_present=missing,
        unexpected_absent=unexpected,
        total_chars=len(formatted),
        known_issue=fixture.known_issue,
    )


def print_human_report(results: list[FixtureResult], metadata: dict) -> None:
    total = len(results)
    passed = sum(1 for r in results if r.ok)
    known = sum(1 for r in results if not r.ok and r.known_issue)
    blocking = sum(1 for r in results if r.blocking_failure)
    print(f"Retrieval eval: {passed}/{total} fixtures passed")
    print(
        "Target: "
        f"{metadata.get('base_url', 'unknown')} "
        f"build={metadata.get('health', {}).get('build_sha', 'unknown')}"
    )
    if known or blocking:
        print(f"Blocking failures: {blocking}  Known issues: {known}")
    print()
    for r in results:
        marker = "PASS" if r.ok else ("KNOWN" if r.known_issue else "FAIL")
        print(f"[{marker}] {r.fixture.name}  project={r.fixture.project}  chars={r.total_chars}")
        if r.error:
            print(f"       error: {r.error}")
        for miss in r.missing_present:
            print(f"       missing expected: {miss!r}")
        for bleed in r.unexpected_absent:
            print(f"       unexpected present: {bleed!r}")
        if r.fixture.notes and not r.ok:
            print(f"       notes: {r.fixture.notes}")


def print_json_report(results: list[FixtureResult], metadata: dict) -> None:
    payload = {
        "generated_at": metadata.get("generated_at"),
        "base_url": metadata.get("base_url"),
        "health": metadata.get("health", {}),
        "total": len(results),
        "passed": sum(1 for r in results if r.ok),
        "known_issues": sum(1 for r in results if not r.ok and r.known_issue),
        "blocking_failures": sum(1 for r in results if r.blocking_failure),
        "fixtures": [
            {
                "name": r.fixture.name,
                "project": r.fixture.project,
                "ok": r.ok,
                "known_issue": r.known_issue,
                "total_chars": r.total_chars,
                "missing_present": r.missing_present,
                "unexpected_absent": r.unexpected_absent,
                "error": r.error,
            }
            for r in results
        ],
    }
    json.dump(payload, sys.stdout, indent=2)
    sys.stdout.write("\n")


def main() -> int:
    parser = argparse.ArgumentParser(description="AtoCore retrieval quality eval harness")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT)
    parser.add_argument("--fixtures", type=Path, default=DEFAULT_FIXTURES)
    parser.add_argument("--json", action="store_true", help="emit machine-readable JSON")
    args = parser.parse_args()

    base_url = args.base_url.rstrip("/")
    try:
        health = request_json(base_url, "/health", args.timeout)
    except (urllib.error.URLError, TimeoutError, OSError, json.JSONDecodeError) as exc:
        health = {"error": str(exc)}
    metadata = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "base_url": base_url,
        "health": health,
    }

    fixtures = load_fixtures(args.fixtures)
    results = [run_fixture(f, base_url, args.timeout) for f in fixtures]

    if args.json:
        print_json_report(results, metadata)
    else:
        print_human_report(results, metadata)

    return 0 if not any(r.blocking_failure for r in results) else 1


if __name__ == "__main__":
    raise SystemExit(main())