ATOCore/scripts/retrieval_eval.py

"""Retrieval quality eval harness.

Runs a fixed set of project-hinted questions against
``POST /context/build`` on a live AtoCore instance and scores the
resulting ``formatted_context`` against per-question expectations.
The goal is a diffable scorecard that tells you, run-to-run,
whether a retrieval / builder / ingestion change moved the needle.

Design notes
------------
- Fixtures live in ``scripts/retrieval_eval_fixtures.json`` so new
  questions can be added without touching Python. Each fixture
  names the project, the prompt, and a checklist of substrings that
  MUST appear in ``formatted_context`` (``expect_present``) and
  substrings that MUST NOT appear (``expect_absent``). The absent
  list catches cross-project bleed and stale content.
- The checklist is deliberately substring-based (not regex, not
  embedding-similarity) so a failure is always a trivially
  reproducible "this string is not in that string". Richer scoring
  can come later once we know the harness is useful.
- The harness is external to the app runtime and talks to AtoCore
  over HTTP, so it works against dev, staging, or prod. It follows
  the same environment-variable contract as ``atocore_client.py``
  (``ATOCORE_BASE_URL``, ``ATOCORE_TIMEOUT_SECONDS``).
- Exit code 0 on all-pass, 1 on any fixture failure. Intended for
  manual runs today; a future cron / CI hook can consume the
  JSON output via ``--json``.

Usage
-----

    python scripts/retrieval_eval.py            # human-readable report
    python scripts/retrieval_eval.py --json     # machine-readable
    python scripts/retrieval_eval.py --fixtures path/to/custom.json
"""

from __future__ import annotations

import argparse
import json
import os
import sys
import urllib.error
import urllib.parse
import urllib.request
from dataclasses import dataclass, field
from pathlib import Path

DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://dalidou:8100")
DEFAULT_TIMEOUT = int(os.environ.get("ATOCORE_TIMEOUT_SECONDS", "30"))
DEFAULT_BUDGET = 3000
DEFAULT_FIXTURES = Path(__file__).parent / "retrieval_eval_fixtures.json"


@dataclass
class Fixture:
    name: str
    project: str
    prompt: str
    budget: int = DEFAULT_BUDGET
    expect_present: list[str] = field(default_factory=list)
    expect_absent: list[str] = field(default_factory=list)
    notes: str = ""


@dataclass
class FixtureResult:
    fixture: Fixture
    ok: bool
    missing_present: list[str]
    unexpected_absent: list[str]
    total_chars: int
    error: str = ""


def load_fixtures(path: Path) -> list[Fixture]:
    data = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError(f"{path} must contain a JSON array of fixtures")
    fixtures: list[Fixture] = []
    for i, raw in enumerate(data):
        if not isinstance(raw, dict):
            raise ValueError(f"fixture {i} is not an object")
        fixtures.append(
            Fixture(
                name=raw["name"],
                project=raw.get("project", ""),
                prompt=raw["prompt"],
                budget=int(raw.get("budget", DEFAULT_BUDGET)),
                expect_present=list(raw.get("expect_present", [])),
                expect_absent=list(raw.get("expect_absent", [])),
                notes=raw.get("notes", ""),
            )
        )
    return fixtures


def run_fixture(fixture: Fixture, base_url: str, timeout: int) -> FixtureResult:
    payload = {
        "prompt": fixture.prompt,
        "project": fixture.project or None,
        "budget": fixture.budget,
    }
    req = urllib.request.Request(
        url=f"{base_url}/context/build",
        method="POST",
        headers={"Content-Type": "application/json"},
        data=json.dumps(payload).encode("utf-8"),
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body = json.loads(resp.read().decode("utf-8"))
    except urllib.error.URLError as exc:
        return FixtureResult(
            fixture=fixture,
            ok=False,
            missing_present=list(fixture.expect_present),
            unexpected_absent=[],
            total_chars=0,
            error=f"http_error: {exc}",
        )

    formatted = body.get("formatted_context") or ""
    missing = [s for s in fixture.expect_present if s not in formatted]
    unexpected = [s for s in fixture.expect_absent if s in formatted]
    return FixtureResult(
        fixture=fixture,
        ok=not missing and not unexpected,
        missing_present=missing,
        unexpected_absent=unexpected,
        total_chars=len(formatted),
    )


def print_human_report(results: list[FixtureResult]) -> None:
    total = len(results)
    passed = sum(1 for r in results if r.ok)
    print(f"Retrieval eval: {passed}/{total} fixtures passed")
    print()
    for r in results:
        marker = "PASS" if r.ok else "FAIL"
        print(f"[{marker}] {r.fixture.name}  project={r.fixture.project}  chars={r.total_chars}")
        if r.error:
            print(f"       error: {r.error}")
        for miss in r.missing_present:
            print(f"       missing expected: {miss!r}")
        for bleed in r.unexpected_absent:
            print(f"       unexpected present: {bleed!r}")
        if r.fixture.notes and not r.ok:
            print(f"       notes: {r.fixture.notes}")


def print_json_report(results: list[FixtureResult]) -> None:
    payload = {
        "total": len(results),
        "passed": sum(1 for r in results if r.ok),
        "fixtures": [
            {
                "name": r.fixture.name,
                "project": r.fixture.project,
                "ok": r.ok,
                "total_chars": r.total_chars,
                "missing_present": r.missing_present,
                "unexpected_absent": r.unexpected_absent,
                "error": r.error,
            }
            for r in results
        ],
    }
    json.dump(payload, sys.stdout, indent=2)
    sys.stdout.write("\n")


def main() -> int:
    parser = argparse.ArgumentParser(description="AtoCore retrieval quality eval harness")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT)
    parser.add_argument("--fixtures", type=Path, default=DEFAULT_FIXTURES)
    parser.add_argument("--json", action="store_true", help="emit machine-readable JSON")
    args = parser.parse_args()

    fixtures = load_fixtures(args.fixtures)
    results = [run_fixture(f, args.base_url, args.timeout) for f in fixtures]

    if args.json:
        print_json_report(results)
    else:
        print_human_report(results)

    return 0 if all(r.ok for r in results) else 1


if __name__ == "__main__":
    raise SystemExit(main())