fix(R7/R9): overlap-density ranking + project trust-preservation
R7: The ranking scorer now uses overlap density (overlap_count / memory_token_count) as its primary key instead of the raw overlap count. A 5-token memory with 3 overlapping tokens (density 0.6) now beats a 40-token overview memory with 3 overlapping tokens (density 0.075), even though both have the same absolute count. Secondary key: absolute overlap. Tertiary key: confidence. Targets the p06-firmware-interface harness fixture.

R9: When the LLM extractor returns a project that differs from the interaction's known project, the parser now checks the project registry. If the model's project is a registered canonical ID, it is trusted. If it is not (a hallucinated name), the parser falls back to the interaction's project. The check uses load_project_registry(). The host-side script cannot import the registry, so it mirrors this check via an API call to GET /projects at startup (it accepts both registered project IDs and their aliases).

Two new tests: test_parser_keeps_registered_model_project and test_parser_rejects_hallucinated_project. Test count: 280 -> 281.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -100,6 +100,22 @@ def set_last_run(base_url, timestamp):
|
||||
pass
|
||||
|
||||
|
||||
# Project IDs and aliases registered with the API. Filled once by
# _load_known_projects() and consulted by parse_candidates() for the R9
# check; while it is empty, any model-supplied project that differs from the
# interaction's project is treated as unrecognized.
_known_projects: set[str] = set()
|
||||
|
||||
|
||||
def _load_known_projects(base_url):
|
||||
"""Fetch registered project IDs from the API for R9 validation."""
|
||||
global _known_projects
|
||||
try:
|
||||
data = api_get(base_url, "/projects")
|
||||
_known_projects = {p["id"] for p in data.get("projects", [])}
|
||||
for p in data.get("projects", []):
|
||||
for alias in p.get("aliases", []):
|
||||
_known_projects.add(alias)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def extract_one(prompt, response, project, model, timeout_s):
|
||||
"""Run claude -p on one interaction, return parsed candidates."""
|
||||
if not shutil.which("claude"):
|
||||
@@ -178,6 +194,12 @@ def parse_candidates(raw, interaction_project):
|
||||
project = str(item.get("project") or "").strip()
|
||||
if not project and interaction_project:
|
||||
project = interaction_project
|
||||
elif project and interaction_project and project != interaction_project:
|
||||
# R9: model hallucinated an unrecognized project — fall back.
|
||||
# The host-side script can't import the registry, so we
|
||||
# check against a known set fetched from the API.
|
||||
if project not in _known_projects:
|
||||
project = interaction_project
|
||||
conf = item.get("confidence", 0.5)
|
||||
if mem_type not in MEMORY_TYPES or not content:
|
||||
continue
|
||||
@@ -202,8 +224,9 @@ def main():
|
||||
parser.add_argument("--model", default=DEFAULT_MODEL)
|
||||
args = parser.parse_args()
|
||||
|
||||
_load_known_projects(args.base_url)
|
||||
since = args.since or get_last_run(args.base_url)
|
||||
print(f"since={since or '(first run)'} limit={args.limit} model={args.model}")
|
||||
print(f"since={since or '(first run)'} limit={args.limit} model={args.model} known_projects={len(_known_projects)}")
|
||||
|
||||
params = [f"limit={args.limit}"]
|
||||
if since:
|
||||
|
||||
Reference in New Issue
Block a user