From 8951c624fe9d0253c28d5afd14f172739101b788 Mon Sep 17 00:00:00 2001 From: Anto01 Date: Sun, 12 Apr 2026 14:34:33 -0400 Subject: [PATCH] fix(R7/R9): overlap-density ranking + project trust-preservation R7: ranking scorer now uses overlap-density (overlap_count / memory_token_count) as primary key instead of raw overlap count. A 5-token memory with 3 overlapping tokens (density 0.6) now beats a 40-token overview memory with 3 overlapping tokens (density 0.075) at the same absolute count. Secondary: absolute overlap. Tertiary: confidence. Targeting p06-firmware-interface harness fixture. R9: when the LLM extractor returns a project that differs from the interaction's known project, it now checks the project registry. If the model's project is a registered canonical ID, trust it. If not (hallucinated name), fall back to the interaction's project. Uses load_project_registry() for the check. The host-side script mirrors this via an API call to GET /projects at startup. Tests: test_parser_keeps_model_project_when_provided is replaced by test_parser_keeps_registered_model_project, and test_parser_rejects_hallucinated_project is added (one test replaced, one new). Test count: 280 -> 281. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/batch_llm_extract_live.py | 25 ++++++++++++++++++++++++- src/atocore/memory/extractor_llm.py | 21 +++++++++++++++++++++ src/atocore/memory/service.py | 23 +++++++++++++++-------- tests/test_extractor_llm.py | 20 ++++++++++++++++++-- 4 files changed, 78 insertions(+), 11 deletions(-) diff --git a/scripts/batch_llm_extract_live.py b/scripts/batch_llm_extract_live.py index 2397b69..311364c 100644 --- a/scripts/batch_llm_extract_live.py +++ b/scripts/batch_llm_extract_live.py @@ -100,6 +100,22 @@ def set_last_run(base_url, timestamp): pass +_known_projects: set[str] = set() + + +def _load_known_projects(base_url): + """Fetch registered project IDs from the API for R9 validation.""" + global _known_projects + try: + data = api_get(base_url, "/projects") + _known_projects = {p["id"] for p in data.get("projects", [])} + for p in data.get("projects", []): + for alias in p.get("aliases", []): + _known_projects.add(alias) + except Exception: + pass + + def extract_one(prompt, response, project, model, timeout_s): """Run claude -p on one interaction, return parsed candidates.""" if not shutil.which("claude"): @@ -178,6 +194,12 @@ def parse_candidates(raw, interaction_project): project = str(item.get("project") or "").strip() if not project and interaction_project: project = interaction_project + elif project and interaction_project and project != interaction_project: + # R9: model hallucinated an unrecognized project — fall back. + # The host-side script can't import the registry, so we + # check against a known set fetched from the API. 
+ if project not in _known_projects: + project = interaction_project conf = item.get("confidence", 0.5) if mem_type not in MEMORY_TYPES or not content: continue @@ -202,8 +224,9 @@ def main(): parser.add_argument("--model", default=DEFAULT_MODEL) args = parser.parse_args() + _load_known_projects(args.base_url) since = args.since or get_last_run(args.base_url) - print(f"since={since or '(first run)'} limit={args.limit} model={args.model}") + print(f"since={since or '(first run)'} limit={args.limit} model={args.model} known_projects={len(_known_projects)}") params = [f"limit={args.limit}"] if since: diff --git a/src/atocore/memory/extractor_llm.py b/src/atocore/memory/extractor_llm.py index 1a1fd88..986222b 100644 --- a/src/atocore/memory/extractor_llm.py +++ b/src/atocore/memory/extractor_llm.py @@ -257,6 +257,27 @@ def _parse_candidates(raw_output: str, interaction: Interaction) -> list[MemoryC project = str(item.get("project") or "").strip() if not project and interaction.project: project = interaction.project + elif project and interaction.project and project != interaction.project: + # R9: model returned a different project than the interaction's + # known scope. Trust the model's project only if it resolves + # to a known registered project (the registry normalizes + # aliases and returns the canonical id). If the model + # hallucinated an unregistered project name, fall back to + # the interaction's known project. 
+ try: + from atocore.projects.registry import ( + load_project_registry, + resolve_project_name, + ) + + registered_ids = {p.project_id for p in load_project_registry()} + resolved = resolve_project_name(project) + if resolved not in registered_ids: + project = interaction.project + else: + project = resolved + except Exception: + project = interaction.project confidence_raw = item.get("confidence", 0.5) if mem_type not in MEMORY_TYPES: continue diff --git a/src/atocore/memory/service.py b/src/atocore/memory/service.py index 34241e3..0e9a119 100644 --- a/src/atocore/memory/service.py +++ b/src/atocore/memory/service.py @@ -446,20 +446,27 @@ def _rank_memories_for_query( ) -> list["Memory"]: """Rerank a memory list by lexical overlap with a pre-tokenized query. - Ordering key: (overlap_count DESC, confidence DESC). When a query - shares no tokens with a memory, overlap is zero and confidence - acts as the sole tiebreaker — which matches the pre-query - behaviour and keeps no-query calls stable. + Primary key: overlap_density (overlap_count / memory_token_count), + which rewards short focused memories that match the query precisely + over long overview memories that incidentally share a few tokens. + Secondary: absolute overlap count. Tertiary: confidence. + + R7 fix: previously overlap_count alone was the primary key, so a + 40-token overview memory with 3 overlapping tokens tied a 5-token + memory with 3 overlapping tokens, and the overview won on + confidence. Now the short memory's density (0.6) beats the + overview's density (0.075). 
""" from atocore.memory.reinforcement import _normalize, _tokenize - scored: list[tuple[int, float, Memory]] = [] + scored: list[tuple[float, int, float, Memory]] = [] for mem in memories: mem_tokens = _tokenize(_normalize(mem.content)) overlap = len(mem_tokens & query_tokens) if mem_tokens else 0 - scored.append((overlap, mem.confidence, mem)) - scored.sort(key=lambda t: (t[0], t[1]), reverse=True) - return [mem for _, _, mem in scored] + density = overlap / len(mem_tokens) if mem_tokens else 0.0 + scored.append((density, overlap, mem.confidence, mem)) + scored.sort(key=lambda t: (t[0], t[1], t[2]), reverse=True) + return [mem for _, _, _, mem in scored] def _row_to_memory(row) -> Memory: diff --git a/tests/test_extractor_llm.py b/tests/test_extractor_llm.py index 1bbc79a..0f3a85a 100644 --- a/tests/test_extractor_llm.py +++ b/tests/test_extractor_llm.py @@ -107,8 +107,11 @@ def test_parser_falls_back_to_interaction_project(): assert result[0].project == "p06-polisher" -def test_parser_keeps_model_project_when_provided(): - """Model-supplied project takes precedence over interaction.""" +def test_parser_keeps_registered_model_project(tmp_data_dir, project_registry): + """R9: model-supplied project is kept when it's a registered project.""" + from atocore.models.database import init_db + init_db() + project_registry(("p04-gigabit", ["p04", "gigabit"]), ("p06-polisher", ["p06"])) raw = '[{"type": "project", "content": "x", "project": "p04-gigabit"}]' interaction = _make_interaction() interaction.project = "p06-polisher" @@ -116,6 +119,19 @@ def test_parser_keeps_model_project_when_provided(): assert result[0].project == "p04-gigabit" +def test_parser_rejects_hallucinated_project(tmp_data_dir, project_registry): + """R9: model-supplied project that is NOT registered falls back + to the interaction's known project.""" + from atocore.models.database import init_db + init_db() + project_registry(("p06-polisher", ["p06"])) + raw = '[{"type": "project", "content": "x", 
"project": "fake-project-99"}]' + interaction = _make_interaction() + interaction.project = "p06-polisher" + result = _parse_candidates(raw, interaction) + assert result[0].project == "p06-polisher" + + def test_missing_cli_returns_empty(monkeypatch): """If ``claude`` is not on PATH the extractor returns empty, never raises.""" monkeypatch.setattr(extractor_llm, "_cli_available", lambda: False)