fix(R7/R9): overlap-density ranking + project trust-preservation
R7: ranking scorer now uses overlap-density (overlap_count / memory_token_count) as primary key instead of raw overlap count. A 5-token memory with 3 overlapping tokens (density 0.6) now beats a 40-token overview memory with 3 overlapping tokens (density 0.075) even though both have the same absolute overlap count. Secondary key: absolute overlap. Tertiary key: confidence. Targeting p06-firmware-interface harness fixture. R9: when the LLM extractor returns a project that differs from the interaction's known project, it now checks the project registry. If the model's project resolves to a registered project (a canonical ID or one of its aliases), trust it. If not (hallucinated name), fall back to the interaction's project. Uses load_project_registry() for the check. The host-side script mirrors this by fetching GET /projects once at startup and checking against the returned IDs and aliases. Tests: test_parser_keeps_registered_model_project (replaces test_parser_keeps_model_project_when_provided) and the new test_parser_rejects_hallucinated_project. Test count: 280 -> 281. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -100,6 +100,22 @@ def set_last_run(base_url, timestamp):
|
||||
pass
|
||||
|
||||
|
||||
# Registered project IDs and aliases used by the R9 project check in
# parse_candidates(). Empty until _load_known_projects() has populated it.
_known_projects: set[str] = set()
|
||||
|
||||
|
||||
def _load_known_projects(base_url) -> None:
    """Fetch registered project IDs and aliases from the API for R9 validation.

    Issues ``GET /projects`` through ``api_get`` and replaces the module-level
    ``_known_projects`` set with every project ``id`` plus all of its
    ``aliases``.

    The call is best-effort and never raises. On any failure the previous
    contents of ``_known_projects`` are left untouched and a warning is
    written to stderr, because with an empty set the R9 check treats every
    model-supplied project that differs from the interaction's project as
    unregistered.

    Args:
        base_url: Base URL handed to ``api_get``.
    """
    global _known_projects
    import sys

    try:
        data = api_get(base_url, "/projects")
        known = set()
        # ``or []`` tolerates an explicit null for either list in the payload.
        for entry in data.get("projects") or []:
            known.add(entry["id"])
            known.update(entry.get("aliases") or [])
    except Exception as exc:
        # Deliberately broad: a registry outage must not abort the run, but
        # it must be visible, since it changes how projects are assigned.
        print(
            f"warning: could not load known projects from {base_url}/projects "
            f"({exc!r}); model-supplied projects that differ from the "
            "interaction's project will be treated as unregistered",
            file=sys.stderr,
        )
        return

    # Publish only a fully built set so a mid-parse failure cannot leave a
    # half-populated registry behind.
    _known_projects = known
|
||||
|
||||
|
||||
def extract_one(prompt, response, project, model, timeout_s):
|
||||
"""Run claude -p on one interaction, return parsed candidates."""
|
||||
if not shutil.which("claude"):
|
||||
@@ -178,6 +194,12 @@ def parse_candidates(raw, interaction_project):
|
||||
project = str(item.get("project") or "").strip()
|
||||
if not project and interaction_project:
|
||||
project = interaction_project
|
||||
elif project and interaction_project and project != interaction_project:
|
||||
# R9: model hallucinated an unrecognized project — fall back.
|
||||
# The host-side script can't import the registry, so we
|
||||
# check against a known set fetched from the API.
|
||||
if project not in _known_projects:
|
||||
project = interaction_project
|
||||
conf = item.get("confidence", 0.5)
|
||||
if mem_type not in MEMORY_TYPES or not content:
|
||||
continue
|
||||
@@ -202,8 +224,9 @@ def main():
|
||||
parser.add_argument("--model", default=DEFAULT_MODEL)
|
||||
args = parser.parse_args()
|
||||
|
||||
_load_known_projects(args.base_url)
|
||||
since = args.since or get_last_run(args.base_url)
|
||||
print(f"since={since or '(first run)'} limit={args.limit} model={args.model}")
|
||||
print(f"since={since or '(first run)'} limit={args.limit} model={args.model} known_projects={len(_known_projects)}")
|
||||
|
||||
params = [f"limit={args.limit}"]
|
||||
if since:
|
||||
|
||||
@@ -257,6 +257,27 @@ def _parse_candidates(raw_output: str, interaction: Interaction) -> list[MemoryC
|
||||
project = str(item.get("project") or "").strip()
|
||||
if not project and interaction.project:
|
||||
project = interaction.project
|
||||
elif project and interaction.project and project != interaction.project:
|
||||
# R9: model returned a different project than the interaction's
|
||||
# known scope. Trust the model's project only if it resolves
|
||||
# to a known registered project (the registry normalizes
|
||||
# aliases and returns the canonical id). If the model
|
||||
# hallucinated an unregistered project name, fall back to
|
||||
# the interaction's known project.
|
||||
try:
|
||||
from atocore.projects.registry import (
|
||||
load_project_registry,
|
||||
resolve_project_name,
|
||||
)
|
||||
|
||||
registered_ids = {p.project_id for p in load_project_registry()}
|
||||
resolved = resolve_project_name(project)
|
||||
if resolved not in registered_ids:
|
||||
project = interaction.project
|
||||
else:
|
||||
project = resolved
|
||||
except Exception:
|
||||
project = interaction.project
|
||||
confidence_raw = item.get("confidence", 0.5)
|
||||
if mem_type not in MEMORY_TYPES:
|
||||
continue
|
||||
|
||||
@@ -446,20 +446,27 @@ def _rank_memories_for_query(
|
||||
) -> list["Memory"]:
|
||||
"""Rerank a memory list by lexical overlap with a pre-tokenized query.
|
||||
|
||||
Ordering key: (overlap_count DESC, confidence DESC). When a query
|
||||
shares no tokens with a memory, overlap is zero and confidence
|
||||
acts as the sole tiebreaker — which matches the pre-query
|
||||
behaviour and keeps no-query calls stable.
|
||||
Primary key: overlap_density (overlap_count / memory_token_count),
|
||||
which rewards short focused memories that match the query precisely
|
||||
over long overview memories that incidentally share a few tokens.
|
||||
Secondary: absolute overlap count. Tertiary: confidence.
|
||||
|
||||
R7 fix: previously overlap_count alone was the primary key, so a
|
||||
40-token overview memory with 3 overlapping tokens tied a 5-token
|
||||
memory with 3 overlapping tokens, and the overview won on
|
||||
confidence. Now the short memory's density (0.6) beats the
|
||||
overview's density (0.075).
|
||||
"""
|
||||
from atocore.memory.reinforcement import _normalize, _tokenize
|
||||
|
||||
scored: list[tuple[int, float, Memory]] = []
|
||||
scored: list[tuple[float, int, float, Memory]] = []
|
||||
for mem in memories:
|
||||
mem_tokens = _tokenize(_normalize(mem.content))
|
||||
overlap = len(mem_tokens & query_tokens) if mem_tokens else 0
|
||||
scored.append((overlap, mem.confidence, mem))
|
||||
scored.sort(key=lambda t: (t[0], t[1]), reverse=True)
|
||||
return [mem for _, _, mem in scored]
|
||||
density = overlap / len(mem_tokens) if mem_tokens else 0.0
|
||||
scored.append((density, overlap, mem.confidence, mem))
|
||||
scored.sort(key=lambda t: (t[0], t[1], t[2]), reverse=True)
|
||||
return [mem for _, _, _, mem in scored]
|
||||
|
||||
|
||||
def _row_to_memory(row) -> Memory:
|
||||
|
||||
@@ -107,8 +107,11 @@ def test_parser_falls_back_to_interaction_project():
|
||||
assert result[0].project == "p06-polisher"
|
||||
|
||||
|
||||
def test_parser_keeps_model_project_when_provided():
|
||||
"""Model-supplied project takes precedence over interaction."""
|
||||
def test_parser_keeps_registered_model_project(tmp_data_dir, project_registry):
|
||||
"""R9: model-supplied project is kept when it's a registered project."""
|
||||
from atocore.models.database import init_db
|
||||
init_db()
|
||||
project_registry(("p04-gigabit", ["p04", "gigabit"]), ("p06-polisher", ["p06"]))
|
||||
raw = '[{"type": "project", "content": "x", "project": "p04-gigabit"}]'
|
||||
interaction = _make_interaction()
|
||||
interaction.project = "p06-polisher"
|
||||
@@ -116,6 +119,19 @@ def test_parser_keeps_model_project_when_provided():
|
||||
assert result[0].project == "p04-gigabit"
|
||||
|
||||
|
||||
def test_parser_rejects_hallucinated_project(tmp_data_dir, project_registry):
    """R9: an unregistered model-supplied project is discarded.

    The parsed candidate must carry the interaction's known project instead
    of the name the model made up.
    """
    from atocore.models.database import init_db

    init_db()
    project_registry(("p06-polisher", ["p06"]))

    # Interaction scoped to the only registered project.
    interaction = _make_interaction()
    interaction.project = "p06-polisher"

    # Model output naming a project that is absent from the registry.
    raw = '[{"type": "project", "content": "x", "project": "fake-project-99"}]'

    candidates = _parse_candidates(raw, interaction)
    first = candidates[0]
    assert first.project == "p06-polisher"
|
||||
|
||||
|
||||
def test_missing_cli_returns_empty(monkeypatch):
|
||||
"""If ``claude`` is not on PATH the extractor returns empty, never raises."""
|
||||
monkeypatch.setattr(extractor_llm, "_cli_available", lambda: False)
|
||||
|
||||
Reference in New Issue
Block a user