# Patch summary (commentary only; text before the first "diff --git" header
# is not part of any hunk).
#
# src/atocore/memory/service.py
#   * _tokens() now lowercases the label before replacing '_' with '-' and
#     splitting on '-', so labels that differ only in letter case yield the
#     same token set. Tokens shorter than 4 characters are still dropped.
#
# tests/test_emerging_project_proposals.py
#   * test_proposals_short_token_does_not_match is rewritten to create 11
#     memories for each of 'apm', 'apm-fpga', 'foo-fpga' and 'bar-fpga', then
#     assert that the shared 3-char 'apm' token does not produce a suggestion
#     while the shared 4-char 'fpga' token does.
#   * test_proposals_clustering_is_case_insensitive is added: 'HydroTech-Mining'
#     and 'hydrotech-split-tank' must list each other as suggested aliases.
#   * test_proposals_registered_token_does_not_leak_into_sibling_set is added:
#     'p04-gigabit' must not appear as a proposal nor as a suggested alias of
#     'gigabit-other'.
#
# NOTE(review): test_proposals_registered_token_does_not_leak_into_sibling_set
#   requests the `monkeypatch` fixture but its body never uses it.
# NOTE(review): the negative short-token check only asserts that 'apm-fpga' is
#   absent from proposals["apm"]; the reverse direction ('apm' absent from
#   proposals["apm-fpga"]) is not asserted.
# NOTE(review): leading whitespace of the hunk lines (including the bullet
#   indentation inside the rewritten docstring) is assumed to be conventional
#   4-space nesting -- confirm against the target files before applying.
diff --git a/src/atocore/memory/service.py b/src/atocore/memory/service.py
index 60afffd..70d2f88 100644
--- a/src/atocore/memory/service.py
+++ b/src/atocore/memory/service.py
@@ -495,7 +495,7 @@ def propose_emerging_projects(min_active: int = 10) -> list[dict]:
     # they share a non-trivial token (length >= 4 after splitting on
     # '-' and '_'). Cheap, defensible, and the operator gets to veto.
     def _tokens(label: str) -> set[str]:
-        parts = label.replace("_", "-").split("-")
+        parts = label.lower().replace("_", "-").split("-")
         return {p for p in parts if len(p) >= 4}
 
     label_tokens = {label: _tokens(label) for label, _a, _c in unregistered}
diff --git a/tests/test_emerging_project_proposals.py b/tests/test_emerging_project_proposals.py
index 0acf285..840cb10 100644
--- a/tests/test_emerging_project_proposals.py
+++ b/tests/test_emerging_project_proposals.py
@@ -103,20 +103,65 @@ def test_proposals_suggest_sibling_aliases_via_shared_tokens(env):
 
 
 def test_proposals_short_token_does_not_match(env):
-    """Two-or-three-letter tokens are too noisy to suggest aliases on.
-    'apm' (3 chars) and 'apm-fpga' (3 + 4) share 'apm' (3 chars) — the
-    'apm' token alone is too short, but 'fpga' (4) is long enough so
-    the match comes from the longer token. We test the negative: 'apm'
-    and 'drill' must NOT be siblings."""
-    for i in range(11):
-        create_memory("knowledge", f"apm fact {i}", project="apm")
-    for i in range(11):
-        create_memory("knowledge", f"drill fact {i}", project="drill")
+    """Per Codex Wave 1.5 P2: previously this test only asserted apm
+    and drill have empty siblings, which is trivially true because they
+    share no tokens at all. The real risk is an accidental relaxation
+    that lets <4-char tokens trigger clustering. Construct a setup where
+    that would matter:
+      - 'apm' and 'apm-fpga': only the 3-char 'apm' is shared. They must
+        NOT cluster, because 'apm' is too short.
+      - 'foo-fpga' and 'bar-fpga': the 4-char 'fpga' is shared. They
+        MUST cluster.
+    """
+    for label in ("apm", "apm-fpga", "foo-fpga", "bar-fpga"):
+        for i in range(11):
+            create_memory("knowledge", f"{label} fact {i}", project=label)
 
     client = TestClient(app)
     proposals = {p["project_id"]: p for p in client.get("/admin/projects/proposals").json()["proposals"]}
-    assert proposals["apm"]["suggested_aliases"] == []
-    assert proposals["drill"]["suggested_aliases"] == []
+
+    # Negative: short-token match must not happen
+    assert "apm-fpga" not in proposals["apm"]["suggested_aliases"], (
+        "'apm' (3 chars) is below the 4-char minimum; 'apm' and 'apm-fpga' "
+        "must not cluster via the 'apm' token."
+    )
+
+    # Positive: long-token match must happen — both directions
+    assert "bar-fpga" in proposals["foo-fpga"]["suggested_aliases"]
+    assert "foo-fpga" in proposals["bar-fpga"]["suggested_aliases"]
+    # And 'apm-fpga' clusters with the others via 'fpga'
+    assert "apm-fpga" in proposals["foo-fpga"]["suggested_aliases"]
+
+
+def test_proposals_clustering_is_case_insensitive(env):
+    """Token comparison must be case-insensitive so labels captured
+    with mixed casing still cluster. Codex Wave 1.5 P3."""
+    for label in ("HydroTech-Mining", "hydrotech-split-tank"):
+        for i in range(11):
+            create_memory("knowledge", f"{label} fact {i}", project=label)
+
+    client = TestClient(app)
+    proposals = {p["project_id"]: p for p in client.get("/admin/projects/proposals").json()["proposals"]}
+    assert "hydrotech-split-tank" in proposals["HydroTech-Mining"]["suggested_aliases"]
+    assert "HydroTech-Mining" in proposals["hydrotech-split-tank"]["suggested_aliases"]
+
+
+def test_proposals_registered_token_does_not_leak_into_sibling_set(env, monkeypatch):
+    """Registered project ids must be filtered BEFORE clustering so a
+    registered token doesn't get suggested as an alias for an
+    unregistered sibling. p04-gigabit is registered in env; an
+    unregistered 'gigabit-other' must not list 'p04-gigabit' as alias."""
+    for i in range(15):
+        create_memory("knowledge", f"p04 fact {i}", project="p04-gigabit")
+    for i in range(11):
+        create_memory("knowledge", f"gigabit-other fact {i}", project="gigabit-other")
+
+    client = TestClient(app)
+    proposals = {p["project_id"]: p for p in client.get("/admin/projects/proposals").json()["proposals"]}
+    assert "p04-gigabit" not in proposals
+    assert "gigabit-other" in proposals
+    # And the registered name must not surface as a sibling
+    assert "p04-gigabit" not in proposals["gigabit-other"]["suggested_aliases"]
 
 
 def test_proposals_include_sample_memories_and_guessed_root(env):