From 38f6e525afe5f406bd78e5902808af8f8007549b Mon Sep 17 00:00:00 2001
From: Anto01 <antoine.letarte@gmail.com>
Date: Sat, 11 Apr 2026 13:04:01 -0400
Subject: [PATCH] fix: tokenizer also emits sub-tokens of hyphenated identifiers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hyphen- and slash-separated identifiers (polisher-control,
twyman-green, etc.) were kept as single tokens by the reinforcement /
memory-ranking tokenizer, so a query had to use the exact same
hyphenation to score. The harness caught this on p06-control-rule:
'polisher control design rule' scored an overlap of 2 on each of the
three polisher-*/design-rule memories, so the tiebreaker decided the
result and picked the wrong one.

Now hyphenated and slash-separated words contribute both the full
form AND each sub-token. Extracted an _add_token helper to avoid
duplicating the stop-word / length gate at both insertion points.

Reinforcement matcher tests still pass (28): the new sub-tokens only
add to the token set and never remove anything from it, so memories
that previously reinforced continue to reinforce.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/atocore/memory/reinforcement.py | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/src/atocore/memory/reinforcement.py b/src/atocore/memory/reinforcement.py
index b4fb4c5..b6829e3 100644
--- a/src/atocore/memory/reinforcement.py
+++ b/src/atocore/memory/reinforcement.py
@@ -180,20 +180,36 @@ def _stem(word: str) -> str:
 def _tokenize(text: str) -> set[str]:
     """Split normalized text into a stemmed token set.
 
-    Strips punctuation, drops words shorter than 3 chars and stop words.
+    Strips punctuation, drops words shorter than 3 chars and stop
+    words. Hyphenated and slash-separated identifiers
+    (``polisher-control``, ``twyman-green``, ``2-projects/interferometer``)
+    produce both the full form AND each sub-token, so a query for
+    "polisher control" can match a memory that wrote
+    "polisher-control" without forcing callers to guess the exact
+    hyphenation.
     """
     tokens: set[str] = set()
     for raw in text.split():
-        # Strip leading/trailing punctuation (commas, periods, quotes, etc.)
         word = raw.strip(".,;:!?\"'()[]{}-/")
-        if len(word) < 3:
+        if not word:
             continue
-        if word in _STOP_WORDS:
-            continue
-        tokens.add(_stem(word))
+        _add_token(tokens, word)
+        # Also add sub-tokens split on internal '-' or '/' so
+        # hyphenated identifiers match queries that don't hyphenate.
+        if "-" in word or "/" in word:
+            for sub in re.split(r"[-/]+", word):
+                _add_token(tokens, sub)
     return tokens
 
 
+def _add_token(tokens: set[str], word: str) -> None:
+    if len(word) < 3:
+        return
+    if word in _STOP_WORDS:
+        return
+    tokens.add(_stem(word))
+
+
 def _memory_matches(memory_content: str, normalized_response: str) -> bool:
     """Return True if enough of the memory's tokens appear in the response.