From 38f6e525afe5f406bd78e5902808af8f8007549b Mon Sep 17 00:00:00 2001 From: Anto01 Date: Sat, 11 Apr 2026 13:04:01 -0400 Subject: [PATCH] fix: tokenizer splits hyphenated identifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hyphen- and slash-separated identifiers (polisher-control, twyman-green, etc.) were single tokens in the reinforcement / memory-ranking tokenizer, so queries had to match the exact hyphenation to score. The harness caught this on p06-control-rule: 'polisher control design rule' scored 2 overlap on each of the three polisher-*/design-rule memories and the tiebreaker picked the wrong one. Now hyphenated words contribute both the full form AND each sub-token. Extracted _add_token helper to avoid duplicating the stop-word / length gate at both insertion points. Reinforcement matcher tests still pass (28) — the new sub-tokens only widen the match set, they never narrow it, so memories that previously reinforced continue to reinforce. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/atocore/memory/reinforcement.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/atocore/memory/reinforcement.py b/src/atocore/memory/reinforcement.py index b4fb4c5..b6829e3 100644 --- a/src/atocore/memory/reinforcement.py +++ b/src/atocore/memory/reinforcement.py @@ -180,20 +180,36 @@ def _stem(word: str) -> str: def _tokenize(text: str) -> set[str]: """Split normalized text into a stemmed token set. - Strips punctuation, drops words shorter than 3 chars and stop words. + Strips punctuation, drops words shorter than 3 chars and stop + words. Hyphenated and slash-separated identifiers + (``polisher-control``, ``twyman-green``, ``2-projects/interferometer``) + produce both the full form AND each sub-token, so a query for + "polisher control" can match a memory that wrote + "polisher-control" without forcing callers to guess the exact + hyphenation. """ tokens: set[str] = set() for raw in text.split(): - # Strip leading/trailing punctuation (commas, periods, quotes, etc.) word = raw.strip(".,;:!?\"'()[]{}-/") - if len(word) < 3: + if not word: continue - if word in _STOP_WORDS: - continue - tokens.add(_stem(word)) + _add_token(tokens, word) + # Also add sub-tokens split on internal '-' or '/' so + # hyphenated identifiers match queries that don't hyphenate. + if "-" in word or "/" in word: + for sub in re.split(r"[-/]+", word): + _add_token(tokens, sub) return tokens +def _add_token(tokens: set[str], word: str) -> None: + if len(word) < 3: + return + if word in _STOP_WORDS: + return + tokens.add(_stem(word)) + + def _memory_matches(memory_content: str, normalized_response: str) -> bool: """Return True if enough of the memory's tokens appear in the response.