diff --git a/src/atocore/memory/reinforcement.py b/src/atocore/memory/reinforcement.py
index b4fb4c5..b6829e3 100644
--- a/src/atocore/memory/reinforcement.py
+++ b/src/atocore/memory/reinforcement.py
@@ -180,20 +180,36 @@ def _stem(word: str) -> str:
 
 def _tokenize(text: str) -> set[str]:
     """Split normalized text into a stemmed token set.
 
-    Strips punctuation, drops words shorter than 3 chars and stop words.
+    Strips punctuation, drops words shorter than 3 chars and stop
+    words. Hyphenated and slash-separated identifiers
+    (``polisher-control``, ``twyman-green``, ``2-projects/interferometer``)
+    produce both the full form AND each sub-token, so a query for
+    "polisher control" can match a memory that wrote
+    "polisher-control" without forcing callers to guess the exact
+    hyphenation.
     """
     tokens: set[str] = set()
     for raw in text.split():
-        # Strip leading/trailing punctuation (commas, periods, quotes, etc.)
         word = raw.strip(".,;:!?\"'()[]{}-/")
-        if len(word) < 3:
+        if not word:
             continue
-        if word in _STOP_WORDS:
-            continue
-        tokens.add(_stem(word))
+        _add_token(tokens, word)
+        # Also add sub-tokens split on internal '-' or '/' so
+        # hyphenated identifiers match queries that don't hyphenate.
+        if "-" in word or "/" in word:
+            for sub in re.split(r"[-/]+", word):
+                _add_token(tokens, sub)
     return tokens
 
 
+def _add_token(tokens: set[str], word: str) -> None:
+    if len(word) < 3:
+        return
+    if word in _STOP_WORDS:
+        return
+    tokens.add(_stem(word))
+
+
 def _memory_matches(memory_content: str, normalized_response: str) -> bool:
     """Return True if enough of the memory's tokens appear in the response.