From a34a7a995f22a4e7901ac4963ae31626c1e648f6 Mon Sep 17 00:00:00 2001 From: Anto01 Date: Sat, 11 Apr 2026 09:40:05 -0400 Subject: [PATCH] fix: token-overlap matcher for reinforcement (Phase 9B) Replace the substring-based _memory_matches() with a token-overlap matcher that tokenizes both memory content and response, applies lightweight stemming (trailing s/ed/ing) and stop-word removal, then checks whether >= 70% of the memory's tokens appear in the response. This fixes the paraphrase blindness that prevented reinforcement from ever firing on natural responses ("prefers" vs "prefer", "because history" vs "because the history"). 7 new tests (26 total reinforcement tests, all passing). Co-Authored-By: Claude Opus 4.6 --- src/atocore/memory/reinforcement.py | 70 ++++++++++++++--- tests/test_reinforcement.py | 117 ++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+), 10 deletions(-) diff --git a/src/atocore/memory/reinforcement.py b/src/atocore/memory/reinforcement.py index b7acd84..fc6ee7a 100644 --- a/src/atocore/memory/reinforcement.py +++ b/src/atocore/memory/reinforcement.py @@ -8,10 +8,11 @@ given memory, without ever promoting anything new into trusted state. Design notes ------------ -- Matching is intentionally simple and explainable: - * normalize both sides (lowercase, collapse whitespace) - * require the normalized memory content (or its first 80 chars) to - appear as a substring in the normalized response +- Matching uses token-overlap: tokenize both sides (lowercase, stem, + drop stop words), then check whether >= 70 % of the memory's content + tokens appear in the response token set. This handles natural + paraphrases (e.g. "prefers" vs "prefer", "because history" vs + "because the history") that substring matching missed. - Candidates and invalidated memories are NEVER considered — reinforcement must not revive history. - Reinforcement is capped at 1.0 and monotonically non-decreasing. @@ -43,9 +44,12 @@ log = get_logger("reinforcement") # memories like "prefers Python". _MIN_MEMORY_CONTENT_LENGTH = 12 -# When a memory's content is very long, match on its leading window only -# to avoid punishing small paraphrases further into the body. -_MATCH_WINDOW_CHARS = 80 +# Token-overlap matching constants. +_STOP_WORDS: frozenset[str] = frozenset({ + "the", "a", "an", "and", "or", "of", "to", "is", "was", + "that", "this", "with", "for", "from", "into", +}) +_MATCH_THRESHOLD = 0.70 DEFAULT_CONFIDENCE_DELTA = 0.02 @@ -144,12 +148,58 @@ def _normalize(text: str) -> str: return collapsed.strip() +def _stem(word: str) -> str: + """Aggressive suffix-folding so inflected forms collapse. + + Handles trailing ``ing``, ``ed``, and ``s`` — good enough for + reinforcement matching without pulling in nltk/snowball. + """ + # Order matters: try longest suffix first. + if word.endswith("ing") and len(word) >= 6: + return word[:-3] + if word.endswith("ed") and len(word) > 4: + stem = word[:-2] + # "preferred" → "preferr" → "prefer" (doubled consonant before -ed) + if len(stem) >= 3 and stem[-1] == stem[-2]: + stem = stem[:-1] + return stem + if word.endswith("s") and len(word) > 3: + return word[:-1] + return word + + +def _tokenize(text: str) -> set[str]: + """Split normalized text into a stemmed token set. + + Strips punctuation, drops words shorter than 3 chars and stop words. + """ + tokens: set[str] = set() + for raw in text.split(): + # Strip leading/trailing punctuation (commas, periods, quotes, etc.) + word = raw.strip(".,;:!?\"'()[]{}-/") + if len(word) < 3: + continue + if word in _STOP_WORDS: + continue + tokens.add(_stem(word)) + return tokens + + def _memory_matches(memory_content: str, normalized_response: str) -> bool: - """Return True if the memory content appears in the response.""" + """Return True if enough of the memory's tokens appear in the response. + + Uses token-overlap: tokenize both sides (lowercase, stem, drop stop + words), then check whether >= 70 % of the memory's content tokens + appear in the response token set. + """ if not memory_content: return False normalized_memory = _normalize(memory_content) if len(normalized_memory) < _MIN_MEMORY_CONTENT_LENGTH: return False - window = normalized_memory[:_MATCH_WINDOW_CHARS] - return window in normalized_response + memory_tokens = _tokenize(normalized_memory) + if not memory_tokens: + return False + response_tokens = _tokenize(normalized_response) + overlap = memory_tokens & response_tokens + return len(overlap) / len(memory_tokens) >= _MATCH_THRESHOLD diff --git a/tests/test_reinforcement.py b/tests/test_reinforcement.py index 7537fa4..9d3832b 100644 --- a/tests/test_reinforcement.py +++ b/tests/test_reinforcement.py @@ -6,6 +6,8 @@ from atocore.interactions.service import record_interaction from atocore.main import app from atocore.memory.reinforcement import ( DEFAULT_CONFIDENCE_DELTA, + _stem, + _tokenize, reinforce_from_interaction, ) from atocore.memory.service import ( @@ -373,3 +375,118 @@ def test_get_memories_filter_by_alias(project_registry): assert len(via_alias) == 2 assert len(via_canonical) == 2 assert {m.content for m in via_alias} == {"m1", "m2"} + + +# --- token-overlap matcher: unit tests ------------------------------------- + + +def test_stem_folds_s_ed_ing(): + assert _stem("prefers") == "prefer" + assert _stem("preferred") == "prefer" + assert _stem("services") == "service" + assert _stem("processing") == "process" + # Short words must not be over-stripped + assert _stem("red") == "red" # 3 chars, don't strip "ed" + assert _stem("bus") == "bus" # 3 chars, don't strip "s" + assert _stem("sing") == "sing" # 4 chars, don't strip "ing" + assert _stem("being") == "being" # 5 chars, "ing" strip leaves "be" (2) — too short + + +def test_tokenize_removes_stop_words(): + tokens = _tokenize("the quick brown fox jumps over the lazy dog") + assert "the" not in tokens + assert "quick" in tokens + assert "brown" in tokens + assert "fox" in tokens + assert "dog" in tokens + # "over" has len 4, not a stop word → kept (stemmed: "over") + assert "over" in tokens + + +# --- token-overlap matcher: paraphrase matching ---------------------------- + + +def test_reinforce_matches_paraphrase_prefers_vs_prefer(tmp_data_dir): + """The canonical rebase case from phase9-first-real-use.md.""" + init_db() + mem = create_memory( + memory_type="preference", + content="prefers rebase-based workflows because history stays linear", + confidence=0.5, + ) + interaction = _make_interaction( + response=( + "I prefer rebase-based workflows because the history stays " + "linear and reviewers have an easier time." + ), + ) + results = reinforce_from_interaction(interaction) + assert any(r.memory_id == mem.id for r in results) + + +def test_reinforce_matches_paraphrase_with_articles_and_ed(tmp_data_dir): + init_db() + mem = create_memory( + memory_type="preference", + content="preferred structured logging across all backend services", + confidence=0.5, + ) + interaction = _make_interaction( + response=( + "I set up structured logging across all the backend services, " + "which the team prefers for consistency." + ), + ) + results = reinforce_from_interaction(interaction) + assert any(r.memory_id == mem.id for r in results) + + +def test_reinforce_rejects_low_overlap(tmp_data_dir): + init_db() + mem = create_memory( + memory_type="preference", + content="always uses Python for data processing scripts", + confidence=0.5, + ) + interaction = _make_interaction( + response=( + "The CI pipeline runs on Node.js and deploys to Kubernetes " + "using Helm charts." + ), + ) + results = reinforce_from_interaction(interaction) + assert all(r.memory_id != mem.id for r in results) + + +def test_reinforce_matches_at_70_percent_threshold(tmp_data_dir): + """Exactly 7 of 10 content tokens present → should match.""" + init_db() + # After stop-word removal and stemming, this has 10 tokens: + # alpha, bravo, charlie, delta, echo, foxtrot, golf, hotel, india, juliet + mem = create_memory( + memory_type="preference", + content="alpha bravo charlie delta echo foxtrot golf hotel india juliet", + confidence=0.5, + ) + # Echo 7 of 10 tokens (70%) plus some noise + interaction = _make_interaction( + response="alpha bravo charlie delta echo foxtrot golf noise words here", + ) + results = reinforce_from_interaction(interaction) + assert any(r.memory_id == mem.id for r in results) + + +def test_reinforce_rejects_below_70_percent(tmp_data_dir): + """Only 6 of 10 content tokens present (60%) → should NOT match.""" + init_db() + mem = create_memory( + memory_type="preference", + content="alpha bravo charlie delta echo foxtrot golf hotel india juliet", + confidence=0.5, + ) + # Echo 6 of 10 tokens (60%) plus noise + interaction = _make_interaction( + response="alpha bravo charlie delta echo foxtrot noise words here only", + ) + results = reinforce_from_interaction(interaction) + assert all(r.memory_id != mem.id for r in results)