diff --git a/src/atocore/engineering/wiki.py b/src/atocore/engineering/wiki.py index 3ad8d36..057d32d 100644 --- a/src/atocore/engineering/wiki.py +++ b/src/atocore/engineering/wiki.py @@ -240,10 +240,30 @@ def render_homepage() -> str: # Quick stats all_entities = get_entities(limit=500) - all_memories = get_memories(active_only=True, limit=500) + all_memories_raw = get_memories(active_only=True, limit=500) + # Partition real knowledge from ambient provenance so counts are honest. + # Each memory lands in exactly one bucket (low-signal takes priority). + all_memories: list = [] + akc_session_count = 0 + low_signal_count = 0 + for _m in all_memories_raw: + if _is_low_signal_memory(_m): + low_signal_count += 1 + elif _is_akc_session_memory(_m): + akc_session_count += 1 + else: + all_memories.append(_m) pending = get_memories(status="candidate", limit=500) lines.append('

System

') - lines.append(f'

{len(all_entities)} entities · {len(all_memories)} active memories · {len(projects)} projects

') + lines.append( + f'

{len(all_entities)} entities · {len(all_memories)} memories · ' + f'{len(projects)} projects' + + (f' · {akc_session_count} AKC session snapshots' + + (f", {low_signal_count} low-signal hidden" if low_signal_count else "") + + '' + if akc_session_count or low_signal_count else '') + + '

import re as _re

_WIKILINK_PATTERN = _re.compile(r"\[\[([^\[\]|]+?)(?:\|([^\[\]]+?))?\]\]")

# ----------------------------------------------------------------- signal/noise
# Memories whose content contains one of these substrings are low-signal
# ambient artefacts — they inflate lists on the homepage and domain pages
# without being informative. They remain in the DB (for provenance / audit)
# but are filtered from default browsing surfaces.
# NOTE(review): an `include_low_signal=True` page query param is the intended
# opt-in to surface them — confirm the page handlers actually honour it.
_LOW_SIGNAL_CONTENT_PATTERNS = (
    "(no transcript)",                    # silent-mic AKC sessions
    "synthetic AKC integration",          # E2E test pollution
    "AKC-E2E-",                           # E2E test prefix in content
    "AKC-IMG-TEST-",                      # image-upload test prefix
    "IMG integration test — synthetic",   # E2E narrative header
)

# AKC voice-session ambient memories start with this header — they're
# provenance records, not knowledge. Collapse them behind a link on domain
# pages instead of rendering each inline.
_AKC_SESSION_HEADER = "AKC voice session "


def _is_low_signal_memory(mem) -> bool:
    """Return True for memories whose content is known ambient/test pollution.

    ``mem`` is any object with an optional ``content`` attribute. A memory is
    low-signal when its content is missing, empty or whitespace-only, or when
    it contains any substring listed in ``_LOW_SIGNAL_CONTENT_PATTERNS``.
    """
    content = getattr(mem, "content", "") or ""
    # Whitespace-only content carries no more information than empty content,
    # so both are treated as noise.
    if not content.strip():
        return True
    return any(pattern in content for pattern in _LOW_SIGNAL_CONTENT_PATTERNS)


def _is_akc_session_memory(mem) -> bool:
    """Return True for AKC voice-session ambient snapshots.

    These have value as provenance but shouldn't clutter topical listings.
    ``mem`` is any object with optional ``content`` and ``domain_tags``
    attributes. A memory matches when either:

    * its tags include ``"voice"`` together with ``"session"`` or ``"akc"``
      (compared case-insensitively, ``None`` entries ignored), or
    * its content starts with ``_AKC_SESSION_HEADER``.
    """
    content = getattr(mem, "content", "") or ""
    raw_tags = getattr(mem, "domain_tags", None) or []
    # Normalise the same way the domain page matches tags, so differently
    # cased tags ("AKC", "Voice") are not mistaken for real knowledge.
    tags = {(t or "").lower() for t in raw_tags}
    if "voice" in tags and tags & {"session", "akc"}:
        return True
    return content.startswith(_AKC_SESSION_HEADER)
@@ -874,8 +932,22 @@ def render_domain(tag: str) -> str: breadcrumbs=[("Wiki", "/wiki"), ("Domains", "")]) all_mems = get_memories(active_only=True, limit=500) - matching = [m for m in all_mems - if any((t or "").lower() == tag for t in (m.domain_tags or []))] + matching_all = [m for m in all_mems + if any((t or "").lower() == tag for t in (m.domain_tags or []))] + + # Partition: low-signal test pollution is hidden entirely, ambient AKC + # session memories are collapsed (shown as a count + link to + # /wiki/activity). Priority: low-signal > session > real. + matching: list = [] + akc_sessions: list = [] + hidden_low_signal = 0 + for m in matching_all: + if _is_low_signal_memory(m): + hidden_low_signal += 1 + elif _is_akc_session_memory(m): + akc_sessions.append(m) + else: + matching.append(m) # Group by project by_project: dict[str, list] = {} @@ -884,6 +956,18 @@ def render_domain(tag: str) -> str: lines = [f'

Domain: {tag}

'] lines.append(f'

{len(matching)} active memories across {len(by_project)} projects

') + if akc_sessions or hidden_low_signal: + noise_bits = [] + if akc_sessions: + noise_bits.append( + f'{len(akc_sessions)} AKC voice session snapshots' + ) + if hidden_low_signal: + noise_bits.append(f"{hidden_low_signal} low-signal memories hidden") + lines.append( + f'

' + f'Ambient provenance not listed: {" · ".join(noise_bits)}.

# -------------------------------------------------- low-signal wiki filters
# Ambient AKC session memories and test pollution shouldn't dominate domain
# pages / homepage counts. These tests lock the partitioning behaviour.

def test_domain_page_hides_empty_transcript_sessions(tmp_data_dir):
    """A silent-mic AKC session (content contains '(no transcript)') is
    counted as hidden noise instead of being listed on the domain page."""
    _init_all()

    # Real knowledge tagged "optics" — must be rendered.
    create_memory(
        "knowledge",
        "CGH null corrector supports F/1.2 asphere testing",
        project="p05", confidence=0.9, domain_tags=["optics", "cgh"],
    )

    # Silent AKC session carrying the same tag — must NOT be rendered.
    silent_session = (
        "AKC voice session abc (gen-002)\nDuration: 60s, 2 captures\n"
        "\n## Transcript\n(no transcript)\n"
    )
    create_memory(
        "episodic",
        silent_session,
        project="p05", confidence=0.7,
        domain_tags=["optics", "session", "akc", "voice"],
    )

    page = render_domain("optics")

    assert "CGH null corrector" in page
    # Either wording of the hidden-count banner is accepted.
    assert "low-signal" in page or "Ambient provenance" in page
    # The empty transcript itself never appears inline.
    assert "(no transcript)" not in page


def test_domain_page_collapses_akc_session_snapshots(tmp_data_dir):
    """AKC voice-session memories show up as one collapsed count on the
    domain page; their transcripts are not inlined."""
    _init_all()

    for idx in range(5):
        transcript = (
            f"AKC voice session session-{idx} (gen-00{idx})\nDuration: 120s, 3 captures\n"
            f"\n## Transcript\nReal transcript number {idx}"
        )
        create_memory(
            "episodic",
            transcript,
            project="p05", confidence=0.7,
            domain_tags=["optics", "session", "akc", "voice"],
        )

    page = render_domain("optics")

    # The summary line mentions the collapsed snapshots...
    assert "AKC voice session snapshots" in page
    # ...and no session transcript is pasted into the page body.
    assert "Real transcript number 0" not in page


def test_homepage_stats_exclude_ambient_memory(tmp_data_dir):
    """The homepage stats line counts only real memories and reports the
    ambient ones in a separate segment."""
    _init_all()

    # Two real memories.
    for fact in ("Real fact 1", "Real fact 2"):
        create_memory("knowledge", fact, project="p05", confidence=0.8)

    # Three ambient sessions with an actual transcript.
    for idx in range(3):
        create_memory(
            "episodic",
            f"AKC voice session s{idx} (gen-00{idx})\nReal transcript x",
            project="p05", confidence=0.7,
            domain_tags=["session", "akc", "voice"],
        )

    # One silent session — low-signal takes priority over the session bucket.
    create_memory(
        "episodic",
        "AKC voice session silent (gen-099)\nDuration: 30s, 0 captures\n"
        "\n## Transcript\n(no transcript)\n",
        project="p05", confidence=0.7,
        domain_tags=["session", "akc", "voice"],
    )

    page = render_homepage()

    assert "3 AKC session snapshots" in page
    assert "low-signal hidden" in page
    # The headline count covers the two real memories only.
    assert "2 memories" in page


def test_low_signal_predicate_catches_known_patterns():
    """Exercise both classification predicates directly on stand-in objects."""
    from dataclasses import dataclass

    from atocore.engineering.wiki import _is_akc_session_memory, _is_low_signal_memory

    @dataclass
    class FakeMemory:
        # Only the two attributes the predicates read.
        content: str = ""
        domain_tags: list = None

    # Explicit empty transcript — low signal.
    silent = FakeMemory(content="AKC voice session x\n## Transcript\n(no transcript)\n")
    assert _is_low_signal_memory(silent)

    # E2E test pollution — low signal.
    for polluted in (
        "IMG integration test — synthetic session",
        "synthetic AKC integration session",
    ):
        assert _is_low_signal_memory(FakeMemory(content=polluted))

    # Real knowledge — NOT low signal.
    assert not _is_low_signal_memory(
        FakeMemory(content="The CGH is mounted to the fold mirror via…")
    )

    # Session detection: by tags, by content header, and a negative case.
    assert _is_akc_session_memory(
        FakeMemory(content="anything", domain_tags=["session", "akc", "voice"])
    )
    assert _is_akc_session_memory(FakeMemory(content="AKC voice session abc"))
    assert not _is_akc_session_memory(
        FakeMemory(content="Real fact", domain_tags=["optics"])
    )