#!/usr/bin/env python3
"""Phase 6 C.1 — Emerging-concepts detector.

Scans active + candidate memories to surface:

1. Unregistered projects — project strings appearing on 3+ memories
   (``ATOCORE_EMERGING_PROJECT_MIN``) that aren't in the project registry.
   Surface for one-click registration.
2. Emerging categories — top 20 domain_tags by frequency
   (``ATOCORE_EMERGING_TOP_TAGS``), for "what themes are emerging in my
   work?" intelligence.
3. Reinforced transients — active memories with reference_count >= 5 AND
   valid_until set. These "were temporary but now durable"; candidates for
   valid_until extension (handled by a sibling script).

Writes results to project_state under atocore/proposals/*. Emits a warning
alert when a project reaches the 5-memory threshold
(``ATOCORE_EMERGING_ALERT_THRESHOLD``) and was not already at or above that
threshold in the previous run's snapshot, so the user gets notified without
being spammed on every run.

Usage:
    python3 scripts/detect_emerging.py [--base-url URL] [--dry-run]
"""

from __future__ import annotations

import argparse
import json
import os
import sys
from collections import Counter, defaultdict

# Make src/ importable so we can reuse service helpers.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_SRC_DIR = os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "src"))
if _SRC_DIR not in sys.path:
    sys.path.insert(0, _SRC_DIR)

PROJECT_MIN_MEMORIES = int(os.environ.get("ATOCORE_EMERGING_PROJECT_MIN", "3"))
PROJECT_ALERT_THRESHOLD = int(os.environ.get("ATOCORE_EMERGING_ALERT_THRESHOLD", "5"))
TOP_TAGS_LIMIT = int(os.environ.get("ATOCORE_EMERGING_TOP_TAGS", "20"))

# Minimum reference_count for an active memory with valid_until set to be
# reported as a "reinforced transient".
REINFORCED_MIN_REFERENCES = 5


def _registered_names(projects) -> set[str]:
    """Return the lower-cased ids and aliases of every registered project.

    Aliases are included so that a memory tagged with an alias (e.g. 'p04'
    for 'p04-gigabit') is NOT flagged as an emerging project.
    """
    names: set[str] = set()
    for project in projects:
        names.add(project.project_id.lower())
        names.update(alias.lower() for alias in project.aliases)
    return names


def _find_unregistered_projects(memories, registered, min_memories) -> list[dict]:
    """Group memories by normalized project name and report unregistered ones.

    A project is reported when its stripped, lower-cased name is non-empty,
    is not in ``registered``, and appears on at least ``min_memories``
    memories. Results are sorted by project name; each carries up to three
    sample ids and 150-character content previews.
    """
    by_project: dict[str, list] = defaultdict(list)
    for mem in memories:
        name = (mem.project or "").strip().lower()
        if name and name not in registered:
            by_project[name].append(mem)

    return [
        {
            "project": name,
            "count": len(mems),
            "sample_memory_ids": [m.id for m in mems[:3]],
            "sample_contents": [(m.content or "")[:150] for m in mems[:3]],
        }
        for name, mems in sorted(by_project.items())
        if len(mems) >= min_memories
    ]


def _top_domain_tags(memories, limit) -> list[dict]:
    """Return the ``limit`` most frequent domain tags as tag/count dicts.

    Tags are stripped and lower-cased before counting; non-string and blank
    tags are ignored.
    """
    counts: Counter = Counter()
    for mem in memories:
        for tag in mem.domain_tags or []:
            if isinstance(tag, str) and tag.strip():
                counts[tag.strip().lower()] += 1
    return [{"tag": tag, "count": count} for tag, count in counts.most_common(limit)]


def _find_reinforced_transients(memories, min_references=REINFORCED_MIN_REFERENCES) -> list[dict]:
    """Return memories that have an expiry set but are referenced often.

    A memory qualifies when its ``reference_count`` is at least
    ``min_references`` and its ``valid_until`` is a non-blank string.
    Missing attributes are treated as 0 / empty.
    """
    reinforced = []
    for mem in memories:
        ref_count = getattr(mem, "reference_count", 0) or 0
        valid_until = (getattr(mem, "valid_until", "") or "").strip()
        if ref_count >= min_references and valid_until:
            reinforced.append({
                "memory_id": mem.id,
                "reference_count": ref_count,
                "valid_until": valid_until,
                "content_preview": (mem.content or "")[:150],
                "project": mem.project or "",
            })
    return reinforced


def _load_previous_snapshot(entries) -> list:
    """Extract the previous run's unregistered-project list from state entries.

    Looks for the ``proposals`` / ``unregistered_projects_prev`` entry and
    decodes its JSON value. An unreadable or non-list value is reported on
    stderr and ignored, which yields an empty snapshot.
    """
    previous: list = []
    for entry in entries:
        if entry.category != "proposals" or entry.key != "unregistered_projects_prev":
            continue
        try:
            parsed = json.loads(entry.value)
        except (TypeError, ValueError) as exc:
            print(f"WARN: ignoring unreadable previous snapshot: {exc}", file=sys.stderr)
            continue
        if isinstance(parsed, list):
            previous = parsed
        else:
            print("WARN: ignoring previous snapshot that is not a list", file=sys.stderr)
    return previous


def _select_newly_crossed(unregistered, previous, threshold) -> list[dict]:
    """Return projects at/above ``threshold`` now that were not there before.

    The previous snapshot also contains projects that were merely above the
    (lower) reporting minimum, so membership alone must not suppress the
    alert: only previous entries whose count had already reached
    ``threshold`` count as "already alerted". Previous entries without a
    usable numeric count are treated as already alerted to avoid re-alerting
    on snapshots of unknown shape.
    """
    already_alerted = set()
    for entry in previous:
        if not isinstance(entry, dict):
            continue
        count = entry.get("count")
        if not isinstance(count, (int, float)) or count >= threshold:
            already_alerted.add(entry.get("project"))

    return [
        p for p in unregistered
        if p["count"] >= threshold and p["project"] not in already_alerted
    ]


def main() -> None:
    """Run the detector: collect, print, persist, and alert.

    Prints the full result as JSON on stdout. With ``--dry-run`` nothing is
    written to project state and no alert is emitted. Persistence and
    alerting are best-effort: failures are reported on stderr and do not
    raise.
    """
    parser = argparse.ArgumentParser(description="Detect emerging projects + categories")
    # NOTE(review): --base-url is parsed but never read in this script;
    # kept so existing invocations keep working.
    parser.add_argument("--base-url", default=os.environ.get("ATOCORE_BASE_URL", "http://127.0.0.1:8100"))
    parser.add_argument("--dry-run", action="store_true", help="Report without writing to project state")
    args = parser.parse_args()

    # Imported here, after the sys.path bootstrap at module top.
    from atocore.memory.service import get_memories
    from atocore.projects.registry import load_project_registry
    from atocore.context.project_state import set_state, get_state

    registered = _registered_names(load_project_registry())

    # Pull active + candidate memories (give ourselves a broad view)
    active = get_memories(active_only=True, limit=500)
    candidates = get_memories(status="candidate", limit=500)
    all_mems = list(active) + list(candidates)

    unregistered = _find_unregistered_projects(all_mems, registered, PROJECT_MIN_MEMORIES)
    # Only active memories feed the tag counts — candidates might be noise.
    emerging_tags = _top_domain_tags(active, TOP_TAGS_LIMIT)
    reinforced = _find_reinforced_transients(active)

    # --- Output ---
    result = {
        "unregistered_projects": unregistered,
        "emerging_categories": emerging_tags,
        "reinforced_transients": reinforced,
        "counts": {
            "active_memories": len(active),
            "candidate_memories": len(candidates),
            "unregistered_project_count": len(unregistered),
            "emerging_tag_count": len(emerging_tags),
            "reinforced_transient_count": len(reinforced),
        },
    }
    print(json.dumps(result, indent=2))

    if args.dry_run:
        return

    # --- Persist to project state (best-effort) ---
    proposals = (
        ("unregistered_projects", unregistered),
        ("emerging_categories", emerging_tags),
        ("reinforced_transients", reinforced),
    )
    try:
        for key, payload in proposals:
            set_state(
                project_name="atocore",
                category="proposals",
                key=key,
                value=json.dumps(payload),
                source="emerging detector",
            )
    except Exception as exc:
        print(f"WARN: failed to persist to project state: {exc}", file=sys.stderr)

    # --- Alert on NEW projects crossing alert threshold (best-effort) ---
    try:
        previous = _load_previous_snapshot(get_state("atocore"))
        newly_crossed = _select_newly_crossed(unregistered, previous, PROJECT_ALERT_THRESHOLD)
        if newly_crossed:
            from atocore.observability.alerts import emit_alert

            names = ", ".join(p["project"] for p in newly_crossed)
            emit_alert(
                severity="warning",
                title=f"Emerging project(s) detected: {names}",
                message=(
                    f"{len(newly_crossed)} unregistered project(s) have crossed "
                    f"the {PROJECT_ALERT_THRESHOLD}-memory threshold and may "
                    f"warrant registration: {names}. Review at /wiki or "
                    f"/admin/dashboard."
                ),
                context={"projects": [p["project"] for p in newly_crossed]},
            )

        # Persist this run's list for next-run comparison
        set_state(
            project_name="atocore",
            category="proposals",
            key="unregistered_projects_prev",
            value=json.dumps(unregistered),
            source="emerging detector",
        )
    except Exception as exc:
        print(f"WARN: alert/state write failed: {exc}", file=sys.stderr)


if __name__ == "__main__":
    main()