201 lines
7.2 KiB
Python
201 lines
7.2 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""Phase 6 C.1 — Emerging-concepts detector.
|
||
|
|
|
||
|
|
Scans active + candidate memories to surface:

1. Unregistered projects — project strings appearing on 3+ memories
   (default; ATOCORE_EMERGING_PROJECT_MIN) that aren't in the project
   registry. Surface for one-click registration.
2. Emerging categories — top 20 domain_tags by frequency (default;
   ATOCORE_EMERGING_TOP_TAGS), counted over active memories only, for
   "what themes are emerging in my work?" intelligence.
3. Reinforced transients — active memories with reference_count >= 5
   AND valid_until set. These "were temporary but now durable";
   candidates for valid_until extension (handled by a sibling script).

Writes results to project_state under atocore/proposals/*. Emits a
warning alert the FIRST time a project crosses the 5-memory threshold
(default; ATOCORE_EMERGING_ALERT_THRESHOLD), so the user gets notified
without being spammed on every run.

Usage:
    python3 scripts/detect_emerging.py [--base-url URL] [--dry-run]
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import sys
|
||
|
|
from collections import Counter, defaultdict
|
||
|
|
|
||
|
|
# Put the sibling src/ directory on sys.path so the service helpers
# imported inside main() can be resolved when this file runs as a script.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_SRC_DIR = os.path.abspath(os.path.join(_SCRIPT_DIR, "..", "src"))
if _SRC_DIR not in sys.path:
    sys.path.insert(0, _SRC_DIR)


def _int_from_env(var_name: str, fallback: str) -> int:
    """Return environment variable *var_name* parsed as an int.

    *fallback* is the string used when the variable is unset. A value that
    is not a valid integer literal raises ValueError at import time.
    """
    return int(os.environ.get(var_name, fallback))


# Minimum number of memories before an unregistered project is reported.
PROJECT_MIN_MEMORIES = _int_from_env("ATOCORE_EMERGING_PROJECT_MIN", "3")
# Memory count at which an unregistered project triggers an alert.
PROJECT_ALERT_THRESHOLD = _int_from_env("ATOCORE_EMERGING_ALERT_THRESHOLD", "5")
# How many of the most frequent domain tags are reported.
TOP_TAGS_LIMIT = _int_from_env("ATOCORE_EMERGING_TOP_TAGS", "20")
|
||
|
|
|
||
|
|
|
||
|
|
def _registered_project_names(projects) -> set:
    """Return the lowercased ids and aliases of every project in *projects*.

    Aliases are included so that a memory tagged with an alias (e.g. 'p04'
    for 'p04-gigabit') is not reported as an unregistered project.
    """
    names = set()
    for project in projects:
        names.add(project.project_id.lower())
        names.update(alias.lower() for alias in project.aliases)
    return names


def _find_unregistered_projects(memories, registered, min_memories: int) -> list:
    """Group *memories* by normalized project name and report unknown ones.

    A project is reported when its stripped, lowercased name is non-empty,
    is not in *registered*, and appears on at least *min_memories* memories.
    Entries are sorted by project name and carry up to three sample ids and
    150-character content previews.
    """
    grouped = defaultdict(list)
    for mem in memories:
        name = (mem.project or "").strip().lower()
        if name and name not in registered:
            grouped[name].append(mem)

    return [
        {
            "project": name,
            "count": len(mems),
            "sample_memory_ids": [m.id for m in mems[:3]],
            "sample_contents": [(m.content or "")[:150] for m in mems[:3]],
        }
        for name, mems in sorted(grouped.items())
        if len(mems) >= min_memories
    ]


def _rank_domain_tags(memories, limit: int) -> list:
    """Return the *limit* most frequent domain tags across *memories*.

    Tags are stripped and lowercased before counting; non-string and blank
    tags are ignored.
    """
    counts = Counter(
        tag.strip().lower()
        for mem in memories
        for tag in (mem.domain_tags or [])
        if isinstance(tag, str) and tag.strip()
    )
    return [{"tag": tag, "count": cnt} for tag, cnt in counts.most_common(limit)]


def _find_reinforced_transients(memories, min_references: int = 5) -> list:
    """Return memories referenced at least *min_references* times that still expire.

    A memory qualifies when its ``reference_count`` reaches the minimum and
    its ``valid_until`` is a non-blank string; missing attributes count as
    0 / empty.
    """
    found = []
    for mem in memories:
        ref_count = getattr(mem, "reference_count", 0) or 0
        valid_until = (getattr(mem, "valid_until", "") or "").strip()
        if ref_count >= min_references and valid_until:
            found.append({
                "memory_id": mem.id,
                "reference_count": ref_count,
                "valid_until": valid_until,
                "content_preview": (mem.content or "")[:150],
                "project": mem.project or "",
            })
    return found


def _load_previous_snapshot(state_entries) -> list:
    """Extract the previous run's unregistered-project list from state entries.

    Looks for category "proposals" / key "unregistered_projects_prev". An
    entry whose value is not valid JSON, or does not decode to a list, is
    ignored (with a warning on stderr for undecodable values) so a corrupt
    snapshot cannot block alerting or the snapshot rewrite.
    """
    snapshot = []
    for entry in state_entries:
        if entry.category != "proposals" or entry.key != "unregistered_projects_prev":
            continue
        try:
            loaded = json.loads(entry.value)
        except (TypeError, ValueError) as exc:
            print(
                f"WARN: ignoring unreadable unregistered_projects_prev snapshot: {exc}",
                file=sys.stderr,
            )
            continue
        if isinstance(loaded, list):
            snapshot = loaded
    return snapshot


def _newly_crossed_projects(current, previous, threshold: int) -> list:
    """Return entries of *current* that reached *threshold* since *previous*.

    A project counts as already alerted only if its count in the previous
    snapshot was itself at or above *threshold*. Merely being listed before
    (at a lower count) must not suppress the alert, otherwise a project that
    grows gradually past the threshold is never reported.
    """
    already_alerted = set()
    for entry in previous:
        if not isinstance(entry, dict):
            continue
        prev_count = entry.get("count")
        if isinstance(prev_count, int) and prev_count >= threshold:
            already_alerted.add(entry.get("project"))

    return [
        p for p in current
        if p["count"] >= threshold and p["project"] not in already_alerted
    ]


def _store_proposal(set_state, key: str, payload) -> None:
    """Write *payload* as JSON under project "atocore", category "proposals"."""
    set_state(
        project_name="atocore",
        category="proposals",
        key=key,
        value=json.dumps(payload),
        source="emerging detector",
    )


def main() -> None:
    """Run the emerging-concepts scan, print the report, and persist it.

    Loads the project registry plus up to 500 active and 500 candidate
    memories, derives the unregistered-project, emerging-tag, and
    reinforced-transient lists, and prints them as JSON on stdout.

    With ``--dry-run`` the function returns after printing. Otherwise the
    three lists are stored in project state, a warning alert is emitted for
    unregistered projects that reached PROJECT_ALERT_THRESHOLD since the
    previous run, and the current list is stored as the snapshot for the
    next comparison. Failures while persisting or alerting are reported on
    stderr and do not raise.
    """
    parser = argparse.ArgumentParser(description="Detect emerging projects + categories")
    # NOTE(review): --base-url is accepted but never read in this function;
    # kept so existing invocations keep working.
    parser.add_argument("--base-url", default=os.environ.get("ATOCORE_BASE_URL", "http://127.0.0.1:8100"))
    parser.add_argument("--dry-run", action="store_true", help="Report without writing to project state")
    args = parser.parse_args()

    # Project imports stay function-local, as in the original script.
    from atocore.memory.service import get_memories
    from atocore.projects.registry import load_project_registry
    from atocore.context.project_state import set_state, get_state

    registered = _registered_project_names(load_project_registry())

    # Pull active + candidate memories (give ourselves a broad view)
    active = get_memories(active_only=True, limit=500)
    candidates = get_memories(status="candidate", limit=500)

    unregistered = _find_unregistered_projects(
        list(active) + list(candidates), registered, PROJECT_MIN_MEMORIES
    )
    # Tags and transients use active memories only — candidates might be noise.
    emerging_tags = _rank_domain_tags(active, TOP_TAGS_LIMIT)
    reinforced = _find_reinforced_transients(active)

    result = {
        "unregistered_projects": unregistered,
        "emerging_categories": emerging_tags,
        "reinforced_transients": reinforced,
        "counts": {
            "active_memories": len(active),
            "candidate_memories": len(candidates),
            "unregistered_project_count": len(unregistered),
            "emerging_tag_count": len(emerging_tags),
            "reinforced_transient_count": len(reinforced),
        },
    }
    print(json.dumps(result, indent=2))

    if args.dry_run:
        return

    # --- Persist to project state (best effort; first failure skips the rest) ---
    try:
        for key, payload in (
            ("unregistered_projects", unregistered),
            ("emerging_categories", emerging_tags),
            ("reinforced_transients", reinforced),
        ):
            _store_proposal(set_state, key, payload)
    except Exception as exc:
        print(f"WARN: failed to persist to project state: {exc}", file=sys.stderr)

    # --- Alert on NEW projects crossing alert threshold ---
    try:
        previous = _load_previous_snapshot(get_state("atocore"))
        newly_crossed = _newly_crossed_projects(
            unregistered, previous, PROJECT_ALERT_THRESHOLD
        )
        if newly_crossed:
            from atocore.observability.alerts import emit_alert

            names = ", ".join(p["project"] for p in newly_crossed)
            emit_alert(
                severity="warning",
                title=f"Emerging project(s) detected: {names}",
                message=(
                    f"{len(newly_crossed)} unregistered project(s) have crossed "
                    f"the {PROJECT_ALERT_THRESHOLD}-memory threshold and may "
                    f"warrant registration: {names}. Review at /wiki or "
                    f"/admin/dashboard."
                ),
                context={"projects": [p["project"] for p in newly_crossed]},
            )

        # Persist this run's list for next-run comparison. Skipped if the
        # alert above raised, so the next run retries the alert.
        _store_proposal(set_state, "unregistered_projects_prev", unregistered)
    except Exception as exc:
        print(f"WARN: alert/state write failed: {exc}", file=sys.stderr)
|
||
|
|
|
||
|
|
|
||
|
|
# Run the detector only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|