@router.get("/admin/projects/proposals")
def api_project_proposals(min_active: int = 10) -> dict:
    """Live registration proposals for unregistered projects.

    Reads SQL + the registry directly, so the result is current — unlike
    `/admin/dashboard.proposals.unregistered_projects` which is the
    nightly cache from `scripts/detect_emerging.py`. Each proposal
    includes a guessed ingest root, sibling labels suggested as aliases,
    and a few sample memories so the operator can sanity-check before
    POSTing to /admin/projects/register-emerging.

    Query params:
        min_active: minimum active-memory count for a label to surface
            (default 10).

    Raises:
        HTTPException: 400 when min_active < 1.
    """
    # Local import keeps the service layer out of the router's import cycle.
    from atocore.memory.service import propose_emerging_projects

    if min_active < 1:
        raise HTTPException(status_code=400, detail="min_active must be >= 1")
    proposals = propose_emerging_projects(min_active=min_active)
    return {
        "proposals": proposals,
        "count": len(proposals),
        "min_active": min_active,
    }


def propose_emerging_projects(min_active: int = 10) -> list[dict]:
    """Return live, on-demand registration proposals for unregistered projects.

    Differs from the nightly ``scripts/detect_emerging.py`` cache (which
    is fresh once a day and lives in ``project_state.proposals``) by
    reading current SQL and the registry directly. Each proposal is
    operator-ready: a guessed ingest root, sibling labels suggested as
    aliases, and a few sample memories so the operator can sanity-check
    the bucket before committing it.

    Args:
        min_active: minimum number of active memories required for a
            label to surface as a proposal. Defaults to 10 — anything
            smaller is too noisy to register without more signal.

    Returns:
        list of proposal dicts, sorted by active_count desc:
            {
                "project_id": str,
                "active_count": int,
                "candidate_count": int,
                "suggested_aliases": list[str],
                "guessed_ingest_root": {"source": "vault", "subpath": ...},
                "sample_memories": [{id, content_preview, updated_at}, ...],
            }
    """
    from atocore.projects.registry import load_project_registry

    # Build the set of names already known to the registry (canonical +
    # aliases), lowercased. Anything in this set is "registered" and not
    # a proposal.
    registered_names: set[str] = set()
    try:
        for project in load_project_registry():
            registered_names.add(project.project_id.lower())
            for alias in project.aliases:
                registered_names.add(alias.lower())
    except Exception:
        # Fail-open: if the registry can't load, assume nothing is
        # registered and let the proposal surface everything.
        pass

    # One connection for everything — the original re-entered
    # get_connection() inside the per-proposal loop below, an N+1
    # connection pattern. Same rows come back either way.
    with get_connection() as conn:
        # Active counts per project (excluding empty/null project — that's
        # the global bucket, not a proposal candidate).
        active_rows = conn.execute(
            "SELECT project, count(*) AS c FROM memories "
            "WHERE status = 'active' AND project IS NOT NULL AND project != '' "
            "GROUP BY project"
        ).fetchall()
        cand_rows = conn.execute(
            "SELECT project, count(*) AS c FROM memories "
            "WHERE status = 'candidate' AND project IS NOT NULL AND project != '' "
            "GROUP BY project"
        ).fetchall()

        cand_counts = {r["project"]: r["c"] for r in cand_rows}

        # Filter to unregistered labels above threshold.
        unregistered: list[tuple[str, int, int]] = []  # (project, active_n, candidate_n)
        for r in active_rows:
            proj = r["project"]
            if proj.lower() in registered_names:
                continue
            if r["c"] < min_active:
                continue
            unregistered.append((proj, r["c"], cand_counts.get(proj, 0)))

        # Sibling alias detection: two unregistered labels are siblings if
        # they share a non-trivial token (length >= 4 after splitting on
        # '-' and '_'). Cheap, defensible, and the operator gets to veto.
        def _tokens(label: str) -> set[str]:
            parts = label.replace("_", "-").split("-")
            return {p for p in parts if len(p) >= 4}

        label_tokens = {label: _tokens(label) for label, _a, _c in unregistered}

        proposals: list[dict] = []
        for proj, active_n, candidate_n in sorted(
            unregistered, key=lambda t: (-t[1], t[0])
        ):
            siblings = sorted(
                other
                for other in label_tokens
                if other != proj and (label_tokens[proj] & label_tokens[other])
            )

            # Sample memories: top 3 active by updated_at desc, fetched on
            # the already-open connection.
            sample_rows = conn.execute(
                "SELECT id, content, updated_at FROM memories "
                "WHERE status = 'active' AND project = ? "
                "ORDER BY updated_at DESC LIMIT 3",
                (proj,),
            ).fetchall()
            samples = [
                {
                    "id": r["id"],
                    "content_preview": (r["content"] or "")[:160],
                    "updated_at": r["updated_at"],
                }
                for r in sample_rows
            ]

            proposals.append(
                {
                    "project_id": proj,
                    "active_count": active_n,
                    "candidate_count": candidate_n,
                    "suggested_aliases": siblings,
                    "guessed_ingest_root": {
                        "source": "vault",
                        "subpath": f"incoming/projects/{proj}/",
                    },
                    "sample_memories": samples,
                }
            )
    return proposals
+""" + +import json + +import pytest +from fastapi.testclient import TestClient + +import atocore.config as config +from atocore.main import app +from atocore.memory.service import create_memory +from atocore.models.database import init_db + + +@pytest.fixture +def env(tmp_data_dir, tmp_path, monkeypatch): + """Fresh DB + a registry holding a single registered project so we + can prove the proposals endpoint excludes registered names.""" + registry_path = tmp_path / "registry.json" + registry_path.write_text( + json.dumps( + { + "projects": [ + { + "id": "p04-gigabit", + "aliases": ["p04", "gigabit"], + "description": "test", + "ingest_roots": [ + {"source": "vault", "subpath": "incoming/projects/p04-gigabit"} + ], + } + ] + } + ), + encoding="utf-8", + ) + monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_path)) + config.settings = config.Settings() + init_db() + yield tmp_data_dir + + +def test_proposals_excludes_registered_project_and_its_aliases(env): + """Memories tagged on a registered canonical id or any of its + aliases must not appear as a registration proposal.""" + # Registered: p04-gigabit (aliases p04, gigabit) + for i in range(15): + create_memory("knowledge", f"p04 fact {i}", project="p04-gigabit") + for i in range(15): + create_memory("knowledge", f"alias fact {i}", project="p04") # alias + + # Unregistered, above threshold + for i in range(12): + create_memory("knowledge", f"apm fact {i}", project="apm") + + client = TestClient(app) + body = client.get("/admin/projects/proposals?min_active=10").json() + ids = [p["project_id"] for p in body["proposals"]] + assert "apm" in ids + assert "p04-gigabit" not in ids + assert "p04" not in ids + assert "gigabit" not in ids + + +def test_proposals_threshold_filters_low_count_labels(env): + create_memory("knowledge", "single one-off", project="discrawl") + for i in range(3): + create_memory("knowledge", f"low-volume {i}", project="drill") + for i in range(11): + create_memory("knowledge", 
f"high-volume {i}", project="apm") + + client = TestClient(app) + proposals = client.get("/admin/projects/proposals?min_active=10").json()["proposals"] + ids = [p["project_id"] for p in proposals] + assert "apm" in ids + assert "drill" not in ids + assert "discrawl" not in ids + + +def test_proposals_suggest_sibling_aliases_via_shared_tokens(env): + """Lead-space + lead-space-exploration-ltd + space-exploration-ltd + should cluster: each proposes the others as suggested_aliases via + shared non-trivial tokens (length >= 4).""" + for label in ("lead-space", "lead-space-exploration-ltd", "space-exploration-ltd"): + for i in range(11): + create_memory("knowledge", f"{label} content {i}", project=label) + + client = TestClient(app) + proposals = {p["project_id"]: p for p in client.get("/admin/projects/proposals").json()["proposals"]} + + # All three appear and each suggests at least one of the others + for label in ("lead-space", "lead-space-exploration-ltd", "space-exploration-ltd"): + assert label in proposals + siblings = set(proposals[label]["suggested_aliases"]) + # Every label shares "space" (and others share "exploration"/"lead") + # so at least one sibling must be present. + assert siblings & {"lead-space", "lead-space-exploration-ltd", "space-exploration-ltd"} - {label} + + +def test_proposals_short_token_does_not_match(env): + """Two-or-three-letter tokens are too noisy to suggest aliases on. + 'apm' (3 chars) and 'apm-fpga' (3 + 4) share 'apm' (3 chars) — the + 'apm' token alone is too short, but 'fpga' (4) is long enough so + the match comes from the longer token. 
We test the negative: 'apm' + and 'drill' must NOT be siblings.""" + for i in range(11): + create_memory("knowledge", f"apm fact {i}", project="apm") + for i in range(11): + create_memory("knowledge", f"drill fact {i}", project="drill") + + client = TestClient(app) + proposals = {p["project_id"]: p for p in client.get("/admin/projects/proposals").json()["proposals"]} + assert proposals["apm"]["suggested_aliases"] == [] + assert proposals["drill"]["suggested_aliases"] == [] + + +def test_proposals_include_sample_memories_and_guessed_root(env): + for i in range(11): + create_memory("knowledge", f"sample content {i}", project="apm") + + client = TestClient(app) + body = client.get("/admin/projects/proposals").json() + apm = next(p for p in body["proposals"] if p["project_id"] == "apm") + assert apm["active_count"] == 11 + assert apm["candidate_count"] == 0 + assert apm["guessed_ingest_root"] == { + "source": "vault", + "subpath": "incoming/projects/apm/", + } + assert len(apm["sample_memories"]) == 3 + for s in apm["sample_memories"]: + assert s["id"] + assert "sample content" in s["content_preview"] + + +def test_proposals_count_candidates_separately(env): + for i in range(11): + create_memory("knowledge", f"active {i}", project="apm") + for i in range(4): + create_memory("knowledge", f"candidate {i}", project="apm", status="candidate") + + client = TestClient(app) + apm = next( + p for p in client.get("/admin/projects/proposals").json()["proposals"] + if p["project_id"] == "apm" + ) + assert apm["active_count"] == 11 + assert apm["candidate_count"] == 4 + + +def test_proposals_min_active_param_validation(env): + client = TestClient(app) + r = client.get("/admin/projects/proposals?min_active=0") + assert r.status_code == 400 + + +def test_proposals_sorted_by_active_count_desc(env): + for i in range(20): + create_memory("knowledge", f"big {i}", project="apm") + for i in range(11): + create_memory("knowledge", f"small {i}", project="openclaw") + + client = 
TestClient(app) + proposals = client.get("/admin/projects/proposals").json()["proposals"] + ids = [p["project_id"] for p in proposals] + assert ids[0] == "apm" + assert ids[1] == "openclaw"