feat(projects): live emerging-project registration proposals

Adds GET /admin/projects/proposals?min_active=N — an on-demand companion
to the nightly scripts/detect_emerging.py cache. Reads SQL + the registry
directly so the result is always current.

Each proposal is operator-ready:
  - project_id (the literal label as captured)
  - active_count / candidate_count from current SQL
  - sample_memories: 3 most recent active memories with content preview
  - suggested_aliases: sibling labels sharing a >=4-char token
    (e.g. lead-space + lead-space-exploration-ltd + space-exploration-ltd
    cluster naturally; apm and drill stay independent)
  - guessed_ingest_root: vault:incoming/projects/<id>/

Workflow: operator hits /admin/projects/proposals to see the live "what
should I register?" view, picks aliases from the suggestions, then POSTs
to the existing /admin/projects/register-emerging.

Closes Codex's Wave 1.5 ask: "promote-to-registered-project proposal
with suggested aliases, sample memories, and guessed ingest root;
require one click." For apm at 165 active memories on prod, this is
overdue.

8 regression tests covering: registered-name (canonical + alias)
exclusion, threshold filtering, sibling clustering, short-token negative,
sample/root shape, candidate counting, param validation, sort order.

Test count: 586 -> 594.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-28 22:10:37 -04:00
parent d4ee52729c
commit e8ac8bb536
3 changed files with 319 additions and 0 deletions

View File

@@ -474,6 +474,33 @@ def api_register_emerging_project(req: RegisterEmergingRequest) -> dict:
return result
@router.get("/admin/projects/proposals")
def api_project_proposals(min_active: int = 10) -> dict:
"""Live registration proposals for unregistered projects.
Reads SQL + the registry directly, so the result is current — unlike
`/admin/dashboard.proposals.unregistered_projects` which is the
nightly cache from `scripts/detect_emerging.py`. Each proposal
includes a guessed ingest root, sibling labels suggested as aliases,
and a few sample memories so the operator can sanity-check before
POSTing to /admin/projects/register-emerging.
Query params:
min_active: minimum active-memory count for a label to surface
(default 10).
"""
from atocore.memory.service import propose_emerging_projects
if min_active < 1:
raise HTTPException(status_code=400, detail="min_active must be >= 1")
proposals = propose_emerging_projects(min_active=min_active)
return {
"proposals": proposals,
"count": len(proposals),
"min_active": min_active,
}
@router.put("/projects/{project_name}")
def api_project_update(project_name: str, req: ProjectUpdateRequest) -> dict:
"""Update an existing project registration."""

View File

@@ -424,6 +424,126 @@ def get_memory_count_summary() -> dict:
return summary
def propose_emerging_projects(min_active: int = 10) -> list[dict]:
    """Return live, on-demand registration proposals for unregistered projects.

    Differs from the nightly ``scripts/detect_emerging.py`` cache (which
    is fresh once a day and lives in ``project_state.proposals``) by
    reading current SQL and the registry directly. Each proposal is
    operator-ready: a guessed ingest root, sibling labels suggested as
    aliases, and a few sample memories so the operator can sanity-check
    the bucket before committing it.

    Args:
        min_active: minimum number of active memories required for a
            label to surface as a proposal. Defaults to 10 — anything
            smaller is too noisy to register without more signal.

    Returns:
        list of proposal dicts, sorted by active_count desc (ties broken
        by label asc):
            {
                "project_id": str,
                "active_count": int,
                "candidate_count": int,
                "suggested_aliases": list[str],
                "guessed_ingest_root": {"source": "vault", "subpath": ...},
                "sample_memories": [{id, content_preview, updated_at}, ...],
            }
    """
    from atocore.projects.registry import load_project_registry

    # Build the set of names already known to the registry (canonical +
    # aliases), lowercased. Anything in this set is "registered" and is
    # never proposed.
    registered_names: set[str] = set()
    try:
        for project in load_project_registry():
            registered_names.add(project.project_id.lower())
            for alias in project.aliases:
                registered_names.add(alias.lower())
    except Exception:
        # Fail-open: if the registry can't load, assume nothing is
        # registered and let the proposal surface everything.
        pass

    def _tokens(label: str) -> set[str]:
        # Split on '-'/'_' and keep only non-trivial tokens (>= 4 chars);
        # short fragments like "apm" or "p04" are too noisy to match on.
        parts = label.replace("_", "-").split("-")
        return {p for p in parts if len(p) >= 4}

    proposals: list[dict] = []
    # Single connection for everything. The previous version opened a
    # fresh connection per proposal for the sample query (N+1 churn);
    # reusing the outer connection is behaviorally identical and cheaper.
    with get_connection() as conn:
        # Active counts per project (excluding empty/null project — that's
        # the global bucket, not a proposal candidate).
        active_rows = conn.execute(
            "SELECT project, count(*) AS c FROM memories "
            "WHERE status = 'active' AND project IS NOT NULL AND project != '' "
            "GROUP BY project"
        ).fetchall()
        cand_rows = conn.execute(
            "SELECT project, count(*) AS c FROM memories "
            "WHERE status = 'candidate' AND project IS NOT NULL AND project != '' "
            "GROUP BY project"
        ).fetchall()
        cand_counts = {r["project"]: r["c"] for r in cand_rows}

        # Filter to unregistered labels at or above the threshold.
        unregistered: list[tuple[str, int, int]] = []  # (project, active_n, candidate_n)
        for r in active_rows:
            proj = r["project"]
            if proj.lower() in registered_names or r["c"] < min_active:
                continue
            unregistered.append((proj, r["c"], cand_counts.get(proj, 0)))

        # Sibling alias detection: two unregistered labels are siblings if
        # they share a non-trivial token. Cheap, defensible, and the
        # operator gets to veto.
        label_tokens = {label: _tokens(label) for label, _a, _c in unregistered}

        for proj, active_n, candidate_n in sorted(
            unregistered, key=lambda t: (-t[1], t[0])
        ):
            siblings = sorted(
                other
                for other in label_tokens
                if other != proj and (label_tokens[proj] & label_tokens[other])
            )
            # Sample memories: 3 most recent active rows for this label.
            sample_rows = conn.execute(
                "SELECT id, content, updated_at FROM memories "
                "WHERE status = 'active' AND project = ? "
                "ORDER BY updated_at DESC LIMIT 3",
                (proj,),
            ).fetchall()
            samples = [
                {
                    "id": r["id"],
                    "content_preview": (r["content"] or "")[:160],
                    "updated_at": r["updated_at"],
                }
                for r in sample_rows
            ]
            proposals.append(
                {
                    "project_id": proj,
                    "active_count": active_n,
                    "candidate_count": candidate_n,
                    "suggested_aliases": siblings,
                    "guessed_ingest_root": {
                        "source": "vault",
                        "subpath": f"incoming/projects/{proj}/",
                    },
                    "sample_memories": samples,
                }
            )
    return proposals
def update_memory(
memory_id: str,
content: str | None = None,

View File

@@ -0,0 +1,172 @@
"""Wave 1.5 — live emerging-project registration proposals.
The nightly `scripts/detect_emerging.py` writes a stale cache to
`project_state.proposals.unregistered_projects`. This endpoint provides
the on-demand alternative that operators can hit before deciding which
unregistered project to register via `/admin/projects/register-emerging`.
"""
import json
import pytest
from fastapi.testclient import TestClient
import atocore.config as config
from atocore.main import app
from atocore.memory.service import create_memory
from atocore.models.database import init_db
@pytest.fixture
def env(tmp_data_dir, tmp_path, monkeypatch):
    """Fresh DB plus a registry holding exactly one registered project
    (p04-gigabit, aliases p04/gigabit) so tests can prove the proposals
    endpoint excludes registered names."""
    registry = {
        "projects": [
            {
                "id": "p04-gigabit",
                "aliases": ["p04", "gigabit"],
                "description": "test",
                "ingest_roots": [
                    {"source": "vault", "subpath": "incoming/projects/p04-gigabit"}
                ],
            }
        ]
    }
    registry_path = tmp_path / "registry.json"
    registry_path.write_text(json.dumps(registry), encoding="utf-8")
    monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_path))
    config.settings = config.Settings()
    init_db()
    yield tmp_data_dir
def test_proposals_excludes_registered_project_and_its_aliases(env):
    """Memories filed under a registered canonical id — or under any of
    its aliases — must never surface as a registration proposal."""
    # Registered: p04-gigabit (aliases p04, gigabit)
    for i in range(15):
        create_memory("knowledge", f"p04 fact {i}", project="p04-gigabit")
    for i in range(15):
        create_memory("knowledge", f"alias fact {i}", project="p04")  # alias
    # Unregistered label above the threshold
    for i in range(12):
        create_memory("knowledge", f"apm fact {i}", project="apm")
    resp = TestClient(app).get("/admin/projects/proposals?min_active=10")
    proposed = [p["project_id"] for p in resp.json()["proposals"]]
    assert "apm" in proposed
    assert "p04-gigabit" not in proposed
    assert "p04" not in proposed
    assert "gigabit" not in proposed
def test_proposals_threshold_filters_low_count_labels(env):
    """Labels with fewer than min_active active memories are filtered out."""
    create_memory("knowledge", "single one-off", project="discrawl")
    for i in range(3):
        create_memory("knowledge", f"low-volume {i}", project="drill")
    for i in range(11):
        create_memory("knowledge", f"high-volume {i}", project="apm")
    body = TestClient(app).get("/admin/projects/proposals?min_active=10").json()
    labels = [p["project_id"] for p in body["proposals"]]
    assert "apm" in labels
    assert "drill" not in labels
    assert "discrawl" not in labels
def test_proposals_suggest_sibling_aliases_via_shared_tokens(env):
    """lead-space + lead-space-exploration-ltd + space-exploration-ltd
    share non-trivial tokens (length >= 4) and should cluster: each one
    proposes at least one of the others in suggested_aliases."""
    cluster = ("lead-space", "lead-space-exploration-ltd", "space-exploration-ltd")
    for label in cluster:
        for i in range(11):
            create_memory("knowledge", f"{label} content {i}", project=label)
    body = TestClient(app).get("/admin/projects/proposals").json()
    by_id = {p["project_id"]: p for p in body["proposals"]}
    # All three appear and each suggests at least one of the others.
    for label in cluster:
        assert label in by_id
        suggested = set(by_id[label]["suggested_aliases"])
        # Every label shares "space" (and pairs also share "exploration"
        # or "lead"), so at least one cluster sibling must be present.
        assert suggested & (set(cluster) - {label})
def test_proposals_short_token_does_not_match(env):
    """Tokens shorter than 4 characters are too noisy to match on.
    'apm' yields no usable token at all, so 'apm' and 'drill' — which
    share nothing — must not suggest each other as aliases."""
    for i in range(11):
        create_memory("knowledge", f"apm fact {i}", project="apm")
    for i in range(11):
        create_memory("knowledge", f"drill fact {i}", project="drill")
    body = TestClient(app).get("/admin/projects/proposals").json()
    by_id = {p["project_id"]: p for p in body["proposals"]}
    assert by_id["apm"]["suggested_aliases"] == []
    assert by_id["drill"]["suggested_aliases"] == []
def test_proposals_include_sample_memories_and_guessed_root(env):
    """Each proposal ships a vault ingest-root guess and up to three
    recent sample memories with content previews."""
    for i in range(11):
        create_memory("knowledge", f"sample content {i}", project="apm")
    proposals = TestClient(app).get("/admin/projects/proposals").json()["proposals"]
    apm = next(p for p in proposals if p["project_id"] == "apm")
    assert apm["active_count"] == 11
    assert apm["candidate_count"] == 0
    assert apm["guessed_ingest_root"] == {
        "source": "vault",
        "subpath": "incoming/projects/apm/",
    }
    samples = apm["sample_memories"]
    assert len(samples) == 3
    for entry in samples:
        assert entry["id"]
        assert "sample content" in entry["content_preview"]
def test_proposals_count_candidates_separately(env):
    """Candidate-status memories are tallied apart from active ones."""
    for i in range(11):
        create_memory("knowledge", f"active {i}", project="apm")
    for i in range(4):
        create_memory("knowledge", f"candidate {i}", project="apm", status="candidate")
    proposals = TestClient(app).get("/admin/projects/proposals").json()["proposals"]
    apm = next(p for p in proposals if p["project_id"] == "apm")
    assert apm["active_count"] == 11
    assert apm["candidate_count"] == 4
def test_proposals_min_active_param_validation(env):
    """min_active below 1 is rejected with a 400."""
    resp = TestClient(app).get("/admin/projects/proposals?min_active=0")
    assert resp.status_code == 400
def test_proposals_sorted_by_active_count_desc(env):
    """Proposals come back ordered by active_count, highest first."""
    for i in range(20):
        create_memory("knowledge", f"big {i}", project="apm")
    for i in range(11):
        create_memory("knowledge", f"small {i}", project="openclaw")
    body = TestClient(app).get("/admin/projects/proposals").json()
    labels = [p["project_id"] for p in body["proposals"]]
    assert labels[0] == "apm"
    assert labels[1] == "openclaw"