feat(projects): Wave 1.5 — live emerging-project registration proposals

GET /admin/projects/proposals?min_active=N — on-demand companion to the
nightly scripts/detect_emerging.py cache. Reads SQL + the registry
directly so the result is current.

Each proposal:
- project_id (literal label as captured)
- active_count / candidate_count from current SQL
- sample_memories: 3 most recent active rows with content preview
- suggested_aliases: sibling labels sharing a >=4-char token,
  case-insensitive (lead-space + lead-space-exploration-ltd +
  space-exploration-ltd cluster; apm and apm-fpga do NOT cluster via
  the 3-char 'apm')
- guessed_ingest_root: vault:incoming/projects/<id>/

Workflow: hit /admin/projects/proposals to see "what should I
register?", then POST to existing /admin/projects/register-emerging.

For prod: apm has 165 active memories, openclaw has 17,
hydrotech-mining variants combine to 13. apm is overdue.

Closes Codex's prior P2 from the state-of-service review. Reviewed by
Codex on tip e8ac8bb (verdict GO); two follow-on improvements (stronger
negative-clustering test + case-insensitive tokens) folded into f70fa6b.

10 regression tests covering: registered canonical/alias exclusion,
threshold filtering, sibling clustering, short-token negative,
case-insensitive clustering, registered-token-leak guard, sample shape,
candidate counting, param validation, sort order.

Test count: 586 -> 596.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 22:16:18 -04:00
parent d4ee52729c
commit b69d2c7088
3 changed files with 364 additions and 0 deletions
--- a/tests/test_emerging_project_proposals.py
+++ b/tests/test_emerging_project_proposals.py
@@ -0,0 +1,217 @@
+"""Wave 1.5 — live emerging-project registration proposals.
+
+The nightly `scripts/detect_emerging.py` writes a stale cache to
+`project_state.proposals.unregistered_projects`. This endpoint provides
+the on-demand alternative that operators can hit before deciding which
+unregistered project to register via `/admin/projects/register-emerging`.
+"""
+
+import json
+
+import pytest
+from fastapi.testclient import TestClient
+
+import atocore.config as config
+from atocore.main import app
+from atocore.memory.service import create_memory
+from atocore.models.database import init_db
+
+
+@pytest.fixture
+def env(tmp_data_dir, tmp_path, monkeypatch):
+    """Provide a fresh DB plus a registry holding one registered project.
+
+    The registry file written under ``tmp_path`` registers
+    ``p04-gigabit`` (aliases ``p04`` and ``gigabit``) so tests can prove
+    the proposals endpoint excludes registered names. Yields
+    ``tmp_data_dir``.
+    """
+    registered_project = {
+        "id": "p04-gigabit",
+        "aliases": ["p04", "gigabit"],
+        "description": "test",
+        "ingest_roots": [
+            {"source": "vault", "subpath": "incoming/projects/p04-gigabit"}
+        ],
+    }
+    registry_file = tmp_path / "registry.json"
+    registry_file.write_text(
+        json.dumps({"projects": [registered_project]}),
+        encoding="utf-8",
+    )
+
+    # Point the app at the temp registry, then rebuild the settings object
+    # (presumably so it re-reads the environment) before creating the DB.
+    monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_file))
+    config.settings = config.Settings()
+    init_db()
+    yield tmp_data_dir
+
+
+def test_proposals_excludes_registered_project_and_its_aliases(env):
+    """A registered canonical id and its aliases never become proposals.
+
+    Seeds memories under the registered ``p04-gigabit`` and its alias
+    ``p04``, plus an unregistered ``apm`` label above the threshold, and
+    checks that ``apm`` is proposed while no registered name is.
+    """
+    # Registered canonical id first, then one of its aliases.
+    for n in range(15):
+        create_memory("knowledge", f"p04 fact {n}", project="p04-gigabit")
+    for n in range(15):
+        create_memory("knowledge", f"alias fact {n}", project="p04")
+
+    # Unregistered label with 12 memories, above min_active=10.
+    for n in range(12):
+        create_memory("knowledge", f"apm fact {n}", project="apm")
+
+    response = TestClient(app).get("/admin/projects/proposals?min_active=10")
+    proposed = {entry["project_id"] for entry in response.json()["proposals"]}
+    assert "apm" in proposed
+    # Neither the canonical id nor any alias (seeded or not) may appear.
+    assert proposed.isdisjoint({"p04-gigabit", "p04", "gigabit"})
+
+
+def test_proposals_threshold_filters_low_count_labels(env):
+    """Labels with fewer memories than ``min_active`` are not proposed."""
+    create_memory("knowledge", "single one-off", project="discrawl")
+    for n in range(3):
+        create_memory("knowledge", f"low-volume {n}", project="drill")
+    for n in range(11):
+        create_memory("knowledge", f"high-volume {n}", project="apm")
+
+    response = TestClient(app).get("/admin/projects/proposals?min_active=10")
+    proposed = {entry["project_id"] for entry in response.json()["proposals"]}
+    # Only the 11-memory label clears the threshold of 10.
+    assert "apm" in proposed
+    assert "drill" not in proposed
+    assert "discrawl" not in proposed
+
+
+def test_proposals_suggest_sibling_aliases_via_shared_tokens(env):
+    """Labels sharing a non-trivial token propose each other as aliases.
+
+    ``lead-space``, ``lead-space-exploration-ltd`` and
+    ``space-exploration-ltd`` should cluster: each one must list at least
+    one of the other two in ``suggested_aliases`` via shared tokens of
+    length >= 4.
+    """
+    labels = ("lead-space", "lead-space-exploration-ltd", "space-exploration-ltd")
+    for label in labels:
+        for n in range(11):
+            create_memory("knowledge", f"{label} content {n}", project=label)
+
+    response = TestClient(app).get("/admin/projects/proposals")
+    by_id = {entry["project_id"]: entry for entry in response.json()["proposals"]}
+
+    for label in labels:
+        assert label in by_id
+        # Every label shares "space" (and some share "exploration"/"lead"),
+        # so at least one of the other two labels must be suggested.
+        others = set(labels) - {label}
+        assert others.intersection(by_id[label]["suggested_aliases"])
+
+
+def test_proposals_short_token_does_not_match(env):
+    """Tokens shorter than 4 characters must not trigger clustering.
+
+    Per Codex Wave 1.5 P2: an earlier version only asserted that apm and
+    drill have empty siblings, which is trivially true because they share
+    no tokens at all. The real risk is an accidental relaxation that lets
+    <4-char tokens cluster, so this setup makes that matter:
+      - 'apm' and 'apm-fpga' share only the 3-char 'apm'; they must NOT
+        cluster.
+      - 'foo-fpga' and 'bar-fpga' share the 4-char 'fpga'; they MUST
+        cluster.
+    """
+    for label in ("apm", "apm-fpga", "foo-fpga", "bar-fpga"):
+        for n in range(11):
+            create_memory("knowledge", f"{label} fact {n}", project=label)
+
+    response = TestClient(app).get("/admin/projects/proposals")
+    aliases_of = {
+        entry["project_id"]: entry["suggested_aliases"]
+        for entry in response.json()["proposals"]
+    }
+
+    # Negative case: a 3-char shared token is not enough.
+    assert "apm-fpga" not in aliases_of["apm"], (
+        "'apm' (3 chars) is below the 4-char minimum; 'apm' and 'apm-fpga' "
+        "must not cluster via the 'apm' token."
+    )
+
+    # Positive case: the 4-char 'fpga' token clusters, in both directions.
+    assert "bar-fpga" in aliases_of["foo-fpga"]
+    assert "foo-fpga" in aliases_of["bar-fpga"]
+    # 'apm-fpga' joins that cluster through 'fpga' as well.
+    assert "apm-fpga" in aliases_of["foo-fpga"]
+
+
+def test_proposals_clustering_is_case_insensitive(env):
+    """Labels captured with mixed casing still cluster on a shared token.
+
+    ``HydroTech-Mining`` and ``hydrotech-split-tank`` differ only in the
+    casing of ``hydrotech``; each must list the other in
+    ``suggested_aliases``. Codex Wave 1.5 P3.
+    """
+    mixed_case, lower_case = "HydroTech-Mining", "hydrotech-split-tank"
+    for label in (mixed_case, lower_case):
+        for n in range(11):
+            create_memory("knowledge", f"{label} fact {n}", project=label)
+
+    response = TestClient(app).get("/admin/projects/proposals")
+    by_id = {entry["project_id"]: entry for entry in response.json()["proposals"]}
+    assert lower_case in by_id[mixed_case]["suggested_aliases"]
+    assert mixed_case in by_id[lower_case]["suggested_aliases"]
+
+
+def test_proposals_registered_token_does_not_leak_into_sibling_set(env, monkeypatch):
+    """A registered id must never be suggested as an alias.
+
+    ``p04-gigabit`` is registered by the ``env`` fixture. The
+    unregistered ``gigabit-other`` shares the ``gigabit`` token with it,
+    so registered ids must be filtered out BEFORE clustering; otherwise
+    ``p04-gigabit`` would surface as a sibling.
+    """
+    # NOTE(review): ``monkeypatch`` is requested but never used in this body.
+    # Each entry is (project label, content prefix, number of memories).
+    seed_plan = (
+        ("p04-gigabit", "p04", 15),
+        ("gigabit-other", "gigabit-other", 11),
+    )
+    for project, prefix, count in seed_plan:
+        for n in range(count):
+            create_memory("knowledge", f"{prefix} fact {n}", project=project)
+
+    response = TestClient(app).get("/admin/projects/proposals")
+    by_id = {entry["project_id"]: entry for entry in response.json()["proposals"]}
+    assert "p04-gigabit" not in by_id
+    assert "gigabit-other" in by_id
+    # The registered name must not surface as a sibling either.
+    assert "p04-gigabit" not in by_id["gigabit-other"]["suggested_aliases"]
+
+
+def test_proposals_include_sample_memories_and_guessed_root(env):
+    """A proposal carries counts, a guessed ingest root and three samples."""
+    for n in range(11):
+        create_memory("knowledge", f"sample content {n}", project="apm")
+
+    payload = TestClient(app).get("/admin/projects/proposals").json()
+    apm = next(
+        entry for entry in payload["proposals"] if entry["project_id"] == "apm"
+    )
+
+    assert apm["active_count"] == 11
+    assert apm["candidate_count"] == 0
+    expected_root = {"source": "vault", "subpath": "incoming/projects/apm/"}
+    assert apm["guessed_ingest_root"] == expected_root
+
+    # Each sample must expose an id and a preview of the seeded content.
+    samples = apm["sample_memories"]
+    assert len(samples) == 3
+    for sample in samples:
+        assert sample["id"]
+        assert "sample content" in sample["content_preview"]
+
+
+def test_proposals_count_candidates_separately(env):
+    """Candidate-status memories are tallied apart from active ones."""
+    for n in range(11):
+        create_memory("knowledge", f"active {n}", project="apm")
+    for n in range(4):
+        create_memory("knowledge", f"candidate {n}", project="apm", status="candidate")
+
+    payload = TestClient(app).get("/admin/projects/proposals").json()
+    apm = next(
+        entry for entry in payload["proposals"] if entry["project_id"] == "apm"
+    )
+    # 11 default-status rows count as active; the 4 candidates do not.
+    assert apm["active_count"] == 11
+    assert apm["candidate_count"] == 4
+
+
+def test_proposals_min_active_param_validation(env):
+    """``min_active=0`` is rejected with HTTP 400."""
+    response = TestClient(app).get("/admin/projects/proposals?min_active=0")
+    assert response.status_code == 400
+
+
+def test_proposals_sorted_by_active_count_desc(env):
+    """Proposals are ordered from most to fewest active memories."""
+    for n in range(20):
+        create_memory("knowledge", f"big {n}", project="apm")
+    for n in range(11):
+        create_memory("knowledge", f"small {n}", project="openclaw")
+
+    payload = TestClient(app).get("/admin/projects/proposals").json()
+    ordered_ids = [entry["project_id"] for entry in payload["proposals"]]
+    # The 20-memory label must come before the 11-memory one.
+    first, second = ordered_ids[0], ordered_ids[1]
+    assert first == "apm"
+    assert second == "openclaw"