tests/test_emerging_project_proposals.py

"""Wave 1.5 — live emerging-project registration proposals.

The nightly `scripts/detect_emerging.py` job writes a cache to
`project_state.proposals.unregistered_projects` that goes stale between
runs. The `/admin/projects/proposals` endpoint exercised here is the
on-demand alternative that operators can hit before deciding which
unregistered project to register via `/admin/projects/register-emerging`.
"""

import json

import pytest
from fastapi.testclient import TestClient

import atocore.config as config
from atocore.main import app
from atocore.memory.service import create_memory
from atocore.models.database import init_db


@pytest.fixture
def env(tmp_data_dir, tmp_path, monkeypatch):
    """Provide a fresh DB plus a registry holding one registered project.

    The registry file contains ``p04-gigabit`` (aliases ``p04`` and
    ``gigabit``) so tests can prove the proposals endpoint excludes
    registered names.

    Yields:
        The value of the ``tmp_data_dir`` fixture, unchanged.
    """
    registry = {
        "projects": [
            {
                "id": "p04-gigabit",
                "aliases": ["p04", "gigabit"],
                "description": "test",
                "ingest_roots": [
                    {"source": "vault", "subpath": "incoming/projects/p04-gigabit"}
                ],
            }
        ]
    }
    registry_path = tmp_path / "registry.json"
    registry_path.write_text(json.dumps(registry), encoding="utf-8")

    monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_path))
    # Rebuild settings only after the env var is set (presumably Settings
    # reads it at construction time -- confirm in atocore.config). Going
    # through monkeypatch puts the previous settings object back at teardown
    # instead of leaking this test's registry path into later tests.
    monkeypatch.setattr(config, "settings", config.Settings())
    init_db()
    yield tmp_data_dir


def test_proposals_excludes_registered_project_and_its_aliases(env):
    """Registered names never surface as registration proposals.

    Seeds memories under the registered canonical id ``p04-gigabit`` and
    under its alias ``p04``, plus the unregistered label ``apm``, then checks
    that only ``apm`` is proposed with ``min_active=10``.
    """
    # Registered canonical id (registry aliases: p04, gigabit).
    for n in range(15):
        create_memory("knowledge", f"p04 fact {n}", project="p04-gigabit")
    # Registered alias of the same project.
    for n in range(15):
        create_memory("knowledge", f"alias fact {n}", project="p04")
    # Unregistered label, above the threshold used in the request below.
    for n in range(12):
        create_memory("knowledge", f"apm fact {n}", project="apm")

    response = TestClient(app).get("/admin/projects/proposals?min_active=10")
    proposed = {entry["project_id"] for entry in response.json()["proposals"]}

    assert "apm" in proposed
    # Neither the canonical id nor any alias may be offered for registration.
    assert proposed.isdisjoint({"p04-gigabit", "p04", "gigabit"})


def test_proposals_threshold_filters_low_count_labels(env):
    """Labels with fewer active memories than ``min_active`` are not proposed.

    ``discrawl`` gets 1 memory, ``drill`` gets 3 and ``apm`` gets 11; with
    ``min_active=10`` only ``apm`` may appear.
    """
    create_memory("knowledge", "single one-off", project="discrawl")
    for n in range(3):
        create_memory("knowledge", f"low-volume {n}", project="drill")
    for n in range(11):
        create_memory("knowledge", f"high-volume {n}", project="apm")

    payload = TestClient(app).get("/admin/projects/proposals?min_active=10").json()
    proposed = {entry["project_id"] for entry in payload["proposals"]}

    assert "apm" in proposed
    assert proposed.isdisjoint({"drill", "discrawl"})


def test_proposals_suggest_sibling_aliases_via_shared_tokens(env):
    """Labels sharing a non-trivial token (length >= 4) suggest each other.

    ``lead-space``, ``lead-space-exploration-ltd`` and
    ``space-exploration-ltd`` should cluster: every one of them must be
    proposed and must list at least one of the other two in its
    ``suggested_aliases``.
    """
    cluster = ("lead-space", "lead-space-exploration-ltd", "space-exploration-ltd")
    for label in cluster:
        for n in range(11):
            create_memory("knowledge", f"{label} content {n}", project=label)

    payload = TestClient(app).get("/admin/projects/proposals").json()
    by_id = {entry["project_id"]: entry for entry in payload["proposals"]}

    for label in cluster:
        assert label in by_id
        suggested = set(by_id[label]["suggested_aliases"])
        # Every label shares "space" (and some share "exploration"/"lead"),
        # so at least one of the *other* cluster members must be suggested.
        others = set(cluster) - {label}
        assert not suggested.isdisjoint(others)


def test_proposals_short_token_does_not_match(env):
    """Unrelated labels must not be offered as aliases of each other.

    Two-or-three-letter tokens such as 'apm' are considered too noisy to
    suggest aliases on. This is the negative case: 'apm' and 'drill' are both
    above the default threshold, and each must come back with an empty
    ``suggested_aliases`` list.
    """
    for n in range(11):
        create_memory("knowledge", f"apm fact {n}", project="apm")
    for n in range(11):
        create_memory("knowledge", f"drill fact {n}", project="drill")

    payload = TestClient(app).get("/admin/projects/proposals").json()
    by_id = {entry["project_id"]: entry for entry in payload["proposals"]}

    for label in ("apm", "drill"):
        assert by_id[label]["suggested_aliases"] == []


def test_proposals_include_sample_memories_and_guessed_root(env):
    """A proposal carries counts, a guessed ingest root and three samples.

    With 11 active ``apm`` memories the proposal must report
    ``active_count == 11``, ``candidate_count == 0``, a vault ingest root of
    ``incoming/projects/apm/`` and exactly three sample memories whose
    previews come from the seeded content.
    """
    for n in range(11):
        create_memory("knowledge", f"sample content {n}", project="apm")

    payload = TestClient(app).get("/admin/projects/proposals").json()
    apm = next(
        entry for entry in payload["proposals"] if entry["project_id"] == "apm"
    )

    assert apm["active_count"] == 11
    assert apm["candidate_count"] == 0

    expected_root = {"source": "vault", "subpath": "incoming/projects/apm/"}
    assert apm["guessed_ingest_root"] == expected_root

    samples = apm["sample_memories"]
    assert len(samples) == 3
    for sample in samples:
        assert sample["id"]
        assert "sample content" in sample["content_preview"]


def test_proposals_count_candidates_separately(env):
    """Candidate memories are tallied apart from active ones.

    Seeds 11 default-status and 4 ``status="candidate"`` memories under
    ``apm`` and expects ``active_count == 11`` and ``candidate_count == 4``.
    """
    for n in range(11):
        create_memory("knowledge", f"active {n}", project="apm")
    for n in range(4):
        create_memory("knowledge", f"candidate {n}", project="apm", status="candidate")

    payload = TestClient(app).get("/admin/projects/proposals").json()
    apm = next(
        entry for entry in payload["proposals"] if entry["project_id"] == "apm"
    )

    assert apm["active_count"] == 11
    assert apm["candidate_count"] == 4


def test_proposals_min_active_param_validation(env):
    """``min_active=0`` is rejected with HTTP 400."""
    response = TestClient(app).get("/admin/projects/proposals?min_active=0")
    assert response.status_code == 400


def test_proposals_sorted_by_active_count_desc(env):
    """Proposals are ordered from most to fewest active memories.

    ``apm`` (20 memories) must come first and ``openclaw`` (11) second.
    """
    for n in range(20):
        create_memory("knowledge", f"big {n}", project="apm")
    for n in range(11):
        create_memory("knowledge", f"small {n}", project="openclaw")

    payload = TestClient(app).get("/admin/projects/proposals").json()
    ordered_ids = [entry["project_id"] for entry in payload["proposals"]]

    assert ordered_ids[:2] == ["apm", "openclaw"]
feat(projects): live emerging-project registration proposals

Adds GET /admin/projects/proposals?min_active=N — an on-demand companion to the nightly scripts/detect_emerging.py cache. Reads SQL + the registry directly so the result is always current.

Each proposal is operator-ready:
- project_id (the literal label as captured)
- active_count / candidate_count from current SQL
- sample_memories: 3 most recent active memories with content preview
- suggested_aliases: sibling labels sharing a >=4-char token (e.g. lead-space + lead-space-exploration-ltd + space-exploration-ltd cluster naturally; apm and drill stay independent)
- guessed_ingest_root: vault:incoming/projects/<id>/

Workflow: operator hits /admin/projects/proposals to see the live "what should I register?" view, picks aliases from the suggestions, then POSTs to the existing /admin/projects/register-emerging.

Closes Codex's Wave 1.5 ask: "promote-to-registered-project proposal with suggested aliases, sample memories, and guessed ingest root; require one click." For apm at 165 active memories on prod, this is overdue.

8 regression tests covering: registered-name (canonical + alias) exclusion, threshold filtering, sibling clustering, short-token negative, sample/root shape, candidate counting, param validation, sort order.

Test count: 586 -> 594.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

2026-04-28 22:10:37 -04:00
			`"""Wave 1.5 — live emerging-project registration proposals.`

			The nightly `scripts/detect_emerging.py` writes a stale cache to
			`project_state.proposals.unregistered_projects`. This endpoint provides
			`the on-demand alternative that operators can hit before deciding which`
			unregistered project to register via `/admin/projects/register-emerging`.
			`"""`

			`import json`

			`import pytest`
			`from fastapi.testclient import TestClient`

			`import atocore.config as config`
			`from atocore.main import app`
			`from atocore.memory.service import create_memory`
			`from atocore.models.database import init_db`


			`@pytest.fixture`
			`def env(tmp_data_dir, tmp_path, monkeypatch):`
			`"""Fresh DB + a registry holding a single registered project so we`
			`can prove the proposals endpoint excludes registered names."""`
			`registry_path = tmp_path / "registry.json"`
			`registry_path.write_text(`
			`json.dumps(`
			`{`
			`"projects": [`
			`{`
			`"id": "p04-gigabit",`
			`"aliases": ["p04", "gigabit"],`
			`"description": "test",`
			`"ingest_roots": [`
			`{"source": "vault", "subpath": "incoming/projects/p04-gigabit"}`
			`],`
			`}`
			`]`
			`}`
			`),`
			`encoding="utf-8",`
			`)`
			`monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_path))`
			`config.settings = config.Settings()`
			`init_db()`
			`yield tmp_data_dir`


			`def test_proposals_excludes_registered_project_and_its_aliases(env):`
			`"""Memories tagged on a registered canonical id or any of its`
			`aliases must not appear as a registration proposal."""`
			`# Registered: p04-gigabit (aliases p04, gigabit)`
			`for i in range(15):`
			`create_memory("knowledge", f"p04 fact {i}", project="p04-gigabit")`
			`for i in range(15):`
			`create_memory("knowledge", f"alias fact {i}", project="p04") # alias`

			`# Unregistered, above threshold`
			`for i in range(12):`
			`create_memory("knowledge", f"apm fact {i}", project="apm")`

			`client = TestClient(app)`
			`body = client.get("/admin/projects/proposals?min_active=10").json()`
			`ids = [p["project_id"] for p in body["proposals"]]`
			`assert "apm" in ids`
			`assert "p04-gigabit" not in ids`
			`assert "p04" not in ids`
			`assert "gigabit" not in ids`


			`def test_proposals_threshold_filters_low_count_labels(env):`
			`create_memory("knowledge", "single one-off", project="discrawl")`
			`for i in range(3):`
			`create_memory("knowledge", f"low-volume {i}", project="drill")`
			`for i in range(11):`
			`create_memory("knowledge", f"high-volume {i}", project="apm")`

			`client = TestClient(app)`
			`proposals = client.get("/admin/projects/proposals?min_active=10").json()["proposals"]`
			`ids = [p["project_id"] for p in proposals]`
			`assert "apm" in ids`
			`assert "drill" not in ids`
			`assert "discrawl" not in ids`


			`def test_proposals_suggest_sibling_aliases_via_shared_tokens(env):`
			`"""Lead-space + lead-space-exploration-ltd + space-exploration-ltd`
			`should cluster: each proposes the others as suggested_aliases via`
			`shared non-trivial tokens (length >= 4)."""`
			`for label in ("lead-space", "lead-space-exploration-ltd", "space-exploration-ltd"):`
			`for i in range(11):`
			`create_memory("knowledge", f"{label} content {i}", project=label)`

			`client = TestClient(app)`
			`proposals = {p["project_id"]: p for p in client.get("/admin/projects/proposals").json()["proposals"]}`

			`# All three appear and each suggests at least one of the others`
			`for label in ("lead-space", "lead-space-exploration-ltd", "space-exploration-ltd"):`
			`assert label in proposals`
			`siblings = set(proposals[label]["suggested_aliases"])`
			`# Every label shares "space" (and others share "exploration"/"lead")`
			`# so at least one sibling must be present.`
			`assert siblings & {"lead-space", "lead-space-exploration-ltd", "space-exploration-ltd"} - {label}`


			`def test_proposals_short_token_does_not_match(env):`
			`"""Two-or-three-letter tokens are too noisy to suggest aliases on.`
			`'apm' (3 chars) and 'apm-fpga' (3 + 4) share 'apm' (3 chars) — the`
			`'apm' token alone is too short, but 'fpga' (4) is long enough so`
			`the match comes from the longer token. We test the negative: 'apm'`
			`and 'drill' must NOT be siblings."""`
			`for i in range(11):`
			`create_memory("knowledge", f"apm fact {i}", project="apm")`
			`for i in range(11):`
			`create_memory("knowledge", f"drill fact {i}", project="drill")`

			`client = TestClient(app)`
			`proposals = {p["project_id"]: p for p in client.get("/admin/projects/proposals").json()["proposals"]}`
			`assert proposals["apm"]["suggested_aliases"] == []`
			`assert proposals["drill"]["suggested_aliases"] == []`


			`def test_proposals_include_sample_memories_and_guessed_root(env):`
			`for i in range(11):`
			`create_memory("knowledge", f"sample content {i}", project="apm")`

			`client = TestClient(app)`
			`body = client.get("/admin/projects/proposals").json()`
			`apm = next(p for p in body["proposals"] if p["project_id"] == "apm")`
			`assert apm["active_count"] == 11`
			`assert apm["candidate_count"] == 0`
			`assert apm["guessed_ingest_root"] == {`
			`"source": "vault",`
			`"subpath": "incoming/projects/apm/",`
			`}`
			`assert len(apm["sample_memories"]) == 3`
			`for s in apm["sample_memories"]:`
			`assert s["id"]`
			`assert "sample content" in s["content_preview"]`


			`def test_proposals_count_candidates_separately(env):`
			`for i in range(11):`
			`create_memory("knowledge", f"active {i}", project="apm")`
			`for i in range(4):`
			`create_memory("knowledge", f"candidate {i}", project="apm", status="candidate")`

			`client = TestClient(app)`
			`apm = next(`
			`p for p in client.get("/admin/projects/proposals").json()["proposals"]`
			`if p["project_id"] == "apm"`
			`)`
			`assert apm["active_count"] == 11`
			`assert apm["candidate_count"] == 4`


			`def test_proposals_min_active_param_validation(env):`
			`client = TestClient(app)`
			`r = client.get("/admin/projects/proposals?min_active=0")`
			`assert r.status_code == 400`


			`def test_proposals_sorted_by_active_count_desc(env):`
			`for i in range(20):`
			`create_memory("knowledge", f"big {i}", project="apm")`
			`for i in range(11):`
			`create_memory("knowledge", f"small {i}", project="openclaw")`

			`client = TestClient(app)`
			`proposals = client.get("/admin/projects/proposals").json()["proposals"]`
			`ids = [p["project_id"] for p in proposals]`
			`assert ids[0] == "apm"`
			`assert ids[1] == "openclaw"`