"""Wave 1.5 — live emerging-project registration proposals.

The nightly ``scripts/detect_emerging.py`` writes a stale cache to
``project_state.proposals.unregistered_projects``.  The endpoint tested
here is the on-demand alternative that operators can hit before deciding
which unregistered project to register via
``/admin/projects/register-emerging``.
"""

import json

import pytest
from fastapi.testclient import TestClient

import atocore.config as config
from atocore.main import app
from atocore.memory.service import create_memory
from atocore.models.database import init_db

# Endpoint under test; query strings are appended by the individual tests.
_PROPOSALS_URL = "/admin/projects/proposals"


def _seed(project, template, count, **extra):
    """Create ``count`` "knowledge" memories tagged with ``project``.

    ``template`` is a ``str.format`` pattern with one positional slot that
    receives the running index (0 .. count-1).  Any ``extra`` keyword
    arguments (e.g. ``status``) are forwarded to ``create_memory`` as-is.
    """
    for idx in range(count):
        create_memory("knowledge", template.format(idx), project=project, **extra)


def _fetch_proposals(query=""):
    """GET the proposals endpoint and return the ``proposals`` list.

    A fresh ``TestClient`` is built per call and deliberately NOT used as a
    context manager, mirroring how every test here talks to the app.
    """
    response = TestClient(app).get(_PROPOSALS_URL + query)
    return response.json()["proposals"]


@pytest.fixture
def env(tmp_data_dir, tmp_path, monkeypatch):
    """Provide a fresh DB plus a registry with exactly one registered project.

    The single registered project (``p04-gigabit`` with aliases ``p04`` and
    ``gigabit``) lets the tests prove that the proposals endpoint leaves
    registered names out.  Yields ``tmp_data_dir``.
    """
    registered_project = {
        "id": "p04-gigabit",
        "aliases": ["p04", "gigabit"],
        "description": "test",
        "ingest_roots": [
            {"source": "vault", "subpath": "incoming/projects/p04-gigabit"},
        ],
    }
    registry_file = tmp_path / "registry.json"
    registry_file.write_text(
        json.dumps({"projects": [registered_project]}),
        encoding="utf-8",
    )
    monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_file))

    # Rebuild the settings object after the env var is set — presumably so
    # the new registry path is picked up (confirm against atocore.config).
    config.settings = config.Settings()
    init_db()
    yield tmp_data_dir


def test_proposals_excludes_registered_project_and_its_aliases(env):
    """Memories tagged with a registered canonical id, or with any of its
    aliases, must never surface as a registration proposal."""
    # Registered project: p04-gigabit (aliases p04, gigabit).
    _seed("p04-gigabit", "p04 fact {}", 15)
    _seed("p04", "alias fact {}", 15)  # tagged via the alias
    # Unregistered label, above the threshold.
    _seed("apm", "apm fact {}", 12)

    proposed = [p["project_id"] for p in _fetch_proposals("?min_active=10")]

    assert "apm" in proposed
    for registered_name in ("p04-gigabit", "p04", "gigabit"):
        assert registered_name not in proposed


def test_proposals_threshold_filters_low_count_labels(env):
    """Labels with fewer active memories than ``min_active`` are dropped."""
    create_memory("knowledge", "single one-off", project="discrawl")
    _seed("drill", "low-volume {}", 3)
    _seed("apm", "high-volume {}", 11)

    proposed = [p["project_id"] for p in _fetch_proposals("?min_active=10")]

    assert "apm" in proposed
    assert "drill" not in proposed
    assert "discrawl" not in proposed


def test_proposals_suggest_sibling_aliases_via_shared_tokens(env):
    """lead-space, lead-space-exploration-ltd and space-exploration-ltd
    should cluster: each one lists at least one of the others in its
    ``suggested_aliases`` thanks to shared non-trivial tokens
    (length >= 4)."""
    labels = ("lead-space", "lead-space-exploration-ltd", "space-exploration-ltd")
    for label in labels:
        _seed(label, label + " content {}", 11)

    by_id = {p["project_id"]: p for p in _fetch_proposals()}

    # All three must be proposed, and each must point at a sibling.  Every
    # label shares "space" (some also share "exploration"/"lead").
    for label in labels:
        assert label in by_id
        other_labels = set(labels) - {label}
        suggested = set(by_id[label]["suggested_aliases"])
        assert suggested & other_labels


def test_proposals_short_token_does_not_match(env):
    """Two-or-three-letter tokens are too noisy to suggest aliases on.

    Only the negative case is covered: 'apm' and 'drill' must NOT be
    reported as siblings, so both come back with empty
    ``suggested_aliases``.  Note that these two labels share no token at
    all, so a pair that overlaps only on a short token (such as 'apm' and
    'apm-fpga') is not exercised here.
    """
    _seed("apm", "apm fact {}", 11)
    _seed("drill", "drill fact {}", 11)

    by_id = {p["project_id"]: p for p in _fetch_proposals()}

    assert by_id["apm"]["suggested_aliases"] == []
    assert by_id["drill"]["suggested_aliases"] == []


def test_proposals_include_sample_memories_and_guessed_root(env):
    """A proposal carries counts, a guessed ingest root and three samples."""
    _seed("apm", "sample content {}", 11)

    apm = next(p for p in _fetch_proposals() if p["project_id"] == "apm")

    assert apm["active_count"] == 11
    assert apm["candidate_count"] == 0
    expected_root = {"source": "vault", "subpath": "incoming/projects/apm/"}
    assert apm["guessed_ingest_root"] == expected_root

    samples = apm["sample_memories"]
    assert len(samples) == 3
    for sample in samples:
        assert sample["id"]
        assert "sample content" in sample["content_preview"]


def test_proposals_count_candidates_separately(env):
    """Candidate-status memories are tallied apart from active ones."""
    _seed("apm", "active {}", 11)
    _seed("apm", "candidate {}", 4, status="candidate")

    apm = next(p for p in _fetch_proposals() if p["project_id"] == "apm")

    assert apm["active_count"] == 11
    assert apm["candidate_count"] == 4


def test_proposals_min_active_param_validation(env):
    """``min_active=0`` is rejected with HTTP 400."""
    response = TestClient(app).get(_PROPOSALS_URL + "?min_active=0")
    assert response.status_code == 400


def test_proposals_sorted_by_active_count_desc(env):
    """Proposals are ordered by active memory count, largest first."""
    _seed("apm", "big {}", 20)
    _seed("openclaw", "small {}", 11)

    ordered_ids = [p["project_id"] for p in _fetch_proposals()]

    assert ordered_ids[0] == "apm"
    assert ordered_ids[1] == "openclaw"