Adds GET /admin/projects/proposals?min_active=N — an on-demand companion
to the nightly scripts/detect_emerging.py cache. Reads SQL + the registry
directly so the result is always current.
Each proposal is operator-ready:
- project_id (the literal label as captured)
- active_count / candidate_count from current SQL
- sample_memories: 3 most recent active memories with content preview
- suggested_aliases: sibling labels sharing a >=4-char token
(e.g. lead-space + lead-space-exploration-ltd + space-exploration-ltd
cluster naturally; apm and drill stay independent)
- guessed_ingest_root: vault:incoming/projects/<id>/
Workflow: operator hits /admin/projects/proposals to see the live "what
should I register?" view, picks aliases from the suggestions, then POSTs
to the existing /admin/projects/register-emerging.
Closes Codex's Wave 1.5 ask: "promote-to-registered-project proposal
with suggested aliases, sample memories, and guessed ingest root;
require one click." For apm at 165 active memories on prod, this is
overdue.
8 regression tests covering: registered-name (canonical + alias)
exclusion, threshold filtering, sibling clustering, short-token negative,
sample/root shape, candidate counting, param validation, sort order.
Test count: 586 -> 594.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
173 lines · 6.5 KiB · Python
"""Wave 1.5 — live emerging-project registration proposals.
|
|
|
|
The nightly `scripts/detect_emerging.py` writes a stale cache to
|
|
`project_state.proposals.unregistered_projects`. This endpoint provides
|
|
the on-demand alternative that operators can hit before deciding which
|
|
unregistered project to register via `/admin/projects/register-emerging`.
|
|
"""
|
|
|
|
import json
|
|
|
|
import pytest
|
|
from fastapi.testclient import TestClient
|
|
|
|
import atocore.config as config
|
|
from atocore.main import app
|
|
from atocore.memory.service import create_memory
|
|
from atocore.models.database import init_db
|
|
|
|
|
|
@pytest.fixture
def env(tmp_data_dir, tmp_path, monkeypatch):
    """Fresh DB plus a registry holding exactly one registered project.

    Having a registered project (with aliases) on disk lets the tests
    below prove that the proposals endpoint excludes registered names.
    """
    # Build the registry payload as a dict first, then serialize once.
    registry = {
        "projects": [
            {
                "id": "p04-gigabit",
                "aliases": ["p04", "gigabit"],
                "description": "test",
                "ingest_roots": [
                    {"source": "vault", "subpath": "incoming/projects/p04-gigabit"}
                ],
            }
        ]
    }
    registry_file = tmp_path / "registry.json"
    registry_file.write_text(json.dumps(registry), encoding="utf-8")
    # Point the app at the temp registry and rebuild settings + schema.
    monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_file))
    config.settings = config.Settings()
    init_db()
    yield tmp_data_dir
def test_proposals_excludes_registered_project_and_its_aliases(env):
    """Memories tagged on a registered canonical id or any of its
    aliases must not appear as a registration proposal."""
    # Registered: p04-gigabit (aliases p04, gigabit)
    for n in range(15):
        create_memory("knowledge", f"p04 fact {n}", project="p04-gigabit")
    for n in range(15):
        create_memory("knowledge", f"alias fact {n}", project="p04")  # alias

    # Unregistered label, comfortably above the threshold
    for n in range(12):
        create_memory("knowledge", f"apm fact {n}", project="apm")

    body = TestClient(app).get("/admin/projects/proposals?min_active=10").json()
    proposal_ids = {p["project_id"] for p in body["proposals"]}
    assert "apm" in proposal_ids
    # Neither the canonical id nor any alias may surface.
    assert not proposal_ids & {"p04-gigabit", "p04", "gigabit"}
def test_proposals_threshold_filters_low_count_labels(env):
    """Only labels with at least min_active memories are proposed."""
    create_memory("knowledge", "single one-off", project="discrawl")
    for n in range(3):
        create_memory("knowledge", f"low-volume {n}", project="drill")
    for n in range(11):
        create_memory("knowledge", f"high-volume {n}", project="apm")

    body = TestClient(app).get("/admin/projects/proposals?min_active=10").json()
    seen = {p["project_id"] for p in body["proposals"]}
    assert "apm" in seen
    assert "drill" not in seen
    assert "discrawl" not in seen
def test_proposals_suggest_sibling_aliases_via_shared_tokens(env):
    """Lead-space + lead-space-exploration-ltd + space-exploration-ltd
    should cluster: each proposes the others as suggested_aliases via
    shared non-trivial tokens (length >= 4)."""
    labels = ("lead-space", "lead-space-exploration-ltd", "space-exploration-ltd")
    for label in labels:
        for n in range(11):
            create_memory("knowledge", f"{label} content {n}", project=label)

    body = TestClient(app).get("/admin/projects/proposals").json()
    by_id = {p["project_id"]: p for p in body["proposals"]}

    # All three appear, and each suggests at least one of the others —
    # every label shares "space", and pairs share "exploration"/"lead".
    for label in labels:
        assert label in by_id
        others = set(labels) - {label}
        assert others & set(by_id[label]["suggested_aliases"])
def test_proposals_short_token_does_not_match(env):
    """Two-or-three-letter tokens are too noisy to suggest aliases on.
    'apm' (3 chars) and 'apm-fpga' (3 + 4) share 'apm' (3 chars) — the
    'apm' token alone is too short, but 'fpga' (4) is long enough so
    the match comes from the longer token. We test the negative: 'apm'
    and 'drill' must NOT be siblings."""
    for label in ("apm", "drill"):
        for n in range(11):
            create_memory("knowledge", f"{label} fact {n}", project=label)

    body = TestClient(app).get("/admin/projects/proposals").json()
    by_id = {p["project_id"]: p for p in body["proposals"]}
    assert by_id["apm"]["suggested_aliases"] == []
    assert by_id["drill"]["suggested_aliases"] == []
def test_proposals_include_sample_memories_and_guessed_root(env):
    """Proposal shape: counts, guessed ingest root, and 3 sample previews."""
    for n in range(11):
        create_memory("knowledge", f"sample content {n}", project="apm")

    proposals = TestClient(app).get("/admin/projects/proposals").json()["proposals"]
    apm = next(p for p in proposals if p["project_id"] == "apm")

    assert apm["active_count"] == 11
    assert apm["candidate_count"] == 0
    expected_root = {"source": "vault", "subpath": "incoming/projects/apm/"}
    assert apm["guessed_ingest_root"] == expected_root

    samples = apm["sample_memories"]
    assert len(samples) == 3
    for sample in samples:
        assert sample["id"]
        assert "sample content" in sample["content_preview"]
def test_proposals_count_candidates_separately(env):
    """active_count and candidate_count are tallied independently."""
    for n in range(11):
        create_memory("knowledge", f"active {n}", project="apm")
    for n in range(4):
        create_memory("knowledge", f"candidate {n}", project="apm", status="candidate")

    body = TestClient(app).get("/admin/projects/proposals").json()
    apm = next(p for p in body["proposals"] if p["project_id"] == "apm")
    assert (apm["active_count"], apm["candidate_count"]) == (11, 4)
def test_proposals_min_active_param_validation(env):
    """A min_active below 1 is rejected with HTTP 400."""
    response = TestClient(app).get("/admin/projects/proposals?min_active=0")
    assert response.status_code == 400
def test_proposals_sorted_by_active_count_desc(env):
    """Proposals come back ordered by active_count, largest first."""
    for n in range(20):
        create_memory("knowledge", f"big {n}", project="apm")
    for n in range(11):
        create_memory("knowledge", f"small {n}", project="openclaw")

    body = TestClient(app).get("/admin/projects/proposals").json()
    ordering = [p["project_id"] for p in body["proposals"]]
    assert ordering[:2] == ["apm", "openclaw"]