Files
ATOCore/tests/test_emerging_project_proposals.py
Anto01 b69d2c7088 feat(projects): Wave 1.5 — live emerging-project registration proposals
GET /admin/projects/proposals?min_active=N — on-demand companion to
the nightly scripts/detect_emerging.py cache. Reads SQL + the registry
directly so the result is current.

Each proposal:
  - project_id (literal label as captured)
  - active_count / candidate_count from current SQL
  - sample_memories: 3 most recent active rows with content preview
  - suggested_aliases: sibling labels sharing a >=4-char token,
    case-insensitive (lead-space + lead-space-exploration-ltd +
    space-exploration-ltd cluster; apm and apm-fpga do NOT cluster
    via the 3-char 'apm')
  - guessed_ingest_root: vault:incoming/projects/<id>/

Workflow: hit /admin/projects/proposals to see "what should I register?",
then POST to existing /admin/projects/register-emerging.

For prod: apm has 165 active memories, openclaw has 17,
hydrotech-mining variants combine to 13. apm is overdue.

Closes Codex's prior P2 from the state-of-service review. Reviewed by
Codex on tip e8ac8bb (verdict GO); two follow-on improvements (stronger
negative-clustering test + case-insensitive tokens) folded into f70fa6b.

10 regression tests covering: registered canonical/alias exclusion,
threshold filtering, sibling clustering, short-token negative,
case-insensitive clustering, registered-token-leak guard, sample shape,
candidate counting, param validation, sort order.

Test count: 586 -> 596.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 22:16:18 -04:00

218 lines
8.6 KiB
Python

"""Wave 1.5 — live emerging-project registration proposals.
The nightly `scripts/detect_emerging.py` writes a stale cache to
`project_state.proposals.unregistered_projects`. This endpoint provides
the on-demand alternative that operators can hit before deciding which
unregistered project to register via `/admin/projects/register-emerging`.
"""
import json
import pytest
from fastapi.testclient import TestClient
import atocore.config as config
from atocore.main import app
from atocore.memory.service import create_memory
from atocore.models.database import init_db
@pytest.fixture
def env(tmp_data_dir, tmp_path, monkeypatch):
    """Fresh DB + a registry holding a single registered project so we
    can prove the proposals endpoint excludes registered names.

    Fix: the original replaced the module-global ``config.settings`` and
    never put it back, so the temp registry path leaked into whatever
    test ran after the last user of this fixture. We now restore the
    previous Settings object on teardown (monkeypatch already reverts
    the env var for us).
    """
    registry_path = tmp_path / "registry.json"
    registry_path.write_text(
        json.dumps(
            {
                "projects": [
                    {
                        "id": "p04-gigabit",
                        "aliases": ["p04", "gigabit"],
                        "description": "test",
                        "ingest_roots": [
                            {"source": "vault", "subpath": "incoming/projects/p04-gigabit"}
                        ],
                    }
                ]
            }
        ),
        encoding="utf-8",
    )
    monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_path))
    previous_settings = config.settings
    # Re-read the environment so the new registry path takes effect.
    config.settings = config.Settings()
    init_db()
    try:
        yield tmp_data_dir
    finally:
        # monkeypatch undoes the env var; the global must be restored by hand.
        config.settings = previous_settings
def test_proposals_excludes_registered_project_and_its_aliases(env):
    """Memories tagged on a registered canonical id or any of its
    aliases must not appear as a registration proposal."""
    # Registered via the env fixture: p04-gigabit (aliases p04, gigabit).
    for i in range(15):
        create_memory("knowledge", f"p04 fact {i}", project="p04-gigabit")
    for i in range(15):
        create_memory("knowledge", f"alias fact {i}", project="p04")  # alias
    # An unregistered label above the threshold should still surface.
    for i in range(12):
        create_memory("knowledge", f"apm fact {i}", project="apm")
    payload = TestClient(app).get("/admin/projects/proposals?min_active=10").json()
    proposed = [p["project_id"] for p in payload["proposals"]]
    assert "apm" in proposed
    for registered_name in ("p04-gigabit", "p04", "gigabit"):
        assert registered_name not in proposed
def test_proposals_threshold_filters_low_count_labels(env):
    """Labels below min_active active memories are not proposed."""
    create_memory("knowledge", "single one-off", project="discrawl")
    for i in range(3):
        create_memory("knowledge", f"low-volume {i}", project="drill")
    for i in range(11):
        create_memory("knowledge", f"high-volume {i}", project="apm")
    response = TestClient(app).get("/admin/projects/proposals?min_active=10")
    proposed = [p["project_id"] for p in response.json()["proposals"]]
    # Only the label at/above the threshold of 10 survives.
    assert "apm" in proposed
    assert "drill" not in proposed
    assert "discrawl" not in proposed
def test_proposals_suggest_sibling_aliases_via_shared_tokens(env):
    """Lead-space + lead-space-exploration-ltd + space-exploration-ltd
    should cluster: each proposes the others as suggested_aliases via
    shared non-trivial tokens (length >= 4)."""
    cluster = ("lead-space", "lead-space-exploration-ltd", "space-exploration-ltd")
    for label in cluster:
        for i in range(11):
            create_memory("knowledge", f"{label} content {i}", project=label)
    by_id = {
        p["project_id"]: p
        for p in TestClient(app).get("/admin/projects/proposals").json()["proposals"]
    }
    for label in cluster:
        # Every label shares "space" (and pairs also share "exploration"
        # or "lead"), so each must appear and suggest at least one sibling.
        assert label in by_id
        suggested = set(by_id[label]["suggested_aliases"])
        assert suggested & set(cluster) - {label}
def test_proposals_short_token_does_not_match(env):
    """Per Codex Wave 1.5 P2: an earlier version only asserted that apm
    and drill had empty sibling sets — trivially true, since they share
    no tokens at all. The real risk is an accidental relaxation letting
    <4-char tokens trigger clustering, so construct a setup where that
    would actually matter:

    - 'apm' and 'apm-fpga': only the 3-char 'apm' is shared. They must
      NOT cluster, because 'apm' is too short.
    - 'foo-fpga' and 'bar-fpga': the 4-char 'fpga' is shared. They
      MUST cluster.
    """
    for label in ("apm", "apm-fpga", "foo-fpga", "bar-fpga"):
        for i in range(11):
            create_memory("knowledge", f"{label} fact {i}", project=label)
    by_id = {
        p["project_id"]: p
        for p in TestClient(app).get("/admin/projects/proposals").json()["proposals"]
    }
    # Negative: a 3-char shared token must not produce a sibling.
    assert "apm-fpga" not in by_id["apm"]["suggested_aliases"], (
        "'apm' (3 chars) is below the 4-char minimum; 'apm' and 'apm-fpga' "
        "must not cluster via the 'apm' token."
    )
    # Positive: the 4-char 'fpga' token must cluster, in both directions.
    assert "bar-fpga" in by_id["foo-fpga"]["suggested_aliases"]
    assert "foo-fpga" in by_id["bar-fpga"]["suggested_aliases"]
    # 'apm-fpga' also joins the fpga cluster.
    assert "apm-fpga" in by_id["foo-fpga"]["suggested_aliases"]
def test_proposals_clustering_is_case_insensitive(env):
    """Token comparison must be case-insensitive so labels captured
    with mixed casing still cluster. Codex Wave 1.5 P3."""
    mixed_case, lower_case = "HydroTech-Mining", "hydrotech-split-tank"
    for label in (mixed_case, lower_case):
        for i in range(11):
            create_memory("knowledge", f"{label} fact {i}", project=label)
    by_id = {
        p["project_id"]: p
        for p in TestClient(app).get("/admin/projects/proposals").json()["proposals"]
    }
    # The shared "hydrotech" token must link the two despite the casing.
    assert lower_case in by_id[mixed_case]["suggested_aliases"]
    assert mixed_case in by_id[lower_case]["suggested_aliases"]
def test_proposals_registered_token_does_not_leak_into_sibling_set(env):
    """Registered project ids must be filtered BEFORE clustering so a
    registered token doesn't get suggested as an alias for an
    unregistered sibling. p04-gigabit is registered in env; an
    unregistered 'gigabit-other' must not list 'p04-gigabit' as alias.

    Fix: dropped the `monkeypatch` fixture parameter — it was requested
    but never used in the test body.
    """
    for i in range(15):
        create_memory("knowledge", f"p04 fact {i}", project="p04-gigabit")
    for i in range(11):
        create_memory("knowledge", f"gigabit-other fact {i}", project="gigabit-other")
    client = TestClient(app)
    proposals = {
        p["project_id"]: p
        for p in client.get("/admin/projects/proposals").json()["proposals"]
    }
    assert "p04-gigabit" not in proposals
    assert "gigabit-other" in proposals
    # And the registered name must not surface as a sibling
    assert "p04-gigabit" not in proposals["gigabit-other"]["suggested_aliases"]
def test_proposals_include_sample_memories_and_guessed_root(env):
    """A proposal carries counts, a guessed ingest root, and exactly
    three sample memory previews."""
    for i in range(11):
        create_memory("knowledge", f"sample content {i}", project="apm")
    payload = TestClient(app).get("/admin/projects/proposals").json()
    apm = next(p for p in payload["proposals"] if p["project_id"] == "apm")
    assert apm["active_count"] == 11
    assert apm["candidate_count"] == 0
    assert apm["guessed_ingest_root"] == {
        "source": "vault",
        "subpath": "incoming/projects/apm/",
    }
    samples = apm["sample_memories"]
    assert len(samples) == 3
    for sample in samples:
        assert sample["id"]
        assert "sample content" in sample["content_preview"]
def test_proposals_count_candidates_separately(env):
    """active_count and candidate_count are tallied independently."""
    for i in range(11):
        create_memory("knowledge", f"active {i}", project="apm")
    for i in range(4):
        create_memory("knowledge", f"candidate {i}", project="apm", status="candidate")
    body = TestClient(app).get("/admin/projects/proposals").json()
    apm = next(p for p in body["proposals"] if p["project_id"] == "apm")
    assert apm["active_count"] == 11
    assert apm["candidate_count"] == 4
def test_proposals_min_active_param_validation(env):
    """A min_active below 1 is rejected with HTTP 400."""
    response = TestClient(app).get("/admin/projects/proposals?min_active=0")
    assert response.status_code == 400
def test_proposals_sorted_by_active_count_desc(env):
    """Proposals come back ordered by active_count, largest first."""
    for i in range(20):
        create_memory("knowledge", f"big {i}", project="apm")
    for i in range(11):
        create_memory("knowledge", f"small {i}", project="openclaw")
    ordered = [
        p["project_id"]
        for p in TestClient(app).get("/admin/projects/proposals").json()["proposals"]
    ]
    assert ordered[0] == "apm"
    assert ordered[1] == "openclaw"