feat(projects): Wave 1.5 — live emerging-project registration proposals

GET /admin/projects/proposals?min_active=N — on-demand companion to
the nightly scripts/detect_emerging.py cache. Reads SQL + the registry
directly so the result is current.

Each proposal (concrete example below):
  - project_id (literal label as captured)
  - active_count / candidate_count from current SQL
  - sample_memories: 3 most recent active rows with content preview
  - suggested_aliases: sibling labels sharing a >=4-char token,
    case-insensitive (lead-space + lead-space-exploration-ltd +
    space-exploration-ltd cluster; apm and apm-fpga do NOT cluster
    via the 3-char 'apm')
  - guessed_ingest_root: vault:incoming/projects/<id>/
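
For concreteness, one proposal might look like this; the shape is the
endpoint's, the values are illustrative:

    {
        "project_id": "lead-space",
        "active_count": 12,
        "candidate_count": 3,
        "suggested_aliases": ["lead-space-exploration-ltd",
                              "space-exploration-ltd"],
        "guessed_ingest_root": {"source": "vault",
                                "subpath": "incoming/projects/lead-space/"},
        "sample_memories": [
            {"id": "…", "content_preview": "…", "updated_at": "…"},
        ],
    }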

Workflow: hit /admin/projects/proposals to answer "what should I
register?", then POST the winner to the existing
/admin/projects/register-emerging, as sketched below.
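
A minimal operator-side sketch of that loop. httpx and the
register-emerging payload fields here are assumptions, not the real
RegisterEmergingRequest schema:

    import httpx  # assumed HTTP client; any client works

    BASE = "http://localhost:8000"  # hypothetical service address

    # Step 1: what should I register?
    body = httpx.get(f"{BASE}/admin/projects/proposals",
                     params={"min_active": 10}).json()
    for p in body["proposals"]:
        print(p["project_id"], p["active_count"], p["suggested_aliases"])

    # Step 2: register the top proposal. The field names in this json
    # payload are illustrative; check RegisterEmergingRequest for the
    # real schema.
    top = body["proposals"][0]
    httpx.post(f"{BASE}/admin/projects/register-emerging",
               json={"project_id": top["project_id"],
                     "aliases": top["suggested_aliases"],
                     "ingest_root": top["guessed_ingest_root"]})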

In prod: apm has 165 active memories, openclaw has 17, and the
hydrotech-mining variants combine to 13. apm is overdue for registration.

Closes Codex's prior P2 from the state-of-service review. Reviewed by
Codex on tip e8ac8bb (verdict GO); two follow-on improvements (stronger
negative-clustering test + case-insensitive tokens) folded into f70fa6b.

10 regression tests covering: registered canonical/alias exclusion,
threshold filtering, sibling clustering, short-token negative,
case-insensitive clustering, registered-token-leak guard, sample shape,
candidate counting, param validation, sort order.

Test count: 586 -> 596.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 22:16:18 -04:00
parent d4ee52729c
commit b69d2c7088
3 changed files with 364 additions and 0 deletions


@@ -474,6 +474,33 @@ def api_register_emerging_project(req: RegisterEmergingRequest) -> dict:
    return result


@router.get("/admin/projects/proposals")
def api_project_proposals(min_active: int = 10) -> dict:
    """Live registration proposals for unregistered projects.

    Reads SQL + the registry directly, so the result is current — unlike
    `/admin/dashboard.proposals.unregistered_projects`, which is the
    nightly cache from `scripts/detect_emerging.py`. Each proposal
    includes a guessed ingest root, sibling labels suggested as aliases,
    and a few sample memories so the operator can sanity-check before
    POSTing to /admin/projects/register-emerging.

    Query params:
        min_active: minimum active-memory count for a label to surface
            (default 10).
    """
    from atocore.memory.service import propose_emerging_projects

    if min_active < 1:
        raise HTTPException(status_code=400, detail="min_active must be >= 1")
    proposals = propose_emerging_projects(min_active=min_active)
    return {
        "proposals": proposals,
        "count": len(proposals),
        "min_active": min_active,
    }


@router.put("/projects/{project_name}")
def api_project_update(project_name: str, req: ProjectUpdateRequest) -> dict:
    """Update an existing project registration."""


@@ -424,6 +424,126 @@ def get_memory_count_summary() -> dict:
    return summary


def propose_emerging_projects(min_active: int = 10) -> list[dict]:
    """Return live, on-demand registration proposals for unregistered projects.

    Differs from the nightly ``scripts/detect_emerging.py`` cache (which
    is fresh once a day and lives in ``project_state.proposals``) by
    reading current SQL and the registry directly. Each proposal is
    operator-ready: a guessed ingest root, sibling labels suggested as
    aliases, and a few sample memories so the operator can sanity-check
    the bucket before committing it.

    Args:
        min_active: minimum number of active memories required for a
            label to surface as a proposal. Defaults to 10 — anything
            smaller is too noisy to register without more signal.

    Returns:
        list of proposal dicts, sorted by active_count desc:
        {
            "project_id": str,
            "active_count": int,
            "candidate_count": int,
            "suggested_aliases": list[str],
            "guessed_ingest_root": {"source": "vault", "subpath": ...},
            "sample_memories": [{id, content_preview, updated_at}, ...],
        }
    """
    from atocore.projects.registry import load_project_registry

    # Build the set of names already known to the registry (canonical +
    # aliases), lowercased. Anything in this set is "registered" and not
    # a proposal.
    registered_names: set[str] = set()
    try:
        for project in load_project_registry():
            registered_names.add(project.project_id.lower())
            for alias in project.aliases:
                registered_names.add(alias.lower())
    except Exception:
        # Fail-open: if the registry can't load, assume nothing is
        # registered and let the proposal surface everything.
        pass

    with get_connection() as conn:
        # Active counts per project (excluding empty/null project — that's
        # the global bucket, not a proposal candidate).
        active_rows = conn.execute(
            "SELECT project, count(*) AS c FROM memories "
            "WHERE status = 'active' AND project IS NOT NULL AND project != '' "
            "GROUP BY project"
        ).fetchall()
        cand_rows = conn.execute(
            "SELECT project, count(*) AS c FROM memories "
            "WHERE status = 'candidate' AND project IS NOT NULL AND project != '' "
            "GROUP BY project"
        ).fetchall()
    cand_counts = {r["project"]: r["c"] for r in cand_rows}

    # Filter to unregistered labels above threshold.
    unregistered: list[tuple[str, int, int]] = []  # (project, active_n, candidate_n)
    for r in active_rows:
        proj = r["project"]
        if proj.lower() in registered_names:
            continue
        if r["c"] < min_active:
            continue
        unregistered.append((proj, r["c"], cand_counts.get(proj, 0)))

    # Sibling alias detection: two unregistered labels are siblings if
    # they share a non-trivial token (length >= 4 after splitting on
    # '-' and '_'). Cheap, defensible, and the operator gets to veto.
    def _tokens(label: str) -> set[str]:
        parts = label.lower().replace("_", "-").split("-")
        return {p for p in parts if len(p) >= 4}

    label_tokens = {label: _tokens(label) for label, _a, _c in unregistered}

    proposals = []
    for proj, active_n, candidate_n in sorted(
        unregistered, key=lambda t: (-t[1], t[0])
    ):
        siblings = [
            other
            for other in label_tokens
            if other != proj and (label_tokens[proj] & label_tokens[other])
        ]
        siblings.sort()
        # Sample memories: top 3 active by updated_at desc.
        with get_connection() as conn:
            sample_rows = conn.execute(
                "SELECT id, content, updated_at FROM memories "
                "WHERE status = 'active' AND project = ? "
                "ORDER BY updated_at DESC LIMIT 3",
                (proj,),
            ).fetchall()
        samples = [
            {
                "id": r["id"],
                "content_preview": (r["content"] or "")[:160],
                "updated_at": r["updated_at"],
            }
            for r in sample_rows
        ]
        proposals.append(
            {
                "project_id": proj,
                "active_count": active_n,
                "candidate_count": candidate_n,
                "suggested_aliases": siblings,
                "guessed_ingest_root": {
                    "source": "vault",
                    "subpath": f"incoming/projects/{proj}/",
                },
                "sample_memories": samples,
            }
        )
    return proposals


def update_memory(
    memory_id: str,
    content: str | None = None,


@@ -0,0 +1,217 @@
"""Wave 1.5 — live emerging-project registration proposals.

The nightly `scripts/detect_emerging.py` writes a stale cache to
`project_state.proposals.unregistered_projects`. This endpoint provides
the on-demand alternative that operators can hit before deciding which
unregistered project to register via `/admin/projects/register-emerging`.
"""

import json

import pytest
from fastapi.testclient import TestClient

import atocore.config as config
from atocore.main import app
from atocore.memory.service import create_memory
from atocore.models.database import init_db


@pytest.fixture
def env(tmp_data_dir, tmp_path, monkeypatch):
    """Fresh DB + a registry holding a single registered project so we
    can prove the proposals endpoint excludes registered names."""
    registry_path = tmp_path / "registry.json"
    registry_path.write_text(
        json.dumps(
            {
                "projects": [
                    {
                        "id": "p04-gigabit",
                        "aliases": ["p04", "gigabit"],
                        "description": "test",
                        "ingest_roots": [
                            {"source": "vault", "subpath": "incoming/projects/p04-gigabit"}
                        ],
                    }
                ]
            }
        ),
        encoding="utf-8",
    )
    monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_path))
    config.settings = config.Settings()
    init_db()
    yield tmp_data_dir


def test_proposals_excludes_registered_project_and_its_aliases(env):
    """Memories tagged on a registered canonical id or any of its
    aliases must not appear as a registration proposal."""
    # Registered: p04-gigabit (aliases p04, gigabit)
    for i in range(15):
        create_memory("knowledge", f"p04 fact {i}", project="p04-gigabit")
    for i in range(15):
        create_memory("knowledge", f"alias fact {i}", project="p04")  # alias
    # Unregistered, above threshold
    for i in range(12):
        create_memory("knowledge", f"apm fact {i}", project="apm")

    client = TestClient(app)
    body = client.get("/admin/projects/proposals?min_active=10").json()
    ids = [p["project_id"] for p in body["proposals"]]
    assert "apm" in ids
    assert "p04-gigabit" not in ids
    assert "p04" not in ids
    assert "gigabit" not in ids


def test_proposals_threshold_filters_low_count_labels(env):
    create_memory("knowledge", "single one-off", project="discrawl")
    for i in range(3):
        create_memory("knowledge", f"low-volume {i}", project="drill")
    for i in range(11):
        create_memory("knowledge", f"high-volume {i}", project="apm")

    client = TestClient(app)
    proposals = client.get("/admin/projects/proposals?min_active=10").json()["proposals"]
    ids = [p["project_id"] for p in proposals]
    assert "apm" in ids
    assert "drill" not in ids
    assert "discrawl" not in ids


def test_proposals_suggest_sibling_aliases_via_shared_tokens(env):
    """lead-space + lead-space-exploration-ltd + space-exploration-ltd
    should cluster: each proposes the others as suggested_aliases via
    shared non-trivial tokens (length >= 4)."""
    for label in ("lead-space", "lead-space-exploration-ltd", "space-exploration-ltd"):
        for i in range(11):
            create_memory("knowledge", f"{label} content {i}", project=label)

    client = TestClient(app)
    proposals = {
        p["project_id"]: p
        for p in client.get("/admin/projects/proposals").json()["proposals"]
    }
    # All three appear and each suggests at least one of the others
    for label in ("lead-space", "lead-space-exploration-ltd", "space-exploration-ltd"):
        assert label in proposals
        siblings = set(proposals[label]["suggested_aliases"])
        # Every label shares "space" (and others share "exploration"/"lead")
        # so at least one sibling must be present.
        assert siblings & {"lead-space", "lead-space-exploration-ltd", "space-exploration-ltd"} - {label}


def test_proposals_short_token_does_not_match(env):
    """Per Codex Wave 1.5 P2: previously this test only asserted apm
    and drill have empty siblings, which is trivially true because they
    share no tokens at all. The real risk is an accidental relaxation
    that lets <4-char tokens trigger clustering. Construct a setup where
    that would matter:

    - 'apm' and 'apm-fpga': only the 3-char 'apm' is shared. They must
      NOT cluster, because 'apm' is too short.
    - 'foo-fpga' and 'bar-fpga': the 4-char 'fpga' is shared. They
      MUST cluster.
    """
    for label in ("apm", "apm-fpga", "foo-fpga", "bar-fpga"):
        for i in range(11):
            create_memory("knowledge", f"{label} fact {i}", project=label)

    client = TestClient(app)
    proposals = {
        p["project_id"]: p
        for p in client.get("/admin/projects/proposals").json()["proposals"]
    }
    # Negative: short-token match must not happen
    assert "apm-fpga" not in proposals["apm"]["suggested_aliases"], (
        "'apm' (3 chars) is below the 4-char minimum; 'apm' and 'apm-fpga' "
        "must not cluster via the 'apm' token."
    )
    # Positive: long-token match must happen — both directions
    assert "bar-fpga" in proposals["foo-fpga"]["suggested_aliases"]
    assert "foo-fpga" in proposals["bar-fpga"]["suggested_aliases"]
    # And 'apm-fpga' clusters with the others via 'fpga'
    assert "apm-fpga" in proposals["foo-fpga"]["suggested_aliases"]


def test_proposals_clustering_is_case_insensitive(env):
    """Token comparison must be case-insensitive so labels captured
    with mixed casing still cluster. Codex Wave 1.5 P3."""
    for label in ("HydroTech-Mining", "hydrotech-split-tank"):
        for i in range(11):
            create_memory("knowledge", f"{label} fact {i}", project=label)

    client = TestClient(app)
    proposals = {
        p["project_id"]: p
        for p in client.get("/admin/projects/proposals").json()["proposals"]
    }
    assert "hydrotech-split-tank" in proposals["HydroTech-Mining"]["suggested_aliases"]
    assert "HydroTech-Mining" in proposals["hydrotech-split-tank"]["suggested_aliases"]


def test_proposals_registered_token_does_not_leak_into_sibling_set(env, monkeypatch):
    """Registered project ids must be filtered BEFORE clustering so a
    registered token doesn't get suggested as an alias for an
    unregistered sibling. p04-gigabit is registered in env; an
    unregistered 'gigabit-other' must not list 'p04-gigabit' as alias."""
    for i in range(15):
        create_memory("knowledge", f"p04 fact {i}", project="p04-gigabit")
    for i in range(11):
        create_memory("knowledge", f"gigabit-other fact {i}", project="gigabit-other")

    client = TestClient(app)
    proposals = {
        p["project_id"]: p
        for p in client.get("/admin/projects/proposals").json()["proposals"]
    }
    assert "p04-gigabit" not in proposals
    assert "gigabit-other" in proposals
    # And the registered name must not surface as a sibling
    assert "p04-gigabit" not in proposals["gigabit-other"]["suggested_aliases"]


def test_proposals_include_sample_memories_and_guessed_root(env):
    for i in range(11):
        create_memory("knowledge", f"sample content {i}", project="apm")

    client = TestClient(app)
    body = client.get("/admin/projects/proposals").json()
    apm = next(p for p in body["proposals"] if p["project_id"] == "apm")
    assert apm["active_count"] == 11
    assert apm["candidate_count"] == 0
    assert apm["guessed_ingest_root"] == {
        "source": "vault",
        "subpath": "incoming/projects/apm/",
    }
    assert len(apm["sample_memories"]) == 3
    for s in apm["sample_memories"]:
        assert s["id"]
        assert "sample content" in s["content_preview"]


def test_proposals_count_candidates_separately(env):
    for i in range(11):
        create_memory("knowledge", f"active {i}", project="apm")
    for i in range(4):
        create_memory("knowledge", f"candidate {i}", project="apm", status="candidate")

    client = TestClient(app)
    apm = next(
        p for p in client.get("/admin/projects/proposals").json()["proposals"]
        if p["project_id"] == "apm"
    )
    assert apm["active_count"] == 11
    assert apm["candidate_count"] == 4


def test_proposals_min_active_param_validation(env):
    client = TestClient(app)
    r = client.get("/admin/projects/proposals?min_active=0")
    assert r.status_code == 400


def test_proposals_sorted_by_active_count_desc(env):
    for i in range(20):
        create_memory("knowledge", f"big {i}", project="apm")
    for i in range(11):
        create_memory("knowledge", f"small {i}", project="openclaw")

    client = TestClient(app)
    proposals = client.get("/admin/projects/proposals").json()["proposals"]
    ids = [p["project_id"] for p in proposals]
    assert ids[0] == "apm"
    assert ids[1] == "openclaw"