feat(projects): Wave 1.5 — live emerging-project registration proposals
GET /admin/projects/proposals?min_active=N — on-demand companion to
the nightly scripts/detect_emerging.py cache. Reads SQL + the registry
directly so the result is current.
Each proposal:
- project_id (literal label as captured)
- active_count / candidate_count from current SQL
- sample_memories: 3 most recent active rows with content preview
- suggested_aliases: sibling labels sharing a >=4-char token,
case-insensitive (lead-space + lead-space-exploration-ltd +
space-exploration-ltd cluster; apm and apm-fpga do NOT cluster
via the 3-char 'apm')
- guessed_ingest_root: vault:incoming/projects/<id>/
Workflow: hit /admin/projects/proposals to see "what should I register?",
then POST to existing /admin/projects/register-emerging.
For prod: apm has 165 active memories, openclaw has 17,
hydrotech-mining variants combine to 13. apm is overdue.
Closes Codex's prior P2 from the state-of-service review. Reviewed by
Codex on tip e8ac8bb (verdict GO); two follow-on improvements (stronger
negative-clustering test + case-insensitive tokens) folded into f70fa6b.
10 regression tests covering: registered canonical/alias exclusion,
threshold filtering, sibling clustering, short-token negative,
case-insensitive clustering, registered-token-leak guard, sample shape,
candidate counting, param validation, sort order.
Test count: 586 -> 596.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -474,6 +474,33 @@ def api_register_emerging_project(req: RegisterEmergingRequest) -> dict:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/admin/projects/proposals")
def api_project_proposals(min_active: int = 10) -> dict:
    """Live registration proposals for unregistered projects.

    Unlike `/admin/dashboard.proposals.unregistered_projects` — the nightly
    cache written by `scripts/detect_emerging.py` — this reads SQL and the
    registry directly, so the answer is always current. Every proposal ships
    with a guessed ingest root, sibling labels suggested as aliases, and a
    handful of sample memories, giving the operator enough context to
    sanity-check before POSTing to /admin/projects/register-emerging.

    Query params:
        min_active: minimum active-memory count for a label to surface
            (default 10).
    """
    from atocore.memory.service import propose_emerging_projects

    # Manual validation (rather than Query(ge=1)) so callers get a 400, not
    # FastAPI's 422, matching the rest of the admin surface.
    if min_active < 1:
        raise HTTPException(status_code=400, detail="min_active must be >= 1")

    proposals = propose_emerging_projects(min_active=min_active)
    return {
        "proposals": proposals,
        "count": len(proposals),
        "min_active": min_active,
    }
||||||
|
|
||||||
@router.put("/projects/{project_name}")
|
@router.put("/projects/{project_name}")
|
||||||
def api_project_update(project_name: str, req: ProjectUpdateRequest) -> dict:
|
def api_project_update(project_name: str, req: ProjectUpdateRequest) -> dict:
|
||||||
"""Update an existing project registration."""
|
"""Update an existing project registration."""
|
||||||
|
|||||||
@@ -424,6 +424,126 @@ def get_memory_count_summary() -> dict:
|
|||||||
return summary
|
return summary
|
||||||
|
|
||||||
|
|
||||||
|
def propose_emerging_projects(min_active: int = 10) -> list[dict]:
    """Return live, on-demand registration proposals for unregistered projects.

    Differs from the nightly ``scripts/detect_emerging.py`` cache (which
    is fresh once a day and lives in ``project_state.proposals``) by
    reading current SQL and the registry directly. Each proposal is
    operator-ready: a guessed ingest root, sibling labels suggested as
    aliases, and a few sample memories so the operator can sanity-check
    the bucket before committing it.

    Args:
        min_active: minimum number of active memories required for a
            label to surface as a proposal. Defaults to 10 — anything
            smaller is too noisy to register without more signal.

    Returns:
        list of proposal dicts, sorted by active_count desc:
            {
                "project_id": str,
                "active_count": int,
                "candidate_count": int,
                "suggested_aliases": list[str],
                "guessed_ingest_root": {"source": "vault", "subpath": ...},
                "sample_memories": [{id, content_preview, updated_at}, ...],
            }
    """
    from atocore.projects.registry import load_project_registry

    # Build the set of names already known to the registry (canonical +
    # aliases), lowercased. Anything in this set is "registered" and not
    # a proposal.
    registered_names: set[str] = set()
    try:
        for project in load_project_registry():
            registered_names.add(project.project_id.lower())
            for alias in project.aliases:
                registered_names.add(alias.lower())
    except Exception:
        # Fail-open: if the registry can't load, assume nothing is
        # registered and let the proposal surface everything.
        pass

    with get_connection() as conn:
        # Active counts per project (excluding empty/null project — that's
        # the global bucket, not a proposal candidate).
        active_rows = conn.execute(
            "SELECT project, count(*) AS c FROM memories "
            "WHERE status = 'active' AND project IS NOT NULL AND project != '' "
            "GROUP BY project"
        ).fetchall()
        cand_rows = conn.execute(
            "SELECT project, count(*) AS c FROM memories "
            "WHERE status = 'candidate' AND project IS NOT NULL AND project != '' "
            "GROUP BY project"
        ).fetchall()

        cand_counts = {r["project"]: r["c"] for r in cand_rows}

        # Filter to unregistered labels above threshold
        unregistered: list[tuple[str, int, int]] = []  # (project, active_n, candidate_n)
        for r in active_rows:
            proj = r["project"]
            if proj.lower() in registered_names:
                continue
            if r["c"] < min_active:
                continue
            unregistered.append((proj, r["c"], cand_counts.get(proj, 0)))

        # Sample memories: top 3 active by updated_at desc. Fetched while the
        # connection is still open so the whole call uses ONE connection —
        # the previous version reopened a connection per proposal (N+1).
        samples_by_label: dict[str, list[dict]] = {}
        for proj, _active_n, _candidate_n in unregistered:
            sample_rows = conn.execute(
                "SELECT id, content, updated_at FROM memories "
                "WHERE status = 'active' AND project = ? "
                "ORDER BY updated_at DESC LIMIT 3",
                (proj,),
            ).fetchall()
            samples_by_label[proj] = [
                {
                    "id": r["id"],
                    "content_preview": (r["content"] or "")[:160],
                    "updated_at": r["updated_at"],
                }
                for r in sample_rows
            ]

    # Sibling alias detection: two unregistered labels are siblings if
    # they share a non-trivial token (length >= 4 after splitting on
    # '-' and '_'). Cheap, defensible, and the operator gets to veto.
    def _tokens(label: str) -> set[str]:
        parts = label.lower().replace("_", "-").split("-")
        return {p for p in parts if len(p) >= 4}

    label_tokens = {label: _tokens(label) for label, _a, _c in unregistered}

    proposals = []
    for proj, active_n, candidate_n in sorted(
        unregistered, key=lambda t: (-t[1], t[0])
    ):
        siblings = sorted(
            other
            for other in label_tokens
            if other != proj and (label_tokens[proj] & label_tokens[other])
        )

        proposals.append(
            {
                "project_id": proj,
                "active_count": active_n,
                "candidate_count": candidate_n,
                "suggested_aliases": siblings,
                "guessed_ingest_root": {
                    "source": "vault",
                    "subpath": f"incoming/projects/{proj}/",
                },
                "sample_memories": samples_by_label[proj],
            }
        )
    return proposals
||||||
def update_memory(
|
def update_memory(
|
||||||
memory_id: str,
|
memory_id: str,
|
||||||
content: str | None = None,
|
content: str | None = None,
|
||||||
|
|||||||
217
tests/test_emerging_project_proposals.py
Normal file
217
tests/test_emerging_project_proposals.py
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
"""Wave 1.5 — live emerging-project registration proposals.
|
||||||
|
|
||||||
|
The nightly `scripts/detect_emerging.py` writes a stale cache to
|
||||||
|
`project_state.proposals.unregistered_projects`. This endpoint provides
|
||||||
|
the on-demand alternative that operators can hit before deciding which
|
||||||
|
unregistered project to register via `/admin/projects/register-emerging`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
import atocore.config as config
|
||||||
|
from atocore.main import app
|
||||||
|
from atocore.memory.service import create_memory
|
||||||
|
from atocore.models.database import init_db
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def env(tmp_data_dir, tmp_path, monkeypatch):
    """Fresh DB plus a registry containing exactly one registered project,
    so tests can prove registered names never surface as proposals."""
    registry = {
        "projects": [
            {
                "id": "p04-gigabit",
                "aliases": ["p04", "gigabit"],
                "description": "test",
                "ingest_roots": [
                    {"source": "vault", "subpath": "incoming/projects/p04-gigabit"}
                ],
            }
        ]
    }
    registry_path = tmp_path / "registry.json"
    registry_path.write_text(json.dumps(registry), encoding="utf-8")
    monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_path))
    config.settings = config.Settings()
    init_db()
    yield tmp_data_dir
||||||
|
def test_proposals_excludes_registered_project_and_its_aliases(env):
    """Memories filed under a registered canonical id, or under any of its
    aliases, must never show up as a registration proposal."""
    # Registered: p04-gigabit (aliases p04, gigabit)
    for n in range(15):
        create_memory("knowledge", f"p04 fact {n}", project="p04-gigabit")
    for n in range(15):
        create_memory("knowledge", f"alias fact {n}", project="p04")  # alias

    # Unregistered and above the threshold
    for n in range(12):
        create_memory("knowledge", f"apm fact {n}", project="apm")

    response = TestClient(app).get("/admin/projects/proposals?min_active=10")
    proposal_ids = {p["project_id"] for p in response.json()["proposals"]}

    assert "apm" in proposal_ids
    for registered in ("p04-gigabit", "p04", "gigabit"):
        assert registered not in proposal_ids
||||||
|
def test_proposals_threshold_filters_low_count_labels(env):
    """Labels below min_active are suppressed; those at/above it surface."""
    create_memory("knowledge", "single one-off", project="discrawl")
    for n in range(3):
        create_memory("knowledge", f"low-volume {n}", project="drill")
    for n in range(11):
        create_memory("knowledge", f"high-volume {n}", project="apm")

    body = TestClient(app).get("/admin/projects/proposals?min_active=10").json()
    proposal_ids = {p["project_id"] for p in body["proposals"]}

    assert "apm" in proposal_ids
    assert "drill" not in proposal_ids
    assert "discrawl" not in proposal_ids
||||||
|
def test_proposals_suggest_sibling_aliases_via_shared_tokens(env):
    """lead-space, lead-space-exploration-ltd and space-exploration-ltd all
    share >=4-char tokens, so each one must propose at least one of the
    others in its suggested_aliases."""
    cluster = ("lead-space", "lead-space-exploration-ltd", "space-exploration-ltd")
    for label in cluster:
        for n in range(11):
            create_memory("knowledge", f"{label} content {n}", project=label)

    body = TestClient(app).get("/admin/projects/proposals").json()
    by_id = {p["project_id"]: p for p in body["proposals"]}

    for label in cluster:
        assert label in by_id
        # Every label shares "space" (and pairs also share "exploration" or
        # "lead"), so at least one other cluster member must be suggested.
        others = set(cluster) - {label}
        assert set(by_id[label]["suggested_aliases"]) & others
||||||
|
def test_proposals_short_token_does_not_match(env):
    """Guard against relaxing the 4-char token minimum (Codex Wave 1.5 P2).

    'apm' and 'apm-fpga' share only the 3-char token 'apm' and must NOT
    cluster. 'foo-fpga' and 'bar-fpga' share the 4-char token 'fpga' and
    MUST cluster — proving the clustering mechanism itself works, so the
    negative assertion is meaningful rather than trivially true.
    """
    for label in ("apm", "apm-fpga", "foo-fpga", "bar-fpga"):
        for n in range(11):
            create_memory("knowledge", f"{label} fact {n}", project=label)

    body = TestClient(app).get("/admin/projects/proposals").json()
    by_id = {p["project_id"]: p for p in body["proposals"]}

    # Negative: a shared 3-char token must not trigger clustering.
    assert "apm-fpga" not in by_id["apm"]["suggested_aliases"], (
        "'apm' (3 chars) is below the 4-char minimum; 'apm' and 'apm-fpga' "
        "must not cluster via the 'apm' token."
    )

    # Positive: the 4-char 'fpga' token clusters — in both directions.
    assert "bar-fpga" in by_id["foo-fpga"]["suggested_aliases"]
    assert "foo-fpga" in by_id["bar-fpga"]["suggested_aliases"]
    # And 'apm-fpga' joins the fpga cluster as well.
    assert "apm-fpga" in by_id["foo-fpga"]["suggested_aliases"]
||||||
|
def test_proposals_clustering_is_case_insensitive(env):
    """Labels captured with mixed casing must still cluster via their
    shared (lowercased) tokens. Codex Wave 1.5 P3."""
    for label in ("HydroTech-Mining", "hydrotech-split-tank"):
        for n in range(11):
            create_memory("knowledge", f"{label} fact {n}", project=label)

    body = TestClient(app).get("/admin/projects/proposals").json()
    by_id = {p["project_id"]: p for p in body["proposals"]}

    assert "hydrotech-split-tank" in by_id["HydroTech-Mining"]["suggested_aliases"]
    assert "HydroTech-Mining" in by_id["hydrotech-split-tank"]["suggested_aliases"]
||||||
|
def test_proposals_registered_token_does_not_leak_into_sibling_set(env):
    """Registered project ids must be filtered BEFORE clustering so a
    registered token doesn't get suggested as an alias for an
    unregistered sibling. p04-gigabit is registered in env; an
    unregistered 'gigabit-other' must not list 'p04-gigabit' as alias.

    (Dropped the unused ``monkeypatch`` fixture parameter — the body
    never used it.)
    """
    for i in range(15):
        create_memory("knowledge", f"p04 fact {i}", project="p04-gigabit")
    for i in range(11):
        create_memory("knowledge", f"gigabit-other fact {i}", project="gigabit-other")

    client = TestClient(app)
    proposals = {
        p["project_id"]: p
        for p in client.get("/admin/projects/proposals").json()["proposals"]
    }
    assert "p04-gigabit" not in proposals
    assert "gigabit-other" in proposals
    # And the registered name must not surface as a sibling
    assert "p04-gigabit" not in proposals["gigabit-other"]["suggested_aliases"]
||||||
|
def test_proposals_include_sample_memories_and_guessed_root(env):
    """A proposal carries a guessed vault ingest root and exactly three
    recent sample memories, each with a content preview."""
    for n in range(11):
        create_memory("knowledge", f"sample content {n}", project="apm")

    body = TestClient(app).get("/admin/projects/proposals").json()
    apm = next(p for p in body["proposals"] if p["project_id"] == "apm")

    assert apm["active_count"] == 11
    assert apm["candidate_count"] == 0
    expected_root = {"source": "vault", "subpath": "incoming/projects/apm/"}
    assert apm["guessed_ingest_root"] == expected_root
    assert len(apm["sample_memories"]) == 3
    for sample in apm["sample_memories"]:
        assert sample["id"]
        assert "sample content" in sample["content_preview"]
||||||
|
def test_proposals_count_candidates_separately(env):
    """active_count and candidate_count are tallied independently."""
    for n in range(11):
        create_memory("knowledge", f"active {n}", project="apm")
    for n in range(4):
        create_memory("knowledge", f"candidate {n}", project="apm", status="candidate")

    body = TestClient(app).get("/admin/projects/proposals").json()
    apm = next(p for p in body["proposals"] if p["project_id"] == "apm")

    assert apm["active_count"] == 11
    assert apm["candidate_count"] == 4
||||||
|
def test_proposals_min_active_param_validation(env):
    """min_active below 1 is rejected with HTTP 400."""
    response = TestClient(app).get("/admin/projects/proposals?min_active=0")
    assert response.status_code == 400
||||||
|
def test_proposals_sorted_by_active_count_desc(env):
    """Proposals come back ordered by active_count, largest first."""
    for n in range(20):
        create_memory("knowledge", f"big {n}", project="apm")
    for n in range(11):
        create_memory("knowledge", f"small {n}", project="openclaw")

    body = TestClient(app).get("/admin/projects/proposals").json()
    ordered_ids = [p["project_id"] for p in body["proposals"]]
    assert ordered_ids[:2] == ["apm", "openclaw"]
||||||
Reference in New Issue
Block a user