feat(projects): Wave 1.5 — live emerging-project registration proposals
GET /admin/projects/proposals?min_active=N — on-demand companion to
the nightly scripts/detect_emerging.py cache. Reads SQL + the registry
directly so the result is current.
Each proposal:
- project_id (literal label as captured)
- active_count / candidate_count from current SQL
- sample_memories: 3 most recent active rows with content preview
- suggested_aliases: sibling labels sharing a >=4-char token,
case-insensitive (lead-space + lead-space-exploration-ltd +
space-exploration-ltd cluster; apm and apm-fpga do NOT cluster
via the 3-char 'apm')
- guessed_ingest_root: vault:incoming/projects/<id>/
Workflow: hit /admin/projects/proposals to see "what should I register?",
then POST to existing /admin/projects/register-emerging.
For prod: apm has 165 active memories, openclaw has 17,
hydrotech-mining variants combine to 13. apm is overdue.
Closes Codex's prior P2 from the state-of-service review. Reviewed by
Codex on tip e8ac8bb (verdict GO); two follow-on improvements (stronger
negative-clustering test + case-insensitive tokens) folded into f70fa6b.
10 regression tests covering: registered canonical/alias exclusion,
threshold filtering, sibling clustering, short-token negative,
case-insensitive clustering, registered-token-leak guard, sample shape,
candidate counting, param validation, sort order.
Test count: 586 -> 596.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -474,6 +474,33 @@ def api_register_emerging_project(req: RegisterEmergingRequest) -> dict:
|
||||
return result
|
||||
|
||||
|
||||
@router.get("/admin/projects/proposals")
def api_project_proposals(min_active: int = 10) -> dict:
    """On-demand registration proposals for projects not yet in the registry.

    Unlike `/admin/dashboard.proposals.unregistered_projects` — the nightly
    cache produced by `scripts/detect_emerging.py` — this endpoint queries
    SQL and the registry at request time, so its answer is always current.
    Every proposal carries a guessed ingest root, sibling labels that look
    like aliases, and a handful of sample memories, giving the operator
    enough context to sanity-check before POSTing to
    /admin/projects/register-emerging.

    Query params:
        min_active: minimum active-memory count for a label to surface
            (default 10).
    """
    from atocore.memory.service import propose_emerging_projects

    # Reject non-positive thresholds up front; a threshold below 1 is
    # meaningless for "how many active memories must a label have".
    if min_active < 1:
        raise HTTPException(status_code=400, detail="min_active must be >= 1")

    found = propose_emerging_projects(min_active=min_active)
    payload: dict = {"proposals": found}
    payload["count"] = len(found)
    payload["min_active"] = min_active
    return payload
|
||||
|
||||
|
||||
@router.put("/projects/{project_name}")
|
||||
def api_project_update(project_name: str, req: ProjectUpdateRequest) -> dict:
|
||||
"""Update an existing project registration."""
|
||||
|
||||
@@ -424,6 +424,126 @@ def get_memory_count_summary() -> dict:
|
||||
return summary
|
||||
|
||||
|
||||
def propose_emerging_projects(min_active: int = 10) -> list[dict]:
    """Return live, on-demand registration proposals for unregistered projects.

    Differs from the nightly ``scripts/detect_emerging.py`` cache (which
    is fresh once a day and lives in ``project_state.proposals``) by
    reading current SQL and the registry directly. Each proposal is
    operator-ready: a guessed ingest root, sibling labels suggested as
    aliases, and a few sample memories so the operator can sanity-check
    the bucket before committing it.

    Args:
        min_active: minimum number of active memories required for a
            label to surface as a proposal. Defaults to 10 — anything
            smaller is too noisy to register without more signal.

    Returns:
        list of proposal dicts, sorted by active_count desc (ties broken
        by project_id asc):
        {
            "project_id": str,
            "active_count": int,
            "candidate_count": int,
            "suggested_aliases": list[str],
            "guessed_ingest_root": {"source": "vault", "subpath": ...},
            "sample_memories": [{id, content_preview, updated_at}, ...],
        }
    """
    from atocore.projects.registry import load_project_registry

    # Build the set of names already known to the registry (canonical +
    # aliases), lowercased. Anything in this set is "registered" and not
    # a proposal.
    registered_names: set[str] = set()
    try:
        for project in load_project_registry():
            registered_names.add(project.project_id.lower())
            for alias in project.aliases:
                registered_names.add(alias.lower())
    except Exception:
        # Fail-open: if the registry can't load, assume nothing is
        # registered and let the proposal surface everything.
        pass

    # One connection for the whole function. The previous version opened a
    # fresh connection per proposal for the sample queries (N+1 pattern);
    # reusing the aggregate-query connection returns identical rows.
    with get_connection() as conn:
        # Active counts per project (excluding empty/null project — that's
        # the global bucket, not a proposal candidate).
        active_rows = conn.execute(
            "SELECT project, count(*) AS c FROM memories "
            "WHERE status = 'active' AND project IS NOT NULL AND project != '' "
            "GROUP BY project"
        ).fetchall()
        cand_rows = conn.execute(
            "SELECT project, count(*) AS c FROM memories "
            "WHERE status = 'candidate' AND project IS NOT NULL AND project != '' "
            "GROUP BY project"
        ).fetchall()

        cand_counts = {r["project"]: r["c"] for r in cand_rows}

        # Filter to unregistered labels above threshold.
        unregistered: list[tuple[str, int, int]] = []  # (project, active_n, candidate_n)
        for r in active_rows:
            proj = r["project"]
            if proj.lower() in registered_names:
                continue
            if r["c"] < min_active:
                continue
            unregistered.append((proj, r["c"], cand_counts.get(proj, 0)))

        # Sibling alias detection: two unregistered labels are siblings if
        # they share a non-trivial token (length >= 4 after splitting on
        # '-' and '_'). Cheap, defensible, and the operator gets to veto.
        def _tokens(label: str) -> set[str]:
            parts = label.lower().replace("_", "-").split("-")
            return {p for p in parts if len(p) >= 4}

        label_tokens = {label: _tokens(label) for label, _a, _c in unregistered}

        proposals = []
        for proj, active_n, candidate_n in sorted(
            unregistered, key=lambda t: (-t[1], t[0])
        ):
            siblings = sorted(
                other
                for other in label_tokens
                if other != proj and (label_tokens[proj] & label_tokens[other])
            )

            # Sample memories: top 3 active by updated_at desc.
            sample_rows = conn.execute(
                "SELECT id, content, updated_at FROM memories "
                "WHERE status = 'active' AND project = ? "
                "ORDER BY updated_at DESC LIMIT 3",
                (proj,),
            ).fetchall()
            samples = [
                {
                    "id": r["id"],
                    "content_preview": (r["content"] or "")[:160],
                    "updated_at": r["updated_at"],
                }
                for r in sample_rows
            ]

            proposals.append(
                {
                    "project_id": proj,
                    "active_count": active_n,
                    "candidate_count": candidate_n,
                    "suggested_aliases": siblings,
                    "guessed_ingest_root": {
                        "source": "vault",
                        "subpath": f"incoming/projects/{proj}/",
                    },
                    "sample_memories": samples,
                }
            )
    return proposals
|
||||
|
||||
|
||||
def update_memory(
|
||||
memory_id: str,
|
||||
content: str | None = None,
|
||||
|
||||
Reference in New Issue
Block a user