"""Registered project source metadata and refresh helpers.""" from __future__ import annotations import json import tempfile from dataclasses import asdict, dataclass from pathlib import Path import atocore.config as _config from atocore.ingestion.pipeline import ingest_folder @dataclass(frozen=True) class ProjectSourceRef: source: str subpath: str label: str = "" @dataclass(frozen=True) class RegisteredProject: project_id: str aliases: tuple[str, ...] description: str ingest_roots: tuple[ProjectSourceRef, ...] def get_project_registry_template() -> dict: """Return a minimal template for registering a new project.""" return { "projects": [ { "id": "p07-example", "aliases": ["p07", "example-project"], "description": "Short description of the project and staged corpus.", "ingest_roots": [ { "source": "vault", "subpath": "incoming/projects/p07-example", "label": "Primary staged project docs", } ], } ] } def build_project_registration_proposal( project_id: str, aliases: list[str] | tuple[str, ...] | None = None, description: str = "", ingest_roots: list[dict] | tuple[dict, ...] | None = None, ) -> dict: """Build a normalized project registration proposal without mutating state.""" normalized_id = project_id.strip() if not normalized_id: raise ValueError("Project id must be non-empty") normalized_aliases = _normalize_aliases(aliases or []) normalized_roots = _normalize_ingest_roots(ingest_roots or []) if not normalized_roots: raise ValueError("At least one ingest root is required") collisions = _find_name_collisions(normalized_id, normalized_aliases) resolved_roots = [] for root in normalized_roots: source_ref = ProjectSourceRef( source=root["source"], subpath=root["subpath"], label=root.get("label", ""), ) resolved_path = _resolve_ingest_root(source_ref) resolved_roots.append( { **root, "path": str(resolved_path), "exists": resolved_path.exists(), "is_dir": resolved_path.is_dir(), } ) return { "project": { "id": normalized_id, "aliases": normalized_aliases, "description": description.strip(), "ingest_roots": normalized_roots, }, "resolved_ingest_roots": resolved_roots, "collisions": collisions, "registry_path": str(_config.settings.resolved_project_registry_path), "valid": not collisions, } def register_project( project_id: str, aliases: list[str] | tuple[str, ...] | None = None, description: str = "", ingest_roots: list[dict] | tuple[dict, ...] | None = None, ) -> dict: """Persist a validated project registration to the registry file.""" proposal = build_project_registration_proposal( project_id=project_id, aliases=aliases, description=description, ingest_roots=ingest_roots, ) if not proposal["valid"]: collision_names = ", ".join(collision["name"] for collision in proposal["collisions"]) raise ValueError(f"Project registration has collisions: {collision_names}") registry_path = _config.settings.resolved_project_registry_path payload = _load_registry_payload(registry_path) payload.setdefault("projects", []).append(proposal["project"]) _write_registry_payload(registry_path, payload) return { **proposal, "status": "registered", } def update_project( project_name: str, aliases: list[str] | tuple[str, ...] | None = None, description: str | None = None, ingest_roots: list[dict] | tuple[dict, ...] | None = None, ) -> dict: """Update an existing project registration in the registry file.""" existing = get_registered_project(project_name) if existing is None: raise ValueError(f"Unknown project: {project_name}") final_aliases = _normalize_aliases(aliases) if aliases is not None else list(existing.aliases) final_description = description.strip() if description is not None else existing.description final_roots = ( _normalize_ingest_roots(ingest_roots) if ingest_roots is not None else [asdict(root) for root in existing.ingest_roots] ) if not final_roots: raise ValueError("At least one ingest root is required") collisions = _find_name_collisions( existing.project_id, final_aliases, exclude_project_id=existing.project_id, ) if collisions: collision_names = ", ".join(collision["name"] for collision in collisions) raise ValueError(f"Project update has collisions: {collision_names}") updated_entry = { "id": existing.project_id, "aliases": final_aliases, "description": final_description, "ingest_roots": final_roots, } resolved_roots = [] for root in final_roots: source_ref = ProjectSourceRef( source=root["source"], subpath=root["subpath"], label=root.get("label", ""), ) resolved_path = _resolve_ingest_root(source_ref) resolved_roots.append( { **root, "path": str(resolved_path), "exists": resolved_path.exists(), "is_dir": resolved_path.is_dir(), } ) registry_path = _config.settings.resolved_project_registry_path payload = _load_registry_payload(registry_path) payload["projects"] = [ updated_entry if str(entry.get("id", "")).strip() == existing.project_id else entry for entry in payload.get("projects", []) ] _write_registry_payload(registry_path, payload) return { "project": updated_entry, "resolved_ingest_roots": resolved_roots, "collisions": [], "registry_path": str(registry_path), "valid": True, "status": "updated", } def load_project_registry() -> list[RegisteredProject]: """Load project registry entries from JSON config.""" registry_path = _config.settings.resolved_project_registry_path payload = _load_registry_payload(registry_path) entries = payload.get("projects", []) projects: list[RegisteredProject] = [] for entry in entries: project_id = str(entry["id"]).strip() if not project_id: raise ValueError("Project registry entry is missing a non-empty id") aliases = tuple( alias.strip() for alias in entry.get("aliases", []) if isinstance(alias, str) and alias.strip() ) description = str(entry.get("description", "")).strip() ingest_roots = tuple( ProjectSourceRef( source=str(root["source"]).strip(), subpath=str(root["subpath"]).strip(), label=str(root.get("label", "")).strip(), ) for root in entry.get("ingest_roots", []) if str(root.get("source", "")).strip() and str(root.get("subpath", "")).strip() ) if not ingest_roots: raise ValueError(f"Project registry entry '{project_id}' has no ingest_roots") projects.append( RegisteredProject( project_id=project_id, aliases=aliases, description=description, ingest_roots=ingest_roots, ) ) _validate_unique_project_names(projects) return projects def list_registered_projects() -> list[dict]: """Return registry entries with resolved source readiness.""" return [_project_to_dict(project) for project in load_project_registry()] def get_registered_project(project_name: str) -> RegisteredProject | None: """Resolve a registry entry by id or alias.""" needle = project_name.strip().lower() if not needle: return None for project in load_project_registry(): candidates = {project.project_id.lower(), *(alias.lower() for alias in project.aliases)} if needle in candidates: return project return None def resolve_project_name(name: str | None) -> str: """Canonicalize a project name through the registry. Returns the canonical ``project_id`` if the input matches any registered project's id or alias. Returns the input unchanged when it's empty or not in the registry — the second case keeps backwards compatibility with hand-curated state, memories, and interactions that predate the registry, or for projects that are intentionally not registered. This helper is the single canonicalization boundary for project names across the trust hierarchy. Every read/write that takes a project name should pass it through ``resolve_project_name`` before storing or querying. The contract is documented in ``docs/architecture/representation-authority.md``. """ if not name: return name or "" project = get_registered_project(name) if project is not None: return project.project_id return name def refresh_registered_project(project_name: str, purge_deleted: bool = False) -> dict: """Ingest all configured source roots for a registered project. The returned dict carries an overall ``status`` so callers can tell at a glance whether the refresh was fully successful, partial, or did nothing at all because every configured root was missing or not a directory: - ``ingested``: every root was a real directory and was ingested - ``partial``: at least one root ingested and at least one was unusable - ``nothing_to_ingest``: no roots were usable """ project = get_registered_project(project_name) if project is None: raise ValueError(f"Unknown project: {project_name}") roots = [] ingested_count = 0 skipped_count = 0 for source_ref in project.ingest_roots: resolved = _resolve_ingest_root(source_ref) root_result = { "source": source_ref.source, "subpath": source_ref.subpath, "label": source_ref.label, "path": str(resolved), } if not resolved.exists(): roots.append({**root_result, "status": "missing"}) skipped_count += 1 continue if not resolved.is_dir(): roots.append({**root_result, "status": "not_directory"}) skipped_count += 1 continue roots.append( { **root_result, "status": "ingested", "results": ingest_folder(resolved, purge_deleted=purge_deleted), } ) ingested_count += 1 if ingested_count == 0: overall_status = "nothing_to_ingest" elif skipped_count == 0: overall_status = "ingested" else: overall_status = "partial" return { "project": project.project_id, "aliases": list(project.aliases), "description": project.description, "purge_deleted": purge_deleted, "status": overall_status, "roots_ingested": ingested_count, "roots_skipped": skipped_count, "roots": roots, } def _normalize_aliases(aliases: list[str] | tuple[str, ...]) -> list[str]: deduped: list[str] = [] seen: set[str] = set() for alias in aliases: candidate = alias.strip() if not candidate: continue key = candidate.lower() if key in seen: continue seen.add(key) deduped.append(candidate) return deduped def _normalize_ingest_roots(ingest_roots: list[dict] | tuple[dict, ...]) -> list[dict]: normalized: list[dict] = [] for root in ingest_roots: source = str(root.get("source", "")).strip() subpath = str(root.get("subpath", "")).strip() label = str(root.get("label", "")).strip() if not source or not subpath: continue if source not in {"vault", "drive"}: raise ValueError(f"Unsupported source root: {source}") normalized.append({"source": source, "subpath": subpath, "label": label}) return normalized def _project_to_dict(project: RegisteredProject) -> dict: return { "id": project.project_id, "aliases": list(project.aliases), "description": project.description, "ingest_roots": [ { **asdict(source_ref), "path": str(_resolve_ingest_root(source_ref)), "exists": _resolve_ingest_root(source_ref).exists(), "is_dir": _resolve_ingest_root(source_ref).is_dir(), } for source_ref in project.ingest_roots ], } def _resolve_ingest_root(source_ref: ProjectSourceRef) -> Path: base_map = { "vault": _config.settings.resolved_vault_source_dir, "drive": _config.settings.resolved_drive_source_dir, } try: base_dir = base_map[source_ref.source] except KeyError as exc: raise ValueError(f"Unsupported source root: {source_ref.source}") from exc return (base_dir / source_ref.subpath).resolve(strict=False) def _validate_unique_project_names(projects: list[RegisteredProject]) -> None: seen: dict[str, str] = {} for project in projects: names = [project.project_id, *project.aliases] for name in names: key = name.lower() if key in seen and seen[key] != project.project_id: raise ValueError( f"Project registry name collision: '{name}' is used by both " f"'{seen[key]}' and '{project.project_id}'" ) seen[key] = project.project_id def _find_name_collisions( project_id: str, aliases: list[str], exclude_project_id: str | None = None, ) -> list[dict]: collisions: list[dict] = [] existing = load_project_registry() requested_names = [project_id, *aliases] for requested in requested_names: requested_key = requested.lower() for project in existing: if exclude_project_id is not None and project.project_id == exclude_project_id: continue project_names = [project.project_id, *project.aliases] if requested_key in {name.lower() for name in project_names}: collisions.append( { "name": requested, "existing_project": project.project_id, } ) break return collisions def _load_registry_payload(registry_path: Path) -> dict: if not registry_path.exists(): return {"projects": []} return json.loads(registry_path.read_text(encoding="utf-8")) def _write_registry_payload(registry_path: Path, payload: dict) -> None: registry_path.parent.mkdir(parents=True, exist_ok=True) rendered = json.dumps(payload, indent=2, ensure_ascii=True) + "\n" with tempfile.NamedTemporaryFile( mode="w", encoding="utf-8", dir=registry_path.parent, prefix=f"{registry_path.stem}.", suffix=".tmp", delete=False, ) as tmp_file: tmp_file.write(rendered) temp_path = Path(tmp_file.name) temp_path.replace(registry_path)