fix(retrieval): preserve project ids across unscoped ingest
This commit is contained in:
@@ -37,6 +37,13 @@ def ingest_file(file_path: Path, project_id: str = "") -> dict:
|
||||
start = time.time()
|
||||
file_path = file_path.resolve()
|
||||
project_id = (project_id or "").strip()
|
||||
if not project_id:
|
||||
try:
|
||||
from atocore.projects.registry import derive_project_id_for_path
|
||||
|
||||
project_id = derive_project_id_for_path(file_path)
|
||||
except Exception:
|
||||
project_id = ""
|
||||
|
||||
if not file_path.exists():
|
||||
raise FileNotFoundError(f"File not found: {file_path}")
|
||||
|
||||
@@ -8,7 +8,6 @@ from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import atocore.config as _config
|
||||
from atocore.ingestion.pipeline import ingest_project_folder
|
||||
|
||||
|
||||
# Reserved pseudo-projects. `inbox` holds pre-project / lead / quote
|
||||
@@ -260,6 +259,7 @@ def load_project_registry() -> list[RegisteredProject]:
|
||||
)
|
||||
|
||||
_validate_unique_project_names(projects)
|
||||
_validate_ingest_root_overlaps(projects)
|
||||
return projects
|
||||
|
||||
|
||||
@@ -307,6 +307,28 @@ def resolve_project_name(name: str | None) -> str:
|
||||
return name
|
||||
|
||||
|
||||
def derive_project_id_for_path(file_path: str | Path) -> str:
    """Find which registered project, if any, owns ``file_path``.

    The path is resolved (non-strict, so it does not have to exist) and
    compared against every ingest root of every project returned by
    ``load_project_registry()``. A root "owns" the path when the path can be
    expressed relative to it.

    When several roots contain the path, the most specific one wins: the
    root with the most path components, then the longest string form, and
    finally the greatest ``project_id`` as a deterministic tie-breaker.

    Returns:
        The owning project's id, or ``""`` when ``file_path`` is falsy or no
        registered ingest root contains it.
    """
    if not file_path:
        return ""

    target = Path(file_path).resolve(strict=False)

    # One (component count, string length, project id) tuple per containing
    # root; natural tuple ordering then ranks candidates by specificity.
    candidates: list[tuple[int, int, str]] = []
    for project in load_project_registry():
        for source_ref in project.ingest_roots:
            root = _resolve_ingest_root(source_ref)
            try:
                target.relative_to(root)
            except ValueError:
                # The path is not located under this root.
                continue
            candidates.append((len(root.parts), len(str(root)), project.project_id))

    if not candidates:
        return ""

    _, _, owner_id = max(candidates)
    return owner_id
|
||||
|
||||
|
||||
def refresh_registered_project(project_name: str, purge_deleted: bool = False) -> dict:
|
||||
"""Ingest all configured source roots for a registered project.
|
||||
|
||||
@@ -322,6 +344,8 @@ def refresh_registered_project(project_name: str, purge_deleted: bool = False) -
|
||||
if project is None:
|
||||
raise ValueError(f"Unknown project: {project_name}")
|
||||
|
||||
from atocore.ingestion.pipeline import ingest_project_folder
|
||||
|
||||
roots = []
|
||||
ingested_count = 0
|
||||
skipped_count = 0
|
||||
@@ -447,6 +471,33 @@ def _validate_unique_project_names(projects: list[RegisteredProject]) -> None:
|
||||
seen[key] = project.project_id
|
||||
|
||||
|
||||
def _validate_ingest_root_overlaps(projects: list[RegisteredProject]) -> None:
    """Reject registries where two different projects share ingest territory.

    Every ingest root of every project is resolved via
    ``_resolve_ingest_root`` and each pair of roots is compared. Two roots
    overlap when either one can be expressed relative to the other (this
    includes identical roots). Pairs belonging to the same project are
    ignored, so a single project may list nested roots.

    Raises:
        ValueError: on the first overlapping pair found, naming both roots
            and the projects they belong to.
    """

    def _is_inside(inner: Path, outer: Path) -> bool:
        # True when ``inner`` equals ``outer`` or lies beneath it.
        try:
            inner.relative_to(outer)
        except ValueError:
            return False
        return True

    resolved: list[tuple[str, Path]] = [
        (project.project_id, _resolve_ingest_root(source_ref))
        for project in projects
        for source_ref in project.ingest_roots
    ]

    for index, (left_project, left_root) in enumerate(resolved):
        # Only look at later entries so each unordered pair is checked once.
        for right_project, right_root in resolved[index + 1:]:
            if left_project == right_project:
                continue
            if _is_inside(left_root, right_root) or _is_inside(right_root, left_root):
                raise ValueError(
                    "Project registry ingest root overlap: "
                    f"'{left_root}' ({left_project}) and "
                    f"'{right_root}' ({right_project})"
                )
|
||||
|
||||
|
||||
def _find_name_collisions(
|
||||
project_id: str,
|
||||
aliases: list[str],
|
||||
|
||||
@@ -209,8 +209,9 @@ def _is_allowed_for_project_scope(
|
||||
|
||||
|
||||
def _metadata_matches_project(project: RegisteredProject, metadata: dict) -> bool:
|
||||
if "project_id" in metadata:
|
||||
return str(metadata.get("project_id", "")).strip().lower() == project.project_id.lower()
|
||||
stored_project_id = str(metadata.get("project_id", "")).strip().lower()
|
||||
if stored_project_id:
|
||||
return stored_project_id == project.project_id.lower()
|
||||
|
||||
path = _metadata_source_path(metadata)
|
||||
tags = _metadata_tags(metadata)
|
||||
|
||||
Reference in New Issue
Block a user