fix(retrieval): preserve project ids across unscoped ingest

This commit is contained in:
2026-04-24 11:22:13 -04:00
parent c03022d864
commit ce6ffdbb63
12 changed files with 550 additions and 83 deletions

View File

@@ -37,6 +37,13 @@ def ingest_file(file_path: Path, project_id: str = "") -> dict:
start = time.time()
file_path = file_path.resolve()
project_id = (project_id or "").strip()
if not project_id:
try:
from atocore.projects.registry import derive_project_id_for_path
project_id = derive_project_id_for_path(file_path)
except Exception:
project_id = ""
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")

View File

@@ -8,7 +8,6 @@ from dataclasses import asdict, dataclass
from pathlib import Path
import atocore.config as _config
from atocore.ingestion.pipeline import ingest_project_folder
# Reserved pseudo-projects. `inbox` holds pre-project / lead / quote
@@ -260,6 +259,7 @@ def load_project_registry() -> list[RegisteredProject]:
)
_validate_unique_project_names(projects)
_validate_ingest_root_overlaps(projects)
return projects
@@ -307,6 +307,28 @@ def resolve_project_name(name: str | None) -> str:
return name
def derive_project_id_for_path(file_path: str | Path) -> str:
"""Return the registered project that owns a source path, if any."""
if not file_path:
return ""
doc_path = Path(file_path).resolve(strict=False)
matches: list[tuple[int, int, str]] = []
for project in load_project_registry():
for source_ref in project.ingest_roots:
root_path = _resolve_ingest_root(source_ref)
try:
doc_path.relative_to(root_path)
except ValueError:
continue
matches.append((len(root_path.parts), len(str(root_path)), project.project_id))
if not matches:
return ""
matches.sort(reverse=True)
return matches[0][2]
def refresh_registered_project(project_name: str, purge_deleted: bool = False) -> dict:
"""Ingest all configured source roots for a registered project.
@@ -322,6 +344,8 @@ def refresh_registered_project(project_name: str, purge_deleted: bool = False) -
if project is None:
raise ValueError(f"Unknown project: {project_name}")
from atocore.ingestion.pipeline import ingest_project_folder
roots = []
ingested_count = 0
skipped_count = 0
@@ -447,6 +471,33 @@ def _validate_unique_project_names(projects: list[RegisteredProject]) -> None:
seen[key] = project.project_id
def _validate_ingest_root_overlaps(projects: list[RegisteredProject]) -> None:
    """Reject registries where two different projects share an ingest tree.

    Every ingest root of every project is resolved through
    ``_resolve_ingest_root`` and each pair of roots is compared. Pairs
    that belong to the same ``project_id`` are ignored.

    Args:
        projects: Registered projects whose ``ingest_roots`` are checked.

    Raises:
        ValueError: If, for two roots owned by different projects,
            ``relative_to`` succeeds in either direction (one root is
            expressible relative to the other).
    """

    def _is_under(inner: Path, outer: Path) -> bool:
        # ``relative_to`` raises ValueError when ``inner`` is not below ``outer``.
        try:
            inner.relative_to(outer)
        except ValueError:
            return False
        return True

    owned_roots: list[tuple[str, Path]] = [
        (entry.project_id, _resolve_ingest_root(ref))
        for entry in projects
        for ref in entry.ingest_roots
    ]

    # ``start=1`` makes the counter the index of the first later element,
    # so each unordered pair is visited exactly once.
    for tail_start, (owner_a, root_a) in enumerate(owned_roots, start=1):
        for owner_b, root_b in owned_roots[tail_start:]:
            if owner_a == owner_b:
                continue
            if _is_under(root_a, root_b) or _is_under(root_b, root_a):
                raise ValueError(
                    "Project registry ingest root overlap: "
                    f"'{root_a}' ({owner_a}) and "
                    f"'{root_b}' ({owner_b})"
                )
def _find_name_collisions(
project_id: str,
aliases: list[str],

View File

@@ -209,8 +209,9 @@ def _is_allowed_for_project_scope(
def _metadata_matches_project(project: RegisteredProject, metadata: dict) -> bool:
if "project_id" in metadata:
return str(metadata.get("project_id", "")).strip().lower() == project.project_id.lower()
stored_project_id = str(metadata.get("project_id", "")).strip().lower()
if stored_project_id:
return stored_project_id == project.project_id.lower()
path = _metadata_source_path(metadata)
tags = _metadata_tags(metadata)