feat(retrieval): persist explicit chunk project ids
This commit is contained in:
@@ -32,10 +32,11 @@ def exclusive_ingestion():
|
||||
_INGESTION_LOCK.release()
|
||||
|
||||
|
||||
def ingest_file(file_path: Path) -> dict:
|
||||
def ingest_file(file_path: Path, project_id: str = "") -> dict:
|
||||
"""Ingest a single markdown file. Returns stats."""
|
||||
start = time.time()
|
||||
file_path = file_path.resolve()
|
||||
project_id = (project_id or "").strip()
|
||||
|
||||
if not file_path.exists():
|
||||
raise FileNotFoundError(f"File not found: {file_path}")
|
||||
@@ -65,6 +66,7 @@ def ingest_file(file_path: Path) -> dict:
|
||||
"source_file": str(file_path),
|
||||
"tags": parsed.tags,
|
||||
"title": parsed.title,
|
||||
"project_id": project_id,
|
||||
}
|
||||
chunks = chunk_markdown(parsed.body, base_metadata=base_meta)
|
||||
|
||||
@@ -116,6 +118,7 @@ def ingest_file(file_path: Path) -> dict:
|
||||
"source_file": str(file_path),
|
||||
"tags": json.dumps(parsed.tags),
|
||||
"title": parsed.title,
|
||||
"project_id": project_id,
|
||||
})
|
||||
|
||||
conn.execute(
|
||||
@@ -173,7 +176,17 @@ def ingest_folder(folder_path: Path, purge_deleted: bool = True) -> list[dict]:
|
||||
purge_deleted: If True, remove DB/vector entries for files
|
||||
that no longer exist on disk.
|
||||
"""
|
||||
return ingest_project_folder(folder_path, purge_deleted=purge_deleted, project_id="")
|
||||
|
||||
|
||||
def ingest_project_folder(
|
||||
folder_path: Path,
|
||||
purge_deleted: bool = True,
|
||||
project_id: str = "",
|
||||
) -> list[dict]:
|
||||
"""Ingest a folder and annotate chunks with an optional project id."""
|
||||
folder_path = folder_path.resolve()
|
||||
project_id = (project_id or "").strip()
|
||||
if not folder_path.is_dir():
|
||||
raise NotADirectoryError(f"Not a directory: {folder_path}")
|
||||
|
||||
@@ -187,7 +200,7 @@ def ingest_folder(folder_path: Path, purge_deleted: bool = True) -> list[dict]:
|
||||
# Ingest new/changed files
|
||||
for md_file in md_files:
|
||||
try:
|
||||
result = ingest_file(md_file)
|
||||
result = ingest_file(md_file, project_id=project_id)
|
||||
results.append(result)
|
||||
except Exception as e:
|
||||
log.error("ingestion_error", file_path=str(md_file), error=str(e))
|
||||
|
||||
@@ -8,7 +8,7 @@ from dataclasses import asdict, dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import atocore.config as _config
|
||||
from atocore.ingestion.pipeline import ingest_folder
|
||||
from atocore.ingestion.pipeline import ingest_project_folder
|
||||
|
||||
|
||||
# Reserved pseudo-projects. `inbox` holds pre-project / lead / quote
|
||||
@@ -346,7 +346,11 @@ def refresh_registered_project(project_name: str, purge_deleted: bool = False) -
|
||||
{
|
||||
**root_result,
|
||||
"status": "ingested",
|
||||
"results": ingest_folder(resolved, purge_deleted=purge_deleted),
|
||||
"results": ingest_project_folder(
|
||||
resolved,
|
||||
purge_deleted=purge_deleted,
|
||||
project_id=project.project_id,
|
||||
),
|
||||
}
|
||||
)
|
||||
ingested_count += 1
|
||||
|
||||
@@ -209,6 +209,9 @@ def _is_allowed_for_project_scope(
|
||||
|
||||
|
||||
def _metadata_matches_project(project: RegisteredProject, metadata: dict) -> bool:
|
||||
if "project_id" in metadata:
|
||||
return str(metadata.get("project_id", "")).strip().lower() == project.project_id.lower()
|
||||
|
||||
path = _metadata_source_path(metadata)
|
||||
tags = _metadata_tags(metadata)
|
||||
for term in _project_scope_terms(project):
|
||||
|
||||
@@ -64,6 +64,18 @@ class VectorStore:
|
||||
self._collection.delete(ids=ids)
|
||||
log.debug("vectors_deleted", count=len(ids))
|
||||
|
||||
def get_metadatas(self, ids: list[str]) -> dict:
    """Look up the stored vector metadata for the given chunk IDs.

    Args:
        ids: Chunk IDs to look up.

    Returns:
        For an empty ``ids`` list, ``{"ids": [], "metadatas": []}`` without
        querying the collection; otherwise the result of the collection's
        ``get`` call, asked to include only metadatas.
    """
    if ids:
        return self._collection.get(ids=ids, include=["metadatas"])
    # Nothing was requested: answer with an empty result and skip the query.
    return {"ids": [], "metadatas": []}
|
||||
|
||||
def update_metadatas(self, ids: list[str], metadatas: list[dict]) -> None:
    """Overwrite the metadata of existing vectors without re-embedding documents.

    Args:
        ids: Chunk IDs whose metadata should be updated. An empty list
            makes this call a no-op.
        metadatas: Replacement metadata, forwarded to the collection
            together with ``ids`` (presumably one dict per id, in the
            same order -- confirm against the collection API).
    """
    if not ids:
        return
    # Only ids and metadatas are passed to the collection's update call.
    self._collection.update(ids=ids, metadatas=metadatas)
    log.debug("vector_metadatas_updated", count=len(ids))
|
||||
|
||||
@property
def count(self) -> int:
    """Return the value reported by the underlying collection's ``count()``."""
    total = self._collection.count()
    return total
|
||||
|
||||
Reference in New Issue
Block a user