fix(retrieval): preserve project ids across unscoped ingest

This commit is contained in:
2026-04-24 11:22:13 -04:00
parent c03022d864
commit ce6ffdbb63
12 changed files with 550 additions and 83 deletions

View File

@@ -103,6 +103,66 @@ def test_ingest_file_records_project_id_metadata(tmp_data_dir, sample_markdown,
)
def test_ingest_file_derives_project_id_from_registry_root(tmp_data_dir, tmp_path, monkeypatch):
    """Unscoped ingest should preserve ownership for files under registered roots.

    A note is written beneath a vault subpath that the project registry lists
    as an ingest root for ``p04-gigabit``. ``ingest_file`` is then called with
    only the file path (no explicit project id), and every metadata record
    handed to the vector store must carry ``project_id == "p04-gigabit"``.
    """
    import atocore.config as config

    # Lay out the source roots and a config directory under pytest's tmp_path.
    vault_dir = tmp_path / "vault"
    drive_dir = tmp_path / "drive"
    config_dir = tmp_path / "config"
    project_dir = vault_dir / "incoming" / "projects" / "p04-gigabit"
    project_dir.mkdir(parents=True)
    drive_dir.mkdir()
    config_dir.mkdir()

    # The note lives inside the registered ingest root declared below.
    note = project_dir / "status.md"
    note.write_text(
        "# Status\n\nCurrent project status with enough detail to create "
        "a retrievable chunk for the ingestion pipeline test.",
        encoding="utf-8",
    )

    # Registry with a single project whose ingest root matches project_dir
    # relative to the vault source directory.
    registry_path = config_dir / "project-registry.json"
    registry_path.write_text(
        json.dumps(
            {
                "projects": [
                    {
                        "id": "p04-gigabit",
                        "aliases": ["p04"],
                        "ingest_roots": [
                            {"source": "vault", "subpath": "incoming/projects/p04-gigabit"}
                        ],
                    }
                ]
            }
        ),
        encoding="utf-8",
    )

    class FakeVectorStore:
        """In-memory stand-in that records the metadata passed to ``add``."""

        def __init__(self):
            self.metadatas = []

        def add(self, ids, documents, metadatas):
            # Only the metadata is inspected by the assertions below.
            self.metadatas.extend(metadatas)

        def delete(self, ids):
            # No-op; nothing is stored that would need removing.
            return None

    fake_store = FakeVectorStore()

    monkeypatch.setenv("ATOCORE_VAULT_SOURCE_DIR", str(vault_dir))
    monkeypatch.setenv("ATOCORE_DRIVE_SOURCE_DIR", str(drive_dir))
    monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_path))
    # Rebuild settings after the env vars are set (presumably Settings reads
    # them at construction - confirm against atocore.config). Installing the
    # new object via monkeypatch, rather than plain assignment, restores the
    # previous settings at teardown so this test's temp paths cannot leak
    # into later tests.
    monkeypatch.setattr(config, "settings", config.Settings())
    monkeypatch.setattr("atocore.ingestion.pipeline.get_vector_store", lambda: fake_store)

    init_db()
    result = ingest_file(note)

    assert result["status"] == "ingested"
    # Guard against a vacuous pass: all() over an empty list is True.
    assert fake_store.metadatas
    assert all(meta["project_id"] == "p04-gigabit" for meta in fake_store.metadatas)
def test_ingest_project_folder_passes_project_id_to_files(tmp_data_dir, sample_folder, monkeypatch):
seen = []