fix(retrieval): preserve project ids across unscoped ingest
This commit is contained in:
154
tests/test_backfill_chunk_project_ids.py
Normal file
154
tests/test_backfill_chunk_project_ids.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""Tests for explicit chunk project_id metadata backfill."""
|
||||
|
||||
import json
|
||||
|
||||
import atocore.config as config
|
||||
from atocore.models.database import get_connection, init_db
|
||||
from scripts import backfill_chunk_project_ids as backfill
|
||||
|
||||
|
||||
def _write_registry(tmp_path, monkeypatch):
    """Create a one-project registry under ``tmp_path`` and activate it.

    Builds ``vault``, ``drive`` and ``config`` directories, writes a
    ``project-registry.json`` declaring project ``p04-gigabit`` (alias
    ``p04``) with a single vault ingest root, points the ``ATOCORE_*``
    environment variables at those locations and rebuilds ``config.settings``
    so the new environment is picked up.

    Args:
        tmp_path: Per-test temporary directory (a ``pathlib.Path``).
        monkeypatch: pytest ``monkeypatch`` fixture used for every global
            change so that all of it is undone at test teardown.

    Returns:
        The ``p04-gigabit`` ingest-root directory inside the vault.
    """
    vault_dir = tmp_path / "vault"
    drive_dir = tmp_path / "drive"
    config_dir = tmp_path / "config"
    project_dir = vault_dir / "incoming" / "projects" / "p04-gigabit"
    # parents=True also creates the vault directory itself.
    project_dir.mkdir(parents=True)
    drive_dir.mkdir()
    config_dir.mkdir()

    registry_path = config_dir / "project-registry.json"
    registry_path.write_text(
        json.dumps(
            {
                "projects": [
                    {
                        "id": "p04-gigabit",
                        "aliases": ["p04"],
                        "ingest_roots": [
                            {"source": "vault", "subpath": "incoming/projects/p04-gigabit"}
                        ],
                    }
                ]
            }
        ),
        encoding="utf-8",
    )

    monkeypatch.setenv("ATOCORE_VAULT_SOURCE_DIR", str(vault_dir))
    monkeypatch.setenv("ATOCORE_DRIVE_SOURCE_DIR", str(drive_dir))
    monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_path))
    # Swap the settings object through monkeypatch rather than by plain
    # assignment: a bare assignment would leave settings that point at this
    # test's temporary registry in place for every test that runs afterwards.
    monkeypatch.setattr(config, "settings", config.Settings())
    return project_dir
|
||||
|
||||
|
||||
def _insert_chunk(file_path, metadata=None, chunk_id="chunk-1", document_id="doc-1"):
    """Insert one source document and one chunk belonging to it.

    Args:
        file_path: Path stored (as a string) in ``source_documents.file_path``.
        metadata: JSON-serialisable value stored in ``source_chunks.metadata``.
            ``None`` is stored as an empty object; any other value, including
            a non-dict such as ``[]``, is stored as given.
        chunk_id: Primary identifier of the inserted chunk row.
        document_id: Identifier of the inserted document row. It defaults to
            the previously hard-coded ``"doc-1"``; pass a distinct value when
            a test needs more than one document/chunk pair.
    """
    content = "content"
    stored_metadata = metadata if metadata is not None else {}
    with get_connection() as conn:
        conn.execute(
            """
            INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            (document_id, str(file_path), "hash", "Title", "markdown", "[]"),
        )
        conn.execute(
            """
            INSERT INTO source_chunks
            (id, document_id, chunk_index, content, heading_path, char_count, metadata)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                chunk_id,
                document_id,
                0,
                content,
                "Overview",
                len(content),
                json.dumps(stored_metadata),
            ),
        )
|
||||
|
||||
|
||||
class FakeVectorStore:
|
||||
def __init__(self, metadatas):
|
||||
self.metadatas = dict(metadatas)
|
||||
self.updated = []
|
||||
|
||||
def get_metadatas(self, ids):
|
||||
returned_ids = [chunk_id for chunk_id in ids if chunk_id in self.metadatas]
|
||||
return {
|
||||
"ids": returned_ids,
|
||||
"metadatas": [self.metadatas[chunk_id] for chunk_id in returned_ids],
|
||||
}
|
||||
|
||||
def update_metadatas(self, ids, metadatas):
|
||||
self.updated.append((list(ids), list(metadatas)))
|
||||
for chunk_id, metadata in zip(ids, metadatas, strict=True):
|
||||
self.metadatas[chunk_id] = metadata
|
||||
|
||||
|
||||
def test_backfill_dry_run_is_non_mutating(tmp_data_dir, tmp_path, monkeypatch):
    """A dry run counts the pending update but leaves the SQL metadata empty."""
    init_db()
    ingest_root = _write_registry(tmp_path, monkeypatch)
    _insert_chunk(ingest_root / "status.md")

    summary = backfill.backfill(apply=False)
    assert summary["updates"] == 1

    # Re-read the row to confirm nothing was written.
    with get_connection() as conn:
        cursor = conn.execute(
            "SELECT metadata FROM source_chunks WHERE id = ?", ("chunk-1",)
        )
        stored = cursor.fetchone()
    persisted = json.loads(stored["metadata"])
    assert persisted == {}
|
||||
|
||||
|
||||
def test_backfill_apply_updates_chroma_then_sql(tmp_data_dir, tmp_path, monkeypatch):
    """Applying the backfill writes ``project_id`` to the vector store and SQL."""
    init_db()
    ingest_root = _write_registry(tmp_path, monkeypatch)
    _insert_chunk(ingest_root / "status.md", metadata={"source_file": "status.md"})

    vector_store = FakeVectorStore({"chunk-1": {"source_file": "status.md"}})

    def provide_store():
        return vector_store

    monkeypatch.setattr(backfill, "get_vector_store", provide_store)

    summary = backfill.backfill(apply=True, require_chroma_snapshot=True)
    assert summary["applied_updates"] == 1

    # Vector-store side.
    vector_metadata = vector_store.metadatas["chunk-1"]
    assert vector_metadata["project_id"] == "p04-gigabit"

    # SQL side.
    with get_connection() as conn:
        cursor = conn.execute(
            "SELECT metadata FROM source_chunks WHERE id = ?", ("chunk-1",)
        )
        stored = cursor.fetchone()
    sql_metadata = json.loads(stored["metadata"])
    assert sql_metadata["project_id"] == "p04-gigabit"
|
||||
|
||||
|
||||
def test_backfill_apply_requires_snapshot_confirmation(tmp_data_dir, tmp_path, monkeypatch):
    """``apply=True`` without snapshot confirmation raises a ``ValueError``.

    The error message must mention "Chroma backup".
    """
    init_db()
    ingest_root = _write_registry(tmp_path, monkeypatch)
    _insert_chunk(ingest_root / "status.md")

    error_text = None
    try:
        backfill.backfill(apply=True)
    except ValueError as exc:
        error_text = str(exc)

    # No ValueError at all means the safety gate was bypassed.
    if error_text is None:
        raise AssertionError("Expected snapshot confirmation requirement")
    assert "Chroma backup" in error_text
|
||||
|
||||
|
||||
def test_backfill_missing_vector_skips_sql_update(tmp_data_dir, tmp_path, monkeypatch):
    """A chunk absent from the vector store is reported and left unchanged in SQL."""
    init_db()
    ingest_root = _write_registry(tmp_path, monkeypatch)
    _insert_chunk(ingest_root / "status.md")

    # The store knows no chunks at all.
    empty_store = FakeVectorStore({})

    def provide_store():
        return empty_store

    monkeypatch.setattr(backfill, "get_vector_store", provide_store)

    summary = backfill.backfill(apply=True, require_chroma_snapshot=True)

    assert summary["updates"] == 1
    assert summary["applied_updates"] == 0
    assert summary["missing_vectors"] == 1

    with get_connection() as conn:
        cursor = conn.execute(
            "SELECT metadata FROM source_chunks WHERE id = ?", ("chunk-1",)
        )
        stored = cursor.fetchone()
    persisted = json.loads(stored["metadata"])
    assert persisted == {}
|
||||
|
||||
|
||||
def test_backfill_skips_malformed_metadata(tmp_data_dir, tmp_path, monkeypatch):
    """Chunk metadata stored as a JSON list is counted as malformed, not updated."""
    init_db()
    ingest_root = _write_registry(tmp_path, monkeypatch)
    # A list is valid JSON but not the object shape used by the other tests.
    _insert_chunk(ingest_root / "status.md", metadata=[])

    summary = backfill.backfill(apply=False)

    update_count = summary["updates"]
    assert update_count == 0
    malformed_count = summary["malformed_metadata"]
    assert malformed_count == 1
|
||||
@@ -103,6 +103,66 @@ def test_ingest_file_records_project_id_metadata(tmp_data_dir, sample_markdown,
|
||||
)
|
||||
|
||||
|
||||
def test_ingest_file_derives_project_id_from_registry_root(tmp_data_dir, tmp_path, monkeypatch):
    """Unscoped ingest should preserve ownership for files under registered roots.

    A note is written under the registered ``p04-gigabit`` vault root and
    ingested with no explicit project. Every chunk handed to the vector store
    must carry ``project_id == "p04-gigabit"``.
    """
    import atocore.config as config

    vault_dir = tmp_path / "vault"
    drive_dir = tmp_path / "drive"
    config_dir = tmp_path / "config"
    project_dir = vault_dir / "incoming" / "projects" / "p04-gigabit"
    project_dir.mkdir(parents=True)
    drive_dir.mkdir()
    config_dir.mkdir()

    note = project_dir / "status.md"
    note.write_text(
        "# Status\n\nCurrent project status with enough detail to create "
        "a retrievable chunk for the ingestion pipeline test.",
        encoding="utf-8",
    )

    registry_path = config_dir / "project-registry.json"
    registry_path.write_text(
        json.dumps(
            {
                "projects": [
                    {
                        "id": "p04-gigabit",
                        "aliases": ["p04"],
                        "ingest_roots": [
                            {"source": "vault", "subpath": "incoming/projects/p04-gigabit"}
                        ],
                    }
                ]
            }
        ),
        encoding="utf-8",
    )

    class FakeVectorStore:
        """Collects the metadata passed to ``add``; ``delete`` is a no-op."""

        def __init__(self):
            self.metadatas = []

        def add(self, ids, documents, metadatas):
            self.metadatas.extend(metadatas)

        def delete(self, ids):
            return None

    fake_store = FakeVectorStore()
    monkeypatch.setenv("ATOCORE_VAULT_SOURCE_DIR", str(vault_dir))
    monkeypatch.setenv("ATOCORE_DRIVE_SOURCE_DIR", str(drive_dir))
    monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_path))
    # Replace the settings via monkeypatch so the previous object is restored
    # at teardown; a plain assignment would leak this test's temporary
    # registry configuration into later tests.
    monkeypatch.setattr(config, "settings", config.Settings())
    monkeypatch.setattr("atocore.ingestion.pipeline.get_vector_store", lambda: fake_store)

    init_db()
    result = ingest_file(note)

    assert result["status"] == "ingested"
    # Guard against a vacuous all() over an empty list.
    assert fake_store.metadatas
    # .get() turns a missing key into an assertion failure, not a KeyError.
    assert all(meta.get("project_id") == "p04-gigabit" for meta in fake_store.metadatas)
|
||||
|
||||
|
||||
def test_ingest_project_folder_passes_project_id_to_files(tmp_data_dir, sample_folder, monkeypatch):
|
||||
seen = []
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ import json
|
||||
import atocore.config as config
|
||||
from atocore.projects.registry import (
|
||||
build_project_registration_proposal,
|
||||
derive_project_id_for_path,
|
||||
get_registered_project,
|
||||
get_project_registry_template,
|
||||
list_registered_projects,
|
||||
@@ -103,6 +104,98 @@ def test_project_registry_resolves_alias(tmp_path, monkeypatch):
|
||||
assert project.project_id == "p05-interferometer"
|
||||
|
||||
|
||||
def test_derive_project_id_for_path_uses_registered_roots(tmp_path, monkeypatch):
    """Paths under a registered ingest root map to that project's id.

    A path outside every registered root yields the empty string.
    """
    vault_dir = tmp_path / "vault"
    drive_dir = tmp_path / "drive"
    config_dir = tmp_path / "config"
    project_dir = vault_dir / "incoming" / "projects" / "p04-gigabit"
    project_dir.mkdir(parents=True)
    for directory in (drive_dir, config_dir):
        directory.mkdir()

    note = project_dir / "status.md"
    note.write_text("# Status\n\nCurrent work.", encoding="utf-8")

    ingest_root = {"source": "vault", "subpath": "incoming/projects/p04-gigabit"}
    project_entry = {
        "id": "p04-gigabit",
        "aliases": ["p04"],
        "ingest_roots": [ingest_root],
    }
    registry_path = config_dir / "project-registry.json"
    registry_path.write_text(
        json.dumps({"projects": [project_entry]}),
        encoding="utf-8",
    )

    environment = (
        ("ATOCORE_VAULT_SOURCE_DIR", vault_dir),
        ("ATOCORE_DRIVE_SOURCE_DIR", drive_dir),
        ("ATOCORE_PROJECT_REGISTRY_PATH", registry_path),
    )
    for variable, location in environment:
        monkeypatch.setenv(variable, str(location))

    # Swap in settings built from the patched environment; always swap back.
    saved_settings = config.settings
    try:
        config.settings = config.Settings()
        inside = derive_project_id_for_path(note)
        assert inside == "p04-gigabit"
        outside = derive_project_id_for_path(tmp_path / "elsewhere.md")
        assert outside == ""
    finally:
        config.settings = saved_settings
|
||||
|
||||
|
||||
def test_project_registry_rejects_cross_project_ingest_root_overlap(tmp_path, monkeypatch):
    """Listing projects raises when one project's root nests inside another's.

    The registry declares ``parent`` and ``child`` projects whose vault
    subpaths overlap. ``list_registered_projects`` must raise a ``ValueError``
    mentioning "ingest root overlap".
    """
    vault_dir = tmp_path / "vault"
    drive_dir = tmp_path / "drive"
    config_dir = tmp_path / "config"
    for directory in (vault_dir, drive_dir, config_dir):
        directory.mkdir()

    def project_entry(project_id, subpath):
        # One registry record with a single vault ingest root.
        return {
            "id": project_id,
            "aliases": [],
            "ingest_roots": [{"source": "vault", "subpath": subpath}],
        }

    registry = {
        "projects": [
            project_entry("parent", "incoming/projects/parent"),
            project_entry("child", "incoming/projects/parent/child"),
        ]
    }
    registry_path = config_dir / "project-registry.json"
    registry_path.write_text(json.dumps(registry), encoding="utf-8")

    environment = (
        ("ATOCORE_VAULT_SOURCE_DIR", vault_dir),
        ("ATOCORE_DRIVE_SOURCE_DIR", drive_dir),
        ("ATOCORE_PROJECT_REGISTRY_PATH", registry_path),
    )
    for variable, location in environment:
        monkeypatch.setenv(variable, str(location))

    saved_settings = config.settings
    try:
        config.settings = config.Settings()
        overlap_message = None
        try:
            list_registered_projects()
        except ValueError as exc:
            overlap_message = str(exc)
        # Reaching here without a captured message means nothing was raised.
        if overlap_message is None:
            raise AssertionError("Expected overlapping ingest roots to raise")
        assert "ingest root overlap" in overlap_message
    finally:
        config.settings = saved_settings
|
||||
|
||||
|
||||
def test_refresh_registered_project_ingests_registered_roots(tmp_path, monkeypatch):
|
||||
vault_dir = tmp_path / "vault"
|
||||
drive_dir = tmp_path / "drive"
|
||||
@@ -144,7 +237,7 @@ def test_refresh_registered_project_ingests_registered_roots(tmp_path, monkeypat
|
||||
original_settings = config.settings
|
||||
try:
|
||||
config.settings = config.Settings()
|
||||
monkeypatch.setattr("atocore.projects.registry.ingest_project_folder", fake_ingest_folder)
|
||||
monkeypatch.setattr("atocore.ingestion.pipeline.ingest_project_folder", fake_ingest_folder)
|
||||
result = refresh_registered_project("polisher")
|
||||
finally:
|
||||
config.settings = original_settings
|
||||
@@ -199,7 +292,7 @@ def test_refresh_registered_project_reports_nothing_to_ingest_when_all_missing(
|
||||
original_settings = config.settings
|
||||
try:
|
||||
config.settings = config.Settings()
|
||||
monkeypatch.setattr("atocore.projects.registry.ingest_project_folder", fail_ingest_folder)
|
||||
monkeypatch.setattr("atocore.ingestion.pipeline.ingest_project_folder", fail_ingest_folder)
|
||||
result = refresh_registered_project("ghost")
|
||||
finally:
|
||||
config.settings = original_settings
|
||||
@@ -249,7 +342,7 @@ def test_refresh_registered_project_reports_partial_status(tmp_path, monkeypatch
|
||||
original_settings = config.settings
|
||||
try:
|
||||
config.settings = config.Settings()
|
||||
monkeypatch.setattr("atocore.projects.registry.ingest_project_folder", fake_ingest_folder)
|
||||
monkeypatch.setattr("atocore.ingestion.pipeline.ingest_project_folder", fake_ingest_folder)
|
||||
result = refresh_registered_project("mixed")
|
||||
finally:
|
||||
config.settings = original_settings
|
||||
|
||||
@@ -458,6 +458,72 @@ def test_retrieve_project_scope_prefers_exact_project_id(monkeypatch):
|
||||
assert [r.chunk_id for r in results] == ["chunk-target", "chunk-global"]
|
||||
|
||||
|
||||
def test_retrieve_empty_project_id_falls_back_to_path_ownership(monkeypatch):
    """With hint ``p04``, only the chunk under ``p04-gigabit/`` is returned.

    Both candidate chunks carry an empty ``project_id``; they differ in
    ``source_file`` (``p04-gigabit/...`` vs ``p05-interferometer/...``). The
    retriever is expected to keep only the chunk belonging to the hinted
    project.
    """

    def make_project(project_id, aliases):
        # Minimal registry-entry stand-in exposing only the three attributes
        # set here.
        attributes = {
            "project_id": project_id,
            "aliases": aliases,
            "ingest_roots": (),
        }
        return type("Project", (), attributes)()

    def chunk_metadata(source_file, title, document_id):
        return {
            "heading_path": "Overview",
            "source_file": source_file,
            "tags": "[]",
            "title": title,
            "project_id": "",
            "document_id": document_id,
        }

    target_project = make_project("p04-gigabit", ("p04", "gigabit"))
    other_project = make_project("p05-interferometer", ("p05", "interferometer"))

    class FakeStore:
        def query(self, query_embedding, top_k=10, where=None):
            target_meta = chunk_metadata("p04-gigabit/status.md", "Target", "doc-a")
            other_meta = chunk_metadata("p05-interferometer/status.md", "Other", "doc-b")
            return {
                "ids": [["chunk-target", "chunk-other"]],
                "documents": [["target doc", "other doc"]],
                "metadatas": [[target_meta, other_meta]],
                "distances": [[0.2, 0.19]],
            }

    retriever = "atocore.retrieval.retriever"
    monkeypatch.setattr(retriever + ".get_vector_store", lambda: FakeStore())
    monkeypatch.setattr(retriever + ".embed_query", lambda query: [0.0, 0.1])
    monkeypatch.setattr(
        retriever + "._existing_chunk_ids",
        lambda chunk_ids: set(chunk_ids),
    )
    monkeypatch.setattr(
        retriever + ".get_registered_project",
        lambda project_name: target_project,
    )
    monkeypatch.setattr(
        retriever + ".load_project_registry",
        lambda: [target_project, other_project],
    )

    results = retrieve("mirror architecture", top_k=2, project_hint="p04")

    returned_ids = [hit.chunk_id for hit in results]
    assert returned_ids == ["chunk-target"]
|
||||
|
||||
|
||||
def test_retrieve_unknown_project_hint_does_not_widen_or_filter(monkeypatch):
|
||||
class FakeStore:
|
||||
def query(self, query_embedding, top_k=10, where=None):
|
||||
|
||||
Reference in New Issue
Block a user