"""Tests for explicit chunk project_id metadata backfill.""" import json import atocore.config as config from atocore.models.database import get_connection, init_db from scripts import backfill_chunk_project_ids as backfill def _write_registry(tmp_path, monkeypatch): vault_dir = tmp_path / "vault" drive_dir = tmp_path / "drive" config_dir = tmp_path / "config" project_dir = vault_dir / "incoming" / "projects" / "p04-gigabit" project_dir.mkdir(parents=True) drive_dir.mkdir() config_dir.mkdir() registry_path = config_dir / "project-registry.json" registry_path.write_text( json.dumps( { "projects": [ { "id": "p04-gigabit", "aliases": ["p04"], "ingest_roots": [ {"source": "vault", "subpath": "incoming/projects/p04-gigabit"} ], } ] } ), encoding="utf-8", ) monkeypatch.setenv("ATOCORE_VAULT_SOURCE_DIR", str(vault_dir)) monkeypatch.setenv("ATOCORE_DRIVE_SOURCE_DIR", str(drive_dir)) monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_path)) config.settings = config.Settings() return project_dir def _insert_chunk(file_path, metadata=None, chunk_id="chunk-1"): with get_connection() as conn: conn.execute( """ INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags) VALUES (?, ?, ?, ?, ?, ?) """, ("doc-1", str(file_path), "hash", "Title", "markdown", "[]"), ) conn.execute( """ INSERT INTO source_chunks (id, document_id, chunk_index, content, heading_path, char_count, metadata) VALUES (?, ?, ?, ?, ?, ?, ?) """, ( chunk_id, "doc-1", 0, "content", "Overview", 7, json.dumps(metadata if metadata is not None else {}), ), ) class FakeVectorStore: def __init__(self, metadatas): self.metadatas = dict(metadatas) self.updated = [] def get_metadatas(self, ids): returned_ids = [chunk_id for chunk_id in ids if chunk_id in self.metadatas] return { "ids": returned_ids, "metadatas": [self.metadatas[chunk_id] for chunk_id in returned_ids], } def update_metadatas(self, ids, metadatas): self.updated.append((list(ids), list(metadatas))) for chunk_id, metadata in zip(ids, metadatas, strict=True): self.metadatas[chunk_id] = metadata def test_backfill_dry_run_is_non_mutating(tmp_data_dir, tmp_path, monkeypatch): init_db() project_dir = _write_registry(tmp_path, monkeypatch) _insert_chunk(project_dir / "status.md") result = backfill.backfill(apply=False) assert result["updates"] == 1 with get_connection() as conn: row = conn.execute("SELECT metadata FROM source_chunks WHERE id = ?", ("chunk-1",)).fetchone() assert json.loads(row["metadata"]) == {} def test_backfill_apply_updates_chroma_then_sql(tmp_data_dir, tmp_path, monkeypatch): init_db() project_dir = _write_registry(tmp_path, monkeypatch) _insert_chunk(project_dir / "status.md", metadata={"source_file": "status.md"}) fake_store = FakeVectorStore({"chunk-1": {"source_file": "status.md"}}) monkeypatch.setattr(backfill, "get_vector_store", lambda: fake_store) result = backfill.backfill(apply=True, require_chroma_snapshot=True) assert result["applied_updates"] == 1 assert fake_store.metadatas["chunk-1"]["project_id"] == "p04-gigabit" with get_connection() as conn: row = conn.execute("SELECT metadata FROM source_chunks WHERE id = ?", ("chunk-1",)).fetchone() assert json.loads(row["metadata"])["project_id"] == "p04-gigabit" def test_backfill_apply_requires_snapshot_confirmation(tmp_data_dir, tmp_path, monkeypatch): init_db() project_dir = _write_registry(tmp_path, monkeypatch) _insert_chunk(project_dir / "status.md") try: backfill.backfill(apply=True) except ValueError as exc: assert "Chroma backup" in str(exc) else: raise AssertionError("Expected snapshot confirmation requirement") def test_backfill_missing_vector_skips_sql_update(tmp_data_dir, tmp_path, monkeypatch): init_db() project_dir = _write_registry(tmp_path, monkeypatch) _insert_chunk(project_dir / "status.md") fake_store = FakeVectorStore({}) monkeypatch.setattr(backfill, "get_vector_store", lambda: fake_store) result = backfill.backfill(apply=True, require_chroma_snapshot=True) assert result["updates"] == 1 assert result["applied_updates"] == 0 assert result["missing_vectors"] == 1 with get_connection() as conn: row = conn.execute("SELECT metadata FROM source_chunks WHERE id = ?", ("chunk-1",)).fetchone() assert json.loads(row["metadata"]) == {} def test_backfill_skips_malformed_metadata(tmp_data_dir, tmp_path, monkeypatch): init_db() project_dir = _write_registry(tmp_path, monkeypatch) _insert_chunk(project_dir / "status.md", metadata=[]) result = backfill.backfill(apply=False) assert result["updates"] == 0 assert result["malformed_metadata"] == 1