feat(retrieval): persist explicit chunk project ids
This commit is contained in:
@@ -1,8 +1,10 @@
|
||||
"""Tests for the ingestion pipeline."""
|
||||
|
||||
import json
|
||||
|
||||
from atocore.ingestion.parser import parse_markdown
|
||||
from atocore.models.database import get_connection, init_db
|
||||
from atocore.ingestion.pipeline import ingest_file, ingest_folder
|
||||
from atocore.ingestion.pipeline import ingest_file, ingest_folder, ingest_project_folder
|
||||
|
||||
|
||||
def test_parse_markdown(sample_markdown):
|
||||
@@ -69,6 +71,54 @@ def test_ingest_updates_changed(tmp_data_dir, sample_markdown):
|
||||
assert result["status"] == "ingested"
|
||||
|
||||
|
||||
def test_ingest_file_records_project_id_metadata(tmp_data_dir, sample_markdown, monkeypatch):
    """Ingesting with a project id stamps that exact id on every chunk.

    Checks two places: the metadata dicts passed to the vector store's
    ``add`` call, and the JSON ``metadata`` column of ``source_chunks``.
    """
    expected_project = "p04-gigabit"
    init_db()

    class RecordingStore:
        """Vector-store stand-in that only remembers the metadata it receives."""

        def __init__(self):
            self.metadatas = []

        def add(self, ids, documents, metadatas):
            # Only the metadata matters to this test; ids/documents are dropped.
            self.metadatas.extend(metadatas)

        def delete(self, ids):
            return None

    store = RecordingStore()
    # The pipeline looks the store up through get_vector_store, so swap the factory.
    monkeypatch.setattr("atocore.ingestion.pipeline.get_vector_store", lambda: store)

    outcome = ingest_file(sample_markdown, project_id=expected_project)

    assert outcome["status"] == "ingested"
    # Guard against a vacuous pass: at least one chunk must have been added.
    assert store.metadatas
    for vector_meta in store.metadatas:
        assert vector_meta["project_id"] == expected_project

    with get_connection() as conn:
        query = "SELECT metadata FROM source_chunks"
        rows = conn.execute(query).fetchall()
        assert rows
        for row in rows:
            persisted_meta = json.loads(row["metadata"])
            assert persisted_meta["project_id"] == expected_project
|
||||
|
||||
|
||||
def test_ingest_project_folder_passes_project_id_to_files(tmp_data_dir, sample_folder, monkeypatch):
    """Folder-level project ingestion forwards its project id to each file call.

    ``ingest_file`` is replaced by a recorder, so only the arguments the
    folder-level function passes down are observed.
    """
    recorded_calls = []

    def recording_ingest_file(path, project_id=""):
        recorded_calls.append((path.name, project_id))
        return {"file": str(path), "status": "ingested"}

    def skip_purge(*args, **kwargs):
        # Stand-in for _purge_deleted_files: does nothing and returns 0.
        return 0

    monkeypatch.setattr("atocore.ingestion.pipeline.ingest_file", recording_ingest_file)
    monkeypatch.setattr("atocore.ingestion.pipeline._purge_deleted_files", skip_purge)

    ingest_project_folder(sample_folder, project_id="p05-interferometer")

    # At least one file must have been visited, otherwise the check below is vacuous.
    assert recorded_calls
    forwarded_ids = set()
    for _file_name, forwarded_id in recorded_calls:
        forwarded_ids.add(forwarded_id)
    assert forwarded_ids == {"p05-interferometer"}
|
||||
|
||||
|
||||
def test_parse_markdown_uses_supplied_text(sample_markdown):
|
||||
"""Parsing should be able to reuse pre-read content from ingestion."""
|
||||
latin_text = """---\ntags: parser\n---\n# Parser Title\n\nBody text."""
|
||||
|
||||
Reference in New Issue
Block a user