Stabilize core correctness and sync project plan state

This commit is contained in:
2026-04-05 17:53:23 -04:00
parent b48f0c95ab
commit b0889b3925
20 changed files with 551 additions and 168 deletions

View File

@@ -1,7 +1,7 @@
"""Tests for the retrieval system."""
from atocore.ingestion.pipeline import ingest_file
from atocore.models.database import init_db
from atocore.models.database import get_connection, init_db
from atocore.retrieval.retriever import retrieve
from atocore.retrieval.vector_store import get_vector_store
@@ -39,3 +39,31 @@ def test_vector_store_count(tmp_data_dir, sample_markdown):
ingest_file(sample_markdown)
store = get_vector_store()
assert store.count > 0
def test_retrieve_skips_stale_vector_entries(tmp_data_dir, sample_markdown, monkeypatch):
"""Retriever should ignore vector hits whose chunk rows no longer exist."""
init_db()
ingest_file(sample_markdown)
with get_connection() as conn:
chunk_ids = [row["id"] for row in conn.execute("SELECT id FROM source_chunks").fetchall()]
class FakeStore:
def query(self, query_embedding, top_k=10, where=None):
return {
"ids": [[chunk_ids[0], "missing-chunk"]],
"documents": [["valid doc", "stale doc"]],
"metadatas": [[
{"heading_path": "Overview", "source_file": "valid.md", "tags": "[]", "title": "Valid", "document_id": "doc-1"},
{"heading_path": "Ghost", "source_file": "ghost.md", "tags": "[]", "title": "Ghost", "document_id": "doc-2"},
]],
"distances": [[0.1, 0.2]],
}
monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", lambda: FakeStore())
monkeypatch.setattr("atocore.retrieval.retriever.embed_query", lambda query: [0.0, 0.1])
results = retrieve("overview", top_k=2)
assert len(results) == 1
assert results[0].chunk_id == chunk_ids[0]