feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation:
- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:21:27 -04:00
|
|
|
"""Tests for the retrieval system."""
|
|
|
|
|
|
|
|
|
|
from atocore.ingestion.pipeline import ingest_file
|
2026-04-05 17:53:23 -04:00
|
|
|
from atocore.models.database import get_connection, init_db
|
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation:
- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:21:27 -04:00
|
|
|
from atocore.retrieval.retriever import retrieve
|
|
|
|
|
from atocore.retrieval.vector_store import get_vector_store
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_retrieve_returns_results(tmp_data_dir, sample_markdown):
    """Ingest the sample document and check that a query yields usable hits.

    Every returned result must carry a positive score and non-empty content.
    """
    init_db()
    ingest_file(sample_markdown)

    hits = retrieve("What are the memory types?", top_k=5)

    assert len(hits) > 0
    for hit in hits:
        assert hit.score > 0
    for hit in hits:
        assert hit.content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_retrieve_scores_ranked(tmp_data_dir, sample_markdown):
    """Check that retrieval results come back ordered by descending score."""
    init_db()
    ingest_file(sample_markdown)

    hits = retrieve("architecture layers", top_k=5)

    # Ordering is only compared when there are at least two hits.
    if len(hits) < 2:
        return
    observed = [hit.score for hit in hits]
    assert observed == sorted(observed, reverse=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_vector_store_count(tmp_data_dir, sample_markdown):
    """Check that the vector store reports a non-zero count after ingestion."""
    init_db()

    # Drop the module-level store instance so this test starts from a clean one.
    import atocore.retrieval.vector_store as vector_store_module

    vector_store_module._store = None

    ingest_file(sample_markdown)
    assert get_vector_store().count > 0
|
2026-04-05 17:53:23 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_retrieve_skips_stale_vector_entries(tmp_data_dir, sample_markdown, monkeypatch):
    """Vector hits without a matching ``source_chunks`` row must be dropped.

    The stubbed store returns one id read from the database and one made-up
    id (``"missing-chunk"``); only the former is expected in the results.
    """
    init_db()
    ingest_file(sample_markdown)

    with get_connection() as conn:
        rows = conn.execute("SELECT id FROM source_chunks").fetchall()
        chunk_ids = [row["id"] for row in rows]

    class StubStore:
        """Stand-in vector store that always returns one live and one stale hit."""

        def query(self, query_embedding, top_k=10, where=None):
            live_meta = {
                "heading_path": "Overview",
                "source_file": "valid.md",
                "tags": "[]",
                "title": "Valid",
                "document_id": "doc-1",
            }
            stale_meta = {
                "heading_path": "Ghost",
                "source_file": "ghost.md",
                "tags": "[]",
                "title": "Ghost",
                "document_id": "doc-2",
            }
            return {
                "ids": [[chunk_ids[0], "missing-chunk"]],
                "documents": [["valid doc", "stale doc"]],
                "metadatas": [[live_meta, stale_meta]],
                "distances": [[0.1, 0.2]],
            }

    monkeypatch.setattr("atocore.retrieval.retriever.get_vector_store", StubStore)
    monkeypatch.setattr(
        "atocore.retrieval.retriever.embed_query",
        lambda query: [0.0, 0.1],
    )

    hits = retrieve("overview", top_k=2)

    assert len(hits) == 1
    assert hits[0].chunk_id == chunk_ids[0]
|
2026-04-06 13:32:33 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_retrieve_project_hint_boosts_matching_chunks(monkeypatch):
    """With hint ``"p04"``, only the p04-owned chunk is expected back.

    The stubbed store returns one p04 chunk and one p05 chunk. It also
    asserts that it is queried with ``top_k == 8`` even though the caller
    requests ``top_k=2``.
    """

    def make_project(project_id, aliases):
        attrs = {"project_id": project_id, "aliases": aliases, "ingest_roots": ()}
        return type("Project", (), attrs)()

    def make_metadata(source_file, tags, title, document_id):
        return {
            "heading_path": "Overview",
            "source_file": source_file,
            "tags": tags,
            "title": title,
            "document_id": document_id,
        }

    target_project = make_project("p04-gigabit", ("p04", "gigabit"))
    other_project = make_project("p05-interferometer", ("p05", "interferometer"))

    class StubStore:
        """Stand-in vector store returning one chunk per project."""

        def query(self, query_embedding, top_k=10, where=None):
            assert top_k == 8
            owned = make_metadata(
                "p04-gigabit/pkm/_index.md", '["p04-gigabit"]', "P04", "doc-a"
            )
            foreign = make_metadata(
                "p05-interferometer/pkm/_index.md", '["p05-interferometer"]', "P05", "doc-b"
            )
            return {
                "ids": [["chunk-a", "chunk-b"]],
                "documents": [["project doc", "other doc"]],
                "metadatas": [[owned, foreign]],
                "distances": [[0.3, 0.25]],
            }

    replacements = {
        "atocore.retrieval.retriever.get_vector_store": StubStore,
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": lambda chunk_ids: set(chunk_ids),
        "atocore.retrieval.retriever.get_registered_project": lambda project_name: target_project,
        "atocore.retrieval.retriever.load_project_registry": lambda: [target_project, other_project],
    }
    for target, replacement in replacements.items():
        monkeypatch.setattr(target, replacement)

    hits = retrieve("mirror architecture", top_k=2, project_hint="p04")

    assert len(hits) == 1
    assert hits[0].chunk_id == "chunk-a"
|
2026-04-24 10:46:56 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_retrieve_project_scope_allows_unowned_global_chunks(monkeypatch):
    """A chunk under ``shared/`` with no project tags survives a project hint.

    The registry holds only the target project; the stubbed store returns
    one p04 chunk and one untagged shared chunk, and both are expected back
    in that order.
    """
    project_attrs = {
        "project_id": "p04-gigabit",
        "aliases": ("p04", "gigabit"),
        "ingest_roots": (),
    }
    target_project = type("Project", (), project_attrs)()

    def make_metadata(source_file, tags, title, document_id):
        return {
            "heading_path": "Overview",
            "source_file": source_file,
            "tags": tags,
            "title": title,
            "document_id": document_id,
        }

    class StubStore:
        """Stand-in vector store returning a project chunk and a shared chunk."""

        def query(self, query_embedding, top_k=10, where=None):
            owned = make_metadata(
                "p04-gigabit/pkm/_index.md", '["p04-gigabit"]', "P04", "doc-a"
            )
            shared = make_metadata(
                "shared/engineering-rules.md", "[]", "Shared engineering rules", "doc-global"
            )
            return {
                "ids": [["chunk-a", "chunk-global"]],
                "documents": [["project doc", "global doc"]],
                "metadatas": [[owned, shared]],
                "distances": [[0.2, 0.21]],
            }

    replacements = {
        "atocore.retrieval.retriever.get_vector_store": StubStore,
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": lambda chunk_ids: set(chunk_ids),
        "atocore.retrieval.retriever.get_registered_project": lambda project_name: target_project,
        "atocore.retrieval.retriever.load_project_registry": lambda: [target_project],
    }
    for target, replacement in replacements.items():
        monkeypatch.setattr(target, replacement)

    hits = retrieve("mirror architecture", top_k=2, project_hint="p04")

    returned_ids = [hit.chunk_id for hit in hits]
    assert returned_ids == ["chunk-a", "chunk-global"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_retrieve_project_scope_filter_can_be_disabled(monkeypatch):
    """With ``rank_project_scope_filter`` off, the other project's chunk stays.

    The stubbed store asserts it is queried with the caller's ``top_k`` (2),
    and both the p04 and the p05 chunk are expected in the results.
    """

    def make_project(project_id, aliases):
        attrs = {"project_id": project_id, "aliases": aliases, "ingest_roots": ()}
        return type("Project", (), attrs)()

    def make_metadata(source_file, tags, title, document_id):
        return {
            "heading_path": "Overview",
            "source_file": source_file,
            "tags": tags,
            "title": title,
            "document_id": document_id,
        }

    target_project = make_project("p04-gigabit", ("p04", "gigabit"))
    other_project = make_project("p05-interferometer", ("p05", "interferometer"))

    class StubStore:
        """Stand-in vector store returning one chunk per project."""

        def query(self, query_embedding, top_k=10, where=None):
            assert top_k == 2
            owned = make_metadata(
                "p04-gigabit/pkm/_index.md", '["p04-gigabit"]', "P04", "doc-a"
            )
            foreign = make_metadata(
                "p05-interferometer/pkm/_index.md", '["p05-interferometer"]', "P05", "doc-b"
            )
            return {
                "ids": [["chunk-a", "chunk-b"]],
                "documents": [["project doc", "other project doc"]],
                "metadatas": [[owned, foreign]],
                "distances": [[0.2, 0.2]],
            }

    replacements = {
        "atocore.config.settings.rank_project_scope_filter": False,
        "atocore.retrieval.retriever.get_vector_store": StubStore,
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": lambda chunk_ids: set(chunk_ids),
        "atocore.retrieval.retriever.get_registered_project": lambda project_name: target_project,
        "atocore.retrieval.retriever.load_project_registry": lambda: [target_project, other_project],
    }
    for target, replacement in replacements.items():
        monkeypatch.setattr(target, replacement)

    hits = retrieve("mirror architecture", top_k=2, project_hint="p04")

    returned_ids = {hit.chunk_id for hit in hits}
    assert returned_ids == {"chunk-a", "chunk-b"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_retrieve_project_scope_ignores_title_for_ownership(monkeypatch):
    """A title mentioning the target project must not make a chunk target-owned.

    The second stubbed hit lives under ``p06-polisher/`` and is tagged for
    p06, but its title contains "GigaBIT"; with hint ``"p04"`` only the
    genuine p04 chunk is expected back.
    """

    def make_project(project_id, aliases):
        attrs = {"project_id": project_id, "aliases": aliases, "ingest_roots": ()}
        return type("Project", (), attrs)()

    def make_metadata(source_file, tags, title, document_id):
        return {
            "heading_path": "Overview",
            "source_file": source_file,
            "tags": tags,
            "title": title,
            "document_id": document_id,
        }

    target_project = make_project("p04-gigabit", ("p04", "gigabit"))
    other_project = make_project("p06-polisher", ("p06", "polisher", "p11"))

    class StubStore:
        """Stand-in vector store returning a p04 chunk and a misleadingly titled p06 chunk."""

        def query(self, query_embedding, top_k=10, where=None):
            owned = make_metadata(
                "p04-gigabit/pkm/_index.md", '["p04-gigabit"]', "P04", "doc-a"
            )
            misleading = make_metadata(
                "p06-polisher/pkm/architecture.md",
                '["p06-polisher"]',
                "GigaBIT M1 mirror lessons",
                "doc-b",
            )
            return {
                "ids": [["chunk-target", "chunk-poisoned-title"]],
                "documents": [["p04 doc", "p06 doc"]],
                "metadatas": [[owned, misleading]],
                "distances": [[0.2, 0.19]],
            }

    replacements = {
        "atocore.retrieval.retriever.get_vector_store": StubStore,
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": lambda chunk_ids: set(chunk_ids),
        "atocore.retrieval.retriever.get_registered_project": lambda project_name: target_project,
        "atocore.retrieval.retriever.load_project_registry": lambda: [target_project, other_project],
    }
    for target, replacement in replacements.items():
        monkeypatch.setattr(target, replacement)

    hits = retrieve("mirror architecture", top_k=2, project_hint="p04")

    returned_ids = [hit.chunk_id for hit in hits]
    assert returned_ids == ["chunk-target"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_retrieve_project_scope_uses_path_segments_not_substrings(monkeypatch):
    """An alias appearing only as a substring of a file name must not claim the chunk.

    The registry includes a project with alias ``"abb"``; the shared chunk's
    path is ``shared/cabbage-abbreviations.md``. With hint ``"p05"`` both the
    p05 chunk and that shared chunk are expected back.
    """

    def make_project(project_id, aliases):
        attrs = {"project_id": project_id, "aliases": aliases, "ingest_roots": ()}
        return type("Project", (), attrs)()

    target_project = make_project("p05-interferometer", ("p05", "interferometer"))
    abb_project = make_project("abb-space", ("abb",))

    class StubStore:
        """Stand-in vector store returning a p05 chunk and a shared chunk."""

        def query(self, query_embedding, top_k=10, where=None):
            owned = {
                "heading_path": "Overview",
                "source_file": "p05-interferometer/pkm/_index.md",
                "tags": '["p05-interferometer"]',
                "title": "P05",
                "document_id": "doc-a",
            }
            shared = {
                "heading_path": "Abbreviation notes",
                "source_file": "shared/cabbage-abbreviations.md",
                "tags": "[]",
                "title": "ABB-style abbreviations",
                "document_id": "doc-global",
            }
            return {
                "ids": [["chunk-target", "chunk-global"]],
                "documents": [["p05 doc", "global doc"]],
                "metadatas": [[owned, shared]],
                "distances": [[0.2, 0.21]],
            }

    replacements = {
        "atocore.retrieval.retriever.get_vector_store": StubStore,
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": lambda chunk_ids: set(chunk_ids),
        "atocore.retrieval.retriever.get_registered_project": lambda project_name: target_project,
        "atocore.retrieval.retriever.load_project_registry": lambda: [target_project, abb_project],
    }
    for target, replacement in replacements.items():
        monkeypatch.setattr(target, replacement)

    hits = retrieve("abbreviations", top_k=2, project_hint="p05")

    returned_ids = [hit.chunk_id for hit in hits]
    assert returned_ids == ["chunk-target", "chunk-global"]
|
|
|
|
|
|
|
|
|
|
|
2026-04-24 11:02:30 -04:00
|
|
|
def test_retrieve_project_scope_prefers_exact_project_id(monkeypatch):
    """An explicit ``project_id`` in the metadata outranks path and tag hints.

    Three stubbed hits are returned: one with ``project_id`` p04 under an
    unrelated path, one under ``p04-gigabit/`` but with ``project_id`` p06,
    and one shared chunk with an empty ``project_id``. With hint ``"p04"``
    the first and the third are expected back.
    """

    def make_project(project_id, aliases):
        attrs = {"project_id": project_id, "aliases": aliases, "ingest_roots": ()}
        return type("Project", (), attrs)()

    def make_metadata(source_file, tags, title, project_id, document_id):
        return {
            "heading_path": "Overview",
            "source_file": source_file,
            "tags": tags,
            "title": title,
            "project_id": project_id,
            "document_id": document_id,
        }

    target_project = make_project("p04-gigabit", ("p04", "gigabit"))
    other_project = make_project("p06-polisher", ("p06", "polisher"))

    class StubStore:
        """Stand-in vector store returning target, other-project and shared hits."""

        def query(self, query_embedding, top_k=10, where=None):
            by_explicit_id = make_metadata(
                "legacy/unhelpful-path.md", "[]", "Target", "p04-gigabit", "doc-a"
            )
            misleading_path = make_metadata(
                "p04-gigabit/title-poisoned.md",
                '["p04-gigabit"]',
                "Looks target-owned but is explicit p06",
                "p06-polisher",
                "doc-b",
            )
            shared = make_metadata("shared/global.md", "[]", "Shared", "", "doc-global")
            return {
                "ids": [["chunk-target", "chunk-other", "chunk-global"]],
                "documents": [["target doc", "other doc", "global doc"]],
                "metadatas": [[by_explicit_id, misleading_path, shared]],
                "distances": [[0.2, 0.19, 0.21]],
            }

    replacements = {
        "atocore.retrieval.retriever.get_vector_store": StubStore,
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": lambda chunk_ids: set(chunk_ids),
        "atocore.retrieval.retriever.get_registered_project": lambda project_name: target_project,
        "atocore.retrieval.retriever.load_project_registry": lambda: [target_project, other_project],
    }
    for target, replacement in replacements.items():
        monkeypatch.setattr(target, replacement)

    hits = retrieve("mirror architecture", top_k=3, project_hint="p04")

    returned_ids = [hit.chunk_id for hit in hits]
    assert returned_ids == ["chunk-target", "chunk-global"]
|
|
|
|
|
|
|
|
|
|
|
2026-04-24 11:22:13 -04:00
|
|
|
def test_retrieve_empty_project_id_falls_back_to_path_ownership(monkeypatch):
    """An empty ``project_id`` leaves the source path to decide ownership.

    Both stubbed hits carry ``project_id == ""`` and no tags; one sits under
    ``p04-gigabit/`` and the other under ``p05-interferometer/``. With hint
    ``"p04"`` only the p04-path chunk is expected back.
    """

    def make_project(project_id, aliases):
        attrs = {"project_id": project_id, "aliases": aliases, "ingest_roots": ()}
        return type("Project", (), attrs)()

    def make_metadata(source_file, title, document_id):
        return {
            "heading_path": "Overview",
            "source_file": source_file,
            "tags": "[]",
            "title": title,
            "project_id": "",
            "document_id": document_id,
        }

    target_project = make_project("p04-gigabit", ("p04", "gigabit"))
    other_project = make_project("p05-interferometer", ("p05", "interferometer"))

    class StubStore:
        """Stand-in vector store returning two hits without explicit project ids."""

        def query(self, query_embedding, top_k=10, where=None):
            in_target_tree = make_metadata("p04-gigabit/status.md", "Target", "doc-a")
            in_other_tree = make_metadata("p05-interferometer/status.md", "Other", "doc-b")
            return {
                "ids": [["chunk-target", "chunk-other"]],
                "documents": [["target doc", "other doc"]],
                "metadatas": [[in_target_tree, in_other_tree]],
                "distances": [[0.2, 0.19]],
            }

    replacements = {
        "atocore.retrieval.retriever.get_vector_store": StubStore,
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": lambda chunk_ids: set(chunk_ids),
        "atocore.retrieval.retriever.get_registered_project": lambda project_name: target_project,
        "atocore.retrieval.retriever.load_project_registry": lambda: [target_project, other_project],
    }
    for target, replacement in replacements.items():
        monkeypatch.setattr(target, replacement)

    hits = retrieve("mirror architecture", top_k=2, project_hint="p04")

    returned_ids = [hit.chunk_id for hit in hits]
    assert returned_ids == ["chunk-target"]
|
|
|
|
|
|
|
|
|
|
|
2026-04-24 10:46:56 -04:00
|
|
|
def test_retrieve_unknown_project_hint_does_not_widen_or_filter(monkeypatch):
    """A hint that resolves to no registered project changes nothing.

    ``get_registered_project`` is stubbed to return ``None``. The stubbed
    store asserts it is queried with the caller's ``top_k`` (2), and both
    returned chunks are expected back in store order.
    """

    def make_metadata(source_file, title, document_id):
        return {
            "heading_path": "Overview",
            "source_file": source_file,
            "tags": "[]",
            "title": title,
            "document_id": document_id,
        }

    class StubStore:
        """Stand-in vector store returning two untagged hits."""

        def query(self, query_embedding, top_k=10, where=None):
            assert top_k == 2
            first = make_metadata("project-a/file.md", "A", "doc-a")
            second = make_metadata("project-b/file.md", "B", "doc-b")
            return {
                "ids": [["chunk-a", "chunk-b"]],
                "documents": [["doc a", "doc b"]],
                "metadatas": [[first, second]],
                "distances": [[0.2, 0.21]],
            }

    replacements = {
        "atocore.retrieval.retriever.get_vector_store": StubStore,
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": lambda chunk_ids: set(chunk_ids),
        "atocore.retrieval.retriever.get_registered_project": lambda project_name: None,
    }
    for target, replacement in replacements.items():
        monkeypatch.setattr(target, replacement)

    hits = retrieve("overview", top_k=2, project_hint="unknown-project")

    returned_ids = [hit.chunk_id for hit in hits]
    assert returned_ids == ["chunk-a", "chunk-b"]
|
fix: pass project_hint into retrieve and add path-signal ranking
Two changes that belong together:
1. builder.build_context() now passes project_hint into retrieve(),
so the project-aware boost actually fires for the retrieval pipeline
driven by /context/build. Before this, only direct /query callers
benefited from the registered-project boost.
2. retriever now applies two more ranking signals on every chunk:
- _query_match_boost: boosts chunks whose source/title/heading
echo high-signal query tokens (stop list filters out generic
words like "the", "project", "system")
- _path_signal_boost: down-weights archival noise (_archive,
_history, pre-cleanup, reviews) by 0.72 and up-weights current
high-signal docs (status, decision, requirements, charter,
system-map, error-budget, ...) by 1.18
Tests:
- test_context_builder_passes_project_hint_to_retrieval verifies
the wiring fix
- test_retrieve_downranks_archive_noise_and_prefers_high_signal_paths
verifies the new ranking helpers prefer current docs over archive
This addresses the cross-project competition and archive bleed
called out in current-state.md after the Wave 1 ingestion.
2026-04-06 18:37:07 -04:00
|
|
|
|
|
|
|
|
|
2026-04-24 11:32:46 -04:00
|
|
|
def test_retrieve_fails_open_when_project_scope_resolution_fails(monkeypatch):
    """A failing project lookup must not drop results, only log warnings.

    ``get_registered_project`` is stubbed to raise ``ValueError``. Both
    stubbed chunks are still expected back, the store must be queried with
    the caller's ``top_k`` (2), and the captured warnings must consist of
    the two expected event names, each carrying the error text.
    """
    captured = []

    def make_metadata(source_file, title, document_id):
        return {
            "heading_path": "Overview",
            "source_file": source_file,
            "tags": "[]",
            "title": title,
            "document_id": document_id,
        }

    class StubStore:
        """Stand-in vector store returning two untagged hits."""

        def query(self, query_embedding, top_k=10, where=None):
            assert top_k == 2
            first = make_metadata("p04-gigabit/file.md", "A", "doc-a")
            second = make_metadata("p05-interferometer/file.md", "B", "doc-b")
            return {
                "ids": [["chunk-a", "chunk-b"]],
                "documents": [["doc a", "doc b"]],
                "metadatas": [[first, second]],
                "distances": [[0.2, 0.21]],
            }

    def failing_lookup(project_name):
        raise ValueError("registry overlap")

    def record_warning(event, **kwargs):
        captured.append((event, kwargs))

    replacements = {
        "atocore.retrieval.retriever.get_vector_store": StubStore,
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": lambda chunk_ids: set(chunk_ids),
        "atocore.retrieval.retriever.get_registered_project": failing_lookup,
        "atocore.retrieval.retriever.log.warning": record_warning,
    }
    for target, replacement in replacements.items():
        monkeypatch.setattr(target, replacement)

    hits = retrieve("overview", top_k=2, project_hint="p04")

    returned_ids = [hit.chunk_id for hit in hits]
    assert returned_ids == ["chunk-a", "chunk-b"]

    logged_events = {event for event, _fields in captured}
    assert logged_events == {
        "project_scope_resolution_failed",
        "project_match_boost_resolution_failed",
    }
    for _event, fields in captured:
        assert "registry overlap" in fields["error"]
|
|
|
|
|
|
|
|
|
|
|
fix: pass project_hint into retrieve and add path-signal ranking
Two changes that belong together:
1. builder.build_context() now passes project_hint into retrieve(),
so the project-aware boost actually fires for the retrieval pipeline
driven by /context/build. Before this, only direct /query callers
benefited from the registered-project boost.
2. retriever now applies two more ranking signals on every chunk:
- _query_match_boost: boosts chunks whose source/title/heading
echo high-signal query tokens (stop list filters out generic
words like "the", "project", "system")
- _path_signal_boost: down-weights archival noise (_archive,
_history, pre-cleanup, reviews) by 0.72 and up-weights current
high-signal docs (status, decision, requirements, charter,
system-map, error-budget, ...) by 1.18
Tests:
- test_context_builder_passes_project_hint_to_retrieval verifies
the wiring fix
- test_retrieve_downranks_archive_noise_and_prefers_high_signal_paths
verifies the new ranking helpers prefer current docs over archive
This addresses the cross-project competition and archive bleed
called out in current-state.md after the Wave 1 ingestion.
2026-04-06 18:37:07 -04:00
|
|
|
def test_retrieve_downranks_archive_noise_and_prefers_high_signal_paths(monkeypatch):
    """The ``Requirements/`` chunk must outrank the ``_archive/`` chunk.

    The stubbed store gives the archived document the smaller distance
    (0.2 vs 0.24); the test expects the requirements chunk first anyway,
    with a strictly higher score.
    """

    def make_metadata(heading_path, source_file, title, document_id):
        return {
            "heading_path": heading_path,
            "source_file": source_file,
            "tags": '["p05-interferometer"]',
            "title": title,
            "document_id": document_id,
        }

    class StubStore:
        """Stand-in vector store returning an archived and a current document."""

        def query(self, query_embedding, top_k=10, where=None):
            archived = make_metadata(
                "History",
                "p05-interferometer/pkm/_archive/old/Error-Budget.md",
                "Old Error Budget",
                "doc-a",
            )
            current = make_metadata(
                "Overview",
                "p05-interferometer/pkm/Requirements/Error-Budget.md",
                "Error Budget",
                "doc-b",
            )
            return {
                "ids": [["chunk-archive", "chunk-requirements"]],
                "documents": [["archive doc", "requirements doc"]],
                "metadatas": [[archived, current]],
                "distances": [[0.2, 0.24]],
            }

    def registered_project(project_name):
        # A fresh Project object is built on every lookup.
        attrs = {
            "project_id": "p05-interferometer",
            "aliases": ("p05", "interferometer"),
            "ingest_roots": (),
        }
        return type("Project", (), attrs)()

    replacements = {
        "atocore.retrieval.retriever.get_vector_store": StubStore,
        "atocore.retrieval.retriever.embed_query": lambda query: [0.0, 0.1],
        "atocore.retrieval.retriever._existing_chunk_ids": lambda chunk_ids: set(chunk_ids),
        "atocore.retrieval.retriever.get_registered_project": registered_project,
    }
    for target, replacement in replacements.items():
        monkeypatch.setattr(target, replacement)

    hits = retrieve(
        "interferometer error budget vendor constraints",
        top_k=2,
        project_hint="p05-interferometer",
    )

    assert len(hits) == 2
    best, runner_up = hits[0], hits[1]
    assert best.chunk_id == "chunk-requirements"
    assert best.score > runner_up.score
|