feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation:
- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:21:27 -04:00
|
|
|
"""Retrieval: query → ranked chunks."""
|
|
|
|
|
|
|
|
|
|
import time
|
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
|
2026-04-05 17:53:23 -04:00
|
|
|
import atocore.config as _config
|
|
|
|
|
from atocore.models.database import get_connection
|
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation:
- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:21:27 -04:00
|
|
|
from atocore.observability.logger import get_logger
|
|
|
|
|
from atocore.retrieval.embeddings import embed_query
|
|
|
|
|
from atocore.retrieval.vector_store import get_vector_store
|
|
|
|
|
|
|
|
|
|
log = get_logger("retriever")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
|
|
class ChunkResult:
|
|
|
|
|
chunk_id: str
|
|
|
|
|
content: str
|
|
|
|
|
score: float
|
|
|
|
|
heading_path: str
|
|
|
|
|
source_file: str
|
|
|
|
|
tags: str
|
|
|
|
|
title: str
|
|
|
|
|
document_id: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def retrieve(
|
|
|
|
|
query: str,
|
|
|
|
|
top_k: int | None = None,
|
|
|
|
|
filter_tags: list[str] | None = None,
|
|
|
|
|
) -> list[ChunkResult]:
|
|
|
|
|
"""Retrieve the most relevant chunks for a query."""
|
2026-04-05 17:53:23 -04:00
|
|
|
top_k = top_k or _config.settings.context_top_k
|
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation:
- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:21:27 -04:00
|
|
|
start = time.time()
|
|
|
|
|
|
|
|
|
|
query_embedding = embed_query(query)
|
|
|
|
|
store = get_vector_store()
|
|
|
|
|
|
|
|
|
|
# Build filter
|
2026-04-05 09:35:37 -04:00
|
|
|
# Tags are stored as JSON strings like '["tag1", "tag2"]'.
|
|
|
|
|
# We use $contains with quoted tag to avoid substring false positives
|
|
|
|
|
# (e.g. searching "prod" won't match "production" because we search '"prod"').
|
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation:
- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:21:27 -04:00
|
|
|
where = None
|
|
|
|
|
if filter_tags:
|
2026-04-05 09:35:37 -04:00
|
|
|
if len(filter_tags) == 1:
|
|
|
|
|
where = {"tags": {"$contains": f'"{filter_tags[0]}"'}}
|
|
|
|
|
else:
|
|
|
|
|
where = {
|
|
|
|
|
"$and": [
|
|
|
|
|
{"tags": {"$contains": f'"{tag}"'}}
|
|
|
|
|
for tag in filter_tags
|
|
|
|
|
]
|
|
|
|
|
}
|
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation:
- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:21:27 -04:00
|
|
|
|
|
|
|
|
results = store.query(
|
|
|
|
|
query_embedding=query_embedding,
|
|
|
|
|
top_k=top_k,
|
|
|
|
|
where=where,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
chunks = []
|
|
|
|
|
if results and results["ids"] and results["ids"][0]:
|
2026-04-05 17:53:23 -04:00
|
|
|
existing_ids = _existing_chunk_ids(results["ids"][0])
|
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation:
- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:21:27 -04:00
|
|
|
for i, chunk_id in enumerate(results["ids"][0]):
|
2026-04-05 17:53:23 -04:00
|
|
|
if chunk_id not in existing_ids:
|
|
|
|
|
continue
|
feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation:
- FastAPI server with 5 endpoints (ingest, query, context/build, health, debug)
- SQLite database with 5 tables (documents, chunks, memories, projects, interactions)
- Heading-aware markdown chunker (800 char max, recursive splitting)
- Multilingual embeddings via sentence-transformers (EN/FR)
- ChromaDB vector store with cosine similarity retrieval
- Context builder with project boosting, dedup, and budget enforcement
- CLI scripts for batch ingestion and test prompt evaluation
- 19 unit tests passing, 79% coverage
- Validated on 482 real project files (8383 chunks, 0 errors)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-05 09:21:27 -04:00
|
|
|
# ChromaDB returns distances (lower = more similar for cosine)
|
|
|
|
|
# Convert to similarity score (1 - distance)
|
|
|
|
|
distance = results["distances"][0][i] if results["distances"] else 0
|
|
|
|
|
score = 1.0 - distance
|
|
|
|
|
meta = results["metadatas"][0][i] if results["metadatas"] else {}
|
|
|
|
|
content = results["documents"][0][i] if results["documents"] else ""
|
|
|
|
|
|
|
|
|
|
chunks.append(
|
|
|
|
|
ChunkResult(
|
|
|
|
|
chunk_id=chunk_id,
|
|
|
|
|
content=content,
|
|
|
|
|
score=round(score, 4),
|
|
|
|
|
heading_path=meta.get("heading_path", ""),
|
|
|
|
|
source_file=meta.get("source_file", ""),
|
|
|
|
|
tags=meta.get("tags", "[]"),
|
|
|
|
|
title=meta.get("title", ""),
|
|
|
|
|
document_id=meta.get("document_id", ""),
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
duration_ms = int((time.time() - start) * 1000)
|
|
|
|
|
log.info(
|
|
|
|
|
"retrieval_done",
|
|
|
|
|
query=query[:100],
|
|
|
|
|
top_k=top_k,
|
|
|
|
|
results_count=len(chunks),
|
|
|
|
|
duration_ms=duration_ms,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
return chunks
|
2026-04-05 17:53:23 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def _existing_chunk_ids(chunk_ids: list[str]) -> set[str]:
|
|
|
|
|
"""Filter out stale vector entries whose chunk rows no longer exist."""
|
|
|
|
|
if not chunk_ids:
|
|
|
|
|
return set()
|
|
|
|
|
|
|
|
|
|
placeholders = ", ".join("?" for _ in chunk_ids)
|
|
|
|
|
with get_connection() as conn:
|
|
|
|
|
rows = conn.execute(
|
|
|
|
|
f"SELECT id FROM source_chunks WHERE id IN ({placeholders})",
|
|
|
|
|
chunk_ids,
|
|
|
|
|
).fetchall()
|
|
|
|
|
return {row["id"] for row in rows}
|