Stabilize core correctness and sync project plan state

This commit is contained in:
2026-04-05 17:53:23 -04:00
parent b48f0c95ab
commit b0889b3925
20 changed files with 551 additions and 168 deletions

View File

@@ -1,8 +1,8 @@
"""Embedding model management."""
import atocore.config as _config
from sentence_transformers import SentenceTransformer
from atocore.config import settings
from atocore.observability.logger import get_logger
log = get_logger("embeddings")
@@ -14,9 +14,9 @@ def get_model() -> SentenceTransformer:
"""Load and cache the embedding model."""
global _model
if _model is None:
log.info("loading_embedding_model", model=settings.embedding_model)
_model = SentenceTransformer(settings.embedding_model)
log.info("embedding_model_loaded", model=settings.embedding_model)
log.info("loading_embedding_model", model=_config.settings.embedding_model)
_model = SentenceTransformer(_config.settings.embedding_model)
log.info("embedding_model_loaded", model=_config.settings.embedding_model)
return _model

View File

@@ -3,7 +3,8 @@
import time
from dataclasses import dataclass
from atocore.config import settings
import atocore.config as _config
from atocore.models.database import get_connection
from atocore.observability.logger import get_logger
from atocore.retrieval.embeddings import embed_query
from atocore.retrieval.vector_store import get_vector_store
@@ -29,7 +30,7 @@ def retrieve(
filter_tags: list[str] | None = None,
) -> list[ChunkResult]:
"""Retrieve the most relevant chunks for a query."""
top_k = top_k or settings.context_top_k
top_k = top_k or _config.settings.context_top_k
start = time.time()
query_embedding = embed_query(query)
@@ -59,7 +60,10 @@ def retrieve(
chunks = []
if results and results["ids"] and results["ids"][0]:
existing_ids = _existing_chunk_ids(results["ids"][0])
for i, chunk_id in enumerate(results["ids"][0]):
if chunk_id not in existing_ids:
continue
# ChromaDB returns distances (lower = more similar for cosine)
# Convert to similarity score (1 - distance)
distance = results["distances"][0][i] if results["distances"] else 0
@@ -90,3 +94,17 @@ def retrieve(
)
return chunks
def _existing_chunk_ids(chunk_ids: list[str]) -> set[str]:
"""Filter out stale vector entries whose chunk rows no longer exist."""
if not chunk_ids:
return set()
placeholders = ", ".join("?" for _ in chunk_ids)
with get_connection() as conn:
rows = conn.execute(
f"SELECT id FROM source_chunks WHERE id IN ({placeholders})",
chunk_ids,
).fetchall()
return {row["id"] for row in rows}

View File

@@ -2,7 +2,7 @@
import chromadb
from atocore.config import settings
import atocore.config as _config
from atocore.observability.logger import get_logger
from atocore.retrieval.embeddings import embed_texts
@@ -17,13 +17,13 @@ class VectorStore:
"""Wrapper around ChromaDB for chunk storage and retrieval."""
def __init__(self) -> None:
settings.chroma_path.mkdir(parents=True, exist_ok=True)
self._client = chromadb.PersistentClient(path=str(settings.chroma_path))
_config.settings.chroma_path.mkdir(parents=True, exist_ok=True)
self._client = chromadb.PersistentClient(path=str(_config.settings.chroma_path))
self._collection = self._client.get_or_create_collection(
name=COLLECTION_NAME,
metadata={"hnsw:space": "cosine"},
)
log.info("vector_store_initialized", path=str(settings.chroma_path))
log.info("vector_store_initialized", path=str(_config.settings.chroma_path))
def add(
self,