From 6081462058629fbeb46cae1c0fe16f6e61e856f8 Mon Sep 17 00:00:00 2001
From: Anto01 <antoine.letarte@gmail.com>
Date: Sun, 5 Apr 2026 09:35:37 -0400
Subject: [PATCH] fix: critical bugs and hardening from validation audit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix infinite loop in chunker _hard_split when overlap >= max_size
- Fix tag filter false positives by quoting tag values in ChromaDB query
- Fix score boost semantics (additive → multiplicative) so the boost scales
  with the base score (multiplying by 1.3 alone does not bound the result to 0-1)
- Add error handling and type hints to all API routes
- Update README with proper project documentation

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 README.md                          | 68 +++++++++++++++++++++++++++++-
 src/atocore/api/routes.py          | 53 ++++++++++++++---------
 src/atocore/context/builder.py     |  2 +-
 src/atocore/ingestion/chunker.py   |  4 ++
 src/atocore/retrieval/retriever.py | 15 +++++--
 5 files changed, 117 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 51197b4..02683a2 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,67 @@
-# ATODrive
+# AtoCore
 
-ATODrive project repository
\ No newline at end of file
+Personal context engine that enriches LLM interactions with durable memory, structured context, and project knowledge.
+
+## Quick Start
+
+```bash
+pip install -e .
+uvicorn src.atocore.main:app --port 8100
+```
+
+## Usage
+
+```bash
+# Ingest markdown files
+curl -X POST http://localhost:8100/ingest \
+  -H "Content-Type: application/json" \
+  -d '{"path": "/path/to/notes"}'
+
+# Build enriched context for a prompt
+curl -X POST http://localhost:8100/context/build \
+  -H "Content-Type: application/json" \
+  -d '{"prompt": "What is the project status?", "project": "myproject"}'
+
+# CLI ingestion
+python scripts/ingest_folder.py --path /path/to/notes
+```
+
+## API Endpoints
+
+| Method | Path | Description |
+|--------|------|-------------|
+| POST | /ingest | Ingest markdown file or folder |
+| POST | /query | Retrieve relevant chunks |
+| POST | /context/build | Build full context pack |
+| GET | /health | Health check |
+| GET | /debug/context | Inspect last context pack |
+
+## Architecture
+
+```
+FastAPI (port 8100)
+  ├── Ingestion: markdown → parse → chunk → embed → store
+  ├── Retrieval: query → embed → vector search → rank
+  ├── Context Builder: retrieve → boost → budget → format
+  ├── SQLite (documents, chunks, memories, projects, interactions)
+  └── ChromaDB (vector embeddings)
+```
+
+## Configuration
+
+Set via environment variables (prefix `ATOCORE_`):
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| ATOCORE_DEBUG | false | Enable debug logging |
+| ATOCORE_PORT | 8100 | Server port |
+| ATOCORE_CHUNK_MAX_SIZE | 800 | Max chunk size (chars) |
+| ATOCORE_CONTEXT_BUDGET | 3000 | Context pack budget (chars) |
+| ATOCORE_EMBEDDING_MODEL | paraphrase-multilingual-MiniLM-L12-v2 | Embedding model |
+
+## Testing
+
+```bash
+pip install -e ".[dev]"
+pytest
+```
diff --git a/src/atocore/api/routes.py b/src/atocore/api/routes.py
index 8f3e59d..3dc3007 100644
--- a/src/atocore/api/routes.py
+++ b/src/atocore/api/routes.py
@@ -6,23 +6,24 @@ from fastapi import APIRouter, HTTPException
 from pydantic import BaseModel
 
 from atocore.context.builder import (
-    ContextPack,
     build_context,
     get_last_context_pack,
     _pack_to_dict,
 )
 from atocore.ingestion.pipeline import ingest_file, ingest_folder
+from atocore.observability.logger import get_logger
 from atocore.retrieval.retriever import retrieve
 from atocore.retrieval.vector_store import get_vector_store
 
 router = APIRouter()
+log = get_logger("api")
 
 
 # --- Request/Response models ---
 
 
 class IngestRequest(BaseModel):
-    path: str  # file or folder path
+    path: str
 
 
 class IngestResponse(BaseModel):
@@ -60,22 +61,32 @@ class ContextBuildResponse(BaseModel):
 
 
 @router.post("/ingest", response_model=IngestResponse)
-def api_ingest(req: IngestRequest):
+def api_ingest(req: IngestRequest) -> IngestResponse:
     """Ingest a markdown file or folder."""
     target = Path(req.path)
-    if target.is_file():
-        results = [ingest_file(target)]
-    elif target.is_dir():
-        results = ingest_folder(target)
-    else:
-        raise HTTPException(status_code=404, detail=f"Path not found: {req.path}")
+    try:
+        if target.is_file():
+            results = [ingest_file(target)]
+        elif target.is_dir():
+            results = ingest_folder(target)
+        else:
+            raise HTTPException(status_code=404, detail=f"Path not found: {req.path}")
+    except HTTPException:
+        raise
+    except Exception as e:
+        log.error("ingest_failed", path=req.path, error=str(e))
+        raise HTTPException(status_code=500, detail=f"Ingestion failed: {e}")
     return IngestResponse(results=results)
 
 
 @router.post("/query", response_model=QueryResponse)
-def api_query(req: QueryRequest):
+def api_query(req: QueryRequest) -> QueryResponse:
     """Retrieve relevant chunks for a prompt."""
-    chunks = retrieve(req.prompt, top_k=req.top_k, filter_tags=req.filter_tags)
+    try:
+        chunks = retrieve(req.prompt, top_k=req.top_k, filter_tags=req.filter_tags)
+    except Exception as e:
+        log.error("query_failed", prompt=req.prompt[:100], error=str(e))
+        raise HTTPException(status_code=500, detail=f"Query failed: {e}")
     return QueryResponse(
         results=[
             {
@@ -92,13 +103,17 @@ def api_query(req: QueryRequest):
 
 
 @router.post("/context/build", response_model=ContextBuildResponse)
-def api_build_context(req: ContextBuildRequest):
+def api_build_context(req: ContextBuildRequest) -> ContextBuildResponse:
     """Build a full context pack for a prompt."""
-    pack = build_context(
-        user_prompt=req.prompt,
-        project_hint=req.project,
-        budget=req.budget,
-    )
+    try:
+        pack = build_context(
+            user_prompt=req.prompt,
+            project_hint=req.project,
+            budget=req.budget,
+        )
+    except Exception as e:
+        log.error("context_build_failed", prompt=req.prompt[:100], error=str(e))
+        raise HTTPException(status_code=500, detail=f"Context build failed: {e}")
     pack_dict = _pack_to_dict(pack)
     return ContextBuildResponse(
         formatted_context=pack.formatted_context,
@@ -113,7 +128,7 @@ def api_build_context(req: ContextBuildRequest):
 
 
 @router.get("/health")
-def api_health():
+def api_health() -> dict:
     """Health check."""
     store = get_vector_store()
     return {
@@ -124,7 +139,7 @@ def api_health():
 
 
 @router.get("/debug/context")
-def api_debug_context():
+def api_debug_context() -> dict:
     """Inspect the last assembled context pack."""
     pack = get_last_context_pack()
     if pack is None:
diff --git a/src/atocore/context/builder.py b/src/atocore/context/builder.py
index 62c53c8..eb02c1d 100644
--- a/src/atocore/context/builder.py
+++ b/src/atocore/context/builder.py
@@ -128,7 +128,7 @@ def _rank_chunks(
             hint_lower = project_hint.lower()
 
             if hint_lower in tags_str or hint_lower in source_str or hint_lower in title_str:
-                final_score += 0.3
+                final_score *= 1.3
 
         scored.append((final_score, chunk))
 
diff --git a/src/atocore/ingestion/chunker.py b/src/atocore/ingestion/chunker.py
index 90e7e54..6d7d201 100644
--- a/src/atocore/ingestion/chunker.py
+++ b/src/atocore/ingestion/chunker.py
@@ -137,6 +137,10 @@ def _split_by_paragraphs(
 
 def _hard_split(text: str, max_size: int, overlap: int) -> list[str]:
     """Hard split text at max_size with overlap."""
+    # Prevent infinite loop: overlap must be less than max_size
+    if overlap >= max_size:
+        overlap = max_size // 4
+
     chunks = []
     start = 0
     while start < len(text):
diff --git a/src/atocore/retrieval/retriever.py b/src/atocore/retrieval/retriever.py
index ddb3b0e..6920e11 100644
--- a/src/atocore/retrieval/retriever.py
+++ b/src/atocore/retrieval/retriever.py
@@ -36,11 +36,20 @@ def retrieve(
     store = get_vector_store()
 
     # Build filter
+    # Tags are stored as a JSON string like '["tag1", "tag2"]', so each tag is
+    # matched together with its surrounding quotes via $contains: searching
+    # '"prod"' cannot match "production". Multiple tags are combined with $and.
     where = None
     if filter_tags:
-        # ChromaDB where filter for tags (stored as JSON string)
-        # Simple contains check — works for single-tag filtering
-        where = {"tags": {"$contains": filter_tags[0]}}
+        if len(filter_tags) == 1:
+            where = {"tags": {"$contains": f'"{filter_tags[0]}"'}}
+        else:
+            where = {
+                "$and": [
+                    {"tags": {"$contains": f'"{tag}"'}}
+                    for tag in filter_tags
+                ]
+            }
 
     results = store.query(
         query_embedding=query_embedding,