feat: implement AtoCore Phase 0 + Phase 0.5 (foundation + PoC)
Complete implementation of the personal context engine foundation: - FastAPI server with 5 endpoints (ingest, query, context/build, health, debug) - SQLite database with 5 tables (source_documents, source_chunks, memories, projects, interactions) - Heading-aware markdown chunker (800 char max, recursive splitting) - Multilingual embeddings via sentence-transformers (EN/FR) - ChromaDB vector store with cosine similarity retrieval - Context builder with project boosting, dedup, and budget enforcement - CLI scripts for batch ingestion and test prompt evaluation - 19 unit tests passing, 79% coverage - Validated on 482 real project files (8383 chunks, 0 errors) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
3
src/atocore/__init__.py
Normal file
3
src/atocore/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""AtoCore — Personal Context Engine."""
|
||||
|
||||
__version__ = "0.1.0"
|
||||
0
src/atocore/api/__init__.py
Normal file
0
src/atocore/api/__init__.py
Normal file
132
src/atocore/api/routes.py
Normal file
132
src/atocore/api/routes.py
Normal file
@@ -0,0 +1,132 @@
|
||||
"""FastAPI route definitions."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from atocore.context.builder import (
|
||||
ContextPack,
|
||||
build_context,
|
||||
get_last_context_pack,
|
||||
_pack_to_dict,
|
||||
)
|
||||
from atocore.ingestion.pipeline import ingest_file, ingest_folder
|
||||
from atocore.retrieval.retriever import retrieve
|
||||
from atocore.retrieval.vector_store import get_vector_store
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
# --- Request/Response models ---
|
||||
|
||||
|
||||
class IngestRequest(BaseModel):
    """Request body for POST /ingest."""

    path: str  # file or folder path


class IngestResponse(BaseModel):
    """Response for POST /ingest: one stats dict per file processed."""

    results: list[dict]


class QueryRequest(BaseModel):
    """Request body for POST /query."""

    prompt: str
    top_k: int = 10  # number of chunks to retrieve
    filter_tags: list[str] | None = None  # optional tag filter


class QueryResponse(BaseModel):
    """Response for POST /query: ranked chunk dicts."""

    results: list[dict]


class ContextBuildRequest(BaseModel):
    """Request body for POST /context/build."""

    prompt: str
    project: str | None = None  # project hint used to boost matching chunks
    budget: int | None = None  # character budget; None uses the configured default


class ContextBuildResponse(BaseModel):
    """Response for POST /context/build: assembled context pack summary."""

    formatted_context: str  # the context block only
    full_prompt: str  # system prefix + context + user prompt
    chunks_used: int
    total_chars: int
    budget: int
    budget_remaining: int
    duration_ms: int
    chunks: list[dict]  # per-chunk metadata previews
|
||||
|
||||
|
||||
# --- Endpoints ---
|
||||
|
||||
|
||||
@router.post("/ingest", response_model=IngestResponse)
def api_ingest(req: IngestRequest):
    """Ingest a markdown file or folder."""
    target = Path(req.path)
    # Dispatch on path kind; anything that is neither file nor directory is a 404.
    if target.is_dir():
        outcome = ingest_folder(target)
    elif target.is_file():
        outcome = [ingest_file(target)]
    else:
        raise HTTPException(status_code=404, detail=f"Path not found: {req.path}")
    return IngestResponse(results=outcome)
|
||||
|
||||
|
||||
@router.post("/query", response_model=QueryResponse)
def api_query(req: QueryRequest):
    """Retrieve relevant chunks for a prompt."""
    hits = retrieve(req.prompt, top_k=req.top_k, filter_tags=req.filter_tags)
    payload = []
    for hit in hits:
        payload.append(
            {
                "chunk_id": hit.chunk_id,
                "content": hit.content,
                "score": hit.score,
                "heading_path": hit.heading_path,
                "source_file": hit.source_file,
                "title": hit.title,
            }
        )
    return QueryResponse(results=payload)
|
||||
|
||||
|
||||
@router.post("/context/build", response_model=ContextBuildResponse)
def api_build_context(req: ContextBuildRequest):
    """Build a full context pack for a prompt."""
    pack = build_context(
        user_prompt=req.prompt, project_hint=req.project, budget=req.budget
    )
    # Reuse the debug serializer for the per-chunk previews.
    as_dict = _pack_to_dict(pack)
    return ContextBuildResponse(
        formatted_context=pack.formatted_context,
        full_prompt=pack.full_prompt,
        chunks_used=len(pack.chunks_used),
        total_chars=pack.total_chars,
        budget=pack.budget,
        budget_remaining=pack.budget_remaining,
        duration_ms=pack.duration_ms,
        chunks=as_dict["chunks"],
    )
|
||||
|
||||
|
||||
@router.get("/health")
def api_health():
    """Health check reporting version and indexed vector count."""
    return {
        "status": "ok",
        "version": "0.1.0",
        "vectors_count": get_vector_store().count,
    }
|
||||
|
||||
|
||||
@router.get("/debug/context")
def api_debug_context():
    """Inspect the last assembled context pack."""
    pack = get_last_context_pack()
    if pack is not None:
        return _pack_to_dict(pack)
    return {"message": "No context pack built yet."}
|
||||
39
src/atocore/config.py
Normal file
39
src/atocore/config.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""AtoCore configuration via environment variables."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """AtoCore configuration, overridable via ATOCORE_* environment variables."""

    debug: bool = False  # enables console log rendering and DEBUG level
    data_dir: Path = Path("./data")  # root for the SQLite DB and Chroma store
    host: str = "127.0.0.1"
    port: int = 8100

    # Embedding
    embedding_model: str = (
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )

    # Chunking
    chunk_max_size: int = 800  # maximum characters per chunk
    chunk_overlap: int = 100  # overlap used for hard splits
    chunk_min_size: int = 50  # chunks smaller than this are dropped

    # Context
    context_budget: int = 3000  # character budget for the context block
    context_top_k: int = 15  # candidates fetched from the vector store

    model_config = {"env_prefix": "ATOCORE_"}

    @property
    def db_path(self) -> Path:
        """Path of the SQLite database file."""
        return self.data_dir / "atocore.db"

    @property
    def chroma_path(self) -> Path:
        """Path of the ChromaDB persistence directory."""
        return self.data_dir / "chroma"


# Singleton settings instance shared across the application.
settings = Settings()
|
||||
0
src/atocore/context/__init__.py
Normal file
0
src/atocore/context/__init__.py
Normal file
212
src/atocore/context/builder.py
Normal file
212
src/atocore/context/builder.py
Normal file
@@ -0,0 +1,212 @@
|
||||
"""Context pack assembly: retrieve, rank, budget, format."""
|
||||
|
||||
import json
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
from atocore.config import settings
|
||||
from atocore.observability.logger import get_logger
|
||||
from atocore.retrieval.retriever import ChunkResult, retrieve
|
||||
|
||||
log = get_logger("context_builder")
|
||||
|
||||
# Instruction block prepended to every assembled prompt; tells the LLM how to
# treat the injected context.
SYSTEM_PREFIX = (
    "You have access to the following personal context from the user's knowledge base.\n"
    "Use it to inform your answer. If the context is not relevant, ignore it.\n"
    "Do not mention the context system unless asked."
)

# Last built context pack for debug inspection
# (written by build_context, read by get_last_context_pack).
_last_context_pack: "ContextPack | None" = None
|
||||
|
||||
|
||||
@dataclass
class ContextChunk:
    """A chunk that was selected into a context pack."""

    content: str  # chunk text included in the prompt
    source_file: str  # shortened display path of the source file
    heading_path: str  # heading trail within the source document
    score: float  # final ranking score (similarity plus boosts)
    char_count: int  # len(content); used for budget accounting
|
||||
|
||||
|
||||
@dataclass
class ContextPack:
    """Result of build_context: the selected chunks plus the assembled prompt."""

    chunks_used: list[ContextChunk] = field(default_factory=list)  # in rank order
    total_chars: int = 0  # sum of chunk char_counts
    budget: int = 0  # character budget used for selection
    budget_remaining: int = 0  # budget - total_chars
    formatted_context: str = ""  # the "--- AtoCore Context ---" block
    full_prompt: str = ""  # SYSTEM_PREFIX + context block + user prompt
    query: str = ""  # original user prompt
    project_hint: str = ""  # hint used for boosting ("" when none given)
    duration_ms: int = 0  # wall-clock build time
|
||||
|
||||
|
||||
def build_context(
    user_prompt: str,
    project_hint: str | None = None,
    budget: int | None = None,
) -> ContextPack:
    """Build a context pack for a user prompt.

    Pipeline: retrieve candidates -> rank (dedup + project boost) ->
    select within the character budget -> format -> assemble full prompt.

    Args:
        user_prompt: The user's raw prompt.
        project_hint: Optional project name used to boost matching chunks.
        budget: Character budget for the context block; falls back to
            settings.context_budget when None.

    Returns:
        The assembled ContextPack (also cached for /debug/context).
    """
    # Side effect: caches the pack in _last_context_pack for debug inspection.
    global _last_context_pack
    start = time.time()
    budget = budget or settings.context_budget

    # 1. Retrieve candidates
    candidates = retrieve(user_prompt, top_k=settings.context_top_k)

    # 2. Score and rank
    scored = _rank_chunks(candidates, project_hint)

    # 3. Select within budget
    selected = _select_within_budget(scored, budget)

    # 4. Format
    formatted = _format_context_block(selected)

    # 5. Build full prompt
    full_prompt = f"{SYSTEM_PREFIX}\n\n{formatted}\n\n{user_prompt}"

    total_chars = sum(c.char_count for c in selected)
    duration_ms = int((time.time() - start) * 1000)

    pack = ContextPack(
        chunks_used=selected,
        total_chars=total_chars,
        budget=budget,
        budget_remaining=budget - total_chars,
        formatted_context=formatted,
        full_prompt=full_prompt,
        query=user_prompt,
        project_hint=project_hint or "",
        duration_ms=duration_ms,
    )

    _last_context_pack = pack

    log.info(
        "context_built",
        chunks_used=len(selected),
        total_chars=total_chars,
        budget_remaining=budget - total_chars,
        duration_ms=duration_ms,
    )
    log.debug("context_pack_detail", pack=_pack_to_dict(pack))

    return pack
|
||||
|
||||
|
||||
def get_last_context_pack() -> ContextPack | None:
    """Return the last built context pack for debug inspection.

    Returns None until build_context has run at least once in this process.
    """
    return _last_context_pack
|
||||
|
||||
|
||||
def _rank_chunks(
    candidates: list[ChunkResult],
    project_hint: str | None,
) -> list[tuple[float, ChunkResult]]:
    """Rank candidates with boosting for project match.

    Near-duplicates (same first 200 chars) are dropped; a flat +0.3 boost is
    applied when the project hint appears in a chunk's tags, path, or title.
    """
    hint = project_hint.lower() if project_hint else None
    ranked: list[tuple[float, ChunkResult]] = []
    seen_prefixes: set[str] = set()

    for candidate in candidates:
        # Deduplicate by content prefix (first 200 chars).
        prefix = candidate.content[:200]
        if prefix in seen_prefixes:
            continue
        seen_prefixes.add(prefix)

        # Start from the similarity score, then add the project boost.
        final = candidate.score
        if hint:
            haystacks = (
                (candidate.tags or "").lower(),
                candidate.source_file.lower(),
                (candidate.title or "").lower(),
            )
            if any(hint in field_text for field_text in haystacks):
                final += 0.3

        ranked.append((final, candidate))

    # Highest score first.
    return sorted(ranked, key=lambda pair: pair[0], reverse=True)
|
||||
|
||||
|
||||
def _select_within_budget(
    scored: list[tuple[float, ChunkResult]],
    budget: int,
) -> list[ContextChunk]:
    """Select top chunks that fit within the character budget."""
    picked: list[ContextChunk] = []
    consumed = 0

    for rank_score, candidate in scored:
        size = len(candidate.content)
        # Too big for the remaining budget; a smaller chunk further down
        # the ranking may still fit, so keep scanning.
        if consumed + size > budget:
            continue
        consumed += size
        picked.append(
            ContextChunk(
                content=candidate.content,
                source_file=_shorten_path(candidate.source_file),
                heading_path=candidate.heading_path,
                score=rank_score,
                char_count=size,
            )
        )

    return picked
|
||||
|
||||
|
||||
def _format_context_block(chunks: list[ContextChunk]) -> str:
    """Format chunks into the context block string."""
    header = "--- AtoCore Context ---"
    footer = "--- End Context ---"

    if not chunks:
        return f"{header}\nNo relevant context found.\n{footer}"

    parts = [header]
    for item in chunks:
        # One provenance line per chunk, then its content and a blank spacer.
        parts.append(
            f"[Source: {item.source_file} | Section: {item.heading_path} | Score: {item.score:.2f}]"
        )
        parts.append(item.content)
        parts.append("")
    parts.append(footer)
    return "\n".join(parts)
|
||||
|
||||
|
||||
def _shorten_path(path: str) -> str:
|
||||
"""Shorten an absolute path to a relative-like display."""
|
||||
p = Path(path)
|
||||
parts = p.parts
|
||||
# Show last 3 parts at most
|
||||
if len(parts) > 3:
|
||||
return str(Path(*parts[-3:]))
|
||||
return str(p)
|
||||
|
||||
|
||||
def _pack_to_dict(pack: ContextPack) -> dict:
|
||||
"""Convert a context pack to a JSON-serializable dict."""
|
||||
return {
|
||||
"query": pack.query,
|
||||
"project_hint": pack.project_hint,
|
||||
"chunks_used": len(pack.chunks_used),
|
||||
"total_chars": pack.total_chars,
|
||||
"budget": pack.budget,
|
||||
"budget_remaining": pack.budget_remaining,
|
||||
"duration_ms": pack.duration_ms,
|
||||
"chunks": [
|
||||
{
|
||||
"source_file": c.source_file,
|
||||
"heading_path": c.heading_path,
|
||||
"score": c.score,
|
||||
"char_count": c.char_count,
|
||||
"content_preview": c.content[:100],
|
||||
}
|
||||
for c in pack.chunks_used
|
||||
],
|
||||
}
|
||||
0
src/atocore/ingestion/__init__.py
Normal file
0
src/atocore/ingestion/__init__.py
Normal file
146
src/atocore/ingestion/chunker.py
Normal file
146
src/atocore/ingestion/chunker.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""Heading-aware recursive markdown chunking."""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from atocore.config import settings
|
||||
|
||||
|
||||
@dataclass
class Chunk:
    """A single chunk produced by chunk_markdown."""

    content: str  # stripped chunk text
    chunk_index: int  # position within the source document
    heading_path: str  # heading trail, e.g. "Section > Subsection"
    char_count: int  # len(content)
    metadata: dict = field(default_factory=dict)  # copy of base_metadata
|
||||
|
||||
|
||||
def chunk_markdown(
    body: str,
    base_metadata: dict | None = None,
    max_size: int | None = None,
    overlap: int | None = None,
    min_size: int | None = None,
) -> list[Chunk]:
    """Split markdown body into chunks using heading-aware strategy.

    1. Split on H2 boundaries
    2. If section > max_size, split on H3
    3. If still > max_size, split on paragraph breaks
    4. If still > max_size, hard split with overlap

    Args:
        body: Markdown text without frontmatter.
        base_metadata: Metadata copied onto every chunk.
        max_size: Maximum chunk size in characters (defaults from settings).
        overlap: Overlap for hard splits (defaults from settings).
        min_size: Chunks shorter than this are dropped (defaults from settings).

    Returns:
        Ordered chunks with contiguous chunk_index values.
    """
    # Fix: use `is None` rather than `or` so an explicit 0 passed by a caller
    # is respected instead of being silently replaced by the configured default.
    if max_size is None:
        max_size = settings.chunk_max_size
    if overlap is None:
        overlap = settings.chunk_overlap
    if min_size is None:
        min_size = settings.chunk_min_size
    base_metadata = base_metadata or {}

    sections = _split_by_heading(body, level=2)
    raw_chunks: list[tuple[str, str]] = []  # (heading_path, content)

    for heading, content in sections:
        if len(content) <= max_size:
            raw_chunks.append((heading, content))
            continue
        # Section too large: recurse into H3 subsections.
        for sub_heading, sub_content in _split_by_heading(content, level=3):
            full_path = (
                f"{heading} > {sub_heading}" if heading and sub_heading else heading or sub_heading
            )
            if len(sub_content) <= max_size:
                raw_chunks.append((full_path, sub_content))
            else:
                # Still too large: fall back to paragraph (and hard) splitting.
                for piece in _split_by_paragraphs(sub_content, max_size, overlap):
                    raw_chunks.append((full_path, piece))

    # Build final chunks, filtering out too-small fragments.
    chunks: list[Chunk] = []
    idx = 0
    for heading_path, content in raw_chunks:
        content = content.strip()
        if len(content) < min_size:
            continue
        chunks.append(
            Chunk(
                content=content,
                chunk_index=idx,
                heading_path=heading_path,
                char_count=len(content),
                metadata={**base_metadata},
            )
        )
        idx += 1

    return chunks
|
||||
|
||||
|
||||
def _split_by_heading(text: str, level: int) -> list[tuple[str, str]]:
|
||||
"""Split text by heading level. Returns (heading_text, section_content) pairs."""
|
||||
pattern = rf"^({'#' * level})\s+(.+)$"
|
||||
parts: list[tuple[str, str]] = []
|
||||
current_heading = ""
|
||||
current_lines: list[str] = []
|
||||
|
||||
for line in text.split("\n"):
|
||||
match = re.match(pattern, line)
|
||||
if match:
|
||||
# Save previous section
|
||||
if current_lines:
|
||||
parts.append((current_heading, "\n".join(current_lines)))
|
||||
current_heading = match.group(2).strip()
|
||||
current_lines = []
|
||||
else:
|
||||
current_lines.append(line)
|
||||
|
||||
# Save last section
|
||||
if current_lines:
|
||||
parts.append((current_heading, "\n".join(current_lines)))
|
||||
|
||||
return parts
|
||||
|
||||
|
||||
def _split_by_paragraphs(
|
||||
text: str, max_size: int, overlap: int
|
||||
) -> list[str]:
|
||||
"""Split text by paragraph breaks, then hard-split if needed."""
|
||||
paragraphs = re.split(r"\n\n+", text)
|
||||
chunks: list[str] = []
|
||||
current = ""
|
||||
|
||||
for para in paragraphs:
|
||||
para = para.strip()
|
||||
if not para:
|
||||
continue
|
||||
|
||||
if len(current) + len(para) + 2 <= max_size:
|
||||
current = f"{current}\n\n{para}" if current else para
|
||||
else:
|
||||
if current:
|
||||
chunks.append(current)
|
||||
# If single paragraph exceeds max, hard split
|
||||
if len(para) > max_size:
|
||||
chunks.extend(_hard_split(para, max_size, overlap))
|
||||
else:
|
||||
current = para
|
||||
continue
|
||||
current = ""
|
||||
|
||||
if current:
|
||||
chunks.append(current)
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
def _hard_split(text: str, max_size: int, overlap: int) -> list[str]:
|
||||
"""Hard split text at max_size with overlap."""
|
||||
chunks = []
|
||||
start = 0
|
||||
while start < len(text):
|
||||
end = start + max_size
|
||||
chunks.append(text[start:end])
|
||||
start = end - overlap
|
||||
return chunks
|
||||
65
src/atocore/ingestion/parser.py
Normal file
65
src/atocore/ingestion/parser.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""Markdown file parsing with frontmatter extraction."""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
import frontmatter
|
||||
|
||||
|
||||
@dataclass
class ParsedDocument:
    """Parsed markdown file: frontmatter, body, and heading structure."""

    file_path: str  # absolute (resolved) path
    title: str  # first H1, or a prettified filename fallback
    body: str  # markdown body without frontmatter
    tags: list[str] = field(default_factory=list)  # from frontmatter "tags"
    frontmatter: dict = field(default_factory=dict)  # full frontmatter mapping
    headings: list[tuple[int, str]] = field(default_factory=list)  # (level, text)
|
||||
|
||||
|
||||
def parse_markdown(file_path: Path) -> ParsedDocument:
    """Parse a markdown file, extracting frontmatter and structure."""
    post = frontmatter.loads(file_path.read_text(encoding="utf-8"))

    meta = dict(post.metadata) if post.metadata else {}
    body = post.content.strip()

    # Tags may arrive as a list or as a comma-separated string in frontmatter.
    raw_tags = meta.get("tags", [])
    if isinstance(raw_tags, str):
        tags = [t.strip() for t in raw_tags.split(",") if t.strip()]
    else:
        tags = raw_tags or []

    return ParsedDocument(
        file_path=str(file_path.resolve()),
        title=_extract_title(body, file_path),
        body=body,
        tags=tags,
        frontmatter=meta,
        headings=_extract_headings(body),
    )
|
||||
|
||||
|
||||
def _extract_title(body: str, file_path: Path) -> str:
|
||||
"""Get title from first H1 or fallback to filename."""
|
||||
match = re.search(r"^#\s+(.+)$", body, re.MULTILINE)
|
||||
if match:
|
||||
return match.group(1).strip()
|
||||
return file_path.stem.replace("_", " ").replace("-", " ").title()
|
||||
|
||||
|
||||
def _extract_headings(body: str) -> list[tuple[int, str]]:
|
||||
"""Extract all headings with their level."""
|
||||
headings = []
|
||||
for match in re.finditer(r"^(#{1,4})\s+(.+)$", body, re.MULTILINE):
|
||||
level = len(match.group(1))
|
||||
text = match.group(2).strip()
|
||||
headings.append((level, text))
|
||||
return headings
|
||||
157
src/atocore/ingestion/pipeline.py
Normal file
157
src/atocore/ingestion/pipeline.py
Normal file
@@ -0,0 +1,157 @@
|
||||
"""Ingestion pipeline: parse → chunk → embed → store."""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from atocore.config import settings
|
||||
from atocore.ingestion.chunker import chunk_markdown
|
||||
from atocore.ingestion.parser import parse_markdown
|
||||
from atocore.models.database import get_connection
|
||||
from atocore.observability.logger import get_logger
|
||||
from atocore.retrieval.vector_store import get_vector_store
|
||||
|
||||
log = get_logger("ingestion")
|
||||
|
||||
|
||||
def ingest_file(file_path: Path) -> dict:
    """Ingest a single markdown file. Returns stats.

    Pipeline: hash check (skip when unchanged) -> parse -> chunk ->
    persist rows in SQLite -> embed and store vectors.

    Args:
        file_path: Path to a .md / .markdown file.

    Returns:
        Stats dict with "file" and "status" (skipped/empty/ingested) plus,
        when ingested, "chunks" and "duration_ms".

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file is not markdown.
    """
    start = time.time()
    file_path = file_path.resolve()

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    if file_path.suffix.lower() not in (".md", ".markdown"):
        raise ValueError(f"Not a markdown file: {file_path}")

    # Read and hash (content hash drives change detection on re-ingest)
    raw_content = file_path.read_text(encoding="utf-8")
    file_hash = hashlib.sha256(raw_content.encode()).hexdigest()

    # Check if already ingested and unchanged
    with get_connection() as conn:
        existing = conn.execute(
            "SELECT id, file_hash FROM source_documents WHERE file_path = ?",
            (str(file_path),),
        ).fetchone()

    if existing and existing["file_hash"] == file_hash:
        log.info("file_skipped_unchanged", file_path=str(file_path))
        return {"file": str(file_path), "status": "skipped", "reason": "unchanged"}

    # Parse
    parsed = parse_markdown(file_path)

    # Chunk
    base_meta = {
        "source_file": str(file_path),
        "tags": parsed.tags,
        "title": parsed.title,
    }
    chunks = chunk_markdown(parsed.body, base_metadata=base_meta)

    if not chunks:
        log.warning("no_chunks_created", file_path=str(file_path))
        return {"file": str(file_path), "status": "empty", "chunks": 0}

    # Store in DB and vector store
    doc_id = str(uuid.uuid4())
    vector_store = get_vector_store()

    with get_connection() as conn:
        # Remove old data if re-ingesting (the original document id is kept)
        if existing:
            doc_id = existing["id"]
            old_chunk_ids = [
                row["id"]
                for row in conn.execute(
                    "SELECT id FROM source_chunks WHERE document_id = ?",
                    (doc_id,),
                ).fetchall()
            ]
            conn.execute(
                "DELETE FROM source_chunks WHERE document_id = ?", (doc_id,)
            )
            conn.execute(
                "UPDATE source_documents SET file_hash = ?, title = ?, tags = ?, updated_at = CURRENT_TIMESTAMP WHERE id = ?",
                (file_hash, parsed.title, json.dumps(parsed.tags), doc_id),
            )
            # Remove old vectors
            if old_chunk_ids:
                vector_store.delete(old_chunk_ids)
        else:
            conn.execute(
                "INSERT INTO source_documents (id, file_path, file_hash, title, doc_type, tags) VALUES (?, ?, ?, ?, ?, ?)",
                (doc_id, str(file_path), file_hash, parsed.title, "markdown", json.dumps(parsed.tags)),
            )

        # Insert chunks
        chunk_ids = []
        chunk_contents = []
        chunk_metadatas = []

        for chunk in chunks:
            chunk_id = str(uuid.uuid4())
            chunk_ids.append(chunk_id)
            chunk_contents.append(chunk.content)
            chunk_metadatas.append({
                "document_id": doc_id,
                "heading_path": chunk.heading_path,
                "source_file": str(file_path),
                "tags": json.dumps(parsed.tags),
                "title": parsed.title,
            })

            conn.execute(
                "INSERT INTO source_chunks (id, document_id, chunk_index, content, heading_path, char_count, metadata) VALUES (?, ?, ?, ?, ?, ?, ?)",
                (
                    chunk_id,
                    doc_id,
                    chunk.chunk_index,
                    chunk.content,
                    chunk.heading_path,
                    chunk.char_count,
                    json.dumps(chunk.metadata),
                ),
            )

        # Store embeddings
        vector_store.add(chunk_ids, chunk_contents, chunk_metadatas)

    duration_ms = int((time.time() - start) * 1000)
    log.info(
        "file_ingested",
        file_path=str(file_path),
        chunks_created=len(chunks),
        duration_ms=duration_ms,
    )

    return {
        "file": str(file_path),
        "status": "ingested",
        "chunks": len(chunks),
        "duration_ms": duration_ms,
    }
|
||||
|
||||
|
||||
def ingest_folder(folder_path: Path) -> list[dict]:
    """Ingest all markdown files in a folder recursively.

    Args:
        folder_path: Directory to scan recursively.

    Returns:
        One result dict per file (status: ingested/skipped/empty/error).

    Raises:
        NotADirectoryError: If ``folder_path`` is not a directory.
    """
    folder_path = folder_path.resolve()
    if not folder_path.is_dir():
        raise NotADirectoryError(f"Not a directory: {folder_path}")

    # Consistency fix: ingest_file accepts both .md and .markdown, so the
    # folder scan must pick up both extensions too.
    md_files = sorted(
        set(folder_path.rglob("*.md")) | set(folder_path.rglob("*.markdown"))
    )
    log.info("ingestion_started", folder=str(folder_path), file_count=len(md_files))

    results = []
    for md_file in md_files:
        try:
            results.append(ingest_file(md_file))
        except Exception as e:
            # One bad file must not abort the whole batch; record and continue.
            log.error("ingestion_error", file_path=str(md_file), error=str(e))
            results.append({"file": str(md_file), "status": "error", "error": str(e)})

    return results
|
||||
33
src/atocore/main.py
Normal file
33
src/atocore/main.py
Normal file
@@ -0,0 +1,33 @@
|
||||
"""AtoCore — FastAPI application entry point."""
|
||||
|
||||
from fastapi import FastAPI
|
||||
|
||||
from atocore.api.routes import router
|
||||
from atocore.config import settings
|
||||
from atocore.models.database import init_db
|
||||
from atocore.observability.logger import setup_logging
|
||||
|
||||
app = FastAPI(
    title="AtoCore",
    description="Personal Context Engine for LLM interactions",
    version="0.1.0",
)

app.include_router(router)


# NOTE(review): @app.on_event is deprecated in recent FastAPI releases in
# favor of lifespan handlers — consider migrating when upgrading FastAPI.
@app.on_event("startup")
def startup():
    """Initialize logging and the database when the server starts."""
    setup_logging()
    init_db()


if __name__ == "__main__":
    import uvicorn

    # Dev entry point: reload=True watches source files for changes.
    uvicorn.run(
        "atocore.main:app",
        host=settings.host,
        port=settings.port,
        reload=True,
    )
|
||||
0
src/atocore/models/__init__.py
Normal file
0
src/atocore/models/__init__.py
Normal file
98
src/atocore/models/database.py
Normal file
98
src/atocore/models/database.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""SQLite database schema and connection management."""
|
||||
|
||||
import sqlite3
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from typing import Generator
|
||||
|
||||
from atocore.config import settings
|
||||
from atocore.observability.logger import get_logger
|
||||
|
||||
log = get_logger("database")
|
||||
|
||||
SCHEMA_SQL = """
|
||||
CREATE TABLE IF NOT EXISTS source_documents (
|
||||
id TEXT PRIMARY KEY,
|
||||
file_path TEXT UNIQUE NOT NULL,
|
||||
file_hash TEXT NOT NULL,
|
||||
title TEXT,
|
||||
doc_type TEXT DEFAULT 'markdown',
|
||||
tags TEXT DEFAULT '[]',
|
||||
ingested_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS source_chunks (
|
||||
id TEXT PRIMARY KEY,
|
||||
document_id TEXT NOT NULL REFERENCES source_documents(id) ON DELETE CASCADE,
|
||||
chunk_index INTEGER NOT NULL,
|
||||
content TEXT NOT NULL,
|
||||
heading_path TEXT DEFAULT '',
|
||||
char_count INTEGER NOT NULL,
|
||||
metadata TEXT DEFAULT '{}',
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS memories (
|
||||
id TEXT PRIMARY KEY,
|
||||
memory_type TEXT NOT NULL,
|
||||
content TEXT NOT NULL,
|
||||
source_chunk_id TEXT REFERENCES source_chunks(id),
|
||||
confidence REAL DEFAULT 1.0,
|
||||
status TEXT DEFAULT 'active',
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS projects (
|
||||
id TEXT PRIMARY KEY,
|
||||
name TEXT UNIQUE NOT NULL,
|
||||
description TEXT DEFAULT '',
|
||||
status TEXT DEFAULT 'active',
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS interactions (
|
||||
id TEXT PRIMARY KEY,
|
||||
prompt TEXT NOT NULL,
|
||||
context_pack TEXT DEFAULT '{}',
|
||||
response_summary TEXT DEFAULT '',
|
||||
project_id TEXT REFERENCES projects(id),
|
||||
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_chunks_document ON source_chunks(document_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_memories_type ON memories(memory_type);
|
||||
CREATE INDEX IF NOT EXISTS idx_memories_status ON memories(status);
|
||||
CREATE INDEX IF NOT EXISTS idx_interactions_project ON interactions(project_id);
|
||||
"""
|
||||
|
||||
|
||||
def _ensure_data_dir() -> None:
    """Create the data directory (and parents) if it does not exist."""
    settings.data_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def init_db() -> None:
    """Initialize the database with schema.

    Idempotent: all DDL in SCHEMA_SQL uses IF NOT EXISTS.
    """
    _ensure_data_dir()
    with get_connection() as conn:
        conn.executescript(SCHEMA_SQL)
    log.info("database_initialized", path=str(settings.db_path))
|
||||
|
||||
|
||||
@contextmanager
def get_connection() -> Generator[sqlite3.Connection, None, None]:
    """Get a database connection with row factory.

    Commits on clean exit, rolls back on exception, always closes.
    """
    _ensure_data_dir()
    conn = sqlite3.connect(str(settings.db_path))
    conn.row_factory = sqlite3.Row  # rows support name-based column access
    conn.execute("PRAGMA foreign_keys = ON")  # FK enforcement is off by default in SQLite
    try:
        yield conn
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
|
||||
0
src/atocore/observability/__init__.py
Normal file
0
src/atocore/observability/__init__.py
Normal file
41
src/atocore/observability/logger.py
Normal file
41
src/atocore/observability/logger.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""Structured logging for AtoCore."""
|
||||
|
||||
import logging
|
||||
|
||||
import structlog
|
||||
|
||||
from atocore.config import settings
|
||||
|
||||
# Map level names to stdlib logging constants for structlog's level filter.
_LOG_LEVELS = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR,
}
|
||||
|
||||
|
||||
def setup_logging() -> None:
    """Configure structlog with JSON output.

    Debug mode switches to a human-readable console renderer and DEBUG level.
    """
    log_level = "DEBUG" if settings.debug else "INFO"

    structlog.configure(
        processors=[
            structlog.contextvars.merge_contextvars,
            structlog.processors.add_log_level,
            structlog.processors.TimeStamper(fmt="iso"),
            # Console renderer for humans in debug, JSON lines otherwise.
            structlog.dev.ConsoleRenderer()
            if settings.debug
            else structlog.processors.JSONRenderer(),
        ],
        wrapper_class=structlog.make_filtering_bound_logger(
            _LOG_LEVELS.get(log_level, logging.INFO)
        ),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )
|
||||
|
||||
|
||||
def get_logger(name: str) -> structlog.BoundLogger:
    """Get a named logger bound to the configured structlog pipeline."""
    return structlog.get_logger(name)
|
||||
0
src/atocore/retrieval/__init__.py
Normal file
0
src/atocore/retrieval/__init__.py
Normal file
32
src/atocore/retrieval/embeddings.py
Normal file
32
src/atocore/retrieval/embeddings.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""Embedding model management."""
|
||||
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
from atocore.config import settings
|
||||
from atocore.observability.logger import get_logger
|
||||
|
||||
log = get_logger("embeddings")
|
||||
|
||||
# Lazily-loaded singleton model (loading is slow; do it once per process).
_model: SentenceTransformer | None = None


def get_model() -> SentenceTransformer:
    """Load and cache the embedding model."""
    global _model
    if _model is None:
        log.info("loading_embedding_model", model=settings.embedding_model)
        _model = SentenceTransformer(settings.embedding_model)
        log.info("embedding_model_loaded", model=settings.embedding_model)
    return _model
|
||||
|
||||
|
||||
def embed_texts(texts: list[str]) -> list[list[float]]:
    """Generate embeddings for a list of texts.

    Embeddings are L2-normalized (normalize_embeddings=True), so cosine
    similarity reduces to a dot product downstream.
    """
    model = get_model()
    embeddings = model.encode(texts, show_progress_bar=False, normalize_embeddings=True)
    return embeddings.tolist()
|
||||
|
||||
|
||||
def embed_query(query: str) -> list[float]:
    """Generate the embedding vector for a single query string."""
    (vector,) = embed_texts([query])
    return vector
|
||||
83
src/atocore/retrieval/retriever.py
Normal file
83
src/atocore/retrieval/retriever.py
Normal file
@@ -0,0 +1,83 @@
|
||||
"""Retrieval: query → ranked chunks."""
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
|
||||
from atocore.config import settings
|
||||
from atocore.observability.logger import get_logger
|
||||
from atocore.retrieval.embeddings import embed_query
|
||||
from atocore.retrieval.vector_store import get_vector_store
|
||||
|
||||
log = get_logger("retriever")
|
||||
|
||||
|
||||
@dataclass
class ChunkResult:
    """A single retrieved chunk with its similarity score and metadata."""

    chunk_id: str  # vector-store ID of the chunk
    content: str  # chunk text as stored in the vector store
    score: float  # cosine similarity (1 - distance), rounded to 4 places
    heading_path: str  # markdown heading breadcrumb; "" when absent
    source_file: str  # originating file path; "" when absent
    tags: str  # JSON-encoded tag list as a string; defaults to "[]"
    title: str  # document title; "" when absent
    document_id: str  # parent document ID; "" when absent
|
||||
|
||||
|
||||
def retrieve(
    query: str,
    top_k: int | None = None,
    filter_tags: list[str] | None = None,
) -> list[ChunkResult]:
    """Retrieve the most relevant chunks for a query.

    Args:
        query: Natural-language query text.
        top_k: Maximum number of chunks to return; defaults to
            ``settings.context_top_k``.
        filter_tags: If given, only chunks whose stored tag string contains
            the first tag are returned (single-tag filtering).

    Returns:
        Chunks ordered by descending similarity score, at most ``top_k``.
    """
    top_k = top_k or settings.context_top_k
    start = time.time()

    query_embedding = embed_query(query)
    store = get_vector_store()

    # BUG FIX: ChromaDB metadata `where` filters do not support `$contains`
    # (that operator exists only for `where_document`), so the previous
    # {"tags": {"$contains": ...}} filter raised on every tag-filtered query.
    # Tags are stored as a JSON string, so substring filtering must happen
    # client-side; fetch a larger candidate pool when filtering so that
    # top_k survivors remain likely.
    fetch_k = top_k * 5 if filter_tags else top_k

    results = store.query(
        query_embedding=query_embedding,
        top_k=fetch_k,
    )

    chunks: list[ChunkResult] = []
    if results and results["ids"] and results["ids"][0]:
        for i, chunk_id in enumerate(results["ids"][0]):
            # ChromaDB returns distances (lower = more similar for cosine);
            # convert to a similarity score (1 - distance).
            distance = results["distances"][0][i] if results["distances"] else 0
            score = 1.0 - distance
            meta = results["metadatas"][0][i] if results["metadatas"] else {}
            content = results["documents"][0][i] if results["documents"] else ""

            tags = meta.get("tags", "[]")
            # Client-side single-tag filter (see note above): substring match
            # against the JSON-encoded tag string, mirroring the intended
            # `$contains` semantics.
            if filter_tags and filter_tags[0] not in tags:
                continue

            chunks.append(
                ChunkResult(
                    chunk_id=chunk_id,
                    content=content,
                    score=round(score, 4),
                    heading_path=meta.get("heading_path", ""),
                    source_file=meta.get("source_file", ""),
                    tags=tags,
                    title=meta.get("title", ""),
                    document_id=meta.get("document_id", ""),
                )
            )
            if len(chunks) >= top_k:
                break

    duration_ms = int((time.time() - start) * 1000)
    log.info(
        "retrieval_done",
        query=query[:100],
        top_k=top_k,
        results_count=len(chunks),
        duration_ms=duration_ms,
    )

    return chunks
|
||||
77
src/atocore/retrieval/vector_store.py
Normal file
77
src/atocore/retrieval/vector_store.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""ChromaDB vector store wrapper."""
|
||||
|
||||
import chromadb
|
||||
|
||||
from atocore.config import settings
|
||||
from atocore.observability.logger import get_logger
|
||||
from atocore.retrieval.embeddings import embed_texts
|
||||
|
||||
log = get_logger("vector_store")
|
||||
|
||||
COLLECTION_NAME = "atocore_chunks"
|
||||
|
||||
_store: "VectorStore | None" = None
|
||||
|
||||
|
||||
class VectorStore:
    """Wrapper around ChromaDB for chunk storage and retrieval."""

    def __init__(self) -> None:
        """Open (or create) the persistent chunk collection on disk."""
        settings.chroma_path.mkdir(parents=True, exist_ok=True)
        client = chromadb.PersistentClient(path=str(settings.chroma_path))
        collection = client.get_or_create_collection(
            name=COLLECTION_NAME,
            metadata={"hnsw:space": "cosine"},
        )
        self._client = client
        self._collection = collection
        log.info("vector_store_initialized", path=str(settings.chroma_path))

    def add(
        self,
        ids: list[str],
        documents: list[str],
        metadatas: list[dict],
    ) -> None:
        """Embed ``documents`` and store them under ``ids`` with ``metadatas``."""
        self._collection.add(
            ids=ids,
            embeddings=embed_texts(documents),
            documents=documents,
            metadatas=metadatas,
        )
        log.debug("vectors_added", count=len(ids))

    def query(
        self,
        query_embedding: list[float],
        top_k: int = 10,
        where: dict | None = None,
    ) -> dict:
        """Return the ``top_k`` nearest chunks for ``query_embedding``.

        ``where`` is an optional ChromaDB metadata filter, forwarded
        unchanged when truthy.
        """
        params: dict = dict(
            query_embeddings=[query_embedding],
            n_results=top_k,
            include=["documents", "metadatas", "distances"],
        )
        if where:
            params["where"] = where
        return self._collection.query(**params)

    def delete(self, ids: list[str]) -> None:
        """Delete chunks by ID; a no-op for an empty list."""
        if not ids:
            return
        self._collection.delete(ids=ids)
        log.debug("vectors_deleted", count=len(ids))

    @property
    def count(self) -> int:
        """Number of chunks currently stored in the collection."""
        return self._collection.count()
|
||||
|
||||
|
||||
def get_vector_store() -> VectorStore:
    """Return the process-wide VectorStore, creating it on first call."""
    global _store
    if _store is not None:
        return _store
    _store = VectorStore()
    return _store
|
||||
Reference in New Issue
Block a user