Files
ATOCore/src/atocore/models/database.py

172 lines
6.7 KiB
Python
Raw Normal View History

"""SQLite database schema and connection management."""
import sqlite3
from contextlib import contextmanager
from pathlib import Path
from typing import Generator
import atocore.config as _config
from atocore.observability.logger import get_logger
log = get_logger("database")
SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS source_documents (
id TEXT PRIMARY KEY,
file_path TEXT UNIQUE NOT NULL,
file_hash TEXT NOT NULL,
title TEXT,
doc_type TEXT DEFAULT 'markdown',
tags TEXT DEFAULT '[]',
ingested_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS source_chunks (
id TEXT PRIMARY KEY,
document_id TEXT NOT NULL REFERENCES source_documents(id) ON DELETE CASCADE,
chunk_index INTEGER NOT NULL,
content TEXT NOT NULL,
heading_path TEXT DEFAULT '',
char_count INTEGER NOT NULL,
metadata TEXT DEFAULT '{}',
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS memories (
id TEXT PRIMARY KEY,
memory_type TEXT NOT NULL,
content TEXT NOT NULL,
project TEXT DEFAULT '',
source_chunk_id TEXT REFERENCES source_chunks(id),
confidence REAL DEFAULT 1.0,
status TEXT DEFAULT 'active',
feat(phase9-B): reinforce active memories from captured interactions Phase 9 Commit B from the agreed plan. With Commit A capturing what AtoCore fed to the LLM and what came back, this commit closes the weakest part of the loop: when a memory is actually referenced in a response, its confidence should drift up, and stale memories that nobody ever mentions should stay where they are. This is reinforcement only — nothing is promoted into trusted state and no candidates are created. Extraction is Commit C. Schema (additive migration): - memories.last_referenced_at DATETIME (null by default) - memories.reference_count INTEGER DEFAULT 0 - idx_memories_last_referenced on last_referenced_at - memories.status now accepts the new "candidate" value so Commit C has the status slot to land on. Existing active/superseded/invalid rows are untouched. New module: src/atocore/memory/reinforcement.py - reinforce_from_interaction(interaction): scans the interaction's response + response_summary for echoes of active memories and bumps confidence / reference_count for each match - matching is intentionally simple and explainable: * normalize both sides (lowercase, collapse whitespace) * require >= 12 chars of memory content to match * compare the leading 80-char window of each memory - the candidate pool is project-scoped memories for the interaction's project + global identity + preference memories, deduplicated - candidates and invalidated memories are NEVER reinforced; only active memories move Memory service changes: - MEMORY_STATUSES = ["candidate", "active", "superseded", "invalid"] - create_memory(status="candidate"|"active"|...) with per-status duplicate scoping so a candidate and an active with identical text can legitimately coexist during review - get_memories(status=...) explicit override of the legacy active_only flag; callers can now list the review queue cleanly - update_memory accepts any valid status including "candidate" - reinforce_memory(id, delta): low-level primitive that bumps confidence (capped at 1.0), increments reference_count, and sets last_referenced_at. Only active memories; returns (applied, old, new) - promote_memory / reject_candidate_memory helpers prepping Commit C Interactions service: - record_interaction(reinforce=True) runs reinforce_from_interaction automatically when the interaction has response content. reinforcement errors are logged but never raised back to the caller so capture itself is never blocked by a flaky downstream. - circular import between interactions service and memory.reinforcement avoided by lazy import inside the function API: - POST /interactions now accepts a reinforce bool field (default true) - POST /interactions/{id}/reinforce runs reinforcement on an existing captured interaction — useful for backfilling or for retrying after a transient error in the automatic pass - response lists which memory ids were reinforced with old / new confidence for audit Tests (17 new, all green): - reinforce_memory bumps, caps at 1.0, accumulates reference_count - reinforce_memory rejects candidates and missing ids - reinforce_memory rejects negative delta - reinforce_from_interaction matches active memory - reinforce_from_interaction ignores candidates and inactive - reinforce_from_interaction requires minimum content length - reinforce_from_interaction handles empty response cleanly - reinforce_from_interaction normalizes casing and whitespace - reinforce_from_interaction deduplicates across memory buckets - record_interaction auto-reinforces by default - record_interaction reinforce=False skips the pass - record_interaction handles empty response - POST /interactions/{id}/reinforce runs against stored interaction - POST /interactions/{id}/reinforce returns 404 for missing id - POST /interactions accepts reinforce=false Full suite: 135 passing (was 118). Trust model unchanged: - reinforcement only moves confidence within the existing active set - the candidate lifecycle is declared but only Commit C will actually create candidate memories - trusted project state is never touched by reinforcement Next: Commit C adds the rule-based extractor that produces candidate memories from captured interactions plus the promote/reject review queue endpoints.
2026-04-06 21:18:38 -04:00
last_referenced_at DATETIME,
reference_count INTEGER DEFAULT 0,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS projects (
id TEXT PRIMARY KEY,
name TEXT UNIQUE NOT NULL,
description TEXT DEFAULT '',
status TEXT DEFAULT 'active',
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS interactions (
id TEXT PRIMARY KEY,
prompt TEXT NOT NULL,
context_pack TEXT DEFAULT '{}',
response_summary TEXT DEFAULT '',
feat(phase9-A): interaction capture loop foundation Phase 9 Commit A from the agreed plan: turn AtoCore from a stateless context enhancer into a system that records what it actually fed to an LLM and what came back. This is the audit trail Reflection (Commit B) and Extraction (Commit C) will be layered on top of. The interactions table existed in the schema since the original PoC but nothing wrote to it. This change makes it real: Schema migration (additive only): - response full LLM response (caller decides how much) - memories_used JSON list of memory ids in the context pack - chunks_used JSON list of chunk ids in the context pack - client identifier of the calling system (openclaw, claude-code, manual, ...) - session_id groups multi-turn conversations - project project name (mirrors the memory module pattern, no FK so capture stays cheap) - indexes on session_id, project, created_at The created_at column is now written explicitly with a SQLite-compatible 'YYYY-MM-DD HH:MM:SS' format so the same string lives in the DB and the returned dataclass. Without this the `since` filter on list_interactions would silently fail because CURRENT_TIMESTAMP and isoformat use different shapes that do not compare cleanly as strings. New module src/atocore/interactions/: - Interaction dataclass - record_interaction() persists one round-trip (prompt required; everything else optional). Refuses empty prompts. - list_interactions() filters by project / session_id / client / since, newest-first, hard-capped at 500 - get_interaction() fetch by id, full response + context pack API endpoints: - POST /interactions capture one interaction - GET /interactions list with summaries (no full response) - GET /interactions/{id} full record incl. response + pack Trust model: - Capture is read-only with respect to memories, project state, and source chunks. Nothing here promotes anything into trusted state. - The audit trail becomes the dataset Commit B (reinforcement) and Commit C (extraction + review queue) will operate on. Tests (13 new, all green): - service: persist + roundtrip every field - service: minimum-fields path (prompt only) - service: empty / whitespace prompt rejected - service: get by id returns None for missing - service: filter by project, session, client - service: ordering newest-first with limit - service: since filter inclusive on cutoff (the bug the timestamp fix above caught) - service: limit=0 returns empty - API: POST records and round-trips through GET /interactions/{id} - API: empty prompt returns 400 - API: missing id returns 404 - API: list filter returns summaries (not full response bodies) Full suite: 118 passing (was 105). master-plan-status.md updated to move Phase 9 from "not started" to "started" with the explicit note that Commit A is in and Commits B/C remain.
2026-04-06 19:31:43 -04:00
response TEXT DEFAULT '',
memories_used TEXT DEFAULT '[]',
chunks_used TEXT DEFAULT '[]',
client TEXT DEFAULT '',
session_id TEXT DEFAULT '',
project TEXT DEFAULT '',
project_id TEXT REFERENCES projects(id),
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX IF NOT EXISTS idx_chunks_document ON source_chunks(document_id);
CREATE INDEX IF NOT EXISTS idx_memories_type ON memories(memory_type);
CREATE INDEX IF NOT EXISTS idx_memories_project ON memories(project);
CREATE INDEX IF NOT EXISTS idx_memories_status ON memories(status);
CREATE INDEX IF NOT EXISTS idx_interactions_project ON interactions(project_id);
feat(phase9-A): interaction capture loop foundation Phase 9 Commit A from the agreed plan: turn AtoCore from a stateless context enhancer into a system that records what it actually fed to an LLM and what came back. This is the audit trail Reflection (Commit B) and Extraction (Commit C) will be layered on top of. The interactions table existed in the schema since the original PoC but nothing wrote to it. This change makes it real: Schema migration (additive only): - response full LLM response (caller decides how much) - memories_used JSON list of memory ids in the context pack - chunks_used JSON list of chunk ids in the context pack - client identifier of the calling system (openclaw, claude-code, manual, ...) - session_id groups multi-turn conversations - project project name (mirrors the memory module pattern, no FK so capture stays cheap) - indexes on session_id, project, created_at The created_at column is now written explicitly with a SQLite-compatible 'YYYY-MM-DD HH:MM:SS' format so the same string lives in the DB and the returned dataclass. Without this the `since` filter on list_interactions would silently fail because CURRENT_TIMESTAMP and isoformat use different shapes that do not compare cleanly as strings. New module src/atocore/interactions/: - Interaction dataclass - record_interaction() persists one round-trip (prompt required; everything else optional). Refuses empty prompts. - list_interactions() filters by project / session_id / client / since, newest-first, hard-capped at 500 - get_interaction() fetch by id, full response + context pack API endpoints: - POST /interactions capture one interaction - GET /interactions list with summaries (no full response) - GET /interactions/{id} full record incl. response + pack Trust model: - Capture is read-only with respect to memories, project state, and source chunks. Nothing here promotes anything into trusted state. - The audit trail becomes the dataset Commit B (reinforcement) and Commit C (extraction + review queue) will operate on. Tests (13 new, all green): - service: persist + roundtrip every field - service: minimum-fields path (prompt only) - service: empty / whitespace prompt rejected - service: get by id returns None for missing - service: filter by project, session, client - service: ordering newest-first with limit - service: since filter inclusive on cutoff (the bug the timestamp fix above caught) - service: limit=0 returns empty - API: POST records and round-trips through GET /interactions/{id} - API: empty prompt returns 400 - API: missing id returns 404 - API: list filter returns summaries (not full response bodies) Full suite: 118 passing (was 105). master-plan-status.md updated to move Phase 9 from "not started" to "started" with the explicit note that Commit A is in and Commits B/C remain.
2026-04-06 19:31:43 -04:00
CREATE INDEX IF NOT EXISTS idx_interactions_project_name ON interactions(project);
CREATE INDEX IF NOT EXISTS idx_interactions_session ON interactions(session_id);
CREATE INDEX IF NOT EXISTS idx_interactions_created_at ON interactions(created_at);
"""
def _ensure_data_dir() -> None:
_config.ensure_runtime_dirs()
def init_db() -> None:
"""Initialize the database with schema."""
_ensure_data_dir()
with get_connection() as conn:
conn.executescript(SCHEMA_SQL)
_apply_migrations(conn)
log.info("database_initialized", path=str(_config.settings.db_path))
def _apply_migrations(conn: sqlite3.Connection) -> None:
"""Apply lightweight schema migrations for existing local databases."""
if not _column_exists(conn, "memories", "project"):
conn.execute("ALTER TABLE memories ADD COLUMN project TEXT DEFAULT ''")
conn.execute("CREATE INDEX IF NOT EXISTS idx_memories_project ON memories(project)")
feat(phase9-B): reinforce active memories from captured interactions Phase 9 Commit B from the agreed plan. With Commit A capturing what AtoCore fed to the LLM and what came back, this commit closes the weakest part of the loop: when a memory is actually referenced in a response, its confidence should drift up, and stale memories that nobody ever mentions should stay where they are. This is reinforcement only — nothing is promoted into trusted state and no candidates are created. Extraction is Commit C. Schema (additive migration): - memories.last_referenced_at DATETIME (null by default) - memories.reference_count INTEGER DEFAULT 0 - idx_memories_last_referenced on last_referenced_at - memories.status now accepts the new "candidate" value so Commit C has the status slot to land on. Existing active/superseded/invalid rows are untouched. New module: src/atocore/memory/reinforcement.py - reinforce_from_interaction(interaction): scans the interaction's response + response_summary for echoes of active memories and bumps confidence / reference_count for each match - matching is intentionally simple and explainable: * normalize both sides (lowercase, collapse whitespace) * require >= 12 chars of memory content to match * compare the leading 80-char window of each memory - the candidate pool is project-scoped memories for the interaction's project + global identity + preference memories, deduplicated - candidates and invalidated memories are NEVER reinforced; only active memories move Memory service changes: - MEMORY_STATUSES = ["candidate", "active", "superseded", "invalid"] - create_memory(status="candidate"|"active"|...) with per-status duplicate scoping so a candidate and an active with identical text can legitimately coexist during review - get_memories(status=...) explicit override of the legacy active_only flag; callers can now list the review queue cleanly - update_memory accepts any valid status including "candidate" - reinforce_memory(id, delta): low-level primitive that bumps confidence (capped at 1.0), increments reference_count, and sets last_referenced_at. Only active memories; returns (applied, old, new) - promote_memory / reject_candidate_memory helpers prepping Commit C Interactions service: - record_interaction(reinforce=True) runs reinforce_from_interaction automatically when the interaction has response content. reinforcement errors are logged but never raised back to the caller so capture itself is never blocked by a flaky downstream. - circular import between interactions service and memory.reinforcement avoided by lazy import inside the function API: - POST /interactions now accepts a reinforce bool field (default true) - POST /interactions/{id}/reinforce runs reinforcement on an existing captured interaction — useful for backfilling or for retrying after a transient error in the automatic pass - response lists which memory ids were reinforced with old / new confidence for audit Tests (17 new, all green): - reinforce_memory bumps, caps at 1.0, accumulates reference_count - reinforce_memory rejects candidates and missing ids - reinforce_memory rejects negative delta - reinforce_from_interaction matches active memory - reinforce_from_interaction ignores candidates and inactive - reinforce_from_interaction requires minimum content length - reinforce_from_interaction handles empty response cleanly - reinforce_from_interaction normalizes casing and whitespace - reinforce_from_interaction deduplicates across memory buckets - record_interaction auto-reinforces by default - record_interaction reinforce=False skips the pass - record_interaction handles empty response - POST /interactions/{id}/reinforce runs against stored interaction - POST /interactions/{id}/reinforce returns 404 for missing id - POST /interactions accepts reinforce=false Full suite: 135 passing (was 118). Trust model unchanged: - reinforcement only moves confidence within the existing active set - the candidate lifecycle is declared but only Commit C will actually create candidate memories - trusted project state is never touched by reinforcement Next: Commit C adds the rule-based extractor that produces candidate memories from captured interactions plus the promote/reject review queue endpoints.
2026-04-06 21:18:38 -04:00
# Phase 9 Commit B: reinforcement columns.
# last_referenced_at records when a memory was most recently referenced
# in a captured interaction; reference_count is a monotonically
# increasing counter bumped on every reference. Together they let
# Reflection (Commit C) and decay (deferred) reason about which
# memories are actually being used versus which have gone cold.
if not _column_exists(conn, "memories", "last_referenced_at"):
conn.execute("ALTER TABLE memories ADD COLUMN last_referenced_at DATETIME")
if not _column_exists(conn, "memories", "reference_count"):
conn.execute("ALTER TABLE memories ADD COLUMN reference_count INTEGER DEFAULT 0")
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_memories_last_referenced ON memories(last_referenced_at)"
)
feat(phase9-A): interaction capture loop foundation Phase 9 Commit A from the agreed plan: turn AtoCore from a stateless context enhancer into a system that records what it actually fed to an LLM and what came back. This is the audit trail Reflection (Commit B) and Extraction (Commit C) will be layered on top of. The interactions table existed in the schema since the original PoC but nothing wrote to it. This change makes it real: Schema migration (additive only): - response full LLM response (caller decides how much) - memories_used JSON list of memory ids in the context pack - chunks_used JSON list of chunk ids in the context pack - client identifier of the calling system (openclaw, claude-code, manual, ...) - session_id groups multi-turn conversations - project project name (mirrors the memory module pattern, no FK so capture stays cheap) - indexes on session_id, project, created_at The created_at column is now written explicitly with a SQLite-compatible 'YYYY-MM-DD HH:MM:SS' format so the same string lives in the DB and the returned dataclass. Without this the `since` filter on list_interactions would silently fail because CURRENT_TIMESTAMP and isoformat use different shapes that do not compare cleanly as strings. New module src/atocore/interactions/: - Interaction dataclass - record_interaction() persists one round-trip (prompt required; everything else optional). Refuses empty prompts. - list_interactions() filters by project / session_id / client / since, newest-first, hard-capped at 500 - get_interaction() fetch by id, full response + context pack API endpoints: - POST /interactions capture one interaction - GET /interactions list with summaries (no full response) - GET /interactions/{id} full record incl. response + pack Trust model: - Capture is read-only with respect to memories, project state, and source chunks. Nothing here promotes anything into trusted state. - The audit trail becomes the dataset Commit B (reinforcement) and Commit C (extraction + review queue) will operate on. Tests (13 new, all green): - service: persist + roundtrip every field - service: minimum-fields path (prompt only) - service: empty / whitespace prompt rejected - service: get by id returns None for missing - service: filter by project, session, client - service: ordering newest-first with limit - service: since filter inclusive on cutoff (the bug the timestamp fix above caught) - service: limit=0 returns empty - API: POST records and round-trips through GET /interactions/{id} - API: empty prompt returns 400 - API: missing id returns 404 - API: list filter returns summaries (not full response bodies) Full suite: 118 passing (was 105). master-plan-status.md updated to move Phase 9 from "not started" to "started" with the explicit note that Commit A is in and Commits B/C remain.
2026-04-06 19:31:43 -04:00
# Phase 9 Commit A: capture loop columns on the interactions table.
# The original schema only carried prompt + project_id + a context_pack
# JSON blob. To make interactions a real audit trail of what AtoCore fed
# the LLM and what came back, we record the full response, the chunk
# and memory ids that were actually used, plus client + session metadata.
if not _column_exists(conn, "interactions", "response"):
conn.execute("ALTER TABLE interactions ADD COLUMN response TEXT DEFAULT ''")
if not _column_exists(conn, "interactions", "memories_used"):
conn.execute("ALTER TABLE interactions ADD COLUMN memories_used TEXT DEFAULT '[]'")
if not _column_exists(conn, "interactions", "chunks_used"):
conn.execute("ALTER TABLE interactions ADD COLUMN chunks_used TEXT DEFAULT '[]'")
if not _column_exists(conn, "interactions", "client"):
conn.execute("ALTER TABLE interactions ADD COLUMN client TEXT DEFAULT ''")
if not _column_exists(conn, "interactions", "session_id"):
conn.execute("ALTER TABLE interactions ADD COLUMN session_id TEXT DEFAULT ''")
if not _column_exists(conn, "interactions", "project"):
conn.execute("ALTER TABLE interactions ADD COLUMN project TEXT DEFAULT ''")
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_interactions_session ON interactions(session_id)"
)
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_interactions_project_name ON interactions(project)"
)
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_interactions_created_at ON interactions(created_at)"
)
def _column_exists(conn: sqlite3.Connection, table: str, column: str) -> bool:
rows = conn.execute(f"PRAGMA table_info({table})").fetchall()
return any(row["name"] == column for row in rows)
@contextmanager
def get_connection() -> Generator[sqlite3.Connection, None, None]:
"""Get a database connection with row factory."""
_ensure_data_dir()
conn = sqlite3.connect(
str(_config.settings.db_path),
timeout=_config.settings.db_busy_timeout_ms / 1000,
)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA foreign_keys = ON")
conn.execute(f"PRAGMA busy_timeout = {_config.settings.db_busy_timeout_ms}")
conn.execute("PRAGMA journal_mode = WAL")
conn.execute("PRAGMA synchronous = NORMAL")
try:
yield conn
conn.commit()
except Exception:
conn.rollback()
raise
finally:
conn.close()