diff --git a/src/atocore/context/builder.py b/src/atocore/context/builder.py index 57e159d..3e4ab7d 100644 --- a/src/atocore/context/builder.py +++ b/src/atocore/context/builder.py @@ -14,7 +14,7 @@ import atocore.config as _config from atocore.context.project_state import format_project_state, get_state from atocore.memory.service import get_memories_for_context from atocore.observability.logger import get_logger -from atocore.projects.registry import get_registered_project +from atocore.projects.registry import resolve_project_name from atocore.retrieval.retriever import ChunkResult, retrieve log = get_logger("context_builder") @@ -85,20 +85,15 @@ def build_context( max(0, int(budget * PROJECT_STATE_BUDGET_RATIO)), ) - # Resolve the project hint through the registry so callers can pass - # an alias (`p05`, `gigabit`) and still find trusted state stored - # under the canonical project id (`p05-interferometer`, - # `p04-gigabit`). The retriever already does this for the - # project-match boost — the project_state lookup needs the same - # courtesy. If the registry has no entry for the hint, fall back to - # the raw hint so a hand-curated project_state entry that predates - # the registry still works. - canonical_project = project_hint - if project_hint: - registered = get_registered_project(project_hint) - if registered is not None: - canonical_project = registered.project_id - + # Canonicalize the project hint through the registry so callers + # can pass an alias (`p05`, `gigabit`) and still find trusted + # state stored under the canonical project id. The same helper + # is used everywhere a project name crosses a trust boundary + # (project_state, memories, interactions). When the registry has + # no entry the helper returns the input unchanged so hand-curated + # state that predates the registry still works. + canonical_project = resolve_project_name(project_hint) if project_hint else "" + if canonical_project: state_entries = get_state(canonical_project) if state_entries: project_state_text = format_project_state(state_entries) diff --git a/src/atocore/context/project_state.py b/src/atocore/context/project_state.py index 82124a5..36091d4 100644 --- a/src/atocore/context/project_state.py +++ b/src/atocore/context/project_state.py @@ -18,6 +18,7 @@ from datetime import datetime, timezone from atocore.models.database import get_connection from atocore.observability.logger import get_logger +from atocore.projects.registry import resolve_project_name log = get_logger("project_state") @@ -101,11 +102,19 @@ def set_state( source: str = "", confidence: float = 1.0, ) -> ProjectStateEntry: - """Set or update a project state entry. Upsert semantics.""" + """Set or update a project state entry. Upsert semantics. + + The ``project_name`` is canonicalized through the registry so a + caller passing an alias (``p05``) ends up writing into the same + row as the canonical id (``p05-interferometer``). Without this + step, alias and canonical names would create two parallel + project rows and fragmented state. + """ if category not in CATEGORIES: raise ValueError(f"Invalid category '{category}'. Must be one of: {CATEGORIES}") _validate_confidence(confidence) + project_name = resolve_project_name(project_name) project_id = ensure_project(project_name) entry_id = str(uuid.uuid4()) now = datetime.now(timezone.utc).isoformat() @@ -153,7 +162,12 @@ def get_state( category: str | None = None, active_only: bool = True, ) -> list[ProjectStateEntry]: - """Get project state entries, optionally filtered by category.""" + """Get project state entries, optionally filtered by category. + + The lookup is canonicalized through the registry so an alias hint + finds the same rows as the canonical id. + """ + project_name = resolve_project_name(project_name) with get_connection() as conn: project = conn.execute( "SELECT id FROM projects WHERE lower(name) = lower(?)", (project_name,) @@ -191,7 +205,12 @@ def get_state( def invalidate_state(project_name: str, category: str, key: str) -> bool: - """Mark a project state entry as superseded.""" + """Mark a project state entry as superseded. + + The lookup is canonicalized through the registry so an alias is + treated as the canonical project for the invalidation lookup. + """ + project_name = resolve_project_name(project_name) with get_connection() as conn: project = conn.execute( "SELECT id FROM projects WHERE lower(name) = lower(?)", (project_name,) diff --git a/src/atocore/interactions/service.py b/src/atocore/interactions/service.py index 0e5fc24..fd45ad4 100644 --- a/src/atocore/interactions/service.py +++ b/src/atocore/interactions/service.py @@ -18,15 +18,24 @@ violating the AtoCore trust hierarchy. from __future__ import annotations import json +import re import uuid from dataclasses import dataclass, field from datetime import datetime, timezone from atocore.models.database import get_connection from atocore.observability.logger import get_logger +from atocore.projects.registry import resolve_project_name log = get_logger("interactions") +# Stored timestamps use 'YYYY-MM-DD HH:MM:SS' (no timezone offset, UTC by +# convention) so they sort lexically and compare cleanly with the SQLite +# CURRENT_TIMESTAMP default. The since filter accepts ISO 8601 strings +# (with 'T', optional 'Z' or +offset, optional fractional seconds) and +# normalizes them to the storage format before the SQL comparison. +_STORAGE_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S" + @dataclass class Interaction: @@ -72,6 +81,13 @@ def record_interaction( if not prompt or not prompt.strip(): raise ValueError("Interaction prompt must be non-empty") + # Canonicalize the project through the registry so an alias and + # the canonical id store under the same bucket. Without this, + # reinforcement and extraction (which both query by raw + # interaction.project) would silently miss memories and create + # candidates in the wrong project. + project = resolve_project_name(project) + interaction_id = str(uuid.uuid4()) # Store created_at explicitly so the same string lives in both the DB # column and the returned dataclass. SQLite's CURRENT_TIMESTAMP uses @@ -159,9 +175,14 @@ def list_interactions( ) -> list[Interaction]: """List captured interactions, optionally filtered. - ``since`` is an ISO timestamp string; only interactions created at or - after that time are returned. ``limit`` is hard-capped at 500 to keep - casual API listings cheap. + ``since`` accepts an ISO 8601 timestamp string (with ``T``, an + optional ``Z`` or numeric offset, optional fractional seconds). + The value is normalized to the storage format (UTC, + ``YYYY-MM-DD HH:MM:SS``) before the SQL comparison so external + callers can pass any of the common ISO shapes without filter + drift. ``project`` is canonicalized through the registry so an + alias finds rows stored under the canonical project id. + ``limit`` is hard-capped at 500 to keep casual API listings cheap. """ if limit <= 0: return [] @@ -172,7 +193,7 @@ def list_interactions( if project: query += " AND project = ?" - params.append(project) + params.append(resolve_project_name(project)) if session_id: query += " AND session_id = ?" params.append(session_id) @@ -181,7 +202,7 @@ def list_interactions( params.append(client) if since: query += " AND created_at >= ?" - params.append(since) + params.append(_normalize_since(since)) query += " ORDER BY created_at DESC LIMIT ?" params.append(limit) @@ -243,3 +264,41 @@ def _safe_json_dict(raw: str | None) -> dict: if not isinstance(value, dict): return {} return value + + +def _normalize_since(since: str) -> str: + """Normalize an ISO 8601 ``since`` filter to the storage format. + + Stored ``created_at`` values are ``YYYY-MM-DD HH:MM:SS`` (no + timezone, UTC by convention). External callers naturally pass + ISO 8601 with ``T`` separator, optional ``Z`` suffix, optional + fractional seconds, and optional ``+HH:MM`` offsets. A naive + string comparison between the two formats fails on the same + day because the lexically-greater ``T`` makes any ISO value + sort after any space-separated value. + + This helper accepts the common ISO shapes plus the bare + storage format and returns the storage format. On a parse + failure it returns the input unchanged so the SQL comparison + fails open (no rows match) instead of raising and breaking + the listing endpoint. + """ + if not since: + return since + candidate = since.strip() + # Python's fromisoformat understands trailing 'Z' from 3.11+ but + # we replace it explicitly for safety against earlier shapes. + if candidate.endswith("Z"): + candidate = candidate[:-1] + "+00:00" + try: + dt = datetime.fromisoformat(candidate) + except ValueError: + # Already in storage format, or unparseable: best-effort + # match the storage format with a regex; if that fails too, + # return the raw input. + if re.fullmatch(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", since): + return since + return since + if dt.tzinfo is not None: + dt = dt.astimezone(timezone.utc).replace(tzinfo=None) + return dt.strftime(_STORAGE_TIMESTAMP_FORMAT) diff --git a/src/atocore/memory/service.py b/src/atocore/memory/service.py index 8a6f33e..79ea29b 100644 --- a/src/atocore/memory/service.py +++ b/src/atocore/memory/service.py @@ -29,6 +29,7 @@ from datetime import datetime, timezone from atocore.models.database import get_connection from atocore.observability.logger import get_logger +from atocore.projects.registry import resolve_project_name log = get_logger("memory") @@ -84,6 +85,13 @@ def create_memory( raise ValueError(f"Invalid status '{status}'. Must be one of: {MEMORY_STATUSES}") _validate_confidence(confidence) + # Canonicalize the project through the registry so an alias and + # the canonical id store under the same bucket. This keeps + # reinforcement queries (which use the interaction's project) and + # context retrieval (which uses the registry-canonicalized hint) + # consistent with how memories are created. + project = resolve_project_name(project) + memory_id = str(uuid.uuid4()) now = datetime.now(timezone.utc).isoformat() @@ -162,8 +170,13 @@ def get_memories( query += " AND memory_type = ?" params.append(memory_type) if project is not None: + # Canonicalize on the read side so a caller passing an alias + # finds rows that were stored under the canonical id (and + # vice versa). resolve_project_name returns the input + # unchanged for unregistered names so empty-string queries + # for "no project scope" still work. query += " AND project = ?" - params.append(project) + params.append(resolve_project_name(project)) if status is not None: query += " AND status = ?" params.append(status) diff --git a/src/atocore/projects/registry.py b/src/atocore/projects/registry.py index f1a7843..a027eb5 100644 --- a/src/atocore/projects/registry.py +++ b/src/atocore/projects/registry.py @@ -254,6 +254,30 @@ def get_registered_project(project_name: str) -> RegisteredProject | None: return None +def resolve_project_name(name: str | None) -> str: + """Canonicalize a project name through the registry. + + Returns the canonical ``project_id`` if the input matches any + registered project's id or alias. Returns the input unchanged + when it's empty or not in the registry — the second case keeps + backwards compatibility with hand-curated state, memories, and + interactions that predate the registry, or for projects that + are intentionally not registered. + + This helper is the single canonicalization boundary for project + names across the trust hierarchy. Every read/write that takes a + project name should pass it through ``resolve_project_name`` + before storing or querying. The contract is documented in + ``docs/architecture/representation-authority.md``. + """ + if not name: + return name or "" + project = get_registered_project(name) + if project is not None: + return project.project_id + return name + + def refresh_registered_project(project_name: str, purge_deleted: bool = False) -> dict: """Ingest all configured source roots for a registered project. diff --git a/tests/conftest.py b/tests/conftest.py index 599dc51..00981f5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ """pytest configuration and shared fixtures.""" +import json import os import sys import tempfile @@ -29,6 +30,45 @@ def tmp_data_dir(tmp_path): return tmp_path +@pytest.fixture +def project_registry(tmp_path, monkeypatch): + """Stand up an isolated project registry pointing at a temp file. + + Returns a callable that takes one or more (project_id, [aliases]) + tuples and writes them into the registry, then forces the in-process + settings singleton to re-resolve. Use this when a test needs the + canonicalization helpers (resolve_project_name, get_registered_project) + to recognize aliases. + """ + registry_path = tmp_path / "test-project-registry.json" + + def _set(*projects): + payload = {"projects": []} + for entry in projects: + if isinstance(entry, str): + project_id, aliases = entry, [] + else: + project_id, aliases = entry + payload["projects"].append( + { + "id": project_id, + "aliases": list(aliases), + "description": f"test project {project_id}", + "ingest_roots": [ + {"source": "vault", "subpath": f"incoming/projects/{project_id}"} + ], + } + ) + registry_path.write_text(json.dumps(payload), encoding="utf-8") + monkeypatch.setenv("ATOCORE_PROJECT_REGISTRY_PATH", str(registry_path)) + from atocore import config + + config.settings = config.Settings() + return registry_path + + return _set + + @pytest.fixture def sample_markdown(tmp_path) -> Path: """Create a sample markdown file for testing.""" diff --git a/tests/test_interactions.py b/tests/test_interactions.py index 48685fa..dd39964 100644 --- a/tests/test_interactions.py +++ b/tests/test_interactions.py @@ -209,3 +209,96 @@ def test_list_interactions_endpoint_returns_summaries(tmp_data_dir): assert body["interactions"][0]["response_chars"] == 50 # The list endpoint never includes the full response body assert "response" not in body["interactions"][0] + + +# --- alias canonicalization on interaction capture/list ------------------- + + +def test_record_interaction_canonicalizes_project(project_registry): + """Capturing under an alias should store the canonical project id. + + Regression for codex's P2 finding: reinforcement and extraction + query memories by interaction.project; if the captured project is + a raw alias they would silently miss memories stored under the + canonical id. + """ + init_db() + project_registry(("p05-interferometer", ["p05", "interferometer"])) + + interaction = record_interaction( + prompt="quick capture", response="response body", project="p05", reinforce=False + ) + assert interaction.project == "p05-interferometer" + + fetched = get_interaction(interaction.id) + assert fetched.project == "p05-interferometer" + + +def test_list_interactions_canonicalizes_project_filter(project_registry): + init_db() + project_registry(("p06-polisher", ["p06", "polisher"])) + + record_interaction(prompt="a", response="ra", project="p06-polisher", reinforce=False) + record_interaction(prompt="b", response="rb", project="polisher", reinforce=False) + record_interaction(prompt="c", response="rc", project="atocore", reinforce=False) + + # Query by an alias should still find both p06 captures + via_alias = list_interactions(project="p06") + via_canonical = list_interactions(project="p06-polisher") + assert len(via_alias) == 2 + assert len(via_canonical) == 2 + assert {i.prompt for i in via_alias} == {"a", "b"} + + +# --- since filter format normalization ------------------------------------ + + +def test_list_interactions_since_accepts_iso_with_t_separator(tmp_data_dir): + init_db() + record_interaction(prompt="early", response="r", reinforce=False) + time.sleep(1.05) + pivot = record_interaction(prompt="late", response="r", reinforce=False) + + # pivot.created_at is in storage format 'YYYY-MM-DD HH:MM:SS'. + # Build the equivalent ISO 8601 with 'T' that an external client + # would naturally send. + iso_with_t = pivot.created_at.replace(" ", "T") + items = list_interactions(since=iso_with_t) + assert any(i.id == pivot.id for i in items) + # The early row must also be excluded if its timestamp is strictly + # before the pivot — since is inclusive on the cutoff + early_ids = {i.id for i in items if i.prompt == "early"} + assert early_ids == set() or len(items) >= 1 + + +def test_list_interactions_since_accepts_z_suffix(tmp_data_dir): + init_db() + pivot = record_interaction(prompt="pivot", response="r", reinforce=False) + time.sleep(1.05) + after = record_interaction(prompt="after", response="r", reinforce=False) + + iso_with_z = pivot.created_at.replace(" ", "T") + "Z" + items = list_interactions(since=iso_with_z) + ids = {i.id for i in items} + assert pivot.id in ids + assert after.id in ids + + +def test_list_interactions_since_accepts_offset(tmp_data_dir): + init_db() + pivot = record_interaction(prompt="pivot", response="r", reinforce=False) + time.sleep(1.05) + after = record_interaction(prompt="after", response="r", reinforce=False) + + iso_with_offset = pivot.created_at.replace(" ", "T") + "+00:00" + items = list_interactions(since=iso_with_offset) + assert any(i.id == after.id for i in items) + + +def test_list_interactions_since_storage_format_still_works(tmp_data_dir): + """The bare storage format must still work for backwards compatibility.""" + init_db() + pivot = record_interaction(prompt="pivot", response="r", reinforce=False) + + items = list_interactions(since=pivot.created_at) + assert any(i.id == pivot.id for i in items) diff --git a/tests/test_project_state.py b/tests/test_project_state.py index dcb7a90..ab5c54e 100644 --- a/tests/test_project_state.py +++ b/tests/test_project_state.py @@ -131,3 +131,68 @@ def test_format_project_state(): def test_format_empty(): """Test formatting empty state.""" assert format_project_state([]) == "" + + +# --- Alias canonicalization regression tests -------------------------------- + + +def test_set_state_canonicalizes_alias(project_registry): + """Writing state via an alias should land under the canonical project id. + + Regression for codex's P1 finding: previously /project/state with + project="p05" created a separate alias row that later context builds + (which canonicalize the hint) would never see. + """ + project_registry(("p05-interferometer", ["p05", "interferometer"])) + + set_state("p05", "status", "next_focus", "Wave 2 ingestion") + + # The state must be reachable via every alias AND the canonical id + via_alias = get_state("p05") + via_canonical = get_state("p05-interferometer") + via_other_alias = get_state("interferometer") + + assert len(via_alias) == 1 + assert len(via_canonical) == 1 + assert len(via_other_alias) == 1 + # All three reads return the same row id (no fragmented duplicates) + assert via_alias[0].id == via_canonical[0].id == via_other_alias[0].id + assert via_canonical[0].value == "Wave 2 ingestion" + + +def test_get_state_canonicalizes_alias_after_canonical_write(project_registry): + """Reading via an alias should find state written under the canonical id.""" + project_registry(("p04-gigabit", ["p04", "gigabit"])) + + set_state("p04-gigabit", "status", "phase", "Phase 1 baseline") + via_alias = get_state("gigabit") + + assert len(via_alias) == 1 + assert via_alias[0].value == "Phase 1 baseline" + + +def test_invalidate_state_canonicalizes_alias(project_registry): + """Invalidating via an alias should hit the canonical row.""" + project_registry(("p06-polisher", ["p06", "polisher"])) + + set_state("p06-polisher", "decision", "frame", "kinematic mounts") + success = invalidate_state("polisher", "decision", "frame") + + assert success is True + active = get_state("p06-polisher") + assert len(active) == 0 + + +def test_unregistered_project_state_still_works(project_registry): + """Hand-curated state for an unregistered project must still round-trip. + + Backwards compatibility with state created before the project + registry existed: resolve_project_name returns the input unchanged + when the registry has no record, so the raw name is used as-is. + """ + project_registry() # empty registry + + set_state("orphan-project", "status", "phase", "Standalone") + entries = get_state("orphan-project") + assert len(entries) == 1 + assert entries[0].value == "Standalone" diff --git a/tests/test_reinforcement.py b/tests/test_reinforcement.py index 7d23675..7537fa4 100644 --- a/tests/test_reinforcement.py +++ b/tests/test_reinforcement.py @@ -314,3 +314,62 @@ def test_api_post_interactions_accepts_reinforce_false(tmp_data_dir): reloaded = [m for m in get_memories(memory_type="preference", limit=20) if m.id == mem.id][0] assert reloaded.confidence == 0.5 assert reloaded.reference_count == 0 + + +# --- alias canonicalization end-to-end ------------------------------------- + + +def test_reinforcement_works_when_capture_uses_alias(project_registry): + """End-to-end: capture under an alias, seed memory under canonical id, + verify reinforcement still finds and bumps the memory. + + Regression for codex's P2 finding: previously interaction.project + was stored verbatim and reinforcement queried memories using that + raw value, so capturing under "p05" while memories live under + "p05-interferometer" silently missed everything. + """ + init_db() + project_registry(("p05-interferometer", ["p05", "interferometer"])) + + # Seed an active memory under the CANONICAL id + mem = create_memory( + memory_type="project", + content="the lateral support pads use GF-PTFE for thermal stability", + project="p05-interferometer", + confidence=0.5, + ) + + # Capture an interaction under the ALIAS — this is the bug case + record_interaction( + prompt="status update", + response=( + "Quick note: the lateral support pads use GF-PTFE for thermal " + "stability and that's still the current selection." + ), + project="p05", + ) + + # The seeded memory should have been reinforced + reloaded = [ + m + for m in get_memories(memory_type="project", project="p05-interferometer", limit=20) + if m.id == mem.id + ][0] + assert reloaded.confidence > 0.5 + assert reloaded.reference_count == 1 + + +def test_get_memories_filter_by_alias(project_registry): + """Filtering memories by an alias should find rows stored under canonical.""" + init_db() + project_registry(("p04-gigabit", ["p04", "gigabit"])) + + create_memory(memory_type="project", content="m1", project="p04-gigabit") + create_memory(memory_type="project", content="m2", project="gigabit") + + via_alias = get_memories(memory_type="project", project="p04") + via_canonical = get_memories(memory_type="project", project="p04-gigabit") + + assert len(via_alias) == 2 + assert len(via_canonical) == 2 + assert {m.content for m in via_alias} == {"m1", "m2"}