Session 1 of the four-session plan. Empirically exercises the Phase 9
loop (capture -> reinforce -> extract) for the first time and lands
three small hygiene fixes.
Validation script + report
--------------------------
scripts/phase9_first_real_use.py — reproducible script that:
- sets up an isolated SQLite + Chroma store under
data/validation/phase9-first-use (gitignored)
- seeds 3 active memories
- runs 8 sample interactions through capture + reinforce + extract
- prints what each step produced and reinforcement state at the end
- supports --json output for downstream tooling
docs/phase9-first-real-use.md — narrative report of the run with:
- extraction results table (8/8 expectations met exactly)
- the empirical finding that REINFORCEMENT MATCHED ZERO seeds
despite sample 5 clearly echoing the rebase preference memory
- root cause analysis: the substring matcher is too brittle for
natural paraphrases (e.g. "prefers" vs "I prefer", "history"
vs "the history")
- recommended fix: replace substring matcher with a token-overlap
matcher (>=70% of memory tokens present in response, with
light stemming and a small stop list)
- explicit note that the fix is queued as a follow-up commit, not
bundled into the report — keeps the audit trail clean
Key extraction results from the run:
- all 7 heading/sentence rules fired correctly
- 0 false positives on the prose-only sample (the most important
sanity check)
- long content preserved without truncation
- dedup correctly kept three distinct cues from one interaction
- project scoping flowed cleanly through the pipeline
Hygiene 1: FastAPI lifespan migration (src/atocore/main.py)
- Replaced @app.on_event("startup") with the modern @asynccontextmanager
lifespan handler
- Same setup work (setup_logging, ensure_runtime_dirs, init_db,
init_project_state_schema, startup_ready log)
- Removes the two on_event deprecation warnings from every test run
- Test suite now shows 1 warning instead of 3
Hygiene 2: EXTRACTOR_VERSION constant (src/atocore/memory/extractor.py)
- Added EXTRACTOR_VERSION = "0.1.0" with a versioned change log comment
- MemoryCandidate dataclass carries extractor_version on every candidate
- POST /interactions/{id}/extract response now includes extractor_version
on both the top level (current run) and on each candidate
- Implements the versioning requirement called out in
docs/architecture/promotion-rules.md so old candidates can be
identified and re-evaluated when the rule set evolves
Hygiene 3: ~/.git-credentials cleanup (out-of-tree, not committed)
- Removed the dead OAUTH_USER:<jwt> line for dalidou:3000 that was
being silently rewritten by the system credential manager on every
push attempt
- Configured credential.http://dalidou:3000.helper with the empty-string
sentinel pattern so the URL-specific helper chain is exactly
["", store] instead of inheriting the system-level "manager" helper
that ships with Git for Windows
- Same fix for the 100.80.199.40 (Tailscale) entry
- Verified end to end: a fresh push using only the cleaned credentials
file (no embedded URL) authenticates as Antoine and lands cleanly
Full suite: 160 passing (no change from previous), 1 warning
(was 3) thanks to the lifespan migration.
394 lines · 15 KiB · Python
"""Phase 9 first-real-use validation script.
|
|
|
|

Captures a small set of representative interactions drawn from a real
working session, runs the full Phase 9 loop (capture -> reinforce ->
extract) over them, and prints what each step produced. The intent is
to generate empirical evidence about the extractor's behaviour against
prose that wasn't written to make the test pass.

Usage:
    python scripts/phase9_first_real_use.py [--data-dir PATH] [--json]

The script writes a fresh isolated SQLite + Chroma store under the
given data dir (default: ./data/validation/phase9-first-use). Any
existing contents of that directory are deleted at the start of each
run. The data dir is gitignored so the script can be re-run cleanly.

Each interaction is printed with:
  - the captured interaction id
  - the reinforcement results (which seeded memories were echoed)
  - the extraction results (which candidates were proposed and why)
  - notes on what the extractor MISSED (manually annotated below)

The output is intentionally human-readable so the run can be saved as
the body of docs/phase9-first-real-use.md. Pass --json to emit a
machine-readable summary instead.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import shutil
|
|
import sys
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
# Make the package importable when run directly from the repo root.
# The script is invoked as scripts/phase9_first_real_use.py, so two
# ``.parent`` hops from the resolved file path give the repo root; the
# ``src`` directory under it is pushed to the front of sys.path so the
# function-local ``atocore`` imports below can resolve.
_REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(_REPO_ROOT / "src"))
|
|
|
|
|
|
@dataclass
class SampleInteraction:
    """One scripted prompt/response exchange fed through the Phase 9 loop."""

    # Short slug identifying the sample in the printed and JSON reports.
    label: str
    # Text passed to record_interaction as ``prompt``.
    prompt: str
    # Text passed to record_interaction as ``response``.
    response: str
    # Project name the interaction is recorded under.
    project: str
    # Human-written expectation for this sample; echoed in the report
    # next to the candidates that were actually produced.
    notes_for_human: str
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Fixture data: eight exchanges paraphrased from real AtoCore working
# sessions. Every entry pairs a response (usually containing at least one
# structural cue such as a "## Decision:" heading or an "I prefer"
# sentence) with a human-written note describing what the extractor is
# expected to do with it. Keyword arguments follow the field order of
# SampleInteraction.
# ---------------------------------------------------------------------------

SAMPLES: list[SampleInteraction] = [
    # 1. Decision heading surrounded by ordinary prose.
    SampleInteraction(
        label="exdev-mount-merge-decision",
        prompt="The TMPDIR fix didn't work, the LOCAL_COPY_PATH change didn't work, what now?",
        response=(
            "We've been chasing this through three layers without success. "
            "Time to stop guessing and apply the simple fix Dalidou Claude "
            "proposed in the very first diagnosis.\n\n"
            "## Decision: merge the two bind mounts into one\n\n"
            "Both /srv/storage/repos and /srv/storage/gitea/data live on "
            "/dev/sdb1, so the move is a same-filesystem rename — instant, "
            "atomic, no data copy. After the move, the EXDEV class of bug "
            "becomes structurally impossible because there is only one "
            "mount inside the container."
        ),
        project="atocore",
        notes_for_human=(
            "Should fire decision_heading rule. Should NOT fire requirement "
            "or constraint rules. The 'becomes structurally impossible' line "
            "is prose, not a candidate."
        ),
    ),
    # 2. Fact heading; quoted error text lives in prose only.
    SampleInteraction(
        label="ownership-was-the-real-fix",
        prompt="Still failing after the mount merge. What's actually wrong?",
        response=(
            "Same rejection even after the mounts were merged, which kills "
            "the EXDEV theory. The error 'unable to migrate objects to "
            "permanent storage' is git's catch-all for any failure in "
            "tmp_objdir_migrate, and the real errno is being thrown away.\n\n"
            "## Fact: the actual root cause was ownership, not cross-device link\n\n"
            "After mv on the host, the moved directory had perms git could "
            "navigate for some subdirs but not others. Recursive chown "
            "git:git plus chmod u+rwX,g+rwX fixed it on the first try."
        ),
        project="atocore",
        notes_for_human=(
            "Should fire fact_heading rule, target type knowledge. The "
            "'unable to migrate' string is inside prose and should NOT "
            "trigger any rule."
        ),
    ),
    # 3. Decision heading with long content.
    SampleInteraction(
        label="memory-vs-entity-canonical-home",
        prompt="Does a Decision live in memories or entities?",
        response=(
            "Both layers exist for different reasons. After thinking through "
            "the ontology question, I'm landing on a clean rule: one "
            "canonical home per concept, with explicit graduation when a "
            "memory eventually deserves structure.\n\n"
            "## Decision: project, knowledge, and adaptation memory types graduate to entities\n\n"
            "The identity, preference, and episodic types stay in the memory "
            "layer because they're personal and unstructured. The other "
            "three become entities once the engineering layer V1 ships. The "
            "original memory rows are NEVER deleted — they become frozen "
            "historical pointers to whichever entity they graduated into."
        ),
        project="atocore",
        notes_for_human=(
            "Should fire decision_heading rule. Long content (over 100 "
            "chars) — verifies the max-length cap of 280 doesn't truncate."
        ),
    ),
    # 4. Decision heading next to jargon that should be ignored.
    SampleInteraction(
        label="auto-promotion-deferred",
        prompt="Should the engineering layer auto-promote candidates?",
        response=(
            "The trust hierarchy is the whole point of AtoCore. If we let "
            "the system auto-promote, we trade away the 'bad memory is "
            "worse than no memory' invariant for a marginal speed gain.\n\n"
            "## Decision: no auto-promotion in V1, ever\n\n"
            "The schema and API are designed so policies can be added "
            "later without migration, but the V1 stance is that every "
            "promotion needs a human signoff. The hard line: nothing ever "
            "moves into trusted project state automatically, even with the "
            "engineering layer running."
        ),
        project="atocore",
        notes_for_human=(
            "Should fire decision_heading rule. Has 'invariant' jargon — "
            "verify the extractor doesn't get distracted."
        ),
    ),
    # 5. Sentence-level preference cue, no heading.
    SampleInteraction(
        label="preference-rebase-workflow",
        prompt="Should we use rebase or merge commits for this branch?",
        response=(
            "I prefer rebase-based workflows because the history stays "
            "linear and reviewers have an easier time. For the dalidou "
            "storage-foundation branch we'll rebase before merging into "
            "main so the timeline reads as one coherent narrative."
        ),
        project="atocore",
        notes_for_human=(
            "Should fire preference_sentence rule. Note: the rule looks "
            "for 'I prefer' which is exactly the wording used."
        ),
    ),
    # 6. Constraint heading (level-3) in a different project.
    SampleInteraction(
        label="constraint-from-doc-cite",
        prompt="What's the budget situation for p05 right now?",
        response=(
            "Looking at the latest design notes, the constraint is fairly "
            "tight on the optical path side.\n\n"
            "### Constraint: total optical path stability must stay within 10 nm RMS at 1 kHz\n\n"
            "The error budget allocates 6 nm to the laser source and 4 nm "
            "to the mechanical mounts and frame combined."
        ),
        project="p05-interferometer",
        notes_for_human=(
            "Should fire constraint_heading rule with type=project. The "
            "second sentence ('error budget allocates 6 nm') is descriptive "
            "and should NOT fire any rule because it doesn't have a "
            "structural cue."
        ),
    ),
    # 7. Negative control: prose with no cues at all.
    SampleInteraction(
        label="prose-only-no-cues",
        prompt="What should I work on next?",
        response=(
            "Looking at the current state of AtoCore, the next natural "
            "step is to validate Phase 9 against real interactions before "
            "starting the engineering layer implementation. Empirical "
            "evidence is missing and the planning docs would benefit from "
            "real signal."
        ),
        project="atocore",
        notes_for_human=(
            "Should produce ZERO candidates. Pure recommendation prose, no "
            "structural cues. If this fires anything the extractor is too "
            "loose."
        ),
    ),
    # 8. Three different headings in a single response.
    SampleInteraction(
        label="multiple-cues-in-one-interaction",
        prompt="Summarize today's polisher session",
        response=(
            "We worked through three things in the polisher session today.\n\n"
            "## Decision: defer the laser interlock redesign to after the July milestone\n\n"
            "## Constraint: the calibration routine must complete in under 90 seconds for production use\n\n"
            "## Requirement: the polisher must hold position to within 0.5 micron at 1 g loading\n\n"
            "Action items captured for the next sync."
        ),
        project="p06-polisher",
        notes_for_human=(
            "Three rules should fire on the same interaction: "
            "decision_heading -> adaptation, constraint_heading -> project, "
            "requirement_heading -> project. Verify dedup doesn't merge them."
        ),
    ),
]
|
|
|
|
|
|
def setup_environment(data_dir: Path) -> None:
    """Point AtoCore at a freshly emptied, isolated data directory.

    Whatever currently exists at ``data_dir`` is removed recursively and the
    directory is recreated empty. ``ATOCORE_DATA_DIR`` is then set to that
    path and ``ATOCORE_DEBUG`` is set to ``"true"`` unless the environment
    already defines it. Finally the module-level settings object and the
    vector-store handle are reset.

    Args:
        data_dir: Directory to wipe and use for this run.
    """
    already_there = data_dir.exists()
    if already_there:
        shutil.rmtree(data_dir)
    data_dir.mkdir(parents=True, exist_ok=True)

    os.environ["ATOCORE_DATA_DIR"] = str(data_dir)
    if "ATOCORE_DEBUG" not in os.environ:
        os.environ["ATOCORE_DEBUG"] = "true"

    # Rebuild the settings object so it is constructed with the env vars
    # set above in place. The imports stay in this exact position: the
    # vector-store module is imported only after the settings are replaced.
    import atocore.config as atocore_config

    atocore_config.settings = atocore_config.Settings()

    import atocore.retrieval.vector_store as vector_store

    # Drop the module's ``_store`` handle — presumably a cached store
    # instance that should be rebuilt against the new data dir (confirm
    # against vector_store).
    vector_store._store = None
|
|
|
|
|
|
def seed_memories() -> dict[str, str]:
    """Create the three seed memories and return their ids keyed by label.

    The seeds give the reinforcement step something to match against.
    They are created in the order listed below via ``create_memory``.

    Returns:
        Mapping of seed label to the ``id`` of the created memory.
    """
    from atocore.memory.service import create_memory

    # (label, memory_type, content, confidence)
    seed_specs = (
        (
            "pref_rebase",
            "preference",
            "prefers rebase-based workflows because history stays linear",
            0.6,
        ),
        (
            "pref_concise",
            "preference",
            "writes commit messages focused on the why, not the what",
            0.6,
        ),
        (
            "identity_runs_atocore",
            "identity",
            "mechanical engineer who runs AtoCore for context engineering",
            0.9,
        ),
    )

    return {
        label: create_memory(
            memory_type=kind,
            content=text,
            confidence=score,
        ).id
        for label, kind, text, score in seed_specs
    }
|
|
|
|
|
|
def run_sample(sample: SampleInteraction) -> dict:
    """Record one sample interaction, extract candidates, and summarise.

    The sample is stored through ``record_interaction`` with
    ``reinforce=True`` and a fixed client and session id, and the recorded
    interaction is then handed to ``extract_candidates_from_interaction``.

    Args:
        sample: The scripted exchange to run.

    Returns:
        A dict with the sample's label, project and expectation notes, the
        recorded interaction id, the candidate count, and one dict per
        candidate (memory type, rule, content, and the first 120
        characters of its source span).
    """
    from atocore.interactions.service import record_interaction
    from atocore.memory.extractor import extract_candidates_from_interaction

    recorded = record_interaction(
        prompt=sample.prompt,
        response=sample.response,
        project=sample.project,
        client="phase9-first-real-use",
        session_id="first-real-use",
        reinforce=True,
    )
    extracted = extract_candidates_from_interaction(recorded)

    candidate_rows = []
    for cand in extracted:
        candidate_rows.append(
            {
                "memory_type": cand.memory_type,
                "rule": cand.rule,
                "content": cand.content,
                # Only a preview of the span is kept in the report.
                "source_span": cand.source_span[:120],
            }
        )

    summary = {
        "label": sample.label,
        "project": sample.project,
        "interaction_id": recorded.id,
        "expected_notes": sample.notes_for_human,
        "candidate_count": len(extracted),
        "candidates": candidate_rows,
    }
    return summary
|
|
|
|
|
|
def report_seed_memory_state(seeded_ids: dict[str, str]) -> dict:
    """Snapshot the current state of each seeded memory.

    Args:
        seeded_ids: Mapping of seed label to memory id, as returned by
            ``seed_memories``.

    Returns:
        A dict keyed by seed label. Each value is ``None`` when the memory
        id is not among the rows returned by ``get_memories(limit=200)``,
        otherwise a dict with ``id``, ``memory_type``, ``content_preview``
        (first 80 characters), ``confidence`` (rounded to 4 places),
        ``reference_count`` and ``last_referenced_at``.
    """
    from atocore.memory.service import get_memories

    if not seeded_ids:
        # Nothing to look up, so skip the store query entirely.
        return {}

    # Fetch the memory list once and index it by id. The listing does not
    # depend on the label being processed, so re-querying and linearly
    # scanning it for every seed was redundant work. ``setdefault`` keeps
    # the first row seen for an id, matching a "first match wins" scan.
    by_id: dict = {}
    for memory in get_memories(limit=200):
        by_id.setdefault(memory.id, memory)

    state: dict = {}
    for label, mid in seeded_ids.items():
        m = by_id.get(mid)
        if m is None:
            state[label] = None
            continue
        state[label] = {
            "id": m.id,
            "memory_type": m.memory_type,
            "content_preview": m.content[:80],
            "confidence": round(m.confidence, 4),
            "reference_count": m.reference_count,
            "last_referenced_at": m.last_referenced_at,
        }
    return state
|
|
|
|
|
|
def main() -> int:
    """Run the validation end to end and print the report.

    Parses ``--data-dir`` and ``--json``, prepares the isolated data
    directory, initialises the database and project-state schema, seeds
    the memories, runs every entry of ``SAMPLES``, and then writes either a
    JSON document or a human-readable report to stdout.

    Returns:
        ``0`` in both output modes.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--data-dir",
        default=str(_REPO_ROOT / "data" / "validation" / "phase9-first-use"),
        help="Isolated data directory to use for this validation run",
    )
    cli.add_argument(
        "--json",
        action="store_true",
        help="Emit machine-readable JSON instead of human prose",
    )
    opts = cli.parse_args()

    data_dir = Path(opts.data_dir).resolve()
    setup_environment(data_dir)

    # Imported only after setup_environment() has pointed AtoCore at the
    # isolated data directory.
    from atocore.models.database import init_db
    from atocore.context.project_state import init_project_state_schema

    init_db()
    init_project_state_schema()

    seeded = seed_memories()
    sample_results = []
    for sample in SAMPLES:
        sample_results.append(run_sample(sample))
    final_seed_state = report_seed_memory_state(seeded)

    if opts.json:
        payload = {
            "data_dir": str(data_dir),
            "seeded_memories_initial": list(seeded.keys()),
            "samples": sample_results,
            "seed_memory_state_after_run": final_seed_state,
        }
        # default=str stringifies any value json cannot encode natively.
        json.dump(payload, sys.stdout, indent=2, default=str)
        return 0

    heavy_rule = "=" * 78
    light_rule = "-" * 78

    # Header and seed listing.
    print(heavy_rule)
    print("Phase 9 first-real-use validation run")
    print(heavy_rule)
    print(f"Isolated data dir: {data_dir}")
    print()
    print("Seeded the memory store with 3 active memories:")
    for seed_label, seed_id in seeded.items():
        print(f" - {seed_label} ({seed_id[:8]})")
    print()
    print(light_rule)
    print(f"Running {len(SAMPLES)} sample interactions ...")
    print(light_rule)

    # One section per sample.
    for entry in sample_results:
        print()
        print(f"## {entry['label']} [project={entry['project']}]")
        print(f" interaction_id={entry['interaction_id'][:8]}")
        print(f" expected: {entry['expected_notes']}")
        print(f" candidates produced: {entry['candidate_count']}")
        position = 0
        for cand in entry["candidates"]:
            position += 1
            print(
                f" [{position}] type={cand['memory_type']:11s} "
                f"rule={cand['rule']:21s} "
                f"content={cand['content']!r}"
            )

    # Reinforcement summary for the seeded memories.
    print()
    print(light_rule)
    print("Reinforcement state on seeded memories AFTER all interactions:")
    print(light_rule)
    for seed_label, snapshot in final_seed_state.items():
        if snapshot is not None:
            print(
                f" {seed_label}: confidence={snapshot['confidence']:.4f} "
                f"refs={snapshot['reference_count']} "
                f"last={snapshot['last_referenced_at'] or '-'}"
            )
        else:
            print(f" {seed_label}: <missing>")

    print()
    print(heavy_rule)
    print("Run complete. Data written to:", data_dir)
    print(heavy_rule)
    return 0
|
|
|
|
|
|
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|