Compare commits
4 Commits
codex/audi
...
69c971708a
| Author | SHA1 | Date | |
|---|---|---|---|
| 69c971708a | |||
| 8951c624fe | |||
| 1a2ee5e07f | |||
| 9b149d4bfd |
@@ -31,10 +31,11 @@ log() { printf '[%s] %s\n' "$TIMESTAMP" "$*"; }
|
|||||||
# The Python script needs the atocore source on PYTHONPATH
|
# The Python script needs the atocore source on PYTHONPATH
|
||||||
export PYTHONPATH="$APP_DIR/src:${PYTHONPATH:-}"
|
export PYTHONPATH="$APP_DIR/src:${PYTHONPATH:-}"
|
||||||
|
|
||||||
log "=== AtoCore batch LLM extraction starting ==="
|
log "=== AtoCore batch extraction + triage starting ==="
|
||||||
log "URL=$ATOCORE_URL LIMIT=$LIMIT"
|
log "URL=$ATOCORE_URL LIMIT=$LIMIT"
|
||||||
|
|
||||||
# Run the host-side extraction script
|
# Step A: Extract candidates from recent interactions
|
||||||
|
log "Step A: LLM extraction"
|
||||||
python3 "$APP_DIR/scripts/batch_llm_extract_live.py" \
|
python3 "$APP_DIR/scripts/batch_llm_extract_live.py" \
|
||||||
--base-url "$ATOCORE_URL" \
|
--base-url "$ATOCORE_URL" \
|
||||||
--limit "$LIMIT" \
|
--limit "$LIMIT" \
|
||||||
@@ -42,4 +43,12 @@ python3 "$APP_DIR/scripts/batch_llm_extract_live.py" \
|
|||||||
log "WARN: batch extraction failed (non-blocking)"
|
log "WARN: batch extraction failed (non-blocking)"
|
||||||
}
|
}
|
||||||
|
|
||||||
log "=== AtoCore batch LLM extraction complete ==="
|
# Step B: Auto-triage candidates in the queue
|
||||||
|
log "Step B: auto-triage"
|
||||||
|
python3 "$APP_DIR/scripts/auto_triage.py" \
|
||||||
|
--base-url "$ATOCORE_URL" \
|
||||||
|
2>&1 || {
|
||||||
|
log "WARN: auto-triage failed (non-blocking)"
|
||||||
|
}
|
||||||
|
|
||||||
|
log "=== AtoCore batch extraction + triage complete ==="
|
||||||
|
|||||||
247
scripts/auto_triage.py
Normal file
247
scripts/auto_triage.py
Normal file
@@ -0,0 +1,247 @@
|
|||||||
|
"""Auto-triage: LLM second-pass over candidate memories.
|
||||||
|
|
||||||
|
Fetches all status=candidate memories from the AtoCore API, asks
|
||||||
|
a triage model (via claude -p) to classify each as promote / reject /
|
||||||
|
needs_human, and executes the verdict via the promote/reject endpoints.
|
||||||
|
Only needs_human candidates remain in the queue for manual review.
|
||||||
|
|
||||||
|
Trust model:
|
||||||
|
- Auto-promote: model says promote AND confidence >= 0.8 AND no
|
||||||
|
duplicate content in existing active memories
|
||||||
|
- Auto-reject: model says reject
|
||||||
|
- needs_human: everything else stays in queue
|
||||||
|
|
||||||
|
Runs host-side (same as batch extraction) because it needs the
|
||||||
|
claude CLI. Intended to be called after batch-extract.sh in the
|
||||||
|
nightly cron, or manually.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
|
||||||
|
python3 scripts/auto_triage.py --base-url http://localhost:8100
|
||||||
|
python3 scripts/auto_triage.py --dry-run # preview without executing
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import urllib.error
|
||||||
|
import urllib.parse
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
DEFAULT_BASE_URL = os.environ.get("ATOCORE_BASE_URL", "http://localhost:8100")
|
||||||
|
DEFAULT_MODEL = os.environ.get("ATOCORE_TRIAGE_MODEL", "sonnet")
|
||||||
|
DEFAULT_TIMEOUT_S = float(os.environ.get("ATOCORE_TRIAGE_TIMEOUT_S", "60"))
|
||||||
|
AUTO_PROMOTE_MIN_CONFIDENCE = 0.8
|
||||||
|
|
||||||
|
TRIAGE_SYSTEM_PROMPT = """You are a memory triage reviewer for a personal context engine called AtoCore. You review candidate memories extracted from LLM conversations and decide whether each should be promoted to active status, rejected, or flagged for human review.
|
||||||
|
|
||||||
|
You will receive:
|
||||||
|
- The candidate memory content and type
|
||||||
|
- A list of existing active memories for the same project (to check for duplicates)
|
||||||
|
|
||||||
|
For each candidate, output exactly one JSON object:
|
||||||
|
|
||||||
|
{"verdict": "promote|reject|needs_human", "confidence": 0.0-1.0, "reason": "one sentence"}
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
|
||||||
|
1. PROMOTE when the candidate states a durable architectural fact, ratified decision, standing rule, or engineering constraint that is NOT already covered by an existing active memory. Confidence should reflect how certain you are this is worth keeping.
|
||||||
|
|
||||||
|
2. REJECT when the candidate is:
|
||||||
|
- A stale point-in-time snapshot ("live SHA is X", "36 active memories")
|
||||||
|
- An implementation detail too granular to be useful as standalone context
|
||||||
|
- A planned-but-not-implemented feature description
|
||||||
|
- A duplicate or near-duplicate of an existing active memory
|
||||||
|
- A session observation or conversational filler
|
||||||
|
- A process rule that belongs in DEV-LEDGER.md or AGENTS.md, not memory
|
||||||
|
|
||||||
|
3. NEEDS_HUMAN when you're genuinely unsure — the candidate might be valuable but you can't tell without domain knowledge. This should be rare (< 20% of candidates).
|
||||||
|
|
||||||
|
4. Output ONLY the JSON object. No prose, no markdown, no explanation outside the reason field."""
|
||||||
|
|
||||||
|
_sandbox_cwd = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_sandbox_cwd():
    """Return a scratch working directory for claude subprocess calls.

    Created lazily on first use and reused for the rest of the run, so the
    CLI never executes inside the caller's working tree.
    """
    global _sandbox_cwd
    if _sandbox_cwd is None:
        _sandbox_cwd = tempfile.mkdtemp(prefix="ato-triage-")
    return _sandbox_cwd
|
||||||
|
|
||||||
|
|
||||||
|
def api_get(base_url, path, timeout=10):
    """GET ``base_url + path`` and return the decoded JSON response body."""
    request = urllib.request.Request(f"{base_url}{path}")
    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = response.read()
    return json.loads(payload.decode("utf-8"))
|
||||||
|
|
||||||
|
|
||||||
|
def api_post(base_url, path, body=None, timeout=10):
    """POST *body* as JSON to ``base_url + path`` and return the decoded reply.

    A falsy *body* is sent as an empty JSON object.
    """
    payload = json.dumps(body or {}).encode("utf-8")
    request = urllib.request.Request(
        f"{base_url}{path}",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return json.loads(response.read().decode("utf-8"))
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_active_memories_for_project(base_url, project):
    """Return up to 50 active memories (scoped to *project* when given).

    Used as the dedup reference set shown to the triage model.
    """
    query = "active_only=true&limit=50"
    if project:
        query = f"{query}&project={urllib.parse.quote(project)}"
    payload = api_get(base_url, f"/memory?{query}")
    return payload.get("memories", [])
|
||||||
|
|
||||||
|
|
||||||
|
def triage_one(candidate, active_memories, model, timeout_s):
    """Ask the triage model (``claude -p``) to classify one candidate.

    Every failure mode — missing CLI, timeout, subprocess error, non-zero
    exit — degrades to a needs_human verdict so the candidate stays queued.
    """
    if shutil.which("claude") is None:
        return {"verdict": "needs_human", "confidence": 0.0, "reason": "claude CLI not available"}

    # Show at most 20 active memories (truncated) as the dedup reference.
    summary_lines = [
        f"- [{m['memory_type']}] {m['content'][:150]}" for m in active_memories[:20]
    ]
    active_summary = "\n".join(summary_lines) or "(no active memories for this project)"

    user_message = (
        f"CANDIDATE TO TRIAGE:\n"
        f" type: {candidate['memory_type']}\n"
        f" project: {candidate.get('project') or '(none)'}\n"
        f" content: {candidate['content']}\n\n"
        f"EXISTING ACTIVE MEMORIES FOR THIS PROJECT:\n{active_summary}\n\n"
        f"Return the JSON verdict now."
    )

    cli_args = [
        "claude", "-p",
        "--model", model,
        "--append-system-prompt", TRIAGE_SYSTEM_PROMPT,
        "--disable-slash-commands",
        user_message,
    ]

    try:
        proc = subprocess.run(
            cli_args,
            capture_output=True,
            text=True,
            timeout=timeout_s,
            cwd=get_sandbox_cwd(),  # keep CLI side effects out of the repo
            encoding="utf-8",
            errors="replace",
        )
    except subprocess.TimeoutExpired:
        return {"verdict": "needs_human", "confidence": 0.0, "reason": "triage model timed out"}
    except Exception as exc:
        return {"verdict": "needs_human", "confidence": 0.0, "reason": f"subprocess error: {exc}"}

    if proc.returncode != 0:
        return {"verdict": "needs_human", "confidence": 0.0, "reason": f"claude exit {proc.returncode}"}

    return parse_verdict((proc.stdout or "").strip())
|
||||||
|
|
||||||
|
|
||||||
|
def parse_verdict(raw):
    """Parse the triage model's output into a verdict dict.

    Tolerates markdown code fences and surrounding prose; anything that
    cannot be parsed into a known verdict degrades to needs_human with
    confidence 0.0 (parse failure) or 0.5 (bad confidence value).
    """
    text = raw.strip()

    # Peel off a markdown code fence (``` or ```json) if present.
    if text.startswith("```"):
        text = text.strip("`")
        first_newline = text.find("\n")
        if first_newline >= 0:
            text = text[first_newline + 1:]
        if text.endswith("```"):
            text = text[:-3]
        text = text.strip()

    # If prose surrounds the object, keep only the outermost {...} span.
    if not text.lstrip().startswith("{"):
        brace_open = text.find("{")
        brace_close = text.rfind("}")
        if brace_open >= 0 and brace_close > brace_open:
            text = text[brace_open:brace_close + 1]

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        return {"verdict": "needs_human", "confidence": 0.0, "reason": "failed to parse triage output"}

    verdict = str(parsed.get("verdict", "needs_human")).strip().lower()
    if verdict not in ("promote", "reject", "needs_human"):
        verdict = "needs_human"

    try:
        # Clamp to [0.0, 1.0]; non-numeric values fall back to neutral 0.5.
        confidence = max(0.0, min(1.0, float(parsed.get("confidence", 0.5))))
    except (TypeError, ValueError):
        confidence = 0.5

    reason = str(parsed.get("reason", "")).strip()[:200]
    return {"verdict": verdict, "confidence": confidence, "reason": reason}
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Fetch all status=candidate memories and execute triage verdicts.

    Flow: GET the candidate queue, cache the active memories per project
    (dedup context for the model), classify each candidate with
    ``triage_one``, then call the promote/reject API endpoints — or just
    print the intended action under ``--dry-run``.

    Bug fix: a failed promote/reject API call used to increment BOTH
    ``errors`` and the success counter (``promoted``/``rejected``), so the
    summary over-reported successes. Success counters now only advance
    when the API call succeeds (or in dry-run preview).
    """
    parser = argparse.ArgumentParser(description="Auto-triage candidate memories")
    parser.add_argument("--base-url", default=DEFAULT_BASE_URL)
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--dry-run", action="store_true", help="preview without executing")
    args = parser.parse_args()

    # Fetch candidates
    result = api_get(args.base_url, "/memory?status=candidate&limit=100")
    candidates = result.get("memories", [])
    print(f"candidates: {len(candidates)} model: {args.model} dry_run: {args.dry_run}")

    if not candidates:
        print("queue empty, nothing to triage")
        return

    # Cache active memories per project for dedup
    active_cache = {}
    promoted = rejected = needs_human = errors = 0

    for i, cand in enumerate(candidates, 1):
        project = cand.get("project") or ""
        if project not in active_cache:
            active_cache[project] = fetch_active_memories_for_project(args.base_url, project)

        verdict_obj = triage_one(cand, active_cache[project], args.model, DEFAULT_TIMEOUT_S)
        verdict = verdict_obj["verdict"]
        conf = verdict_obj["confidence"]
        reason = verdict_obj["reason"]

        mid = cand["id"]
        label = f"[{i:2d}/{len(candidates)}] {mid[:8]} [{cand['memory_type']}]"

        if verdict == "promote" and conf >= AUTO_PROMOTE_MIN_CONFIDENCE:
            if args.dry_run:
                print(f" WOULD PROMOTE {label} conf={conf:.2f} {reason}")
            else:
                try:
                    api_post(args.base_url, f"/memory/{mid}/promote")
                except Exception:
                    # Non-blocking batch job: record the failure and move on,
                    # but do NOT count this candidate as promoted.
                    errors += 1
                    continue
                print(f" PROMOTED {label} conf={conf:.2f} {reason}")
                # Newly promoted memory joins the dedup set for later candidates.
                active_cache[project].append(cand)
            promoted += 1
        elif verdict == "reject":
            if args.dry_run:
                print(f" WOULD REJECT {label} conf={conf:.2f} {reason}")
            else:
                try:
                    api_post(args.base_url, f"/memory/{mid}/reject")
                except Exception:
                    # Same as above: a failed reject is an error, not a reject.
                    errors += 1
                    continue
                print(f" REJECTED {label} conf={conf:.2f} {reason}")
            rejected += 1
        else:
            # Low-confidence promote or explicit needs_human: leave queued.
            print(f" NEEDS_HUMAN {label} conf={conf:.2f} {reason}")
            needs_human += 1

    print(f"\npromoted={promoted} rejected={rejected} needs_human={needs_human} errors={errors}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -100,6 +100,22 @@ def set_last_run(base_url, timestamp):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
_known_projects: set[str] = set()
|
||||||
|
|
||||||
|
|
||||||
|
def _load_known_projects(base_url):
|
||||||
|
"""Fetch registered project IDs from the API for R9 validation."""
|
||||||
|
global _known_projects
|
||||||
|
try:
|
||||||
|
data = api_get(base_url, "/projects")
|
||||||
|
_known_projects = {p["id"] for p in data.get("projects", [])}
|
||||||
|
for p in data.get("projects", []):
|
||||||
|
for alias in p.get("aliases", []):
|
||||||
|
_known_projects.add(alias)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def extract_one(prompt, response, project, model, timeout_s):
|
def extract_one(prompt, response, project, model, timeout_s):
|
||||||
"""Run claude -p on one interaction, return parsed candidates."""
|
"""Run claude -p on one interaction, return parsed candidates."""
|
||||||
if not shutil.which("claude"):
|
if not shutil.which("claude"):
|
||||||
@@ -178,6 +194,12 @@ def parse_candidates(raw, interaction_project):
|
|||||||
project = str(item.get("project") or "").strip()
|
project = str(item.get("project") or "").strip()
|
||||||
if not project and interaction_project:
|
if not project and interaction_project:
|
||||||
project = interaction_project
|
project = interaction_project
|
||||||
|
elif project and interaction_project and project != interaction_project:
|
||||||
|
# R9: model hallucinated an unrecognized project — fall back.
|
||||||
|
# The host-side script can't import the registry, so we
|
||||||
|
# check against a known set fetched from the API.
|
||||||
|
if project not in _known_projects:
|
||||||
|
project = interaction_project
|
||||||
conf = item.get("confidence", 0.5)
|
conf = item.get("confidence", 0.5)
|
||||||
if mem_type not in MEMORY_TYPES or not content:
|
if mem_type not in MEMORY_TYPES or not content:
|
||||||
continue
|
continue
|
||||||
@@ -202,8 +224,9 @@ def main():
|
|||||||
parser.add_argument("--model", default=DEFAULT_MODEL)
|
parser.add_argument("--model", default=DEFAULT_MODEL)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
_load_known_projects(args.base_url)
|
||||||
since = args.since or get_last_run(args.base_url)
|
since = args.since or get_last_run(args.base_url)
|
||||||
print(f"since={since or '(first run)'} limit={args.limit} model={args.model}")
|
print(f"since={since or '(first run)'} limit={args.limit} model={args.model} known_projects={len(_known_projects)}")
|
||||||
|
|
||||||
params = [f"limit={args.limit}"]
|
params = [f"limit={args.limit}"]
|
||||||
if since:
|
if since:
|
||||||
|
|||||||
1
scripts/eval_data/candidate_queue_2026-04-12.json
Normal file
1
scripts/eval_data/candidate_queue_2026-04-12.json
Normal file
File diff suppressed because one or more lines are too long
29
scripts/eval_data/candidate_queue_2026-04-12.txt
Normal file
29
scripts/eval_data/candidate_queue_2026-04-12.txt
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
1. [project ] proj=atocore AtoCore extraction must stay off the hot capture path; batch endpoint only
|
||||||
|
2. [project ] proj=atocore Auto-promote gate: confidence ≥0.8 AND no duplicate in active memories
|
||||||
|
3. [project ] proj=atocore AtoCore LLM extraction pipeline deployed on Dalidou host, runs via cron at 03:00 UTC via scripts/batch_llm_extract_live.py
|
||||||
|
4. [project ] proj=atocore LLM extractor runs host-side (not in container) because claude CLI not available in container environment
|
||||||
|
5. [project ] proj=atocore Host-side extraction script scripts/batch_llm_extract_live.py uses pure stdlib, no atocore imports for deployment simplicity
|
||||||
|
6. [project ] proj=atocore POST /admin/extract-batch accepts mode: rule|llm, POST /interactions/{id}/extract now mode-aware
|
||||||
|
7. [knowledge ] proj=atocore claude CLI 2.0.60 removed --no-session-persistence flag, extraction sessions now persist in claude history
|
||||||
|
8. [adaptation ] proj=atocore Durable memory extraction candidates must be <200 chars, stand-alone, typed as project|knowledge|preference|adaptation
|
||||||
|
9. [adaptation ] proj=atocore Memory extraction confidence defaults to 0.5, raise to 0.6 only for unambiguous committed claims
|
||||||
|
10. [project ] proj=atocore Live Dalidou is on commit 39d73e9, not e2895b5
|
||||||
|
11. [project ] proj=atocore Live harness is reproducible at 16/18 PASS
|
||||||
|
12. [project ] proj=atocore Live active memories count is 36
|
||||||
|
13. [project ] proj=atocore Wave 2 project-state entries on live: p04=5, p05=6, p06=6
|
||||||
|
14. [project ] proj=atocore R6 is fixed by commit 39d73e9
|
||||||
|
15. [project ] proj=atocore R9: R6 fix only covers empty project fallback; wrong non-empty model project can still override known interaction scope
|
||||||
|
16. [project ] proj=atocore R10: Phase 8 is baseline-complete but not primary-complete; OpenClaw client covers narrow read-oriented slice of API
|
||||||
|
17. [project ] proj=atocore Phase 8 is decent baseline integration milestone but not primary-ready yet
|
||||||
|
18. [project ] proj=atocore 4-step roadmap complete: extractor → harness → Wave 2 → OpenClaw
|
||||||
|
19. [project ] proj=atocore Codex audit loop proven across two full round-trips in one session
|
||||||
|
20. [project ] proj=atocore Session end state: 36 active memories, 17 project-state entries, 16/18 harness, 280 tests, main at 54d84b5
|
||||||
|
21. [project ] proj=atocore AtoCore extraction stays off the hot capture path; LLM extraction runs as scheduled batch, not inline with POST /interactions.
|
||||||
|
22. [project ] proj=atocore AtoCore auto-triage trust model: auto-promote only when confidence ≥0.8 AND no duplicate active memory; else needs_human.
|
||||||
|
23. [project ] proj=atocore Multi-model triage: use different model for triage reviewer than extractor (sonnet for extract)
|
||||||
|
24. [project ] proj=atocore R9 fix: when interaction has known project, prefer it over model's non-matching project unless model's is registered
|
||||||
|
25. [project ] proj=atocore R7 ranking fix: add overlap-density as secondary signal (overlap_count / memory_token_count)
|
||||||
|
26. [project ] proj=atocore Extraction pipeline skips interactions with response_chars < 50 to avoid low-signal content
|
||||||
|
27. [project ] proj=atocore AtoCore triage uses independent model from extractor (extractor: sonnet, triage: different model or different prompt).
|
||||||
|
28. [project ] proj=atocore AtoCore ranking scorer adds overlap-density (overlap_count / memory_tokens) as secondary signal to fix short-memory ranking.
|
||||||
|
29. [project ] proj=atocore AtoCore project trust: when interaction has known project and model returns different project, prefer interaction's project unless
|
||||||
@@ -257,6 +257,27 @@ def _parse_candidates(raw_output: str, interaction: Interaction) -> list[MemoryC
|
|||||||
project = str(item.get("project") or "").strip()
|
project = str(item.get("project") or "").strip()
|
||||||
if not project and interaction.project:
|
if not project and interaction.project:
|
||||||
project = interaction.project
|
project = interaction.project
|
||||||
|
elif project and interaction.project and project != interaction.project:
|
||||||
|
# R9: model returned a different project than the interaction's
|
||||||
|
# known scope. Trust the model's project only if it resolves
|
||||||
|
# to a known registered project (the registry normalizes
|
||||||
|
# aliases and returns the canonical id). If the model
|
||||||
|
# hallucinated an unregistered project name, fall back to
|
||||||
|
# the interaction's known project.
|
||||||
|
try:
|
||||||
|
from atocore.projects.registry import (
|
||||||
|
load_project_registry,
|
||||||
|
resolve_project_name,
|
||||||
|
)
|
||||||
|
|
||||||
|
registered_ids = {p.project_id for p in load_project_registry()}
|
||||||
|
resolved = resolve_project_name(project)
|
||||||
|
if resolved not in registered_ids:
|
||||||
|
project = interaction.project
|
||||||
|
else:
|
||||||
|
project = resolved
|
||||||
|
except Exception:
|
||||||
|
project = interaction.project
|
||||||
confidence_raw = item.get("confidence", 0.5)
|
confidence_raw = item.get("confidence", 0.5)
|
||||||
if mem_type not in MEMORY_TYPES:
|
if mem_type not in MEMORY_TYPES:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -446,20 +446,27 @@ def _rank_memories_for_query(
|
|||||||
) -> list["Memory"]:
|
) -> list["Memory"]:
|
||||||
"""Rerank a memory list by lexical overlap with a pre-tokenized query.
|
"""Rerank a memory list by lexical overlap with a pre-tokenized query.
|
||||||
|
|
||||||
Ordering key: (overlap_count DESC, confidence DESC). When a query
|
Primary key: overlap_density (overlap_count / memory_token_count),
|
||||||
shares no tokens with a memory, overlap is zero and confidence
|
which rewards short focused memories that match the query precisely
|
||||||
acts as the sole tiebreaker — which matches the pre-query
|
over long overview memories that incidentally share a few tokens.
|
||||||
behaviour and keeps no-query calls stable.
|
Secondary: absolute overlap count. Tertiary: confidence.
|
||||||
|
|
||||||
|
R7 fix: previously overlap_count alone was the primary key, so a
|
||||||
|
40-token overview memory with 3 overlapping tokens tied a 5-token
|
||||||
|
memory with 3 overlapping tokens, and the overview won on
|
||||||
|
confidence. Now the short memory's density (0.6) beats the
|
||||||
|
overview's density (0.075).
|
||||||
"""
|
"""
|
||||||
from atocore.memory.reinforcement import _normalize, _tokenize
|
from atocore.memory.reinforcement import _normalize, _tokenize
|
||||||
|
|
||||||
scored: list[tuple[int, float, Memory]] = []
|
scored: list[tuple[float, int, float, Memory]] = []
|
||||||
for mem in memories:
|
for mem in memories:
|
||||||
mem_tokens = _tokenize(_normalize(mem.content))
|
mem_tokens = _tokenize(_normalize(mem.content))
|
||||||
overlap = len(mem_tokens & query_tokens) if mem_tokens else 0
|
overlap = len(mem_tokens & query_tokens) if mem_tokens else 0
|
||||||
scored.append((overlap, mem.confidence, mem))
|
density = overlap / len(mem_tokens) if mem_tokens else 0.0
|
||||||
scored.sort(key=lambda t: (t[0], t[1]), reverse=True)
|
scored.append((density, overlap, mem.confidence, mem))
|
||||||
return [mem for _, _, mem in scored]
|
scored.sort(key=lambda t: (t[0], t[1], t[2]), reverse=True)
|
||||||
|
return [mem for _, _, _, mem in scored]
|
||||||
|
|
||||||
|
|
||||||
def _row_to_memory(row) -> Memory:
|
def _row_to_memory(row) -> Memory:
|
||||||
|
|||||||
173
tests/test_extraction_pipeline.py
Normal file
173
tests/test_extraction_pipeline.py
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
"""Integration tests for the extraction + triage pipeline (R8).
|
||||||
|
|
||||||
|
Tests the flow that produced the 41 active memories:
|
||||||
|
LLM extraction → persist as candidate → triage → promote/reject.
|
||||||
|
Uses mocked subprocess to avoid real claude -p calls.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from atocore.memory.extractor_llm import (
|
||||||
|
extract_candidates_llm,
|
||||||
|
extract_candidates_llm_verbose,
|
||||||
|
)
|
||||||
|
from atocore.memory.service import create_memory, get_memories
|
||||||
|
from atocore.models.database import init_db
|
||||||
|
import atocore.memory.extractor_llm as extractor_llm
|
||||||
|
|
||||||
|
|
||||||
|
def _make_interaction(**kw):
    """Build a minimal Interaction with test defaults.

    Only id/prompt/response/project are overridable; the remaining fields
    are fixed test stubs.
    """
    from atocore.interactions.service import Interaction

    return Interaction(
        id=kw.get("id", "test-pipe-1"),
        prompt=kw.get("prompt", "test prompt"),
        response=kw.get("response", ""),
        response_summary="",
        project=kw.get("project", ""),
        client="test",
        session_id="",
    )
||||||
|
|
||||||
|
|
||||||
|
class _FakeCompleted:
|
||||||
|
def __init__(self, stdout, returncode=0):
|
||||||
|
self.stdout = stdout
|
||||||
|
self.stderr = ""
|
||||||
|
self.returncode = returncode
|
||||||
|
|
||||||
|
|
||||||
|
def test_llm_extraction_persists_as_candidate(tmp_data_dir, monkeypatch):
    """Full flow: the LLM extractor yields one candidate, the caller stores
    it with status=candidate, and it is visible in the project's queue."""
    init_db()
    monkeypatch.setattr(extractor_llm, "_cli_available", lambda: True)
    monkeypatch.setattr(
        extractor_llm.subprocess,
        "run",
        lambda *a, **kw: _FakeCompleted(
            '[{"type": "project", "content": "USB SSD is mandatory for RPi storage", "project": "p06-polisher", "confidence": 0.6}]'
        ),
    )

    interaction = _make_interaction(
        response="We decided USB SSD is mandatory for the polisher RPi.",
        project="p06-polisher",
    )
    extracted = extract_candidates_llm(interaction)
    assert len(extracted) == 1
    first = extracted[0]
    assert first.content == "USB SSD is mandatory for RPi storage"

    stored = create_memory(
        memory_type=first.memory_type,
        content=first.content,
        project=first.project,
        confidence=first.confidence,
        status="candidate",
    )
    assert stored.status == "candidate"
    assert stored.project == "p06-polisher"

    # The new memory must be listed in the candidate queue for its project.
    queue = get_memories(status="candidate", project="p06-polisher", limit=10)
    assert any(m.id == stored.id for m in queue)
|
||||||
|
|
||||||
|
|
||||||
|
def test_llm_extraction_project_fallback(tmp_data_dir, monkeypatch):
    """R6+R9: an empty model-supplied project is replaced by the
    interaction's own project."""
    init_db()
    monkeypatch.setattr(extractor_llm, "_cli_available", lambda: True)
    monkeypatch.setattr(
        extractor_llm.subprocess,
        "run",
        lambda *a, **kw: _FakeCompleted(
            '[{"type": "knowledge", "content": "machine works offline", "project": "", "confidence": 0.5}]'
        ),
    )

    interaction = _make_interaction(
        response="The machine works fully offline.",
        project="p06-polisher",
    )
    extracted = extract_candidates_llm(interaction)
    assert len(extracted) == 1
    # Fallback kicked in: the candidate inherited the interaction's project.
    assert extracted[0].project == "p06-polisher"
|
||||||
|
|
||||||
|
|
||||||
|
def test_promote_reject_flow(tmp_data_dir):
    """Candidate → promote and candidate → reject via the service layer
    (the same operations auto_triage.py performs over HTTP)."""
    from atocore.memory.service import promote_memory, reject_candidate_memory

    init_db()
    keeper = create_memory(
        memory_type="project",
        content="durable fact worth keeping",
        project="p06-polisher",
        confidence=0.5,
        status="candidate",
    )
    discard = create_memory(
        memory_type="project",
        content="stale snapshot to reject",
        project="atocore",
        confidence=0.5,
        status="candidate",
    )

    promote_memory(keeper.id)
    reject_candidate_memory(discard.id)

    # The promoted memory is now active for its project...
    active = get_memories(project="p06-polisher", active_only=True, limit=10)
    assert any(m.id == keeper.id for m in active)

    # ...and neither memory remains in the candidate queue.
    remaining = get_memories(status="candidate", limit=10)
    assert not any(m.id == keeper.id for m in remaining)
    assert not any(m.id == discard.id for m in remaining)
|
||||||
|
|
||||||
|
|
||||||
|
def test_duplicate_content_creates_separate_memory(tmp_data_dir):
    """The DB layer does not dedup content: two memories with identical
    text coexist — deduplication is the triage model's responsibility."""
    init_db()
    first = create_memory(
        memory_type="project",
        content="unique fact about polisher",
        project="p06-polisher",
    )
    second = create_memory(
        memory_type="project",
        content="unique fact about polisher",
        project="p06-polisher",
        status="candidate",
    )
    # Both rows exist as distinct memories.
    assert first.id != second.id
|
||||||
|
|
||||||
|
|
||||||
|
def test_llm_extraction_failure_returns_empty(tmp_data_dir, monkeypatch):
    """A failing claude subprocess yields zero candidates plus an error
    marker, and nothing is persisted — no exception escapes."""
    init_db()
    monkeypatch.setattr(extractor_llm, "_cli_available", lambda: True)
    monkeypatch.setattr(
        extractor_llm.subprocess,
        "run",
        lambda *a, **kw: _FakeCompleted("", returncode=1),
    )

    interaction = _make_interaction(
        response="some real content that the LLM fails on",
        project="p06-polisher",
    )
    outcome = extract_candidates_llm_verbose(interaction)
    assert outcome.candidates == []
    assert "exit_1" in outcome.error

    # Extraction failure persists nothing: the candidate queue stays empty.
    queue = get_memories(status="candidate", limit=10)
    assert len(queue) == 0
|
||||||
@@ -107,8 +107,11 @@ def test_parser_falls_back_to_interaction_project():
|
|||||||
assert result[0].project == "p06-polisher"
|
assert result[0].project == "p06-polisher"
|
||||||
|
|
||||||
|
|
||||||
def test_parser_keeps_model_project_when_provided():
|
def test_parser_keeps_registered_model_project(tmp_data_dir, project_registry):
|
||||||
"""Model-supplied project takes precedence over interaction."""
|
"""R9: model-supplied project is kept when it's a registered project."""
|
||||||
|
from atocore.models.database import init_db
|
||||||
|
init_db()
|
||||||
|
project_registry(("p04-gigabit", ["p04", "gigabit"]), ("p06-polisher", ["p06"]))
|
||||||
raw = '[{"type": "project", "content": "x", "project": "p04-gigabit"}]'
|
raw = '[{"type": "project", "content": "x", "project": "p04-gigabit"}]'
|
||||||
interaction = _make_interaction()
|
interaction = _make_interaction()
|
||||||
interaction.project = "p06-polisher"
|
interaction.project = "p06-polisher"
|
||||||
@@ -116,6 +119,19 @@ def test_parser_keeps_model_project_when_provided():
|
|||||||
assert result[0].project == "p04-gigabit"
|
assert result[0].project == "p04-gigabit"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parser_rejects_hallucinated_project(tmp_data_dir, project_registry):
|
||||||
|
"""R9: model-supplied project that is NOT registered falls back
|
||||||
|
to the interaction's known project."""
|
||||||
|
from atocore.models.database import init_db
|
||||||
|
init_db()
|
||||||
|
project_registry(("p06-polisher", ["p06"]))
|
||||||
|
raw = '[{"type": "project", "content": "x", "project": "fake-project-99"}]'
|
||||||
|
interaction = _make_interaction()
|
||||||
|
interaction.project = "p06-polisher"
|
||||||
|
result = _parse_candidates(raw, interaction)
|
||||||
|
assert result[0].project == "p06-polisher"
|
||||||
|
|
||||||
|
|
||||||
def test_missing_cli_returns_empty(monkeypatch):
|
def test_missing_cli_returns_empty(monkeypatch):
|
||||||
"""If ``claude`` is not on PATH the extractor returns empty, never raises."""
|
"""If ``claude`` is not on PATH the extractor returns empty, never raises."""
|
||||||
monkeypatch.setattr(extractor_llm, "_cli_available", lambda: False)
|
monkeypatch.setattr(extractor_llm, "_cli_available", lambda: False)
|
||||||
|
|||||||
Reference in New Issue
Block a user