feat: Phase 4 V1 — Robustness Hardening

Adds the observability + safety layer that turns AtoCore from
"works until something silently breaks" into "every mutation is
traceable, drift is detected, failures raise alerts."

1. Audit log (memory_audit table):
   - New table with id, memory_id, action, actor, before/after JSON,
     note, timestamp; 3 indexes for memory_id/timestamp/action
   - _audit_memory() helper called from every mutation:
     create_memory, update_memory, promote_memory,
     reject_candidate_memory, invalidate_memory, supersede_memory,
     reinforce_memory, auto_promote_reinforced, expire_stale_candidates
   - Action verb auto-selected: promoted/rejected/invalidated/
     superseded/updated based on state transition
   - "actor" threaded through: api-http, human-triage, phase10-auto-
     promote, candidate-expiry, reinforcement, etc.
   - Fail-open: audit write failure logs but never breaks the mutation
   - GET /memory/{id}/audit: full history for one memory
   - GET /admin/audit/recent: last 50 mutations across the system

2. Alerts framework (src/atocore/observability/alerts.py):
   - emit_alert(severity, title, message, context) fans out to:
     - structlog logger (always)
     - ~/atocore-logs/alerts.log append (configurable via
       ATOCORE_ALERT_LOG)
     - project_state atocore/alert/last_{severity} (dashboard surface)
     - ATOCORE_ALERT_WEBHOOK POST if set (auto-detects Discord webhook
       format for nice embeds; generic JSON otherwise)
   - Every sink is fail-open — one failure doesn't prevent the others
   - Pipeline alert step in nightly cron: harness < 85% → warning;
     candidate queue > 200 → warning

3. Integrity checks (scripts/integrity_check.py):
   - Nightly scan for drift:
     - Memories → missing source_chunk_id references
     - Duplicate active memories (same type+content+project)
     - project_state → missing projects
     - Orphaned source_chunks (no parent document)
   - Results persisted to atocore/status/integrity_check_result
   - Any finding emits a warning alert
   - Added as Step G in deploy/dalidou/batch-extract.sh nightly cron

4. Dashboard surfaces it all:
   - integrity (findings + details)
   - alerts (last info/warning/critical per severity)
   - recent_audit (last 10 mutations with actor + action + preview)

Tests: 308 → 317 (9 new):
  - test_audit_create_logs_entry
  - test_audit_promote_logs_entry
  - test_audit_reject_logs_entry
  - test_audit_update_captures_before_after
  - test_audit_reinforce_logs_entry
  - test_recent_audit_returns_cross_memory_entries
  - test_emit_alert_writes_log_file
  - test_emit_alert_invalid_severity_falls_back_to_info
  - test_emit_alert_fails_open_on_log_write_error

Deferred: formal migration framework with rollback (current additive
pattern is fine for V1); memory detail wiki page with audit view
(quick follow-up).

To enable Discord alerts: set ATOCORE_ALERT_WEBHOOK to a Discord
webhook URL in Dalidou's environment. Default = log-only.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-16 21:54:10 -04:00
parent bfa7dba4de
commit 88f2f7c4e1
8 changed files with 777 additions and 37 deletions

View File

@@ -543,17 +543,33 @@ def api_update_memory(memory_id: str, req: MemoryUpdateRequest) -> dict:
@router.delete("/memory/{memory_id}")
def api_invalidate_memory(memory_id: str) -> dict:
    """Invalidate a memory (error correction).

    Threads ``actor="api-http"`` through so the audit trail records
    that this mutation arrived over the HTTP API rather than from an
    internal job. Returns 404 if the memory does not exist.
    """
    # Single attributed call only — the duplicated un-attributed
    # invalidate_memory(memory_id) line performed the mutation twice
    # and discarded the first result.
    success = invalidate_memory(memory_id, actor="api-http")
    if not success:
        raise HTTPException(status_code=404, detail="Memory not found")
    return {"status": "invalidated", "id": memory_id}
@router.get("/memory/{memory_id}/audit")
def api_memory_audit(memory_id: str, limit: int = 100) -> dict:
    """Return the audit history for one memory, newest first."""
    from atocore.memory.service import get_memory_audit

    history = get_memory_audit(memory_id, limit=limit)
    return {
        "memory_id": memory_id,
        "entries": history,
        "count": len(history),
    }
@router.get("/admin/audit/recent")
def api_recent_audit(limit: int = 50) -> dict:
    """Return recent memory_audit entries across all memories (newest first)."""
    from atocore.memory.service import get_recent_audit

    recent = get_recent_audit(limit=limit)
    return {"entries": recent, "count": len(recent)}
@router.post("/memory/{memory_id}/promote")
def api_promote_memory(memory_id: str) -> dict:
"""Promote a candidate memory to active (Phase 9 Commit C)."""
try:
success = promote_memory(memory_id)
success = promote_memory(memory_id, actor="api-http")
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
if not success:
@@ -567,7 +583,7 @@ def api_promote_memory(memory_id: str) -> dict:
@router.post("/memory/{memory_id}/reject")
def api_reject_candidate_memory(memory_id: str) -> dict:
"""Reject a candidate memory (Phase 9 Commit C review queue)."""
success = reject_candidate_memory(memory_id)
success = reject_candidate_memory(memory_id, actor="api-http")
if not success:
raise HTTPException(
status_code=404,
@@ -1046,33 +1062,47 @@ def api_dashboard() -> dict:
# Pipeline health from project state
pipeline: dict = {}
extract_state: dict = {}
integrity: dict = {}
alerts: dict = {}
try:
state_entries = get_state("atocore")
for entry in state_entries:
if entry.category != "status":
continue
if entry.key == "last_extract_batch_run":
extract_state["last_run"] = entry.value
elif entry.key == "pipeline_last_run":
pipeline["last_run"] = entry.value
if entry.category == "status":
if entry.key == "last_extract_batch_run":
extract_state["last_run"] = entry.value
elif entry.key == "pipeline_last_run":
pipeline["last_run"] = entry.value
try:
last = _dt.fromisoformat(entry.value.replace("Z", "+00:00"))
delta = _dt.now(_tz.utc) - last
pipeline["hours_since_last_run"] = round(
delta.total_seconds() / 3600, 1
)
except Exception:
pass
elif entry.key == "pipeline_summary":
try:
pipeline["summary"] = _json.loads(entry.value)
except Exception:
pipeline["summary_raw"] = entry.value
elif entry.key == "retrieval_harness_result":
try:
pipeline["harness"] = _json.loads(entry.value)
except Exception:
pipeline["harness_raw"] = entry.value
elif entry.key == "integrity_check_result":
try:
integrity = _json.loads(entry.value)
except Exception:
pass
elif entry.category == "alert":
# keys like "last_info", "last_warning", "last_critical"
try:
last = _dt.fromisoformat(entry.value.replace("Z", "+00:00"))
delta = _dt.now(_tz.utc) - last
pipeline["hours_since_last_run"] = round(
delta.total_seconds() / 3600, 1
)
payload = _json.loads(entry.value)
except Exception:
pass
elif entry.key == "pipeline_summary":
try:
pipeline["summary"] = _json.loads(entry.value)
except Exception:
pipeline["summary_raw"] = entry.value
elif entry.key == "retrieval_harness_result":
try:
pipeline["harness"] = _json.loads(entry.value)
except Exception:
pipeline["harness_raw"] = entry.value
payload = {"raw": entry.value}
severity = entry.key.replace("last_", "")
alerts[severity] = payload
except Exception:
pass
@@ -1107,6 +1137,14 @@ def api_dashboard() -> dict:
elif len(candidates) > 20:
triage["notice"] = f"{len(candidates)} candidates awaiting triage."
# Recent audit activity (Phase 4 V1) — last 10 mutations for operator
recent_audit: list[dict] = []
try:
from atocore.memory.service import get_recent_audit as _gra
recent_audit = _gra(limit=10)
except Exception:
pass
return {
"memories": {
"active": len(active),
@@ -1123,6 +1161,9 @@ def api_dashboard() -> dict:
"extraction_pipeline": extract_state,
"pipeline": pipeline,
"triage": triage,
"integrity": integrity,
"alerts": alerts,
"recent_audit": recent_audit,
}

View File

@@ -67,6 +67,106 @@ class Memory:
valid_until: str = "" # ISO UTC; empty = permanent
def _audit_memory(
    memory_id: str,
    action: str,
    actor: str = "api",
    before: dict | None = None,
    after: dict | None = None,
    note: str = "",
) -> None:
    """Append an entry to the memory_audit table.

    Phase 4 Robustness V1. Every memory mutation flows through this
    helper so we can answer "how did this memory get to its current
    state?" and "when did we learn X?".

    ``action`` is a short verb: created, updated, promoted, rejected,
    superseded, invalidated, reinforced, auto_promoted, expired.
    ``actor`` identifies the caller: api (default), auto-triage,
    human-triage, host-cron, reinforcement, phase10-auto-promote, etc.
    ``before`` / ``after`` are JSON-serialized field snapshots.

    Fail-open: a logging failure never breaks the mutation itself.
    """
    import json as _json

    try:
        # Snapshot serialization happens inside the try so a
        # non-serializable value degrades to a warning, not a crash.
        row = (
            str(uuid.uuid4()),
            memory_id,
            action,
            actor or "api",
            _json.dumps(before or {}),
            _json.dumps(after or {}),
            (note or "")[:500],
        )
        with get_connection() as conn:
            conn.execute(
                "INSERT INTO memory_audit (id, memory_id, action, actor, "
                "before_json, after_json, note) VALUES (?, ?, ?, ?, ?, ?, ?)",
                row,
            )
    except Exception as e:
        log.warning("memory_audit_failed", memory_id=memory_id, action=action, error=str(e))
def get_memory_audit(memory_id: str, limit: int = 100) -> list[dict]:
    """Fetch audit entries for a memory, newest first."""
    import json as _json

    def _safe_load(raw):
        # Corrupt snapshot JSON degrades to an empty dict instead of raising.
        try:
            return _json.loads(raw or "{}")
        except Exception:
            return {}

    with get_connection() as conn:
        rows = conn.execute(
            "SELECT id, memory_id, action, actor, before_json, after_json, note, timestamp "
            "FROM memory_audit WHERE memory_id = ? ORDER BY timestamp DESC LIMIT ?",
            (memory_id, limit),
        ).fetchall()
    return [
        {
            "id": row["id"],
            "memory_id": row["memory_id"],
            "action": row["action"],
            "actor": row["actor"] or "api",
            "before": _safe_load(row["before_json"]),
            "after": _safe_load(row["after_json"]),
            "note": row["note"] or "",
            "timestamp": row["timestamp"],
        }
        for row in rows
    ]
def get_recent_audit(limit: int = 50) -> list[dict]:
    """Fetch recent memory_audit entries across all memories, newest first."""
    import json as _json

    with get_connection() as conn:
        rows = conn.execute(
            "SELECT id, memory_id, action, actor, before_json, after_json, note, timestamp "
            "FROM memory_audit ORDER BY timestamp DESC LIMIT ?",
            (limit,),
        ).fetchall()

    entries: list[dict] = []
    for row in rows:
        # Only the after-snapshot is needed here (for the content preview);
        # corrupt JSON degrades to an empty dict.
        try:
            after_state = _json.loads(row["after_json"] or "{}")
        except Exception:
            after_state = {}
        entries.append({
            "id": row["id"],
            "memory_id": row["memory_id"],
            "action": row["action"],
            "actor": row["actor"] or "api",
            "note": row["note"] or "",
            "timestamp": row["timestamp"],
            "content_preview": (after_state.get("content") or "")[:120],
        })
    return entries
def _normalize_tags(tags) -> list[str]:
"""Coerce a tags value (list, JSON string, None) to a clean lowercase list."""
import json as _json
@@ -98,6 +198,7 @@ def create_memory(
status: str = "active",
domain_tags: list[str] | None = None,
valid_until: str = "",
actor: str = "api",
) -> Memory:
"""Create a new memory entry.
@@ -160,6 +261,21 @@ def create_memory(
valid_until=valid_until or "",
)
_audit_memory(
memory_id=memory_id,
action="created",
actor=actor,
after={
"memory_type": memory_type,
"content": content,
"project": project,
"status": status,
"confidence": confidence,
"domain_tags": tags,
"valid_until": valid_until or "",
},
)
return Memory(
id=memory_id,
memory_type=memory_type,
@@ -235,6 +351,8 @@ def update_memory(
memory_type: str | None = None,
domain_tags: list[str] | None = None,
valid_until: str | None = None,
actor: str = "api",
note: str = "",
) -> bool:
"""Update an existing memory."""
import json as _json
@@ -258,31 +376,48 @@ def update_memory(
if duplicate:
raise ValueError("Update would create a duplicate active memory")
# Capture before-state for audit
before_snapshot = {
"content": existing["content"],
"status": existing["status"],
"confidence": existing["confidence"],
"memory_type": existing["memory_type"],
}
after_snapshot = dict(before_snapshot)
updates = []
params: list = []
if content is not None:
updates.append("content = ?")
params.append(content)
after_snapshot["content"] = content
if confidence is not None:
updates.append("confidence = ?")
params.append(confidence)
after_snapshot["confidence"] = confidence
if status is not None:
if status not in MEMORY_STATUSES:
raise ValueError(f"Invalid status '{status}'. Must be one of: {MEMORY_STATUSES}")
updates.append("status = ?")
params.append(status)
after_snapshot["status"] = status
if memory_type is not None:
if memory_type not in MEMORY_TYPES:
raise ValueError(f"Invalid memory type '{memory_type}'. Must be one of: {MEMORY_TYPES}")
updates.append("memory_type = ?")
params.append(memory_type)
after_snapshot["memory_type"] = memory_type
if domain_tags is not None:
norm_tags = _normalize_tags(domain_tags)
updates.append("domain_tags = ?")
params.append(_json.dumps(_normalize_tags(domain_tags)))
params.append(_json.dumps(norm_tags))
after_snapshot["domain_tags"] = norm_tags
if valid_until is not None:
vu = valid_until.strip() or None
updates.append("valid_until = ?")
params.append(valid_until.strip() or None)
params.append(vu)
after_snapshot["valid_until"] = vu or ""
if not updates:
return False
@@ -297,21 +432,40 @@ def update_memory(
if result.rowcount > 0:
log.info("memory_updated", memory_id=memory_id)
# Action verb is driven by status change when applicable; otherwise "updated"
if status == "active" and before_snapshot["status"] == "candidate":
action = "promoted"
elif status == "invalid" and before_snapshot["status"] == "candidate":
action = "rejected"
elif status == "invalid":
action = "invalidated"
elif status == "superseded":
action = "superseded"
else:
action = "updated"
_audit_memory(
memory_id=memory_id,
action=action,
actor=actor,
before=before_snapshot,
after=after_snapshot,
note=note,
)
return True
return False
def invalidate_memory(memory_id: str, actor: str = "api") -> bool:
    """Mark a memory as invalid (error correction).

    ``actor`` is threaded through to ``update_memory`` so the audit
    trail records who requested the invalidation. Returns False if the
    memory does not exist.
    """
    return update_memory(memory_id, status="invalid", actor=actor)
def supersede_memory(memory_id: str, actor: str = "api") -> bool:
    """Mark a memory as superseded (replaced by newer info).

    ``actor`` is threaded through to ``update_memory`` so the audit
    trail records who requested the supersession. Returns False if the
    memory does not exist.
    """
    return update_memory(memory_id, status="superseded", actor=actor)
def promote_memory(memory_id: str) -> bool:
def promote_memory(memory_id: str, actor: str = "api", note: str = "") -> bool:
"""Promote a candidate memory to active (Phase 9 Commit C review queue).
Returns False if the memory does not exist or is not currently a
@@ -326,10 +480,10 @@ def promote_memory(memory_id: str) -> bool:
return False
if row["status"] != "candidate":
return False
return update_memory(memory_id, status="active")
return update_memory(memory_id, status="active", actor=actor, note=note)
def reject_candidate_memory(memory_id: str) -> bool:
def reject_candidate_memory(memory_id: str, actor: str = "api", note: str = "") -> bool:
"""Reject a candidate memory (Phase 9 Commit C).
Sets the candidate's status to ``invalid`` so it drops out of the
@@ -344,7 +498,7 @@ def reject_candidate_memory(memory_id: str) -> bool:
return False
if row["status"] != "candidate":
return False
return update_memory(memory_id, status="invalid")
return update_memory(memory_id, status="invalid", actor=actor, note=note)
def reinforce_memory(
@@ -385,6 +539,17 @@ def reinforce_memory(
old_confidence=round(old_confidence, 4),
new_confidence=round(new_confidence, 4),
)
# Reinforcement writes an audit row per bump. Reinforcement fires often
# (every captured interaction); this lets you trace which interactions
# kept which memories alive. Could become chatty but is invaluable for
# decay/cold-memory analysis. If it becomes an issue, throttle here.
_audit_memory(
memory_id=memory_id,
action="reinforced",
actor="reinforcement",
before={"confidence": old_confidence},
after={"confidence": new_confidence},
)
return True, old_confidence, new_confidence
@@ -420,7 +585,11 @@ def auto_promote_reinforced(
for row in rows:
mid = row["id"]
ok = promote_memory(mid)
ok = promote_memory(
mid,
actor="phase10-auto-promote",
note=f"ref_count={row['reference_count']} confidence={row['confidence']:.2f}",
)
if ok:
promoted.append(mid)
log.info(
@@ -459,7 +628,11 @@ def expire_stale_candidates(
for row in rows:
mid = row["id"]
ok = reject_candidate_memory(mid)
ok = reject_candidate_memory(
mid,
actor="candidate-expiry",
note=f"unreinforced for {max_age_days}+ days",
)
if ok:
expired.append(mid)
log.info("memory_expired", memory_id=mid)

View File

@@ -136,6 +136,30 @@ def _apply_migrations(conn: sqlite3.Connection) -> None:
"CREATE INDEX IF NOT EXISTS idx_memories_valid_until ON memories(valid_until)"
)
# Phase 4 (Robustness V1): append-only audit log for memory mutations.
# Every create/update/promote/reject/supersede/invalidate/reinforce/expire/
# auto_promote writes one row here. before/after are JSON snapshots of the
# relevant fields. actor lets us distinguish auto-triage vs human-triage vs
# api vs cron. This is the "how did this memory get to its current state"
# trail — essential once the brain starts auto-organizing itself.
conn.execute(
"""
CREATE TABLE IF NOT EXISTS memory_audit (
id TEXT PRIMARY KEY,
memory_id TEXT NOT NULL,
action TEXT NOT NULL,
actor TEXT DEFAULT 'api',
before_json TEXT DEFAULT '{}',
after_json TEXT DEFAULT '{}',
note TEXT DEFAULT '',
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
)
"""
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_memory_audit_memory ON memory_audit(memory_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_memory_audit_timestamp ON memory_audit(timestamp)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_memory_audit_action ON memory_audit(action)")
# Phase 9 Commit A: capture loop columns on the interactions table.
# The original schema only carried prompt + project_id + a context_pack
# JSON blob. To make interactions a real audit trail of what AtoCore fed

View File

@@ -0,0 +1,170 @@
"""Alert emission framework (Phase 4 Robustness V1).
One-stop helper to raise operational alerts from any AtoCore code
path. An alert is a structured message about something the operator
should see — harness regression, queue pileup, integrity drift,
pipeline skipped, etc.
Emission fans out to multiple sinks so a single call touches every
observability channel:
1. structlog logger (always)
2. Append to ``$ATOCORE_ALERT_LOG`` (default ~/atocore-logs/alerts.log)
3. Write the last alert of each severity to AtoCore project state
(atocore/alert/last_{severity}) so the dashboard can surface it
4. POST to ``$ATOCORE_ALERT_WEBHOOK`` if set (Discord/Slack/generic)
All sinks are fail-open — if one fails the others still fire.
Severity levels (inspired by syslog but simpler):
- ``info`` operational event worth noting
- ``warning`` degraded state, service still works
- ``critical`` something is broken and needs attention
Environment variables:
ATOCORE_ALERT_LOG override the alerts log file path
ATOCORE_ALERT_WEBHOOK POST JSON alerts here (Discord webhook, etc.)
ATOCORE_BASE_URL AtoCore API for project-state write (default localhost:8100)
"""
from __future__ import annotations
import json
import os
import threading
import urllib.error
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from atocore.observability.logger import get_logger
log = get_logger("alerts")
SEVERITIES = {"info", "warning", "critical"}
def _default_alert_log() -> Path:
explicit = os.environ.get("ATOCORE_ALERT_LOG")
if explicit:
return Path(explicit)
return Path.home() / "atocore-logs" / "alerts.log"
def _append_log(severity: str, title: str, message: str, context: dict | None) -> None:
path = _default_alert_log()
try:
path.parent.mkdir(parents=True, exist_ok=True)
ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
line = f"[{ts}] [{severity.upper()}] {title}: {message}"
if context:
line += f" {json.dumps(context, ensure_ascii=True)[:500]}"
line += "\n"
with open(path, "a", encoding="utf-8") as f:
f.write(line)
except Exception as e:
log.warning("alert_log_write_failed", error=str(e))
def _write_state(severity: str, title: str, message: str, ts: str) -> None:
    """Record the most-recent alert per severity into project_state.

    Calls the internal ``set_state`` helper directly (not the HTTP API)
    so this still works when atocore is imported as a library, e.g.
    from cron scripts. Fail-open: failures are logged and swallowed.
    """
    try:
        from atocore.context.project_state import set_state

        # Message is truncated to keep the state row small.
        payload = json.dumps({"title": title, "message": message[:400], "timestamp": ts})
        set_state(
            project_name="atocore",
            category="alert",
            key=f"last_{severity}",
            value=payload,
            source="alert framework",
        )
    except Exception as e:
        log.warning("alert_state_write_failed", error=str(e))
def _post_webhook(severity: str, title: str, message: str, context: dict | None, ts: str) -> None:
url = os.environ.get("ATOCORE_ALERT_WEBHOOK")
if not url:
return
# Auto-detect Discord webhook shape for nicer formatting
if "discord.com/api/webhooks" in url or "discordapp.com/api/webhooks" in url:
emoji = {"info": ":information_source:", "warning": ":warning:", "critical": ":rotating_light:"}.get(severity, "")
body = {
"content": f"{emoji} **AtoCore {severity}**: {title}",
"embeds": [{
"description": message[:1800],
"timestamp": ts,
"fields": [
{"name": k, "value": str(v)[:200], "inline": True}
for k, v in (context or {}).items()
][:10],
}],
}
else:
body = {
"severity": severity,
"title": title,
"message": message,
"context": context or {},
"timestamp": ts,
}
def _fire():
try:
req = urllib.request.Request(
url,
data=json.dumps(body).encode("utf-8"),
method="POST",
headers={"Content-Type": "application/json"},
)
urllib.request.urlopen(req, timeout=8)
except Exception as e:
log.warning("alert_webhook_failed", error=str(e))
threading.Thread(target=_fire, daemon=True).start()
def emit_alert(
    severity: str,
    title: str,
    message: str,
    context: dict | None = None,
) -> None:
    """Emit an alert to all configured sinks.

    Sinks: structlog (always), the alerts log file, the project_state
    ``last_{severity}`` entry, and the optional webhook. Fail-open —
    any single sink failure is logged but does not prevent the others.
    An unknown severity silently degrades to "info".
    """
    level = (severity or "info").lower()
    if level not in SEVERITIES:
        level = "info"
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Sink 1: structlog — always fires; critical maps to log.error.
    emitters = {"info": log.info, "warning": log.warning, "critical": log.error}
    emitters[level]("alert", title=title, message=message[:500], **(context or {}))

    # Sinks 2-4: each call individually wrapped so one failure cannot
    # mask the others (the sinks are themselves fail-open; this is a
    # second belt-and-suspenders layer).
    for fire_sink in (
        lambda: _append_log(level, title, message, context),
        lambda: _write_state(level, title, message, ts),
        lambda: _post_webhook(level, title, message, context, ts),
    ):
        try:
            fire_sink()
        except Exception:
            pass