feat: Phase 4 V1 — Robustness Hardening
Adds the observability + safety layer that turns AtoCore from
"works until something silently breaks" into "every mutation is
traceable, drift is detected, failures raise alerts."
1. Audit log (memory_audit table):
- New table with id, memory_id, action, actor, before/after JSON,
note, timestamp; 3 indexes for memory_id/timestamp/action
- _audit_memory() helper called from every mutation:
create_memory, update_memory, promote_memory,
reject_candidate_memory, invalidate_memory, supersede_memory,
reinforce_memory, auto_promote_reinforced, expire_stale_candidates
- Action verb auto-selected: promoted/rejected/invalidated/
superseded/updated based on state transition
- "actor" threaded through: api-http, human-triage, phase10-auto-
promote, candidate-expiry, reinforcement, etc.
- Fail-open: audit write failure logs but never breaks the mutation
- GET /memory/{id}/audit: full history for one memory
- GET /admin/audit/recent: last 50 mutations across the system
2. Alerts framework (src/atocore/observability/alerts.py):
- emit_alert(severity, title, message, context) fans out to:
- structlog logger (always)
- ~/atocore-logs/alerts.log append (configurable via
ATOCORE_ALERT_LOG)
- project_state atocore/alert/last_{severity} (dashboard surface)
- ATOCORE_ALERT_WEBHOOK POST if set (auto-detects Discord webhook
format for nice embeds; generic JSON otherwise)
- Every sink fail-open — one failure doesn't prevent the others
- Pipeline alert step in nightly cron: harness < 85% → warning;
candidate queue > 200 → warning
3. Integrity checks (scripts/integrity_check.py):
- Nightly scan for drift:
- Memories → missing source_chunk_id references
- Duplicate active memories (same type+content+project)
- project_state → missing projects
- Orphaned source_chunks (no parent document)
- Results persisted to atocore/status/integrity_check_result
- Any finding emits a warning alert
- Added as Step G in deploy/dalidou/batch-extract.sh nightly cron
4. Dashboard surfaces it all:
- integrity (findings + details)
- alerts (last info/warning/critical per severity)
- recent_audit (last 10 mutations with actor + action + preview)
Tests: 308 → 317 (9 new):
- test_audit_create_logs_entry
- test_audit_promote_logs_entry
- test_audit_reject_logs_entry
- test_audit_update_captures_before_after
- test_audit_reinforce_logs_entry
- test_recent_audit_returns_cross_memory_entries
- test_emit_alert_writes_log_file
- test_emit_alert_invalid_severity_falls_back_to_info
- test_emit_alert_fails_open_on_log_write_error
Deferred: formal migration framework with rollback (current additive
pattern is fine for V1); memory detail wiki page with audit view
(quick follow-up).
To enable Discord alerts: set ATOCORE_ALERT_WEBHOOK to a Discord
webhook URL in Dalidou's environment. Default = log-only.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -543,17 +543,33 @@ def api_update_memory(memory_id: str, req: MemoryUpdateRequest) -> dict:
|
||||
@router.delete("/memory/{memory_id}")
|
||||
def api_invalidate_memory(memory_id: str) -> dict:
|
||||
"""Invalidate a memory (error correction)."""
|
||||
success = invalidate_memory(memory_id)
|
||||
success = invalidate_memory(memory_id, actor="api-http")
|
||||
if not success:
|
||||
raise HTTPException(status_code=404, detail="Memory not found")
|
||||
return {"status": "invalidated", "id": memory_id}
|
||||
|
||||
|
||||
@router.get("/memory/{memory_id}/audit")
|
||||
def api_memory_audit(memory_id: str, limit: int = 100) -> dict:
|
||||
"""Return the audit history for a specific memory (newest first)."""
|
||||
from atocore.memory.service import get_memory_audit
|
||||
entries = get_memory_audit(memory_id, limit=limit)
|
||||
return {"memory_id": memory_id, "entries": entries, "count": len(entries)}
|
||||
|
||||
|
||||
@router.get("/admin/audit/recent")
|
||||
def api_recent_audit(limit: int = 50) -> dict:
|
||||
"""Return recent memory_audit entries across all memories (newest first)."""
|
||||
from atocore.memory.service import get_recent_audit
|
||||
entries = get_recent_audit(limit=limit)
|
||||
return {"entries": entries, "count": len(entries)}
|
||||
|
||||
|
||||
@router.post("/memory/{memory_id}/promote")
|
||||
def api_promote_memory(memory_id: str) -> dict:
|
||||
"""Promote a candidate memory to active (Phase 9 Commit C)."""
|
||||
try:
|
||||
success = promote_memory(memory_id)
|
||||
success = promote_memory(memory_id, actor="api-http")
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
if not success:
|
||||
@@ -567,7 +583,7 @@ def api_promote_memory(memory_id: str) -> dict:
|
||||
@router.post("/memory/{memory_id}/reject")
|
||||
def api_reject_candidate_memory(memory_id: str) -> dict:
|
||||
"""Reject a candidate memory (Phase 9 Commit C review queue)."""
|
||||
success = reject_candidate_memory(memory_id)
|
||||
success = reject_candidate_memory(memory_id, actor="api-http")
|
||||
if not success:
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
@@ -1046,33 +1062,47 @@ def api_dashboard() -> dict:
|
||||
# Pipeline health from project state
|
||||
pipeline: dict = {}
|
||||
extract_state: dict = {}
|
||||
integrity: dict = {}
|
||||
alerts: dict = {}
|
||||
try:
|
||||
state_entries = get_state("atocore")
|
||||
for entry in state_entries:
|
||||
if entry.category != "status":
|
||||
continue
|
||||
if entry.key == "last_extract_batch_run":
|
||||
extract_state["last_run"] = entry.value
|
||||
elif entry.key == "pipeline_last_run":
|
||||
pipeline["last_run"] = entry.value
|
||||
if entry.category == "status":
|
||||
if entry.key == "last_extract_batch_run":
|
||||
extract_state["last_run"] = entry.value
|
||||
elif entry.key == "pipeline_last_run":
|
||||
pipeline["last_run"] = entry.value
|
||||
try:
|
||||
last = _dt.fromisoformat(entry.value.replace("Z", "+00:00"))
|
||||
delta = _dt.now(_tz.utc) - last
|
||||
pipeline["hours_since_last_run"] = round(
|
||||
delta.total_seconds() / 3600, 1
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
elif entry.key == "pipeline_summary":
|
||||
try:
|
||||
pipeline["summary"] = _json.loads(entry.value)
|
||||
except Exception:
|
||||
pipeline["summary_raw"] = entry.value
|
||||
elif entry.key == "retrieval_harness_result":
|
||||
try:
|
||||
pipeline["harness"] = _json.loads(entry.value)
|
||||
except Exception:
|
||||
pipeline["harness_raw"] = entry.value
|
||||
elif entry.key == "integrity_check_result":
|
||||
try:
|
||||
integrity = _json.loads(entry.value)
|
||||
except Exception:
|
||||
pass
|
||||
elif entry.category == "alert":
|
||||
# keys like "last_info", "last_warning", "last_critical"
|
||||
try:
|
||||
last = _dt.fromisoformat(entry.value.replace("Z", "+00:00"))
|
||||
delta = _dt.now(_tz.utc) - last
|
||||
pipeline["hours_since_last_run"] = round(
|
||||
delta.total_seconds() / 3600, 1
|
||||
)
|
||||
payload = _json.loads(entry.value)
|
||||
except Exception:
|
||||
pass
|
||||
elif entry.key == "pipeline_summary":
|
||||
try:
|
||||
pipeline["summary"] = _json.loads(entry.value)
|
||||
except Exception:
|
||||
pipeline["summary_raw"] = entry.value
|
||||
elif entry.key == "retrieval_harness_result":
|
||||
try:
|
||||
pipeline["harness"] = _json.loads(entry.value)
|
||||
except Exception:
|
||||
pipeline["harness_raw"] = entry.value
|
||||
payload = {"raw": entry.value}
|
||||
severity = entry.key.replace("last_", "")
|
||||
alerts[severity] = payload
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -1107,6 +1137,14 @@ def api_dashboard() -> dict:
|
||||
elif len(candidates) > 20:
|
||||
triage["notice"] = f"{len(candidates)} candidates awaiting triage."
|
||||
|
||||
# Recent audit activity (Phase 4 V1) — last 10 mutations for operator
|
||||
recent_audit: list[dict] = []
|
||||
try:
|
||||
from atocore.memory.service import get_recent_audit as _gra
|
||||
recent_audit = _gra(limit=10)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return {
|
||||
"memories": {
|
||||
"active": len(active),
|
||||
@@ -1123,6 +1161,9 @@ def api_dashboard() -> dict:
|
||||
"extraction_pipeline": extract_state,
|
||||
"pipeline": pipeline,
|
||||
"triage": triage,
|
||||
"integrity": integrity,
|
||||
"alerts": alerts,
|
||||
"recent_audit": recent_audit,
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user