feat: Phase 4 V1 — Robustness Hardening

Adds the observability + safety layer that turns AtoCore from "works until something silently breaks" into "every mutation is traceable, drift is detected, failures raise alerts."

1. Audit log (memory_audit table):
   - New table with id, memory_id, action, actor, before/after JSON, note, timestamp; 3 indexes for memory_id/timestamp/action
   - _audit_memory() helper called from every mutation: create_memory, update_memory, promote_memory, reject_candidate_memory, invalidate_memory, supersede_memory, reinforce_memory, auto_promote_reinforced, expire_stale_candidates
   - Action verb auto-selected: promoted/rejected/invalidated/superseded/updated based on state transition
   - "actor" threaded through: api-http, human-triage, phase10-auto-promote, candidate-expiry, reinforcement, etc.
   - Fail-open: audit write failure logs but never breaks the mutation
   - GET /memory/{id}/audit: full history for one memory
   - GET /admin/audit/recent: last 50 mutations across the system

2. Alerts framework (src/atocore/observability/alerts.py):
   - emit_alert(severity, title, message, context) fans out to:
     - structlog logger (always)
     - ~/atocore-logs/alerts.log append (configurable via ATOCORE_ALERT_LOG)
     - project_state atocore/alert/last_{severity} (dashboard surface)
     - ATOCORE_ALERT_WEBHOOK POST if set (auto-detects Discord webhook format for nice embeds; generic JSON otherwise)
   - Every sink fail-open — one failure doesn't prevent the others
   - Pipeline alert step in nightly cron: harness < 85% → warning; candidate queue > 200 → warning

3. Integrity checks (scripts/integrity_check.py):
   - Nightly scan for drift:
     - Memories → missing source_chunk_id references
     - Duplicate active memories (same type+content+project)
     - project_state → missing projects
     - Orphaned source_chunks (no parent document)
   - Results persisted to atocore/status/integrity_check_result
   - Any finding emits a warning alert
   - Added as Step G in deploy/dalidou/batch-extract.sh nightly cron

4. Dashboard surfaces it all:
   - integrity (findings + details)
   - alerts (last info/warning/critical per severity)
   - recent_audit (last 10 mutations with actor + action + preview)

Tests: 308 → 317 (9 new):
   - test_audit_create_logs_entry
   - test_audit_promote_logs_entry
   - test_audit_reject_logs_entry
   - test_audit_update_captures_before_after
   - test_audit_reinforce_logs_entry
   - test_recent_audit_returns_cross_memory_entries
   - test_emit_alert_writes_log_file
   - test_emit_alert_invalid_severity_falls_back_to_info
   - test_emit_alert_fails_open_on_log_write_error

Deferred: formal migration framework with rollback (current additive pattern is fine for V1); memory detail wiki page with audit view (quick follow-up).

To enable Discord alerts: set ATOCORE_ALERT_WEBHOOK to a Discord webhook URL in Dalidou's environment. Default = log-only.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 21:54:10 -04:00
parent bfa7dba4de
commit 88f2f7c4e1
8 changed files with 777 additions and 37 deletions
--- a/src/atocore/api/routes.py
+++ b/src/atocore/api/routes.py
@@ -543,17 +543,33 @@ def api_update_memory(memory_id: str, req: MemoryUpdateRequest) -> dict:
@router.delete("/memory/{memory_id}")
 def api_invalidate_memory(memory_id: str) -> dict:
    """Invalidate a memory (error correction)."""
-    success = invalidate_memory(memory_id)
+    success = invalidate_memory(memory_id, actor="api-http")
    if not success:
        raise HTTPException(status_code=404, detail="Memory not found")
    return {"status": "invalidated", "id": memory_id}


+@router.get("/memory/{memory_id}/audit")
+def api_memory_audit(memory_id: str, limit: int = 100) -> dict:
+    """Return the audit history for a specific memory (newest first)."""
+    from atocore.memory.service import get_memory_audit
+    entries = get_memory_audit(memory_id, limit=limit)
+    return {"memory_id": memory_id, "entries": entries, "count": len(entries)}
+
+
+@router.get("/admin/audit/recent")
+def api_recent_audit(limit: int = 50) -> dict:
+    """Return recent memory_audit entries across all memories (newest first)."""
+    from atocore.memory.service import get_recent_audit
+    entries = get_recent_audit(limit=limit)
+    return {"entries": entries, "count": len(entries)}
+
+
@router.post("/memory/{memory_id}/promote")
 def api_promote_memory(memory_id: str) -> dict:
    """Promote a candidate memory to active (Phase 9 Commit C)."""
    try:
-        success = promote_memory(memory_id)
+        success = promote_memory(memory_id, actor="api-http")
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    if not success:
@@ -567,7 +583,7 @@ def api_promote_memory(memory_id: str) -> dict:
@router.post("/memory/{memory_id}/reject")
 def api_reject_candidate_memory(memory_id: str) -> dict:
    """Reject a candidate memory (Phase 9 Commit C review queue)."""
-    success = reject_candidate_memory(memory_id)
+    success = reject_candidate_memory(memory_id, actor="api-http")
    if not success:
        raise HTTPException(
            status_code=404,
@@ -1046,33 +1062,47 @@ def api_dashboard() -> dict:
    # Pipeline health from project state
    pipeline: dict = {}
    extract_state: dict = {}
+    integrity: dict = {}
+    alerts: dict = {}
    try:
        state_entries = get_state("atocore")
        for entry in state_entries:
-            if entry.category != "status":
-                continue
-            if entry.key == "last_extract_batch_run":
-                extract_state["last_run"] = entry.value
-            elif entry.key == "pipeline_last_run":
-                pipeline["last_run"] = entry.value
+            if entry.category == "status":
+                if entry.key == "last_extract_batch_run":
+                    extract_state["last_run"] = entry.value
+                elif entry.key == "pipeline_last_run":
+                    pipeline["last_run"] = entry.value
+                    try:
+                        last = _dt.fromisoformat(entry.value.replace("Z", "+00:00"))
+                        delta = _dt.now(_tz.utc) - last
+                        pipeline["hours_since_last_run"] = round(
+                            delta.total_seconds() / 3600, 1
+                        )
+                    except Exception:
+                        pass
+                elif entry.key == "pipeline_summary":
+                    try:
+                        pipeline["summary"] = _json.loads(entry.value)
+                    except Exception:
+                        pipeline["summary_raw"] = entry.value
+                elif entry.key == "retrieval_harness_result":
+                    try:
+                        pipeline["harness"] = _json.loads(entry.value)
+                    except Exception:
+                        pipeline["harness_raw"] = entry.value
+                elif entry.key == "integrity_check_result":
+                    try:
+                        integrity = _json.loads(entry.value)
+                    except Exception:
+                        pass
+            elif entry.category == "alert":
+                # keys like "last_info", "last_warning", "last_critical"
                try:
-                    last = _dt.fromisoformat(entry.value.replace("Z", "+00:00"))
-                    delta = _dt.now(_tz.utc) - last
-                    pipeline["hours_since_last_run"] = round(
-                        delta.total_seconds() / 3600, 1
-                    )
+                    payload = _json.loads(entry.value)
                except Exception:
-                    pass
-            elif entry.key == "pipeline_summary":
-                try:
-                    pipeline["summary"] = _json.loads(entry.value)
-                except Exception:
-                    pipeline["summary_raw"] = entry.value
-            elif entry.key == "retrieval_harness_result":
-                try:
-                    pipeline["harness"] = _json.loads(entry.value)
-                except Exception:
-                    pipeline["harness_raw"] = entry.value
+                    payload = {"raw": entry.value}
+                severity = entry.key.replace("last_", "")
+                alerts[severity] = payload
    except Exception:
        pass

@@ -1107,6 +1137,14 @@ def api_dashboard() -> dict:
    elif len(candidates) > 20:
        triage["notice"] = f"{len(candidates)} candidates awaiting triage."

+    # Recent audit activity (Phase 4 V1) — last 10 mutations for operator
+    recent_audit: list[dict] = []
+    try:
+        from atocore.memory.service import get_recent_audit as _gra
+        recent_audit = _gra(limit=10)
+    except Exception:
+        pass
+
    return {
        "memories": {
            "active": len(active),
@@ -1123,6 +1161,9 @@ def api_dashboard() -> dict:
        "extraction_pipeline": extract_state,
        "pipeline": pipeline,
        "triage": triage,
+        "integrity": integrity,
+        "alerts": alerts,
+        "recent_audit": recent_audit,
    }