feat: Phase 4 V1 — Robustness Hardening

Adds the observability + safety layer that turns AtoCore from "works until something silently breaks" into "every mutation is traceable, drift is detected, failures raise alerts." 1. Audit log (memory_audit table): - New table with id, memory_id, action, actor, before/after JSON, note, timestamp; 3 indexes for memory_id/timestamp/action - _audit_memory() helper called from every mutation: create_memory, update_memory, promote_memory, reject_candidate_memory, invalidate_memory, supersede_memory, reinforce_memory, auto_promote_reinforced, expire_stale_candidates - Action verb auto-selected: promoted/rejected/invalidated/superseded/updated based on state transition - "actor" threaded through: api-http, human-triage, phase10-auto-promote, candidate-expiry, reinforcement, etc. - Fail-open: audit write failure logs but never breaks the mutation - GET /memory/{id}/audit: full history for one memory - GET /admin/audit/recent: last 50 mutations across the system 2. Alerts framework (src/atocore/observability/alerts.py): - emit_alert(severity, title, message, context) fans out to: - structlog logger (always) - ~/atocore-logs/alerts.log append (configurable via ATOCORE_ALERT_LOG) - project_state atocore/alert/last_{severity} (dashboard surface) - ATOCORE_ALERT_WEBHOOK POST if set (auto-detects Discord webhook format for nice embeds; generic JSON otherwise) - Every sink fail-open — one failure doesn't prevent the others - Pipeline alert step in nightly cron: harness < 85% → warning; candidate queue > 200 → warning 3. Integrity checks (scripts/integrity_check.py): - Nightly scan for drift: - Memories → missing source_chunk_id references - Duplicate active memories (same type+content+project) - project_state → missing projects - Orphaned source_chunks (no parent document) - Results persisted to atocore/status/integrity_check_result - Any finding emits a warning alert - Added as Step G in deploy/dalidou/batch-extract.sh nightly cron 4. 
Dashboard surfaces it all: - integrity (findings + details) - alerts (last info/warning/critical per severity) - recent_audit (last 10 mutations with actor + action + preview) Tests: 308 → 317 (9 new): - test_audit_create_logs_entry - test_audit_promote_logs_entry - test_audit_reject_logs_entry - test_audit_update_captures_before_after - test_audit_reinforce_logs_entry - test_recent_audit_returns_cross_memory_entries - test_emit_alert_writes_log_file - test_emit_alert_invalid_severity_falls_back_to_info - test_emit_alert_fails_open_on_log_write_error Deferred: formal migration framework with rollback (current additive pattern is fine for V1); memory detail wiki page with audit view (quick follow-up). To enable Discord alerts: set ATOCORE_ALERT_WEBHOOK to a Discord webhook URL in Dalidou's environment. Default = log-only. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 21:54:10 -04:00
parent bfa7dba4de
commit 88f2f7c4e1
8 changed files with 777 additions and 37 deletions
--- a/tests/test_alerts.py
+++ b/tests/test_alerts.py
@@ -0,0 +1,58 @@
+"""Tests for the Phase 4 alerts framework."""
+
+from __future__ import annotations
+
+import os
+import tempfile
+from pathlib import Path
+
+import pytest
+
+import atocore.config as _config
+
+
+@pytest.fixture(autouse=True)
+def isolated_env(monkeypatch, tmp_path):
+    """Isolate alert sinks, data dir and settings per test; yields tmpdir + log_file."""
+    # tmp_path is cleaned up by pytest, whereas tempfile.mkdtemp() leaked a dir per test.
+    log_file = tmp_path / "alerts.log"
+    monkeypatch.setenv("ATOCORE_ALERT_LOG", str(log_file))
+    monkeypatch.delenv("ATOCORE_ALERT_WEBHOOK", raising=False)
+
+    # Data dir for any state writes; setattr restores the global settings on teardown.
+    monkeypatch.setenv("ATOCORE_DATA_DIR", str(tmp_path))
+    monkeypatch.setattr(_config, "settings", _config.Settings())
+
+    from atocore.models.database import init_db
+    init_db()
+
+    yield {"tmpdir": str(tmp_path), "log_file": log_file}
+
+
+def test_emit_alert_writes_log_file(isolated_env):
+    """A warning alert is written to the log file named by ATOCORE_ALERT_LOG."""
+    from atocore.observability.alerts import emit_alert
+
+    emit_alert("warning", "test title", "test message body", context={"count": 5})
+
+    logged = isolated_env["log_file"].read_text(encoding="utf-8")
+    # Title, body, upper-cased severity and the rendered context must all be present.
+    for fragment in ("test title", "test message body", "WARNING", '"count": 5'):
+        assert fragment in logged
+
+
+def test_emit_alert_invalid_severity_falls_back_to_info(isolated_env):
+    """An unrecognised severity is expected to show up in the log as INFO."""
+    from atocore.observability.alerts import emit_alert
+
+    emit_alert("made-up-severity", "t", "m")
+    assert "INFO" in isolated_env["log_file"].read_text(encoding="utf-8")
+
+
+def test_emit_alert_fails_open_on_log_write_error(monkeypatch, isolated_env):
+    """An unwritable log path should not crash the emit (path is nested under a regular file)."""
+    from atocore.observability.alerts import emit_alert
+
+    isolated_env["log_file"].write_text("", encoding="utf-8")  # a file cannot act as a parent directory
+    monkeypatch.setenv("ATOCORE_ALERT_LOG", str(isolated_env["log_file"] / "sub" / "alerts.log"))
+    emit_alert("info", "t", "m")  # Must not raise, even when the suite runs as root
--- a/tests/test_memory.py
+++ b/tests/test_memory.py
@@ -264,6 +264,82 @@ def test_expire_stale_candidates(isolated_db):
    assert mem["status"] == "invalid"


+# --- Phase 4: memory_audit log ---
+
+
+def test_audit_create_logs_entry(isolated_db):
+    """Creating a memory records a "created" audit entry carrying actor and content."""
+    from atocore.memory.service import create_memory, get_memory_audit
+
+    created = create_memory("knowledge", "test content for audit", actor="test-harness")
+    history = get_memory_audit(created.id)
+    assert len(history) >= 1
+    first = history[0]
+    assert (first["action"], first["actor"]) == ("created", "test-harness")
+    assert first["after"]["content"] == "test content for audit"
+
+
+def test_audit_promote_logs_entry(isolated_db):
+    """Promoting a candidate logs a "promoted" entry with the candidate -> active transition."""
+    from atocore.memory.service import create_memory, get_memory_audit, promote_memory
+
+    candidate = create_memory("knowledge", "candidate for promote", status="candidate")
+    promote_memory(candidate.id, actor="test-triage")
+    history = get_memory_audit(candidate.id)
+    promoted = [entry for entry in history if entry["action"] == "promoted"]
+    assert promoted  # at least one "promoted" entry was written
+    entry = promoted[0]
+    assert entry["actor"] == "test-triage"
+    assert (entry["before"]["status"], entry["after"]["status"]) == ("candidate", "active")
+
+
+def test_audit_reject_logs_entry(isolated_db):
+    """Rejecting a candidate logs a "rejected" entry that keeps the caller's note."""
+    from atocore.memory.service import create_memory, get_memory_audit, reject_candidate_memory
+
+    candidate = create_memory("knowledge", "candidate for reject", status="candidate")
+    reject_candidate_memory(candidate.id, actor="test-triage", note="stale")
+    history = get_memory_audit(candidate.id)
+    rejected = [entry for entry in history if entry["action"] == "rejected"]
+    assert rejected  # at least one "rejected" entry was written
+    assert rejected[0]["note"] == "stale"
+
+
+def test_audit_update_captures_before_after(isolated_db):
+    """An update's audit entry snapshots content and confidence before and after the edit."""
+    from atocore.memory.service import create_memory, get_memory_audit, update_memory
+
+    original = create_memory("knowledge", "original content", confidence=0.5)
+    update_memory(original.id, content="updated content", confidence=0.9, actor="human-edit")
+    updates = [e for e in get_memory_audit(original.id) if e["action"] == "updated"]
+    assert len(updates) >= 1
+    before, after = updates[0]["before"], updates[0]["after"]
+    assert before["content"] == "original content"
+    assert after["content"] == "updated content"
+    assert before["confidence"] == 0.5
+    assert after["confidence"] == 0.9
+
+
+def test_audit_reinforce_logs_entry(isolated_db):
+    """Reinforcing a memory adds a "reinforced" entry to its audit history."""
+    from atocore.memory.service import create_memory, get_memory_audit, reinforce_memory
+
+    target = create_memory("knowledge", "reinforced mem", confidence=0.5)
+    reinforce_memory(target.id, confidence_delta=0.02)
+    # Only the presence of the action is checked; no actor is passed in this test.
+    assert "reinforced" in [entry["action"] for entry in get_memory_audit(target.id)]
+
+
+def test_recent_audit_returns_cross_memory_entries(isolated_db):
+    """The recent-audit feed includes entries for both freshly created memories."""
+    from atocore.memory.service import create_memory, get_recent_audit
+
+    first = create_memory("knowledge", "mem one content", actor="harness")
+    second = create_memory("knowledge", "mem two content", actor="harness")
+    seen_ids = {entry["memory_id"] for entry in get_recent_audit(limit=10)}
+    assert {first.id, second.id} <= seen_ids
+
+
 # --- Phase 3: domain_tags + valid_until ---