ATOCore/src/atocore/observability/alerts.py

171 lines
5.4 KiB
Python

feat: Phase 4 V1 — Robustness Hardening

Adds the observability + safety layer that turns AtoCore from "works until something silently breaks" into "every mutation is traceable, drift is detected, failures raise alerts."

1. Audit log (memory_audit table):
   - New table with id, memory_id, action, actor, before/after JSON, note, timestamp; 3 indexes for memory_id/timestamp/action
   - _audit_memory() helper called from every mutation: create_memory, update_memory, promote_memory, reject_candidate_memory, invalidate_memory, supersede_memory, reinforce_memory, auto_promote_reinforced, expire_stale_candidates
   - Action verb auto-selected: promoted/rejected/invalidated/superseded/updated based on state transition
   - "actor" threaded through: api-http, human-triage, phase10-auto-promote, candidate-expiry, reinforcement, etc.
   - Fail-open: an audit write failure is logged but never breaks the mutation
   - GET /memory/{id}/audit: full history for one memory
   - GET /admin/audit/recent: last 50 mutations across the system

2. Alerts framework (src/atocore/observability/alerts.py):
   - emit_alert(severity, title, message, context) fans out to:
     - structlog logger (always)
     - ~/atocore-logs/alerts.log append (configurable via ATOCORE_ALERT_LOG)
     - project_state atocore/alert/last_{severity} (dashboard surface)
     - ATOCORE_ALERT_WEBHOOK POST if set (auto-detects Discord webhook format for nice embeds; generic JSON otherwise)
   - Every sink is fail-open: one failure doesn't prevent the others
   - Pipeline alert step in nightly cron: harness < 85% → warning; candidate queue > 200 → warning

3. Integrity checks (scripts/integrity_check.py):
   - Nightly scan for drift:
     - Memories with missing source_chunk_id references
     - Duplicate active memories (same type+content+project)
     - project_state entries pointing at missing projects
     - Orphaned source_chunks (no parent document)
   - Results persisted to atocore/status/integrity_check_result
   - Any finding emits a warning alert
   - Added as Step G in deploy/dalidou/batch-extract.sh nightly cron

4. Dashboard surfaces it all:
   - integrity (findings + details)
   - alerts (last info/warning/critical per severity)
   - recent_audit (last 10 mutations with actor + action + preview)

Tests: 308 → 317 (9 new):
- test_audit_create_logs_entry
- test_audit_promote_logs_entry
- test_audit_reject_logs_entry
- test_audit_update_captures_before_after
- test_audit_reinforce_logs_entry
- test_recent_audit_returns_cross_memory_entries
- test_emit_alert_writes_log_file
- test_emit_alert_invalid_severity_falls_back_to_info
- test_emit_alert_fails_open_on_log_write_error

Deferred: formal migration framework with rollback (the current additive pattern is fine for V1); memory detail wiki page with audit view (quick follow-up).

To enable Discord alerts: set ATOCORE_ALERT_WEBHOOK to a Discord webhook URL in Dalidou's environment. Default = log-only.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 21:54:10 -04:00
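One of the integrity-check queries from item 3, duplicate active memories, can be sketched as a single GROUP BY. The memories table and its column names here are assumptions based on the commit message, not the real AtoCore schema:

```python
# Sketch of the duplicate-active-memories drift check: any group of more
# than one active memory sharing type + content + project is a finding.
import sqlite3


def find_duplicate_active_memories(conn: sqlite3.Connection) -> list[tuple]:
    """Return (type, content, project, count) for every duplicated group."""
    return conn.execute(
        """
        SELECT type, content, project, COUNT(*) AS n
        FROM memories
        WHERE state = 'active'
        GROUP BY type, content, project
        HAVING COUNT(*) > 1
        """
    ).fetchall()
```

An empty result means no drift; a non-empty one would feed the warning alert the commit describes.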
"""Alert emission framework (Phase 4 Robustness V1).
One-stop helper to raise operational alerts from any AtoCore code
path. An alert is a structured message about something the operator
should see harness regression, queue pileup, integrity drift,
pipeline skipped, etc.
Emission fans out to multiple sinks so a single call touches every
observability channel:
1. structlog logger (always)
2. Append to ``$ATOCORE_ALERT_LOG`` (default ~/atocore-logs/alerts.log)
3. Write the last alert of each severity to AtoCore project state
(atocore/alert/last_{severity}) so the dashboard can surface it
4. POST to ``$ATOCORE_ALERT_WEBHOOK`` if set (Discord/Slack/generic)
All sinks are fail-open if one fails the others still fire.
Severity levels (inspired by syslog but simpler):
- ``info`` operational event worth noting
- ``warning`` degraded state, service still works
- ``critical`` something is broken and needs attention
Environment variables:
ATOCORE_ALERT_LOG override the alerts log file path
ATOCORE_ALERT_WEBHOOK POST JSON alerts here (Discord webhook, etc.)
ATOCORE_BASE_URL AtoCore API for project-state write (default localhost:8100)
"""
from __future__ import annotations

import json
import os
import threading
import urllib.error
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

from atocore.observability.logger import get_logger

log = get_logger("alerts")

SEVERITIES = {"info", "warning", "critical"}


def _default_alert_log() -> Path:
    explicit = os.environ.get("ATOCORE_ALERT_LOG")
    if explicit:
        return Path(explicit)
    return Path.home() / "atocore-logs" / "alerts.log"


def _append_log(severity: str, title: str, message: str, context: dict | None) -> None:
    path = _default_alert_log()
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        line = f"[{ts}] [{severity.upper()}] {title}: {message}"
        if context:
            line += f" {json.dumps(context, ensure_ascii=True)[:500]}"
        line += "\n"
        with open(path, "a", encoding="utf-8") as f:
            f.write(line)
    except Exception as e:
        log.warning("alert_log_write_failed", error=str(e))


def _write_state(severity: str, title: str, message: str, ts: str) -> None:
    """Record the most-recent alert per severity into project_state.

    Uses the internal ``set_state`` helper directly so it works even
    when the HTTP API isn't available (e.g. called from cron scripts
    that import atocore as a library).
    """
    try:
        from atocore.context.project_state import set_state

        set_state(
            project_name="atocore",
            category="alert",
            key=f"last_{severity}",
            value=json.dumps({"title": title, "message": message[:400], "timestamp": ts}),
            source="alert framework",
        )
    except Exception as e:
        log.warning("alert_state_write_failed", error=str(e))


def _post_webhook(severity: str, title: str, message: str, context: dict | None, ts: str) -> None:
    url = os.environ.get("ATOCORE_ALERT_WEBHOOK")
    if not url:
        return
    # Auto-detect Discord webhook shape for nicer formatting
    if "discord.com/api/webhooks" in url or "discordapp.com/api/webhooks" in url:
        emoji = {"info": ":information_source:", "warning": ":warning:", "critical": ":rotating_light:"}.get(severity, "")
        body = {
            "content": f"{emoji} **AtoCore {severity}**: {title}",
            "embeds": [{
                "description": message[:1800],
                "timestamp": ts,
                "fields": [
                    {"name": k, "value": str(v)[:200], "inline": True}
                    for k, v in (context or {}).items()
                ][:10],
            }],
        }
    else:
        body = {
            "severity": severity,
            "title": title,
            "message": message,
            "context": context or {},
            "timestamp": ts,
        }

    def _fire():
        try:
            req = urllib.request.Request(
                url,
                data=json.dumps(body).encode("utf-8"),
                method="POST",
                headers={"Content-Type": "application/json"},
            )
            urllib.request.urlopen(req, timeout=8)
        except Exception as e:
            log.warning("alert_webhook_failed", error=str(e))

    threading.Thread(target=_fire, daemon=True).start()


def emit_alert(
    severity: str,
    title: str,
    message: str,
    context: dict | None = None,
) -> None:
    """Emit an alert to all configured sinks.

    Fail-open: any single sink failure is logged but does not prevent
    other sinks from firing.
    """
    severity = (severity or "info").lower()
    if severity not in SEVERITIES:
        severity = "info"
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Sink 1: structlog — always
    logger_fn = {
        "info": log.info,
        "warning": log.warning,
        "critical": log.error,
    }[severity]
    logger_fn("alert", title=title, message=message[:500], **(context or {}))

    # Sinks 2-4: fail-open, each wrapped
    try:
        _append_log(severity, title, message, context)
    except Exception:
        pass
    try:
        _write_state(severity, title, message, ts)
    except Exception:
        pass
    try:
        _post_webhook(severity, title, message, context, ts)
    except Exception:
        pass
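
The one pattern the module leans on throughout is per-sink isolation: every sink call is individually wrapped so a failing sink can never block the rest. A standalone sketch of that fan-out (the function names here are illustrative, not part of the module):

```python
# Sketch of the fail-open fan-out that emit_alert implements with its
# per-sink try/except blocks: call every sink, swallow per-sink failures.
from typing import Callable


def fan_out(sinks: list[Callable[..., None]], *args) -> int:
    """Call every sink with the same arguments, swallowing per-sink
    failures. Returns the number of sinks that succeeded."""
    ok = 0
    for sink in sinks:
        try:
            sink(*args)
            ok += 1
        except Exception:
            pass  # fail-open: one broken sink never blocks the others
    return ok
```

The trade-off is deliberate: alerting exists to surface failures, so it must not itself become a failure mode; silent loss of one channel is acceptable as long as the remaining channels still fire.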