171 lines
5.4 KiB
Python
171 lines
5.4 KiB
Python
|
|
"""Alert emission framework (Phase 4 Robustness V1).
|
||
|
|
|
||
|
|
One-stop helper to raise operational alerts from any AtoCore code
|
||
|
|
path. An alert is a structured message about something the operator
|
||
|
|
should see — harness regression, queue pileup, integrity drift,
|
||
|
|
pipeline skipped, etc.
|
||
|
|
|
||
|
|
Emission fans out to multiple sinks so a single call touches every
|
||
|
|
observability channel:
|
||
|
|
|
||
|
|
1. structlog logger (always)
|
||
|
|
2. Append to ``$ATOCORE_ALERT_LOG`` (default ~/atocore-logs/alerts.log)
|
||
|
|
3. Write the last alert of each severity to AtoCore project state
|
||
|
|
(atocore/alert/last_{severity}) so the dashboard can surface it
|
||
|
|
4. POST to ``$ATOCORE_ALERT_WEBHOOK`` if set (Discord/Slack/generic)
|
||
|
|
|
||
|
|
All sinks are fail-open — if one fails the others still fire.
|
||
|
|
|
||
|
|
Severity levels (inspired by syslog but simpler):
|
||
|
|
- ``info`` operational event worth noting
|
||
|
|
- ``warning`` degraded state, service still works
|
||
|
|
- ``critical`` something is broken and needs attention
|
||
|
|
|
||
|
|
Environment variables:
|
||
|
|
ATOCORE_ALERT_LOG override the alerts log file path
|
||
|
|
ATOCORE_ALERT_WEBHOOK POST JSON alerts here (Discord webhook, etc.)
|
||
|
|
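    ATOCORE_BASE_URL       AtoCore API for project-state write (default localhost:8100).
                           NOTE: not read by the code in this module — the state sink
                           calls ``set_state`` in-process instead of going through HTTP.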
|
||
|
|
"""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import threading
|
||
|
|
import urllib.error
|
||
|
|
import urllib.request
|
||
|
|
from datetime import datetime, timezone
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
from atocore.observability.logger import get_logger
|
||
|
|
|
||
|
|
# Structured logger shared by every sink in this module; failures of the
# individual sinks are reported through it as warnings.
log = get_logger("alerts")

# The only severity names the framework recognises. ``emit_alert`` maps
# anything outside this set to "info".
SEVERITIES = {
    "info",
    "warning",
    "critical",
}
|
||
|
|
|
||
|
|
|
||
|
|
def _default_alert_log() -> Path:
    """Return the location of the alerts log file.

    A non-empty ``ATOCORE_ALERT_LOG`` environment variable takes precedence;
    otherwise the file is ``atocore-logs/alerts.log`` under the current
    user's home directory.
    """
    override = os.environ.get("ATOCORE_ALERT_LOG")
    # An unset *or* empty variable falls through to the home-directory default.
    return Path(override) if override else (Path.home() / "atocore-logs" / "alerts.log")
|
||
|
|
|
||
|
|
|
||
|
|
def _append_log(severity: str, title: str, message: str, context: dict | None) -> None:
    """Append one human-readable line describing the alert to the alerts log.

    Line format::

        [<UTC timestamp>] [<SEVERITY>] <title>: <message> <context-json>

    The context part is only present for a non-empty *context* and is
    capped at 500 characters.

    Args:
        severity: Severity name; written upper-cased.
        title: Short alert headline.
        message: Alert body.
        context: Optional extra key/value data appended as JSON.

    Best-effort: every failure (unresolvable path, unwritable directory,
    I/O error) is reported via ``log.warning`` under the event
    ``alert_log_write_failed`` and never propagates to the caller.
    """
    try:
        # Resolve the path inside the ``try`` so a failure to determine the
        # home directory is logged like any other write failure.
        path = _default_alert_log()
        path.parent.mkdir(parents=True, exist_ok=True)
        ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
        line = f"[{ts}] [{severity.upper()}] {title}: {message}"
        if context:
            try:
                # ``default=str`` stringifies values json cannot encode
                # natively (datetime, Path, exceptions, ...) so a rich
                # context does not cause the whole alert line to be dropped.
                encoded = json.dumps(context, ensure_ascii=True, default=str)
            except (TypeError, ValueError):
                # Unsupported key types or circular references: fall back
                # to repr() rather than losing the alert.
                encoded = repr(context)
            line += f" {encoded[:500]}"
        with open(path, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except Exception as e:
        log.warning("alert_log_write_failed", error=str(e))
|
||
|
|
|
||
|
|
|
||
|
|
def _write_state(severity: str, title: str, message: str, ts: str) -> None:
    """Store the latest alert of the given severity in AtoCore project state.

    The entry is written to project ``atocore``, category ``alert``, key
    ``last_<severity>``. Its value is a JSON string carrying the title, the
    first 400 characters of the message, and the timestamp.

    ``set_state`` is imported lazily and invoked in-process instead of
    going through the HTTP API, so this also works when atocore is used as
    a library (for example from cron scripts) and the API is not running.

    Any failure, including the import itself, is logged as
    ``alert_state_write_failed`` and swallowed.
    """
    try:
        from atocore.context.project_state import set_state

        state_kwargs = dict(
            project_name="atocore",
            category="alert",
            key=f"last_{severity}",
            value=json.dumps(
                {"title": title, "message": message[:400], "timestamp": ts}
            ),
            source="alert framework",
        )
        set_state(**state_kwargs)
    except Exception as exc:
        log.warning("alert_state_write_failed", error=str(exc))
|
||
|
|
|
||
|
|
|
||
|
|
def _build_webhook_body(
    url: str,
    severity: str,
    title: str,
    message: str,
    context: dict | None,
    ts: str,
) -> dict:
    """Build the JSON payload for *url*: Discord-shaped or a generic dict."""
    extra = context or {}
    is_discord = (
        "discord.com/api/webhooks" in url or "discordapp.com/api/webhooks" in url
    )
    if not is_discord:
        return {
            "severity": severity,
            "title": title,
            "message": message,
            "context": extra,
            "timestamp": ts,
        }

    emoji = {
        "info": ":information_source:",
        "warning": ":warning:",
        "critical": ":rotating_light:",
    }.get(severity, "")
    # Discord rejects the whole message when a limit is exceeded (content
    # 2000 chars, field name 256 chars) or when a field name/value is empty,
    # so clamp and substitute a placeholder instead of losing the alert.
    fields = [
        {"name": str(k)[:256] or "-", "value": str(v)[:200] or "-", "inline": True}
        for k, v in list(extra.items())[:10]
    ]
    return {
        "content": f"{emoji} **AtoCore {severity}**: {title}"[:2000],
        "embeds": [{
            "description": message[:1800],
            "timestamp": ts,
            "fields": fields,
        }],
    }


def _post_webhook(severity: str, title: str, message: str, context: dict | None, ts: str) -> None:
    """POST the alert as JSON to ``$ATOCORE_ALERT_WEBHOOK`` in the background.

    Does nothing when the environment variable is unset or empty. URLs that
    look like Discord webhooks receive a ``content`` + ``embeds`` payload
    (at most 10 context fields); every other URL receives a flat dict with
    ``severity``, ``title``, ``message``, ``context`` and ``timestamp``.

    Args:
        severity: Severity name (selects the emoji for Discord).
        title: Short alert headline.
        message: Alert body.
        context: Optional extra key/value data.
        ts: Timestamp string forwarded verbatim in the payload.

    The payload is serialised in the calling thread, so later mutation of
    *context* by the caller cannot race with the send. The HTTP request
    itself runs on a daemon thread with an 8 second timeout. Failures at
    either stage are logged as ``alert_webhook_failed`` and never raised.
    """
    url = os.environ.get("ATOCORE_ALERT_WEBHOOK")
    if not url:
        return

    try:
        body = _build_webhook_body(url, severity, title, message, context, ts)
        # ``default=str`` keeps non-JSON-native context values (datetime,
        # Path, ...) from aborting the delivery.
        payload = json.dumps(body, default=str).encode("utf-8")
    except Exception as e:
        log.warning("alert_webhook_failed", error=str(e))
        return

    def _fire() -> None:
        try:
            req = urllib.request.Request(
                url,
                data=payload,
                method="POST",
                headers={
                    "Content-Type": "application/json",
                    # Discord's API reference asks HTTP clients to identify
                    # themselves with a User-Agent. NOTE(review): urllib's
                    # default agent is reportedly rejected by some webhook
                    # hosts — confirm against the configured endpoint.
                    "User-Agent": "AtoCore-Alerts/1.0",
                },
            )
            # Context manager closes the response (and its socket) promptly.
            with urllib.request.urlopen(req, timeout=8):
                pass
        except Exception as e:
            log.warning("alert_webhook_failed", error=str(e))

    # NOTE(review): a daemon thread is abandoned at interpreter exit, so a
    # short-lived script may terminate before the POST completes.
    threading.Thread(target=_fire, daemon=True).start()
|
||
|
|
|
||
|
|
|
||
|
|
def _report_sink_failure(sink: str, exc: Exception) -> None:
    """Log that *sink* failed without ever raising back into ``emit_alert``."""
    try:
        log.warning("alert_sink_failed", sink=sink, error=str(exc))
    except Exception:
        # The logger itself is unusable; there is nowhere left to report to,
        # and raising here would defeat the fail-open guarantee.
        pass


def emit_alert(
    severity: str,
    title: str,
    message: str,
    context: dict | None = None,
) -> None:
    """Emit an alert to all configured sinks.

    Sinks, in order: the structured logger, the alerts log file, the
    AtoCore project state, and the optional webhook.

    Args:
        severity: ``info``, ``warning`` or ``critical`` (case-insensitive).
            Empty or unrecognised values are treated as ``info``.
        title: Short alert headline.
        message: Alert body; the structured-log sink truncates it to 500
            characters. ``None`` is treated as an empty string.
        context: Optional extra key/value data attached to the alert.

    Fail-open: a failure in any single sink — including the structured
    logger — is reported as ``alert_sink_failed`` and does not prevent the
    remaining sinks from firing. This function does not raise for sink
    failures.
    """
    severity = str(severity or "info").strip().lower()
    if severity not in SEVERITIES:
        severity = "info"
    # Coerce once so that slicing and JSON encoding in the sinks cannot trip
    # over ``None`` or non-string values.
    title = "" if title is None else str(title)
    message = "" if message is None else str(message)

    ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Sink 1: structured logger.
    try:
        fields = {"title": title, "message": message[:500]}
        for key, value in (context or {}).items():
            name = str(key)
            # Context keys that would collide with the logger's positional
            # ``event`` argument or with our own fields would raise
            # "got multiple values for keyword argument"; namespace them.
            if name == "event" or name in fields:
                name = f"context_{name}"
            fields[name] = value
        logger_fn = {
            "info": log.info,
            "warning": log.warning,
            "critical": log.error,
        }.get(severity, log.info)
        logger_fn("alert", **fields)
    except Exception as e:
        _report_sink_failure("structlog", e)

    # Sinks 2-4: each isolated so that one failure cannot block the others.
    sinks = (
        ("alert_log", lambda: _append_log(severity, title, message, context)),
        ("project_state", lambda: _write_state(severity, title, message, ts)),
        ("webhook", lambda: _post_webhook(severity, title, message, context, ts)),
    )
    for sink_name, fire in sinks:
        try:
            fire()
        except Exception as e:
            _report_sink_failure(sink_name, e)
|