Files
Atomizer/hq/workspaces/shared/skills/orchestrate/orchestrate.py

611 lines
23 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Atomizer HQ Orchestration Engine — Phase 1b
Synchronous delegation with file-based handoffs, inotify, validation, retries, error handling.
Usage:
python3 orchestrate.py <agent> "<task>" [options]
Options:
--wait Block until agent completes (default: true)
--timeout <sec> Max wait time per attempt (default: 300)
--format json|text Expected response format (default: json)
--context <file> Attach context file to the task
--no-deliver Don't post to Discord
--run-id <id> Custom run ID (default: auto-generated)
--retries <N> Max attempts, including the first (default: 1, max: 3)
--validate Strictly validate handoff fields (default: on)
--no-validate Disable strict handoff validation
--workflow-id <id> Workflow run ID (for tracing)
--step-id <id> Workflow step ID (for tracing)
--caller <agent> Calling agent (for ACL enforcement)
--channel-context <channel> Include recent Discord channel history as untrusted context
--channel-messages <N> Number of channel messages to fetch (default: 20, max: 30)
"""
import argparse
import json
import os
import subprocess
import sys
import time
import uuid
from pathlib import Path
# ── Constants ────────────────────────────────────────────────────────────────
# Directory where agents are told to write their `<run_id>.json` handoff files.
HANDOFF_DIR = Path("/home/papa/atomizer/handoffs")
# Directory holding the daily orchestration JSONL logs.
LOG_DIR = Path("/home/papa/atomizer/logs/orchestration")
# Agent registry consulted first when resolving an agent's port.
REGISTRY_PATH = Path("/home/papa/atomizer/workspaces/shared/AGENTS_REGISTRY.json")
# Location of this skill's helper scripts (e.g. fetch-channel-context.sh).
ORCHESTRATE_DIR = Path("/home/papa/atomizer/workspaces/shared/skills/orchestrate")
# Bearer token sent to each agent gateway's /hooks/agent endpoint.
# NOTE(review): this credential is committed in source. It can be overridden
# with the ATOMIZER_GATEWAY_TOKEN environment variable; the literal is kept
# only as a backward-compatible fallback and should be rotated and removed.
GATEWAY_TOKEN = (
    os.environ.get("ATOMIZER_GATEWAY_TOKEN")
    or "31422bb39bc9e7a4d34f789d8a7cbc582dece8dd170dadd1"
)
# Port map (fallback if registry unavailable)
AGENT_PORTS = {
    "manager": 18800,
    "tech-lead": 18804,
    "secretary": 18808,
    "auditor": 18812,
    "optimizer": 18816,
    "study-builder": 18820,
    "nx-expert": 18824,
    "webster": 18828,
}
# Delegation ACL — who can delegate to whom
DELEGATION_ACL = {
    "manager": ["tech-lead", "auditor", "optimizer", "study-builder", "nx-expert", "webster", "secretary"],
    "tech-lead": ["webster", "nx-expert", "study-builder", "secretary"],
    "optimizer": ["webster", "study-builder", "secretary"],
    # All others: no sub-delegation allowed
}
# Fields every handoff must carry (non-strict validation)
REQUIRED_FIELDS = ["status", "result"]
# Fields required when strict validation is enabled
STRICT_FIELDS = ["schemaVersion", "status", "result", "confidence", "timestamp"]
# Accepted values for deliverable.type in a handoff
DELIVERABLE_TYPES = ["document", "code", "analysis", "recommendation", "review", "data"]
# ── Helpers ──────────────────────────────────────────────────────────────────
def get_agent_port(agent: str) -> int:
    """Resolve an agent name to its gateway port.

    The registry file at ``REGISTRY_PATH`` is consulted first. If it is
    missing, unreadable, not valid JSON, shaped unexpectedly, or has no
    ``port`` entry for *agent*, the static ``AGENT_PORTS`` map is used.

    Args:
        agent: Agent name to look up.

    Returns:
        The port recorded for the agent.

    Exits:
        Prints an error to stderr and calls ``sys.exit(1)`` when neither the
        registry nor ``AGENT_PORTS`` knows the agent.
    """
    try:
        registry = json.loads(REGISTRY_PATH.read_text())
        agent_info = registry.get("agents", {}).get(agent)
        if agent_info and "port" in agent_info:
            return agent_info["port"]
    except (OSError, ValueError, KeyError, AttributeError, TypeError):
        # OSError: registry missing or unreadable. ValueError: invalid JSON
        # or undecodable bytes. AttributeError/TypeError: registry or agent
        # entry is not the expected mapping. In every case the registry is
        # treated as unavailable and the static map below decides.
        pass
    port = AGENT_PORTS.get(agent)
    if port is None:
        emit_error(f"Unknown agent '{agent}'")
        sys.exit(1)
    return port
def check_acl(caller: str | None, target: str) -> bool:
"""Check if caller is allowed to delegate to target."""
if caller is None:
return True # No caller specified = no ACL enforcement
if caller == target:
return False # No self-delegation
allowed = DELEGATION_ACL.get(caller)
if allowed is None:
return False # Agent not in ACL = cannot delegate
return target in allowed
def check_health(agent: str, port: int) -> bool:
    """Probe the gateway's ``/healthz`` endpoint on localhost.

    Runs ``curl`` with a 5-second timeout and reports True only when the
    printed HTTP status code is 200 or 204. Any failure to run the probe
    counts as unhealthy. *agent* is accepted but not used by the probe.
    """
    probe = [
        "curl", "-sf", "-o", "/dev/null", "-w", "%{http_code}",
        f"http://127.0.0.1:{port}/healthz",
    ]
    try:
        completed = subprocess.run(probe, capture_output=True, text=True, timeout=5)
    except Exception:
        # Timeout, missing curl binary, etc. — all mean "not reachable".
        return False
    http_code = completed.stdout.strip()
    return http_code in ("200", "204")
def send_task(agent: str, port: int, task: str, run_id: str,
              attempt: int = 1, prev_error: str | None = None,
              context: str | None = None, no_deliver: bool = False) -> bool:
    """Send a task to the agent via its gateway's /hooks/agent endpoint.

    The message instructs the agent to write its answer as JSON to
    ``HANDOFF_DIR/<run_id>.json`` in the schema shown in the prompt.

    Args:
        agent: Target agent name (also selects the ``dl-<agent>`` channel).
        port: Gateway port on 127.0.0.1.
        task: Task text to delegate.
        run_id: Run identifier; names the handoff file and the session key.
        attempt: 1-based attempt number.
        prev_error: Reason the previous attempt failed; included in the
            prompt on retries.
        context: Optional text prepended to the message under ``CONTEXT:``.
        no_deliver: When True the payload's ``deliver`` flag is False.

    Returns:
        True when curl exits with status 0, otherwise False. Failures to run
        curl at all are logged as ``send_error`` and also return False.
    """
    handoff_path = HANDOFF_DIR / f"{run_id}.json"
    # Build retry context if this is a retry
    retry_note = ""
    if attempt > 1 and prev_error:
        retry_note = f"\n⚠️ RETRY (attempt {attempt}): Previous attempt failed: {prev_error}\nPlease try again carefully.\n"
    message = f"""[ORCHESTRATED TASK — run_id: {run_id}]
{retry_note}
IMPORTANT: Answer this task DIRECTLY. Do NOT spawn sub-agents, Codex, or background processes.
Use your own knowledge and tools (web_search, web_fetch) directly. Keep your response focused and concise.
{task}
---
IMPORTANT: When you complete this task, write your response as a JSON file to:
{handoff_path}
Use this exact format:
```json
{{
"schemaVersion": "1.1",
"runId": "{run_id}",
"agent": "{agent}",
"status": "complete",
"result": "<your findings/output here>",
"deliverable": {{
"type": "<document|code|analysis|recommendation|review|data>",
"title": "<short title of what you produced>",
"path": "<path to artifact file, or null if result is self-contained>",
"summary": "<one-line summary of the deliverable>"
}},
"artifacts": [],
"confidence": "high|medium|low",
"notes": "<any caveats or open questions>",
"timestamp": "<ISO-8601 timestamp>"
}}
```
Status values: complete | partial | blocked | failed
⚠️ The "deliverable" block is MANDATORY. Every task must produce a concrete deliverable.
If your result is self-contained in "result", set deliverable.path to null and deliverable.type to "analysis" or "recommendation".
Write the file BEFORE posting to Discord. The orchestrator is waiting for it."""
    if context:
        message = f"CONTEXT:\n{context}\n\n{message}"
    # Map agent to their Discord delivery channel (dl-<agent>)
    discord_channel = f"channel:dl-{agent}"
    payload = {
        "message": message,
        "name": f"orchestrate:{run_id}",
        # Each attempt gets its own session key.
        "sessionKey": f"hook:orchestrate:{run_id}:{attempt}",
        "deliver": not no_deliver,
        "channel": "discord",
        "to": discord_channel,
        "wakeMode": "now",
        "timeoutSeconds": 600,
    }
    # json.dumps escapes non-ASCII by default, so the body is plain ASCII and
    # safe to pipe regardless of the locale encoding used for stdin.
    body = json.dumps(payload)
    try:
        # The body goes through stdin (`--data-binary @-`) rather than argv:
        # a large context or channel history can exceed the OS limit for a
        # single command-line argument and make every send fail.
        # NOTE(review): the bearer token is still passed on the command line
        # and is visible in the process list while curl runs.
        result = subprocess.run(
            ["curl", "-sf", "-X", "POST",
             f"http://127.0.0.1:{port}/hooks/agent",
             "-H", f"Authorization: Bearer {GATEWAY_TOKEN}",
             "-H", "Content-Type: application/json",
             "--data-binary", "@-"],
            input=body, capture_output=True, text=True, timeout=15
        )
        return result.returncode == 0
    except Exception as e:
        # Covers subprocess.TimeoutExpired as well as a missing curl binary.
        log_event(run_id, agent, "send_error", str(e), attempt=attempt)
        return False
def wait_for_handoff(run_id: str, timeout: int) -> dict | None:
    """Wait for ``HANDOFF_DIR/<run_id>.json`` to appear and return its content.

    Uses inotify (via the optional ``inotify_simple`` package) to watch the
    handoff directory. Falls back to polling when the package is not
    installed or the watch cannot be set up.

    Args:
        run_id: Run identifier naming the handoff file.
        timeout: Maximum time to wait, in seconds.

    Returns:
        The value produced by ``read_handoff`` once the file shows up, or
        None if the deadline passes first.
    """
    handoff_path = HANDOFF_DIR / f"{run_id}.json"
    # Check if already exists (agent was fast, or late arrival from prev attempt)
    if handoff_path.exists():
        return read_handoff(handoff_path)
    try:
        from inotify_simple import INotify, flags
    except ImportError:
        return poll_for_handoff(handoff_path, timeout)

    inotify = None
    try:
        inotify = INotify()
        watch_flags = flags.CREATE | flags.MOVED_TO | flags.CLOSE_WRITE
        inotify.add_watch(str(HANDOFF_DIR), watch_flags)
    except OSError:
        # e.g. the directory is missing or the inotify limits are exhausted:
        # degrade to polling instead of crashing the orchestrator.
        if inotify is not None:
            inotify.close()
        return poll_for_handoff(handoff_path, timeout)

    try:
        # Re-check now that the watch is active: a file created between the
        # first check and add_watch() generates no event, and would otherwise
        # only be noticed once the whole timeout had elapsed.
        if handoff_path.exists():
            return read_handoff(handoff_path)
        deadline = time.time() + timeout
        target_name = f"{run_id}.json"
        while time.time() < deadline:
            remaining = max(0.1, deadline - time.time())
            events = inotify.read(timeout=int(remaining * 1000))
            if any(event.name == target_name for event in events):
                time.sleep(0.3)  # Ensure file is fully written
                return read_handoff(handoff_path)
            # Direct check in case we missed the inotify event
            if handoff_path.exists():
                return read_handoff(handoff_path)
        return None
    finally:
        # Always release the inotify descriptor, including on exceptions.
        inotify.close()
def poll_for_handoff(handoff_path: Path, timeout: int) -> dict | None:
"""Fallback polling if inotify unavailable."""
deadline = time.time() + timeout
while time.time() < deadline:
if handoff_path.exists():
time.sleep(0.3)
return read_handoff(handoff_path)
time.sleep(2)
return None
def read_handoff(path: Path) -> dict | None:
"""Read and parse a handoff file."""
try:
raw = path.read_text().strip()
data = json.loads(raw)
return data
except json.JSONDecodeError:
return {
"status": "malformed",
"result": path.read_text()[:2000],
"notes": "Invalid JSON in handoff file",
"_raw": True,
}
except Exception as e:
return {
"status": "error",
"result": str(e),
"notes": f"Failed to read handoff file: {e}",
}
def validate_handoff(data: dict, strict: bool = False) -> tuple[bool, str]:
    """Check a handoff dict and return ``(valid, error_message)``.

    Non-strict mode requires the ``REQUIRED_FIELDS``; strict mode requires
    the ``STRICT_FIELDS`` and, for a ``complete`` status, a ``deliverable``
    dict with a known ``type`` and a ``summary``. ``failed`` and ``blocked``
    statuses are reported as invalid. The message is empty when valid.
    """
    if data is None:
        return False, "No handoff data"

    wanted = STRICT_FIELDS if strict else REQUIRED_FIELDS
    absent = [name for name in wanted if name not in data]
    if absent:
        return False, f"Missing fields: {', '.join(absent)}"

    status = data.get("status", "")
    if status == "failed":
        return False, f"Agent reported failure: {data.get('notes', 'no details')}"
    if status == "blocked":
        return False, f"Agent blocked: {data.get('notes', 'no details')}"
    if status not in ("complete", "partial"):
        return False, f"Invalid status: '{status}'"

    # Deliverable enforcement (schema v1.1+) applies only to strict, completed handoffs.
    if not strict or status != "complete":
        return True, ""

    deliverable = data.get("deliverable")
    if not (deliverable and isinstance(deliverable, dict)):
        return False, "Missing deliverable block — every completed task must include a deliverable"
    if not deliverable.get("type"):
        return False, "Deliverable missing 'type' field"
    if deliverable["type"] not in DELIVERABLE_TYPES:
        return False, f"Invalid deliverable type: '{deliverable['type']}' (valid: {', '.join(DELIVERABLE_TYPES)})"
    if not deliverable.get("summary"):
        return False, "Deliverable missing 'summary' field"
    return True, ""
def should_retry(result: dict | None, attempt: int, max_retries: int) -> tuple[bool, str]:
"""Decide whether to retry based on result and attempt count."""
if attempt >= max_retries:
return False, "Max retries reached"
if result is None:
return True, "timeout"
status = result.get("status", "")
if status == "malformed":
return True, "malformed response"
if status == "failed":
return True, f"agent failed: {result.get('notes', '')}"
if status == "partial" and result.get("confidence") == "low":
return True, "partial with low confidence"
if status == "error":
return True, f"error: {result.get('notes', '')}"
return False, ""
def clear_handoff(run_id: str):
    """Move an existing handoff file out of the way before a retry.

    The file is kept for debugging: ``<run_id>.json`` is renamed so that its
    final ``.json`` suffix becomes ``.prev.json``. Nothing happens when the
    file does not exist.
    """
    current = HANDOFF_DIR / f"{run_id}.json"
    if not current.exists():
        return
    archived = current.with_suffix(".prev.json")
    current.rename(archived)
def log_event(run_id: str, agent: str, event_type: str, detail: str = "",
              attempt: int = 1, elapsed_ms: int = 0, **extra):
    """Append one JSON line describing an orchestration event.

    Entries go to ``LOG_DIR/<YYYY-MM-DD>.jsonl`` (directory created on
    demand). *detail* is truncated to 500 characters and any *extra* keyword
    arguments are merged into the entry.
    """
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    # File name uses the local date; the entry timestamp below is UTC.
    day_stamp = time.strftime('%Y-%m-%d')
    destination = LOG_DIR / f"{day_stamp}.jsonl"

    record = {
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "runId": run_id,
        "agent": agent,
        "event": event_type,
        "detail": detail[:500],
        "attempt": attempt,
        "elapsedMs": elapsed_ms,
    }
    # Extra fields are applied last, so they win over the standard keys.
    record.update(extra)

    line = json.dumps(record) + "\n"
    with open(destination, "a") as sink:
        sink.write(line)
def emit_error(msg: str):
    """Write *msg* to stderr as a single ``ERROR:``-prefixed line."""
    sys.stderr.write(f"ERROR: {msg}\n")
def get_discord_token_for_caller(caller: str) -> str | None:
"""Load caller bot token from instance config."""
cfg = Path(f"/home/papa/atomizer/instances/{caller}/openclaw.json")
if not cfg.exists():
return None
try:
data = json.loads(cfg.read_text())
return data.get("channels", {}).get("discord", {}).get("token")
except Exception:
return None
def fetch_channel_context(channel: str, messages: int, token: str) -> str | None:
    """Run the ``fetch-channel-context.sh`` helper and return its output.

    The helper is invoked with the channel, ``--messages <N>`` and
    ``--token <token>`` under a 30-second timeout. Returns the stripped
    stdout on success. Returns None when the helper script is missing, exits
    non-zero, or cannot be run; the latter two cases are reported on stderr.
    """
    helper = ORCHESTRATE_DIR / "fetch-channel-context.sh"
    if not helper.exists():
        return None
    try:
        command = [str(helper), channel, "--messages", str(messages), "--token", token]
        proc = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30,
            check=False,
        )
    except Exception as e:
        emit_error(f"Channel context fetch error: {e}")
        return None
    if proc.returncode != 0:
        emit_error(f"Channel context fetch failed: {proc.stderr.strip()}")
        return None
    return proc.stdout.strip()
# ── Main ─────────────────────────────────────────────────────────────────────
def main():
    """CLI entry point: delegate one task to an agent and print the outcome.

    Flow: parse arguments, enforce the delegation ACL, resolve and
    health-check the agent gateway, gather optional context, then run up to
    ``--retries`` attempts (clamped to 1..3) of send, wait and validate.

    The final result is printed as JSON (or just its ``result`` field with
    ``--format text``). The process exits 0 when the final status is
    ``complete`` or ``partial`` (or the task was sent with ``--no-wait``),
    and 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Atomizer Orchestration Engine")
    parser.add_argument("agent", help="Target agent name")
    parser.add_argument("task", help="Task to delegate")
    parser.add_argument("--wait", action="store_true", default=True,
                        help="Block until the agent completes (default)")
    # --wait is store_true with default True, so on its own it can never be
    # turned off and the fire-and-forget branch below would be unreachable.
    parser.add_argument("--no-wait", action="store_false", dest="wait",
                        help="Fire and forget: return once the task is sent")
    parser.add_argument("--timeout", type=int, default=300,
                        help="Timeout per attempt in seconds (default: 300)")
    parser.add_argument("--format", choices=["json", "text"], default="json")
    parser.add_argument("--context", type=str, default=None,
                        help="Path to context file")
    parser.add_argument("--no-deliver", action="store_true")
    parser.add_argument("--run-id", type=str, default=None)
    parser.add_argument("--retries", type=int, default=1,
                        help="Max attempts (default: 1, max: 3)")
    parser.add_argument("--validate", action="store_true", default=True,
                        help="Strict validation of handoff fields (default: True since v1.1)")
    parser.add_argument("--no-validate", action="store_false", dest="validate",
                        help="Disable strict validation")
    parser.add_argument("--workflow-id", type=str, default=None,
                        help="Workflow run ID for tracing")
    parser.add_argument("--step-id", type=str, default=None,
                        help="Workflow step ID for tracing")
    parser.add_argument("--caller", type=str, default=None,
                        help="Calling agent for ACL enforcement")
    parser.add_argument("--channel-context", type=str, default=None,
                        help="Discord channel name or ID to include as context")
    parser.add_argument("--channel-messages", type=int, default=20,
                        help="Number of channel messages to fetch (default: 20, max: 30)")
    args = parser.parse_args()
    # Clamp retries
    max_retries = min(max(args.retries, 1), 3)
    # Generate run ID
    run_id = args.run_id or f"orch-{int(time.time())}-{uuid.uuid4().hex[:8]}"
    # Task text can be augmented (e.g., channel context prepend)
    delegated_task = args.task
    # ACL check
    if not check_acl(args.caller, args.agent):
        result = {
            "status": "error",
            "result": None,
            "notes": f"ACL denied: '{args.caller}' cannot delegate to '{args.agent}'",
            "agent": args.agent,
            "runId": run_id,
        }
        print(json.dumps(result, indent=2))
        log_event(run_id, args.agent, "acl_denied", f"caller={args.caller}")
        sys.exit(1)
    # Resolve agent port
    port = get_agent_port(args.agent)
    # Health check
    if not check_health(args.agent, port):
        result = {
            "status": "error",
            "result": None,
            "notes": f"Agent '{args.agent}' unreachable at port {port}",
            "agent": args.agent,
            "runId": run_id,
        }
        print(json.dumps(result, indent=2))
        log_event(run_id, args.agent, "health_failed", f"port={port}")
        sys.exit(1)
    # Load context (a missing file is reported but does not abort the run)
    context = None
    if args.context:
        ctx_path = Path(args.context)
        if ctx_path.exists():
            context = ctx_path.read_text()
        else:
            emit_error(f"Context file not found: {args.context}")
    # Optional channel context
    if args.channel_context:
        if not args.caller:
            emit_error("--channel-context requires --caller so bot token can be resolved")
            sys.exit(1)
        token = get_discord_token_for_caller(args.caller)
        if not token:
            emit_error(f"Could not resolve Discord bot token for caller '{args.caller}'")
            sys.exit(1)
        channel_messages = min(max(args.channel_messages, 1), 30)
        ch_ctx = fetch_channel_context(args.channel_context, channel_messages, token)
        if not ch_ctx:
            emit_error(f"Failed to fetch channel context for '{args.channel_context}'")
            sys.exit(1)
        delegated_task = f"{ch_ctx}\n\n{delegated_task}"
    # ── Retry loop ───────────────────────────────────────────────────────
    result = None
    prev_error = None
    # Start of the whole run, so latencyMs covers every attempt and the
    # pauses between them rather than only the final attempt.
    run_start = time.time()
    for attempt in range(1, max_retries + 1):
        attempt_start = time.time()
        log_event(run_id, args.agent, "attempt_start", delegated_task[:200],
                  attempt=attempt)
        # Idempotency check: if handoff file exists from a previous attempt, use it
        handoff_path = HANDOFF_DIR / f"{run_id}.json"
        if attempt > 1 and handoff_path.exists():
            result = read_handoff(handoff_path)
            if result and result.get("status") in ("complete", "partial"):
                log_event(run_id, args.agent, "late_arrival",
                          "Handoff file arrived between retries",
                          attempt=attempt)
                break
            # Previous result was bad, clear it for retry
            clear_handoff(run_id)
        # Send task
        sent = send_task(args.agent, port, delegated_task, run_id,
                         attempt=attempt, prev_error=prev_error,
                         context=context, no_deliver=args.no_deliver)
        if not sent:
            prev_error = "Failed to send task"
            log_event(run_id, args.agent, "send_failed", prev_error,
                      attempt=attempt)
            if attempt < max_retries:
                time.sleep(5)  # Brief pause before retry
                continue
            result = {
                "status": "error",
                "result": None,
                "notes": f"Failed to send task after {attempt} attempts",
            }
            break
        # Wait for result
        if args.wait:
            result = wait_for_handoff(run_id, args.timeout)
            elapsed = time.time() - attempt_start
            # Validate
            if result is not None:
                valid, error_msg = validate_handoff(result, strict=args.validate)
                if not valid:
                    log_event(run_id, args.agent, "validation_failed",
                              error_msg, attempt=attempt,
                              elapsed_ms=int(elapsed * 1000))
                    do_retry, reason = should_retry(result, attempt, max_retries)
                    if do_retry:
                        prev_error = reason
                        clear_handoff(run_id)
                        time.sleep(3)
                        continue
                    # No retry — return what we have
                    break
                else:
                    # Valid result
                    log_event(run_id, args.agent, "complete",
                              result.get("status", ""),
                              attempt=attempt,
                              elapsed_ms=int(elapsed * 1000),
                              confidence=result.get("confidence"))
                    break
            else:
                # Timeout
                log_event(run_id, args.agent, "timeout", "",
                          attempt=attempt,
                          elapsed_ms=int(elapsed * 1000))
                do_retry, reason = should_retry(result, attempt, max_retries)
                if do_retry:
                    prev_error = "timeout"
                    continue
                result = {
                    "status": "timeout",
                    "result": None,
                    "notes": f"Agent did not respond within {args.timeout}s "
                             f"(attempt {attempt}/{max_retries})",
                }
                break
        else:
            # Fire and forget
            print(json.dumps({"status": "sent", "runId": run_id, "agent": args.agent}))
            sys.exit(0)
    # ── Output ───────────────────────────────────────────────────────────
    if result is None:
        result = {
            "status": "error",
            "result": None,
            "notes": "No result after all attempts",
        }
    # Add metadata
    total_elapsed = time.time() - run_start
    result["runId"] = run_id
    result["agent"] = args.agent
    result["latencyMs"] = int(total_elapsed * 1000)
    if args.workflow_id:
        result["workflowRunId"] = args.workflow_id
    if args.step_id:
        result["stepId"] = args.step_id
    if args.format == "json":
        print(json.dumps(result, indent=2))
    else:
        print(result.get("result", ""))
    status = result.get("status", "error")
    sys.exit(0 if status in ("complete", "partial") else 1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()