feat: Add panel management, validation, and error handling to canvas

Phase 1 - Panel Management System:
- Create usePanelStore.ts for centralized panel state management
- Add PanelContainer.tsx for draggable floating panels
- Create FloatingIntrospectionPanel.tsx (persistent, doesn't disappear on node click)
- Create ResultsPanel.tsx for trial result details
- Refactor NodeConfigPanelV2 to use panel store for introspection
- Integrate PanelContainer into CanvasView

Phase 2 - Pre-run Validation:
- Create specValidator.ts with comprehensive validation rules
- Add ValidationPanel (enhanced version with error navigation)
- Add Validate button to SpecRenderer with status indicator
- Block run if validation fails
- Check for: design vars, objectives, extractors, bounds, connections

Phase 3 - Error Handling & Recovery:
- Create ErrorPanel.tsx for displaying optimization errors
- Add error classification (nx_crash, solver_fail, extractor_error, etc.)
- Add recovery suggestions based on error type
- Update status endpoint to return error info
- Add _get_study_error_info helper to check error_status.json and DB
- Integrate error detection into status polling

Documentation:
- Add CANVAS_ROBUSTNESS_PLAN.md with full implementation plan
2026-01-21 21:35:31 -05:00
parent e1c59a51c1
commit c224b16ac3
12 changed files with 2853 additions and 29 deletions
--- a/atomizer-dashboard/backend/api/routes/optimization.py
+++ b/atomizer-dashboard/backend/api/routes/optimization.py
@@ -15,6 +15,7 @@ import shutil
 import subprocess
 import psutil
 import signal
+import time
 from datetime import datetime

 # Add project root to path
@@ -155,6 +156,93 @@ def get_accurate_study_status(
    return "paused"


+def _get_study_error_info(study_dir: Path, results_dir: Path) -> dict:
+    """Collect best-effort error information for a study.
+
+    Sources are tried in order; the first one that yields an "error" wins:
+    1. error_status.json in results_dir (keys: error, details, timestamp, trial)
+    2. Most recent trial with state 'FAIL' in results_dir / study.db
+    3. Last error-like line in the final 50 lines of optimization.log
+
+    Args:
+        study_dir: Study directory; not read here, kept for the call signature.
+        results_dir: Directory that holds the three sources listed above.
+
+    Returns:
+        dict with optional keys: error, error_details, error_timestamp,
+        current_trial, status_override. Empty if no error was found.
+    """
+    error_info = {}
+
+    # 1. error_status.json (written by optimization process)
+    error_file = results_dir / "error_status.json"
+    if error_file.exists():
+        try:
+            with open(error_file) as f:
+                error_data = json.load(f)
+            error_info["error"] = error_data.get("error", "Unknown error")
+            error_info["error_details"] = error_data.get("details")
+            error_info["error_timestamp"] = error_data.get("timestamp")
+            error_info["current_trial"] = error_data.get("trial")
+
+            # Only a recent numeric timestamp (< 5 minutes old) sets status to failed
+            timestamp = error_info["error_timestamp"]
+            if isinstance(timestamp, (int, float)) and timestamp:
+                if time.time() - timestamp < 300:  # 5 minutes
+                    error_info["status_override"] = "failed"
+        except (OSError, ValueError, AttributeError):
+            pass  # unreadable, malformed, or non-object JSON: try next source
+
+    # 2. Failed trials in database (Optuna uses 'FAIL' not 'FAILED')
+    study_db = results_dir / "study.db"
+    if study_db.exists() and "error" not in error_info:
+        failed = None
+        try:
+            conn = sqlite3.connect(str(study_db), timeout=2.0)
+            try:
+                failed = conn.execute("""
+                    SELECT number, datetime_complete 
+                    FROM trials 
+                    WHERE state = 'FAIL'
+                    ORDER BY datetime_complete DESC
+                    LIMIT 1
+                """).fetchone()
+            finally:
+                conn.close()  # released even if the query raised
+        except sqlite3.Error:
+            pass  # locked DB, missing table, corrupt file: try next source
+
+        if failed:
+            trial_number, fail_time = failed
+            error_info["error"] = f"Trial {trial_number} failed"
+            error_info["current_trial"] = trial_number
+            # Parse datetime to timestamp if available; fall back to "now"
+            if fail_time:
+                try:
+                    dt = datetime.fromisoformat(fail_time)
+                    error_info["error_timestamp"] = dt.timestamp()
+                except (TypeError, ValueError):
+                    error_info["error_timestamp"] = int(time.time())
+
+    # 3. Optimization log
+    log_file = results_dir / "optimization.log"
+    if log_file.exists() and "error" not in error_info:
+        try:
+            # errors="replace": one undecodable byte must not hide the whole log
+            with open(log_file, "r", errors="replace") as f:
+                lines = f.readlines()[-50:]
+            for line in reversed(lines):
+                line_lower = line.lower()
+                if any(k in line_lower for k in ("error", "failed", "exception")):
+                    error_info["error"] = line.strip()[:200]  # Truncate long messages
+                    error_info["error_timestamp"] = int(log_file.stat().st_mtime)
+                    break
+        except OSError:
+            pass  # log vanished or is unreadable: report nothing from it
+
+    return error_info
+
+
 def _load_study_info(study_dir: Path, topic: Optional[str] = None) -> Optional[dict]:
    """Load study info from a study directory. Returns None if not a valid study."""
    # Look for optimization config (check multiple locations)
@@ -394,9 +482,12 @@ async def get_study_status(study_id: str):
            total_trials = config.get("optimization_settings", {}).get("n_trials", 50)
            status = get_accurate_study_status(study_id, trial_count, total_trials, True)

+            # Check for error status
+            error_info = _get_study_error_info(study_dir, results_dir)
+
            return {
                "study_id": study_id,
-                "status": status,
+                "status": error_info.get("status_override") or status,
                "progress": {
                    "current": trial_count,
                    "total": total_trials,
@@ -405,6 +496,10 @@ async def get_study_status(study_id: str):
                "best_trial": best_trial,
                "pruned_trials": pruned_count,
                "config": config,
+                "error": error_info.get("error"),
+                "error_details": error_info.get("error_details"),
+                "error_timestamp": error_info.get("error_timestamp"),
+                "current_trial": error_info.get("current_trial"),
            }

        # Legacy: Read from JSON history
@@ -437,9 +532,12 @@ async def get_study_status(study_id: str):

        status = "completed" if trial_count >= total_trials else "running"

+        # Check for error status
+        error_info = _get_study_error_info(study_dir, results_dir)
+
        return {
            "study_id": study_id,
-            "status": status,
+            "status": error_info.get("status_override") or status,
            "progress": {
                "current": trial_count,
                "total": total_trials,
@@ -448,6 +546,10 @@ async def get_study_status(study_id: str):
            "best_trial": best_trial,
            "pruned_trials": pruned_count,
            "config": config,
+            "error": error_info.get("error"),
+            "error_details": error_info.get("error_details"),
+            "error_timestamp": error_info.get("error_timestamp"),
+            "current_trial": error_info.get("current_trial"),
        }

    except FileNotFoundError: