# Atomizer/optimization_engine/study/state.py

"""
Study State Detector for Atomizer

This module provides utilities to detect and summarize the state of an optimization study.
Used by Claude sessions to quickly understand study context on initialization.
"""

import json
import sqlite3
from pathlib import Path
from typing import Dict, Any, Optional, List
from datetime import datetime


def detect_study_state(study_dir: Path) -> Dict[str, Any]:
    """
    Detect the current state of an optimization study.

    The study directory is inspected for ``optimization_config.json`` (at the
    top level or under ``1_setup``), a ``2_results`` directory, the FEA and NN
    trial databases (``study.db`` / ``nn_study.db``), ``turbo_report.json``
    and ``surrogate.pt``. Problems reading any of these are recorded in
    ``state["warnings"]`` instead of being raised.

    Args:
        study_dir: Path to the study directory (anything ``Path()`` accepts)

    Returns:
        Dictionary with study state information. ``is_study`` is False (and
        the remaining fields keep their defaults) when no config file is
        found. ``turbo_summary`` is only present when the turbo report could
        be parsed.
    """
    study_dir = Path(study_dir)
    state: Dict[str, Any] = {
        "is_study": False,
        "study_name": study_dir.name,
        "status": "unknown",
        "config": None,
        "fea_trials": 0,
        "nn_trials": 0,
        "pareto_solutions": 0,
        "best_trial": None,
        "last_activity": None,
        "has_turbo_report": False,
        "has_surrogate": False,
        "warnings": [],
        "next_actions": []
    }

    # Check if this is a valid study directory
    config_path = study_dir / "optimization_config.json"
    if not config_path.exists():
        # Try 1_setup subdirectory
        config_path = study_dir / "1_setup" / "optimization_config.json"

    if not config_path.exists():
        state["warnings"].append("No optimization_config.json found")
        return state

    state["is_study"] = True

    # Load config. JSON is read as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    try:
        with open(config_path, 'r', encoding="utf-8") as f:
            config = json.load(f)
        state["config"] = _summarize_config(config)
    except (OSError, ValueError, TypeError, AttributeError) as e:
        # ValueError covers JSON/Unicode decode errors; TypeError and
        # AttributeError cover a config whose shape is not what the
        # summarizer expects (e.g. a top-level list).
        state["warnings"].append(f"Failed to parse config: {e}")

    # Check results directory
    results_dir = study_dir / "2_results"
    if not results_dir.exists():
        state["status"] = "not_started"
        state["next_actions"].append("Run: python run_optimization.py --discover")
        return state

    # Check study.db for FEA trials
    db_path = results_dir / "study.db"
    if db_path.exists():
        fea_stats = _query_study_db(db_path)
        state.update(fea_stats)
        if fea_stats.get("db_error"):
            # Surface the failure; otherwise zero counts look like "no trials".
            state["warnings"].append(
                f"Failed to read study.db: {fea_stats['db_error']}"
            )

    # Check nn_study.db for NN trials
    nn_db_path = results_dir / "nn_study.db"
    if nn_db_path.exists():
        nn_stats = _query_study_db(nn_db_path, prefix="nn_")
        state["nn_trials"] = nn_stats.get("nn_fea_trials", 0)
        if nn_stats.get("db_error"):
            state["warnings"].append(
                f"Failed to read nn_study.db: {nn_stats['db_error']}"
            )

    # Check for turbo report
    turbo_report_path = results_dir / "turbo_report.json"
    if turbo_report_path.exists():
        state["has_turbo_report"] = True
        try:
            with open(turbo_report_path, 'r', encoding="utf-8") as f:
                turbo = json.load(f)
            state["turbo_summary"] = {
                "mode": turbo.get("mode"),
                "nn_trials": turbo.get("total_nn_trials", 0),
                "fea_validations": turbo.get("fea_validations", 0),
                # `or 0` keeps a null time from discarding the whole summary.
                "time_minutes": round(turbo.get("time_minutes") or 0, 1)
            }
        except (OSError, ValueError, TypeError, AttributeError) as e:
            # The report exists but is unusable: keep has_turbo_report True,
            # omit the summary, and tell the user why.
            state["warnings"].append(f"Failed to parse turbo_report.json: {e}")

    # Check for trained surrogate
    surrogate_path = results_dir / "surrogate.pt"
    state["has_surrogate"] = surrogate_path.exists()

    # Determine overall status
    state["status"] = _determine_status(state)

    # Suggest next actions
    state["next_actions"] = _suggest_next_actions(state)

    return state


def _summarize_config(config: Dict) -> Dict[str, Any]:
    """Extract key information from config.

    Args:
        config: Parsed optimization config. Variables are read from
            ``design_variables`` (falling back to ``variables``), objectives
            from ``objectives`` and constraints from ``constraints``. Sections
            that are missing or ``null`` are treated as empty.

    Returns:
        Dictionary with the variable/objective/constraint counts, the first
        five variable names, every objective formatted as
        ``"name (direction)"`` and the study type (``"multi_objective"`` when
        there is more than one objective, else ``"single_objective"``).
    """
    # Handle different config formats. A key that is present but null falls
    # through to the alternative / empty list instead of crashing on iteration.
    variables = config.get("design_variables")
    if variables is None:
        variables = config.get("variables") or []
    objectives = config.get("objectives") or []
    constraints = config.get("constraints") or []

    # Get variable names (handle different key names). Every alternative is
    # chained with `or` so a null/empty value still ends up as "unknown"
    # rather than None, which would break string joins downstream.
    var_names = [
        v.get("parameter") or v.get("name") or v.get("expression_name") or "unknown"
        for v in variables
    ]

    # Get objective names, each labelled with its optimization direction
    obj_names = []
    for o in objectives:
        name = o.get("name") or o.get("metric") or "unknown"
        direction = o.get("goal") or o.get("direction") or "minimize"
        obj_names.append(f"{name} ({direction})")

    return {
        "n_variables": len(variables),
        "n_objectives": len(objectives),
        "n_constraints": len(constraints),
        "variable_names": var_names[:5],  # First 5 only
        "objective_names": obj_names,
        "study_type": "multi_objective" if len(objectives) > 1 else "single_objective"
    }


def _query_study_db(db_path: Path, prefix: str = "") -> Dict[str, Any]:
    """Query Optuna study database for statistics.

    Args:
        db_path: Path to the SQLite database file.
        prefix: String prepended to the count keys (``fea_trials``,
            ``completed_trials``, ``failed_trials``, ``pareto_solutions``).
            ``best_trial`` and ``last_activity`` are never prefixed.

    Returns:
        Dictionary of statistics. Database errors are not raised: the message
        is stored under ``"db_error"`` and whatever was gathered before the
        failure is returned.

    Notes:
        * ``best_trial`` is the smallest recorded value of objective 0; the
          optimization direction is not consulted.
        * ``pareto_solutions`` is a rough estimate: the number of distinct
          trials that have recorded values, capped at 50.
    """
    stats: Dict[str, Any] = {
        f"{prefix}fea_trials": 0,
        f"{prefix}completed_trials": 0,
        f"{prefix}failed_trials": 0,
        f"{prefix}pareto_solutions": 0,
        "best_trial": None,
        "last_activity": None
    }

    conn = None
    try:
        conn = sqlite3.connect(str(db_path))
        cursor = conn.cursor()

        # Count trials by state
        cursor.execute("""
            SELECT state, COUNT(*) FROM trials
            GROUP BY state
        """)
        for trial_state, count in cursor.fetchall():
            if trial_state == "COMPLETE":
                stats[f"{prefix}completed_trials"] = count
                stats[f"{prefix}fea_trials"] = count
            elif trial_state == "FAIL":
                stats[f"{prefix}failed_trials"] = count

        # Get last activity time
        cursor.execute("""
            SELECT MAX(datetime_complete) FROM trials
            WHERE datetime_complete IS NOT NULL
        """)
        result = cursor.fetchone()
        if result and result[0]:
            stats["last_activity"] = result[0]

        # The objective-index column is named `objective` in the Optuna RDB
        # schema; `objective_id` is kept as a fallback for databases that use
        # that name. The column name comes from this fixed pair, never from
        # user input, so interpolating it into the SQL is safe.
        cursor.execute("PRAGMA table_info(trial_values)")
        value_columns = {row[1] for row in cursor.fetchall()}
        objective_col = "objective" if "objective" in value_columns else "objective_id"

        # Get best trial (for single objective). NULL values are excluded
        # because SQLite sorts NULL first in ascending order, which would
        # otherwise report a value-less row as the best trial.
        cursor.execute(f"""
            SELECT trial_id, value FROM trial_values
            WHERE {objective_col} = 0 AND value IS NOT NULL
            ORDER BY value ASC
            LIMIT 1
        """)
        result = cursor.fetchone()
        if result:
            stats["best_trial"] = {"trial_id": result[0], "value": result[1]}

        # Count Pareto solutions (trials with user_attr pareto=True or non-dominated)
        # Simplified: count distinct trials in trial_values
        cursor.execute("""
            SELECT COUNT(DISTINCT trial_id) FROM trial_values
        """)
        result = cursor.fetchone()
        if result:
            # For multi-objective, this is a rough estimate
            stats[f"{prefix}pareto_solutions"] = min(result[0], 50)  # Cap at 50
    except sqlite3.Error as e:
        stats["db_error"] = str(e)
    finally:
        # Always release the handle, including when a query above failed.
        if conn is not None:
            conn.close()

    return stats


def _determine_status(state: Dict) -> str:
    """Map the collected study facts onto a single status label.

    The FEA trial count decides the early phases; once at least ten trials
    exist, a turbo report takes precedence over a trained surrogate, which
    in turn takes precedence over the plain trial-count phases.
    """
    n_fea = state["fea_trials"]

    # Early phases depend on the trial count alone.
    if n_fea == 0:
        return "not_started"
    if n_fea < 3:
        return "discovery"
    if n_fea < 10:
        return "validation"

    # From ten trials on, produced artifacts outrank the raw count.
    if state["has_turbo_report"]:
        return "turbo_complete"
    if state["has_surrogate"]:
        return "training_complete"

    return "fea_complete" if n_fea >= 50 else "in_progress"


def _suggest_next_actions(state: Dict) -> List[str]:
    """Suggest next actions based on study state.

    Args:
        state: Study state dictionary; only ``state["status"]`` is read.

    Returns:
        A new list of suggested commands for the status. Statuses without an
        entry (e.g. ``"unknown"``) yield an empty list.
    """
    actions_by_status = {
        "not_started": [
            "Run: python run_optimization.py --discover",
        ],
        "discovery": [
            "Run: python run_optimization.py --validate",
        ],
        "validation": [
            "Run: python run_optimization.py --test",
            "Or run full: python run_optimization.py --run --trials 50",
        ],
        "in_progress": [
            "Continue: python run_optimization.py --resume",
        ],
        "fea_complete": [
            "Analyze: python -m optimization_engine.method_selector optimization_config.json 2_results/study.db",
            "Or run turbo: python run_nn_optimization.py --turbo",
        ],
        # A surrogate exists but no turbo report yet, so the turbo run is the
        # next step. Without this entry the status produced no suggestion.
        "training_complete": [
            "Run turbo: python run_nn_optimization.py --turbo",
        ],
        "turbo_complete": [
            "View results in dashboard: cd atomizer-dashboard && npm run dev",
            "Generate report: python generate_report.py",
        ],
    }

    # Copy so callers can mutate the result without touching the table.
    return list(actions_by_status.get(state["status"], []))


def format_study_summary(state: Dict) -> str:
    """Render a study state dictionary as a multi-line, markdown-flavoured summary.

    A state whose ``is_study`` flag is falsy produces a single
    "not a valid study" line. Otherwise the output contains a header, an
    optional configuration section, a progress section, and optional
    next-action and warning sections, joined with newlines.
    """
    if not state["is_study"]:
        return f"❌ Not a valid study directory: {state['study_name']}"

    # Header
    out = [f"📊 **Study: {state['study_name']}**"]
    out.append("Status: " + state["status"].replace("_", " ").title())
    out.append("")

    # Configuration (only when a config summary is available)
    cfg = state["config"]
    if cfg:
        n_vars = cfg["n_variables"]
        shown_names = ", ".join(cfg["variable_names"][:3])
        ellipsis = "..." if n_vars > 3 else ""
        objective_list = ", ".join(cfg["objective_names"])
        out.extend([
            "**Configuration:**",
            f"- Variables: {n_vars} ({shown_names}{ellipsis})",
            f"- Objectives: {cfg['n_objectives']} ({objective_list})",
            f"- Constraints: {cfg['n_constraints']}",
            f"- Type: {cfg['study_type']}",
            "",
        ])

    # Progress
    out.append("**Progress:**")
    out.append(f"- FEA trials: {state['fea_trials']}")
    if state["nn_trials"] > 0:
        out.append(f"- NN trials: {state['nn_trials']}")
    if state["has_turbo_report"] and "turbo_summary" in state:
        turbo = state["turbo_summary"]
        out.append(
            f"- Turbo mode: {turbo['nn_trials']} NN + "
            f"{turbo['fea_validations']} FEA validations "
            f"({turbo['time_minutes']} min)"
        )
    if state["last_activity"]:
        out.append(f"- Last activity: {state['last_activity']}")
    out.append("")

    # Suggested next actions
    if state["next_actions"]:
        out.append("**Suggested Next Actions:**")
        out.extend(f"  → {step}" for step in state["next_actions"])

    # Warnings, set off from the preceding section by a blank line
    if state["warnings"]:
        out.append("")
        out.append("**Warnings:**")
        out.extend(f"  ⚠️ {message}" for message in state["warnings"])

    return "\n".join(out)


def get_all_studies(atomizer_root: Path) -> List[Dict[str, Any]]:
    """Get state of all studies in the Atomizer studies directory.

    Args:
        atomizer_root: Atomizer root directory containing a ``studies``
            folder (anything ``Path()`` accepts, including ``str``).

    Returns:
        State dictionaries for every non-hidden subdirectory of ``studies``
        that ``detect_study_state`` recognises as a study, sorted by last
        activity (most recent first; studies without activity come last).
        Empty when ``studies`` is missing or is not a directory.
    """
    # Coerce like detect_study_state does, so a plain string root works too.
    studies_dir = Path(atomizer_root) / "studies"
    # is_dir() rather than exists(): a stray file named "studies" would
    # otherwise make iterdir() raise.
    if not studies_dir.is_dir():
        return []

    studies = []
    # Sorted traversal keeps the order of equally-recent studies deterministic.
    for study_path in sorted(studies_dir.iterdir()):
        if study_path.is_dir() and not study_path.name.startswith("."):
            state = detect_study_state(study_path)
            if state["is_study"]:
                studies.append(state)

    # Sort by last activity (most recent first)
    studies.sort(
        key=lambda s: str(s.get("last_activity") or "1970-01-01"),
        reverse=True
    )

    return studies


if __name__ == "__main__":
    import sys

    # The first command-line argument selects the study directory; with no
    # argument the current working directory is inspected.
    cli_args = sys.argv[1:]
    target_dir = Path(cli_args[0]) if cli_args else Path.cwd()

    print(format_study_summary(detect_study_state(target_dir)))