# Commit note (residue from a pasted commit message, kept for history):
# - Add validation framework (config, model, results, study validators)
# - Add Claude Code skills (create-study, run-optimization, generate-report,
#   troubleshoot, analyze-model)
# - Add Atomizer Dashboard (React frontend + FastAPI backend)
# - Reorganize docs into structured directories (00-09)
# - Add neural surrogate modules and training infrastructure
# - Add multi-objective optimization support
"""
|
|
Space-Filling Training Data Generator
|
|
|
|
This script generates FEA training points that cover the ENTIRE design space
|
|
uniformly, unlike optimization which focuses only on promising regions.
|
|
|
|
Sampling Methods:
|
|
1. Latin Hypercube Sampling (LHS) - Good coverage, no clustering
|
|
2. Sobol Sequence - Quasi-random, very uniform
|
|
3. Grid Sampling - Regular grid, exhaustive but slow
|
|
|
|
Usage:
|
|
python generate_training_data.py --study uav_arm_optimization --method lhs --points 100
|
|
"""
|
|
import sys
|
|
from pathlib import Path
|
|
import json
|
|
import argparse
|
|
import numpy as np
|
|
from scipy.stats import qmc # For Latin Hypercube and Sobol
|
|
|
|
project_root = Path(__file__).parent
|
|
sys.path.insert(0, str(project_root))
|
|
|
|
|
|
def load_config_bounds(study_path: Path) -> tuple:
    """Load design variable bounds from optimization_config.json.

    Supports two config formats:
    1. {"parameter": "name", "bounds": [min, max]} - Current format
    2. {"name": "name", "min_value": min, "max_value": max} - Legacy format

    Args:
        study_path: Root directory of the study (contains "1_setup/").

    Returns:
        (bounds, config) where bounds maps variable name ->
        {'min', 'max', 'type'} ('int' or 'float'), and config is the
        full parsed JSON document.

    Raises:
        FileNotFoundError: if the config file does not exist.
        ValueError: if a design variable entry has no usable name.
    """
    config_path = study_path / "1_setup" / "optimization_config.json"

    if not config_path.exists():
        raise FileNotFoundError(f"Config not found: {config_path}")

    with open(config_path) as f:
        config = json.load(f)

    bounds = {}
    for var in config.get('design_variables', []):
        # Support both 'parameter' and 'name' keys
        name = var.get('parameter') or var.get('name')
        if not name:
            # Fail loudly instead of silently creating a None-keyed entry.
            raise ValueError(f"Design variable entry has no 'parameter' or 'name': {var}")

        # Support both "bounds": [min, max] and "min_value"/"max_value" formats
        if 'bounds' in var:
            min_val, max_val = var['bounds']
        else:
            min_val = var.get('min_value', var.get('min', 0))
            max_val = var.get('max_value', var.get('max', 1))

        # Detect integer type based on explicit type, a "count"-style name,
        # or a small all-integer range (heuristic: span < 20).
        is_int = (var.get('type') == 'integer' or
                  'count' in name.lower() or
                  (isinstance(min_val, int) and isinstance(max_val, int) and max_val - min_val < 20))

        bounds[name] = {
            'min': min_val,
            'max': max_val,
            'type': 'int' if is_int else 'float'
        }

    return bounds, config
|
|
|
|
|
def generate_lhs_samples(bounds: dict, n_samples: int, seed: int = 42) -> list:
    """
    Generate Latin Hypercube Samples across the full design space.

    LHS ensures:
    - Each dimension is divided into n equal intervals
    - Exactly one sample in each interval per dimension
    - Much better coverage than random sampling
    """
    names = list(bounds)

    # Draw n_samples points in the unit hypercube, then rescale per dimension.
    sampler = qmc.LatinHypercube(d=len(names), seed=seed)
    unit_points = sampler.random(n=n_samples)

    samples = []
    for row in unit_points:
        point = {}
        for name, u in zip(names, row):
            b = bounds[name]
            value = b['min'] + u * (b['max'] - b['min'])
            # Integer variables are rounded to the nearest whole value.
            point[name] = int(round(value)) if b['type'] == 'int' else value
        samples.append(point)

    return samples
|
|
|
|
|
|
def generate_sobol_samples(bounds: dict, n_samples: int, seed: int = 42) -> list:
    """
    Generate Sobol sequence samples (quasi-random, very uniform).

    Sobol sequences are deterministic and provide excellent uniformity.
    """
    names = list(bounds)
    unit_points = qmc.Sobol(d=len(names), scramble=True, seed=seed).random(n=n_samples)

    def _rescale(name, u):
        # Map a unit-interval coordinate onto the variable's [min, max] range,
        # rounding integer variables to the nearest whole value.
        b = bounds[name]
        value = b['min'] + u * (b['max'] - b['min'])
        return int(round(value)) if b['type'] == 'int' else value

    return [
        {name: _rescale(name, row[j]) for j, name in enumerate(names)}
        for row in unit_points
    ]
|
|
|
|
|
|
def generate_grid_samples(bounds: dict, points_per_dim: int = 5) -> list:
    """
    Generate regular grid samples.

    Warning: Scales exponentially with dimensions!
    4 dims x 5 points = 625 samples
    4 dims x 10 points = 10,000 samples

    Args:
        bounds: Mapping of variable name -> {'min', 'max', 'type'}.
        points_per_dim: Number of grid points along each dimension.

    Returns:
        List of sample dicts (variable name -> value). Integer dimensions
        may contribute fewer than points_per_dim distinct values after
        rounding, so the total count can be below points_per_dim**n_dims.
    """
    var_names = list(bounds.keys())
    if not var_names:
        # Guard: an empty design space yields no samples (the meshgrid
        # path below would raise IndexError on flat[0]).
        return []

    # Create linspace for each dimension
    grids = []
    for name in var_names:
        b = bounds[name]
        values = np.linspace(b['min'], b['max'], points_per_dim)
        if b['type'] == 'int':
            # Round to actual integers and drop duplicates introduced
            # by rounding (e.g. a [1, 3] range with 5 points).
            values = np.unique(np.round(values).astype(int))
        grids.append(values)

    # Create meshgrid and flatten into per-dimension coordinate arrays
    mesh = np.meshgrid(*grids, indexing='ij')
    flat = [m.flatten() for m in mesh]

    samples = []
    for i in range(len(flat[0])):
        point = {}
        for j, name in enumerate(var_names):
            value = flat[j][i]
            # Convert numpy scalars to native Python types for JSON output.
            if bounds[name]['type'] == 'int':
                value = int(value)
            else:
                value = float(value)
            point[name] = value
        samples.append(point)

    return samples
|
|
|
|
|
|
def generate_corner_samples(bounds: dict) -> list:
    """
    Generate samples at all corners of the design space.

    This ensures the NN sees the extreme combinations.
    For 4 dimensions: 2^4 = 16 corner points
    """
    names = list(bounds)

    samples = []
    for mask in range(2 ** len(names)):
        corner = {}
        for bit, name in enumerate(names):
            b = bounds[name]
            # Bit `bit` of the mask selects the max bound; otherwise the min.
            value = b['max'] if (mask >> bit) & 1 else b['min']
            if b['type'] == 'int':
                value = int(value)
            corner[name] = value
        samples.append(corner)

    return samples
|
|
|
|
|
|
def save_training_points(samples: list, output_path: Path):
    """Write the training points to *output_path* as pretty-printed JSON."""
    payload = {
        'n_samples': len(samples),
        'samples': samples,
    }
    with open(output_path, 'w') as f:
        json.dump(payload, f, indent=2)
    print(f"Saved {len(samples)} training points to: {output_path}")
|
|
|
|
|
|
def visualize_coverage(samples: list, bounds: dict, save_path: Path):
    """Visualize how well samples cover the design space.

    Renders a lower-triangular matrix of pairwise scatter plots (one per
    variable pair), with axis limits pinned to each variable's bounds so
    gaps in coverage are visible, and saves the figure to *save_path*.

    NOTE(review): assumes at least 2 design variables; with fewer,
    plt.subplots(n_vars-1, n_vars-1) would fail — confirm callers.
    """
    # Imported lazily so the rest of the script works without matplotlib;
    # 'Agg' backend allows saving figures on headless machines.
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    var_names = list(bounds.keys())
    n_vars = len(var_names)

    # Create pairwise scatter plots
    fig, axes = plt.subplots(n_vars-1, n_vars-1, figsize=(12, 12))

    for i in range(n_vars - 1):
        for j in range(i + 1, n_vars):
            # With exactly 2 variables, subplots() returns a single Axes
            # rather than a 2-D array, hence the special case.
            ax = axes[j-1, i] if n_vars > 2 else axes

            x = [s[var_names[i]] for s in samples]
            y = [s[var_names[j]] for s in samples]

            ax.scatter(x, y, alpha=0.5, s=20)
            # Break long names across lines to keep labels readable.
            ax.set_xlabel(var_names[i].replace('_', '\n'), fontsize=8)
            ax.set_ylabel(var_names[j].replace('_', '\n'), fontsize=8)

            # Show bounds: pin axis limits to the full design range so
            # uncovered regions are visually apparent.
            b_i = bounds[var_names[i]]
            b_j = bounds[var_names[j]]
            ax.set_xlim(b_i['min'], b_i['max'])
            ax.set_ylim(b_j['min'], b_j['max'])
            ax.grid(True, alpha=0.3)

    # Hide unused subplots (the upper triangle of the grid is never drawn).
    for i in range(n_vars - 1):
        for j in range(i):
            if n_vars > 2:
                axes[i, j].set_visible(False)

    plt.suptitle(f'Design Space Coverage ({len(samples)} samples)', fontsize=14)
    plt.tight_layout()
    plt.savefig(save_path, dpi=150)
    plt.close()
    print(f"Saved coverage plot: {save_path}")
|
|
|
|
|
|
def main():
    """CLI entry point: load a study's bounds, generate space-filling
    samples with the chosen method, save them, and optionally plot coverage.

    Exits with status 1 if the requested study directory does not exist.
    """
    parser = argparse.ArgumentParser(description='Generate space-filling training data')
    parser.add_argument('--study', required=True, help='Study name (e.g., uav_arm_optimization)')
    parser.add_argument('--method', default='lhs', choices=['lhs', 'sobol', 'grid', 'corners', 'combined'],
                        help='Sampling method')
    parser.add_argument('--points', type=int, default=100, help='Number of samples (for lhs/sobol)')
    parser.add_argument('--grid-points', type=int, default=5, help='Points per dimension (for grid)')
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    parser.add_argument('--visualize', action='store_true', help='Generate coverage plot')
    args = parser.parse_args()

    study_path = project_root / "studies" / args.study
    if not study_path.exists():
        print(f"ERROR: Study not found: {study_path}")
        # Fix: exit non-zero so shell scripts / CI notice the failure
        # (previously returned normally, exiting with status 0).
        sys.exit(1)

    print("="*70)
    print("Space-Filling Training Data Generator")
    print("="*70)

    # Load bounds from config
    print(f"\nLoading config from: {study_path}")
    bounds, config = load_config_bounds(study_path)

    print(f"\nDesign Variable Bounds:")
    for name, b in bounds.items():
        print(f"  {name}: [{b['min']}, {b['max']}] ({b['type']})")

    # Generate samples
    print(f"\nGenerating samples using method: {args.method}")

    if args.method == 'lhs':
        samples = generate_lhs_samples(bounds, args.points, args.seed)
    elif args.method == 'sobol':
        samples = generate_sobol_samples(bounds, args.points, args.seed)
    elif args.method == 'grid':
        samples = generate_grid_samples(bounds, args.grid_points)
    elif args.method == 'corners':
        samples = generate_corner_samples(bounds)
    elif args.method == 'combined':
        # Combine corners + LHS for best coverage
        corner_samples = generate_corner_samples(bounds)
        # Fix: clamp to zero so a --points value smaller than the corner
        # count does not request a negative number of LHS samples.
        n_lhs = max(0, args.points - len(corner_samples))
        lhs_samples = generate_lhs_samples(bounds, n_lhs, args.seed)
        samples = corner_samples + lhs_samples
        print(f"  Combined: {len(corner_samples)} corners + {len(lhs_samples)} LHS")

    print(f"  Generated {len(samples)} samples")

    # Show sample range coverage
    print(f"\nSample Coverage:")
    for name in bounds.keys():
        values = [s[name] for s in samples]
        print(f"  {name}: [{min(values):.2f}, {max(values):.2f}]")

    # Save samples
    output_path = study_path / "1_setup" / "training_points.json"
    save_training_points(samples, output_path)

    # Visualize if requested
    if args.visualize:
        plot_path = study_path / "1_setup" / "training_coverage.png"
        visualize_coverage(samples, bounds, plot_path)

    print("\n" + "="*70)
    print("NEXT STEPS")
    print("="*70)
    print(f"1. Run FEA on all {len(samples)} training points:")
    print(f"   python run_training_fea.py --study {args.study}")
    print(f"2. This will create comprehensive training data")
    print(f"3. Then retrain NN on this uniform data")
|
|
|
|
|
|
# Script entry point: run the CLI only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
|