# Source file: Atomizer/generate_training_data.py
"""
Space-Filling Training Data Generator
This script generates FEA training points that cover the ENTIRE design space
uniformly, unlike optimization which focuses only on promising regions.
Sampling Methods:
1. Latin Hypercube Sampling (LHS) - Good coverage, no clustering
2. Sobol Sequence - Quasi-random, very uniform
3. Grid Sampling - Regular grid, exhaustive but slow
Usage:
python generate_training_data.py --study uav_arm_optimization --method lhs --points 100
"""
import sys
from pathlib import Path
import json
import argparse
import numpy as np
from scipy.stats import qmc # For Latin Hypercube and Sobol
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
def load_config_bounds(study_path: Path) -> tuple:
    """Load design variable bounds from optimization_config.json.

    Supports two config formats:
    1. {"parameter": "name", "bounds": [min, max]}          - Current format
    2. {"name": "name", "min_value": min, "max_value": max} - Legacy format

    Args:
        study_path: Root directory of the study; the config is read from
            <study_path>/1_setup/optimization_config.json.

    Returns:
        Tuple of (bounds, config): bounds maps each variable name to
        {'min': ..., 'max': ..., 'type': 'int' | 'float'}; config is the
        full parsed JSON document.

    Raises:
        FileNotFoundError: If the config file does not exist.
        ValueError: If a design variable entry has no usable name.
    """
    config_path = study_path / "1_setup" / "optimization_config.json"
    if not config_path.exists():
        raise FileNotFoundError(f"Config not found: {config_path}")
    with open(config_path) as f:
        config = json.load(f)

    bounds = {}
    for var in config.get('design_variables', []):
        # Support both 'parameter' and 'name' keys
        name = var.get('parameter') or var.get('name')
        if not name:
            # Fail loudly here instead of crashing later on name.lower()
            raise ValueError(f"Design variable entry missing a name: {var}")
        # Support both "bounds": [min, max] and "min_value"/"max_value" formats
        if 'bounds' in var:
            min_val, max_val = var['bounds']
        else:
            min_val = var.get('min_value', var.get('min', 0))
            max_val = var.get('max_value', var.get('max', 1))
        # Detect integer type based on name or explicit type; the small-range
        # integer-bounds case is a heuristic for count-like variables.
        is_int = (var.get('type') == 'integer' or
                  'count' in name.lower() or
                  (isinstance(min_val, int) and isinstance(max_val, int) and max_val - min_val < 20))
        bounds[name] = {
            'min': min_val,
            'max': max_val,
            'type': 'int' if is_int else 'float'
        }
    return bounds, config
def generate_lhs_samples(bounds: dict, n_samples: int, seed: int = 42) -> list:
"""
Generate Latin Hypercube Samples across the full design space.
LHS ensures:
- Each dimension is divided into n equal intervals
- Exactly one sample in each interval per dimension
- Much better coverage than random sampling
"""
var_names = list(bounds.keys())
n_dims = len(var_names)
# Create LHS sampler
sampler = qmc.LatinHypercube(d=n_dims, seed=seed)
samples_unit = sampler.random(n=n_samples) # Values in [0, 1]
# Scale to actual bounds
samples = []
for i in range(n_samples):
point = {}
for j, name in enumerate(var_names):
b = bounds[name]
value = b['min'] + samples_unit[i, j] * (b['max'] - b['min'])
if b['type'] == 'int':
value = int(round(value))
point[name] = value
samples.append(point)
return samples
def generate_sobol_samples(bounds: dict, n_samples: int, seed: int = 42) -> list:
    """Generate Sobol sequence samples (quasi-random, very uniform).

    Scrambled Sobol points are deterministic for a fixed seed and provide
    excellent uniformity over the unit hypercube before being rescaled to
    the configured variable bounds.

    Args:
        bounds: Mapping of variable name -> {'min', 'max', 'type'}.
        n_samples: Number of points to draw.
        seed: Seed for the scrambling, for reproducibility.

    Returns:
        List of dicts mapping variable name to sampled value; integer-typed
        variables are rounded to the nearest int.
    """
    names = list(bounds.keys())
    engine = qmc.Sobol(d=len(names), scramble=True, seed=seed)
    unit_points = engine.random(n=n_samples)

    def _rescale(name, unit_value):
        # Map a [0, 1] coordinate onto the variable's [min, max] interval.
        spec = bounds[name]
        raw = spec['min'] + unit_value * (spec['max'] - spec['min'])
        return int(round(raw)) if spec['type'] == 'int' else raw

    return [
        {name: _rescale(name, row[k]) for k, name in enumerate(names)}
        for row in unit_points
    ]
def generate_grid_samples(bounds: dict, points_per_dim: int = 5) -> list:
    """Generate a full-factorial regular grid over the design space.

    Warning: the sample count scales exponentially with dimensions!
    4 dims x 5 points = 625 samples; 4 dims x 10 points = 10,000 samples.
    Integer dimensions are snapped to unique integer levels, so they may
    contribute fewer than points_per_dim distinct values.

    Args:
        bounds: Mapping of variable name -> {'min', 'max', 'type'}.
        points_per_dim: Number of levels per dimension (before int dedup).

    Returns:
        List of dicts covering every combination of the per-dimension levels.
    """
    names = list(bounds.keys())

    # Build the level values for each dimension.
    axes = []
    for name in names:
        spec = bounds[name]
        levels = np.linspace(spec['min'], spec['max'], points_per_dim)
        if spec['type'] == 'int':
            # Snap to integers and drop duplicates created by rounding.
            levels = np.unique(np.round(levels).astype(int))
        axes.append(levels)

    # Cartesian product via meshgrid; 'ij' indexing keeps the first
    # dimension varying slowest when flattened.
    columns = [grid.flatten() for grid in np.meshgrid(*axes, indexing='ij')]

    samples = []
    for combo in zip(*columns):
        point = {}
        for name, raw in zip(names, combo):
            # Cast numpy scalars back to native Python types.
            point[name] = int(raw) if bounds[name]['type'] == 'int' else float(raw)
        samples.append(point)
    return samples
def generate_corner_samples(bounds: dict) -> list:
    """Enumerate every vertex of the design-space hyper-rectangle.

    Ensures the NN training set includes all extreme combinations of the
    bounds. Bit j of the corner index selects min (bit clear) or max (bit
    set) for variable j, so n dimensions yield 2**n corner points.

    Args:
        bounds: Mapping of variable name -> {'min', 'max', 'type'}.

    Returns:
        List of 2**n_dims dicts, one per corner of the design space.
    """
    names = list(bounds.keys())
    corners = []
    for mask in range(1 << len(names)):
        point = {}
        for bit, name in enumerate(names):
            spec = bounds[name]
            chosen = spec['max'] if (mask >> bit) & 1 else spec['min']
            point[name] = int(chosen) if spec['type'] == 'int' else chosen
        corners.append(point)
    return corners
def save_training_points(samples: list, output_path: Path):
    """Write the generated sample list to *output_path* as indented JSON.

    The file contains the sample count alongside the samples so downstream
    FEA runners can sanity-check the payload.
    """
    payload = {
        'n_samples': len(samples),
        'samples': samples
    }
    with open(output_path, 'w') as handle:
        json.dump(payload, handle, indent=2)
    print(f"Saved {len(samples)} training points to: {output_path}")
def visualize_coverage(samples: list, bounds: dict, save_path: Path):
    """Visualize how well samples cover the design space.

    Renders a lower-triangle matrix of pairwise scatter plots (one per
    variable pair), with axes clamped to the configured bounds, and saves
    the figure as a PNG to *save_path*.
    """
    # Lazy import with the 'Agg' backend so the plot renders headless
    # (no display server) and matplotlib stays optional unless --visualize.
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    var_names = list(bounds.keys())
    n_vars = len(var_names)
    # Create pairwise scatter plots
    # NOTE(review): assumes n_vars >= 2 — with one variable subplots(0, 0)
    # produces an empty grid and an empty figure is saved; confirm callers
    # always configure at least two design variables.
    fig, axes = plt.subplots(n_vars-1, n_vars-1, figsize=(12, 12))
    for i in range(n_vars - 1):
        for j in range(i + 1, n_vars):
            # With exactly 2 variables, subplots() returns a single Axes
            # object rather than a 2-D array, hence the ternary.
            ax = axes[j-1, i] if n_vars > 2 else axes
            x = [s[var_names[i]] for s in samples]
            y = [s[var_names[j]] for s in samples]
            ax.scatter(x, y, alpha=0.5, s=20)
            ax.set_xlabel(var_names[i].replace('_', '\n'), fontsize=8)
            ax.set_ylabel(var_names[j].replace('_', '\n'), fontsize=8)
            # Show bounds
            b_i = bounds[var_names[i]]
            b_j = bounds[var_names[j]]
            ax.set_xlim(b_i['min'], b_i['max'])
            ax.set_ylim(b_j['min'], b_j['max'])
            ax.grid(True, alpha=0.3)
    # Hide unused subplots (the upper triangle of the grid)
    for i in range(n_vars - 1):
        for j in range(i):
            if n_vars > 2:
                axes[i, j].set_visible(False)
    plt.suptitle(f'Design Space Coverage ({len(samples)} samples)', fontsize=14)
    plt.tight_layout()
    plt.savefig(save_path, dpi=150)
    plt.close()
    print(f"Saved coverage plot: {save_path}")
def main():
    """CLI entry point: load study bounds, generate samples, save (and plot).

    Parses command-line arguments, reads the study's design-variable bounds,
    generates space-filling samples with the chosen method, writes them to
    1_setup/training_points.json, and optionally saves a coverage plot.
    """
    parser = argparse.ArgumentParser(description='Generate space-filling training data')
    parser.add_argument('--study', required=True, help='Study name (e.g., uav_arm_optimization)')
    parser.add_argument('--method', default='lhs', choices=['lhs', 'sobol', 'grid', 'corners', 'combined'],
                        help='Sampling method')
    parser.add_argument('--points', type=int, default=100, help='Number of samples (for lhs/sobol)')
    parser.add_argument('--grid-points', type=int, default=5, help='Points per dimension (for grid)')
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    parser.add_argument('--visualize', action='store_true', help='Generate coverage plot')
    args = parser.parse_args()

    study_path = project_root / "studies" / args.study
    if not study_path.exists():
        print(f"ERROR: Study not found: {study_path}")
        return

    print("="*70)
    print("Space-Filling Training Data Generator")
    print("="*70)

    # Load bounds from config
    print(f"\nLoading config from: {study_path}")
    bounds, config = load_config_bounds(study_path)
    print(f"\nDesign Variable Bounds:")
    for name, b in bounds.items():
        print(f" {name}: [{b['min']}, {b['max']}] ({b['type']})")

    # Generate samples
    print(f"\nGenerating samples using method: {args.method}")
    if args.method == 'lhs':
        samples = generate_lhs_samples(bounds, args.points, args.seed)
    elif args.method == 'sobol':
        samples = generate_sobol_samples(bounds, args.points, args.seed)
    elif args.method == 'grid':
        samples = generate_grid_samples(bounds, args.grid_points)
    elif args.method == 'corners':
        samples = generate_corner_samples(bounds)
    elif args.method == 'combined':
        # Combine corners + LHS for best coverage. Clamp the LHS count at
        # zero so that requesting fewer points than there are corners
        # (2**n_dims) cannot pass a negative sample count to the sampler.
        corner_samples = generate_corner_samples(bounds)
        n_lhs = max(0, args.points - len(corner_samples))
        lhs_samples = generate_lhs_samples(bounds, n_lhs, args.seed)
        samples = corner_samples + lhs_samples
        print(f" Combined: {len(corner_samples)} corners + {len(lhs_samples)} LHS")
    print(f" Generated {len(samples)} samples")

    # Show sample range coverage
    print(f"\nSample Coverage:")
    for name in bounds.keys():
        values = [s[name] for s in samples]
        print(f" {name}: [{min(values):.2f}, {max(values):.2f}]")

    # Save samples
    output_path = study_path / "1_setup" / "training_points.json"
    save_training_points(samples, output_path)

    # Visualize if requested
    if args.visualize:
        plot_path = study_path / "1_setup" / "training_coverage.png"
        visualize_coverage(samples, bounds, plot_path)

    print(f"\n" + "="*70)
    print("NEXT STEPS")
    print("="*70)
    print(f"1. Run FEA on all {len(samples)} training points:")
    print(f" python run_training_fea.py --study {args.study}")
    print(f"2. This will create comprehensive training data")
    print(f"3. Then retrain NN on this uniform data")


if __name__ == '__main__':
    main()