"""
Space-Filling Training Data Generator

This script generates FEA training points that cover the ENTIRE design space
uniformly, unlike optimization which focuses only on promising regions.

Sampling Methods:
1. Latin Hypercube Sampling (LHS) - Good coverage, no clustering
2. Sobol Sequence - Quasi-random, very uniform
3. Grid Sampling - Regular grid, exhaustive but slow

Usage:
    python generate_training_data.py --study uav_arm_optimization --method lhs --points 100
"""

import sys
from pathlib import Path
import json
import argparse
import numpy as np
from scipy.stats import qmc  # For Latin Hypercube and Sobol

project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))


def load_config_bounds(study_path: Path) -> tuple:
    """Load design variable bounds from optimization_config.json.

    Supports two config formats:
    1. {"parameter": "name", "bounds": [min, max]}           - Current format
    2. {"name": "name", "min_value": min, "max_value": max}  - Legacy format

    Args:
        study_path: Root directory of the study.

    Returns:
        (bounds, config) tuple: ``bounds`` maps each variable name to
        {'min': ..., 'max': ..., 'type': 'int'|'float'}; ``config`` is the
        full parsed JSON config.
        (Note: the original annotation said ``dict`` but the function has
        always returned a 2-tuple, which is how main() consumes it.)

    Raises:
        FileNotFoundError: If the config file does not exist.
        ValueError: If a design variable entry has no usable name.
    """
    config_path = study_path / "1_setup" / "optimization_config.json"
    if not config_path.exists():
        raise FileNotFoundError(f"Config not found: {config_path}")

    with open(config_path) as f:
        config = json.load(f)

    bounds = {}
    for var in config.get('design_variables', []):
        # Support both 'parameter' and 'name' keys
        name = var.get('parameter') or var.get('name')
        if not name:
            # Fail loudly here instead of crashing later on name.lower()
            raise ValueError(f"Design variable missing 'parameter'/'name' key: {var}")

        # Support both "bounds": [min, max] and "min_value"/"max_value" formats
        if 'bounds' in var:
            min_val, max_val = var['bounds']
        else:
            min_val = var.get('min_value', var.get('min', 0))
            max_val = var.get('max_value', var.get('max', 1))

        # Heuristic: treat as integer if explicitly declared, named like a
        # count, or both bounds are ints spanning a small range.
        is_int = (var.get('type') == 'integer' or
                  'count' in name.lower() or
                  (isinstance(min_val, int) and isinstance(max_val, int)
                   and max_val - min_val < 20))

        bounds[name] = {
            'min': min_val,
            'max': max_val,
            'type': 'int' if is_int else 'float'
        }

    return bounds, config


def _scale_unit_samples(bounds: dict, samples_unit: np.ndarray) -> list:
    """Map unit-cube samples (rows in [0, 1]^d) onto the actual bounds.

    Shared by the LHS and Sobol generators. Integer-typed variables are
    rounded to the nearest integer.
    """
    var_names = list(bounds.keys())
    samples = []
    for row in samples_unit:
        point = {}
        for j, name in enumerate(var_names):
            b = bounds[name]
            value = b['min'] + row[j] * (b['max'] - b['min'])
            if b['type'] == 'int':
                value = int(round(value))
            point[name] = value
        samples.append(point)
    return samples


def generate_lhs_samples(bounds: dict, n_samples: int, seed: int = 42) -> list:
    """Generate Latin Hypercube Samples across the full design space.

    LHS ensures:
    - Each dimension is divided into n equal intervals
    - Exactly one sample in each interval per dimension
    - Much better coverage than random sampling
    """
    sampler = qmc.LatinHypercube(d=len(bounds), seed=seed)
    samples_unit = sampler.random(n=n_samples)  # Values in [0, 1]
    return _scale_unit_samples(bounds, samples_unit)


def generate_sobol_samples(bounds: dict, n_samples: int, seed: int = 42) -> list:
    """Generate Sobol sequence samples (quasi-random, very uniform).

    Sobol sequences are deterministic and provide excellent uniformity.
    """
    sampler = qmc.Sobol(d=len(bounds), scramble=True, seed=seed)
    samples_unit = sampler.random(n=n_samples)
    return _scale_unit_samples(bounds, samples_unit)


def generate_grid_samples(bounds: dict, points_per_dim: int = 5) -> list:
    """Generate regular grid samples.

    Warning: Scales exponentially with dimensions!
        4 dims x 5 points  = 625 samples
        4 dims x 10 points = 10,000 samples
    """
    var_names = list(bounds.keys())

    # Axis values for each dimension
    grids = []
    for name in var_names:
        b = bounds[name]
        values = np.linspace(b['min'], b['max'], points_per_dim)
        if b['type'] == 'int':
            # Snap to integers; np.unique drops duplicates created by rounding
            values = np.unique(np.round(values).astype(int))
        grids.append(values)

    # Cartesian product of all axes
    mesh = np.meshgrid(*grids, indexing='ij')
    flat = [m.flatten() for m in mesh]

    samples = []
    for i in range(len(flat[0])):
        point = {}
        for j, name in enumerate(var_names):
            value = flat[j][i]
            point[name] = (int(value) if bounds[name]['type'] == 'int'
                           else float(value))
        samples.append(point)
    return samples


def generate_corner_samples(bounds: dict) -> list:
    """Generate samples at all corners of the design space.

    This ensures the NN sees the extreme combinations.
    For 4 dimensions: 2^4 = 16 corner points.
    """
    var_names = list(bounds.keys())
    n_dims = len(var_names)

    samples = []
    for i in range(2**n_dims):
        point = {}
        for j, name in enumerate(var_names):
            b = bounds[name]
            # Use bit j of i to decide min or max
            value = b['max'] if (i >> j) & 1 else b['min']
            if b['type'] == 'int':
                value = int(value)
            point[name] = value
        samples.append(point)
    return samples


def save_training_points(samples: list, output_path: Path):
    """Save training points to JSON file."""
    with open(output_path, 'w') as f:
        json.dump({
            'n_samples': len(samples),
            'samples': samples
        }, f, indent=2)
    print(f"Saved {len(samples)} training points to: {output_path}")


def visualize_coverage(samples: list, bounds: dict, save_path: Path):
    """Visualize how well samples cover the design space.

    Draws one scatter plot per variable pair in the lower-left triangle of
    an (n_vars-1) x (n_vars-1) grid and hides the unused upper-right cells.
    """
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    var_names = list(bounds.keys())
    n_vars = len(var_names)

    # Pair (i, j) with i < j is plotted at row j-1, col i, so every used
    # cell satisfies row >= col (lower-left triangle incl. diagonal).
    fig, axes = plt.subplots(n_vars-1, n_vars-1, figsize=(12, 12))

    for i in range(n_vars - 1):
        for j in range(i + 1, n_vars):
            ax = axes[j-1, i] if n_vars > 2 else axes
            x = [s[var_names[i]] for s in samples]
            y = [s[var_names[j]] for s in samples]
            ax.scatter(x, y, alpha=0.5, s=20)
            ax.set_xlabel(var_names[i].replace('_', '\n'), fontsize=8)
            ax.set_ylabel(var_names[j].replace('_', '\n'), fontsize=8)

            # Show bounds
            b_i = bounds[var_names[i]]
            b_j = bounds[var_names[j]]
            ax.set_xlim(b_i['min'], b_i['max'])
            ax.set_ylim(b_j['min'], b_j['max'])
            ax.grid(True, alpha=0.3)

    # Hide unused subplots (upper-right triangle: row < col).
    # BUG FIX: the previous loop hid axes[i, j] for j < i, which are the
    # *used* lower-triangle cells, leaving the empty upper-right visible.
    if n_vars > 2:
        for r in range(n_vars - 1):
            for c in range(r + 1, n_vars - 1):
                axes[r, c].set_visible(False)

    plt.suptitle(f'Design Space Coverage ({len(samples)} samples)', fontsize=14)
    plt.tight_layout()
    plt.savefig(save_path, dpi=150)
    plt.close()
    print(f"Saved coverage plot: {save_path}")


def main():
    """CLI entry point: load bounds, generate samples, save (and plot)."""
    parser = argparse.ArgumentParser(description='Generate space-filling training data')
    parser.add_argument('--study', required=True,
                        help='Study name (e.g., uav_arm_optimization)')
    parser.add_argument('--method', default='lhs',
                        choices=['lhs', 'sobol', 'grid', 'corners', 'combined'],
                        help='Sampling method')
    parser.add_argument('--points', type=int, default=100,
                        help='Number of samples (for lhs/sobol)')
    parser.add_argument('--grid-points', type=int, default=5,
                        help='Points per dimension (for grid)')
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    parser.add_argument('--visualize', action='store_true',
                        help='Generate coverage plot')
    args = parser.parse_args()

    study_path = project_root / "studies" / args.study
    if not study_path.exists():
        print(f"ERROR: Study not found: {study_path}")
        return

    print("="*70)
    print("Space-Filling Training Data Generator")
    print("="*70)

    # Load bounds from config
    print(f"\nLoading config from: {study_path}")
    bounds, config = load_config_bounds(study_path)

    print(f"\nDesign Variable Bounds:")
    for name, b in bounds.items():
        print(f"  {name}: [{b['min']}, {b['max']}] ({b['type']})")

    # Generate samples
    print(f"\nGenerating samples using method: {args.method}")

    if args.method == 'lhs':
        samples = generate_lhs_samples(bounds, args.points, args.seed)
    elif args.method == 'sobol':
        samples = generate_sobol_samples(bounds, args.points, args.seed)
    elif args.method == 'grid':
        samples = generate_grid_samples(bounds, args.grid_points)
    elif args.method == 'corners':
        samples = generate_corner_samples(bounds)
    elif args.method == 'combined':
        # Combine corners + LHS for best coverage.
        # BUG FIX: clamp the LHS count to >= 0 so requesting fewer total
        # points than there are corners (2**n_dims) no longer crashes.
        corner_samples = generate_corner_samples(bounds)
        n_lhs = max(0, args.points - len(corner_samples))
        lhs_samples = generate_lhs_samples(bounds, n_lhs, args.seed)
        samples = corner_samples + lhs_samples
        print(f"   Combined: {len(corner_samples)} corners + {len(lhs_samples)} LHS")

    print(f"   Generated {len(samples)} samples")

    # Show sample range coverage
    print(f"\nSample Coverage:")
    for name in bounds.keys():
        values = [s[name] for s in samples]
        print(f"  {name}: [{min(values):.2f}, {max(values):.2f}]")

    # Save samples
    output_path = study_path / "1_setup" / "training_points.json"
    save_training_points(samples, output_path)

    # Visualize if requested
    if args.visualize:
        plot_path = study_path / "1_setup" / "training_coverage.png"
        visualize_coverage(samples, bounds, plot_path)

    print("\n" + "="*70)
    print("NEXT STEPS")
    print("="*70)
    print(f"1. Run FEA on all {len(samples)} training points:")
    print(f"   python run_training_fea.py --study {args.study}")
    print(f"2. This will create comprehensive training data")
    print(f"3. Then retrain NN on this uniform data")


if __name__ == '__main__':
    main()