# Source file: Atomizer/generate_training_data.py
"""
Space-Filling Training Data Generator
This script generates FEA training points that cover the ENTIRE design space
uniformly, unlike optimization which focuses only on promising regions.
Sampling Methods:
1. Latin Hypercube Sampling (LHS) - Good coverage, no clustering
2. Sobol Sequence - Quasi-random, very uniform
3. Grid Sampling - Regular grid, exhaustive but slow
Usage:
python generate_training_data.py --study uav_arm_optimization --method lhs --points 100
"""
import sys
from pathlib import Path
import json
import argparse
import numpy as np
from scipy.stats import qmc # For Latin Hypercube and Sobol
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
def load_config_bounds(study_path: Path) -> tuple:
    """Load design variable bounds from optimization_config.json.

    Supports two config formats:
    1. {"parameter": "name", "bounds": [min, max]}          - Current format
    2. {"name": "name", "min_value": min, "max_value": max} - Legacy format

    Args:
        study_path: Root directory of the study; the config is read from
            <study_path>/1_setup/optimization_config.json.

    Returns:
        Tuple of (bounds, config): bounds maps each variable name to
        {'min': ..., 'max': ..., 'type': 'int' | 'float'}; config is the
        full parsed JSON document.

    Raises:
        FileNotFoundError: If the config file does not exist.
        ValueError: If a design variable entry has no usable name.
    """
    config_path = study_path / "1_setup" / "optimization_config.json"
    if not config_path.exists():
        raise FileNotFoundError(f"Config not found: {config_path}")
    with open(config_path) as f:
        config = json.load(f)

    bounds = {}
    for var in config.get('design_variables', []):
        # Support both 'parameter' and 'name' keys
        name = var.get('parameter') or var.get('name')
        if not name:
            # Fail loudly here instead of crashing later on name.lower()
            raise ValueError(f"Design variable entry missing a name: {var}")
        # Support both "bounds": [min, max] and "min_value"/"max_value" formats
        if 'bounds' in var:
            min_val, max_val = var['bounds']
        else:
            min_val = var.get('min_value', var.get('min', 0))
            max_val = var.get('max_value', var.get('max', 1))
        # Detect integer type based on name or explicit type; the small-range
        # integer-bounds case is a heuristic for count-like variables.
        is_int = (var.get('type') == 'integer' or
                  'count' in name.lower() or
                  (isinstance(min_val, int) and isinstance(max_val, int) and max_val - min_val < 20))
        bounds[name] = {
            'min': min_val,
            'max': max_val,
            'type': 'int' if is_int else 'float'
        }
    return bounds, config
def generate_lhs_samples(bounds: dict, n_samples: int, seed: int = 42) -> list:
"""
Generate Latin Hypercube Samples across the full design space.
LHS ensures:
- Each dimension is divided into n equal intervals
- Exactly one sample in each interval per dimension
- Much better coverage than random sampling
"""
var_names = list(bounds.keys())
n_dims = len(var_names)
# Create LHS sampler
sampler = qmc.LatinHypercube(d=n_dims, seed=seed)
samples_unit = sampler.random(n=n_samples) # Values in [0, 1]
# Scale to actual bounds
samples = []
for i in range(n_samples):
point = {}
for j, name in enumerate(var_names):
b = bounds[name]
value = b['min'] + samples_unit[i, j] * (b['max'] - b['min'])
if b['type'] == 'int':
value = int(round(value))
point[name] = value
samples.append(point)
return samples
def generate_sobol_samples(bounds: dict, n_samples: int, seed: int = 42) -> list:
    """Generate Sobol sequence samples (quasi-random, very uniform).

    Scrambled Sobol points are deterministic for a fixed seed and provide
    excellent uniformity over the unit hypercube before being rescaled to
    the configured variable bounds.

    Args:
        bounds: Mapping of variable name -> {'min', 'max', 'type'}.
        n_samples: Number of points to draw.
        seed: Seed for the scrambling, for reproducibility.

    Returns:
        List of dicts mapping variable name to sampled value; integer-typed
        variables are rounded to the nearest int.
    """
    names = list(bounds.keys())
    engine = qmc.Sobol(d=len(names), scramble=True, seed=seed)
    unit_points = engine.random(n=n_samples)

    def _rescale(name, unit_value):
        # Map a [0, 1] coordinate onto the variable's [min, max] interval.
        spec = bounds[name]
        raw = spec['min'] + unit_value * (spec['max'] - spec['min'])
        return int(round(raw)) if spec['type'] == 'int' else raw

    return [
        {name: _rescale(name, row[k]) for k, name in enumerate(names)}
        for row in unit_points
    ]
def generate_grid_samples(bounds: dict, points_per_dim: int = 5) -> list:
    """Generate a full-factorial regular grid over the design space.

    Warning: the sample count scales exponentially with dimensions!
    4 dims x 5 points = 625 samples; 4 dims x 10 points = 10,000 samples.
    Integer dimensions are snapped to unique integer levels, so they may
    contribute fewer than points_per_dim distinct values.

    Args:
        bounds: Mapping of variable name -> {'min', 'max', 'type'}.
        points_per_dim: Number of levels per dimension (before int dedup).

    Returns:
        List of dicts covering every combination of the per-dimension levels.
    """
    names = list(bounds.keys())

    # Build the level values for each dimension.
    axes = []
    for name in names:
        spec = bounds[name]
        levels = np.linspace(spec['min'], spec['max'], points_per_dim)
        if spec['type'] == 'int':
            # Snap to integers and drop duplicates created by rounding.
            levels = np.unique(np.round(levels).astype(int))
        axes.append(levels)

    # Cartesian product via meshgrid; 'ij' indexing keeps the first
    # dimension varying slowest when flattened.
    columns = [grid.flatten() for grid in np.meshgrid(*axes, indexing='ij')]

    samples = []
    for combo in zip(*columns):
        point = {}
        for name, raw in zip(names, combo):
            # Cast numpy scalars back to native Python types.
            point[name] = int(raw) if bounds[name]['type'] == 'int' else float(raw)
        samples.append(point)
    return samples
def generate_corner_samples(bounds: dict) -> list:
    """Enumerate every vertex of the design-space hyper-rectangle.

    Ensures the NN training set includes all extreme combinations of the
    bounds. Bit j of the corner index selects min (bit clear) or max (bit
    set) for variable j, so n dimensions yield 2**n corner points.

    Args:
        bounds: Mapping of variable name -> {'min', 'max', 'type'}.

    Returns:
        List of 2**n_dims dicts, one per corner of the design space.
    """
    names = list(bounds.keys())
    corners = []
    for mask in range(1 << len(names)):
        point = {}
        for bit, name in enumerate(names):
            spec = bounds[name]
            chosen = spec['max'] if (mask >> bit) & 1 else spec['min']
            point[name] = int(chosen) if spec['type'] == 'int' else chosen
        corners.append(point)
    return corners
def save_training_points(samples: list, output_path: Path):
    """Write the generated sample list to *output_path* as indented JSON.

    The file contains the sample count alongside the samples so downstream
    FEA runners can sanity-check the payload.
    """
    payload = {
        'n_samples': len(samples),
        'samples': samples
    }
    with open(output_path, 'w') as handle:
        json.dump(payload, handle, indent=2)
    print(f"Saved {len(samples)} training points to: {output_path}")
def visualize_coverage(samples: list, bounds: dict, save_path: Path):
    """Visualize how well samples cover the design space.

    Renders a lower-triangle matrix of pairwise scatter plots (one per
    variable pair), with axes clamped to the configured bounds, and saves
    the figure as a PNG to *save_path*.
    """
    # Lazy import with the 'Agg' backend so the plot renders headless
    # (no display server) and matplotlib stays optional unless --visualize.
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    var_names = list(bounds.keys())
    n_vars = len(var_names)
    # Create pairwise scatter plots
    # NOTE(review): assumes n_vars >= 2 — with one variable subplots(0, 0)
    # produces an empty grid and an empty figure is saved; confirm callers
    # always configure at least two design variables.
    fig, axes = plt.subplots(n_vars-1, n_vars-1, figsize=(12, 12))
    for i in range(n_vars - 1):
        for j in range(i + 1, n_vars):
            # With exactly 2 variables, subplots() returns a single Axes
            # object rather than a 2-D array, hence the ternary.
            ax = axes[j-1, i] if n_vars > 2 else axes
            x = [s[var_names[i]] for s in samples]
            y = [s[var_names[j]] for s in samples]
            ax.scatter(x, y, alpha=0.5, s=20)
            ax.set_xlabel(var_names[i].replace('_', '\n'), fontsize=8)
            ax.set_ylabel(var_names[j].replace('_', '\n'), fontsize=8)
            # Show bounds
            b_i = bounds[var_names[i]]
            b_j = bounds[var_names[j]]
            ax.set_xlim(b_i['min'], b_i['max'])
            ax.set_ylim(b_j['min'], b_j['max'])
            ax.grid(True, alpha=0.3)
    # Hide unused subplots (the upper triangle of the grid)
    for i in range(n_vars - 1):
        for j in range(i):
            if n_vars > 2:
                axes[i, j].set_visible(False)
    plt.suptitle(f'Design Space Coverage ({len(samples)} samples)', fontsize=14)
    plt.tight_layout()
    plt.savefig(save_path, dpi=150)
    plt.close()
    print(f"Saved coverage plot: {save_path}")
def main():
    """CLI entry point: load study bounds, generate samples, save (and plot).

    Parses command-line arguments, reads the study's design-variable bounds,
    generates space-filling samples with the chosen method, writes them to
    1_setup/training_points.json, and optionally saves a coverage plot.
    """
    parser = argparse.ArgumentParser(description='Generate space-filling training data')
    parser.add_argument('--study', required=True, help='Study name (e.g., uav_arm_optimization)')
    parser.add_argument('--method', default='lhs', choices=['lhs', 'sobol', 'grid', 'corners', 'combined'],
                        help='Sampling method')
    parser.add_argument('--points', type=int, default=100, help='Number of samples (for lhs/sobol)')
    parser.add_argument('--grid-points', type=int, default=5, help='Points per dimension (for grid)')
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    parser.add_argument('--visualize', action='store_true', help='Generate coverage plot')
    args = parser.parse_args()

    study_path = project_root / "studies" / args.study
    if not study_path.exists():
        print(f"ERROR: Study not found: {study_path}")
        return

    print("="*70)
    print("Space-Filling Training Data Generator")
    print("="*70)

    # Load bounds from config
    print(f"\nLoading config from: {study_path}")
    bounds, config = load_config_bounds(study_path)
    print(f"\nDesign Variable Bounds:")
    for name, b in bounds.items():
        print(f" {name}: [{b['min']}, {b['max']}] ({b['type']})")

    # Generate samples
    print(f"\nGenerating samples using method: {args.method}")
    if args.method == 'lhs':
        samples = generate_lhs_samples(bounds, args.points, args.seed)
    elif args.method == 'sobol':
        samples = generate_sobol_samples(bounds, args.points, args.seed)
    elif args.method == 'grid':
        samples = generate_grid_samples(bounds, args.grid_points)
    elif args.method == 'corners':
        samples = generate_corner_samples(bounds)
    elif args.method == 'combined':
        # Combine corners + LHS for best coverage. Clamp the LHS count at
        # zero so that requesting fewer points than there are corners
        # (2**n_dims) cannot pass a negative sample count to the sampler.
        corner_samples = generate_corner_samples(bounds)
        n_lhs = max(0, args.points - len(corner_samples))
        lhs_samples = generate_lhs_samples(bounds, n_lhs, args.seed)
        samples = corner_samples + lhs_samples
        print(f" Combined: {len(corner_samples)} corners + {len(lhs_samples)} LHS")
    print(f" Generated {len(samples)} samples")

    # Show sample range coverage
    print(f"\nSample Coverage:")
    for name in bounds.keys():
        values = [s[name] for s in samples]
        print(f" {name}: [{min(values):.2f}, {max(values):.2f}]")

    # Save samples
    output_path = study_path / "1_setup" / "training_points.json"
    save_training_points(samples, output_path)

    # Visualize if requested
    if args.visualize:
        plot_path = study_path / "1_setup" / "training_coverage.png"
        visualize_coverage(samples, bounds, plot_path)

    print(f"\n" + "="*70)
    print("NEXT STEPS")
    print("="*70)
    print(f"1. Run FEA on all {len(samples)} training points:")
    print(f" python run_training_fea.py --study {args.study}")
    print(f"2. This will create comprehensive training data")
    print(f"3. Then retrain NN on this uniform data")


if __name__ == '__main__':
    main()