Files
Atomizer/run_training_fea.py
Anto01 e3bdb08a22 feat: Major update with validators, skills, dashboard, and docs reorganization
- Add validation framework (config, model, results, study validators)
- Add Claude Code skills (create-study, run-optimization, generate-report,
  troubleshoot, analyze-model)
- Add Atomizer Dashboard (React frontend + FastAPI backend)
- Reorganize docs into structured directories (00-09)
- Add neural surrogate modules and training infrastructure
- Add multi-objective optimization support

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-25 19:23:58 -05:00

354 lines
12 KiB
Python

"""
Parallel FEA Training Data Generator
=====================================
Runs FEA simulations on space-filling training points in parallel
using multiple NX sessions. Results are stored in a shared SQLite
database for thread-safe access.
Hardware Recommendation (based on your i7-14700HX, 64GB RAM):
- 2-3 parallel sessions recommended (each uses ~4-6 cores for Nastran)
- ~30-50 min for 100 points with 3 sessions
Usage:
python run_training_fea.py --study uav_arm_optimization --workers 3
python run_training_fea.py --study uav_arm_optimization --workers 2 --start 50
"""
import sys
import json
import argparse
import shutil
import sqlite3
import time
from pathlib import Path
from datetime import datetime
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import Manager
import threading
# Add project root to path so sibling modules (config, optimization_engine, ...)
# resolve when this script is run directly rather than as a package module.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
def setup_worker_directory(study_dir: Path, worker_id: int) -> Path:
    """Provision a clean, isolated copy of the model files for one worker.

    Deletes any stale ``1_setup/worker_<id>`` directory left over from a
    previous run, then clones ``1_setup/model`` into it so parallel NX
    sessions never operate on the same files.

    Args:
        study_dir: Root directory of the study.
        worker_id: Numeric worker slot; used to name the directory.

    Returns:
        Path to the freshly created worker directory.
    """
    setup_root = study_dir / "1_setup"
    worker_dir = setup_root / f"worker_{worker_id}"
    model_src = setup_root / "model"

    # Always start from a pristine copy of the model.
    if worker_dir.exists():
        shutil.rmtree(worker_dir)
    shutil.copytree(model_src, worker_dir)

    return worker_dir
def run_single_fea(args_tuple):
    """
    Run one FEA simulation end-to-end in a worker process.

    Unpacks its arguments from a tuple (required because it is submitted to a
    ProcessPoolExecutor), sets up an isolated working directory, drives one
    NX/Nastran solve with the sample's expression values, extracts mass,
    frequency, displacement and stress, and writes the row to SQLite.

    Args:
        args_tuple: (sample_idx, sample, worker_id, study_dir_str, db_path_str)
            where sample is a dict of expression-name -> value updates.

    Returns:
        dict with keys sample_idx, worker_id, params, success, error, mass,
        frequency, max_displacement, max_stress. On failure, success is False
        and error holds the message; metric fields stay None.
    """
    sample_idx, sample, worker_id, study_dir_str, db_path_str = args_tuple
    study_dir = Path(study_dir_str)
    db_path = Path(db_path_str)
    # Import inside worker to avoid multiprocessing issues
    # (child processes may not inherit sys.path / module state on spawn).
    import sys
    sys.path.insert(0, str(Path(__file__).parent))
    try:
        import config as atomizer_config
    except ImportError:
        # Optional project config; fall back to hard-coded defaults below.
        atomizer_config = None
    from optimization_engine.nx_solver import NXSolver
    from optimization_engine.extractors.extract_displacement import extract_displacement
    from optimization_engine.extractors.extract_von_mises_stress import extract_solid_stress
    from optimization_engine.extractors.extract_frequency import extract_frequency
    from optimization_engine.extractors.extract_mass_from_expression import extract_mass_from_expression
    # Result skeleton; metric fields are filled in only on success.
    result = {
        'sample_idx': sample_idx,
        'worker_id': worker_id,
        'params': sample,
        'success': False,
        'error': None,
        'mass': None,
        'frequency': None,
        'max_displacement': None,
        'max_stress': None
    }
    try:
        # Setup worker directory.
        # NOTE(review): worker_id is assigned round-robin by task index in
        # main(), not by actual executor process — two tasks sharing a
        # worker_id could run concurrently and clobber the same directory;
        # verify the scheduling guarantees before scaling workers.
        worker_dir = setup_worker_directory(study_dir, worker_id)
        # Initialize NX solver for this worker
        nx_solver = NXSolver(
            nastran_version=atomizer_config.NX_VERSION if atomizer_config else "2412",
            timeout=atomizer_config.NASTRAN_TIMEOUT if atomizer_config else 600,
            use_journal=True,
            enable_session_management=True,
            study_name=f"training_worker_{worker_id}"
        )
        # Setup paths (fixed model/sim filenames copied from 1_setup/model).
        model_file = worker_dir / "Beam.prt"
        sim_file = worker_dir / "Beam_sim1.sim"
        print(f"[Worker {worker_id}] Sample {sample_idx}: {sample}")
        # Run simulation with this sample's parameter values pushed into
        # the model's expressions.
        sim_result = nx_solver.run_simulation(
            sim_file=sim_file,
            working_dir=worker_dir,
            expression_updates=sample,
            solution_name=None  # Solve all solutions
        )
        if not sim_result['success']:
            result['error'] = sim_result.get('error', 'Unknown error')
            print(f"[Worker {worker_id}] Sample {sample_idx} FAILED: {result['error']}")
            return result
        op2_file = sim_result['op2_file']
        # Extract results
        # Mass from CAD expression — assumes expression "p173" holds mass
        # in kg; TODO confirm against the model.
        mass_kg = extract_mass_from_expression(model_file, expression_name="p173")
        result['mass'] = mass_kg * 1000.0  # Convert to grams
        # Frequency from modal analysis — modal OP2 path is derived by
        # string substitution; presumably solution_2 is the modal solution.
        op2_modal = str(op2_file).replace("solution_1", "solution_2")
        freq_result = extract_frequency(op2_modal, subcase=1, mode_number=1)
        result['frequency'] = freq_result['frequency']
        # Displacement from static analysis
        disp_result = extract_displacement(op2_file, subcase=1)
        result['max_displacement'] = disp_result['max_displacement']
        # Stress from static analysis.
        # NOTE(review): 'cquad4' is a shell element type although the
        # extractor is named extract_solid_stress — confirm the mesh type.
        stress_result = extract_solid_stress(op2_file, subcase=1, element_type='cquad4')
        result['max_stress'] = stress_result['max_von_mises']
        result['success'] = True
        print(f"[Worker {worker_id}] Sample {sample_idx} SUCCESS: mass={result['mass']:.1f}g, freq={result['frequency']:.1f}Hz")
        # Save to database immediately (thread-safe with retries)
        save_result_to_db(db_path, result)
    except Exception as e:
        # Broad catch is deliberate: one failed sample must not kill the
        # batch; the error is recorded in the returned/stored row.
        result['error'] = str(e)
        print(f"[Worker {worker_id}] Sample {sample_idx} ERROR: {e}")
    return result
def save_result_to_db(db_path: Path, result: dict, max_retries: int = 5):
    """Persist one sample's result row, retrying while the database is locked.

    Rows are keyed on sample_idx via INSERT OR REPLACE, so re-running a
    sample overwrites its previous row. Several worker processes write to
    the same SQLite file, so an occasional "database is locked" error is
    expected; it is retried with a linearly increasing sleep (0.5s, 1.0s, ...).

    Args:
        db_path: SQLite database file (schema created by init_database).
        result: Result dict produced by run_single_fea; 'params' is stored
            as a JSON string.
        max_retries: Attempts before giving up on a persistent lock.

    Returns:
        True once the row has been committed.

    Raises:
        sqlite3.OperationalError: For non-lock errors, or when the lock
            persists through all retries.
    """
    # Build the row once; it is identical across retry attempts.
    row = (
        result['sample_idx'],
        json.dumps(result['params']),
        result['success'],
        result['error'],
        result['mass'],
        result['frequency'],
        result['max_displacement'],
        result['max_stress'],
        datetime.now().isoformat(),
    )
    for attempt in range(max_retries):
        conn = sqlite3.connect(str(db_path), timeout=30)
        try:
            conn.execute("""
                INSERT OR REPLACE INTO training_results
                (sample_idx, params_json, success, error, mass, frequency, max_displacement, max_stress, timestamp)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, row)
            conn.commit()
            return True
        except sqlite3.OperationalError as e:
            if "locked" in str(e) and attempt < max_retries - 1:
                # Linear backoff (the original comment claimed exponential).
                time.sleep(0.5 * (attempt + 1))
            else:
                raise
        finally:
            # Fix: the original leaked the connection on every error path.
            conn.close()
    return False
def init_database(db_path: Path):
    """Create the training_results table if it does not already exist.

    Safe to call repeatedly; the schema uses CREATE TABLE IF NOT EXISTS,
    so an existing database is left untouched.
    """
    schema = """
    CREATE TABLE IF NOT EXISTS training_results (
    sample_idx INTEGER PRIMARY KEY,
    params_json TEXT,
    success INTEGER,
    error TEXT,
    mass REAL,
    frequency REAL,
    max_displacement REAL,
    max_stress REAL,
    timestamp TEXT
    )
    """
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute(schema)
        conn.commit()
    finally:
        conn.close()
def get_completed_samples(db_path: Path) -> set:
    """Return the indices of samples already solved successfully.

    Used by --resume to skip finished work. A missing database file, or a
    database without the table yet, yields an empty set.
    """
    if not db_path.exists():
        return set()

    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            "SELECT sample_idx FROM training_results WHERE success = 1"
        ).fetchall()
        done = {idx for (idx,) in rows}
    except sqlite3.OperationalError:
        # Table not created yet -> nothing has completed.
        done = set()
    conn.close()
    return done
def main():
    """CLI entry point: load training points and fan FEA runs out to workers.

    Parses study/worker/range options, resumes past completed samples when
    requested, submits every remaining sample to a ProcessPoolExecutor, and
    prints rolling progress plus a final summary. Results are persisted per
    sample by run_single_fea, so this function only tallies outcomes.
    """
    parser = argparse.ArgumentParser(description='Run parallel FEA training data generation')
    parser.add_argument('--study', required=True, help='Study name (e.g., uav_arm_optimization)')
    parser.add_argument('--workers', type=int, default=2, help='Number of parallel workers (default: 2)')
    parser.add_argument('--start', type=int, default=0, help='Starting sample index (for resuming)')
    parser.add_argument('--end', type=int, default=None, help='Ending sample index (exclusive)')
    parser.add_argument('--resume', action='store_true', help='Skip already completed samples')
    args = parser.parse_args()
    # Setup paths (fixed study layout: 1_setup inputs, 2_results outputs).
    study_dir = project_root / "studies" / args.study
    training_points_path = study_dir / "1_setup" / "training_points.json"
    db_path = study_dir / "2_results" / "training_data.db"
    if not study_dir.exists():
        print(f"ERROR: Study not found: {study_dir}")
        return
    if not training_points_path.exists():
        print(f"ERROR: Training points not found: {training_points_path}")
        print(f"Generate them first: python generate_training_data.py --study {args.study}")
        return
    # Load training points (JSON with a 'samples' list of parameter dicts).
    with open(training_points_path) as f:
        data = json.load(f)
    samples = data['samples']
    total_samples = len(samples)
    # Apply start/end filtering.
    # NOTE(review): truthiness test means --end 0 is treated as "no end";
    # harmless in practice but `args.end is not None` would be exact.
    end_idx = args.end if args.end else total_samples
    samples_to_run = [(i, samples[i]) for i in range(args.start, min(end_idx, total_samples))]
    print("=" * 70)
    print("PARALLEL FEA TRAINING DATA GENERATOR")
    print("=" * 70)
    print(f"Study: {args.study}")
    print(f"Total training points: {total_samples}")
    print(f"Processing range: {args.start} to {end_idx}")
    print(f"Parallel workers: {args.workers}")
    print(f"Database: {db_path}")
    print()
    # Initialize database.
    # NOTE(review): mkdir without parents=True fails if 'studies/<name>'
    # itself is missing — already guarded above by the study_dir check.
    db_path.parent.mkdir(exist_ok=True)
    init_database(db_path)
    # Check for already completed samples (successful rows in the db).
    if args.resume:
        completed = get_completed_samples(db_path)
        samples_to_run = [(i, s) for i, s in samples_to_run if i not in completed]
        print(f"Already completed: {len(completed)} samples")
        print(f"Remaining to process: {len(samples_to_run)} samples")
    if not samples_to_run:
        print("All samples already completed!")
        return
    print()
    print(f"Starting {args.workers} parallel workers...")
    print("=" * 70)
    # Prepare worker arguments.
    # NOTE(review): worker_id is round-robin over task index; the executor
    # does not guarantee tasks with the same worker_id run sequentially, so
    # two could share a worker directory concurrently — confirm intent.
    worker_args = []
    for idx, (sample_idx, sample) in enumerate(samples_to_run):
        worker_id = idx % args.workers
        worker_args.append((sample_idx, sample, worker_id, str(study_dir), str(db_path)))
    # Track progress
    start_time = time.time()
    completed_count = 0
    failed_count = 0
    # Run with ProcessPoolExecutor; futures map back to sample indices so
    # failures can be attributed as they complete (in arbitrary order).
    with ProcessPoolExecutor(max_workers=args.workers) as executor:
        futures = {executor.submit(run_single_fea, arg): arg[0] for arg in worker_args}
        for future in as_completed(futures):
            sample_idx = futures[future]
            try:
                result = future.result()
                if result['success']:
                    completed_count += 1
                else:
                    failed_count += 1
            except Exception as e:
                # Worker crashed outright (run_single_fea normally catches
                # its own errors and returns a failure dict instead).
                print(f"Sample {sample_idx} raised exception: {e}")
                failed_count += 1
            # Progress update after every finished sample.
            total_done = completed_count + failed_count
            elapsed = time.time() - start_time
            rate = total_done / elapsed if elapsed > 0 else 0
            remaining = len(samples_to_run) - total_done
            eta = remaining / rate / 60 if rate > 0 else 0
            print(f"\nProgress: {total_done}/{len(samples_to_run)} ({completed_count} OK, {failed_count} failed)")
            print(f"Rate: {rate:.2f} samples/min | ETA: {eta:.1f} min")
    # Summary
    elapsed = time.time() - start_time
    print()
    print("=" * 70)
    print("TRAINING DATA GENERATION COMPLETE")
    print("=" * 70)
    print(f"Total time: {elapsed/60:.1f} minutes")
    print(f"Completed: {completed_count}/{len(samples_to_run)}")
    print(f"Failed: {failed_count}")
    print(f"Results saved to: {db_path}")
    print()
    print("Next steps:")
    print(" 1. Merge with existing optimization data:")
    print(f" python merge_training_data.py --study {args.study}")
    print(" 2. Retrain neural network:")
    print(f" python train_nn_surrogate.py --study {args.study}")
# Entry guard — required here: ProcessPoolExecutor re-imports this module in
# child processes, and an unguarded main() would recurse on spawn platforms.
if __name__ == "__main__":
    main()