feat: Major update with validators, skills, dashboard, and docs reorganization
- Add validation framework (config, model, results, study validators) - Add Claude Code skills (create-study, run-optimization, generate-report, troubleshoot, analyze-model) - Add Atomizer Dashboard (React frontend + FastAPI backend) - Reorganize docs into structured directories (00-09) - Add neural surrogate modules and training infrastructure - Add multi-objective optimization support 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
353
run_training_fea.py
Normal file
353
run_training_fea.py
Normal file
@@ -0,0 +1,353 @@
|
||||
"""
|
||||
Parallel FEA Training Data Generator
|
||||
=====================================
|
||||
|
||||
Runs FEA simulations on space-filling training points in parallel
|
||||
using multiple NX sessions. Results are stored in a shared SQLite
|
||||
database for thread-safe access.
|
||||
|
||||
Hardware Recommendation (based on your i7-14700HX, 64GB RAM):
|
||||
- 2-3 parallel sessions recommended (each uses ~4-6 cores for Nastran)
|
||||
- ~30-50 min for 100 points with 3 sessions
|
||||
|
||||
Usage:
|
||||
python run_training_fea.py --study uav_arm_optimization --workers 3
|
||||
python run_training_fea.py --study uav_arm_optimization --workers 2 --start 50
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import shutil
|
||||
import sqlite3
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
from multiprocessing import Manager
|
||||
import threading
|
||||
|
||||
# Add project root to path
|
||||
project_root = Path(__file__).parent
|
||||
sys.path.insert(0, str(project_root))
|
||||
|
||||
|
||||
def setup_worker_directory(study_dir: Path, worker_id: int) -> Path:
    """Prepare a clean, isolated working directory for one worker.

    Removes any stale directory left over from a previous run, then copies
    the shared model files into a per-worker location so parallel NX
    sessions never write to the same files.

    Args:
        study_dir: Root directory of the study.
        worker_id: Numeric id of the worker (used in the directory name).

    Returns:
        Path to the freshly populated worker directory.
    """
    target = study_dir / "1_setup" / f"worker_{worker_id}"
    source = study_dir / "1_setup" / "model"

    # Start from a clean slate so leftovers from a crashed run cannot leak in.
    if target.exists():
        shutil.rmtree(target)

    # copytree creates `target` itself and copies the model files into it.
    shutil.copytree(source, target)

    return target
|
||||
|
||||
|
||||
def run_single_fea(args_tuple):
    """
    Run a single FEA simulation. This function runs in a separate process.

    The outcome — success or failure — is always persisted to the SQLite
    database, so failed samples keep their error message for later triage.
    (Previously only successful runs were saved.)

    Args:
        args_tuple: (sample_idx, sample, worker_id, study_dir, db_path)
            study_dir and db_path arrive as strings; Path objects are
            rebuilt inside the worker process.

    Returns:
        dict with keys sample_idx, worker_id, params, success, error,
        mass (grams), frequency (Hz), max_displacement, max_stress.
    """
    sample_idx, sample, worker_id, study_dir_str, db_path_str = args_tuple
    study_dir = Path(study_dir_str)
    db_path = Path(db_path_str)

    # Import inside worker to avoid multiprocessing issues (the spawn start
    # method re-imports this module in a fresh interpreter).
    import sys
    sys.path.insert(0, str(Path(__file__).parent))

    try:
        import config as atomizer_config
    except ImportError:
        atomizer_config = None

    from optimization_engine.nx_solver import NXSolver
    from optimization_engine.extractors.extract_displacement import extract_displacement
    from optimization_engine.extractors.extract_von_mises_stress import extract_solid_stress
    from optimization_engine.extractors.extract_frequency import extract_frequency
    from optimization_engine.extractors.extract_mass_from_expression import extract_mass_from_expression

    result = {
        'sample_idx': sample_idx,
        'worker_id': worker_id,
        'params': sample,
        'success': False,
        'error': None,
        'mass': None,
        'frequency': None,
        'max_displacement': None,
        'max_stress': None
    }

    try:
        # Setup worker directory.
        # NOTE(review): worker_id is assigned round-robin by the caller; the
        # process pool does not obviously guarantee that two tasks sharing a
        # worker_id never run concurrently, in which case they would clobber
        # this directory — confirm scheduling makes this safe.
        worker_dir = setup_worker_directory(study_dir, worker_id)

        # Initialize NX solver for this worker
        nx_solver = NXSolver(
            nastran_version=atomizer_config.NX_VERSION if atomizer_config else "2412",
            timeout=atomizer_config.NASTRAN_TIMEOUT if atomizer_config else 600,
            use_journal=True,
            enable_session_management=True,
            study_name=f"training_worker_{worker_id}"
        )

        # Setup paths
        model_file = worker_dir / "Beam.prt"
        sim_file = worker_dir / "Beam_sim1.sim"

        print(f"[Worker {worker_id}] Sample {sample_idx}: {sample}")

        # Run simulation
        sim_result = nx_solver.run_simulation(
            sim_file=sim_file,
            working_dir=worker_dir,
            expression_updates=sample,
            solution_name=None  # Solve all solutions
        )

        if not sim_result['success']:
            result['error'] = sim_result.get('error', 'Unknown error')
            print(f"[Worker {worker_id}] Sample {sample_idx} FAILED: {result['error']}")
        else:
            op2_file = sim_result['op2_file']

            # Extract results
            # Mass from CAD expression
            mass_kg = extract_mass_from_expression(model_file, expression_name="p173")
            result['mass'] = mass_kg * 1000.0  # Convert to grams

            # Frequency from modal analysis.
            # NOTE(review): assumes the modal solve writes its OP2 with
            # "solution_2" substituted into the static path — confirm for
            # this study's solution naming.
            op2_modal = str(op2_file).replace("solution_1", "solution_2")
            freq_result = extract_frequency(op2_modal, subcase=1, mode_number=1)
            result['frequency'] = freq_result['frequency']

            # Displacement from static analysis
            disp_result = extract_displacement(op2_file, subcase=1)
            result['max_displacement'] = disp_result['max_displacement']

            # Stress from static analysis.
            # NOTE(review): element_type='cquad4' is a shell element being
            # passed to a solid-stress extractor — confirm this matches the
            # actual mesh.
            stress_result = extract_solid_stress(op2_file, subcase=1, element_type='cquad4')
            result['max_stress'] = stress_result['max_von_mises']

            result['success'] = True

            print(f"[Worker {worker_id}] Sample {sample_idx} SUCCESS: mass={result['mass']:.1f}g, freq={result['frequency']:.1f}Hz")

    except Exception as e:
        result['error'] = str(e)
        print(f"[Worker {worker_id}] Sample {sample_idx} ERROR: {e}")

    # BUG FIX: persist every outcome (thread-safe with retries), not just
    # successes — the schema's `error` and `success` columns were never
    # populated for failed runs, making failures invisible in the database.
    # Guard the save so a DB hiccup does not mask the FEA result itself.
    try:
        save_result_to_db(db_path, result)
    except Exception as db_err:
        print(f"[Worker {worker_id}] Sample {sample_idx} DB save failed: {db_err}")

    return result
|
||||
|
||||
|
||||
def save_result_to_db(db_path: Path, result: dict, max_retries: int = 5):
    """Save one sample's result row to the SQLite database.

    Uses INSERT OR REPLACE keyed on sample_idx and retries when the database
    is momentarily locked by a concurrent worker.

    Args:
        db_path: Path to the SQLite database file.
        result: Result dict produced by run_single_fea (keys: sample_idx,
            params, success, error, mass, frequency, max_displacement,
            max_stress).
        max_retries: Attempts before giving up on a locked database.

    Returns:
        True once the row was written.

    Raises:
        sqlite3.OperationalError: If the database stays locked past
            max_retries, or on any non-lock operational error.
    """
    for attempt in range(max_retries):
        try:
            conn = sqlite3.connect(str(db_path), timeout=30)
            try:
                conn.execute("""
                    INSERT OR REPLACE INTO training_results
                    (sample_idx, params_json, success, error, mass, frequency, max_displacement, max_stress, timestamp)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    result['sample_idx'],
                    json.dumps(result['params']),
                    result['success'],
                    result['error'],
                    result['mass'],
                    result['frequency'],
                    result['max_displacement'],
                    result['max_stress'],
                    datetime.now().isoformat()
                ))
                conn.commit()
                return True
            finally:
                # BUG FIX: always release the connection — the original leaked
                # it whenever execute/commit raised (e.g. on a locked DB),
                # accumulating open handles across retries.
                conn.close()

        except sqlite3.OperationalError as e:
            if "locked" in str(e) and attempt < max_retries - 1:
                time.sleep(0.5 * (attempt + 1))  # Linear backoff: 0.5s, 1.0s, 1.5s, ...
            else:
                raise

    return False
|
||||
|
||||
|
||||
def init_database(db_path: Path):
    """Create the SQLite schema for training results.

    Idempotent: uses CREATE TABLE IF NOT EXISTS, so calling it on an
    existing database (e.g. when resuming a run) is safe.
    """
    schema = """
        CREATE TABLE IF NOT EXISTS training_results (
            sample_idx INTEGER PRIMARY KEY,
            params_json TEXT,
            success INTEGER,
            error TEXT,
            mass REAL,
            frequency REAL,
            max_displacement REAL,
            max_stress REAL,
            timestamp TEXT
        )
    """

    conn = sqlite3.connect(str(db_path))
    # Connection.execute creates an implicit cursor; no explicit one needed.
    conn.execute(schema)
    conn.commit()
    conn.close()
|
||||
|
||||
|
||||
def get_completed_samples(db_path: Path) -> set:
    """Return the indices of samples that have already succeeded.

    Used by --resume to skip work that is already in the database. A
    missing database file or missing table yields an empty set.
    """
    if not db_path.exists():
        return set()

    done = set()
    conn = sqlite3.connect(str(db_path))

    try:
        rows = conn.execute(
            "SELECT sample_idx FROM training_results WHERE success = 1"
        ).fetchall()
        done = {idx for (idx,) in rows}
    except sqlite3.OperationalError:
        # Table does not exist yet — treat as "nothing completed".
        pass

    conn.close()
    return done
|
||||
|
||||
|
||||
def main():
    """CLI entry point: farm FEA training samples out to parallel workers.

    Loads the study's pre-generated training points, filters them by
    --start/--end (and --resume), then runs run_single_fea across a
    ProcessPoolExecutor while printing progress. Results are written to the
    study's SQLite database as each sample finishes.
    """
    parser = argparse.ArgumentParser(description='Run parallel FEA training data generation')
    parser.add_argument('--study', required=True, help='Study name (e.g., uav_arm_optimization)')
    parser.add_argument('--workers', type=int, default=2, help='Number of parallel workers (default: 2)')
    parser.add_argument('--start', type=int, default=0, help='Starting sample index (for resuming)')
    parser.add_argument('--end', type=int, default=None, help='Ending sample index (exclusive)')
    parser.add_argument('--resume', action='store_true', help='Skip already completed samples')
    args = parser.parse_args()

    # Setup paths
    study_dir = project_root / "studies" / args.study
    training_points_path = study_dir / "1_setup" / "training_points.json"
    db_path = study_dir / "2_results" / "training_data.db"

    if not study_dir.exists():
        print(f"ERROR: Study not found: {study_dir}")
        return

    if not training_points_path.exists():
        print(f"ERROR: Training points not found: {training_points_path}")
        print(f"Generate them first: python generate_training_data.py --study {args.study}")
        return

    # Load training points
    with open(training_points_path) as f:
        data = json.load(f)

    samples = data['samples']
    total_samples = len(samples)

    # Apply start/end filtering.
    # BUG FIX: compare against None so an explicit `--end 0` is honored
    # instead of being treated as "no limit" (0 is falsy).
    end_idx = args.end if args.end is not None else total_samples
    samples_to_run = [(i, samples[i]) for i in range(args.start, min(end_idx, total_samples))]

    print("=" * 70)
    print("PARALLEL FEA TRAINING DATA GENERATOR")
    print("=" * 70)
    print(f"Study: {args.study}")
    print(f"Total training points: {total_samples}")
    print(f"Processing range: {args.start} to {end_idx}")
    print(f"Parallel workers: {args.workers}")
    print(f"Database: {db_path}")
    print()

    # Initialize database.
    # BUG FIX: parents=True so a missing intermediate directory (fresh study
    # with no 2_results yet) doesn't raise FileNotFoundError.
    db_path.parent.mkdir(parents=True, exist_ok=True)
    init_database(db_path)

    # Check for already completed samples
    if args.resume:
        completed = get_completed_samples(db_path)
        samples_to_run = [(i, s) for i, s in samples_to_run if i not in completed]
        print(f"Already completed: {len(completed)} samples")
        print(f"Remaining to process: {len(samples_to_run)} samples")

    if not samples_to_run:
        print("All samples already completed!")
        return

    print()
    print(f"Starting {args.workers} parallel workers...")
    print("=" * 70)

    # Prepare worker arguments: each sample is tagged with a round-robin
    # worker id that selects its isolated working directory.
    worker_args = []
    for idx, (sample_idx, sample) in enumerate(samples_to_run):
        worker_id = idx % args.workers
        worker_args.append((sample_idx, sample, worker_id, str(study_dir), str(db_path)))

    # Track progress
    start_time = time.time()
    completed_count = 0
    failed_count = 0

    # Run with ProcessPoolExecutor
    with ProcessPoolExecutor(max_workers=args.workers) as executor:
        futures = {executor.submit(run_single_fea, arg): arg[0] for arg in worker_args}

        for future in as_completed(futures):
            sample_idx = futures[future]
            try:
                result = future.result()
                if result['success']:
                    completed_count += 1
                else:
                    failed_count += 1
            except Exception as e:
                print(f"Sample {sample_idx} raised exception: {e}")
                failed_count += 1

            # Progress update
            total_done = completed_count + failed_count
            elapsed = time.time() - start_time
            rate = total_done / elapsed if elapsed > 0 else 0  # samples per second
            remaining = len(samples_to_run) - total_done
            eta = remaining / rate / 60 if rate > 0 else 0  # minutes

            print(f"\nProgress: {total_done}/{len(samples_to_run)} ({completed_count} OK, {failed_count} failed)")
            # BUG FIX: rate is per-second; scale by 60 for the per-minute label.
            print(f"Rate: {rate * 60:.2f} samples/min | ETA: {eta:.1f} min")

    # Summary
    elapsed = time.time() - start_time
    print()
    print("=" * 70)
    print("TRAINING DATA GENERATION COMPLETE")
    print("=" * 70)
    print(f"Total time: {elapsed/60:.1f} minutes")
    print(f"Completed: {completed_count}/{len(samples_to_run)}")
    print(f"Failed: {failed_count}")
    print(f"Results saved to: {db_path}")
    print()
    print("Next steps:")
    print(" 1. Merge with existing optimization data:")
    print(f" python merge_training_data.py --study {args.study}")
    print(" 2. Retrain neural network:")
    print(f" python train_nn_surrogate.py --study {args.study}")
|
||||
|
||||
|
||||
# Standard script entry point — lets the multiprocessing workers re-import
# this module without triggering another full run.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user