feat: Major update with validators, skills, dashboard, and docs reorganization
- Add validation framework (config, model, results, study validators) - Add Claude Code skills (create-study, run-optimization, generate-report, troubleshoot, analyze-model) - Add Atomizer Dashboard (React frontend + FastAPI backend) - Reorganize docs into structured directories (00-09) - Add neural surrogate modules and training infrastructure - Add multi-objective optimization support 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
353
run_training_fea.py
Normal file
353
run_training_fea.py
Normal file
@@ -0,0 +1,353 @@
|
||||
"""
|
||||
Parallel FEA Training Data Generator
|
||||
=====================================
|
||||
|
||||
Runs FEA simulations on space-filling training points in parallel
|
||||
using multiple NX sessions. Results are stored in a shared SQLite
|
||||
database for thread-safe access.
|
||||
|
||||
Hardware Recommendation (based on your i7-14700HX, 64GB RAM):
|
||||
- 2-3 parallel sessions recommended (each uses ~4-6 cores for Nastran)
|
||||
- ~30-50 min for 100 points with 3 sessions
|
||||
|
||||
Usage:
|
||||
python run_training_fea.py --study uav_arm_optimization --workers 3
|
||||
python run_training_fea.py --study uav_arm_optimization --workers 2 --start 50
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import shutil
|
||||
import sqlite3
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
from multiprocessing import Manager
|
||||
import threading
|
||||
|
||||
# Add project root to path
|
||||
project_root = Path(__file__).parent
|
||||
sys.path.insert(0, str(project_root))
|
||||
|
||||
|
||||
def setup_worker_directory(study_dir: Path, worker_id: int) -> Path:
    """Prepare a clean, isolated working directory for one worker.

    Removes any stale directory left over from a previous run, then copies
    the shared model files into a per-worker location so parallel NX
    sessions never write to the same files.

    Args:
        study_dir: Root directory of the study.
        worker_id: Numeric id of the worker (used in the directory name).

    Returns:
        Path to the freshly populated worker directory.
    """
    target = study_dir / "1_setup" / f"worker_{worker_id}"
    source = study_dir / "1_setup" / "model"

    # Start from a clean slate so leftovers from a crashed run cannot leak in.
    if target.exists():
        shutil.rmtree(target)

    # copytree creates `target` itself and copies the model files into it.
    shutil.copytree(source, target)

    return target
|
||||
|
||||
|
||||
def run_single_fea(args_tuple):
    """
    Run a single FEA simulation. This function runs in a separate process.

    The outcome — success or failure — is always persisted to the SQLite
    database, so failed samples keep their error message for later triage.
    (Previously only successful runs were saved.)

    Args:
        args_tuple: (sample_idx, sample, worker_id, study_dir, db_path)
            study_dir and db_path arrive as strings; Path objects are
            rebuilt inside the worker process.

    Returns:
        dict with keys sample_idx, worker_id, params, success, error,
        mass (grams), frequency (Hz), max_displacement, max_stress.
    """
    sample_idx, sample, worker_id, study_dir_str, db_path_str = args_tuple
    study_dir = Path(study_dir_str)
    db_path = Path(db_path_str)

    # Import inside worker to avoid multiprocessing issues (the spawn start
    # method re-imports this module in a fresh interpreter).
    import sys
    sys.path.insert(0, str(Path(__file__).parent))

    try:
        import config as atomizer_config
    except ImportError:
        atomizer_config = None

    from optimization_engine.nx_solver import NXSolver
    from optimization_engine.extractors.extract_displacement import extract_displacement
    from optimization_engine.extractors.extract_von_mises_stress import extract_solid_stress
    from optimization_engine.extractors.extract_frequency import extract_frequency
    from optimization_engine.extractors.extract_mass_from_expression import extract_mass_from_expression

    result = {
        'sample_idx': sample_idx,
        'worker_id': worker_id,
        'params': sample,
        'success': False,
        'error': None,
        'mass': None,
        'frequency': None,
        'max_displacement': None,
        'max_stress': None
    }

    try:
        # Setup worker directory.
        # NOTE(review): worker_id is assigned round-robin by the caller; the
        # process pool does not obviously guarantee that two tasks sharing a
        # worker_id never run concurrently, in which case they would clobber
        # this directory — confirm scheduling makes this safe.
        worker_dir = setup_worker_directory(study_dir, worker_id)

        # Initialize NX solver for this worker
        nx_solver = NXSolver(
            nastran_version=atomizer_config.NX_VERSION if atomizer_config else "2412",
            timeout=atomizer_config.NASTRAN_TIMEOUT if atomizer_config else 600,
            use_journal=True,
            enable_session_management=True,
            study_name=f"training_worker_{worker_id}"
        )

        # Setup paths
        model_file = worker_dir / "Beam.prt"
        sim_file = worker_dir / "Beam_sim1.sim"

        print(f"[Worker {worker_id}] Sample {sample_idx}: {sample}")

        # Run simulation
        sim_result = nx_solver.run_simulation(
            sim_file=sim_file,
            working_dir=worker_dir,
            expression_updates=sample,
            solution_name=None  # Solve all solutions
        )

        if not sim_result['success']:
            result['error'] = sim_result.get('error', 'Unknown error')
            print(f"[Worker {worker_id}] Sample {sample_idx} FAILED: {result['error']}")
        else:
            op2_file = sim_result['op2_file']

            # Extract results
            # Mass from CAD expression
            mass_kg = extract_mass_from_expression(model_file, expression_name="p173")
            result['mass'] = mass_kg * 1000.0  # Convert to grams

            # Frequency from modal analysis.
            # NOTE(review): assumes the modal solve writes its OP2 with
            # "solution_2" substituted into the static path — confirm for
            # this study's solution naming.
            op2_modal = str(op2_file).replace("solution_1", "solution_2")
            freq_result = extract_frequency(op2_modal, subcase=1, mode_number=1)
            result['frequency'] = freq_result['frequency']

            # Displacement from static analysis
            disp_result = extract_displacement(op2_file, subcase=1)
            result['max_displacement'] = disp_result['max_displacement']

            # Stress from static analysis.
            # NOTE(review): element_type='cquad4' is a shell element being
            # passed to a solid-stress extractor — confirm this matches the
            # actual mesh.
            stress_result = extract_solid_stress(op2_file, subcase=1, element_type='cquad4')
            result['max_stress'] = stress_result['max_von_mises']

            result['success'] = True

            print(f"[Worker {worker_id}] Sample {sample_idx} SUCCESS: mass={result['mass']:.1f}g, freq={result['frequency']:.1f}Hz")

    except Exception as e:
        result['error'] = str(e)
        print(f"[Worker {worker_id}] Sample {sample_idx} ERROR: {e}")

    # BUG FIX: persist every outcome (thread-safe with retries), not just
    # successes — the schema's `error` and `success` columns were never
    # populated for failed runs, making failures invisible in the database.
    # Guard the save so a DB hiccup does not mask the FEA result itself.
    try:
        save_result_to_db(db_path, result)
    except Exception as db_err:
        print(f"[Worker {worker_id}] Sample {sample_idx} DB save failed: {db_err}")

    return result
|
||||
|
||||
|
||||
def save_result_to_db(db_path: Path, result: dict, max_retries: int = 5):
    """Save one sample's result row to the SQLite database.

    Uses INSERT OR REPLACE keyed on sample_idx and retries when the database
    is momentarily locked by a concurrent worker.

    Args:
        db_path: Path to the SQLite database file.
        result: Result dict produced by run_single_fea (keys: sample_idx,
            params, success, error, mass, frequency, max_displacement,
            max_stress).
        max_retries: Attempts before giving up on a locked database.

    Returns:
        True once the row was written.

    Raises:
        sqlite3.OperationalError: If the database stays locked past
            max_retries, or on any non-lock operational error.
    """
    for attempt in range(max_retries):
        try:
            conn = sqlite3.connect(str(db_path), timeout=30)
            try:
                conn.execute("""
                    INSERT OR REPLACE INTO training_results
                    (sample_idx, params_json, success, error, mass, frequency, max_displacement, max_stress, timestamp)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    result['sample_idx'],
                    json.dumps(result['params']),
                    result['success'],
                    result['error'],
                    result['mass'],
                    result['frequency'],
                    result['max_displacement'],
                    result['max_stress'],
                    datetime.now().isoformat()
                ))
                conn.commit()
                return True
            finally:
                # BUG FIX: always release the connection — the original leaked
                # it whenever execute/commit raised (e.g. on a locked DB),
                # accumulating open handles across retries.
                conn.close()

        except sqlite3.OperationalError as e:
            if "locked" in str(e) and attempt < max_retries - 1:
                time.sleep(0.5 * (attempt + 1))  # Linear backoff: 0.5s, 1.0s, 1.5s, ...
            else:
                raise

    return False
|
||||
|
||||
|
||||
def init_database(db_path: Path):
    """Create the SQLite schema for training results.

    Idempotent: uses CREATE TABLE IF NOT EXISTS, so calling it on an
    existing database (e.g. when resuming a run) is safe.
    """
    schema = """
        CREATE TABLE IF NOT EXISTS training_results (
            sample_idx INTEGER PRIMARY KEY,
            params_json TEXT,
            success INTEGER,
            error TEXT,
            mass REAL,
            frequency REAL,
            max_displacement REAL,
            max_stress REAL,
            timestamp TEXT
        )
    """

    conn = sqlite3.connect(str(db_path))
    # Connection.execute creates an implicit cursor; no explicit one needed.
    conn.execute(schema)
    conn.commit()
    conn.close()
|
||||
|
||||
|
||||
def get_completed_samples(db_path: Path) -> set:
    """Return the indices of samples that have already succeeded.

    Used by --resume to skip work that is already in the database. A
    missing database file or missing table yields an empty set.
    """
    if not db_path.exists():
        return set()

    done = set()
    conn = sqlite3.connect(str(db_path))

    try:
        rows = conn.execute(
            "SELECT sample_idx FROM training_results WHERE success = 1"
        ).fetchall()
        done = {idx for (idx,) in rows}
    except sqlite3.OperationalError:
        # Table does not exist yet — treat as "nothing completed".
        pass

    conn.close()
    return done
|
||||
|
||||
|
||||
def main():
    """CLI entry point: farm FEA training samples out to parallel workers.

    Loads the study's pre-generated training points, filters them by
    --start/--end (and --resume), then runs run_single_fea across a
    ProcessPoolExecutor while printing progress. Results are written to the
    study's SQLite database as each sample finishes.
    """
    parser = argparse.ArgumentParser(description='Run parallel FEA training data generation')
    parser.add_argument('--study', required=True, help='Study name (e.g., uav_arm_optimization)')
    parser.add_argument('--workers', type=int, default=2, help='Number of parallel workers (default: 2)')
    parser.add_argument('--start', type=int, default=0, help='Starting sample index (for resuming)')
    parser.add_argument('--end', type=int, default=None, help='Ending sample index (exclusive)')
    parser.add_argument('--resume', action='store_true', help='Skip already completed samples')
    args = parser.parse_args()

    # Setup paths
    study_dir = project_root / "studies" / args.study
    training_points_path = study_dir / "1_setup" / "training_points.json"
    db_path = study_dir / "2_results" / "training_data.db"

    if not study_dir.exists():
        print(f"ERROR: Study not found: {study_dir}")
        return

    if not training_points_path.exists():
        print(f"ERROR: Training points not found: {training_points_path}")
        print(f"Generate them first: python generate_training_data.py --study {args.study}")
        return

    # Load training points
    with open(training_points_path) as f:
        data = json.load(f)

    samples = data['samples']
    total_samples = len(samples)

    # Apply start/end filtering.
    # BUG FIX: compare against None so an explicit `--end 0` is honored
    # instead of being treated as "no limit" (0 is falsy).
    end_idx = args.end if args.end is not None else total_samples
    samples_to_run = [(i, samples[i]) for i in range(args.start, min(end_idx, total_samples))]

    print("=" * 70)
    print("PARALLEL FEA TRAINING DATA GENERATOR")
    print("=" * 70)
    print(f"Study: {args.study}")
    print(f"Total training points: {total_samples}")
    print(f"Processing range: {args.start} to {end_idx}")
    print(f"Parallel workers: {args.workers}")
    print(f"Database: {db_path}")
    print()

    # Initialize database.
    # BUG FIX: parents=True so a missing intermediate directory (fresh study
    # with no 2_results yet) doesn't raise FileNotFoundError.
    db_path.parent.mkdir(parents=True, exist_ok=True)
    init_database(db_path)

    # Check for already completed samples
    if args.resume:
        completed = get_completed_samples(db_path)
        samples_to_run = [(i, s) for i, s in samples_to_run if i not in completed]
        print(f"Already completed: {len(completed)} samples")
        print(f"Remaining to process: {len(samples_to_run)} samples")

    if not samples_to_run:
        print("All samples already completed!")
        return

    print()
    print(f"Starting {args.workers} parallel workers...")
    print("=" * 70)

    # Prepare worker arguments: each sample is tagged with a round-robin
    # worker id that selects its isolated working directory.
    worker_args = []
    for idx, (sample_idx, sample) in enumerate(samples_to_run):
        worker_id = idx % args.workers
        worker_args.append((sample_idx, sample, worker_id, str(study_dir), str(db_path)))

    # Track progress
    start_time = time.time()
    completed_count = 0
    failed_count = 0

    # Run with ProcessPoolExecutor
    with ProcessPoolExecutor(max_workers=args.workers) as executor:
        futures = {executor.submit(run_single_fea, arg): arg[0] for arg in worker_args}

        for future in as_completed(futures):
            sample_idx = futures[future]
            try:
                result = future.result()
                if result['success']:
                    completed_count += 1
                else:
                    failed_count += 1
            except Exception as e:
                print(f"Sample {sample_idx} raised exception: {e}")
                failed_count += 1

            # Progress update
            total_done = completed_count + failed_count
            elapsed = time.time() - start_time
            rate = total_done / elapsed if elapsed > 0 else 0  # samples per second
            remaining = len(samples_to_run) - total_done
            eta = remaining / rate / 60 if rate > 0 else 0  # minutes

            print(f"\nProgress: {total_done}/{len(samples_to_run)} ({completed_count} OK, {failed_count} failed)")
            # BUG FIX: rate is per-second; scale by 60 for the per-minute label.
            print(f"Rate: {rate * 60:.2f} samples/min | ETA: {eta:.1f} min")

    # Summary
    elapsed = time.time() - start_time
    print()
    print("=" * 70)
    print("TRAINING DATA GENERATION COMPLETE")
    print("=" * 70)
    print(f"Total time: {elapsed/60:.1f} minutes")
    print(f"Completed: {completed_count}/{len(samples_to_run)}")
    print(f"Failed: {failed_count}")
    print(f"Results saved to: {db_path}")
    print()
    print("Next steps:")
    print(" 1. Merge with existing optimization data:")
    print(f" python merge_training_data.py --study {args.study}")
    print(" 2. Retrain neural network:")
    print(f" python train_nn_surrogate.py --study {args.study}")
|
||||
|
||||
|
||||
# Standard script entry point — lets the multiprocessing workers re-import
# this module without triggering another full run.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user