feat: Add Zernike GNN surrogate module and M1 mirror V12/V13 studies

This commit introduces the GNN-based surrogate for Zernike mirror optimization
and the M1 mirror study progression from V12 (GNN validation) to V13 (pure NSGA-II).

## GNN Surrogate Module (optimization_engine/gnn/)

New module for Graph Neural Network surrogate prediction of mirror deformations:

- `polar_graph.py`: PolarMirrorGraph - fixed 3000-node polar grid structure
- `zernike_gnn.py`: ZernikeGNN with design-conditioned message passing
- `differentiable_zernike.py`: GPU-accelerated Zernike fitting and objectives
- `train_zernike_gnn.py`: ZernikeGNNTrainer with multi-task loss
- `gnn_optimizer.py`: ZernikeGNNOptimizer for turbo mode (~900k trials/hour)
- `extract_displacement_field.py`: OP2 to HDF5 field extraction
- `backfill_field_data.py`: Extract fields from existing FEA trials

Key innovation: design-conditioned convolutions that modulate message passing
with the structural design parameters, so a single trained network can predict
the displacement field of unseen designs.
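
The sketch below shows the conditioning idea in isolation. It is a minimal stand-alone layer, not the code in `zernike_gnn.py`: the FiLM-style scale/shift, the GRU update, and every name in it (`DesignConditionedConv`, `message_mlp`, `film`) are illustrative assumptions.

```python
import torch
import torch.nn as nn

class DesignConditionedConv(nn.Module):
    """One message-passing step whose messages are rescaled and shifted
    (FiLM-style) by a global design-parameter vector."""

    def __init__(self, node_dim: int, design_dim: int):
        super().__init__()
        self.message_mlp = nn.Sequential(
            nn.Linear(2 * node_dim, node_dim), nn.ReLU(),
            nn.Linear(node_dim, node_dim),
        )
        # Design parameters -> per-channel scale and shift for the messages.
        self.film = nn.Linear(design_dim, 2 * node_dim)
        self.update = nn.GRUCell(node_dim, node_dim)

    def forward(self, x, edge_index, design):
        # x: [N, node_dim], edge_index: [2, E], design: [design_dim]
        src, dst = edge_index
        msg = self.message_mlp(torch.cat([x[src], x[dst]], dim=-1))
        scale, shift = self.film(design).chunk(2, dim=-1)
        msg = (1 + scale) * msg + shift  # condition messages on the design
        agg = torch.zeros_like(x).index_add_(0, dst, msg)  # sum per receiver
        return self.update(agg, x)  # GRU-style node update
```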

## M1 Mirror Studies

### V12: GNN Field Prediction + FEA Validation
- Zernike GNN trained on V10/V11 FEA data (238 samples)
- Turbo mode: 5000 GNN predictions → top candidates → FEA validation (see the sketch after this list)
- Calibration workflow for GNN-to-FEA error correction
- Scripts: `run_gnn_turbo.py`, `validate_gnn_best.py`, `compute_full_calibration.py`
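
A rough sketch of the turbo loop, not the module's actual API: `sample_design`, `gnn_predict`, and `fea_evaluate` are assumed callables standing in for the real interfaces in `gnn_optimizer.py` and `run_gnn_turbo.py`, and the top-20 cutoff is illustrative.

```python
import numpy as np

def turbo_screen(sample_design, gnn_predict, fea_evaluate,
                 n_predictions=5000, n_validate=20):
    """Screen many candidates with the cheap GNN surrogate,
    then run full FEA only on the most promising ones."""
    designs = [sample_design() for _ in range(n_predictions)]
    scores = np.array([gnn_predict(d) for d in designs])  # surrogate objective
    top = np.argsort(scores)[:n_validate]                 # lower is better
    # Ground-truth pass; the (predicted, FEA) pairs also feed calibration.
    return [(designs[i], scores[i], fea_evaluate(designs[i])) for i in top]
```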

### V13: Pure NSGA-II FEA (Ground Truth)
- Seeded with 217 FEA trials carried over from V11+V12
- Pure multi-objective NSGA-II without any surrogate (seeding sketch below)
- Establishes the ground-truth Pareto front for GNN accuracy evaluation
- Narrowed the `blank_backface_angle` range to [4.0, 5.0]
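
A minimal sketch of the V13 setup with Optuna (the studies store an Optuna `study.db`, per the backfill script below). Seeding 217 prior trials and the narrowed range are from this commit; `enqueue_trial` as the seeding mechanism, the two objectives, and the rest are assumptions.

```python
import optuna

def build_v13_study(seed_params):
    """Pure NSGA-II study seeded with completed V11+V12 parameter sets."""
    study = optuna.create_study(
        directions=["minimize", "minimize"],  # assumed two objectives
        sampler=optuna.samplers.NSGAIISampler(seed=42),
    )
    for params in seed_params:  # e.g. the 217 carried-over trials
        study.enqueue_trial(params)  # run before NSGA-II starts sampling
    return study

def objective(trial):
    angle = trial.suggest_float("blank_backface_angle", 4.0, 5.0)  # narrowed range
    # ... remaining design parameters and the full FEA evaluation
    raise NotImplementedError("FEA evaluation goes here")
```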

## Documentation Updates

- SYS_14: Added Zernike GNN section with architecture diagrams
- CLAUDE.md: Added GNN module reference and quick start
- V13 README: Study documentation with seeding strategy

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
## New File: `optimization_engine/gnn/backfill_field_data.py` (+475 lines)
"""
Backfill Displacement Field Data from Existing Trials
======================================================
This script scans existing mirror optimization studies (V11, V12, etc.) and extracts
displacement field data from OP2 files for GNN training.
Structure it expects:
studies/m1_mirror_adaptive_V11/
├── 2_iterations/
│ ├── iter91/
│ │ ├── assy_m1_assyfem1_sim1-solution_1.op2
│ │ ├── assy_m1_assyfem1_sim1-solution_1.dat
│ │ └── params.exp
│ ├── iter92/
│ │ └── ...
└── 3_results/
└── study.db (Optuna database)
Output structure:
studies/m1_mirror_adaptive_V11/
└── gnn_data/
├── trial_0000/
│ ├── displacement_field.h5
│ └── metadata.json
├── trial_0001/
│ └── ...
└── dataset_index.json (maps iter -> trial)
Usage:
python -m optimization_engine.gnn.backfill_field_data V11
python -m optimization_engine.gnn.backfill_field_data V11 V12 --merge
"""
import json
import re
import shutil
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import numpy as np

# Add parent to path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from optimization_engine.gnn.extract_displacement_field import (
    extract_displacement_field,
    save_field,
    load_field,
    HAS_H5PY,
)


def find_studies(base_dir: Path, pattern: str = "m1_mirror_adaptive_V*") -> List[Path]:
    """Find all matching study directories."""
    studies_dir = base_dir / "studies"
    matches = list(studies_dir.glob(pattern))
    return sorted(matches)


def find_op2_files(study_dir: Path) -> List[Tuple[int, Path, Path]]:
    """
    Find all OP2 files in iteration folders.

    Returns:
        List of (iter_number, op2_path, dat_path) tuples
    """
    iterations_dir = study_dir / "2_iterations"
    if not iterations_dir.exists():
        print(f"[WARN] No 2_iterations folder in {study_dir.name}")
        return []

    results = []
    for iter_dir in sorted(iterations_dir.iterdir()):
        if not iter_dir.is_dir():
            continue

        # Extract iteration number
        match = re.match(r'iter(\d+)', iter_dir.name)
        if not match:
            continue
        iter_num = int(match.group(1))

        # Find OP2 file
        op2_files = list(iter_dir.glob('*-solution_1.op2'))
        if not op2_files:
            op2_files = list(iter_dir.glob('*.op2'))
        if not op2_files:
            continue
        op2_path = op2_files[0]

        # Find DAT file
        dat_path = op2_path.with_suffix('.dat')
        if not dat_path.exists():
            dat_path = op2_path.with_suffix('.bdf')
        if not dat_path.exists():
            print(f"[WARN] No DAT/BDF for {op2_path.name}, skipping")
            continue

        results.append((iter_num, op2_path, dat_path))

    return results


def read_params_exp(iter_dir: Path) -> Optional[Dict[str, float]]:
    """Read design parameters from params.exp file."""
    params_file = iter_dir / "params.exp"
    if not params_file.exists():
        return None

    params = {}
    with open(params_file, 'r') as f:
        for line in f:
            line = line.strip()
            if '=' in line:
                # Format: name = value
                parts = line.split('=')
                if len(parts) == 2:
                    name = parts[0].strip()
                    try:
                        value = float(parts[1].strip())
                        params[name] = value
                    except ValueError:
                        pass
    return params


def backfill_study(
    study_dir: Path,
    output_dir: Optional[Path] = None,
    r_inner: float = 100.0,
    r_outer: float = 650.0,
    overwrite: bool = False,
    verbose: bool = True
) -> Dict[str, Any]:
    """
    Backfill displacement field data for a single study.

    Args:
        study_dir: Path to study directory
        output_dir: Output directory (default: study_dir/gnn_data)
        r_inner: Inner radius for surface identification
        r_outer: Outer radius for surface identification
        overwrite: Overwrite existing field data
        verbose: Print progress

    Returns:
        Summary dictionary with statistics
    """
    if output_dir is None:
        output_dir = study_dir / "gnn_data"
    output_dir.mkdir(parents=True, exist_ok=True)

    if verbose:
        print(f"\n{'='*60}")
        print(f"BACKFILLING: {study_dir.name}")
        print(f"{'='*60}")

    # Find all OP2 files
    op2_list = find_op2_files(study_dir)
    if verbose:
        print(f"Found {len(op2_list)} iterations with OP2 files")

    # Track results
    success_count = 0
    skip_count = 0
    error_count = 0
    index = {}

    for iter_num, op2_path, dat_path in op2_list:
        # Create trial directory
        trial_dir = output_dir / f"trial_{iter_num:04d}"

        # Check if already exists
        field_ext = '.h5' if HAS_H5PY else '.npz'
        field_path = trial_dir / f"displacement_field{field_ext}"
        if field_path.exists() and not overwrite:
            if verbose:
                print(f"[SKIP] iter{iter_num}: already processed")
            skip_count += 1
            # Reuse metadata from the earlier run so merge_datasets()
            # can still include this trial in a merged dataset.
            entry = {
                'trial_dir': str(trial_dir.relative_to(study_dir)),
                'status': 'skipped',
            }
            meta_path = trial_dir / "metadata.json"
            if meta_path.exists():
                with open(meta_path, 'r') as f:
                    prev_meta = json.load(f)
                entry['n_nodes'] = prev_meta.get('n_nodes')
                entry['params'] = prev_meta.get('params')
            index[iter_num] = entry
            continue

        try:
            # Extract displacement field
            if verbose:
                print(f"[{iter_num:3d}] Extracting from {op2_path.name}...", end=' ')
            field_data = extract_displacement_field(
                op2_path,
                bdf_path=dat_path,
                r_inner=r_inner,
                r_outer=r_outer,
                verbose=False
            )

            # Save field data
            trial_dir.mkdir(parents=True, exist_ok=True)
            save_field(field_data, field_path)

            # Read params if available
            params = read_params_exp(op2_path.parent)

            # Save metadata
            meta = {
                'iter_number': iter_num,
                'op2_file': str(op2_path.name),
                'n_nodes': len(field_data['node_ids']),
                'subcases': list(field_data['z_displacement'].keys()),
                'params': params,
                'extraction_timestamp': datetime.now().isoformat(),
            }
            meta_path = trial_dir / "metadata.json"
            with open(meta_path, 'w') as f:
                json.dump(meta, f, indent=2)

            if verbose:
                print(f"OK ({len(field_data['node_ids'])} nodes)")
            success_count += 1
            index[iter_num] = {
                'trial_dir': str(trial_dir.relative_to(study_dir)),
                'n_nodes': len(field_data['node_ids']),
                'params': params,
                'status': 'success',
            }

        except Exception as e:
            if verbose:
                print(f"ERROR: {e}")
            error_count += 1
            index[iter_num] = {
                'trial_dir': str(trial_dir.relative_to(study_dir)) if trial_dir.exists() else None,
                'error': str(e),
                'status': 'error',
            }

    # Save index file
    index_path = output_dir / "dataset_index.json"
    index_data = {
        'study_name': study_dir.name,
        'generated': datetime.now().isoformat(),
        'summary': {
            'total': len(op2_list),
            'success': success_count,
            'skipped': skip_count,
            'errors': error_count,
        },
        'trials': index,
    }
    with open(index_path, 'w') as f:
        json.dump(index_data, f, indent=2)

    if verbose:
        print(f"\n{'='*60}")
        print(f"SUMMARY: {study_dir.name}")
        print(f"  Success: {success_count}")
        print(f"  Skipped: {skip_count}")
        print(f"  Errors: {error_count}")
        print(f"  Index: {index_path}")
        print(f"{'='*60}")

    return index_data


def merge_datasets(
    study_dirs: List[Path],
    output_dir: Path,
    train_ratio: float = 0.8,
    verbose: bool = True
) -> Dict[str, Any]:
    """
    Merge displacement field data from multiple studies into a single dataset.

    Args:
        study_dirs: List of study directories
        output_dir: Output directory for merged dataset
        train_ratio: Fraction of data for training (rest for validation)
        verbose: Print progress

    Returns:
        Dataset metadata dictionary
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if verbose:
        print(f"\n{'='*60}")
        print("MERGING DATASETS")
        print(f"{'='*60}")

    all_trials = []
    for study_dir in study_dirs:
        gnn_data_dir = study_dir / "gnn_data"
        index_path = gnn_data_dir / "dataset_index.json"
        if not index_path.exists():
            print(f"[WARN] No index for {study_dir.name}, run backfill first")
            continue

        with open(index_path, 'r') as f:
            index = json.load(f)

        study_name = study_dir.name
        for iter_num, trial_info in index['trials'].items():
            # 'skipped' trials were extracted on an earlier run and are
            # just as usable as freshly extracted 'success' trials.
            if trial_info.get('status') not in ('success', 'skipped'):
                continue
            trial_dir = study_dir / trial_info['trial_dir']
            all_trials.append({
                'study': study_name,
                'iter': int(iter_num),
                'trial_dir': trial_dir,
                'params': trial_info.get('params', {}),
                'n_nodes': trial_info.get('n_nodes'),
            })

    if verbose:
        print(f"Total usable trials: {len(all_trials)}")

    # Shuffle and split
    np.random.seed(42)
    indices = np.random.permutation(len(all_trials))
    n_train = int(len(all_trials) * train_ratio)
    train_indices = indices[:n_train]
    val_indices = indices[n_train:]

    # Create split files
    splits = {
        'train': [all_trials[i] for i in train_indices],
        'val': [all_trials[i] for i in val_indices],
    }

    for split_name, trials in splits.items():
        split_dir = output_dir / split_name
        split_dir.mkdir(exist_ok=True)

        split_meta = []
        for i, trial in enumerate(trials):
            # Copy field data (a symlink would also work on Linux)
            src_ext = '.h5' if HAS_H5PY else '.npz'
            src_path = trial['trial_dir'] / f"displacement_field{src_ext}"
            dst_path = split_dir / f"sample_{i:04d}{src_ext}"
            if not src_path.exists():
                continue  # don't index a sample whose field file is missing
            shutil.copy(src_path, dst_path)
            split_meta.append({
                'index': i,
                'source_study': trial['study'],
                'source_iter': trial['iter'],
                'params': trial['params'],
                'n_nodes': trial['n_nodes'],
            })

        # Save split metadata
        meta_path = split_dir / "metadata.json"
        with open(meta_path, 'w') as f:
            json.dump({
                'split': split_name,
                'n_samples': len(split_meta),
                'samples': split_meta,
            }, f, indent=2)
        if verbose:
            print(f"  {split_name}: {len(split_meta)} samples")

    # Save overall metadata
    dataset_meta = {
        'created': datetime.now().isoformat(),
        'source_studies': [str(s.name) for s in study_dirs],
        'total_samples': len(all_trials),
        'train_samples': len(splits['train']),
        'val_samples': len(splits['val']),
        'train_ratio': train_ratio,
    }
    with open(output_dir / "dataset_meta.json", 'w') as f:
        json.dump(dataset_meta, f, indent=2)

    if verbose:
        print(f"\nDataset saved to: {output_dir}")
        print(f"  Train: {len(splits['train'])} samples")
        print(f"  Val: {len(splits['val'])} samples")

    return dataset_meta


# =============================================================================
# CLI
# =============================================================================

def main():
    import argparse
    parser = argparse.ArgumentParser(
        description='Backfill displacement field data for GNN training'
    )
    parser.add_argument('studies', nargs='+', type=str,
                        help='Study versions (e.g., V11 V12) or "all"')
    parser.add_argument('--merge', action='store_true',
                        help='Merge data from multiple studies')
    parser.add_argument('--output', '-o', type=Path,
                        help='Output directory for merged dataset')
    parser.add_argument('--r-inner', type=float, default=100.0,
                        help='Inner radius (mm)')
    parser.add_argument('--r-outer', type=float, default=650.0,
                        help='Outer radius (mm)')
    parser.add_argument('--overwrite', action='store_true',
                        help='Overwrite existing field data')
    parser.add_argument('--train-ratio', type=float, default=0.8,
                        help='Train/val split ratio')
    args = parser.parse_args()

    # Find base directory
    base_dir = Path(__file__).parent.parent.parent

    # Find studies
    if args.studies == ['all']:
        study_dirs = find_studies(base_dir, "m1_mirror_adaptive_V*")
    else:
        study_dirs = []
        for s in args.studies:
            if s.startswith('V'):
                pattern = f"m1_mirror_adaptive_{s}"
            else:
                pattern = s
            matches = find_studies(base_dir, pattern)
            study_dirs.extend(matches)

    if not study_dirs:
        print("No studies found!")
        return 1

    print(f"Found {len(study_dirs)} studies:")
    for s in study_dirs:
        print(f"  - {s.name}")

    # Backfill each study
    for study_dir in study_dirs:
        backfill_study(
            study_dir,
            r_inner=args.r_inner,
            r_outer=args.r_outer,
            overwrite=args.overwrite,
        )

    # Merge if requested
    if args.merge and len(study_dirs) > 1:
        output_dir = args.output
        if output_dir is None:
            output_dir = base_dir / "studies" / "gnn_merged_dataset"
        merge_datasets(
            study_dirs,
            output_dir,
            train_ratio=args.train_ratio,
        )

    return 0


if __name__ == '__main__':
    sys.exit(main())