""" Backfill Displacement Field Data from Existing Trials ====================================================== This script scans existing mirror optimization studies (V11, V12, etc.) and extracts displacement field data from OP2 files for GNN training. Structure it expects: studies/m1_mirror_adaptive_V11/ ├── 2_iterations/ │ ├── iter91/ │ │ ├── assy_m1_assyfem1_sim1-solution_1.op2 │ │ ├── assy_m1_assyfem1_sim1-solution_1.dat │ │ └── params.exp │ ├── iter92/ │ │ └── ... └── 3_results/ └── study.db (Optuna database) Output structure: studies/m1_mirror_adaptive_V11/ └── gnn_data/ ├── trial_0000/ │ ├── displacement_field.h5 │ └── metadata.json ├── trial_0001/ │ └── ... └── dataset_index.json (maps iter -> trial) Usage: python -m optimization_engine.gnn.backfill_field_data V11 python -m optimization_engine.gnn.backfill_field_data V11 V12 --merge """ import json import re import sys from pathlib import Path from typing import Dict, List, Optional, Tuple, Any from datetime import datetime import numpy as np # Add parent to path sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from optimization_engine.gnn.extract_displacement_field import ( extract_displacement_field, save_field, load_field, HAS_H5PY, ) def find_studies(base_dir: Path, pattern: str = "m1_mirror_adaptive_V*") -> List[Path]: """Find all matching study directories.""" studies_dir = base_dir / "studies" matches = list(studies_dir.glob(pattern)) return sorted(matches) def find_op2_files(study_dir: Path) -> List[Tuple[int, Path, Path]]: """ Find all OP2 files in iteration folders. Returns: List of (iter_number, op2_path, dat_path) tuples """ iterations_dir = study_dir / "2_iterations" if not iterations_dir.exists(): print(f"[WARN] No 2_iterations folder in {study_dir.name}") return [] results = [] for iter_dir in sorted(iterations_dir.iterdir()): if not iter_dir.is_dir(): continue # Extract iteration number match = re.match(r'iter(\d+)', iter_dir.name) if not match: continue iter_num = int(match.group(1)) # Find OP2 file op2_files = list(iter_dir.glob('*-solution_1.op2')) if not op2_files: op2_files = list(iter_dir.glob('*.op2')) if not op2_files: continue op2_path = op2_files[0] # Find DAT file dat_path = op2_path.with_suffix('.dat') if not dat_path.exists(): dat_path = op2_path.with_suffix('.bdf') if not dat_path.exists(): print(f"[WARN] No DAT/BDF for {op2_path.name}, skipping") continue results.append((iter_num, op2_path, dat_path)) return results def read_params_exp(iter_dir: Path) -> Optional[Dict[str, float]]: """Read design parameters from params.exp file.""" params_file = iter_dir / "params.exp" if not params_file.exists(): return None params = {} with open(params_file, 'r') as f: for line in f: line = line.strip() if '=' in line: # Format: name = value parts = line.split('=') if len(parts) == 2: name = parts[0].strip() try: value = float(parts[1].strip()) params[name] = value except ValueError: pass return params def backfill_study( study_dir: Path, output_dir: Optional[Path] = None, r_inner: float = 100.0, r_outer: float = 650.0, overwrite: bool = False, verbose: bool = True ) -> Dict[str, Any]: """ Backfill displacement field data for a single study. 


def backfill_study(
    study_dir: Path,
    output_dir: Optional[Path] = None,
    r_inner: float = 100.0,
    r_outer: float = 650.0,
    overwrite: bool = False,
    verbose: bool = True
) -> Dict[str, Any]:
    """
    Backfill displacement field data for a single study.

    Args:
        study_dir: Path to study directory
        output_dir: Output directory (default: study_dir/gnn_data)
        r_inner: Inner radius for surface identification
        r_outer: Outer radius for surface identification
        overwrite: Overwrite existing field data
        verbose: Print progress

    Returns:
        Summary dictionary with statistics
    """
    if output_dir is None:
        output_dir = study_dir / "gnn_data"
    output_dir.mkdir(parents=True, exist_ok=True)

    if verbose:
        print(f"\n{'='*60}")
        print(f"BACKFILLING: {study_dir.name}")
        print(f"{'='*60}")

    # Find all OP2 files
    op2_list = find_op2_files(study_dir)
    if verbose:
        print(f"Found {len(op2_list)} iterations with OP2 files")

    # Track results
    success_count = 0
    skip_count = 0
    error_count = 0
    index = {}

    for iter_num, op2_path, dat_path in op2_list:
        # Create trial directory
        trial_dir = output_dir / f"trial_{iter_num:04d}"

        # Check if already exists
        field_ext = '.h5' if HAS_H5PY else '.npz'
        field_path = trial_dir / f"displacement_field{field_ext}"
        if field_path.exists() and not overwrite:
            if verbose:
                print(f"[SKIP] iter{iter_num}: already processed")
            skip_count += 1
            index[iter_num] = {
                'trial_dir': str(trial_dir.relative_to(study_dir)),
                'status': 'skipped',
            }
            continue

        try:
            # Extract displacement field
            if verbose:
                print(f"[{iter_num:3d}] Extracting from {op2_path.name}...", end=' ')

            field_data = extract_displacement_field(
                op2_path,
                bdf_path=dat_path,
                r_inner=r_inner,
                r_outer=r_outer,
                verbose=False
            )

            # Save field data
            trial_dir.mkdir(parents=True, exist_ok=True)
            save_field(field_data, field_path)

            # Read params if available
            params = read_params_exp(op2_path.parent)

            # Save metadata
            meta = {
                'iter_number': iter_num,
                'op2_file': str(op2_path.name),
                'n_nodes': len(field_data['node_ids']),
                'subcases': list(field_data['z_displacement'].keys()),
                'params': params,
                'extraction_timestamp': datetime.now().isoformat(),
            }
            meta_path = trial_dir / "metadata.json"
            with open(meta_path, 'w') as f:
                json.dump(meta, f, indent=2)

            if verbose:
                print(f"OK ({len(field_data['node_ids'])} nodes)")

            success_count += 1
            index[iter_num] = {
                'trial_dir': str(trial_dir.relative_to(study_dir)),
                'n_nodes': len(field_data['node_ids']),
                'params': params,
                'status': 'success',
            }

        except Exception as e:
            if verbose:
                print(f"ERROR: {e}")
            error_count += 1
            index[iter_num] = {
                'trial_dir': str(trial_dir.relative_to(study_dir)) if trial_dir.exists() else None,
                'error': str(e),
                'status': 'error',
            }

    # Save index file
    index_path = output_dir / "dataset_index.json"
    index_data = {
        'study_name': study_dir.name,
        'generated': datetime.now().isoformat(),
        'summary': {
            'total': len(op2_list),
            'success': success_count,
            'skipped': skip_count,
            'errors': error_count,
        },
        'trials': index,
    }
    with open(index_path, 'w') as f:
        json.dump(index_data, f, indent=2)

    if verbose:
        print(f"\n{'='*60}")
        print(f"SUMMARY: {study_dir.name}")
        print(f"  Success: {success_count}")
        print(f"  Skipped: {skip_count}")
        print(f"  Errors:  {error_count}")
        print(f"  Index:   {index_path}")
        print(f"{'='*60}")

    return index_data
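
# --- Illustrative example (not called by the CLI) ----------------------------
# A minimal sketch of how a downstream consumer might read one trial written
# by backfill_study(). The metadata keys match what backfill_study() writes;
# the assumption that load_field() returns the same dict layout save_field()
# received (node_ids, z_displacement, ...) is ours, and iter_num=91 is just an
# example value.
def _demo_load_trial(study_dir: Path, iter_num: int = 91) -> None:
    """Load one backfilled trial's field data and metadata."""
    trial_dir = study_dir / "gnn_data" / f"trial_{iter_num:04d}"
    field_ext = '.h5' if HAS_H5PY else '.npz'

    field_data = load_field(trial_dir / f"displacement_field{field_ext}")
    with open(trial_dir / "metadata.json", 'r') as f:
        meta = json.load(f)

    print(f"iter{meta['iter_number']}: {meta['n_nodes']} nodes, "
          f"subcases {meta['subcases']}, params {meta['params']}")
    print(f"z-displacement subcases loaded: {list(field_data['z_displacement'].keys())}")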


def merge_datasets(
    study_dirs: List[Path],
    output_dir: Path,
    train_ratio: float = 0.8,
    verbose: bool = True
) -> Dict[str, Any]:
    """
    Merge displacement field data from multiple studies into a single dataset.

    Args:
        study_dirs: List of study directories
        output_dir: Output directory for merged dataset
        train_ratio: Fraction of data for training (rest for validation)
        verbose: Print progress

    Returns:
        Dataset metadata dictionary
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if verbose:
        print(f"\n{'='*60}")
        print("MERGING DATASETS")
        print(f"{'='*60}")

    all_trials = []
    for study_dir in study_dirs:
        gnn_data_dir = study_dir / "gnn_data"
        index_path = gnn_data_dir / "dataset_index.json"

        if not index_path.exists():
            print(f"[WARN] No index for {study_dir.name}, run backfill first")
            continue

        with open(index_path, 'r') as f:
            index = json.load(f)

        study_name = study_dir.name
        for iter_num, trial_info in index['trials'].items():
            if trial_info.get('status') != 'success':
                continue

            trial_dir = study_dir / trial_info['trial_dir']
            all_trials.append({
                'study': study_name,
                'iter': int(iter_num),
                'trial_dir': trial_dir,
                'params': trial_info.get('params', {}),
                'n_nodes': trial_info.get('n_nodes'),
            })

    if verbose:
        print(f"Total successful trials: {len(all_trials)}")

    # Shuffle and split (fixed seed so the split is reproducible)
    np.random.seed(42)
    indices = np.random.permutation(len(all_trials))
    n_train = int(len(all_trials) * train_ratio)
    train_indices = indices[:n_train]
    val_indices = indices[n_train:]

    # Create split files
    splits = {
        'train': [all_trials[i] for i in train_indices],
        'val': [all_trials[i] for i in val_indices],
    }

    for split_name, trials in splits.items():
        split_dir = output_dir / split_name
        split_dir.mkdir(exist_ok=True)

        split_meta = []
        for i, trial in enumerate(trials):
            # Copy field data (a symlink would also work on Linux)
            src_ext = '.h5' if HAS_H5PY else '.npz'
            src_path = trial['trial_dir'] / f"displacement_field{src_ext}"
            dst_path = split_dir / f"sample_{i:04d}{src_ext}"

            if not src_path.exists():
                # Don't index samples whose field file is missing
                print(f"[WARN] Missing field file {src_path}, skipping sample")
                continue
            shutil.copy(src_path, dst_path)

            split_meta.append({
                'index': i,
                'source_study': trial['study'],
                'source_iter': trial['iter'],
                'params': trial['params'],
                'n_nodes': trial['n_nodes'],
            })

        # Save split metadata
        meta_path = split_dir / "metadata.json"
        with open(meta_path, 'w') as f:
            json.dump({
                'split': split_name,
                'n_samples': len(split_meta),
                'samples': split_meta,
            }, f, indent=2)

        if verbose:
            print(f"  {split_name}: {len(split_meta)} samples")

    # Save overall metadata
    dataset_meta = {
        'created': datetime.now().isoformat(),
        'source_studies': [str(s.name) for s in study_dirs],
        'total_samples': len(all_trials),
        'train_samples': len(splits['train']),
        'val_samples': len(splits['val']),
        'train_ratio': train_ratio,
    }
    with open(output_dir / "dataset_meta.json", 'w') as f:
        json.dump(dataset_meta, f, indent=2)

    if verbose:
        print(f"\nDataset saved to: {output_dir}")
        print(f"  Train: {len(splits['train'])} samples")
        print(f"  Val:   {len(splits['val'])} samples")

    return dataset_meta
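
# --- Illustrative example (not called by the CLI) ----------------------------
# A minimal sketch of how a training loop might enumerate one split of the
# merged dataset. It relies only on the layout merge_datasets() writes above
# (<split>/metadata.json plus sample_NNNN.h5/.npz files); the generator name
# and its (path, params) contract are ours, not part of any training API.
def _iter_split_samples(dataset_dir: Path, split: str = 'train'):
    """Yield (sample_path, params) pairs for one split of a merged dataset."""
    split_dir = Path(dataset_dir) / split
    with open(split_dir / "metadata.json", 'r') as f:
        meta = json.load(f)

    ext = '.h5' if HAS_H5PY else '.npz'
    for sample in meta['samples']:
        yield split_dir / f"sample_{sample['index']:04d}{ext}", sample['params']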


# =============================================================================
# CLI
# =============================================================================

def main():
    import argparse

    parser = argparse.ArgumentParser(
        description='Backfill displacement field data for GNN training'
    )
    parser.add_argument('studies', nargs='+', type=str,
                        help='Study versions (e.g., V11 V12) or "all"')
    parser.add_argument('--merge', action='store_true',
                        help='Merge data from multiple studies')
    parser.add_argument('--output', '-o', type=Path,
                        help='Output directory for merged dataset')
    parser.add_argument('--r-inner', type=float, default=100.0,
                        help='Inner radius (mm)')
    parser.add_argument('--r-outer', type=float, default=650.0,
                        help='Outer radius (mm)')
    parser.add_argument('--overwrite', action='store_true',
                        help='Overwrite existing field data')
    parser.add_argument('--train-ratio', type=float, default=0.8,
                        help='Fraction of samples assigned to the training split')
    args = parser.parse_args()

    # Find base directory
    base_dir = Path(__file__).parent.parent.parent

    # Find studies
    if args.studies == ['all']:
        study_dirs = find_studies(base_dir, "m1_mirror_adaptive_V*")
    else:
        study_dirs = []
        for s in args.studies:
            if s.startswith('V'):
                pattern = f"m1_mirror_adaptive_{s}"
            else:
                pattern = s
            matches = find_studies(base_dir, pattern)
            study_dirs.extend(matches)

    if not study_dirs:
        print("No studies found!")
        return 1

    print(f"Found {len(study_dirs)} studies:")
    for s in study_dirs:
        print(f"  - {s.name}")

    # Backfill each study
    for study_dir in study_dirs:
        backfill_study(
            study_dir,
            r_inner=args.r_inner,
            r_outer=args.r_outer,
            overwrite=args.overwrite,
        )

    # Merge if requested (needs at least two studies)
    if args.merge:
        if len(study_dirs) > 1:
            output_dir = args.output
            if output_dir is None:
                output_dir = base_dir / "studies" / "gnn_merged_dataset"
            merge_datasets(
                study_dirs,
                output_dir,
                train_ratio=args.train_ratio,
            )
        else:
            print("[WARN] --merge requires more than one study; skipping merge")

    return 0


if __name__ == '__main__':
    sys.exit(main())