feat: Merge Atomizer-Field neural network module into main repository

Permanently integrates the Atomizer-Field GNN surrogate system:

- neural_models/: Graph Neural Network for FEA field prediction
- batch_parser.py: Parse training data from FEA exports
- train.py: Neural network training pipeline
- predict.py: Inference engine for fast predictions

This enables 600x-2200x speedup over traditional FEA by replacing expensive simulations with millisecond neural network predictions.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-26 15:31:33 -05:00
parent a4805947d1
commit d5ffba099e
47 changed files with 18446 additions and 0 deletions
--- a/atomizer-field/train.py
+++ b/atomizer-field/train.py
@@ -0,0 +1,451 @@
+"""
+train.py
+Training script for AtomizerField neural field predictor
+
+AtomizerField Training Pipeline v2.0
+Trains Graph Neural Networks to predict complete FEA field results.
+
+Usage:
+    python train.py --train_dir ./training_data --val_dir ./validation_data
+
+Key Features:
+- GPU acceleration when CUDA is available (single device)
+- Checkpoint saving/loading
+- TensorBoard logging
+- Early stopping
+- Learning rate scheduling
+"""
+
+import argparse
+import json
+from pathlib import Path
+import time
+from datetime import datetime
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.tensorboard import SummaryWriter
+
+from neural_models.field_predictor import create_model, AtomizerFieldModel
+from neural_models.physics_losses import create_loss_function
+from neural_models.data_loader import create_dataloaders
+
+
+class Trainer:
+    """
+    Training manager for AtomizerField models
+
+    Builds the model, loss, AdamW optimizer, ReduceLROnPlateau scheduler and
+    TensorBoard writer from a config dict, then runs the
+    train / validate / checkpoint loop.  config.json, checkpoints and
+    TensorBoard event files are written under config['output_dir'].
+    """
+
+    def __init__(self, config):
+        """
+        Initialize trainer
+
+        Args:
+            config (dict): Training configuration.  Keys read here: 'model',
+                'loss', 'learning_rate', 'weight_decay', 'output_dir'.
+                'early_stopping_patience' is read later by train().
+                The dict itself is not modified.
+        """
+        self.config = config
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+        print(f"\n{'='*60}")
+        print("AtomizerField Training Pipeline v2.0")
+        print(f"{'='*60}")
+        print(f"Device: {self.device}")
+
+        # Create model
+        print("\nCreating model...")
+        self.model = create_model(config.get('model', {}))
+        self.model = self.model.to(self.device)
+
+        num_params = sum(p.numel() for p in self.model.parameters())
+        print(f"Model created: {num_params:,} parameters")
+
+        # Create loss function.
+        # Pop 'type' from a copy: popping from the caller's dict would strip
+        # the loss type from the config written to config.json and stored in
+        # every checkpoint.
+        loss_config = dict(config.get('loss', {}))
+        loss_type = loss_config.pop('type', 'mse')
+        self.criterion = create_loss_function(loss_type, loss_config)
+        print(f"Loss function: {loss_type}")
+
+        # Create optimizer
+        self.optimizer = optim.AdamW(
+            self.model.parameters(),
+            lr=config.get('learning_rate', 1e-3),
+            weight_decay=config.get('weight_decay', 1e-5)
+        )
+
+        # Learning rate scheduler.
+        # The `verbose` flag is deprecated in recent PyTorch releases, so
+        # learning-rate reductions are reported from train() instead.
+        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
+            self.optimizer,
+            mode='min',
+            factor=0.5,
+            patience=10
+        )
+
+        # Training state
+        self.start_epoch = 0
+        self.best_val_loss = float('inf')
+        self.epochs_without_improvement = 0
+
+        # Create output directories
+        self.output_dir = Path(config.get('output_dir', './runs'))
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        # TensorBoard logging
+        self.writer = SummaryWriter(
+            log_dir=str(self.output_dir / 'tensorboard')
+        )
+
+        # Save config
+        with open(self.output_dir / 'config.json', 'w') as f:
+            json.dump(config, f, indent=2)
+
+    def _run_epoch(self, loader, training):
+        """
+        Run one pass over a data loader and average the loss terms
+
+        Args:
+            loader: Iterable of batches exposing .to(device) and
+                .y_displacement (and optionally .y_stress)
+            training (bool): If True, backpropagate, clip gradients and step
+                the optimizer for every batch, and print progress
+
+        Returns:
+            dict: 'total_loss', 'displacement_loss' and 'stress_loss',
+                each averaged over the number of batches
+
+        Raises:
+            ValueError: If the loader yields no batches
+        """
+        sums = {
+            'total_loss': 0.0,
+            'displacement_loss': 0.0,
+            'stress_loss': 0.0
+        }
+        num_batches = 0
+
+        for batch_idx, batch in enumerate(loader):
+            # Move batch to device
+            batch = batch.to(self.device)
+
+            if training:
+                self.optimizer.zero_grad()
+
+            # Forward pass
+            predictions = self.model(batch, return_stress=True)
+
+            # Prepare targets (stress is optional in the data)
+            targets = {'displacement': batch.y_displacement}
+            if hasattr(batch, 'y_stress'):
+                targets['stress'] = batch.y_stress
+
+            # Compute loss
+            losses = self.criterion(predictions, targets, batch)
+            total = losses['total_loss']
+
+            if training:
+                total.backward()
+                # Gradient clipping (prevents exploding gradients)
+                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
+                self.optimizer.step()
+
+            # Accumulate metrics; the component losses are optional outputs
+            # of the criterion and count as 0 when absent
+            sums['total_loss'] += total.item()
+            for key in ('displacement_loss', 'stress_loss'):
+                if key in losses:
+                    sums[key] += losses[key].item()
+            num_batches += 1
+
+            # Print progress
+            if training and batch_idx % 10 == 0:
+                print(f"  Batch {batch_idx}/{len(loader)}: "
+                      f"Loss={total.item():.6f}")
+
+        if num_batches == 0:
+            # Fail with a clear message instead of a bare ZeroDivisionError
+            phase = 'training' if training else 'validation'
+            raise ValueError(
+                f"Cannot compute {phase} metrics: data loader produced no batches"
+            )
+
+        return {key: value / num_batches for key, value in sums.items()}
+
+    def train_epoch(self, train_loader, epoch):
+        """
+        Train for one epoch
+
+        Args:
+            train_loader: Training data loader
+            epoch (int): Current epoch number (currently unused)
+
+        Returns:
+            dict: Training metrics ('total_loss', 'displacement_loss',
+                'stress_loss'), averaged over batches
+
+        Raises:
+            ValueError: If train_loader yields no batches
+        """
+        self.model.train()
+        return self._run_epoch(train_loader, training=True)
+
+    def validate(self, val_loader):
+        """
+        Validate model
+
+        Args:
+            val_loader: Validation data loader
+
+        Returns:
+            dict: Validation metrics ('total_loss', 'displacement_loss',
+                'stress_loss'), averaged over batches
+
+        Raises:
+            ValueError: If val_loader yields no batches
+        """
+        self.model.eval()
+        with torch.no_grad():
+            return self._run_epoch(val_loader, training=False)
+
+    def train(self, train_loader, val_loader, num_epochs):
+        """
+        Main training loop
+
+        Runs epochs from self.start_epoch up to num_epochs.  Each epoch
+        trains, validates, logs to TensorBoard, steps the LR scheduler on the
+        validation loss and saves checkpoints.  Stops early after
+        config['early_stopping_patience'] (default 50) epochs without a new
+        best validation loss.
+
+        Args:
+            train_loader: Training data loader
+            val_loader: Validation data loader
+            num_epochs (int): Number of epochs to train
+        """
+        print(f"\n{'='*60}")
+        print(f"Starting training for {num_epochs} epochs")
+        print(f"{'='*60}\n")
+
+        patience = self.config.get('early_stopping_patience', 50)
+
+        try:
+            for epoch in range(self.start_epoch, num_epochs):
+                epoch_start_time = time.time()
+
+                print(f"Epoch {epoch + 1}/{num_epochs}")
+                print("-" * 60)
+
+                # Train
+                train_metrics = self.train_epoch(train_loader, epoch)
+
+                # Validate
+                val_metrics = self.validate(val_loader)
+
+                epoch_time = time.time() - epoch_start_time
+
+                # Print metrics
+                print(f"\nEpoch {epoch + 1} Results:")
+                print(f"  Training Loss: {train_metrics['total_loss']:.6f}")
+                print(f"    Displacement: {train_metrics['displacement_loss']:.6f}")
+                print(f"    Stress: {train_metrics['stress_loss']:.6f}")
+                print(f"  Validation Loss: {val_metrics['total_loss']:.6f}")
+                print(f"    Displacement: {val_metrics['displacement_loss']:.6f}")
+                print(f"    Stress: {val_metrics['stress_loss']:.6f}")
+                print(f"  Time: {epoch_time:.1f}s")
+
+                # Log to TensorBoard
+                self.writer.add_scalar('Loss/train', train_metrics['total_loss'], epoch)
+                self.writer.add_scalar('Loss/val', val_metrics['total_loss'], epoch)
+                self.writer.add_scalar('DisplacementLoss/train', train_metrics['displacement_loss'], epoch)
+                self.writer.add_scalar('DisplacementLoss/val', val_metrics['displacement_loss'], epoch)
+                self.writer.add_scalar('StressLoss/train', train_metrics['stress_loss'], epoch)
+                self.writer.add_scalar('StressLoss/val', val_metrics['stress_loss'], epoch)
+                self.writer.add_scalar('LearningRate', self.optimizer.param_groups[0]['lr'], epoch)
+
+                # Learning rate scheduling; report reductions ourselves
+                # because the scheduler is created without `verbose`
+                lr_before = self.optimizer.param_groups[0]['lr']
+                self.scheduler.step(val_metrics['total_loss'])
+                lr_after = self.optimizer.param_groups[0]['lr']
+                if lr_after < lr_before:
+                    print(f"  Reducing learning rate: {lr_before:.4e} -> {lr_after:.4e}")
+
+                # Save checkpoint
+                is_best = val_metrics['total_loss'] < self.best_val_loss
+                if is_best:
+                    self.best_val_loss = val_metrics['total_loss']
+                    self.epochs_without_improvement = 0
+                    print(f"  New best validation loss: {self.best_val_loss:.6f}")
+                else:
+                    self.epochs_without_improvement += 1
+
+                self.save_checkpoint(epoch, val_metrics, is_best)
+
+                # Early stopping
+                if self.epochs_without_improvement >= patience:
+                    print(f"\nEarly stopping after {patience} epochs without improvement")
+                    break
+
+                print()
+
+            print(f"\n{'='*60}")
+            print("Training complete!")
+            print(f"Best validation loss: {self.best_val_loss:.6f}")
+            print(f"{'='*60}\n")
+        finally:
+            # Flush and close the TensorBoard writer even if an epoch raised
+            self.writer.close()
+
+    def save_checkpoint(self, epoch, metrics, is_best=False):
+        """
+        Save model checkpoint
+
+        Always overwrites checkpoint_latest.pt; additionally writes
+        checkpoint_best.pt when is_best is True and
+        checkpoint_epoch_<N>.pt every 10th epoch.
+
+        Args:
+            epoch (int): Current epoch
+            metrics (dict): Validation metrics
+            is_best (bool): Whether this is the best model so far
+        """
+        checkpoint = {
+            'epoch': epoch,
+            'model_state_dict': self.model.state_dict(),
+            'optimizer_state_dict': self.optimizer.state_dict(),
+            'scheduler_state_dict': self.scheduler.state_dict(),
+            'best_val_loss': self.best_val_loss,
+            # Stored so that a resumed run continues the early-stopping count
+            'epochs_without_improvement': self.epochs_without_improvement,
+            'config': self.config,
+            'metrics': metrics
+        }
+
+        # Save latest checkpoint
+        checkpoint_path = self.output_dir / 'checkpoint_latest.pt'
+        torch.save(checkpoint, checkpoint_path)
+
+        # Save best checkpoint
+        if is_best:
+            best_path = self.output_dir / 'checkpoint_best.pt'
+            torch.save(checkpoint, best_path)
+            print(f"  Saved best model to {best_path}")
+
+        # Save periodic checkpoint
+        if (epoch + 1) % 10 == 0:
+            periodic_path = self.output_dir / f'checkpoint_epoch_{epoch + 1}.pt'
+            torch.save(checkpoint, periodic_path)
+
+    def load_checkpoint(self, checkpoint_path):
+        """
+        Load model checkpoint
+
+        Restores model, optimizer and scheduler state and sets the trainer
+        to continue from the epoch after the one stored in the checkpoint.
+
+        Args:
+            checkpoint_path (str): Path to checkpoint file
+        """
+        checkpoint = torch.load(checkpoint_path, map_location=self.device)
+
+        self.model.load_state_dict(checkpoint['model_state_dict'])
+        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
+        self.start_epoch = checkpoint['epoch'] + 1
+        self.best_val_loss = checkpoint['best_val_loss']
+        # Older checkpoints do not carry this key; fall back to a fresh count
+        self.epochs_without_improvement = checkpoint.get(
+            'epochs_without_improvement', 0
+        )
+
+        print(f"Loaded checkpoint from epoch {checkpoint['epoch']}")
+        print(f"Best validation loss: {self.best_val_loss:.6f}")
+
+
+def main(argv=None):
+    """
+    Main training entry point
+
+    Parses command-line options, discovers the case sub-directories of the
+    training and validation directories, builds the data loaders and a
+    Trainer, optionally resumes from a checkpoint, and runs training.
+
+    Args:
+        argv (list[str] | None): Arguments to parse; None lets argparse
+            read them from sys.argv
+
+    Raises:
+        SystemExit: With status 1 when either data directory contains no
+            case sub-directories (argparse also exits on invalid options)
+    """
+    parser = argparse.ArgumentParser(description='Train AtomizerField neural field predictor')
+
+    # Data arguments
+    parser.add_argument('--train_dir', type=str, required=True,
+                       help='Directory containing training cases')
+    parser.add_argument('--val_dir', type=str, required=True,
+                       help='Directory containing validation cases')
+
+    # Training arguments
+    parser.add_argument('--epochs', type=int, default=100,
+                       help='Number of training epochs')
+    parser.add_argument('--batch_size', type=int, default=4,
+                       help='Batch size')
+    parser.add_argument('--lr', type=float, default=1e-3,
+                       help='Learning rate')
+    parser.add_argument('--weight_decay', type=float, default=1e-5,
+                       help='Weight decay')
+    parser.add_argument('--early_stopping_patience', type=int, default=50,
+                       help='Epochs without validation improvement before stopping')
+
+    # Model arguments
+    parser.add_argument('--hidden_dim', type=int, default=128,
+                       help='Hidden dimension')
+    parser.add_argument('--num_layers', type=int, default=6,
+                       help='Number of GNN layers')
+    parser.add_argument('--dropout', type=float, default=0.1,
+                       help='Dropout rate')
+
+    # Loss arguments
+    parser.add_argument('--loss_type', type=str, default='mse',
+                       choices=['mse', 'relative', 'physics', 'max'],
+                       help='Loss function type')
+
+    # Other arguments
+    parser.add_argument('--output_dir', type=str, default='./runs',
+                       help='Output directory for checkpoints and logs')
+    parser.add_argument('--resume', type=str, default=None,
+                       help='Path to checkpoint to resume from')
+    parser.add_argument('--num_workers', type=int, default=0,
+                       help='Number of data loading workers')
+
+    args = parser.parse_args(argv)
+
+    # Build configuration
+    config = {
+        'model': {
+            'node_feature_dim': 12,  # 3 coords + 6 BCs + 3 loads
+            'edge_feature_dim': 5,   # E, nu, rho, G, alpha
+            'hidden_dim': args.hidden_dim,
+            'num_layers': args.num_layers,
+            'dropout': args.dropout
+        },
+        'loss': {
+            'type': args.loss_type
+        },
+        'learning_rate': args.lr,
+        'weight_decay': args.weight_decay,
+        'batch_size': args.batch_size,
+        'num_epochs': args.epochs,
+        'output_dir': args.output_dir,
+        'early_stopping_patience': args.early_stopping_patience
+    }
+
+    # Find all case directories.
+    # Filter with is_dir() rather than relying on a trailing slash in the
+    # pattern: before Python 3.11, glob('*/') also returned plain files.
+    # Sorting makes the case order independent of filesystem listing order.
+    train_cases = sorted(p for p in Path(args.train_dir).glob('*') if p.is_dir())
+    val_cases = sorted(p for p in Path(args.val_dir).glob('*') if p.is_dir())
+
+    print(f"Found {len(train_cases)} training cases")
+    print(f"Found {len(val_cases)} validation cases")
+
+    if not train_cases or not val_cases:
+        print("ERROR: No training or validation cases found!")
+        print("Please ensure your directories contain parsed FEA data.")
+        # Exit non-zero so scripts and schedulers can detect the failure
+        raise SystemExit(1)
+
+    # Create data loaders
+    train_loader, val_loader = create_dataloaders(
+        train_cases,
+        val_cases,
+        batch_size=args.batch_size,
+        num_workers=args.num_workers,
+        normalize=True,
+        include_stress=True
+    )
+
+    # Create trainer
+    trainer = Trainer(config)
+
+    # Resume from checkpoint if specified
+    if args.resume:
+        trainer.load_checkpoint(args.resume)
+
+    # Train
+    trainer.train(train_loader, val_loader, args.epochs)
+
+
+# Run the training CLI only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()