refactor: Reorganize code structure and create tests directory
- Consolidate surrogates module to processors/surrogates/
- Move ensemble_surrogate.py to proper location
- Add deprecation shim for old import path
- Create tests/ directory with pytest structure
- Move test files from archive/test_scripts/
- Add conftest.py with shared fixtures

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -59,6 +59,15 @@ def __getattr__(name):
|
||||
elif name == 'create_exporter_from_config':
|
||||
from .training_data_exporter import create_exporter_from_config
|
||||
return create_exporter_from_config
|
||||
elif name == 'EnsembleSurrogate':
|
||||
from .ensemble_surrogate import EnsembleSurrogate
|
||||
return EnsembleSurrogate
|
||||
elif name == 'OODDetector':
|
||||
from .ensemble_surrogate import OODDetector
|
||||
return OODDetector
|
||||
elif name == 'create_and_train_ensemble':
|
||||
from .ensemble_surrogate import create_and_train_ensemble
|
||||
return create_and_train_ensemble
|
||||
|
||||
raise AttributeError(f"module 'optimization_engine.processors.surrogates' has no attribute '{name}'")
|
||||
|
||||
@@ -76,4 +85,7 @@ __all__ = [
|
||||
'AutoTrainer',
|
||||
'TrainingDataExporter',
|
||||
'create_exporter_from_config',
|
||||
'EnsembleSurrogate',
|
||||
'OODDetector',
|
||||
'create_and_train_ensemble',
|
||||
]
|
||||
|
||||
540
optimization_engine/processors/surrogates/ensemble_surrogate.py
Normal file
540
optimization_engine/processors/surrogates/ensemble_surrogate.py
Normal file
@@ -0,0 +1,540 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Ensemble Surrogate with Uncertainty Quantification
|
||||
|
||||
Addresses the V5 failure mode where single MLPs gave overconfident predictions
|
||||
in out-of-distribution regions, leading L-BFGS to fake optima.
|
||||
|
||||
Key features:
|
||||
1. Ensemble of N MLPs - disagreement = uncertainty
|
||||
2. OOD detection - reject predictions far from training data
|
||||
3. Confidence bounds - never trust point predictions alone
|
||||
4. Active learning - prioritize FEA in uncertain regions
|
||||
|
||||
Author: Atomizer
|
||||
Created: 2025-12-28
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from typing import Tuple, List, Dict, Optional
|
||||
from pathlib import Path
|
||||
import json
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
HAS_TORCH = True
|
||||
except ImportError:
|
||||
HAS_TORCH = False
|
||||
logger.warning("PyTorch not available - ensemble features limited")
|
||||
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
|
||||
|
||||
class MLP(nn.Module):
    """Single feed-forward MLP used as one member of the ensemble.

    Architecture: for each hidden width, Linear -> ReLU -> Dropout(0.1),
    followed by a final Linear projection to ``output_dim``.

    Args:
        input_dim: number of input features.
        output_dim: number of regression outputs.
        hidden_dims: hidden layer widths; defaults to [64, 32].
    """

    def __init__(self, input_dim: int, output_dim: int,
                 hidden_dims: Optional[List[int]] = None):
        super().__init__()
        hidden_dims = hidden_dims or [64, 32]

        layers = []
        in_dim = input_dim
        for h_dim in hidden_dims:
            layers.append(nn.Linear(in_dim, h_dim))
            layers.append(nn.ReLU())
            # Light dropout regularizes each member and adds diversity
            # between ensemble members during training.
            layers.append(nn.Dropout(0.1))
            in_dim = h_dim
        layers.append(nn.Linear(in_dim, output_dim))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to (batch, output_dim)."""
        return self.net(x)
|
||||
|
||||
|
||||
class OODDetector:
    """
    Out-of-Distribution detector using multiple methods:
    1. Z-score check (is input within N std of training mean)
    2. KNN distance (is input close to training points)
    """

    def __init__(self, X_train: np.ndarray, z_threshold: float = 3.0, knn_k: int = 5):
        """Fit detector statistics on the training inputs.

        Args:
            X_train: (n_samples, n_features) training inputs.
            z_threshold: max allowed per-feature z-score before a point is OOD.
            knn_k: neighbours used for the local-density check.
        """
        self.X_train = X_train
        self.z_threshold = z_threshold
        self.knn_k = knn_k

        # Per-feature statistics of the training cloud; the epsilon keeps
        # constant features from causing division by zero.
        self.mean = X_train.mean(axis=0)
        self.std = X_train.std(axis=0) + 1e-8

        # Local density model over the training set.
        self.knn = NearestNeighbors(n_neighbors=min(knn_k, len(X_train)))
        self.knn.fit(X_train)

        # Baseline scale: median of the mean-kNN distance over training points.
        dists, _ = self.knn.kneighbors(X_train)
        self.typical_knn_dist = np.median(dists.mean(axis=1))

        logger.info(f"[OOD] Initialized with {len(X_train)} training points")
        logger.info(f"[OOD] Typical KNN distance: {self.typical_knn_dist:.4f}")

    @staticmethod
    def _unwrap(flags, values):
        # Preserve the calling convention: a single-row query returns
        # scalars, a batch query returns arrays.
        out_flags = flags[0] if len(flags) == 1 else flags
        out_values = values[0] if len(values) == 1 else values
        return out_flags, out_values

    def z_score_check(self, x: np.ndarray) -> Tuple[bool, float]:
        """Check if point is within z_threshold std of training mean."""
        batch = np.atleast_2d(x)
        worst_z = np.abs((batch - self.mean) / self.std).max(axis=1)
        return self._unwrap(worst_z < self.z_threshold, worst_z)

    def knn_distance_check(self, x: np.ndarray) -> Tuple[bool, float]:
        """Check if point is close enough to training data."""
        batch = np.atleast_2d(x)
        dists, _ = self.knn.kneighbors(batch)
        mean_dist = dists.mean(axis=1)
        # Allow up to 3x typical distance
        return self._unwrap(mean_dist < 3 * self.typical_knn_dist, mean_dist)

    def is_in_distribution(self, x: np.ndarray) -> Tuple[bool, Dict]:
        """Combined OOD check."""
        z_ok, z_score = self.z_score_check(x)
        knn_ok, knn_dist = self.knn_distance_check(x)

        verdict = z_ok and knn_ok
        details = {
            'z_score': float(z_score),
            'z_ok': bool(z_ok),
            'knn_dist': float(knn_dist),
            'knn_ok': bool(knn_ok),
            'in_distribution': bool(verdict)
        }

        return verdict, details
|
||||
|
||||
|
||||
class EnsembleSurrogate:
    """
    Ensemble of MLPs with uncertainty quantification.

    Key insight: Models trained with different random seeds will agree
    in well-sampled regions but disagree in extrapolation regions.
    Disagreement = epistemic uncertainty.
    """

    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        n_models: int = 5,
        hidden_dims: Optional[List[int]] = None,
        device: str = 'auto'
    ):
        """
        Args:
            input_dim: number of input features.
            output_dim: number of predicted outputs.
            n_models: ensemble size (more members = smoother uncertainty).
            hidden_dims: hidden layer widths for each MLP (default [64, 32]).
            device: 'auto' picks CUDA when available; otherwise any torch
                device string ('cpu', 'cuda:0', ...).
        """
        # NOTE(review): torch is used unconditionally here even though the
        # module import is guarded by HAS_TORCH — confirm callers check it.
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_models = n_models
        self.hidden_dims = hidden_dims or [64, 32]

        if device == 'auto':
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = torch.device(device)

        # Create ensemble: identical architectures, independent initializations.
        self.models = [
            MLP(input_dim, output_dim, hidden_dims).to(self.device)
            for _ in range(n_models)
        ]

        # Normalization stats (populated by train() / load()).
        self.x_mean = None
        self.x_std = None
        self.y_mean = None
        self.y_std = None

        # OOD detector fitted on normalized training inputs (set by train()).
        self.ood_detector = None

        # Training state
        self.is_trained = False

        logger.info(f"[ENSEMBLE] Created {n_models} MLPs on {self.device}")

    def train(
        self,
        X: np.ndarray,
        Y: np.ndarray,
        epochs: int = 500,
        lr: float = 0.001,
        val_split: float = 0.1,
        patience: int = 50
    ) -> Dict:
        """Train all models in ensemble with different random seeds.

        Args:
            X: (n_samples, input_dim) raw inputs.
            Y: (n_samples, output_dim) raw targets.
            epochs: max full-batch gradient steps per model.
            lr: AdamW learning rate.
            val_split: fraction held out for validation (at least 5 points).
            patience: early-stopping patience in epochs.

        Returns:
            Metrics dict with per-output 'r2' and 'mae', 'avg_ensemble_std',
            'n_val', and per-model 'val_losses'.
        """
        # Compute normalization (epsilon guards constant columns).
        self.x_mean = X.mean(axis=0)
        self.x_std = X.std(axis=0) + 1e-8
        self.y_mean = Y.mean(axis=0)
        self.y_std = Y.std(axis=0) + 1e-8

        X_norm = (X - self.x_mean) / self.x_std
        Y_norm = (Y - self.y_mean) / self.y_std

        # Split data — always keep at least 5 validation points.
        n_val = max(int(len(X) * val_split), 5)
        indices = np.random.permutation(len(X))
        val_idx, train_idx = indices[:n_val], indices[n_val:]

        X_train, Y_train = X_norm[train_idx], Y_norm[train_idx]
        X_val, Y_val = X_norm[val_idx], Y_norm[val_idx]

        # Convert to tensors once; training below is full-batch.
        X_t = torch.FloatTensor(X_train).to(self.device)
        Y_t = torch.FloatTensor(Y_train).to(self.device)
        X_v = torch.FloatTensor(X_val).to(self.device)
        Y_v = torch.FloatTensor(Y_val).to(self.device)

        # Train each model with different seed
        all_val_losses = []
        for i, model in enumerate(self.models):
            # NOTE(review): the models were already initialized in __init__,
            # so these seeds mainly affect dropout masks, not the weight
            # init — confirm the "different init per model" intent. Also
            # note this reseeds the *global* torch/numpy RNGs.
            torch.manual_seed(42 + i * 1000)
            np.random.seed(42 + i * 1000)

            optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
            criterion = nn.MSELoss()

            best_val_loss = float('inf')
            patience_counter = 0
            best_state = None

            for epoch in range(epochs):
                # Train (single full-batch step per epoch)
                model.train()
                optimizer.zero_grad()
                pred = model(X_t)
                loss = criterion(pred, Y_t)
                loss.backward()
                optimizer.step()

                # Validate
                model.eval()
                with torch.no_grad():
                    val_pred = model(X_v)
                    val_loss = criterion(val_pred, Y_v).item()

                # Early stopping: keep a CPU copy of the best weights so far.
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    patience_counter = 0
                    best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
                else:
                    patience_counter += 1
                    if patience_counter >= patience:
                        break

            # Restore best weights (explicit None check: a state dict is
            # always truthy here, but `is not None` states the intent).
            if best_state is not None:
                model.load_state_dict(best_state)
                model.to(self.device)

            all_val_losses.append(best_val_loss)
            logger.info(f"[ENSEMBLE] Model {i+1}/{self.n_models} trained, val_loss={best_val_loss:.4f}")

        # Initialize OOD detector on the *normalized* inputs — predict paths
        # must normalize before querying it.
        self.ood_detector = OODDetector(X_norm)

        self.is_trained = True

        # Compute ensemble metrics
        metrics = self._compute_metrics(X_val, Y_val)
        metrics['val_losses'] = all_val_losses

        return metrics

    def _compute_metrics(self, X_val: np.ndarray, Y_val: np.ndarray) -> Dict:
        """Compute R², MAE, and ensemble disagreement on the (normalized) validation set."""
        mean, std = self.predict_normalized(X_val)

        # R² for each output (epsilon guards degenerate, constant targets).
        ss_res = np.sum((Y_val - mean) ** 2, axis=0)
        ss_tot = np.sum((Y_val - Y_val.mean(axis=0)) ** 2, axis=0)
        r2 = 1 - ss_res / (ss_tot + 1e-8)

        # MAE per output
        mae = np.abs(Y_val - mean).mean(axis=0)

        # Average ensemble disagreement (epistemic uncertainty proxy)
        avg_std = std.mean()

        return {
            'r2': r2.tolist(),
            'mae': mae.tolist(),
            'avg_ensemble_std': float(avg_std),
            'n_val': len(X_val)
        }

    def predict_normalized(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Predict on normalized inputs, return normalized outputs.

        Returns the ensemble mean and the member disagreement (std) with
        shapes (n_samples, n_outputs) each.
        """
        X = np.atleast_2d(X)
        X_t = torch.FloatTensor(X).to(self.device)

        preds = []
        for model in self.models:
            model.eval()
            with torch.no_grad():
                pred = model(X_t).cpu().numpy()
            preds.append(pred)

        preds = np.array(preds)  # (n_models, n_samples, n_outputs)
        mean = preds.mean(axis=0)
        std = preds.std(axis=0)

        return mean, std

    def predict(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Predict with uncertainty. Requires train() or load() to have run
        (normalization stats must be set).

        Returns:
            mean: (n_samples, n_outputs) predicted values
            std: (n_samples, n_outputs) uncertainty (ensemble disagreement)
        """
        X = np.atleast_2d(X)

        # Normalize input
        X_norm = (X - self.x_mean) / self.x_std

        # Get predictions
        mean_norm, std_norm = self.predict_normalized(X_norm)

        # Denormalize
        mean = mean_norm * self.y_std + self.y_mean
        std = std_norm * self.y_std  # Std scales with y_std

        return mean, std

    def predict_with_confidence(self, X: np.ndarray) -> Dict:
        """
        Full prediction with confidence assessment. Requires a fitted
        ood_detector (i.e. train() must have been called).

        Returns dict with:
            - mean: predicted values
            - std: uncertainty
            - confidence: 0-1 score (higher = more reliable)
            - in_distribution: OOD check result
            - recommendation: 'trust', 'verify', or 'reject'
        """
        X = np.atleast_2d(X)

        mean, std = self.predict(X)

        # OOD check on normalized inputs (detector was fit on normalized data)
        X_norm = (X - self.x_mean) / self.x_std
        ood_results = [self.ood_detector.is_in_distribution(x) for x in X_norm]
        in_distribution = [r[0] for r in ood_results]

        # Confidence heuristic: relative std (lower = better) bucketed at
        # 10% / 30%, zeroed entirely for OOD points.
        relative_std = std / (np.abs(mean) + 1e-6)
        avg_rel_std = relative_std.mean(axis=1)

        confidence = np.zeros(len(X))
        for i in range(len(X)):
            if not in_distribution[i]:
                confidence[i] = 0.0  # OOD = no confidence
            elif avg_rel_std[i] > 0.3:
                confidence[i] = 0.2  # High uncertainty
            elif avg_rel_std[i] > 0.1:
                confidence[i] = 0.5  # Medium uncertainty
            else:
                confidence[i] = 0.9  # Low uncertainty

        # Recommendations: trust >= 0.7, verify >= 0.3, else reject.
        recommendations = []
        for i in range(len(X)):
            if confidence[i] >= 0.7:
                recommendations.append('trust')
            elif confidence[i] >= 0.3:
                recommendations.append('verify')  # Run FEA to check
            else:
                recommendations.append('reject')  # Don't use, run FEA instead

        return {
            'mean': mean,
            'std': std,
            'confidence': confidence,
            'in_distribution': in_distribution,
            'recommendation': recommendations
        }

    def acquisition_score(self, X: np.ndarray, best_so_far: float, xi: float = 0.01) -> np.ndarray:
        """
        Simplified expected-improvement-style acquisition function
        (improvement + xi * std — not the closed-form Gaussian EI).

        High score = worth running FEA (either promising or uncertain).

        Args:
            X: candidate points
            best_so_far: current best objective value
            xi: exploration-exploitation tradeoff (higher = more exploration)

        Returns:
            scores: acquisition score per point
        """
        X = np.atleast_2d(X)
        mean, std = self.predict(X)

        # For minimization: improvement = best - predicted
        # Take first objective (weighted sum) for acquisition
        if mean.ndim > 1:
            mean = mean[:, 0]
            std = std[:, 0]

        improvement = best_so_far - mean

        # Exploitation plus an exploration bonus proportional to uncertainty.
        scores = improvement + xi * std

        # Penalize OOD points
        X_norm = (X - self.x_mean) / self.x_std
        for i, x in enumerate(X_norm):
            is_ok, _ = self.ood_detector.is_in_distribution(x)
            if not is_ok:
                scores[i] *= 0.1  # Heavy penalty for OOD

        return scores

    def select_candidates_for_fea(
        self,
        candidates: np.ndarray,
        best_so_far: float,
        n_select: int = 5,
        diversity_weight: float = 0.3
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Select diverse, high-acquisition candidates for FEA validation.

        Balances:
        1. High acquisition score (exploitation + exploration)
        2. Diversity (don't cluster all candidates together)
        3. In-distribution (avoid OOD predictions)

        Returns:
            selected: indices of selected candidates
            scores: acquisition scores of the selected candidates
        """
        scores = self.acquisition_score(candidates, best_so_far)

        # Greedy selection with diversity
        selected = []
        remaining = list(range(len(candidates)))

        while len(selected) < n_select and remaining:
            if not selected:
                # First: pick highest score
                best_idx = max(remaining, key=lambda i: scores[i])
            else:
                # Later: balance score with distance to already-selected points
                def combined_score(i):
                    # Min distance to already selected
                    min_dist = min(
                        np.linalg.norm(candidates[i] - candidates[j])
                        for j in selected
                    )
                    # Combine acquisition + diversity
                    return scores[i] + diversity_weight * min_dist

                best_idx = max(remaining, key=combined_score)

            selected.append(best_idx)
            remaining.remove(best_idx)

        return np.array(selected), scores[selected]

    def save(self, path: Path):
        """Save ensemble weights and normalization config to a directory."""
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)

        # Save each model
        for i, model in enumerate(self.models):
            torch.save(model.state_dict(), path / f"model_{i}.pt")

        # Save normalization stats and config
        config = {
            'input_dim': self.input_dim,
            'output_dim': self.output_dim,
            'n_models': self.n_models,
            'hidden_dims': self.hidden_dims,
            'x_mean': self.x_mean.tolist() if self.x_mean is not None else None,
            'x_std': self.x_std.tolist() if self.x_std is not None else None,
            'y_mean': self.y_mean.tolist() if self.y_mean is not None else None,
            'y_std': self.y_std.tolist() if self.y_std is not None else None,
        }
        with open(path / "config.json", 'w') as f:
            json.dump(config, f, indent=2)

        logger.info(f"[ENSEMBLE] Saved to {path}")

    @classmethod
    def load(cls, path: Path, device: str = 'auto') -> 'EnsembleSurrogate':
        """Load ensemble from disk.

        Note: the OOD detector is NOT restored (training inputs aren't
        saved), so predict_with_confidence/acquisition_score need a
        retrain to work after load.
        """
        path = Path(path)

        with open(path / "config.json") as f:
            config = json.load(f)

        surrogate = cls(
            input_dim=config['input_dim'],
            output_dim=config['output_dim'],
            n_models=config['n_models'],
            hidden_dims=config['hidden_dims'],
            device=device
        )

        # Load normalization — explicit None checks: save() writes null for
        # unset stats, and truthiness would misfire on empty lists.
        surrogate.x_mean = np.array(config['x_mean']) if config['x_mean'] is not None else None
        surrogate.x_std = np.array(config['x_std']) if config['x_std'] is not None else None
        surrogate.y_mean = np.array(config['y_mean']) if config['y_mean'] is not None else None
        surrogate.y_std = np.array(config['y_std']) if config['y_std'] is not None else None

        # Load models
        for i, model in enumerate(surrogate.models):
            model.load_state_dict(torch.load(path / f"model_{i}.pt", map_location=surrogate.device))
            model.to(surrogate.device)

        surrogate.is_trained = True
        logger.info(f"[ENSEMBLE] Loaded from {path}")

        return surrogate
|
||||
|
||||
|
||||
# Convenience function for quick usage
def create_and_train_ensemble(
    X: np.ndarray,
    Y: np.ndarray,
    n_models: int = 5,
    epochs: int = 500
) -> EnsembleSurrogate:
    """Build an EnsembleSurrogate sized to the data, fit it, and return it."""
    # Single-output targets arrive as 1-D arrays; the surrogate wants 2-D.
    out_dim = 1 if Y.ndim == 1 else Y.shape[1]
    targets = Y.reshape(-1, 1) if Y.ndim == 1 else Y

    surrogate = EnsembleSurrogate(
        input_dim=X.shape[1],
        output_dim=out_dim,
        n_models=n_models
    )

    metrics = surrogate.train(X, targets, epochs=epochs)
    logger.info(f"[ENSEMBLE] Training complete: R²={metrics['r2']}, avg_std={metrics['avg_ensemble_std']:.4f}")

    return surrogate
|
||||
Reference in New Issue
Block a user