feat: Add NN Quality Assessor with relative accuracy thresholds

The Method Selector now uses relative accuracy thresholds to assess
NN suitability by comparing NN error to problem variability (CV ratio).

NNQualityAssessor features:
- Physics-based objective classification (linear, smooth, nonlinear, chaotic)
- CV ratio computation: nn_error / (CV × 100), i.e. NN % error relative to the CV expressed as a percentage
- Turbo suitability score based on relative thresholds
- Data collection from validation_report.json, turbo_report.json, and study.db

Quality thresholds by objective type:
- Linear (mass, volume): max 2% error, CV ratio < 0.5
- Smooth (frequency): max 5% error, CV ratio < 1.0
- Nonlinear (stress, stiffness): max 10% error, CV ratio < 2.0
- Chaotic (contact, buckling): max 20% error, CV ratio < 3.0

CLI output now includes:
- Per-objective NN quality table with error, CV, ratio, and quality indicator
- Turbo suitability and hybrid suitability percentages
- Warnings when NN error exceeds physics-based thresholds

Updated SYS_15_METHOD_SELECTOR.md to v2.0 with full NN Quality Assessment documentation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Antoine
2025-12-07 06:38:25 -05:00
parent 3e9488d9f0
commit 6cf12d9344
2 changed files with 583 additions and 53 deletions

View File

@@ -165,6 +165,306 @@ class MethodRecommendation:
return asdict(self)
@dataclass
class NNQualityMetrics:
    """NN surrogate quality metrics with relative thresholds.

    Key insight: NN error should be compared to the coefficient of variation (CV)
    of each objective to determine if the NN is learning the physics properly.
    - If nn_error >> CV → NN is unreliable (not learning, just noise)
    - If nn_error ≈ CV → NN captures the trend (hybrid recommended)
    - If nn_error << CV → NN is excellent (turbo viable)
    """
    # Set True by the data loaders once NN error data has been found
    has_nn_data: bool = False
    # Number of FEA validations the error statistics are based on
    n_validations: int = 0

    # Per-objective metrics
    nn_errors: Dict[str, float] = field(default_factory=dict)  # Absolute % error
    cv_ratios: Dict[str, float] = field(default_factory=dict)  # nn_error / (CV * 100)
    expected_errors: Dict[str, float] = field(default_factory=dict)  # Based on physics type

    # Overall quality scores (0-1, higher = better)
    overall_quality: float = 0.5     # NN error vs physics-expected max error
    turbo_suitability: float = 0.0   # how safe it is to trust the NN alone
    hybrid_suitability: float = 0.5  # how useful the NN is with FEA validation

    # Physics type classification used
    objective_types: Dict[str, str] = field(default_factory=dict)  # 'linear', 'smooth', 'nonlinear', 'chaotic'

    def to_dict(self) -> dict:
        """Return a plain-dict form of all fields (e.g. for JSON output)."""
        return asdict(self)
class NNQualityAssessor:
    """Assesses NN surrogate quality relative to problem complexity.

    Uses physics-based expected error thresholds rather than absolute values.
    The key metric is the CV ratio: nn_error / coefficient_of_variation.

    CV Ratio Interpretation:
    - < 0.5 → NN is excellent (captures physics well beyond noise)
    - 0.5-1 → NN is good (adds value for exploration)
    - 1-2   → NN is marginal (use with validation)
    - > 2   → NN is poor (not learning physics, use FEA)
    """

    # Physics-based expected error thresholds per objective class.
    # max_error is an absolute % error ceiling; cv_ratio_max caps the NN
    # error relative to the objective's natural variability.
    PHYSICS_THRESHOLDS = {
        'linear': {'max_error': 2.0, 'cv_ratio_max': 0.5},      # mass, volume - deterministic
        'smooth': {'max_error': 5.0, 'cv_ratio_max': 1.0},      # frequency, avg stress
        'nonlinear': {'max_error': 10.0, 'cv_ratio_max': 2.0},  # max stress, stiffness
        'chaotic': {'max_error': 20.0, 'cv_ratio_max': 3.0},    # contact, buckling, fracture
    }

    # Objective name to physics type classification.
    # Keys are deliberately lowercase: _classify_objective lowercases the
    # objective name before the partial-match scan, making matching
    # case-insensitive.
    OBJECTIVE_CLASSIFICATION = {
        # Linear (deterministic, easy to learn)
        'mass': 'linear',
        'volume': 'linear',
        'weight': 'linear',
        'area': 'linear',
        # Smooth (well-behaved, moderate difficulty)
        'frequency': 'smooth',
        'fundamental_frequency': 'smooth',
        'first_frequency': 'smooth',
        'avg_stress': 'smooth',
        'mean_stress': 'smooth',
        'displacement': 'smooth',
        'avg_displacement': 'smooth',
        'compliance': 'smooth',
        # Nonlinear (sensitive to details, harder to learn)
        'stress': 'nonlinear',
        'max_stress': 'nonlinear',
        'von_mises': 'nonlinear',
        'stiffness': 'nonlinear',
        'max_displacement': 'nonlinear',
        'strain_energy': 'nonlinear',
        # Chaotic (highly nonlinear, very hard to learn)
        'buckling': 'chaotic',
        'contact_force': 'chaotic',
        'fracture': 'chaotic',
        'fatigue': 'chaotic',
    }

    def collect(self, results_dir: Path, objective_names: List[str],
                early_metrics: 'EarlyMetrics') -> 'NNQualityMetrics':
        """Collect NN quality metrics from validation reports and database.

        Sources are tried in decreasing order of reliability; a later source
        is only consulted when an earlier one produced no usable error data.

        Args:
            results_dir: Path to 2_results directory
            objective_names: List of objective names from config
            early_metrics: EarlyMetrics with coefficient_of_variation data

        Returns:
            NNQualityMetrics with quality scores and recommendations
        """
        metrics = NNQualityMetrics()

        # 1. validation_report.json first (most reliable - explicit FEA comparison)
        validation_report = results_dir / "validation_report.json"
        if validation_report.exists():
            self._load_from_validation_report(validation_report, metrics, objective_names)

        # 2. turbo_report.json (has per-iteration errors)
        turbo_report = results_dir / "turbo_report.json"
        if turbo_report.exists() and not metrics.has_nn_data:
            self._load_from_turbo_report(turbo_report, metrics, objective_names)

        # 3. Optuna database nn_error_percent user attributes
        db_path = results_dir / "study.db"
        if db_path.exists() and not metrics.has_nn_data:
            self._load_from_database(db_path, metrics, objective_names)

        # 4. Relative metrics need both NN errors and CV data
        if metrics.has_nn_data and early_metrics.coefficient_of_variation:
            self._compute_relative_metrics(metrics, early_metrics, objective_names)

        return metrics

    def _load_from_validation_report(self, report_path: Path, metrics: 'NNQualityMetrics',
                                     objective_names: List[str]):
        """Load NN error data from validation_report.json (best effort)."""
        try:
            with open(report_path) as f:
                report = json.load(f)
            metrics.n_validations = report.get('n_validated', 0)

            # Get average errors per objective
            avg_errors = report.get('average_errors_percent', {})
            for obj_name in objective_names:
                # Try exact match first
                error = avg_errors.get(obj_name)
                if error is None:
                    # Fall back to partial match (e.g. 'mass' in 'total_mass')
                    for key, val in avg_errors.items():
                        if obj_name.lower() in key.lower() or key.lower() in obj_name.lower():
                            error = val
                            break
                if error is not None:
                    metrics.nn_errors[obj_name] = float(error)

            # FIX: only claim NN data when at least one objective error was
            # actually matched. Previously has_nn_data could be True with an
            # empty nn_errors dict, which blocked the fallback sources.
            if metrics.nn_errors:
                metrics.has_nn_data = True
        except Exception:
            # Best-effort: an unreadable/malformed report just means we fall
            # through to the next data source.
            pass

    def _load_from_turbo_report(self, report_path: Path, metrics: 'NNQualityMetrics',
                                objective_names: List[str]):
        """Load NN error data from turbo_report.json (best effort)."""
        try:
            with open(report_path) as f:
                report = json.load(f)
            metrics.n_validations = report.get('fea_validations', 0)

            # Collect per-iteration error vectors from the best solutions
            all_errors = [sol['nn_error']
                          for sol in report.get('best_solutions', [])
                          if sol.get('nn_error')]

            # Guard against ragged rows: np.mean(axis=0) needs equal lengths,
            # so keep only vectors matching the first one's length.
            if all_errors:
                width = len(all_errors[0])
                all_errors = [e for e in all_errors if len(e) == width]

            if all_errors:
                # Average across all validations
                avg_errors = np.mean(all_errors, axis=0)
                # Map positionally to objective names (turbo only tracks
                # mass, stress typically)
                for i, obj_name in enumerate(objective_names[:len(avg_errors)]):
                    metrics.nn_errors[obj_name] = float(avg_errors[i])
                # FIX: set only when errors were actually collected (was set
                # whenever best_solutions was non-empty).
                metrics.has_nn_data = True
        except Exception:
            pass

    def _load_from_database(self, db_path: Path, metrics: 'NNQualityMetrics',
                            objective_names: List[str]):
        """Load NN error data from Optuna trial user attributes (best effort)."""
        try:
            conn = sqlite3.connect(str(db_path))
            try:
                cursor = conn.cursor()
                # Query nn_error_percent from trial_user_attributes
                cursor.execute("""
                    SELECT value_json FROM trial_user_attributes
                    WHERE key = 'nn_error_percent'
                """)
                rows = cursor.fetchall()
            finally:
                # FIX: close the connection even when the query raises
                conn.close()

            all_errors = []
            for (value_json,) in rows:
                try:
                    errors = json.loads(value_json)
                except (ValueError, TypeError):  # FIX: was a bare except
                    continue
                if isinstance(errors, list):
                    all_errors.append(errors)

            # Guard against ragged rows before averaging (see turbo loader)
            if all_errors:
                width = len(all_errors[0])
                all_errors = [e for e in all_errors if len(e) == width]

            if all_errors:
                metrics.has_nn_data = True
                metrics.n_validations = len(all_errors)
                # Average across all validated trials
                avg_errors = np.mean(all_errors, axis=0)
                for i, obj_name in enumerate(objective_names[:len(avg_errors)]):
                    metrics.nn_errors[obj_name] = float(avg_errors[i])
        except Exception:
            pass

    def _classify_objective(self, obj_name: str) -> str:
        """Classify an objective by physics type.

        Exact (case-sensitive) lookup first, then a case-insensitive
        substring match in either direction. Unknown names default to
        'smooth' as a middle-of-the-road assumption.
        """
        if obj_name in self.OBJECTIVE_CLASSIFICATION:
            return self.OBJECTIVE_CLASSIFICATION[obj_name]
        obj_lower = obj_name.lower()
        for key, obj_type in self.OBJECTIVE_CLASSIFICATION.items():
            if key in obj_lower or obj_lower in key:
                return obj_type
        return 'smooth'

    def _compute_relative_metrics(self, metrics: 'NNQualityMetrics',
                                  early_metrics: 'EarlyMetrics',
                                  objective_names: List[str]):
        """Compute NN error relative to objective variability (CV)."""
        for obj_name in objective_names:
            nn_error = metrics.nn_errors.get(obj_name)
            if nn_error is None:
                continue

            cv = early_metrics.coefficient_of_variation.get(obj_name, 0.1)

            # CV ratio = how many times larger the NN error is than the
            # natural variability (nn_error is %, cv is a fraction).
            if cv > 0.001:
                cv_ratio = nn_error / (cv * 100)
            else:
                # Very low CV means linear/deterministic - use absolute error
                cv_ratio = nn_error / 2.0  # Normalize to 2% baseline

            metrics.cv_ratios[obj_name] = cv_ratio

            # Classify and store expected error for this physics type
            obj_type = self._classify_objective(obj_name)
            metrics.objective_types[obj_name] = obj_type
            metrics.expected_errors[obj_name] = self.PHYSICS_THRESHOLDS[obj_type]['max_error']

        # Compute overall quality scores
        self._compute_quality_scores(metrics)

    def _compute_quality_scores(self, metrics: 'NNQualityMetrics'):
        """Compute overall quality scores based on relative metrics."""
        if not metrics.cv_ratios:
            return

        quality_scores = []
        turbo_scores = []
        hybrid_scores = []

        for obj_name, cv_ratio in metrics.cv_ratios.items():
            obj_type = metrics.objective_types.get(obj_name, 'smooth')
            threshold = self.PHYSICS_THRESHOLDS[obj_type]

            # Quality: how well does NN error compare to expected max?
            nn_error = metrics.nn_errors.get(obj_name, 0)
            expected = threshold['max_error']
            # sqrt of the ratio: penalizes even small errors noticeably and
            # reaches 0 exactly at the expected-error threshold.
            quality = max(0, min(1, 1 - (nn_error / expected) ** 0.5)) if expected > 0 else 0.5
            quality_scores.append(quality)

            # Turbo suitability: cv_ratio should be < cv_ratio_max
            # (lower ratio = better; NN captures more than noise)
            cv_max = threshold['cv_ratio_max']
            turbo = max(0, min(1, 1 - cv_ratio / cv_max)) if cv_max > 0 else 0.5
            turbo_scores.append(turbo)

            # Hybrid suitability: more lenient threshold (2x) - the NN just
            # needs to add some value on top of FEA
            hybrid = max(0, min(1, 1 - cv_ratio / (cv_max * 2))) if cv_max > 0 else 0.5
            hybrid_scores.append(hybrid)

        metrics.overall_quality = float(np.mean(quality_scores)) if quality_scores else 0.5
        metrics.turbo_suitability = float(np.mean(turbo_scores)) if turbo_scores else 0.0
        metrics.hybrid_suitability = float(np.mean(hybrid_scores)) if hybrid_scores else 0.5
class ProblemProfiler:
"""Analyzes optimization config to extract problem characteristics."""
@@ -450,11 +750,13 @@ class AdaptiveMethodSelector:
- Each method starts with a base score
- Scores are adjusted based on problem characteristics
- Early metrics further refine the recommendation
- NN quality metrics adjust confidence based on actual surrogate performance
"""
def __init__(self):
self.profiler = ProblemProfiler()
self.metrics_collector = EarlyMetricsCollector()
self.nn_quality_assessor = NNQualityAssessor()
# Method base scores (can be tuned based on historical performance)
self.base_scores = {
@@ -464,8 +766,13 @@ class AdaptiveMethodSelector:
OptimizationMethod.GNN_FIELD: 0.4,
}
# Store last metrics for reporting
self.last_nn_quality: Optional[NNQualityMetrics] = None
self.last_early_metrics: Optional[EarlyMetrics] = None
def recommend(self, config: dict, db_path: Path = None,
early_metrics: EarlyMetrics = None) -> MethodRecommendation:
early_metrics: EarlyMetrics = None,
results_dir: Path = None) -> MethodRecommendation:
"""
Generate method recommendation.
@@ -473,6 +780,7 @@ class AdaptiveMethodSelector:
config: Optimization config dict
db_path: Optional path to existing study.db for early metrics
early_metrics: Pre-computed early metrics (optional)
results_dir: Optional path to 2_results directory for NN quality data
Returns:
MethodRecommendation with method, confidence, and parameters
@@ -489,8 +797,24 @@ class AdaptiveMethodSelector:
config.get('constraints', [])
)
# Score each method
scores = self._score_methods(profile, early_metrics)
# Collect NN quality metrics if results directory exists
nn_quality = None
if results_dir is None and db_path:
results_dir = db_path.parent # study.db is typically in 2_results
if results_dir and results_dir.exists() and early_metrics:
nn_quality = self.nn_quality_assessor.collect(
results_dir,
profile.objective_names,
early_metrics
)
self.last_nn_quality = nn_quality
# Store early_metrics for reporting
self.last_early_metrics = early_metrics
# Score each method (now includes NN quality)
scores = self._score_methods(profile, early_metrics, nn_quality)
# Sort by score
ranked = sorted(scores.items(), key=lambda x: x[1]['score'], reverse=True)
@@ -511,14 +835,15 @@ class AdaptiveMethodSelector:
}
for m, info in ranked[1:3]
],
warnings=self._get_warnings(profile, early_metrics)
warnings=self._get_warnings(profile, early_metrics, nn_quality)
)
return recommendation
def _score_methods(self, profile: ProblemProfile,
metrics: EarlyMetrics = None) -> Dict[OptimizationMethod, Dict]:
"""Score each method based on problem characteristics."""
metrics: EarlyMetrics = None,
nn_quality: NNQualityMetrics = None) -> Dict[OptimizationMethod, Dict]:
"""Score each method based on problem characteristics and NN quality."""
scores = {}
@@ -528,7 +853,7 @@ class AdaptiveMethodSelector:
# === TURBO MODE ===
if method == OptimizationMethod.TURBO:
# Good for: low-dimensional, smooth, sufficient budget
# Good for: low-dimensional, smooth, sufficient budget, good NN quality
if profile.n_variables <= 5:
score += 0.15
@@ -551,7 +876,24 @@ class AdaptiveMethodSelector:
score -= 0.2
reasons.append(f"rough landscape ({metrics.response_smoothness:.0%})")
if metrics and metrics.nn_accuracy and metrics.nn_accuracy > 0.9:
# NEW: NN Quality-based adjustments using relative thresholds
if nn_quality and nn_quality.has_nn_data:
if nn_quality.turbo_suitability > 0.8:
score += 0.25
reasons.append(f"excellent NN quality ({nn_quality.turbo_suitability:.0%})")
elif nn_quality.turbo_suitability > 0.5:
score += 0.1
reasons.append(f"good NN quality ({nn_quality.turbo_suitability:.0%})")
elif nn_quality.turbo_suitability < 0.3:
score -= 0.25
reasons.append(f"poor NN quality ({nn_quality.turbo_suitability:.0%}) - use hybrid")
# Per-objective warnings for high CV ratios
for obj, cv_ratio in nn_quality.cv_ratios.items():
if cv_ratio > 2.0:
score -= 0.1
reasons.append(f"{obj}: NN error >> variability")
elif metrics and metrics.nn_accuracy and metrics.nn_accuracy > 0.9:
score += 0.1
reasons.append(f"excellent NN fit ({metrics.nn_accuracy:.0%})")
@@ -575,6 +917,15 @@ class AdaptiveMethodSelector:
score += 0.05
reasons.append("adequate budget for iterations")
# NEW: NN Quality adjustments for hybrid
if nn_quality and nn_quality.has_nn_data:
if nn_quality.hybrid_suitability > 0.5:
score += 0.15
reasons.append("NN adds value with periodic retraining")
if nn_quality.turbo_suitability < 0.5:
score += 0.1
reasons.append("NN quality suggests hybrid over turbo")
# === PURE FEA ===
elif method == OptimizationMethod.PURE_FEA:
# Good for: small budget, highly nonlinear, rough landscape
@@ -595,6 +946,12 @@ class AdaptiveMethodSelector:
score += 0.1
reasons.append("many infeasible designs - need accurate FEA")
# NEW: NN Quality - if NN is truly poor, favor pure FEA
if nn_quality and nn_quality.has_nn_data:
if nn_quality.hybrid_suitability < 0.3:
score += 0.2
reasons.append("NN quality too low - prefer FEA")
# === GNN FIELD ===
elif method == OptimizationMethod.GNN_FIELD:
# Good for: high-dimensional, need field visualization
@@ -670,7 +1027,8 @@ class AdaptiveMethodSelector:
return params
def _get_warnings(self, profile: ProblemProfile,
metrics: EarlyMetrics = None) -> List[str]:
metrics: EarlyMetrics = None,
nn_quality: NNQualityMetrics = None) -> List[str]:
"""Generate warnings about potential issues."""
warnings = []
@@ -699,6 +1057,25 @@ class AdaptiveMethodSelector:
"neural surrogate may have high prediction errors"
)
# NEW: NN Quality warnings
if nn_quality and nn_quality.has_nn_data:
# Per-objective quality warnings
for obj_name, cv_ratio in nn_quality.cv_ratios.items():
obj_type = nn_quality.objective_types.get(obj_name, 'smooth')
nn_error = nn_quality.nn_errors.get(obj_name, 0)
expected = nn_quality.expected_errors.get(obj_name, 5.0)
if cv_ratio > 2.0:
warnings.append(
f"{obj_name}: NN error ({nn_error:.1f}%) >> variability - "
f"NN not learning physics well for this {obj_type} objective"
)
elif nn_error > expected * 1.5:
warnings.append(
f"{obj_name}: NN error ({nn_error:.1f}%) above expected ({expected:.0f}%) - "
f"consider retraining or using hybrid mode"
)
return warnings
@@ -806,8 +1183,9 @@ class RuntimeAdvisor:
}
def print_recommendation(rec: MethodRecommendation, profile: ProblemProfile = None):
"""Pretty-print a method recommendation."""
def print_recommendation(rec: MethodRecommendation, profile: ProblemProfile = None,
nn_quality: NNQualityMetrics = None, early_metrics: EarlyMetrics = None):
"""Pretty-print a method recommendation with NN quality assessment."""
print("\n" + "=" * 70)
print(" OPTIMIZATION METHOD ADVISOR")
@@ -820,6 +1198,45 @@ def print_recommendation(rec: MethodRecommendation, profile: ProblemProfile = No
print(f" Constraints: {profile.n_constraints}")
print(f" Max FEA budget: ~{profile.max_fea_trials} trials")
# NN Quality Assessment Section
if nn_quality and nn_quality.has_nn_data:
print("\nNN Quality Assessment:")
print(f" Validations analyzed: {nn_quality.n_validations}")
print()
# Build table header
print(" | Objective | NN Error | CV | Ratio | Type | Quality |")
print(" |---------------|----------|--------|-------|------------|---------|")
for obj_name in nn_quality.nn_errors.keys():
nn_error = nn_quality.nn_errors.get(obj_name, 0)
cv_ratio = nn_quality.cv_ratios.get(obj_name, 0)
obj_type = nn_quality.objective_types.get(obj_name, 'smooth')
# Get CV from early_metrics if available
cv_pct = 0.0
if early_metrics and early_metrics.coefficient_of_variation:
cv = early_metrics.coefficient_of_variation.get(obj_name, 0)
cv_pct = cv * 100 # Convert to percentage
# Quality indicator
if cv_ratio < 0.5:
quality = "✓ Great"
elif cv_ratio < 1.0:
quality = "✓ Good"
elif cv_ratio < 2.0:
quality = "~ OK"
else:
quality = "✗ Poor"
# Format row
print(f" | {obj_name[:13]:<13} | {nn_error:>6.1f}% | {cv_pct:>5.1f}% | {cv_ratio:>5.2f} | {obj_type:<10} | {quality:<7} |")
print()
print(f" Overall Quality: {nn_quality.overall_quality:.0%}")
print(f" Turbo Suitability: {nn_quality.turbo_suitability:.0%}")
print(f" Hybrid Suitability: {nn_quality.hybrid_suitability:.0%}")
print("\n" + "-" * 70)
print(f"\n RECOMMENDED: {rec.method.upper()}")
print(f" Confidence: {rec.confidence:.0%}")
@@ -843,22 +1260,26 @@ def print_recommendation(rec: MethodRecommendation, profile: ProblemProfile = No
# Convenience function for quick use
def recommend_method(config_path: Path, db_path: Path = None) -> MethodRecommendation:
def recommend_method(config_path: Path, db_path: Path = None,
results_dir: Path = None) -> Tuple[MethodRecommendation, 'AdaptiveMethodSelector']:
"""
Quick method recommendation from config file.
Args:
config_path: Path to optimization_config.json
db_path: Optional path to existing study.db
results_dir: Optional path to results directory (for NN quality assessment)
Returns:
MethodRecommendation
Tuple of (MethodRecommendation, AdaptiveMethodSelector)
The selector contains last_nn_quality and last_early_metrics for display
"""
with open(config_path) as f:
config = json.load(f)
selector = AdaptiveMethodSelector()
return selector.recommend(config, db_path)
rec = selector.recommend(config, db_path, early_metrics=None, results_dir=results_dir)
return rec, selector
if __name__ == "__main__":
@@ -869,7 +1290,14 @@ if __name__ == "__main__":
config_path = Path(sys.argv[1])
db_path = Path(sys.argv[2]) if len(sys.argv) > 2 else None
rec = recommend_method(config_path, db_path)
# Infer results_dir from config_path location (typically in 1_setup)
results_dir = None
if config_path.parent.name == "1_setup":
results_dir = config_path.parent.parent / "2_results"
elif "2_results" in str(config_path):
results_dir = config_path.parent
rec, selector = recommend_method(config_path, db_path, results_dir)
# Also get profile for display
with open(config_path) as f:
@@ -877,6 +1305,12 @@ if __name__ == "__main__":
profiler = ProblemProfiler()
profile = profiler.analyze(config)
print_recommendation(rec, profile)
# Print with NN quality metrics if available
print_recommendation(
rec,
profile,
nn_quality=selector.last_nn_quality,
early_metrics=selector.last_early_metrics
)
else:
print("Usage: python method_selector.py <config_path> [db_path]")