# Atomizer/optimization_engine/extractors/custom_extractor_loader.py

"""
Custom Extractor Loader

Dynamically loads and executes custom Python extractors defined in AtomizerSpec v2.0.
Provides sandboxed execution with access to FEA results and common analysis libraries.

P3.9: Custom extractor runtime loader
"""

import ast
import hashlib
import importlib
import logging
import re
import sys
import traceback
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np

# Lazy imports for optional dependencies.
# _PYOP2 caches the pyNastran OP2 class the first time
# CustomExtractorContext.op2_result needs it.
_PYOP2 = None
# NOTE(review): _SCIPY is never assigned or read in the code visible in this
# file - confirm nothing else depends on it before removing.
_SCIPY = None

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


# ============================================================================
# Allowed modules for custom extractors (sandboxed environment)
# ============================================================================

# Modules custom extractor code is expected to use.
# NOTE(review): nothing in the code visible in this file reads ALLOWED_MODULES;
# validate_extractor_code() only consults BLOCKED_MODULES, so this set is
# descriptive rather than enforced here - confirm whether enforcement lives
# elsewhere.
ALLOWED_MODULES = {
    # Core Python
    "math",
    "statistics",
    "collections",
    "itertools",
    "functools",
    # Scientific computing
    "numpy",
    "scipy",
    "scipy.interpolate",
    "scipy.optimize",
    "scipy.integrate",
    "scipy.linalg",
    # FEA result parsing
    "pyNastran",
    "pyNastran.op2",
    "pyNastran.op2.op2",
    "pyNastran.bdf",
    "pyNastran.bdf.bdf",
    # Atomizer extractors
    "optimization_engine.extractors",
}

# Root module names whose import makes validate_extractor_code() report an
# error. Only the first dotted component of an imported module is compared
# against this set, so the entries that are builtin function names rather than
# modules ("eval", "exec", "compile", "open", "file") only match an import of a
# module with that name; calls to those builtins are screened separately by the
# regex patterns inside validate_extractor_code().
BLOCKED_MODULES = {
    "os",
    "subprocess",
    "shutil",
    "sys",
    "builtins",
    "__builtins__",
    "importlib",
    "eval",
    "exec",
    "compile",
    "open",
    "file",
    "socket",
    "requests",
    "urllib",
    "http",
}


# ============================================================================
# Code Validation
# ============================================================================

class ExtractorSecurityError(Exception):
    """Signals that custom extractor source matched a disallowed pattern."""


class ExtractorValidationError(Exception):
    """Signals that custom extractor source failed validation or compilation."""


def validate_extractor_code(code: str, function_name: str) -> Tuple[bool, List[str]]:
    """
    Validate custom extractor code for security and correctness.

    Checks run in this order:

    1. The source must parse; a syntax error returns immediately.
    2. A regex screen over the raw source rejects dangerous calls/imports.
       The screen is purely textual, so it also fires on matches that sit
       inside comments or string literals.
    3. ``function_name`` must be defined in module scope (not nested inside
       another function or class) because compilation later looks the function
       up in the module namespace. Its first argument must be one of
       ``op2_path``, ``op2_result``, ``results`` or ``data``.
    4. No import anywhere in the code may target a module whose root name is
       listed in ``BLOCKED_MODULES``.

    Args:
        code: Python source code string
        function_name: Expected function name to find in code

    Returns:
        Tuple of (is_valid, list of error messages)

    Raises:
        ExtractorSecurityError: If a dangerous pattern from step 2 is detected
    """
    errors = []

    # Check for syntax errors first
    try:
        tree = ast.parse(code)
    except SyntaxError as e:
        return False, [f"Syntax error: {e}"]

    # Check for disallowed patterns.
    # NOTE(security): this is a blocklist, not a sandbox. Modules that are not
    # matched here or in BLOCKED_MODULES can still be imported by extractor
    # code, so only validate/load specs from trusted sources.
    dangerous_patterns = [
        (r'\bexec\s*\(', 'exec() is not allowed'),
        (r'\beval\s*\(', 'eval() is not allowed'),
        (r'\bcompile\s*\(', 'compile() is not allowed'),
        (r'\b__import__\s*\(', '__import__() is not allowed'),
        (r'\bopen\s*\(', 'open() is not allowed - use op2_path parameter'),
        (r'\bos\.(system|popen|spawn|exec)', 'os.system/popen/spawn/exec is not allowed'),
        (r'\bsubprocess\.', 'subprocess module is not allowed'),
        (r'\bshutil\.', 'shutil module is not allowed'),
        (r'import\s+os\b', 'import os is not allowed'),
        (r'from\s+os\b', 'from os import is not allowed'),
        (r'import\s+subprocess', 'import subprocess is not allowed'),
        (r'import\s+sys\b', 'import sys is not allowed'),
    ]

    for pattern, message in dangerous_patterns:
        if re.search(pattern, code):
            raise ExtractorSecurityError(message)

    # Check that the expected function exists in module scope. A nested
    # function or a method with the same name would never end up in the
    # namespace produced by executing the code, so it must not count.
    entry_point = next(
        (
            node
            for node in _iter_module_scope_nodes(tree.body)
            if isinstance(node, ast.FunctionDef) and node.name == function_name
        ),
        None,
    )

    if entry_point is None:
        errors.append(f"Function '{function_name}' not found in code")
    else:
        # Check function signature
        arg_names = [arg.arg for arg in entry_point.args.args]

        # Must have op2_path as first argument (or op2_result/results)
        valid_first_args = {'op2_path', 'op2_result', 'results', 'data'}
        if not arg_names or arg_names[0] not in valid_first_args:
            errors.append(
                f"Function {function_name} must have first argument from: "
                f"{valid_first_args}, got: {arg_names[0] if arg_names else 'none'}"
            )

    # Check imports everywhere in the code, including inside function bodies.
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                module = alias.name.split('.')[0]
                if module in BLOCKED_MODULES:
                    errors.append(f"Import of '{alias.name}' is not allowed")
        elif isinstance(node, ast.ImportFrom):
            # node.module is None for "from . import x"; nothing to compare then.
            if node.module:
                module = node.module.split('.')[0]
                if module in BLOCKED_MODULES:
                    errors.append(f"Import from '{node.module}' is not allowed")

    return len(errors) == 0, errors


def _iter_module_scope_nodes(nodes):
    """
    Yield, in source order, the AST nodes that execute in module scope.

    Compound statements such as ``if``/``try``/``with`` are descended into,
    but the bodies of functions, classes and lambdas are not, because names
    bound there do not land in the module namespace.

    Args:
        nodes: Iterable of ``ast.AST`` nodes to start from (e.g. ``tree.body``)
    """
    scope_types = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef, ast.Lambda)
    for node in nodes:
        yield node
        if not isinstance(node, scope_types):
            yield from _iter_module_scope_nodes(ast.iter_child_nodes(node))


# ============================================================================
# Extractor Compilation and Execution
# ============================================================================

class CustomExtractorContext:
    """
    Execution context for custom extractors.

    Holds the file locations and design parameters for one trial and exposes
    the parsed OP2 results / BDF model as lazily loaded properties, so the
    (potentially expensive) pyNastran parsing only happens when requested.
    """

    def __init__(self, op2_path: Optional[Path] = None,
                 bdf_path: Optional[Path] = None,
                 working_dir: Optional[Path] = None,
                 params: Optional[Dict[str, float]] = None):
        """
        Initialize extractor context.

        Args:
            op2_path: Path to OP2 results file
            bdf_path: Path to BDF model file
            working_dir: Working directory for the trial
            params: Current design parameters (defaults to an empty dict)
        """
        # Falsy inputs (None, "") are stored as None rather than Path("").
        self.op2_path = Path(op2_path) if op2_path else None
        self.bdf_path = Path(bdf_path) if bdf_path else None
        self.working_dir = Path(working_dir) if working_dir else None
        self.params = params or {}

        # Lazy-loaded results
        self._op2_result = None
        self._bdf_model = None

    @property
    def op2_result(self):
        """
        Lazy-load OP2 results.

        Returns:
            The pyNastran OP2 object read from ``op2_path``, or None when no
            path was given or the file does not exist. The object is cached
            after the first successful read.
        """
        if self._op2_result is None and self.op2_path and self.op2_path.exists():
            global _PYOP2
            if _PYOP2 is None:
                from pyNastran.op2.op2 import OP2
                _PYOP2 = OP2
            # OP2() takes ``debug`` as its first parameter, not a filename;
            # the file is read by read_op2(), mirroring the BDF/read_bdf usage
            # in bdf_model.
            op2 = _PYOP2(debug=False)
            op2.read_op2(str(self.op2_path))
            # Cache only after a successful read so a failed parse is retried.
            self._op2_result = op2
        return self._op2_result

    @property
    def bdf_model(self):
        """
        Lazy-load BDF model.

        Returns:
            The pyNastran BDF object read from ``bdf_path``, or None when no
            path was given or the file does not exist. The object is cached
            after the first successful read.
        """
        if self._bdf_model is None and self.bdf_path and self.bdf_path.exists():
            from pyNastran.bdf.bdf import BDF
            bdf = BDF(debug=False)
            bdf.read_bdf(str(self.bdf_path))
            # Cache only after a successful read so a failed parse is retried.
            self._bdf_model = bdf
        return self._bdf_model


class CustomExtractor:
    """
    Compiled custom extractor ready for execution.

    Wraps one user-supplied extraction function: ``compile()`` validates and
    executes the source to obtain the function object, ``execute()`` calls it
    for a given context and normalizes whatever it returns into a dict.
    """

    # First-argument names accepted by validation as alternatives to
    # ``op2_path``. They are fed the lazily loaded OP2 object of the context.
    # NOTE(review): for ``results``/``data`` this mirrors ``op2_result`` because
    # validation treats the three names alike - confirm the intended payload.
    _RESULT_ARG_NAMES = ('op2_result', 'results', 'data')

    def __init__(self, extractor_id: str, name: str, function_name: str,
                 code: str, outputs: List[Dict[str, Any]],
                 dependencies: Optional[List[str]] = None):
        """
        Initialize custom extractor.

        Args:
            extractor_id: Unique extractor ID
            name: Human-readable name
            function_name: Name of the extraction function
            code: Python source code
            outputs: List of output definitions (each read via its 'name' key)
            dependencies: Optional list of required pip packages
        """
        self.extractor_id = extractor_id
        self.name = name
        self.function_name = function_name
        self.code = code
        self.outputs = outputs
        self.dependencies = dependencies or []

        # Compiled function (set by compile())
        self._compiled_func: Optional[Callable] = None
        self._code_hash: Optional[str] = None

    def compile(self) -> None:
        """
        Compile the extractor code and extract the function.

        Raises:
            ExtractorValidationError: If code is invalid
            ExtractorSecurityError: If code contains dangerous patterns
        """
        # Validate code
        is_valid, errors = validate_extractor_code(self.code, self.function_name)
        if not is_valid:
            raise ExtractorValidationError(f"Validation failed: {'; '.join(errors)}")

        # Short content hash, used to identify this code version in the log
        self._code_hash = hashlib.sha256(self.code.encode()).hexdigest()[:12]

        # Create execution namespace with allowed imports
        namespace = {
            'np': np,
            'numpy': np,
            'math': __import__('math'),
            'statistics': __import__('statistics'),
            'collections': __import__('collections'),
            'itertools': __import__('itertools'),
            'functools': __import__('functools'),
        }

        # Add scipy if available
        try:
            import scipy
            namespace['scipy'] = scipy
            from scipy import interpolate, optimize, integrate, linalg
            namespace['interpolate'] = interpolate
            namespace['optimize'] = optimize
            namespace['integrate'] = integrate
            namespace['linalg'] = linalg
        except ImportError:
            pass  # optional dependency: extractors simply cannot use it

        # Add pyNastran if available
        try:
            from pyNastran.op2.op2 import OP2
            from pyNastran.bdf.bdf import BDF
            namespace['OP2'] = OP2
            namespace['BDF'] = BDF
        except ImportError:
            pass  # optional dependency

        # Add Atomizer extractors
        try:
            from optimization_engine import extractors
            namespace['extractors'] = extractors
        except ImportError:
            pass  # optional dependency

        # Execute the code to define the function.
        # SECURITY NOTE: exec() inserts the full builtins into ``namespace``,
        # so the only protection is the textual/AST screening performed by
        # validate_extractor_code() above. Only load specs from trusted sources.
        try:
            exec(self.code, namespace)
        except Exception as e:
            raise ExtractorValidationError(f"Failed to compile: {e}") from e

        # Extract the function
        if self.function_name not in namespace:
            raise ExtractorValidationError(f"Function '{self.function_name}' not defined")

        self._compiled_func = namespace[self.function_name]
        logger.info(f"Compiled custom extractor: {self.name} ({self._code_hash})")

    def execute(self, context: "CustomExtractorContext") -> Dict[str, float]:
        """
        Execute the extractor and return results.

        The function is compiled on first use. It is called with only those
        context values its signature can accept, so both a minimal
        ``def extract(op2_path)`` and a full
        ``def extract(op2_path, bdf_path, params, working_dir)`` work.

        Args:
            context: Execution context with FEA results

        Returns:
            Dictionary of output_name -> value. A dict returned by the
            function is passed through unchanged; a scalar is stored under the
            first output name (or 'value'); a sequence is mapped onto the
            output names in order, with 'output_<i>' for any extras.

        Raises:
            RuntimeError: If execution fails or the result type is unsupported
        """
        if self._compiled_func is None:
            self.compile()

        try:
            # Call the function with appropriate arguments
            result = self._compiled_func(**self._build_call_kwargs(context))
            return self._normalize_result(result)
        except Exception as e:
            logger.error(f"Custom extractor {self.name} failed: {e}")
            logger.debug(traceback.format_exc())
            raise RuntimeError(f"Extractor {self.name} failed: {e}") from e

    def _build_call_kwargs(self, context: "CustomExtractorContext") -> Dict[str, Any]:
        """Select the keyword arguments the compiled function can accept."""
        # Local import keeps this block self-contained.
        import inspect

        offered = {
            'op2_path': str(context.op2_path) if context.op2_path else None,
            'bdf_path': str(context.bdf_path) if context.bdf_path else None,
            'params': context.params,
            'working_dir': str(context.working_dir) if context.working_dir else None,
        }

        try:
            parameters = inspect.signature(self._compiled_func).parameters
        except (TypeError, ValueError):
            # Signature not introspectable: pass everything, as before.
            return offered

        by_keyword = (
            inspect.Parameter.POSITIONAL_OR_KEYWORD,
            inspect.Parameter.KEYWORD_ONLY,
        )
        named = [p.name for p in parameters.values() if p.kind in by_keyword]
        takes_var_keywords = any(
            p.kind is inspect.Parameter.VAR_KEYWORD for p in parameters.values()
        )

        if takes_var_keywords:
            call_kwargs = dict(offered)
        else:
            call_kwargs = {key: value for key, value in offered.items() if key in named}

        # Validation also admits result-style first arguments; touching
        # context.op2_result triggers the (lazy) OP2 load only in that case.
        if named and named[0] in self._RESULT_ARG_NAMES:
            call_kwargs[named[0]] = context.op2_result

        return call_kwargs

    def _normalize_result(self, result: Any) -> Dict[str, float]:
        """Convert the raw return value of the function into an output dict."""
        if isinstance(result, dict):
            return result

        raw_type = type(result)
        # NumPy scalars (np.float32, np.int64, ...) and arrays are not
        # int/float/list instances; convert them to plain Python values first.
        if isinstance(result, (np.generic, np.ndarray)):
            result = result.tolist()

        if isinstance(result, (int, float)):
            # Single value - use first output name
            if self.outputs:
                return {self.outputs[0]['name']: float(result)}
            return {'value': float(result)}

        if isinstance(result, (list, tuple)):
            # Multiple values - map to output names
            output_dict = {}
            for i, val in enumerate(result):
                if i < len(self.outputs):
                    output_dict[self.outputs[i]['name']] = float(val)
                else:
                    output_dict[f'output_{i}'] = float(val)
            return output_dict

        raise RuntimeError(f"Unexpected result type: {raw_type}")


# ============================================================================
# Extractor Loader
# ============================================================================

class CustomExtractorLoader:
    """
    Builds custom extractors out of an AtomizerSpec and remembers every
    successfully compiled one in a cache keyed by extractor ID.
    """

    def __init__(self):
        """Start with nothing cached."""
        self._cache: Dict[str, CustomExtractor] = {}

    def load_from_spec(self, spec: Dict[str, Any]) -> Dict[str, CustomExtractor]:
        """
        Compile every custom (non-builtin) extractor listed in a spec.

        Entries without source code are skipped with a warning; entries that
        fail validation or compilation are logged and left out.

        Args:
            spec: AtomizerSpec dictionary

        Returns:
            Dictionary of extractor_id -> CustomExtractor for the entries
            that compiled
        """
        loaded = {}

        for definition in spec.get('extractors', []):
            # An entry counts as builtin unless it explicitly says otherwise.
            if definition.get('builtin', True):
                continue

            # Without a function source there is nothing to compile.
            function_spec = definition.get('function', {})
            if not function_spec.get('source'):
                logger.warning(f"Custom extractor {definition.get('id')} has no source code")
                continue

            candidate = CustomExtractor(
                extractor_id=definition.get('id', 'custom'),
                name=definition.get('name', 'Custom Extractor'),
                function_name=function_spec.get('name', 'extract'),
                code=function_spec.get('source', ''),
                outputs=definition.get('outputs', []),
                dependencies=function_spec.get('dependencies', []),
            )

            try:
                candidate.compile()
            except (ExtractorValidationError, ExtractorSecurityError) as exc:
                logger.error(f"Failed to load extractor {candidate.name}: {exc}")
            else:
                # Register in both the returned mapping and the loader cache.
                loaded[candidate.extractor_id] = candidate
                self._cache[candidate.extractor_id] = candidate

        return loaded

    def get(self, extractor_id: str) -> Optional[CustomExtractor]:
        """Look up a cached extractor; returns None when the ID is unknown."""
        return self._cache.get(extractor_id)

    def execute_all(self, extractors: Dict[str, CustomExtractor],
                    context: CustomExtractorContext) -> Dict[str, Dict[str, float]]:
        """
        Run each given extractor against one context.

        A failing extractor does not abort the run: it is logged and all of
        its declared outputs are reported as NaN.

        Args:
            extractors: Dictionary of extractor_id -> CustomExtractor
            context: Execution context

        Returns:
            Dictionary of extractor_id -> {output_name: value}
        """
        collected = {}

        for ext_id, extractor in extractors.items():
            try:
                outcome = extractor.execute(context)
            except Exception as e:
                logger.error(f"Extractor {ext_id} failed: {e}")
                # NaN placeholder for every declared output
                outcome = {
                    out['name']: float('nan')
                    for out in extractor.outputs
                }
            collected[ext_id] = outcome

        return collected

    def clear_cache(self) -> None:
        """Forget every cached extractor."""
        self._cache.clear()


# ============================================================================
# Convenience Functions
# ============================================================================

# Global loader instance shared by the convenience functions below:
# load_custom_extractors() fills its cache and execute_custom_extractor()
# reads from it.
_loader = CustomExtractorLoader()


def load_custom_extractors(spec: Dict[str, Any]) -> Dict[str, CustomExtractor]:
    """
    Compile the custom extractors of an AtomizerSpec via the module-level
    loader, which also caches them for execute_custom_extractor().

    Args:
        spec: AtomizerSpec dictionary

    Returns:
        Dictionary of extractor_id -> CustomExtractor
    """
    loaded = _loader.load_from_spec(spec)
    return loaded


def execute_custom_extractor(extractor_id: str,
                              op2_path: Union[str, Path],
                              bdf_path: Optional[Union[str, Path]] = None,
                              working_dir: Optional[Union[str, Path]] = None,
                              params: Optional[Dict[str, float]] = None) -> Dict[str, float]:
    """
    Run one extractor that was previously cached by the module-level loader.

    Args:
        extractor_id: ID of the extractor to run
        op2_path: Path to OP2 results file
        bdf_path: Optional path to BDF file
        working_dir: Optional working directory
        params: Optional design parameters

    Returns:
        Dictionary of output_name -> value

    Raises:
        KeyError: If extractor not found in cache
    """
    cached = _loader.get(extractor_id)
    if cached is None:
        raise KeyError(f"Extractor '{extractor_id}' not found in cache")

    # Bundle the per-trial inputs into a context and hand it to the extractor.
    return cached.execute(
        CustomExtractorContext(
            op2_path=op2_path,
            bdf_path=bdf_path,
            working_dir=working_dir,
            params=params,
        )
    )


def validate_custom_extractor(code: str, function_name: str = "extract") -> Tuple[bool, List[str]]:
    """
    Check custom extractor source without running it.

    Thin public wrapper around validate_extractor_code().

    Args:
        code: Python source code
        function_name: Expected function name

    Returns:
        Tuple of (is_valid, list of error messages)

    Raises:
        ExtractorSecurityError: Propagated from validate_extractor_code() when
            the code matches one of its dangerous patterns
    """
    verdict = validate_extractor_code(code, function_name)
    return verdict


# Public API of this module: the names exported by `from ... import *`.
__all__ = [
    'CustomExtractor',
    'CustomExtractorLoader',
    'CustomExtractorContext',
    'ExtractorSecurityError',
    'ExtractorValidationError',
    'load_custom_extractors',
    'execute_custom_extractor',
    'validate_custom_extractor',
]