# Atomizer/optimization_engine/extractors/extractor_library.py

"""
Extractor Library Manager - Phase 3.2 Architecture Refactor

Manages a centralized library of reusable extractors to prevent code duplication
and keep study folders clean.

Architecture Principles:
1. Reusable extractors stored in optimization_engine/extractors/
2. Study folders only contain metadata (which extractors were used)
3. First-time generation adds to library with documentation
4. Subsequent requests reuse existing library code

Author: Antoine Letarte
Date: 2025-11-17
Phase: 3.2 Architecture Refactor
"""

import json
import hashlib
from pathlib import Path
from typing import Dict, Any, List, Optional
import logging

logger = logging.getLogger(__name__)


class ExtractorLibrary:
    """
    Centralized library of reusable FEA result extractors.

    Prevents code duplication by maintaining a core library of extractors
    that can be reused across all optimization studies.

    Each extractor is stored as a Python module inside ``library_dir`` and is
    indexed in ``catalog.json`` by a signature derived from the feature's
    action, domain and params (see ``_compute_signature``).
    """

    def __init__(self, library_dir: Optional[Path] = None):
        """
        Initialize extractor library.

        Creates the library directory and its ``__init__.py`` if they are
        missing, then loads the catalog from ``catalog.json``.

        Args:
            library_dir: Directory for core extractor library. When omitted,
                        an ``extractors`` directory next to this module is
                        used.
        """
        if library_dir is None:
            library_dir = Path(__file__).parent / "extractors"

        self.library_dir = Path(library_dir)
        self.library_dir.mkdir(parents=True, exist_ok=True)

        # Create __init__.py so the library directory is a Python package
        init_file = self.library_dir / "__init__.py"
        if not init_file.exists():
            init_file.write_text(
                '"""Core extractor library for Atomizer."""\n',
                encoding="utf-8",
            )

        # Library catalog - tracks all available extractors, keyed by signature
        self.catalog_file = self.library_dir / "catalog.json"
        self.catalog = self._load_catalog()

        logger.info("Extractor library initialized: %s", self.library_dir)
        logger.info("Library contains %d extractors", len(self.catalog))

    def _load_catalog(self) -> Dict[str, Any]:
        """
        Load extractor catalog from disk.

        Returns:
            The parsed catalog, or an empty dict when no catalog file exists.

        Raises:
            json.JSONDecodeError: If the catalog file is not valid JSON. The
                error is deliberately not swallowed, so a damaged catalog is
                never silently replaced by an empty one on the next save.
        """
        if not self.catalog_file.exists():
            return {}
        with open(self.catalog_file, encoding="utf-8") as f:
            return json.load(f)

    def _save_catalog(self):
        """
        Save extractor catalog to disk.

        The catalog is written to a sibling temporary file first and then
        moved over ``catalog.json``, so an interrupted write cannot leave a
        truncated catalog behind.
        """
        tmp_file = self.catalog_file.with_name(self.catalog_file.name + ".tmp")
        with open(tmp_file, 'w', encoding="utf-8") as f:
            json.dump(self.catalog, f, indent=2)
        tmp_file.replace(self.catalog_file)

    def _compute_signature(self, llm_feature: Dict[str, Any]) -> str:
        """
        Compute unique signature for an extractor based on its functionality.

        Two extractors are considered identical if they have the same:
        - Action (e.g., extract_displacement)
        - Domain (e.g., result_extraction)
        - Key parameters (e.g., result_type, metric)

        Returns:
            The first 16 hex characters of the SHA-256 digest of the
            normalized feature specification.
        """
        # Normalize the feature specification
        signature_data = {
            'action': llm_feature.get('action', ''),
            'domain': llm_feature.get('domain', ''),
            'params': llm_feature.get('params', {})
        }

        # Create deterministic hash (sort_keys makes key order irrelevant)
        signature_str = json.dumps(signature_data, sort_keys=True)
        return hashlib.sha256(signature_str.encode()).hexdigest()[:16]

    def _choose_filename(self, action: str, signature: str) -> str:
        """
        Pick a module filename for a new extractor without clobbering others.

        Characters other than letters, digits and underscores are replaced by
        underscores, so the result always stays inside ``library_dir``. The
        plain ``<action>.py`` name is used when it is free. If another catalog
        entry already owns it, or a file with that name already exists on
        disk, the signature is appended to keep both modules.

        Args:
            action: Action name of the feature.
            signature: Signature of the extractor being added.

        Returns:
            Filename (not a full path) for the new extractor module.
        """
        stem = ''.join(
            ch if (ch.isalnum() or ch == '_') else '_' for ch in action
        ) or 'unknown_action'
        candidate = f"{stem}.py"

        claimed = any(
            other_signature != signature and info.get('filename') == candidate
            for other_signature, info in self.catalog.items()
        )
        if claimed or (self.library_dir / candidate).exists():
            return f"{stem}_{signature}.py"
        return candidate

    def get_or_create(self, llm_feature: Dict[str, Any], extractor_code: str) -> Path:
        """
        Get existing extractor from library or add new one.

        Args:
            llm_feature: LLM feature specification (action, domain, params)
            extractor_code: Generated Python code for the extractor. Only
                           written when the extractor is not already present.

        Returns:
            Path to extractor module in core library
        """
        # Compute signature to check if extractor already exists
        signature = self._compute_signature(llm_feature)
        action = str(llm_feature.get('action') or 'unknown_action')

        existing = self.catalog.get(signature)
        if existing is not None:
            extractor_file = self.library_dir / existing['filename']
            if extractor_file.exists():
                logger.info("Reusing existing extractor: %s", existing['name'])
                return extractor_file
            # Catalogued but the module is gone: restore it under the same name
            filename = existing['filename']
        else:
            # New extractor: never reuse a filename owned by a different one
            filename = self._choose_filename(action, signature)

        extractor_file = self.library_dir / filename

        # Write extractor code to library (UTF-8 is Python's source encoding)
        extractor_file.write_text(extractor_code, encoding="utf-8")

        # Add to catalog
        self.catalog[signature] = {
            'name': action,
            'filename': filename,
            'action': llm_feature.get('action'),
            'domain': llm_feature.get('domain'),
            'description': llm_feature.get('description', ''),
            'params': llm_feature.get('params', {}),
            'signature': signature
        }
        self._save_catalog()

        logger.info("Added new extractor to library: %s", action)
        return extractor_file

    def get_extractor_metadata(self, signature: str) -> Optional[Dict[str, Any]]:
        """Get metadata for an extractor by its signature (None if unknown)."""
        return self.catalog.get(signature)

    def list_extractors(self) -> List[Dict[str, Any]]:
        """List the catalog entries of all extractors in the library."""
        return list(self.catalog.values())

    def get_library_summary(self) -> str:
        """Generate human-readable summary of library contents."""
        banner = "=" * 80
        lines = [
            banner,
            "ATOMIZER EXTRACTOR LIBRARY",
            banner,
            "",
            f"Location: {self.library_dir}",
            f"Total extractors: {len(self.catalog)}",
            "",
        ]

        if self.catalog:
            lines.append("Available Extractors:")
            lines.append("-" * 80)

            for signature, info in self.catalog.items():
                lines.extend([
                    f"\n{info['name']}",
                    f"  Domain: {info['domain']}",
                    f"  Description: {info['description']}",
                    f"  File: {info['filename']}",
                    f"  Signature: {signature}",
                ])
        else:
            lines.append("Library is empty. Extractors will be added on first use.")

        lines.append("")
        lines.append(banner)

        return "\n".join(lines)


def create_study_manifest(extractors_used: List[str], output_dir: Path):
    """
    Create a manifest file documenting which extractors were used in a study.

    This replaces the old approach of copying extractor code into study folders.
    Now we just record which library extractors were used.

    The manifest is written to ``extractors_manifest.json`` inside
    ``output_dir``; the directory is created first if it does not exist yet.

    Args:
        extractors_used: List of extractor signatures used in this study
        output_dir: Study output directory (a ``Path`` or a plain string path)
    """
    # Accept plain string paths and not-yet-created study folders
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    manifest = {
        # list() so any iterable of signatures (tuple, generator) serializes
        'extractors_used': list(extractors_used),
        'extractor_library': 'optimization_engine/extractors/',
        'note': 'Extractors are stored in the core library, not in this study folder'
    }

    manifest_file = output_dir / "extractors_manifest.json"
    with open(manifest_file, 'w', encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)

    logger.info("Study manifest created: %s", manifest_file)


if __name__ == '__main__':
    # Self-test of the extractor library system.
    #
    # The deduplication check runs against a throwaway library in a temporary
    # directory, so the placeholder extractor below is never registered in the
    # real library, where a later request with the same signature would
    # reuse it.
    import tempfile

    # Show the contents of the default library (constructing it creates the
    # library directory and its __init__.py if they are missing)
    library = ExtractorLibrary()
    print(library.get_library_summary())

    # Feature and placeholder code used for the deduplication check
    test_feature = {
        'action': 'extract_displacement',
        'domain': 'result_extraction',
        'description': 'Extract displacement from OP2 file',
        'params': {'result_type': 'displacement', 'metric': 'max'}
    }

    test_code = '''"""Extract displacement from OP2 file."""
def extract_displacement(op2_file):
    # Implementation here
    pass
'''

    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_library = ExtractorLibrary(Path(scratch_dir) / "extractors")

        # Test adding an extractor
        extractor_path = scratch_library.get_or_create(test_feature, test_code)
        print(f"\nExtractor created/retrieved: {extractor_path}")

        # Try to add it again - should reuse existing
        extractor_path2 = scratch_library.get_or_create(test_feature, test_code)
        print(f"Second call (should reuse): {extractor_path2}")

        # Verify they're the same (explicit raise: still checked under -O)
        if extractor_path != extractor_path2:
            raise AssertionError("Should reuse existing extractor!")

    print("\n[SUCCESS] Extractor deduplication working correctly!")