optimization_engine/extractor_library.py

"""
Extractor Library Manager - Phase 3.2 Architecture Refactor

Manages a centralized library of reusable extractors to prevent code duplication
and keep study folders clean.

Architecture Principles:
1. Reusable extractors stored in optimization_engine/extractors/
2. Study folders only contain metadata (which extractors were used)
3. First-time generation adds to library with documentation
4. Subsequent requests reuse existing library code

Author: Antoine Letarte
Date: 2025-11-17
Phase: 3.2 Architecture Refactor
"""

import json
import hashlib
from pathlib import Path
from typing import Dict, Any, List, Optional
import logging

logger = logging.getLogger(__name__)


class ExtractorLibrary:
    """
    Centralized library of reusable FEA result extractors.

    Prevents code duplication by maintaining a core library of extractors
    that can be reused across all optimization studies.

    Files kept inside ``library_dir``:
        - ``__init__.py``: created on first use so the directory is a package.
        - ``catalog.json``: maps each extractor signature to its metadata.
        - one ``.py`` module per extractor added through ``get_or_create``.

    Attributes:
        library_dir: Directory holding the extractor modules and the catalog.
        catalog_file: Path of ``catalog.json`` inside ``library_dir``.
        catalog: In-memory catalog, keyed by extractor signature.
    """

    def __init__(self, library_dir: Optional[Path] = None):
        """
        Initialize extractor library.

        Creates the library directory and its ``__init__.py`` if they do not
        exist yet, then loads the catalog from disk.

        Args:
            library_dir: Directory for core extractor library
                        (default: optimization_engine/extractors/)
        """
        if library_dir is None:
            library_dir = Path(__file__).parent / "extractors"

        self.library_dir = Path(library_dir)
        self.library_dir.mkdir(parents=True, exist_ok=True)

        # Create __init__.py for Python package
        init_file = self.library_dir / "__init__.py"
        if not init_file.exists():
            init_file.write_text(
                '"""Core extractor library for Atomizer."""\n', encoding='utf-8'
            )

        # Library catalog - tracks all available extractors
        self.catalog_file = self.library_dir / "catalog.json"
        self.catalog = self._load_catalog()

        logger.info(f"Extractor library initialized: {self.library_dir}")
        logger.info(f"Library contains {len(self.catalog)} extractors")

    def _load_catalog(self) -> Dict[str, Any]:
        """
        Load extractor catalog from disk.

        Returns:
            The parsed contents of ``catalog.json``, or an empty dict when the
            file does not exist yet.

        Raises:
            json.JSONDecodeError: If the catalog file is not valid JSON. This is
                deliberately not swallowed: starting from an empty catalog would
                cause the next save to overwrite the existing file.
        """
        if not self.catalog_file.exists():
            return {}
        with open(self.catalog_file, encoding='utf-8') as f:
            return json.load(f)

    def _save_catalog(self):
        """Save extractor catalog to disk as indented JSON."""
        with open(self.catalog_file, 'w', encoding='utf-8') as f:
            json.dump(self.catalog, f, indent=2)

    def _compute_signature(self, llm_feature: Dict[str, Any]) -> str:
        """
        Compute unique signature for an extractor based on its functionality.

        Two extractors are considered identical if they have the same:
        - Action (e.g., extract_displacement)
        - Domain (e.g., result_extraction)
        - Key parameters (e.g., result_type, metric)

        Args:
            llm_feature: Feature specification; only the ``action``, ``domain``
                and ``params`` keys take part in the signature.

        Returns:
            The first 16 hex characters of the SHA-256 digest of the
            normalized specification.
        """
        # Normalize the feature specification
        signature_data = {
            'action': llm_feature.get('action', ''),
            'domain': llm_feature.get('domain', ''),
            'params': llm_feature.get('params', {})
        }

        # sort_keys makes the hash independent of dict insertion order
        signature_str = json.dumps(signature_data, sort_keys=True)
        return hashlib.sha256(signature_str.encode()).hexdigest()[:16]

    @staticmethod
    def _safe_module_name(action: Any) -> str:
        """Turn an action name into a file stem that cannot leave the library directory."""
        # Anything that is not a letter, digit or underscore (path separators,
        # dots, spaces, ...) becomes an underscore.
        cleaned = "".join(
            ch if (ch.isalnum() or ch == "_") else "_" for ch in str(action)
        )
        return cleaned or "unknown_action"

    def _resolve_filename(self, action: Any, signature: str) -> str:
        """Pick the module filename for ``signature`` without clobbering another extractor."""
        stem = self._safe_module_name(action)
        preferred = f"{stem}.py"

        # Filenames already owned by a *different* catalogued extractor.
        taken = {
            info.get('filename')
            for other_signature, info in self.catalog.items()
            if other_signature != signature
        }
        if preferred not in taken:
            return preferred

        # Same action with different params: keep both modules side by side.
        return f"{stem}_{signature}.py"

    def get_or_create(self, llm_feature: Dict[str, Any], extractor_code: str) -> Path:
        """
        Get existing extractor from library or add new one.

        If the signature is already catalogued and its module exists on disk,
        that module is returned and ``extractor_code`` is ignored. Otherwise the
        code is written to the library and the catalog is updated and saved.

        The module is named ``<action>.py``. When that filename already belongs
        to a different extractor (same action, different params), the new module
        is named ``<action>_<signature>.py`` so the existing one is not
        overwritten.

        Args:
            llm_feature: LLM feature specification (action, domain, params)
            extractor_code: Generated Python code for the extractor

        Returns:
            Path to extractor module in core library
        """
        # Compute signature to check if extractor already exists
        signature = self._compute_signature(llm_feature)

        # Check if extractor already exists in library
        known = self.catalog.get(signature)
        if known is not None:
            extractor_file = self.library_dir / known['filename']
            if extractor_file.exists():
                logger.info(f"Reusing existing extractor: {known['name']}")
                return extractor_file
            # Catalogued but missing on disk: fall through and regenerate it.

        # Create new extractor in library
        action = llm_feature.get('action') or 'unknown_action'
        filename = self._resolve_filename(action, signature)
        extractor_file = self.library_dir / filename

        # Python source files are UTF-8 by default, so write them as UTF-8
        # regardless of the platform's default encoding.
        extractor_file.write_text(extractor_code, encoding='utf-8')

        # Add to catalog
        self.catalog[signature] = {
            'name': action,
            'filename': filename,
            'action': llm_feature.get('action'),
            'domain': llm_feature.get('domain'),
            'description': llm_feature.get('description', ''),
            'params': llm_feature.get('params', {}),
            'signature': signature
        }
        self._save_catalog()

        logger.info(f"Added new extractor to library: {action}")
        return extractor_file

    def get_extractor_metadata(self, signature: str) -> Optional[Dict[str, Any]]:
        """Get metadata for an extractor by its signature, or None if unknown."""
        return self.catalog.get(signature)

    def list_extractors(self) -> List[Dict[str, Any]]:
        """List the metadata of all extractors in the library."""
        return list(self.catalog.values())

    def get_library_summary(self) -> str:
        """Generate human-readable summary of library contents."""
        banner = "=" * 80
        lines = [
            banner,
            "ATOMIZER EXTRACTOR LIBRARY",
            banner,
            "",
            f"Location: {self.library_dir}",
            f"Total extractors: {len(self.catalog)}",
            "",
        ]

        if self.catalog:
            lines.append("Available Extractors:")
            lines.append("-" * 80)

            for signature, info in self.catalog.items():
                # The leading newline leaves a blank line between entries.
                lines.extend([
                    f"\n{info['name']}",
                    f"  Domain: {info['domain']}",
                    f"  Description: {info['description']}",
                    f"  File: {info['filename']}",
                    f"  Signature: {signature}",
                ])
        else:
            lines.append("Library is empty. Extractors will be added on first use.")

        lines.append("")
        lines.append(banner)

        return "\n".join(lines)


def create_study_manifest(extractors_used: List[str], output_dir: Path):
    """
    Create a manifest file documenting which extractors were used in a study.

    This replaces the old approach of copying extractor code into study folders.
    Now we just record which library extractors were used.

    The manifest is written to ``<output_dir>/extractors_manifest.json``; the
    directory is created if it does not exist yet.

    Args:
        extractors_used: Extractor signatures used in this study (any iterable
            of strings; it is stored as a JSON list)
        output_dir: Study output directory (a ``Path`` or a path string)
    """
    manifest = {
        # list() so that tuples, sets or generators serialize as a JSON array
        'extractors_used': list(extractors_used),
        'extractor_library': 'optimization_engine/extractors/',
        'note': 'Extractors are stored in the core library, not in this study folder'
    }

    # Accept plain strings and make sure the study folder exists before writing.
    study_dir = Path(output_dir)
    study_dir.mkdir(parents=True, exist_ok=True)

    manifest_file = study_dir / "extractors_manifest.json"
    with open(manifest_file, 'w', encoding='utf-8') as f:
        json.dump(manifest, f, indent=2)

    logger.info(f"Study manifest created: {manifest_file}")


if __name__ == '__main__':
    # Smoke test for the extractor library system.
    #
    # The deduplication check runs against a throw-away library in a temporary
    # directory, so the stub extractor below is never written into (and never
    # overwrites a module in) the real core library.
    import tempfile

    # Show what the real library currently contains
    library = ExtractorLibrary()
    print(library.get_library_summary())

    # Feature/code pair used for the deduplication check
    test_feature = {
        'action': 'extract_displacement',
        'domain': 'result_extraction',
        'description': 'Extract displacement from OP2 file',
        'params': {'result_type': 'displacement', 'metric': 'max'}
    }

    test_code = '''"""Extract displacement from OP2 file."""
def extract_displacement(op2_file):
    # Implementation here
    pass
'''

    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_library = ExtractorLibrary(Path(scratch_dir) / "extractors")

        # First call adds the extractor to the scratch library
        extractor_path = scratch_library.get_or_create(test_feature, test_code)
        print(f"\nExtractor created/retrieved: {extractor_path}")

        # Try to add it again - should reuse existing
        extractor_path2 = scratch_library.get_or_create(test_feature, test_code)
        print(f"Second call (should reuse): {extractor_path2}")

        # Verify they're the same; raised explicitly so the check still runs
        # when Python is started with -O (which strips assert statements).
        if extractor_path != extractor_path2:
            raise AssertionError("Should reuse existing extractor!")

    print("\n[SUCCESS] Extractor deduplication working correctly!")
refactor: Implement centralized extractor library to eliminate code duplication

MAJOR ARCHITECTURE REFACTOR - Clean Study Folders

Problem Identified by User:
"My study folder is a mess, why? I want some order and real structure to develop an insanely good engineering software that evolve with time."
- Every substudy was generating duplicate extractor code
- Study folders polluted with reusable library code (generated_extractors/, generated_hooks/)
- No code reuse across studies
- Not production-grade architecture

Solution - Centralized Library System:
Implemented smart library with signature-based deduplication:
- Core extractors in optimization_engine/extractors/
- Studies only store metadata (extractors_manifest.json)
- Clean separation: studies = data, core = code

Architecture:

BEFORE (BAD):
  studies/my_study/
    generated_extractors/            ❌ Code pollution!
      extract_displacement.py
      extract_von_mises_stress.py
    generated_hooks/                 ❌ Code pollution!
    llm_workflow_config.json
    results.json

AFTER (GOOD):
  optimization_engine/extractors/    ✓ Core library
    extract_displacement.py
    extract_stress.py
    catalog.json
  studies/my_study/
    extractors_manifest.json         ✓ Just references!
    llm_workflow_config.json         ✓ Config
    optimization_results.json        ✓ Results

New Components:
1. ExtractorLibrary (extractor_library.py)
   - Signature-based deduplication
   - Centralized catalog (catalog.json)
   - Study manifest generation
   - Reusability across all studies
2. Updated ExtractorOrchestrator
   - Uses core library instead of per-study generation
   - Creates manifest instead of copying code
   - Backward compatible (legacy mode available)
3. Updated LLMOptimizationRunner
   - Removed generated_extractors/ directory creation
   - Removed generated_hooks/ directory creation
   - Uses core library exclusively
4. Updated Tests
   - Verifies extractors_manifest.json exists
   - Checks for clean study folder structure
   - All 18/18 checks pass

Results:
Study folders NOW ONLY contain:
  ✓ extractors_manifest.json - references to core library
  ✓ llm_workflow_config.json - study configuration
  ✓ optimization_results.json - optimization results
  ✓ optimization_history.json - trial history
  ✓ .db file - Optuna database
Core library contains:
  ✓ extract_displacement.py - reusable across ALL studies
  ✓ extract_von_mises_stress.py - reusable across ALL studies
  ✓ extract_mass.py - reusable across ALL studies
  ✓ catalog.json - tracks all extractors with signatures

Benefits:
- Clean, professional study folder structure
- Code reuse eliminates duplication
- Library grows over time, studies stay clean
- Production-grade architecture
- "Insanely good engineering software that evolves with time"

Testing:
E2E test passes with clean folder structure
- No generated_extractors/ pollution
- Manifest correctly references library
- Core library populated with reusable extractors
- Study folder professional and minimal

Documentation:
- Added comprehensive architecture doc (docs/ARCHITECTURE_REFACTOR_NOV17.md)
- Includes migration guide
- Documents future work (hooks library, versioning, CLI tools)

Next Steps:
- Apply same architecture to hooks library
- Add auto-generated documentation for library
- Implement versioning for reproducibility

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

2025-11-18 09:00:10 -05:00			`"""`
			`Extractor Library Manager - Phase 3.2 Architecture Refactor`

			`Manages a centralized library of reusable extractors to prevent code duplication`
			`and keep study folders clean.`

			`Architecture Principles:`
			`1. Reusable extractors stored in optimization_engine/extractors/`
			`2. Study folders only contain metadata (which extractors were used)`
			`3. First-time generation adds to library with documentation`
			`4. Subsequent requests reuse existing library code`

			`Author: Antoine Letarte`
			`Date: 2025-11-17`
			`Phase: 3.2 Architecture Refactor`
			`"""`

			`import json`
			`import hashlib`
			`from pathlib import Path`
			`from typing import Dict, Any, List, Optional`
			`import logging`

			`logger = logging.getLogger(__name__)`


			`class ExtractorLibrary:`
			`"""`
			`Centralized library of reusable FEA result extractors.`

			`Prevents code duplication by maintaining a core library of extractors`
			`that can be reused across all optimization studies.`
			`"""`

			`def __init__(self, library_dir: Optional[Path] = None):`
			`"""`
			`Initialize extractor library.`

			`Args:`
			`library_dir: Directory for core extractor library`
			`(default: optimization_engine/extractors/)`
			`"""`
			`if library_dir is None:`
			`library_dir = Path(__file__).parent / "extractors"`

			`self.library_dir = Path(library_dir)`
			`self.library_dir.mkdir(parents=True, exist_ok=True)`

			`# Create __init__.py for Python package`
			`init_file = self.library_dir / "__init__.py"`
			`if not init_file.exists():`
			`init_file.write_text('"""Core extractor library for Atomizer."""\n')`

			`# Library catalog - tracks all available extractors`
			`self.catalog_file = self.library_dir / "catalog.json"`
			`self.catalog = self._load_catalog()`

			`logger.info(f"Extractor library initialized: {self.library_dir}")`
			`logger.info(f"Library contains {len(self.catalog)} extractors")`

			`def _load_catalog(self) -> Dict[str, Any]:`
			`"""Load extractor catalog from disk."""`
			`if self.catalog_file.exists():`
			`with open(self.catalog_file) as f:`
			`return json.load(f)`
			`return {}`

			`def _save_catalog(self):`
			`"""Save extractor catalog to disk."""`
			`with open(self.catalog_file, 'w') as f:`
			`json.dump(self.catalog, f, indent=2)`

			`def _compute_signature(self, llm_feature: Dict[str, Any]) -> str:`
			`"""`
			`Compute unique signature for an extractor based on its functionality.`

			`Two extractors are considered identical if they have the same:`
			`- Action (e.g., extract_displacement)`
			`- Domain (e.g., result_extraction)`
			`- Key parameters (e.g., result_type, metric)`
			`"""`
			`# Normalize the feature specification`
			`signature_data = {`
			`'action': llm_feature.get('action', ''),`
			`'domain': llm_feature.get('domain', ''),`
			`'params': llm_feature.get('params', {})`
			`}`

			`# Create deterministic hash`
			`signature_str = json.dumps(signature_data, sort_keys=True)`
			`return hashlib.sha256(signature_str.encode()).hexdigest()[:16]`

			`def get_or_create(self, llm_feature: Dict[str, Any], extractor_code: str) -> Path:`
			`"""`
			`Get existing extractor from library or add new one.`

			`Args:`
			`llm_feature: LLM feature specification (action, domain, params)`
			`extractor_code: Generated Python code for the extractor`

			`Returns:`
			`Path to extractor module in core library`
			`"""`
			`# Compute signature to check if extractor already exists`
			`signature = self._compute_signature(llm_feature)`

			`# Check if extractor already exists in library`
			`if signature in self.catalog:`
			`extractor_info = self.catalog[signature]`
			`extractor_file = self.library_dir / extractor_info['filename']`

			`if extractor_file.exists():`
			`logger.info(f"Reusing existing extractor: {extractor_info['name']}")`
			`return extractor_file`

			`# Create new extractor in library`
			`action = llm_feature.get('action', 'unknown_action')`
			`filename = f"{action}.py"`
			`extractor_file = self.library_dir / filename`

			`# Write extractor code to library`
			`extractor_file.write_text(extractor_code)`

			`# Add to catalog`
			`self.catalog[signature] = {`
			`'name': action,`
			`'filename': filename,`
			`'action': llm_feature.get('action'),`
			`'domain': llm_feature.get('domain'),`
			`'description': llm_feature.get('description', ''),`
			`'params': llm_feature.get('params', {}),`
			`'signature': signature`
			`}`
			`self._save_catalog()`

			`logger.info(f"Added new extractor to library: {action}")`
			`return extractor_file`

			`def get_extractor_metadata(self, signature: str) -> Optional[Dict[str, Any]]:`
			`"""Get metadata for an extractor by its signature."""`
			`return self.catalog.get(signature)`

			`def list_extractors(self) -> List[Dict[str, Any]]:`
			`"""List all extractors in the library."""`
			`return list(self.catalog.values())`

			`def get_library_summary(self) -> str:`
			`"""Generate human-readable summary of library contents."""`
			`lines = []`
			`lines.append("=" * 80)`
			`lines.append("ATOMIZER EXTRACTOR LIBRARY")`
			`lines.append("=" * 80)`
			`lines.append("")`
			`lines.append(f"Location: {self.library_dir}")`
			`lines.append(f"Total extractors: {len(self.catalog)}")`
			`lines.append("")`

			`if self.catalog:`
			`lines.append("Available Extractors:")`
			`lines.append("-" * 80)`

			`for signature, info in self.catalog.items():`
			`lines.append(f"\n{info['name']}")`
			`lines.append(f" Domain: {info['domain']}")`
			`lines.append(f" Description: {info['description']}")`
			`lines.append(f" File: {info['filename']}")`
			`lines.append(f" Signature: {signature}")`
			`else:`
			`lines.append("Library is empty. Extractors will be added on first use.")`

			`lines.append("")`
			`lines.append("=" * 80)`

			`return "\n".join(lines)`


			`def create_study_manifest(extractors_used: List[str], output_dir: Path):`
			`"""`
			`Create a manifest file documenting which extractors were used in a study.`

			`This replaces the old approach of copying extractor code into study folders.`
			`Now we just record which library extractors were used.`

			`Args:`
			`extractors_used: List of extractor signatures used in this study`
			`output_dir: Study output directory`
			`"""`
			`manifest = {`
			`'extractors_used': extractors_used,`
			`'extractor_library': 'optimization_engine/extractors/',`
			`'note': 'Extractors are stored in the core library, not in this study folder'`
			`}`

			`manifest_file = output_dir / "extractors_manifest.json"`
			`with open(manifest_file, 'w') as f:`
			`json.dump(manifest, f, indent=2)`

			`logger.info(f"Study manifest created: {manifest_file}")`


			`if __name__ == '__main__':`
			`"""Test the extractor library system."""`

			`# Initialize library`
			`library = ExtractorLibrary()`

			`# Print summary`
			`print(library.get_library_summary())`

			`# Test adding an extractor`
			`test_feature = {`
			`'action': 'extract_displacement',`
			`'domain': 'result_extraction',`
			`'description': 'Extract displacement from OP2 file',`
			`'params': {'result_type': 'displacement', 'metric': 'max'}`
			`}`

			`test_code = '''"""Extract displacement from OP2 file."""`
			`def extract_displacement(op2_file):`
			`# Implementation here`
			`pass`
			`'''`

			`extractor_path = library.get_or_create(test_feature, test_code)`
			`print(f"\nExtractor created/retrieved: {extractor_path}")`

			`# Try to add it again - should reuse existing`
			`extractor_path2 = library.get_or_create(test_feature, test_code)`
			`print(f"Second call (should reuse): {extractor_path2}")`

			`# Verify they're the same`
			`assert extractor_path == extractor_path2, "Should reuse existing extractor!"`
			`print("\n[SUCCESS] Extractor deduplication working correctly!")`