"""
Extractor Library Manager - Phase 3.2 Architecture Refactor

Manages a centralized library of reusable extractors to prevent code
duplication and keep study folders clean.

Architecture Principles:
1. Reusable extractors stored in optimization_engine/extractors/
2. Study folders only contain metadata (which extractors were used)
3. First-time generation adds to library with documentation
4. Subsequent requests reuse existing library code

Author: Antoine Letarte
Date: 2025-11-17
Phase: 3.2 Architecture Refactor
"""

import hashlib
import json
import logging
import re
import tempfile
from pathlib import Path
from typing import Dict, Any, List, Optional

logger = logging.getLogger(__name__)


class ExtractorLibrary:
    """
    Centralized library of reusable FEA result extractors.

    Prevents code duplication by maintaining a core library of extractors
    that can be reused across all optimization studies.

    The library is a directory of Python modules plus a ``catalog.json``
    file that maps a feature signature (see ``_compute_signature``) to the
    metadata of the module implementing it.
    """

    def __init__(self, library_dir: Optional[Path] = None):
        """
        Initialize extractor library.

        Creates the library directory and its ``__init__.py`` when missing,
        then loads the catalog from disk.

        Args:
            library_dir: Directory for core extractor library
                (default: the ``extractors`` folder next to this module)
        """
        if library_dir is None:
            library_dir = Path(__file__).parent / "extractors"

        self.library_dir = Path(library_dir)
        self.library_dir.mkdir(parents=True, exist_ok=True)

        # Create __init__.py so the library directory is a Python package
        init_file = self.library_dir / "__init__.py"
        if not init_file.exists():
            init_file.write_text(
                '"""Core extractor library for Atomizer."""\n',
                encoding="utf-8",
            )

        # Library catalog - tracks all available extractors
        self.catalog_file = self.library_dir / "catalog.json"
        self.catalog = self._load_catalog()

        logger.info("Extractor library initialized: %s", self.library_dir)
        logger.info("Library contains %s extractors", len(self.catalog))

    def _load_catalog(self) -> Dict[str, Any]:
        """
        Load extractor catalog from disk.

        Returns:
            The parsed contents of ``catalog.json``, or an empty dict when
            the file does not exist yet.

        Raises:
            json.JSONDecodeError: If the catalog file is not valid JSON.
        """
        if self.catalog_file.exists():
            with open(self.catalog_file, encoding="utf-8") as f:
                return json.load(f)
        return {}

    def _save_catalog(self):
        """
        Save extractor catalog to disk.

        The catalog is serialized in memory first and written through a
        temporary file that then replaces ``catalog.json``, so a
        serialization error or an interrupted write cannot leave a
        truncated catalog behind.
        """
        payload = json.dumps(self.catalog, indent=2)
        tmp_file = self.catalog_file.with_name(self.catalog_file.name + ".tmp")
        tmp_file.write_text(payload, encoding="utf-8")
        tmp_file.replace(self.catalog_file)

    def _compute_signature(self, llm_feature: Dict[str, Any]) -> str:
        """
        Compute unique signature for an extractor based on its functionality.

        Two extractors are considered identical if they have the same:
        - Action (e.g., extract_displacement)
        - Domain (e.g., result_extraction)
        - Key parameters (e.g., result_type, metric)

        Args:
            llm_feature: Feature specification; only the ``action``,
                ``domain`` and ``params`` keys take part in the signature.

        Returns:
            The first 16 hex characters of the SHA-256 digest of the
            key-sorted JSON encoding of those three fields.
        """
        # Normalize the feature specification
        signature_data = {
            'action': llm_feature.get('action', ''),
            'domain': llm_feature.get('domain', ''),
            'params': llm_feature.get('params', {})
        }

        # sort_keys makes the hash independent of dict insertion order
        signature_str = json.dumps(signature_data, sort_keys=True)
        return hashlib.sha256(signature_str.encode()).hexdigest()[:16]

    @staticmethod
    def _safe_module_name(action: Any) -> str:
        """Reduce an action name to characters that are safe in a filename."""
        # Anything outside [0-9A-Za-z_] (path separators, dots, spaces, ...)
        # becomes "_" so the resulting file always stays inside the library.
        cleaned = re.sub(r'[^0-9A-Za-z_]', '_', str(action))
        return cleaned or 'unknown_action'

    def _allocate_filename(self, action: Any, signature: str) -> str:
        """Pick a filename for a new extractor without clobbering another one."""
        stem = self._safe_module_name(action)
        taken = {
            info.get('filename')
            for info in self.catalog.values()
            if isinstance(info, dict)
        }

        candidate = f"{stem}.py"
        # Same action with different params yields a different signature but
        # the same plain name; disambiguate with the signature instead of
        # overwriting a module that belongs to another catalog entry (or an
        # uncatalogued file such as __init__.py).
        if candidate in taken or (self.library_dir / candidate).exists():
            candidate = f"{stem}_{signature}.py"
        return candidate

    def get_or_create(self, llm_feature: Dict[str, Any], extractor_code: str) -> Path:
        """
        Get existing extractor from library or add new one.

        Args:
            llm_feature: LLM feature specification (action, domain, params)
            extractor_code: Generated Python code for the extractor

        Returns:
            Path to extractor module in core library
        """
        # Compute signature to check if extractor already exists
        signature = self._compute_signature(llm_feature)

        # Check if extractor already exists in library
        existing = self.catalog.get(signature)
        if existing is not None:
            extractor_file = self.library_dir / existing['filename']
            if extractor_file.exists():
                logger.info("Reusing existing extractor: %s", existing['name'])
                return extractor_file

        action = llm_feature.get('action') or 'unknown_action'
        if existing is not None:
            # Catalogued but the module vanished from disk: regenerate it
            # under the filename the catalog already records.
            filename = existing['filename']
        else:
            filename = self._allocate_filename(action, signature)
        extractor_file = self.library_dir / filename

        # Write extractor code to library
        extractor_file.write_text(extractor_code, encoding="utf-8")

        # Add to catalog
        self.catalog[signature] = {
            'name': action,
            'filename': filename,
            'action': llm_feature.get('action'),
            'domain': llm_feature.get('domain'),
            'description': llm_feature.get('description', ''),
            'params': llm_feature.get('params', {}),
            'signature': signature
        }
        self._save_catalog()

        logger.info("Added new extractor to library: %s", action)
        return extractor_file

    def get_extractor_metadata(self, signature: str) -> Optional[Dict[str, Any]]:
        """Get metadata for an extractor by its signature (None if unknown)."""
        return self.catalog.get(signature)

    def list_extractors(self) -> List[Dict[str, Any]]:
        """List the metadata of all extractors in the library."""
        return list(self.catalog.values())

    def get_library_summary(self) -> str:
        """Generate human-readable summary of library contents."""
        rule = "=" * 80
        lines = [
            rule,
            "ATOMIZER EXTRACTOR LIBRARY",
            rule,
            "",
            f"Location: {self.library_dir}",
            f"Total extractors: {len(self.catalog)}",
            "",
        ]

        if self.catalog:
            lines.append("Available Extractors:")
            lines.append("-" * 80)
            for signature, info in self.catalog.items():
                lines.append(f"\n{info['name']}")
                lines.append(f"  Domain: {info['domain']}")
                lines.append(f"  Description: {info['description']}")
                lines.append(f"  File: {info['filename']}")
                lines.append(f"  Signature: {signature}")
        else:
            lines.append("Library is empty. Extractors will be added on first use.")

        lines.append("")
        lines.append(rule)
        return "\n".join(lines)


def create_study_manifest(extractors_used: List[str], output_dir: Path):
    """
    Create a manifest file documenting which extractors were used in a study.

    This replaces the old approach of copying extractor code into study
    folders. Now we just record which library extractors were used.

    Args:
        extractors_used: List of extractor signatures used in this study
        output_dir: Study output directory (a ``str`` is accepted as well;
            the directory is created when it does not exist)
    """
    manifest = {
        'extractors_used': extractors_used,
        'extractor_library': 'optimization_engine/extractors/',
        'note': 'Extractors are stored in the core library, not in this study folder'
    }

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    manifest_file = output_dir / "extractors_manifest.json"
    with open(manifest_file, 'w', encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)

    logger.info("Study manifest created: %s", manifest_file)


def _run_self_test() -> None:
    """Exercise add-then-reuse against a throwaway library directory."""
    test_feature = {
        'action': 'extract_displacement',
        'domain': 'result_extraction',
        'description': 'Extract displacement from OP2 file',
        'params': {'result_type': 'displacement', 'metric': 'max'}
    }

    test_code = '''"""Extract displacement from OP2 file."""

def extract_displacement(op2_file):
    # Implementation here
    pass
'''

    # A temporary directory keeps the placeholder extractor above out of the
    # real library, where it would be catalogued and reused by later
    # requests that share its signature.
    with tempfile.TemporaryDirectory() as scratch_dir:
        library = ExtractorLibrary(Path(scratch_dir) / "extractors")

        extractor_path = library.get_or_create(test_feature, test_code)
        print(f"\nExtractor created/retrieved: {extractor_path}")

        # Try to add it again - should reuse existing
        extractor_path2 = library.get_or_create(test_feature, test_code)
        print(f"Second call (should reuse): {extractor_path2}")

        # Explicit raise instead of assert so the check survives `python -O`
        if extractor_path != extractor_path2:
            raise AssertionError("Should reuse existing extractor!")

    print("\n[SUCCESS] Extractor deduplication working correctly!")


if __name__ == '__main__':
    # Show the contents of the default library, then run the round-trip
    # self-test in isolation.
    print(ExtractorLibrary().get_library_summary())
    _run_self_test()