Files
Atomizer/optimization_engine/extractor_library.py

234 lines
8.0 KiB
Python
Raw Normal View History

refactor: Implement centralized extractor library to eliminate code duplication MAJOR ARCHITECTURE REFACTOR - Clean Study Folders Problem Identified by User: "My study folder is a mess, why? I want some order and real structure to develop an insanely good engineering software that evolve with time." - Every substudy was generating duplicate extractor code - Study folders polluted with reusable library code (generated_extractors/, generated_hooks/) - No code reuse across studies - Not production-grade architecture Solution - Centralized Library System: Implemented smart library with signature-based deduplication: - Core extractors in optimization_engine/extractors/ - Studies only store metadata (extractors_manifest.json) - Clean separation: studies = data, core = code Architecture: BEFORE (BAD): studies/my_study/ generated_extractors/ ❌ Code pollution! extract_displacement.py extract_von_mises_stress.py generated_hooks/ ❌ Code pollution! llm_workflow_config.json results.json AFTER (GOOD): optimization_engine/extractors/ ✓ Core library extract_displacement.py extract_stress.py catalog.json studies/my_study/ extractors_manifest.json ✓ Just references! llm_workflow_config.json ✓ Config optimization_results.json ✓ Results New Components: 1. ExtractorLibrary (extractor_library.py) - Signature-based deduplication - Centralized catalog (catalog.json) - Study manifest generation - Reusability across all studies 2. Updated ExtractorOrchestrator - Uses core library instead of per-study generation - Creates manifest instead of copying code - Backward compatible (legacy mode available) 3. Updated LLMOptimizationRunner - Removed generated_extractors/ directory creation - Removed generated_hooks/ directory creation - Uses core library exclusively 4. 
Updated Tests - Verifies extractors_manifest.json exists - Checks for clean study folder structure - All 18/18 checks pass Results: Study folders NOW ONLY contain: ✓ extractors_manifest.json - references to core library ✓ llm_workflow_config.json - study configuration ✓ optimization_results.json - optimization results ✓ optimization_history.json - trial history ✓ .db file - Optuna database Core library contains: ✓ extract_displacement.py - reusable across ALL studies ✓ extract_von_mises_stress.py - reusable across ALL studies ✓ extract_mass.py - reusable across ALL studies ✓ catalog.json - tracks all extractors with signatures Benefits: - Clean, professional study folder structure - Code reuse eliminates duplication - Library grows over time, studies stay clean - Production-grade architecture - "Insanely good engineering software that evolves with time" Testing: E2E test passes with clean folder structure - No generated_extractors/ pollution - Manifest correctly references library - Core library populated with reusable extractors - Study folder professional and minimal Documentation: - Added comprehensive architecture doc (docs/ARCHITECTURE_REFACTOR_NOV17.md) - Includes migration guide - Documents future work (hooks library, versioning, CLI tools) Next Steps: - Apply same architecture to hooks library - Add auto-generated documentation for library - Implement versioning for reproducibility 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-18 09:00:10 -05:00
"""
Extractor Library Manager - Phase 3.2 Architecture Refactor
Manages a centralized library of reusable extractors to prevent code duplication
and keep study folders clean.
Architecture Principles:
1. Reusable extractors stored in optimization_engine/extractors/
2. Study folders only contain metadata (which extractors were used)
3. First-time generation adds to library with documentation
4. Subsequent requests reuse existing library code
Author: Antoine Letarte
Date: 2025-11-17
Phase: 3.2 Architecture Refactor
"""
import json
import hashlib
from pathlib import Path
from typing import Dict, Any, List, Optional
import logging
logger = logging.getLogger(__name__)
class ExtractorLibrary:
    """
    Centralized library of reusable FEA result extractors.

    Prevents code duplication by maintaining a core library of extractors
    that can be reused across all optimization studies.

    Each extractor is identified by a signature (a hash of its action, domain
    and params). The catalog (catalog.json in the library directory) maps every
    signature to the metadata and filename of the module that implements it.
    """

    def __init__(self, library_dir: Optional[Path] = None):
        """
        Initialize extractor library.

        Creates the library directory and its __init__.py when missing, then
        loads the catalog from disk.

        Args:
            library_dir: Directory for core extractor library
                (default: the "extractors" folder next to this module)
        """
        if library_dir is None:
            library_dir = Path(__file__).parent / "extractors"

        self.library_dir = Path(library_dir)
        self.library_dir.mkdir(parents=True, exist_ok=True)

        # Create __init__.py so the library directory is a Python package
        init_file = self.library_dir / "__init__.py"
        if not init_file.exists():
            init_file.write_text(
                '"""Core extractor library for Atomizer."""\n',
                encoding="utf-8",
            )

        # Library catalog - tracks all available extractors, keyed by signature
        self.catalog_file = self.library_dir / "catalog.json"
        self.catalog = self._load_catalog()

        logger.info(f"Extractor library initialized: {self.library_dir}")
        logger.info(f"Library contains {len(self.catalog)} extractors")

    def _load_catalog(self) -> Dict[str, Any]:
        """
        Load extractor catalog from disk.

        Returns:
            The parsed catalog, or an empty dict when no catalog file exists.

        Raises:
            json.JSONDecodeError: If the catalog file is not valid JSON.
        """
        if not self.catalog_file.exists():
            return {}
        # Explicit UTF-8 so the catalog reads the same on every platform
        with open(self.catalog_file, encoding="utf-8") as f:
            return json.load(f)

    def _save_catalog(self):
        """Save extractor catalog to disk as indented JSON."""
        with open(self.catalog_file, 'w', encoding="utf-8") as f:
            json.dump(self.catalog, f, indent=2)

    def _compute_signature(self, llm_feature: Dict[str, Any]) -> str:
        """
        Compute unique signature for an extractor based on its functionality.

        Two extractors are considered identical if they have the same:
        - Action (e.g., extract_displacement)
        - Domain (e.g., result_extraction)
        - Key parameters (e.g., result_type, metric)

        Args:
            llm_feature: LLM feature specification (action, domain, params)

        Returns:
            First 16 hex characters of the SHA-256 of the normalized spec.
        """
        # Only these three fields define identity; 'description' is ignored.
        # Values are hashed exactly as given so existing catalogs stay valid.
        signature_data = {
            'action': llm_feature.get('action', ''),
            'domain': llm_feature.get('domain', ''),
            'params': llm_feature.get('params', {})
        }

        # sort_keys makes the JSON (and therefore the hash) deterministic
        signature_str = json.dumps(signature_data, sort_keys=True)
        return hashlib.sha256(signature_str.encode()).hexdigest()[:16]

    def _choose_filename(self, action: Any, signature: str) -> str:
        """
        Pick the module filename for a new extractor.

        The action is reduced to letters, digits and underscores so it can
        never escape the library directory. When another signature already
        owns "<action>.py", the signature is appended so that the two
        extractors do not overwrite each other's code.

        Args:
            action: Action name from the feature specification
            signature: Signature of the extractor being stored

        Returns:
            A filename (not a path) inside the library directory.
        """
        module_name = ''.join(
            ch if (ch.isalnum() or ch == '_') else '_' for ch in str(action)
        )
        filename = f"{module_name}.py"

        # Filenames claimed by *other* signatures in the catalog
        taken = {
            info.get('filename')
            for other_signature, info in self.catalog.items()
            if other_signature != signature
        }
        if filename in taken:
            filename = f"{module_name}_{signature}.py"
        return filename

    def get_or_create(self, llm_feature: Dict[str, Any], extractor_code: str) -> Path:
        """
        Get existing extractor from library or add new one.

        Args:
            llm_feature: LLM feature specification (action, domain, params)
            extractor_code: Generated Python code for the extractor

        Returns:
            Path to extractor module in core library
        """
        # Compute signature to check if extractor already exists
        signature = self._compute_signature(llm_feature)

        # Reuse the cataloged module when its file is still on disk
        known = self.catalog.get(signature)
        if known is not None:
            existing_file = self.library_dir / known['filename']
            if existing_file.exists():
                logger.info(f"Reusing existing extractor: {known['name']}")
                return existing_file

        # Create new extractor in library (also reached when the catalog
        # entry exists but its file has gone missing)
        action = llm_feature.get('action') or 'unknown_action'
        filename = self._choose_filename(action, signature)
        extractor_file = self.library_dir / filename

        # Write extractor code to library
        extractor_file.write_text(extractor_code, encoding="utf-8")

        # Add to catalog
        self.catalog[signature] = {
            'name': action,
            'filename': filename,
            'action': llm_feature.get('action'),
            'domain': llm_feature.get('domain'),
            'description': llm_feature.get('description', ''),
            'params': llm_feature.get('params', {}),
            'signature': signature
        }
        self._save_catalog()

        logger.info(f"Added new extractor to library: {action}")
        return extractor_file

    def get_extractor_metadata(self, signature: str) -> Optional[Dict[str, Any]]:
        """Get metadata for an extractor by its signature (None if unknown)."""
        return self.catalog.get(signature)

    def list_extractors(self) -> List[Dict[str, Any]]:
        """List the metadata of all extractors in the library."""
        return list(self.catalog.values())

    def get_library_summary(self) -> str:
        """Generate human-readable summary of library contents."""
        rule = "=" * 80
        lines = [
            rule,
            "ATOMIZER EXTRACTOR LIBRARY",
            rule,
            "",
            f"Location: {self.library_dir}",
            f"Total extractors: {len(self.catalog)}",
            "",
        ]

        if self.catalog:
            lines.append("Available Extractors:")
            lines.append("-" * 80)
            for signature, info in self.catalog.items():
                lines.append(f"\n{info['name']}")
                lines.append(f" Domain: {info['domain']}")
                lines.append(f" Description: {info['description']}")
                lines.append(f" File: {info['filename']}")
                lines.append(f" Signature: {signature}")
        else:
            lines.append("Library is empty. Extractors will be added on first use.")

        lines.append("")
        lines.append(rule)
        return "\n".join(lines)
def create_study_manifest(extractors_used: List[str], output_dir: Path):
    """
    Create a manifest file documenting which extractors were used in a study.

    This replaces the old approach of copying extractor code into study folders.
    Now we just record which library extractors were used.

    The manifest is written to "extractors_manifest.json" inside output_dir;
    the directory is created when it does not exist yet.

    Args:
        extractors_used: List of extractor signatures used in this study
        output_dir: Study output directory (a Path or a path string)
    """
    # Accept plain strings as well as Path objects, and make sure the
    # study folder exists before writing into it
    study_dir = Path(output_dir)
    study_dir.mkdir(parents=True, exist_ok=True)

    manifest = {
        # list() so any iterable of signatures (tuple, set, ...) serializes
        'extractors_used': list(extractors_used),
        'extractor_library': 'optimization_engine/extractors/',
        'note': 'Extractors are stored in the core library, not in this study folder'
    }

    manifest_file = study_dir / "extractors_manifest.json"
    with open(manifest_file, 'w', encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)

    logger.info(f"Study manifest created: {manifest_file}")
if __name__ == '__main__':
    # Smoke test for the extractor library system.
    import tempfile

    # Print summary of the real core library
    print(ExtractorLibrary().get_library_summary())

    # The write test runs against a throw-away library so the stub extractor
    # below can never overwrite a real module in the core library.
    with tempfile.TemporaryDirectory() as scratch_dir:
        library = ExtractorLibrary(Path(scratch_dir) / "extractors")

        # Test adding an extractor
        test_feature = {
            'action': 'extract_displacement',
            'domain': 'result_extraction',
            'description': 'Extract displacement from OP2 file',
            'params': {'result_type': 'displacement', 'metric': 'max'}
        }

        test_code = '''"""Extract displacement from OP2 file."""
def extract_displacement(op2_file):
    # Implementation here
    pass
'''

        extractor_path = library.get_or_create(test_feature, test_code)
        print(f"\nExtractor created/retrieved: {extractor_path}")

        # Try to add it again - should reuse existing
        extractor_path2 = library.get_or_create(test_feature, test_code)
        print(f"Second call (should reuse): {extractor_path2}")

        # Verify they're the same (explicit raise so the check survives -O)
        if extractor_path != extractor_path2:
            raise AssertionError("Should reuse existing extractor!")

        print("\n[SUCCESS] Extractor deduplication working correctly!")