""" Model Cleanup System Intelligent cleanup of trial model files to save disk space. Keeps top-N trials based on objective value, deletes CAD/FEM files for poor trials. Strategy: - Preserve ALL trial results.json files (small, contain critical data) - Delete large CAD/FEM files (.prt, .sim, .fem, .op2, .f06) for non-top-N trials - Keep best trial models + user-specified number of top trials """ from pathlib import Path from typing import Dict, List, Optional import json import shutil class ModelCleanup: """ Clean up trial directories to save disk space. Deletes large model files (.prt, .sim, .fem, .op2, .f06) from trials that are not in the top-N performers. """ # File extensions to delete (large CAD/FEM/result files) CLEANUP_EXTENSIONS = { '.prt', # NX part files '.sim', # NX simulation files '.fem', # FEM mesh files '.afm', # NX assembly FEM '.op2', # Nastran binary results '.f06', # Nastran text results '.dat', # Nastran input deck '.bdf', # Nastran bulk data '.pch', # Nastran punch file '.log', # Nastran log '.master', # Nastran master file '.dball', # Nastran database '.MASTER', # Nastran master (uppercase) '.DBALL', # Nastran database (uppercase) } # Files to ALWAYS keep (small, critical data) PRESERVE_FILES = { 'results.json', 'trial_metadata.json', 'extraction_log.txt', } def __init__(self, substudy_dir: Path): """ Initialize cleanup manager. Args: substudy_dir: Path to substudy directory containing trial_XXX folders """ self.substudy_dir = Path(substudy_dir) self.history_file = self.substudy_dir / 'history.json' self.cleanup_log = self.substudy_dir / 'cleanup_log.json' def cleanup_models( self, keep_top_n: int = 10, dry_run: bool = False ) -> Dict: """ Clean up trial model files, keeping only top-N performers. Args: keep_top_n: Number of best trials to keep models for dry_run: If True, only report what would be deleted without deleting Returns: Dictionary with cleanup statistics """ if not self.history_file.exists(): raise FileNotFoundError(f"History file not found: {self.history_file}") # Load history with open(self.history_file, 'r') as f: history = json.load(f) # Sort trials by objective value (minimize) sorted_trials = sorted(history, key=lambda x: x.get('total_objective', float('inf'))) # Identify top-N trials to keep keep_trial_numbers = set() for i in range(min(keep_top_n, len(sorted_trials))): keep_trial_numbers.add(sorted_trials[i]['trial_number']) # Cleanup statistics stats = { 'total_trials': len(history), 'kept_trials': len(keep_trial_numbers), 'cleaned_trials': 0, 'files_deleted': 0, 'space_freed_mb': 0.0, 'deleted_files': [], 'kept_trial_numbers': sorted(list(keep_trial_numbers)), 'dry_run': dry_run } # Process each trial directory trial_dirs = sorted(self.substudy_dir.glob('trial_*')) for trial_dir in trial_dirs: if not trial_dir.is_dir(): continue # Extract trial number from directory name try: trial_num = int(trial_dir.name.split('_')[-1]) except (ValueError, IndexError): continue # Skip if this trial should be kept if trial_num in keep_trial_numbers: continue # Clean up this trial trial_stats = self._cleanup_trial_directory(trial_dir, dry_run) stats['files_deleted'] += trial_stats['files_deleted'] stats['space_freed_mb'] += trial_stats['space_freed_mb'] stats['deleted_files'].extend(trial_stats['deleted_files']) if trial_stats['files_deleted'] > 0: stats['cleaned_trials'] += 1 # Save cleanup log if not dry_run: with open(self.cleanup_log, 'w') as f: json.dump(stats, f, indent=2) return stats def _cleanup_trial_directory(self, trial_dir: Path, dry_run: bool) -> Dict: """ Clean up a single trial directory. Args: trial_dir: Path to trial directory dry_run: If True, don't actually delete files Returns: Dictionary with cleanup statistics for this trial """ stats = { 'files_deleted': 0, 'space_freed_mb': 0.0, 'deleted_files': [] } for file_path in trial_dir.iterdir(): if not file_path.is_file(): continue # Skip preserved files if file_path.name in self.PRESERVE_FILES: continue # Check if file should be deleted if file_path.suffix.lower() in self.CLEANUP_EXTENSIONS: file_size_mb = file_path.stat().st_size / (1024 * 1024) stats['files_deleted'] += 1 stats['space_freed_mb'] += file_size_mb stats['deleted_files'].append(str(file_path.relative_to(self.substudy_dir))) # Delete file (unless dry run) if not dry_run: try: file_path.unlink() except Exception as e: print(f"Warning: Could not delete {file_path}: {e}") return stats def print_cleanup_report(self, stats: Dict): """ Print human-readable cleanup report. Args: stats: Cleanup statistics dictionary """ print("\n" + "="*70) print("MODEL CLEANUP REPORT") print("="*70) if stats['dry_run']: print("[DRY RUN - No files were actually deleted]") print() print(f"Total trials: {stats['total_trials']}") print(f"Trials kept: {stats['kept_trials']}") print(f"Trials cleaned: {stats['cleaned_trials']}") print(f"Files deleted: {stats['files_deleted']}") print(f"Space freed: {stats['space_freed_mb']:.2f} MB") print() print(f"Kept trial numbers: {stats['kept_trial_numbers']}") print() if stats['files_deleted'] > 0: print("Deleted file types:") file_types = {} for filepath in stats['deleted_files']: ext = Path(filepath).suffix.lower() file_types[ext] = file_types.get(ext, 0) + 1 for ext, count in sorted(file_types.items()): print(f" {ext:15s}: {count:4d} files") print("="*70 + "\n") def cleanup_substudy( substudy_dir: Path, keep_top_n: int = 10, dry_run: bool = False, verbose: bool = True ) -> Dict: """ Convenience function to clean up a substudy. Args: substudy_dir: Path to substudy directory keep_top_n: Number of best trials to preserve models for dry_run: If True, only report what would be deleted verbose: If True, print cleanup report Returns: Cleanup statistics dictionary """ cleaner = ModelCleanup(substudy_dir) stats = cleaner.cleanup_models(keep_top_n=keep_top_n, dry_run=dry_run) if verbose: cleaner.print_cleanup_report(stats) return stats if __name__ == '__main__': import sys import argparse parser = argparse.ArgumentParser( description='Clean up optimization trial model files to save disk space' ) parser.add_argument( 'substudy_dir', type=Path, help='Path to substudy directory' ) parser.add_argument( '--keep-top-n', type=int, default=10, help='Number of best trials to keep models for (default: 10)' ) parser.add_argument( '--dry-run', action='store_true', help='Show what would be deleted without actually deleting' ) args = parser.parse_args() cleanup_substudy( args.substudy_dir, keep_top_n=args.keep_top_n, dry_run=args.dry_run )