""" Study Cleanup Utility ==================== Cleans up completed optimization studies to save disk space by removing large intermediate files (NX models, FEM meshes, solver results) while preserving essential data (parameters, extracted results, database). Usage: python -m optimization_engine.utils.study_cleanup [options] Options: --dry-run Show what would be deleted without actually deleting --keep-best N Keep iteration folders for the top N best trials --keep-pareto Keep all Pareto-optimal iterations (for multi-objective) --aggressive Delete ALL iteration data (only keep DB and config) The database (study.db) contains all optimization results and can regenerate any analysis. The original NX model in 1_setup is always preserved. """ import argparse import json import shutil import sqlite3 from pathlib import Path from typing import Optional # Files to ALWAYS keep in iteration folders (tiny, essential) ESSENTIAL_FILES = { 'params.exp', # Design parameters for this iteration '_temp_mass.txt', # Extracted mass '_temp_part_properties.json', # Part properties '_temp_zernike.json', # Zernike coefficients (if exists) 'results.json', # Any extracted results } # Extensions to DELETE (large, regenerable/already extracted) DELETABLE_EXTENSIONS = { '.op2', # Nastran binary results (~65 MB each) '.prt', # NX Part files (~30-35 MB each) '.fem', # FEM mesh files (~15 MB each) '.dat', # Nastran input deck (~15 MB each) '.sim', # Simulation file (~7 MB each) '.afm', # FEA auxiliary (~4 MB each) '.f04', # Nastran log '.f06', # Nastran output '.log', # Solver log '.diag', # Diagnostics } def get_study_info(study_path: Path) -> dict: """Get study metadata from config and database.""" config_path = study_path / 'optimization_config.json' # Try both possible DB locations db_path = study_path / '3_results' / 'study.db' if not db_path.exists(): db_path = study_path / '2_results' / 'study.db' info = { 'name': study_path.name, 'has_config': config_path.exists(), 'has_db': db_path.exists(), 'trial_count': 0, 'best_trials': [], 'pareto_trials': [], } if config_path.exists(): with open(config_path) as f: info['config'] = json.load(f) if db_path.exists(): conn = sqlite3.connect(db_path) cursor = conn.cursor() # Get trial count cursor.execute("SELECT COUNT(*) FROM trials WHERE state = 'COMPLETE'") info['trial_count'] = cursor.fetchone()[0] # Try to get best trials (for single objective) try: cursor.execute(""" SELECT trial_id, value FROM trial_values WHERE objective = 0 ORDER BY value ASC LIMIT 10 """) info['best_trials'] = [row[0] for row in cursor.fetchall()] except Exception as e: pass # Check for Pareto attribute try: cursor.execute(""" SELECT DISTINCT trial_id FROM trial_system_attrs WHERE key = 'pareto_optimal' AND value = '1' """) info['pareto_trials'] = [row[0] for row in cursor.fetchall()] except: pass conn.close() return info def calculate_cleanup_savings(study_path: Path, keep_iters: set = None) -> dict: """Calculate how much space would be saved by cleanup.""" iterations_path = study_path / '2_iterations' if not iterations_path.exists(): iterations_path = study_path / '1_working' # Legacy structure if not iterations_path.exists(): return {'total_size': 0, 'deletable_size': 0, 'keep_size': 0} total_size = 0 deletable_size = 0 keep_size = 0 keep_iters = keep_iters or set() for iter_folder in iterations_path.iterdir(): if not iter_folder.is_dir(): continue # Extract iteration number try: iter_num = int(iter_folder.name.replace('iter', '')) except: continue for f in iter_folder.iterdir(): if not f.is_file(): continue size = f.stat().st_size total_size += size # Keep entire folder if in keep_iters if iter_num in keep_iters: keep_size += size continue # Keep essential files if f.name.lower() in {e.lower() for e in ESSENTIAL_FILES}: keep_size += size elif f.suffix.lower() in DELETABLE_EXTENSIONS: deletable_size += size else: keep_size += size # Keep unknown files by default return { 'total_size': total_size, 'deletable_size': deletable_size, 'keep_size': keep_size, } def cleanup_study( study_path: Path, dry_run: bool = True, keep_best: int = 0, keep_pareto: bool = False, aggressive: bool = False, ) -> dict: """ Clean up a study to save disk space. Args: study_path: Path to study folder dry_run: If True, only report what would be deleted keep_best: Number of best iterations to keep completely keep_pareto: Keep all Pareto-optimal iterations aggressive: Delete ALL iteration folders (only keep DB) Returns: dict with cleanup statistics """ study_path = Path(study_path) if not study_path.exists(): raise ValueError(f"Study path does not exist: {study_path}") # Get study info info = get_study_info(study_path) # Determine which iterations to keep keep_iters = set() if keep_best > 0 and info['best_trials']: keep_iters.update(info['best_trials'][:keep_best]) if keep_pareto and info['pareto_trials']: keep_iters.update(info['pareto_trials']) # Find iterations folder iterations_path = study_path / '2_iterations' if not iterations_path.exists(): iterations_path = study_path / '1_working' if not iterations_path.exists(): return {'status': 'no_iterations', 'deleted_bytes': 0, 'deleted_files': 0} # Calculate savings savings = calculate_cleanup_savings(study_path, keep_iters) deleted_bytes = 0 deleted_files = 0 deleted_folders = 0 if aggressive: # Delete entire iterations folder if not dry_run: shutil.rmtree(iterations_path) deleted_bytes = savings['total_size'] deleted_folders = 1 else: deleted_bytes = savings['total_size'] else: # Selective cleanup for iter_folder in iterations_path.iterdir(): if not iter_folder.is_dir(): continue # Extract iteration number try: iter_num = int(iter_folder.name.replace('iter', '')) except: continue # Skip kept iterations if iter_num in keep_iters: continue for f in iter_folder.iterdir(): if not f.is_file(): continue # Keep essential files if f.name.lower() in {e.lower() for e in ESSENTIAL_FILES}: continue # Delete deletable extensions if f.suffix.lower() in DELETABLE_EXTENSIONS: size = f.stat().st_size if not dry_run: f.unlink() deleted_bytes += size deleted_files += 1 return { 'status': 'dry_run' if dry_run else 'completed', 'study_name': info['name'], 'trial_count': info['trial_count'], 'kept_iterations': list(keep_iters), 'total_size_before': savings['total_size'], 'deleted_bytes': deleted_bytes, 'deleted_files': deleted_files, 'deleted_folders': deleted_folders, 'space_saved_gb': deleted_bytes / (1024**3), } def cleanup_batch( parent_path: Path, pattern: str = "*", dry_run: bool = True, keep_best: int = 3, keep_pareto: bool = False, aggressive: bool = False, ) -> list: """ Clean up multiple studies matching a pattern. Args: parent_path: Parent directory containing studies pattern: Glob pattern to match study folders (e.g., "m1_mirror_*") dry_run: If True, only report keep_best: Keep N best iterations per study keep_pareto: Keep Pareto-optimal iterations aggressive: Delete all iteration folders Returns: List of cleanup results """ parent_path = Path(parent_path) results = [] for study_path in sorted(parent_path.glob(pattern)): if not study_path.is_dir(): continue # Check if it looks like a study (has iterations folder) if not (study_path / '2_iterations').exists() and not (study_path / '1_working').exists(): continue try: result = cleanup_study( study_path, dry_run=dry_run, keep_best=keep_best, keep_pareto=keep_pareto, aggressive=aggressive, ) results.append(result) except Exception as e: results.append({ 'study_name': study_path.name, 'status': 'error', 'error': str(e), }) return results def main(): parser = argparse.ArgumentParser( description='Clean up completed optimization studies to save disk space.', formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__ ) parser.add_argument('study_path', type=Path, help='Path to study folder or parent directory') parser.add_argument('--dry-run', action='store_true', default=True, help='Show what would be deleted without deleting (default)') parser.add_argument('--execute', action='store_true', help='Actually delete files (opposite of --dry-run)') parser.add_argument('--keep-best', type=int, default=3, help='Keep N best iterations completely (default: 3)') parser.add_argument('--keep-pareto', action='store_true', help='Keep all Pareto-optimal iterations') parser.add_argument('--aggressive', action='store_true', help='Delete ALL iteration data (only keep DB)') parser.add_argument('--batch', type=str, metavar='PATTERN', help='Clean multiple studies matching pattern (e.g., "m1_mirror_*")') args = parser.parse_args() dry_run = not args.execute if args.batch: # Batch cleanup mode print(f"\n{'='*60}") print(f"BATCH CLEANUP: {args.study_path}") print(f"Pattern: {args.batch}") print(f"{'='*60}") print(f"Mode: {'DRY RUN' if dry_run else 'EXECUTE'}") results = cleanup_batch( args.study_path, pattern=args.batch, dry_run=dry_run, keep_best=args.keep_best, keep_pareto=args.keep_pareto, aggressive=args.aggressive, ) print(f"\n{'='*60}") print("BATCH RESULTS") print(f"{'='*60}") print(f"{'Study':<45} {'Trials':>7} {'Size':>8} {'Savings':>8}") print("-" * 75) total_saved = 0 for r in results: if r.get('status') == 'error': print(f"{r['study_name']:<45} ERROR: {r.get('error', 'Unknown')}") else: saved = r.get('space_saved_gb', 0) total_saved += saved print(f"{r['study_name']:<45} {r.get('trial_count', 0):>7} " f"{r.get('total_size_before', 0)/(1024**3):>7.1f}G {saved:>7.1f}G") print("-" * 75) print(f"{'TOTAL SAVINGS:':<45} {' '*15} {total_saved:>7.1f}G") if dry_run: print(f"\n[!] This was a dry run. Run with --execute to actually delete files.") return results else: # Single study cleanup print(f"\n{'='*60}") print(f"STUDY CLEANUP: {args.study_path.name}") print(f"{'='*60}") print(f"Mode: {'DRY RUN (no files deleted)' if dry_run else 'EXECUTE (files WILL be deleted)'}") print(f"Keep best: {args.keep_best} iterations") print(f"Keep Pareto: {args.keep_pareto}") print(f"Aggressive: {args.aggressive}") result = cleanup_study( args.study_path, dry_run=dry_run, keep_best=args.keep_best, keep_pareto=args.keep_pareto, aggressive=args.aggressive, ) print(f"\n{'='*60}") print("RESULTS") print(f"{'='*60}") print(f"Trials in study: {result['trial_count']}") print(f"Iterations kept: {len(result['kept_iterations'])} {result['kept_iterations'][:5]}{'...' if len(result['kept_iterations']) > 5 else ''}") print(f"Total size before: {result['total_size_before'] / (1024**3):.2f} GB") print(f"{'Would delete' if dry_run else 'Deleted'}: {result['deleted_files']} files") print(f"Space {'to save' if dry_run else 'saved'}: {result['space_saved_gb']:.2f} GB") if dry_run: print(f"\n[!] This was a dry run. Run with --execute to actually delete files.") return result if __name__ == '__main__': main()