Files
Atomizer/optimization_engine/nx/model_cleanup.py
Anto01 eabcc4c3ca refactor: Major reorganization of optimization_engine module structure
BREAKING CHANGE: Module paths have been reorganized for better maintainability.
Backwards compatibility aliases with deprecation warnings are provided.

New Structure:
- core/           - Optimization runners (runner, intelligent_optimizer, etc.)
- processors/     - Data processing
  - surrogates/   - Neural network surrogates
- nx/             - NX/Nastran integration (solver, updater, session_manager)
- study/          - Study management (creator, wizard, state, reset)
- reporting/      - Reports and analysis (visualizer, report_generator)
- config/         - Configuration management (manager, builder)
- utils/          - Utilities (logger, auto_doc, etc.)
- future/         - Research/experimental code

Migration:
- ~200 import changes across 125 files
- All __init__.py files use lazy loading to avoid circular imports
- Backwards compatibility layer supports old import paths with warnings
- All existing functionality preserved

To migrate existing code:
  OLD: from optimization_engine.nx_solver import NXSolver
  NEW: from optimization_engine.nx.solver import NXSolver

  OLD: from optimization_engine.runner import OptimizationRunner
  NEW: from optimization_engine.core.runner import OptimizationRunner

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-29 12:30:59 -05:00

275 lines
8.3 KiB
Python

"""
Model Cleanup System
Intelligent cleanup of trial model files to save disk space.
Keeps top-N trials based on objective value, deletes CAD/FEM files for poor trials.
Strategy:
- Preserve ALL trial results.json files (small, contain critical data)
- Delete large CAD/FEM files (.prt, .sim, .fem, .op2, .f06) for non-top-N trials
- Keep best trial models + user-specified number of top trials
"""
from pathlib import Path
from typing import Dict, List, Optional
import json
import shutil
class ModelCleanup:
    """
    Clean up trial directories to save disk space.

    Deletes large model files (.prt, .sim, .fem, .op2, .f06, ...) from trials
    that are not in the top-N performers, while always preserving the small
    JSON/metadata files that hold the critical trial results.
    """

    # File extensions to delete (large CAD/FEM/result files).
    # NOTE: suffixes are compared case-insensitively (the cleanup code
    # lowercases each file's suffix before the membership test), so only
    # lowercase entries belong here — uppercase duplicates would be dead.
    CLEANUP_EXTENSIONS = {
        '.prt',     # NX part files
        '.sim',     # NX simulation files
        '.fem',     # FEM mesh files
        '.afm',     # NX assembly FEM
        '.op2',     # Nastran binary results
        '.f06',     # Nastran text results
        '.dat',     # Nastran input deck
        '.bdf',     # Nastran bulk data
        '.pch',     # Nastran punch file
        '.log',     # Nastran log
        '.master',  # Nastran master file (also matches .MASTER)
        '.dball',   # Nastran database (also matches .DBALL)
    }

    # Files to ALWAYS keep (small, critical data), regardless of extension.
    PRESERVE_FILES = {
        'results.json',
        'trial_metadata.json',
        'extraction_log.txt',
    }

    def __init__(self, substudy_dir: Path):
        """
        Initialize cleanup manager.

        Args:
            substudy_dir: Path to substudy directory containing trial_XXX folders
        """
        self.substudy_dir = Path(substudy_dir)
        self.history_file = self.substudy_dir / 'history.json'
        self.cleanup_log = self.substudy_dir / 'cleanup_log.json'

    def cleanup_models(
        self,
        keep_top_n: int = 10,
        dry_run: bool = False
    ) -> Dict:
        """
        Clean up trial model files, keeping only top-N performers.

        Args:
            keep_top_n: Number of best trials to keep models for
            dry_run: If True, only report what would be deleted without deleting

        Returns:
            Dictionary with cleanup statistics

        Raises:
            FileNotFoundError: If the substudy's history.json is missing.
        """
        if not self.history_file.exists():
            raise FileNotFoundError(f"History file not found: {self.history_file}")

        # Load trial history (list of per-trial dicts).
        with open(self.history_file, 'r') as f:
            history = json.load(f)

        # Sort trials by objective value (minimize); entries with no recorded
        # objective sort last, so they are candidates for cleanup first.
        sorted_trials = sorted(
            history, key=lambda x: x.get('total_objective', float('inf'))
        )

        # Identify top-N trials to keep. Skip malformed history entries that
        # lack a 'trial_number' instead of raising KeyError mid-cleanup.
        keep_trial_numbers = set()
        for trial in sorted_trials[:keep_top_n]:
            if 'trial_number' in trial:
                keep_trial_numbers.add(trial['trial_number'])

        # Cleanup statistics, returned to the caller and written to disk below.
        stats = {
            'total_trials': len(history),
            'kept_trials': len(keep_trial_numbers),
            'cleaned_trials': 0,
            'files_deleted': 0,
            'space_freed_mb': 0.0,
            'deleted_files': [],
            'kept_trial_numbers': sorted(keep_trial_numbers),
            'dry_run': dry_run
        }

        # Process each trial directory (trial_000, trial_001, ...).
        for trial_dir in sorted(self.substudy_dir.glob('trial_*')):
            if not trial_dir.is_dir():
                continue

            # Extract trial number from directory name; skip unexpected names.
            try:
                trial_num = int(trial_dir.name.split('_')[-1])
            except (ValueError, IndexError):
                continue

            # Skip if this trial should be kept.
            if trial_num in keep_trial_numbers:
                continue

            # Clean up this trial and fold its stats into the totals.
            trial_stats = self._cleanup_trial_directory(trial_dir, dry_run)
            stats['files_deleted'] += trial_stats['files_deleted']
            stats['space_freed_mb'] += trial_stats['space_freed_mb']
            stats['deleted_files'].extend(trial_stats['deleted_files'])
            if trial_stats['files_deleted'] > 0:
                stats['cleaned_trials'] += 1

        # Persist the cleanup log so later runs can audit what was removed.
        if not dry_run:
            with open(self.cleanup_log, 'w') as f:
                json.dump(stats, f, indent=2)

        return stats

    def _cleanup_trial_directory(self, trial_dir: Path, dry_run: bool) -> Dict:
        """
        Clean up a single trial directory.

        Args:
            trial_dir: Path to trial directory
            dry_run: If True, don't actually delete files

        Returns:
            Dictionary with cleanup statistics for this trial
        """
        stats = {
            'files_deleted': 0,
            'space_freed_mb': 0.0,
            'deleted_files': []
        }

        for file_path in trial_dir.iterdir():
            if not file_path.is_file():
                continue

            # Skip preserved files (small, critical data).
            if file_path.name in self.PRESERVE_FILES:
                continue

            # Case-insensitive extension match against the cleanup set.
            if file_path.suffix.lower() in self.CLEANUP_EXTENSIONS:
                file_size_mb = file_path.stat().st_size / (1024 * 1024)
                stats['files_deleted'] += 1
                stats['space_freed_mb'] += file_size_mb
                stats['deleted_files'].append(
                    str(file_path.relative_to(self.substudy_dir))
                )

                # Delete file (unless dry run). Best-effort: a locked or
                # already-removed file is reported but does not abort cleanup.
                if not dry_run:
                    try:
                        file_path.unlink()
                    except Exception as e:
                        print(f"Warning: Could not delete {file_path}: {e}")

        return stats

    def print_cleanup_report(self, stats: Dict):
        """
        Print human-readable cleanup report.

        Args:
            stats: Cleanup statistics dictionary
        """
        print("\n" + "="*70)
        print("MODEL CLEANUP REPORT")
        print("="*70)
        if stats['dry_run']:
            print("[DRY RUN - No files were actually deleted]")
            print()
        print(f"Total trials: {stats['total_trials']}")
        print(f"Trials kept: {stats['kept_trials']}")
        print(f"Trials cleaned: {stats['cleaned_trials']}")
        print(f"Files deleted: {stats['files_deleted']}")
        print(f"Space freed: {stats['space_freed_mb']:.2f} MB")
        print()
        print(f"Kept trial numbers: {stats['kept_trial_numbers']}")
        print()
        if stats['files_deleted'] > 0:
            # Aggregate deleted files by extension for a compact summary.
            print("Deleted file types:")
            file_types = {}
            for filepath in stats['deleted_files']:
                ext = Path(filepath).suffix.lower()
                file_types[ext] = file_types.get(ext, 0) + 1
            for ext, count in sorted(file_types.items()):
                print(f" {ext:15s}: {count:4d} files")
        print("="*70 + "\n")
def cleanup_substudy(
    substudy_dir: Path,
    keep_top_n: int = 10,
    dry_run: bool = False,
    verbose: bool = True
) -> Dict:
    """
    Convenience wrapper: run model cleanup on one substudy directory.

    Args:
        substudy_dir: Path to substudy directory
        keep_top_n: Number of best trials to preserve models for
        dry_run: If True, only report what would be deleted
        verbose: If True, print cleanup report

    Returns:
        Cleanup statistics dictionary
    """
    manager = ModelCleanup(substudy_dir)
    report = manager.cleanup_models(keep_top_n=keep_top_n, dry_run=dry_run)

    # Optionally echo a human-readable summary to stdout.
    if verbose:
        manager.print_cleanup_report(report)

    return report
if __name__ == '__main__':
    # Command-line entry point: clean up a single substudy directory.
    # (Removed unused `import sys` — nothing in this block referenced it.)
    import argparse

    parser = argparse.ArgumentParser(
        description='Clean up optimization trial model files to save disk space'
    )
    parser.add_argument(
        'substudy_dir',
        type=Path,
        help='Path to substudy directory'
    )
    parser.add_argument(
        '--keep-top-n',
        type=int,
        default=10,
        help='Number of best trials to keep models for (default: 10)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be deleted without actually deleting'
    )
    args = parser.parse_args()

    cleanup_substudy(
        args.substudy_dir,
        keep_top_n=args.keep_top_n,
        dry_run=args.dry_run
    )