"""
Study Cleanup Utility
====================

Cleans up completed optimization studies to save disk space by removing
large intermediate files (NX models, FEM meshes, solver results) while
preserving essential data (parameters, extracted results, database).

Usage:
    python -m optimization_engine.utils.study_cleanup <study_path> [options]

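Options:
    --dry-run       Show what would be deleted without actually deleting (default)
    --execute       Actually delete files; without this flag nothing is removed
    --keep-best N   Keep iteration folders for the top N best trials (default: 3)
    --keep-pareto   Keep all Pareto-optimal iterations (for multi-objective)
    --aggressive    Delete ALL iteration data (only keep DB and config)
    --batch PATTERN Treat <study_path> as a parent folder and clean studies matching PATTERN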

The database (study.db) contains all optimization results and can regenerate
any analysis. The original NX model in 1_setup is always preserved.
"""

import argparse
import json
import shutil
import sqlite3
from pathlib import Path
from typing import Optional


# Small, essential per-iteration files that cleanup must never remove.
# The cleanup routines compare these names case-insensitively.
ESSENTIAL_FILES = {
    # Design parameters for this iteration
    'params.exp',
    # Extracted mass
    '_temp_mass.txt',
    # Part properties
    '_temp_part_properties.json',
    # Zernike coefficients (if the file exists)
    '_temp_zernike.json',
    # Any extracted results
    'results.json',
}

# File suffixes (lower-case, with leading dot) that cleanup is allowed to
# delete: large artifacts that are regenerable or whose data was already
# extracted. Sizes are the typical per-file figures noted by the author.
DELETABLE_EXTENSIONS = {
    # Nastran binary results (~65 MB each)
    '.op2',
    # NX Part files (~30-35 MB each)
    '.prt',
    # FEM mesh files (~15 MB each)
    '.fem',
    # Nastran input deck (~15 MB each)
    '.dat',
    # Simulation file (~7 MB each)
    '.sim',
    # FEA auxiliary (~4 MB each)
    '.afm',
    # Nastran log
    '.f04',
    # Nastran output
    '.f06',
    # Solver log
    '.log',
    # Diagnostics
    '.diag',
}


def get_study_info(study_path: Path) -> dict:
    """Get study metadata from the config file and the results database.

    Args:
        study_path: Root folder of the study.

    Returns:
        dict with the keys:
            name: Name of the study folder.
            has_config: Whether ``optimization_config.json`` exists.
            has_db: Whether ``study.db`` was found under ``3_results`` or
                ``2_results``.
            trial_count: Number of trials whose state is ``COMPLETE``
                (0 when there is no database).
            best_trials: Up to 10 trial ids for objective 0, ordered by
                ascending value (i.e. lower is treated as better). Empty
                when the query cannot be run.
            pareto_trials: Trial ids flagged ``pareto_optimal = '1'`` in
                ``trial_system_attrs``. Empty when the query cannot be run.
            config: Parsed config JSON (only present when the file exists).

    Raises:
        sqlite3.Error: If the database exists but the ``trials`` table
            cannot be queried.
        ValueError: If the config file is not valid JSON.
    """
    study_path = Path(study_path)
    config_path = study_path / 'optimization_config.json'
    # Try both possible DB locations
    db_path = study_path / '3_results' / 'study.db'
    if not db_path.exists():
        db_path = study_path / '2_results' / 'study.db'

    has_config = config_path.exists()
    has_db = db_path.exists()

    info = {
        'name': study_path.name,
        'has_config': has_config,
        'has_db': has_db,
        'trial_count': 0,
        'best_trials': [],
        'pareto_trials': [],
    }

    if has_config:
        # JSON is UTF-8 by specification; do not depend on the locale encoding.
        with open(config_path, encoding='utf-8') as f:
            info['config'] = json.load(f)

    if has_db:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()

            # Trial count is mandatory: a failure here propagates to the caller.
            cursor.execute("SELECT COUNT(*) FROM trials WHERE state = 'COMPLETE'")
            info['trial_count'] = cursor.fetchone()[0]

            # Best trials (single objective) -- best effort.
            info['best_trials'] = _fetch_trial_ids(cursor, """
                SELECT trial_id, value FROM trial_values
                WHERE objective = 0
                ORDER BY value ASC LIMIT 10
            """)

            # Pareto-optimal trials -- best effort.
            info['pareto_trials'] = _fetch_trial_ids(cursor, """
                SELECT DISTINCT trial_id FROM trial_system_attrs
                WHERE key = 'pareto_optimal' AND value = '1'
            """)
        finally:
            # Always release the database handle, even if a query failed.
            conn.close()

    return info


def _fetch_trial_ids(cursor, query: str) -> list:
    """Run *query* and return the first column of every row, or [] on a DB error."""
    try:
        cursor.execute(query)
        return [row[0] for row in cursor.fetchall()]
    except sqlite3.Error:
        # Table/column missing or unreadable: this metadata is optional.
        return []


def calculate_cleanup_savings(study_path: Path, keep_iters: Optional[set] = None) -> dict:
    """Calculate how much space would be saved by a selective cleanup.

    Only regular files directly inside ``iter<N>`` folders are measured;
    nested sub-directories and folders whose name does not reduce to an
    integer after removing ``'iter'`` are ignored.

    Args:
        study_path: Root folder of the study.
        keep_iters: Iteration numbers whose folders are kept in full.
            ``None`` means no iteration is kept in full.

    Returns:
        dict with byte counts:
            total_size: All measured files.
            deletable_size: Files that a selective cleanup would remove.
            keep_size: Everything else (kept iterations, essential files,
                and files with an unknown extension).
        All three are 0 when the study has no iterations folder.
    """
    study_path = Path(study_path)
    iterations_path = study_path / '2_iterations'
    if not iterations_path.exists():
        iterations_path = study_path / '1_working'  # Legacy structure

    if not iterations_path.exists():
        return {'total_size': 0, 'deletable_size': 0, 'keep_size': 0}

    keep_iters = keep_iters or set()
    # Build the case-insensitive lookup once instead of once per file.
    essential_names = {name.lower() for name in ESSENTIAL_FILES}

    total_size = 0
    deletable_size = 0
    keep_size = 0

    for iter_folder in iterations_path.iterdir():
        if not iter_folder.is_dir():
            continue

        # Extract iteration number; skip folders that are not iterations.
        try:
            iter_num = int(iter_folder.name.replace('iter', ''))
        except ValueError:
            continue

        keep_whole_folder = iter_num in keep_iters

        for f in iter_folder.iterdir():
            if not f.is_file():
                continue
            size = f.stat().st_size
            total_size += size

            # A file is deletable only if its folder is not protected, it is
            # not an essential file, and its extension is on the delete list.
            # Unknown files are kept by default.
            is_deletable = (
                not keep_whole_folder
                and f.name.lower() not in essential_names
                and f.suffix.lower() in DELETABLE_EXTENSIONS
            )
            if is_deletable:
                deletable_size += size
            else:
                keep_size += size

    return {
        'total_size': total_size,
        'deletable_size': deletable_size,
        'keep_size': keep_size,
    }


def cleanup_study(
    study_path: Path,
    dry_run: bool = True,
    keep_best: int = 0,
    keep_pareto: bool = False,
    aggressive: bool = False,
) -> dict:
    """
    Clean up a study to save disk space.

    In selective mode (the default) only files directly inside ``iter<N>``
    folders are considered: files whose extension is in
    ``DELETABLE_EXTENSIONS`` are removed unless they are listed in
    ``ESSENTIAL_FILES`` or belong to a kept iteration. In aggressive mode the
    whole iterations folder is removed and ``keep_best`` / ``keep_pareto``
    are ignored.

    NOTE: the iterations to keep come from ``get_study_info``. If the study
    database is missing or the rankings cannot be read, nothing is protected
    and every deletable file is removed.

    Args:
        study_path: Path to study folder
        dry_run: If True, only report what would be deleted
        keep_best: Number of best iterations to keep completely
        keep_pareto: Keep all Pareto-optimal iterations
        aggressive: Delete ALL iteration folders (only keep DB)

    Returns:
        dict with cleanup statistics:
            status: ``'dry_run'``, ``'completed'`` or ``'no_iterations'``.
            study_name, trial_count: Taken from the study metadata.
            kept_iterations: Sorted iteration numbers protected from
                cleanup (always empty in aggressive mode).
            total_size_before: Bytes measured in iteration folders.
            deleted_bytes, deleted_files: Bytes / files removed (or that
                would be removed in a dry run).
            deleted_folders: 1 when aggressive mode actually removed the
                iterations folder, otherwise 0.
            space_saved_gb: ``deleted_bytes`` expressed in GiB.

    Raises:
        ValueError: If ``study_path`` does not exist.
    """
    study_path = Path(study_path)
    if not study_path.exists():
        raise ValueError(f"Study path does not exist: {study_path}")

    # Get study info
    info = get_study_info(study_path)

    # Determine which iterations to keep
    keep_iters = set()
    if keep_best > 0 and info['best_trials']:
        keep_iters.update(info['best_trials'][:keep_best])
    if keep_pareto and info['pareto_trials']:
        keep_iters.update(info['pareto_trials'])

    # Find iterations folder
    iterations_path = study_path / '2_iterations'
    if not iterations_path.exists():
        iterations_path = study_path / '1_working'

    if not iterations_path.exists():
        # Return the same keys as a normal run so callers can report on it
        # without special-casing missing fields.
        return {
            'status': 'no_iterations',
            'study_name': info['name'],
            'trial_count': info['trial_count'],
            'kept_iterations': [],
            'total_size_before': 0,
            'deleted_bytes': 0,
            'deleted_files': 0,
            'deleted_folders': 0,
            'space_saved_gb': 0.0,
        }

    # Calculate savings
    savings = calculate_cleanup_savings(study_path, keep_iters)

    deleted_bytes = 0
    deleted_files = 0
    deleted_folders = 0

    if aggressive:
        # Delete entire iterations folder; nothing is kept in this mode.
        keep_iters = set()
        if not dry_run:
            shutil.rmtree(iterations_path)
            deleted_folders = 1
        deleted_bytes = savings['total_size']
    else:
        # Selective cleanup
        essential_names = {name.lower() for name in ESSENTIAL_FILES}

        for iter_folder in iterations_path.iterdir():
            if not iter_folder.is_dir():
                continue

            # Extract iteration number; skip folders that are not iterations.
            try:
                iter_num = int(iter_folder.name.replace('iter', ''))
            except ValueError:
                continue

            # Skip kept iterations
            if iter_num in keep_iters:
                continue

            for f in iter_folder.iterdir():
                if not f.is_file():
                    continue

                # Keep essential files
                if f.name.lower() in essential_names:
                    continue

                # Delete deletable extensions
                if f.suffix.lower() in DELETABLE_EXTENSIONS:
                    # Read the size before unlinking; it is gone afterwards.
                    size = f.stat().st_size
                    if not dry_run:
                        f.unlink()
                    deleted_bytes += size
                    deleted_files += 1

    return {
        'status': 'dry_run' if dry_run else 'completed',
        'study_name': info['name'],
        'trial_count': info['trial_count'],
        'kept_iterations': sorted(keep_iters),
        'total_size_before': savings['total_size'],
        'deleted_bytes': deleted_bytes,
        'deleted_files': deleted_files,
        'deleted_folders': deleted_folders,
        'space_saved_gb': deleted_bytes / (1024**3),
    }


def cleanup_batch(
    parent_path: Path,
    pattern: str = "*",
    dry_run: bool = True,
    keep_best: int = 3,
    keep_pareto: bool = False,
    aggressive: bool = False,
) -> list:
    """
    Run ``cleanup_study`` on every study folder under a parent directory.

    A directory counts as a study when it contains a ``2_iterations`` or a
    ``1_working`` folder. Candidates are processed in sorted order. A failure
    in one study is recorded as an ``'error'`` entry and does not stop the
    remaining studies.

    Args:
        parent_path: Directory that holds the study folders
        pattern: Glob pattern selecting study folders (e.g., "m1_mirror_*")
        dry_run: If True, nothing is deleted; results are estimates
        keep_best: Number of best iterations to keep per study
        keep_pareto: Whether to keep Pareto-optimal iterations
        aggressive: Whether to delete all iteration folders

    Returns:
        One result dict per processed study, in processing order
    """
    root = Path(parent_path)
    outcomes = []

    for candidate in sorted(root.glob(pattern)):
        # Only directories that contain an iterations folder are studies.
        looks_like_study = candidate.is_dir() and (
            (candidate / '2_iterations').exists()
            or (candidate / '1_working').exists()
        )
        if not looks_like_study:
            continue

        try:
            outcome = cleanup_study(
                candidate,
                dry_run=dry_run,
                keep_best=keep_best,
                keep_pareto=keep_pareto,
                aggressive=aggressive,
            )
        except Exception as exc:
            # Per-study boundary: report the failure and move on.
            outcome = {
                'study_name': candidate.name,
                'status': 'error',
                'error': str(exc),
            }
        outcomes.append(outcome)

    return outcomes


def main():
    """Command-line entry point for single-study or batch cleanup.

    Parses the command line, runs ``cleanup_study`` (or ``cleanup_batch``
    when ``--batch`` is given) and prints a summary. Nothing is deleted
    unless ``--execute`` is passed; ``--dry-run`` and ``--execute`` cannot be
    combined.

    Returns:
        The result dict of ``cleanup_study``, or the result list of
        ``cleanup_batch`` in batch mode.
    """
    parser = argparse.ArgumentParser(
        description='Clean up completed optimization studies to save disk space.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument('study_path', type=Path, help='Path to study folder or parent directory')
    # The two mode flags contradict each other. Reject the combination instead
    # of silently letting --execute win and deleting files the user asked to
    # only preview.
    mode = parser.add_mutually_exclusive_group()
    mode.add_argument('--dry-run', action='store_true',
                      help='Show what would be deleted without deleting (default)')
    mode.add_argument('--execute', action='store_true',
                      help='Actually delete files (opposite of --dry-run)')
    parser.add_argument('--keep-best', type=int, default=3,
                       help='Keep N best iterations completely (default: 3)')
    parser.add_argument('--keep-pareto', action='store_true',
                       help='Keep all Pareto-optimal iterations')
    parser.add_argument('--aggressive', action='store_true',
                       help='Delete ALL iteration data (only keep DB)')
    parser.add_argument('--batch', type=str, metavar='PATTERN',
                       help='Clean multiple studies matching pattern (e.g., "m1_mirror_*")')

    args = parser.parse_args()

    # Dry run is the default; only an explicit --execute deletes anything.
    dry_run = not args.execute

    if args.batch:
        # Batch cleanup mode
        print(f"\n{'='*60}")
        print(f"BATCH CLEANUP: {args.study_path}")
        print(f"Pattern: {args.batch}")
        print(f"{'='*60}")
        print(f"Mode: {'DRY RUN' if dry_run else 'EXECUTE'}")

        results = cleanup_batch(
            args.study_path,
            pattern=args.batch,
            dry_run=dry_run,
            keep_best=args.keep_best,
            keep_pareto=args.keep_pareto,
            aggressive=args.aggressive,
        )

        print(f"\n{'='*60}")
        print("BATCH RESULTS")
        print(f"{'='*60}")
        print(f"{'Study':<45} {'Trials':>7} {'Size':>8} {'Savings':>8}")
        print("-" * 75)

        total_saved = 0
        for r in results:
            # Tolerate entries without a name rather than aborting the report.
            name = r.get('study_name', '<unknown>')
            if r.get('status') == 'error':
                print(f"{name:<45} ERROR: {r.get('error', 'Unknown')}")
            else:
                saved = r.get('space_saved_gb', 0)
                total_saved += saved
                print(f"{name:<45} {r.get('trial_count', 0):>7} "
                      f"{r.get('total_size_before', 0)/(1024**3):>7.1f}G {saved:>7.1f}G")

        print("-" * 75)
        print(f"{'TOTAL SAVINGS:':<45} {' '*15} {total_saved:>7.1f}G")

        if dry_run:
            print("\n[!] This was a dry run. Run with --execute to actually delete files.")

        return results

    else:
        # Single study cleanup
        print(f"\n{'='*60}")
        print(f"STUDY CLEANUP: {args.study_path.name}")
        print(f"{'='*60}")
        print(f"Mode: {'DRY RUN (no files deleted)' if dry_run else 'EXECUTE (files WILL be deleted)'}")
        print(f"Keep best: {args.keep_best} iterations")
        print(f"Keep Pareto: {args.keep_pareto}")
        print(f"Aggressive: {args.aggressive}")

        result = cleanup_study(
            args.study_path,
            dry_run=dry_run,
            keep_best=args.keep_best,
            keep_pareto=args.keep_pareto,
            aggressive=args.aggressive,
        )

        print(f"\n{'='*60}")
        print("RESULTS")
        print(f"{'='*60}")

        if result.get('status') == 'no_iterations':
            # This result may carry only a few keys, so do not index into the
            # statistics that a normal run provides.
            print("No iterations folder found ('2_iterations' or '1_working'); nothing to clean up.")
            return result

        kept = result['kept_iterations']
        print(f"Trials in study: {result['trial_count']}")
        print(f"Iterations kept: {len(kept)} {kept[:5]}{'...' if len(kept) > 5 else ''}")
        print(f"Total size before: {result['total_size_before'] / (1024**3):.2f} GB")
        print(f"{'Would delete' if dry_run else 'Deleted'}: {result['deleted_files']} files")
        print(f"Space {'to save' if dry_run else 'saved'}: {result['space_saved_gb']:.2f} GB")

        if dry_run:
            print("\n[!] This was a dry run. Run with --execute to actually delete files.")

        return result


# Run the command-line interface only when this file is executed as a script
# (including ``python -m``), not when it is imported as a module.
if __name__ == '__main__':
    main()