412 lines
13 KiB
Python
412 lines
13 KiB
Python
|
|
"""
|
||
|
|
Study Cleanup Utility
=====================

Cleans up completed optimization studies to save disk space by removing
large intermediate files (NX models, FEM meshes, solver results) while
preserving essential data (parameters, extracted results, database).

Usage:
    python -m optimization_engine.utils.study_cleanup <study_path> [options]

Options:
    --dry-run         Show what would be deleted without actually deleting (default)
    --execute         Actually delete files (opposite of --dry-run)
    --keep-best N     Keep iteration folders for the top N best trials (default: 3)
    --keep-pareto     Keep all Pareto-optimal iterations (for multi-objective)
    --aggressive      Delete ALL iteration data (only keep DB and config)
    --batch PATTERN   Treat <study_path> as a parent directory and clean every
                      study folder in it that matches PATTERN

The database (study.db) contains all optimization results and can regenerate
any analysis. The original NX model in 1_setup is always preserved.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import json
|
||
|
|
import shutil
|
||
|
|
import sqlite3
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
|
||
|
|
# Small per-iteration artifacts that cleanup never removes, matched by
# file name (callers compare case-insensitively).
ESSENTIAL_FILES = {
    'results.json',                # Any extracted results
    'params.exp',                  # Design parameters for this iteration
    '_temp_zernike.json',          # Zernike coefficients (if exists)
    '_temp_part_properties.json',  # Part properties
    '_temp_mass.txt',              # Extracted mass
}
|
||
|
|
|
||
|
|
# File suffixes (lower-case, including the dot) that selective cleanup
# removes: large and regenerable, or already extracted elsewhere.
DELETABLE_EXTENSIONS = {
    # NX model / simulation files
    '.prt',   # NX Part files (~30-35 MB each)
    '.fem',   # FEM mesh files (~15 MB each)
    '.sim',   # Simulation file (~7 MB each)
    '.afm',   # FEA auxiliary (~4 MB each)
    # Nastran input and results
    '.dat',   # Nastran input deck (~15 MB each)
    '.op2',   # Nastran binary results (~65 MB each)
    '.f04',   # Nastran log
    '.f06',   # Nastran output
    # Logs and diagnostics
    '.log',   # Solver log
    '.diag',  # Diagnostics
}
|
||
|
|
|
||
|
|
|
||
|
|
def get_study_info(study_path: Path) -> dict:
    """Collect study metadata from the config file and the results database.

    Args:
        study_path: Root folder of the study.

    Returns:
        dict with the keys:
            name: Folder name of the study.
            has_config: Whether ``optimization_config.json`` exists.
            has_db: Whether ``study.db`` was found in ``3_results`` or
                ``2_results``.
            trial_count: Number of rows in ``trials`` whose state is
                ``COMPLETE`` (0 when there is no database).
            best_trials: Up to 10 trial ids ordered by ascending value of
                objective 0; empty when the query cannot be run.
            pareto_trials: Trial ids flagged ``pareto_optimal = '1'`` in
                ``trial_system_attrs``; empty when the query cannot be run.
            config: Parsed config JSON (present only when the file exists).

    Raises:
        sqlite3.Error: If the database exists but the trial count cannot be
            read (for example, no ``trials`` table).
        ValueError: If the config file is not valid JSON.
    """
    config_path = study_path / 'optimization_config.json'

    # Try both possible DB locations
    db_path = study_path / '3_results' / 'study.db'
    if not db_path.exists():
        db_path = study_path / '2_results' / 'study.db'

    has_config = config_path.exists()
    has_db = db_path.exists()

    info = {
        'name': study_path.name,
        'has_config': has_config,
        'has_db': has_db,
        'trial_count': 0,
        'best_trials': [],
        'pareto_trials': [],
    }

    if has_config:
        with open(config_path, encoding='utf-8') as f:
            info['config'] = json.load(f)

    if not has_db:
        return info

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get trial count
        cursor.execute("SELECT COUNT(*) FROM trials WHERE state = 'COMPLETE'")
        info['trial_count'] = cursor.fetchone()[0]

        # Best trials (single objective). Ascending order treats the
        # objective as minimized -- NOTE(review): confirm for maximize studies.
        # Best-effort: a missing table/column leaves the list empty.
        try:
            cursor.execute("""
                SELECT trial_id, value FROM trial_values
                WHERE objective = 0
                ORDER BY value ASC LIMIT 10
            """)
            info['best_trials'] = [row[0] for row in cursor.fetchall()]
        except sqlite3.Error:
            pass

        # Pareto flag is optional as well; same best-effort handling.
        try:
            cursor.execute("""
                SELECT DISTINCT trial_id FROM trial_system_attrs
                WHERE key = 'pareto_optimal' AND value = '1'
            """)
            info['pareto_trials'] = [row[0] for row in cursor.fetchall()]
        except sqlite3.Error:
            pass
    finally:
        # Always release the file handle, even when the count query fails.
        conn.close()

    return info
|
||
|
|
|
||
|
|
|
||
|
|
def calculate_cleanup_savings(study_path: Path, keep_iters: Optional[set] = None) -> dict:
    """Calculate how much space a selective cleanup would free.

    Only regular files directly inside iteration folders are measured.
    A folder counts as an iteration folder when its name, with every
    ``iter`` substring removed, parses as an integer; other entries are
    ignored.

    Args:
        study_path: Root folder of the study. Iterations are looked up in
            ``2_iterations`` first, then in the legacy ``1_working``.
        keep_iters: Iteration numbers whose folders are kept entirely.
            ``None`` or empty means no folder is protected.

    Returns:
        dict with byte counts:
            total_size: All measured files.
            deletable_size: Files outside kept folders that are not in
                ``ESSENTIAL_FILES`` and whose suffix is in
                ``DELETABLE_EXTENSIONS``.
            keep_size: Everything else (kept folders, essential files and
                files with unknown extensions).
        All three are 0 when no iterations folder exists.
    """
    iterations_path = study_path / '2_iterations'
    if not iterations_path.exists():
        iterations_path = study_path / '1_working'  # Legacy structure

    if not iterations_path.exists():
        return {'total_size': 0, 'deletable_size': 0, 'keep_size': 0}

    keep_iters = keep_iters or set()
    # Built once: the original rebuilt this set for every file.
    essential_names = {name.lower() for name in ESSENTIAL_FILES}

    total_size = 0
    deletable_size = 0
    keep_size = 0

    for iter_folder in iterations_path.iterdir():
        if not iter_folder.is_dir():
            continue

        # Extract iteration number; skip folders that are not iterations.
        try:
            iter_num = int(iter_folder.name.replace('iter', ''))
        except ValueError:
            continue

        folder_is_kept = iter_num in keep_iters

        for f in iter_folder.iterdir():
            if not f.is_file():
                continue
            size = f.stat().st_size
            total_size += size

            is_deletable = (
                not folder_is_kept
                and f.name.lower() not in essential_names
                and f.suffix.lower() in DELETABLE_EXTENSIONS
            )
            if is_deletable:
                deletable_size += size
            else:
                # Kept folder, essential file, or unknown file type.
                keep_size += size

    return {
        'total_size': total_size,
        'deletable_size': deletable_size,
        'keep_size': keep_size,
    }
|
||
|
|
|
||
|
|
|
||
|
|
def cleanup_study(
    study_path: Path,
    dry_run: bool = True,
    keep_best: int = 0,
    keep_pareto: bool = False,
    aggressive: bool = False,
) -> dict:
    """
    Clean up a study to save disk space.

    In selective mode (the default) only files directly inside iteration
    folders are considered: a file is removed when its suffix is in
    ``DELETABLE_EXTENSIONS`` and its name is not in ``ESSENTIAL_FILES``.
    Folders listed as kept iterations are skipped entirely. In aggressive
    mode the whole iterations folder is removed.

    Args:
        study_path: Path to study folder
        dry_run: If True, only report what would be deleted
        keep_best: Number of best iterations to keep completely
        keep_pareto: Keep all Pareto-optimal iterations
        aggressive: Delete ALL iteration folders (only keep DB); the
            keep_best / keep_pareto options have no effect in this mode

    Returns:
        dict with cleanup statistics. Keys: ``status`` (``'dry_run'``,
        ``'completed'`` or ``'no_iterations'``), ``study_name``,
        ``trial_count``, ``kept_iterations`` (sorted list),
        ``total_size_before``, ``deleted_bytes``, ``deleted_files``,
        ``deleted_folders`` and ``space_saved_gb``. In aggressive mode
        ``deleted_bytes`` is the size reported by
        ``calculate_cleanup_savings`` and ``deleted_files`` stays 0.

    Raises:
        ValueError: If ``study_path`` does not exist.
    """
    study_path = Path(study_path)
    if not study_path.exists():
        raise ValueError(f"Study path does not exist: {study_path}")

    # Get study info
    info = get_study_info(study_path)

    # Determine which iterations to keep. Aggressive mode removes every
    # iteration, so reporting any as "kept" would be misleading.
    keep_iters = set()
    if not aggressive:
        if keep_best > 0 and info['best_trials']:
            keep_iters.update(info['best_trials'][:keep_best])
        if keep_pareto and info['pareto_trials']:
            keep_iters.update(info['pareto_trials'])

    # Find iterations folder
    iterations_path = study_path / '2_iterations'
    if not iterations_path.exists():
        iterations_path = study_path / '1_working'

    if not iterations_path.exists():
        # Same key set as the normal result so callers can report uniformly.
        return {
            'status': 'no_iterations',
            'study_name': info['name'],
            'trial_count': info['trial_count'],
            'kept_iterations': [],
            'total_size_before': 0,
            'deleted_bytes': 0,
            'deleted_files': 0,
            'deleted_folders': 0,
            'space_saved_gb': 0.0,
        }

    # Calculate savings
    savings = calculate_cleanup_savings(study_path, keep_iters)

    deleted_bytes = 0
    deleted_files = 0
    deleted_folders = 0

    if aggressive:
        # Delete entire iterations folder
        deleted_bytes = savings['total_size']
        if not dry_run:
            shutil.rmtree(iterations_path)
            deleted_folders = 1
    else:
        # Selective cleanup
        essential_names = {name.lower() for name in ESSENTIAL_FILES}

        for iter_folder in iterations_path.iterdir():
            if not iter_folder.is_dir():
                continue

            # Extract iteration number; skip folders that are not iterations.
            try:
                iter_num = int(iter_folder.name.replace('iter', ''))
            except ValueError:
                continue

            # Skip kept iterations
            if iter_num in keep_iters:
                continue

            for f in iter_folder.iterdir():
                if not f.is_file():
                    continue

                # Keep essential files
                if f.name.lower() in essential_names:
                    continue

                # Delete deletable extensions
                if f.suffix.lower() in DELETABLE_EXTENSIONS:
                    # Read the size before unlinking; it is gone afterwards.
                    size = f.stat().st_size
                    if not dry_run:
                        f.unlink()
                    deleted_bytes += size
                    deleted_files += 1

    return {
        'status': 'dry_run' if dry_run else 'completed',
        'study_name': info['name'],
        'trial_count': info['trial_count'],
        'kept_iterations': sorted(keep_iters),
        'total_size_before': savings['total_size'],
        'deleted_bytes': deleted_bytes,
        'deleted_files': deleted_files,
        'deleted_folders': deleted_folders,
        'space_saved_gb': deleted_bytes / (1024**3),
    }
|
||
|
|
|
||
|
|
|
||
|
|
def cleanup_batch(
    parent_path: Path,
    pattern: str = "*",
    dry_run: bool = True,
    keep_best: int = 3,
    keep_pareto: bool = False,
    aggressive: bool = False,
) -> list:
    """
    Run ``cleanup_study`` on every study folder under a parent directory.

    Entries matching ``pattern`` are visited in sorted order. An entry is
    treated as a study only if it is a directory containing
    ``2_iterations`` or ``1_working``; anything else is skipped silently.

    Args:
        parent_path: Parent directory containing studies
        pattern: Glob pattern to match study folders (e.g., "m1_mirror_*")
        dry_run: If True, only report
        keep_best: Keep N best iterations per study
        keep_pareto: Keep Pareto-optimal iterations
        aggressive: Delete all iteration folders

    Returns:
        One dict per visited study: the ``cleanup_study`` result, or
        ``{'study_name', 'status': 'error', 'error'}`` when it raised.
    """
    root = Path(parent_path)
    options = {
        'dry_run': dry_run,
        'keep_best': keep_best,
        'keep_pareto': keep_pareto,
        'aggressive': aggressive,
    }
    iteration_dirs = ('2_iterations', '1_working')

    outcomes = []
    for candidate in sorted(root.glob(pattern)):
        if not candidate.is_dir():
            continue

        # Only folders that carry an iterations directory count as studies.
        if not any((candidate / sub).exists() for sub in iteration_dirs):
            continue

        # One failing study must not abort the rest of the batch.
        try:
            outcome = cleanup_study(candidate, **options)
        except Exception as exc:
            outcome = {
                'study_name': candidate.name,
                'status': 'error',
                'error': str(exc),
            }
        outcomes.append(outcome)

    return outcomes
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Command-line entry point for the study cleanup utility.

    Parses the arguments, runs either a single-study cleanup or a batch
    cleanup (``--batch PATTERN``) and prints a summary. Nothing is deleted
    unless ``--execute`` is given; ``--dry-run`` and ``--execute`` cannot
    be combined.

    Returns:
        The dict returned by ``cleanup_study`` in single-study mode, or the
        list returned by ``cleanup_batch`` in batch mode.

    Raises:
        SystemExit: On invalid arguments or when ``study_path`` does not
            exist (raised by argparse with exit status 2).
    """
    parser = argparse.ArgumentParser(
        description='Clean up completed optimization studies to save disk space.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument('study_path', type=Path, help='Path to study folder or parent directory')

    # The two mode flags contradict each other. Without the exclusive group,
    # "--dry-run --execute" would silently delete files.
    mode = parser.add_mutually_exclusive_group()
    mode.add_argument('--dry-run', action='store_true', default=True,
                      help='Show what would be deleted without deleting (default)')
    mode.add_argument('--execute', action='store_true',
                      help='Actually delete files (opposite of --dry-run)')

    parser.add_argument('--keep-best', type=int, default=3,
                        help='Keep N best iterations completely (default: 3)')
    parser.add_argument('--keep-pareto', action='store_true',
                        help='Keep all Pareto-optimal iterations')
    parser.add_argument('--aggressive', action='store_true',
                        help='Delete ALL iteration data (only keep DB)')
    parser.add_argument('--batch', type=str, metavar='PATTERN',
                        help='Clean multiple studies matching pattern (e.g., "m1_mirror_*")')

    args = parser.parse_args()

    # Fail with a usage error instead of a traceback (single mode) or a
    # silently empty table (batch mode).
    if not args.study_path.exists():
        parser.error(f"Study path does not exist: {args.study_path}")

    # Dry run is the default; only --execute turns deletion on.
    dry_run = not args.execute
    rule = '=' * 60

    if args.batch:
        # Batch cleanup mode
        print(f"\n{rule}")
        print(f"BATCH CLEANUP: {args.study_path}")
        print(f"Pattern: {args.batch}")
        print(f"{rule}")
        print(f"Mode: {'DRY RUN' if dry_run else 'EXECUTE'}")

        results = cleanup_batch(
            args.study_path,
            pattern=args.batch,
            dry_run=dry_run,
            keep_best=args.keep_best,
            keep_pareto=args.keep_pareto,
            aggressive=args.aggressive,
        )

        print(f"\n{rule}")
        print("BATCH RESULTS")
        print(f"{rule}")
        print(f"{'Study':<45} {'Trials':>7} {'Size':>8} {'Savings':>8}")
        print("-" * 75)

        total_saved = 0
        for r in results:
            # .get() throughout: error entries carry only a subset of keys.
            study_name = r.get('study_name', '?')
            if r.get('status') == 'error':
                print(f"{study_name:<45} ERROR: {r.get('error', 'Unknown')}")
            else:
                saved = r.get('space_saved_gb', 0)
                total_saved += saved
                print(f"{study_name:<45} {r.get('trial_count', 0):>7} "
                      f"{r.get('total_size_before', 0)/(1024**3):>7.1f}G {saved:>7.1f}G")

        print("-" * 75)
        print(f"{'TOTAL SAVINGS:':<45} {' '*15} {total_saved:>7.1f}G")

        if dry_run:
            print("\n[!] This was a dry run. Run with --execute to actually delete files.")

        return results

    # Single study cleanup
    print(f"\n{rule}")
    print(f"STUDY CLEANUP: {args.study_path.name}")
    print(f"{rule}")
    print(f"Mode: {'DRY RUN (no files deleted)' if dry_run else 'EXECUTE (files WILL be deleted)'}")
    print(f"Keep best: {args.keep_best} iterations")
    print(f"Keep Pareto: {args.keep_pareto}")
    print(f"Aggressive: {args.aggressive}")

    result = cleanup_study(
        args.study_path,
        dry_run=dry_run,
        keep_best=args.keep_best,
        keep_pareto=args.keep_pareto,
        aggressive=args.aggressive,
    )

    # A study without an iterations folder yields a reduced result; indexing
    # the statistics keys on it raised KeyError before.
    if result.get('status') == 'no_iterations':
        print(f"\nNo iterations folder (2_iterations or 1_working) found in "
              f"{args.study_path}; nothing to clean.")
        return result

    kept = result['kept_iterations']
    print(f"\n{rule}")
    print("RESULTS")
    print(f"{rule}")
    print(f"Trials in study: {result['trial_count']}")
    print(f"Iterations kept: {len(kept)} {kept[:5]}{'...' if len(kept) > 5 else ''}")
    print(f"Total size before: {result['total_size_before'] / (1024**3):.2f} GB")
    print(f"{'Would delete' if dry_run else 'Deleted'}: {result['deleted_files']} files")
    print(f"Space {'to save' if dry_run else 'saved'}: {result['space_saved_gb']:.2f} GB")

    if dry_run:
        print("\n[!] This was a dry run. Run with --execute to actually delete files.")

    return result
|
||
|
|
|
||
|
|
|
||
|
|
# Run the command-line interface only when this module is executed directly
# (e.g. via ``python -m``), not when it is imported.
if __name__ == '__main__':
    main()
|