""" Study Archiver - Disk Space Optimization for Atomizer Studies This module provides utilities for: 1. Cleaning up completed studies (removing regenerable files) 2. Archiving studies to remote storage (dalidou server) 3. Restoring archived studies on-demand Usage: # Cleanup a completed study (keep only essential files) python -m optimization_engine.utils.study_archiver cleanup studies/M1_Mirror/m1_mirror_V12 # Archive to remote server python -m optimization_engine.utils.study_archiver archive studies/M1_Mirror/m1_mirror_V12 # Restore from remote python -m optimization_engine.utils.study_archiver restore m1_mirror_V12 # Show disk usage analysis python -m optimization_engine.utils.study_archiver analyze studies/M1_Mirror """ import os import json import shutil import tarfile import subprocess from pathlib import Path from datetime import datetime from typing import Optional, Dict, List, Tuple import logging logger = logging.getLogger(__name__) # Configuration REMOTE_CONFIG = { "host": "192.168.86.50", # Local WiFi "host_tailscale": "100.80.199.40", # Remote via Tailscale "user": "papa", "archive_path": "/srv/storage/atomizer-archive", "ssh_port": 22, } # Files to KEEP per trial (essential for analysis) ESSENTIAL_EXTENSIONS = { '.op2', # Nastran binary results (Zernike extraction) '.json', # Parameters, results, metadata '.npz', # Pre-computed Zernike coefficients '.html', # Generated reports '.png', # Visualization images '.csv', # Exported data } # Files to DELETE per trial (regenerable from master + params) DELETABLE_EXTENSIONS = { '.prt', # NX part files (copy of master) '.fem', # FEM mesh files (copy of master) '.sim', # Simulation files (copy of master) '.afm', # Assembly FEM files '.dat', # Solver input deck (can regenerate) '.f04', # Nastran output log '.f06', # Nastran printed output '.log', # Generic log files '.diag', # Diagnostic files '.txt', # Temp text files '.exp', # Expression files '.bak', # Backup files } # Folders to always keep entirely KEEP_FOLDERS = { '1_setup', # Master model files (source of truth) '3_results', # Final results, database, reports 'best_design_archive', # Archived best designs } def analyze_study(study_path: Path) -> Dict: """Analyze disk usage of a study folder.""" study_path = Path(study_path) analysis = { "study_name": study_path.name, "total_size_bytes": 0, "by_extension": {}, "by_folder": {}, "essential_size": 0, "deletable_size": 0, "trial_count": 0, } for f in study_path.rglob("*"): if f.is_file(): sz = f.stat().st_size ext = f.suffix.lower() analysis["total_size_bytes"] += sz analysis["by_extension"][ext] = analysis["by_extension"].get(ext, 0) + sz # Categorize by folder rel_parts = f.relative_to(study_path).parts if rel_parts: folder = rel_parts[0] analysis["by_folder"][folder] = analysis["by_folder"].get(folder, 0) + sz # Essential vs deletable if ext in ESSENTIAL_EXTENSIONS: analysis["essential_size"] += sz elif ext in DELETABLE_EXTENSIONS: analysis["deletable_size"] += sz # Count trials iterations_dir = study_path / "2_iterations" if iterations_dir.exists(): analysis["trial_count"] = len([ d for d in iterations_dir.iterdir() if d.is_dir() and (d.name.startswith("trial_") or d.name.startswith("iter")) ]) return analysis def print_analysis(analysis: Dict): """Print formatted analysis results.""" total_gb = analysis["total_size_bytes"] / 1e9 essential_gb = analysis["essential_size"] / 1e9 deletable_gb = analysis["deletable_size"] / 1e9 print(f"\n{'='*60}") print(f"Study: {analysis['study_name']}") print(f"{'='*60}") print(f"Total size: {total_gb:8.2f} GB") print(f"Trials: {analysis['trial_count']:8d}") print(f"Essential: {essential_gb:8.2f} GB ({100*essential_gb/total_gb:.1f}%)") print(f"Deletable: {deletable_gb:8.2f} GB ({100*deletable_gb/total_gb:.1f}%)") print(f"Potential save: {deletable_gb:8.2f} GB") print(f"\nBy folder:") for folder, size in sorted(analysis["by_folder"].items(), key=lambda x: -x[1]): print(f" {folder:25} {size/1e9:8.2f} GB") print(f"\nTop extensions:") for ext, size in sorted(analysis["by_extension"].items(), key=lambda x: -x[1])[:10]: status = "[KEEP]" if ext in ESSENTIAL_EXTENSIONS else "[DEL?]" if ext in DELETABLE_EXTENSIONS else "[ ]" print(f" {status} {ext:10} {size/1e9:8.2f} GB") def cleanup_study(study_path: Path, dry_run: bool = True) -> Tuple[int, int]: """ Clean up a completed study by removing regenerable files from trial folders. Args: study_path: Path to study folder dry_run: If True, only report what would be deleted Returns: (files_deleted, bytes_freed) """ study_path = Path(study_path) iterations_dir = study_path / "2_iterations" if not iterations_dir.exists(): logger.warning(f"No iterations folder found in {study_path}") return 0, 0 files_to_delete = [] bytes_to_free = 0 # Find all deletable files in trial folders for trial_dir in iterations_dir.iterdir(): if not trial_dir.is_dir(): continue for f in trial_dir.iterdir(): if f.is_file() and f.suffix.lower() in DELETABLE_EXTENSIONS: files_to_delete.append(f) bytes_to_free += f.stat().st_size if dry_run: print(f"\n[DRY RUN] Would delete {len(files_to_delete)} files, freeing {bytes_to_free/1e9:.2f} GB") print("\nSample files to delete:") for f in files_to_delete[:10]: print(f" {f.relative_to(study_path)}") if len(files_to_delete) > 10: print(f" ... and {len(files_to_delete) - 10} more") return 0, 0 # Actually delete deleted = 0 freed = 0 for f in files_to_delete: try: sz = f.stat().st_size f.unlink() deleted += 1 freed += sz except Exception as e: logger.error(f"Failed to delete {f}: {e}") print(f"Deleted {deleted} files, freed {freed/1e9:.2f} GB") return deleted, freed def archive_to_remote( study_path: Path, use_tailscale: bool = False, dry_run: bool = True ) -> bool: """ Archive a study to the remote dalidou server. Args: study_path: Path to study folder use_tailscale: Use Tailscale IP (for remote access) dry_run: If True, only report what would be done Returns: True if successful """ study_path = Path(study_path) study_name = study_path.name host = REMOTE_CONFIG["host_tailscale"] if use_tailscale else REMOTE_CONFIG["host"] user = REMOTE_CONFIG["user"] remote_path = REMOTE_CONFIG["archive_path"] # Create compressed archive locally first archive_name = f"{study_name}_{datetime.now().strftime('%Y%m%d')}.tar.gz" local_archive = study_path.parent / archive_name if dry_run: print(f"\n[DRY RUN] Would archive {study_name}") print(f" 1. Create {archive_name}") print(f" 2. Upload to {user}@{host}:{remote_path}/") print(f" 3. Delete local archive") return True print(f"Creating archive: {archive_name}") with tarfile.open(local_archive, "w:gz") as tar: tar.add(study_path, arcname=study_name) archive_size = local_archive.stat().st_size print(f"Archive size: {archive_size/1e9:.2f} GB") # Upload via rsync (more reliable than scp for large files) print(f"Uploading to {host}...") # First ensure remote directory exists ssh_cmd = f'ssh {user}@{host} "mkdir -p {remote_path}"' subprocess.run(ssh_cmd, shell=True, check=True) # Upload rsync_cmd = f'rsync -avz --progress "{local_archive}" {user}@{host}:{remote_path}/' result = subprocess.run(rsync_cmd, shell=True) if result.returncode == 0: print("Upload successful!") # Clean up local archive local_archive.unlink() return True else: print(f"Upload failed with code {result.returncode}") return False def restore_from_remote( study_name: str, target_dir: Path, use_tailscale: bool = False ) -> bool: """ Restore a study from the remote server. Args: study_name: Name of the study to restore target_dir: Where to extract the study use_tailscale: Use Tailscale IP Returns: True if successful """ host = REMOTE_CONFIG["host_tailscale"] if use_tailscale else REMOTE_CONFIG["host"] user = REMOTE_CONFIG["user"] remote_path = REMOTE_CONFIG["archive_path"] target_dir = Path(target_dir) # Find the archive on remote print(f"Looking for {study_name} on {host}...") ssh_cmd = f'ssh {user}@{host} "ls {remote_path}/{study_name}*.tar.gz 2>/dev/null | head -1"' result = subprocess.run(ssh_cmd, shell=True, capture_output=True, text=True) if not result.stdout.strip(): print(f"No archive found for {study_name}") return False remote_archive = result.stdout.strip() local_archive = target_dir / Path(remote_archive).name print(f"Downloading: {remote_archive}") rsync_cmd = f'rsync -avz --progress {user}@{host}:"{remote_archive}" "{local_archive}"' result = subprocess.run(rsync_cmd, shell=True) if result.returncode != 0: print("Download failed") return False print("Extracting...") with tarfile.open(local_archive, "r:gz") as tar: tar.extractall(target_dir) # Clean up local_archive.unlink() print(f"Restored to {target_dir / study_name}") return True def list_remote_archives(use_tailscale: bool = False) -> List[Dict]: """List all archived studies on the remote server.""" host = REMOTE_CONFIG["host_tailscale"] if use_tailscale else REMOTE_CONFIG["host"] user = REMOTE_CONFIG["user"] remote_path = REMOTE_CONFIG["archive_path"] ssh_cmd = f'ssh {user}@{host} "ls -lh {remote_path}/*.tar.gz 2>/dev/null"' result = subprocess.run(ssh_cmd, shell=True, capture_output=True, text=True) archives = [] for line in result.stdout.strip().split('\n'): if line and '.tar.gz' in line: parts = line.split() if len(parts) >= 9: archives.append({ "name": parts[-1].split('/')[-1], "size": parts[4], "date": f"{parts[5]} {parts[6]} {parts[7]}", }) return archives def analyze_all_studies(studies_dir: Path) -> Dict: """Analyze all studies in a directory.""" studies_dir = Path(studies_dir) total_analysis = { "total_size": 0, "total_essential": 0, "total_deletable": 0, "studies": [], } for study in sorted(studies_dir.iterdir()): if study.is_dir() and not study.name.startswith('.'): analysis = analyze_study(study) total_analysis["studies"].append(analysis) total_analysis["total_size"] += analysis["total_size_bytes"] total_analysis["total_essential"] += analysis["essential_size"] total_analysis["total_deletable"] += analysis["deletable_size"] return total_analysis def main(): import argparse parser = argparse.ArgumentParser(description="Atomizer Study Archiver") parser.add_argument("command", choices=["analyze", "cleanup", "archive", "restore", "list"]) parser.add_argument("path", nargs="?", help="Study path or name") parser.add_argument("--dry-run", action="store_true", default=True, help="Don't actually delete/transfer (default: True)") parser.add_argument("--execute", action="store_true", help="Actually perform the operation") parser.add_argument("--tailscale", action="store_true", help="Use Tailscale IP for remote access") args = parser.parse_args() dry_run = not args.execute if args.command == "analyze": if not args.path: print("Usage: study_archiver analyze ") return path = Path(args.path) if path.is_dir(): # Check if it's a single study or a collection if (path / "optimization_config.json").exists() or (path / "1_setup").exists(): # Single study analysis = analyze_study(path) print_analysis(analysis) else: # Collection of studies total = analyze_all_studies(path) print(f"\n{'='*60}") print(f"Summary: {len(total['studies'])} studies") print(f"{'='*60}") print(f"Total size: {total['total_size']/1e9:8.2f} GB") print(f"Essential: {total['total_essential']/1e9:8.2f} GB") print(f"Deletable: {total['total_deletable']/1e9:8.2f} GB") print(f"Potential save: {total['total_deletable']/1e9:8.2f} GB") print(f"\nPer study:") for s in total["studies"]: print(f" {s['study_name']:40} {s['total_size_bytes']/1e9:6.2f} GB ({s['trial_count']:3d} trials)") elif args.command == "cleanup": if not args.path: print("Usage: study_archiver cleanup [--execute]") return cleanup_study(Path(args.path), dry_run=dry_run) elif args.command == "archive": if not args.path: print("Usage: study_archiver archive [--execute] [--tailscale]") return archive_to_remote(Path(args.path), use_tailscale=args.tailscale, dry_run=dry_run) elif args.command == "restore": if not args.path: print("Usage: study_archiver restore [--tailscale]") return target = Path.cwd() / "studies" restore_from_remote(args.path, target, use_tailscale=args.tailscale) elif args.command == "list": archives = list_remote_archives(use_tailscale=args.tailscale) if archives: print(f"\nArchived studies on dalidou:") print(f"{'='*60}") for a in archives: print(f" {a['name']:40} {a['size']:>8} {a['date']}") else: print("No archives found (or server not reachable)") if __name__ == "__main__": main()