Atomizer/optimization_engine/utils/study_archiver.py

"""
Study Archiver - Disk Space Optimization for Atomizer Studies
This module provides utilities for:
1. Cleaning up completed studies (removing regenerable files)
2. Archiving studies to remote storage (dalidou server)
3. Restoring archived studies on-demand
Usage:
# Cleanup a completed study (keep only essential files)
python -m optimization_engine.utils.study_archiver cleanup studies/M1_Mirror/m1_mirror_V12
# Archive to remote server
python -m optimization_engine.utils.study_archiver archive studies/M1_Mirror/m1_mirror_V12
# Restore from remote
python -m optimization_engine.utils.study_archiver restore m1_mirror_V12
# Show disk usage analysis
python -m optimization_engine.utils.study_archiver analyze studies/M1_Mirror
"""

import os
import json
import shutil
import tarfile
import subprocess
from pathlib import Path
from datetime import datetime
from typing import Optional, Dict, List, Tuple
import logging

logger = logging.getLogger(__name__)

# Configuration
REMOTE_CONFIG = {
    "host": "192.168.86.50",            # Local WiFi
    "host_tailscale": "100.80.199.40",  # Remote via Tailscale
    "user": "papa",
    "archive_path": "/srv/storage/atomizer-archive",
    "ssh_port": 22,
}
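
# For reference, with the defaults above an archive is uploaded to
#   papa@192.168.86.50:/srv/storage/atomizer-archive/<study_name>_<YYYYMMDD>.tar.gz
# (the Tailscale host is used instead when use_tailscale=True / --tailscale is passed).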

# Files to KEEP per trial (essential for analysis)
ESSENTIAL_EXTENSIONS = {
    '.op2',   # Nastran binary results (Zernike extraction)
    '.json',  # Parameters, results, metadata
    '.npz',   # Pre-computed Zernike coefficients
    '.html',  # Generated reports
    '.png',   # Visualization images
    '.csv',   # Exported data
}

# Files to DELETE per trial (regenerable from master + params)
DELETABLE_EXTENSIONS = {
    '.prt',   # NX part files (copy of master)
    '.fem',   # FEM mesh files (copy of master)
    '.sim',   # Simulation files (copy of master)
    '.afm',   # Assembly FEM files
    '.dat',   # Solver input deck (can regenerate)
    '.f04',   # Nastran output log
    '.f06',   # Nastran printed output
    '.log',   # Generic log files
    '.diag',  # Diagnostic files
    '.txt',   # Temp text files
    '.exp',   # Expression files
    '.bak',   # Backup files
}

# Folders to always keep entirely
KEEP_FOLDERS = {
    '1_setup',               # Master model files (source of truth)
    '3_results',             # Final results, database, reports
    'best_design_archive',   # Archived best designs
}
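
# Note: cleanup_study() below only deletes files sitting directly inside trial
# folders under 2_iterations, so the folders listed in KEEP_FOLDERS (and anything
# else outside 2_iterations) are never touched.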


def analyze_study(study_path: Path) -> Dict:
    """Analyze disk usage of a study folder."""
    study_path = Path(study_path)
    analysis = {
        "study_name": study_path.name,
        "total_size_bytes": 0,
        "by_extension": {},
        "by_folder": {},
        "essential_size": 0,
        "deletable_size": 0,
        "trial_count": 0,
    }

    for f in study_path.rglob("*"):
        if f.is_file():
            sz = f.stat().st_size
            ext = f.suffix.lower()
            analysis["total_size_bytes"] += sz
            analysis["by_extension"][ext] = analysis["by_extension"].get(ext, 0) + sz

            # Categorize by folder
            rel_parts = f.relative_to(study_path).parts
            if rel_parts:
                folder = rel_parts[0]
                analysis["by_folder"][folder] = analysis["by_folder"].get(folder, 0) + sz

            # Essential vs deletable
            if ext in ESSENTIAL_EXTENSIONS:
                analysis["essential_size"] += sz
            elif ext in DELETABLE_EXTENSIONS:
                analysis["deletable_size"] += sz

    # Count trials
    iterations_dir = study_path / "2_iterations"
    if iterations_dir.exists():
        analysis["trial_count"] = len([
            d for d in iterations_dir.iterdir()
            if d.is_dir() and (d.name.startswith("trial_") or d.name.startswith("iter"))
        ])

    return analysis
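
# Example (hypothetical sizes) of the returned analysis dict:
#   analyze_study(Path("studies/M1_Mirror/m1_mirror_V12"))
#   -> {"study_name": "m1_mirror_V12", "total_size_bytes": 52_000_000_000,
#       "by_extension": {".op2": ..., ".dat": ...}, "by_folder": {"2_iterations": ...},
#       "essential_size": ..., "deletable_size": ..., "trial_count": 150}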


def print_analysis(analysis: Dict):
    """Print formatted analysis results."""
    total_gb = analysis["total_size_bytes"] / 1e9
    essential_gb = analysis["essential_size"] / 1e9
    deletable_gb = analysis["deletable_size"] / 1e9

    print(f"\n{'='*60}")
    print(f"Study: {analysis['study_name']}")
    print(f"{'='*60}")
    print(f"Total size: {total_gb:8.2f} GB")
    print(f"Trials: {analysis['trial_count']:8d}")
    print(f"Essential: {essential_gb:8.2f} GB ({100*essential_gb/total_gb:.1f}%)")
    print(f"Deletable: {deletable_gb:8.2f} GB ({100*deletable_gb/total_gb:.1f}%)")
    print(f"Potential save: {deletable_gb:8.2f} GB")

    print(f"\nBy folder:")
    for folder, size in sorted(analysis["by_folder"].items(), key=lambda x: -x[1]):
        print(f" {folder:25} {size/1e9:8.2f} GB")

    print(f"\nTop extensions:")
    for ext, size in sorted(analysis["by_extension"].items(), key=lambda x: -x[1])[:10]:
        status = "[KEEP]" if ext in ESSENTIAL_EXTENSIONS else "[DEL?]" if ext in DELETABLE_EXTENSIONS else "[ ]"
        print(f" {status} {ext:10} {size/1e9:8.2f} GB")


def cleanup_study(study_path: Path, dry_run: bool = True) -> Tuple[int, int]:
    """
    Clean up a completed study by removing regenerable files from trial folders.

    Args:
        study_path: Path to study folder
        dry_run: If True, only report what would be deleted

    Returns:
        (files_deleted, bytes_freed)
    """
    study_path = Path(study_path)
    iterations_dir = study_path / "2_iterations"

    if not iterations_dir.exists():
        logger.warning(f"No iterations folder found in {study_path}")
        return 0, 0

    files_to_delete = []
    bytes_to_free = 0

    # Find all deletable files in trial folders
    for trial_dir in iterations_dir.iterdir():
        if not trial_dir.is_dir():
            continue
        for f in trial_dir.iterdir():
            if f.is_file() and f.suffix.lower() in DELETABLE_EXTENSIONS:
                files_to_delete.append(f)
                bytes_to_free += f.stat().st_size

    if dry_run:
        print(f"\n[DRY RUN] Would delete {len(files_to_delete)} files, freeing {bytes_to_free/1e9:.2f} GB")
        print("\nSample files to delete:")
        for f in files_to_delete[:10]:
            print(f" {f.relative_to(study_path)}")
        if len(files_to_delete) > 10:
            print(f" ... and {len(files_to_delete) - 10} more")
        return 0, 0

    # Actually delete
    deleted = 0
    freed = 0
    for f in files_to_delete:
        try:
            sz = f.stat().st_size
            f.unlink()
            deleted += 1
            freed += sz
        except Exception as e:
            logger.error(f"Failed to delete {f}: {e}")

    print(f"Deleted {deleted} files, freed {freed/1e9:.2f} GB")
    return deleted, freed
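
# Typical workflow: call cleanup_study(path) first (dry_run=True, the default) to
# review the report, then cleanup_study(path, dry_run=False) to delete. Only files
# sitting directly inside each trial folder are scanned (the walk is non-recursive).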


def archive_to_remote(
    study_path: Path,
    use_tailscale: bool = False,
    dry_run: bool = True
) -> bool:
    """
    Archive a study to the remote dalidou server.

    Args:
        study_path: Path to study folder
        use_tailscale: Use Tailscale IP (for remote access)
        dry_run: If True, only report what would be done

    Returns:
        True if successful
    """
    study_path = Path(study_path)
    study_name = study_path.name
    host = REMOTE_CONFIG["host_tailscale"] if use_tailscale else REMOTE_CONFIG["host"]
    user = REMOTE_CONFIG["user"]
    remote_path = REMOTE_CONFIG["archive_path"]

    # Create compressed archive locally first
    archive_name = f"{study_name}_{datetime.now().strftime('%Y%m%d')}.tar.gz"
    local_archive = study_path.parent / archive_name

    if dry_run:
        print(f"\n[DRY RUN] Would archive {study_name}")
        print(f" 1. Create {archive_name}")
        print(f" 2. Upload to {user}@{host}:{remote_path}/")
        print(f" 3. Delete local archive")
        return True

    print(f"Creating archive: {archive_name}")
    with tarfile.open(local_archive, "w:gz") as tar:
        tar.add(study_path, arcname=study_name)

    archive_size = local_archive.stat().st_size
    print(f"Archive size: {archive_size/1e9:.2f} GB")

    # Upload via rsync (more reliable than scp for large files)
    print(f"Uploading to {host}...")

    # First ensure remote directory exists
    ssh_cmd = f'ssh {user}@{host} "mkdir -p {remote_path}"'
    subprocess.run(ssh_cmd, shell=True, check=True)

    # Upload
    rsync_cmd = f'rsync -avz --progress "{local_archive}" {user}@{host}:{remote_path}/'
    result = subprocess.run(rsync_cmd, shell=True)

    if result.returncode == 0:
        print("Upload successful!")
        # Clean up local archive
        local_archive.unlink()
        return True
    else:
        print(f"Upload failed with code {result.returncode}")
        return False
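
# The tarball includes the study as-is (tar.add above archives every file), so
# running the cleanup command first can shrink the upload considerably. The
# temporary .tar.gz is written next to the study folder, so that drive needs
# enough free space to hold it before the upload starts.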


def restore_from_remote(
    study_name: str,
    target_dir: Path,
    use_tailscale: bool = False
) -> bool:
    """
    Restore a study from the remote server.

    Args:
        study_name: Name of the study to restore
        target_dir: Where to extract the study
        use_tailscale: Use Tailscale IP

    Returns:
        True if successful
    """
    host = REMOTE_CONFIG["host_tailscale"] if use_tailscale else REMOTE_CONFIG["host"]
    user = REMOTE_CONFIG["user"]
    remote_path = REMOTE_CONFIG["archive_path"]
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)  # ensure the destination exists before rsync writes into it

    # Find the archive on remote
    print(f"Looking for {study_name} on {host}...")
    ssh_cmd = f'ssh {user}@{host} "ls {remote_path}/{study_name}*.tar.gz 2>/dev/null | head -1"'
    result = subprocess.run(ssh_cmd, shell=True, capture_output=True, text=True)

    if not result.stdout.strip():
        print(f"No archive found for {study_name}")
        return False

    remote_archive = result.stdout.strip()
    local_archive = target_dir / Path(remote_archive).name

    print(f"Downloading: {remote_archive}")
    rsync_cmd = f'rsync -avz --progress {user}@{host}:"{remote_archive}" "{local_archive}"'
    result = subprocess.run(rsync_cmd, shell=True)

    if result.returncode != 0:
        print("Download failed")
        return False

    print("Extracting...")
    with tarfile.open(local_archive, "r:gz") as tar:
        tar.extractall(target_dir)

    # Clean up
    local_archive.unlink()

    print(f"Restored to {target_dir / study_name}")
    return True
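
# Extraction uses tarfile.extractall() on an archive this module created itself;
# if archives could ever come from an untrusted source, Python 3.12+ supports
# tar.extractall(target_dir, filter="data") to reject unsafe member paths.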


def list_remote_archives(use_tailscale: bool = False) -> List[Dict]:
    """List all archived studies on the remote server."""
    host = REMOTE_CONFIG["host_tailscale"] if use_tailscale else REMOTE_CONFIG["host"]
    user = REMOTE_CONFIG["user"]
    remote_path = REMOTE_CONFIG["archive_path"]

    ssh_cmd = f'ssh {user}@{host} "ls -lh {remote_path}/*.tar.gz 2>/dev/null"'
    result = subprocess.run(ssh_cmd, shell=True, capture_output=True, text=True)

    archives = []
    for line in result.stdout.strip().split('\n'):
        if line and '.tar.gz' in line:
            parts = line.split()
            if len(parts) >= 9:
                archives.append({
                    "name": parts[-1].split('/')[-1],
                    "size": parts[4],
                    "date": f"{parts[5]} {parts[6]} {parts[7]}",
                })
    return archives
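
# Parsing note: the size/date fields above assume the standard long-listing layout
# of `ls -lh` (permissions, links, owner, group, size, month, day, time, name); a
# remote host with a different ls output format would need this parser adjusted.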


def analyze_all_studies(studies_dir: Path) -> Dict:
    """Analyze all studies in a directory."""
    studies_dir = Path(studies_dir)
    total_analysis = {
        "total_size": 0,
        "total_essential": 0,
        "total_deletable": 0,
        "studies": [],
    }

    for study in sorted(studies_dir.iterdir()):
        if study.is_dir() and not study.name.startswith('.'):
            analysis = analyze_study(study)
            total_analysis["studies"].append(analysis)
            total_analysis["total_size"] += analysis["total_size_bytes"]
            total_analysis["total_essential"] += analysis["essential_size"]
            total_analysis["total_deletable"] += analysis["deletable_size"]

    return total_analysis


def main():
    import argparse

    parser = argparse.ArgumentParser(description="Atomizer Study Archiver")
    parser.add_argument("command", choices=["analyze", "cleanup", "archive", "restore", "list"])
    parser.add_argument("path", nargs="?", help="Study path or name")
    parser.add_argument("--dry-run", action="store_true", default=True,
                        help="Don't actually delete/transfer (default: True)")
    parser.add_argument("--execute", action="store_true",
                        help="Actually perform the operation")
    parser.add_argument("--tailscale", action="store_true",
                        help="Use Tailscale IP for remote access")
    args = parser.parse_args()

    dry_run = not args.execute

    if args.command == "analyze":
        if not args.path:
            print("Usage: study_archiver analyze <path>")
            return
        path = Path(args.path)
        if path.is_dir():
            # Check if it's a single study or a collection
            if (path / "optimization_config.json").exists() or (path / "1_setup").exists():
                # Single study
                analysis = analyze_study(path)
                print_analysis(analysis)
            else:
                # Collection of studies
                total = analyze_all_studies(path)
                print(f"\n{'='*60}")
                print(f"Summary: {len(total['studies'])} studies")
                print(f"{'='*60}")
                print(f"Total size: {total['total_size']/1e9:8.2f} GB")
                print(f"Essential: {total['total_essential']/1e9:8.2f} GB")
                print(f"Deletable: {total['total_deletable']/1e9:8.2f} GB")
                print(f"Potential save: {total['total_deletable']/1e9:8.2f} GB")
                print(f"\nPer study:")
                for s in total["studies"]:
                    print(f" {s['study_name']:40} {s['total_size_bytes']/1e9:6.2f} GB ({s['trial_count']:3d} trials)")

    elif args.command == "cleanup":
        if not args.path:
            print("Usage: study_archiver cleanup <study_path> [--execute]")
            return
        cleanup_study(Path(args.path), dry_run=dry_run)

    elif args.command == "archive":
        if not args.path:
            print("Usage: study_archiver archive <study_path> [--execute] [--tailscale]")
            return
        archive_to_remote(Path(args.path), use_tailscale=args.tailscale, dry_run=dry_run)

    elif args.command == "restore":
        if not args.path:
            print("Usage: study_archiver restore <study_name> [--tailscale]")
            return
        target = Path.cwd() / "studies"
        restore_from_remote(args.path, target, use_tailscale=args.tailscale)

    elif args.command == "list":
        archives = list_remote_archives(use_tailscale=args.tailscale)
        if archives:
            print(f"\nArchived studies on dalidou:")
            print(f"{'='*60}")
            for a in archives:
                print(f" {a['name']:40} {a['size']:>8} {a['date']}")
        else:
            print("No archives found (or server not reachable)")


if __name__ == "__main__":
    main()
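
# Programmatic use (a minimal sketch; the study path is just an example):
#   from pathlib import Path
#   from optimization_engine.utils.study_archiver import analyze_study, print_analysis
#   print_analysis(analyze_study(Path("studies/M1_Mirror/m1_mirror_V12")))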