feat: Pre-migration checkpoint - updated docs and utilities
Updates before optimization_engine migration: - Updated migration plan to v2.1 with complete file inventory - Added OP_07 disk optimization protocol - Added SYS_16 self-aware turbo protocol - Added study archiver and cleanup utilities - Added ensemble surrogate module - Updated NX solver and session manager - Updated zernike HTML generator - Added context engineering plan - LAC session insights updates 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
438
optimization_engine/utils/study_archiver.py
Normal file
438
optimization_engine/utils/study_archiver.py
Normal file
@@ -0,0 +1,438 @@
|
||||
"""
|
||||
Study Archiver - Disk Space Optimization for Atomizer Studies
|
||||
|
||||
This module provides utilities for:
|
||||
1. Cleaning up completed studies (removing regenerable files)
|
||||
2. Archiving studies to remote storage (dalidou server)
|
||||
3. Restoring archived studies on-demand
|
||||
|
||||
Usage:
|
||||
# Cleanup a completed study (keep only essential files); dry run unless --execute is added
|
||||
python -m optimization_engine.utils.study_archiver cleanup studies/M1_Mirror/m1_mirror_V12
|
||||
|
||||
# Archive to remote server (dry run unless --execute is added; --tailscale uses the Tailscale IP)
|
||||
python -m optimization_engine.utils.study_archiver archive studies/M1_Mirror/m1_mirror_V12
|
||||
|
||||
# Restore from remote
|
||||
python -m optimization_engine.utils.study_archiver restore m1_mirror_V12
|
||||
|
||||
# Show disk usage analysis
|
||||
python -m optimization_engine.utils.study_archiver analyze studies/M1_Mirror
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import shutil
|
||||
import tarfile
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, List, Tuple
|
||||
import logging
|
||||
|
||||
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)

# Configuration
# Connection settings for the remote archive server. They are read by
# archive_to_remote, restore_from_remote and list_remote_archives.
REMOTE_CONFIG = {
    # Address used by default (use_tailscale=False).
    "host": "192.168.86.50",  # Local WiFi
    # Address used when use_tailscale=True.
    "host_tailscale": "100.80.199.40",  # Remote via Tailscale
    # Remote account name, combined with the host as "user@host".
    "user": "papa",
    # Remote directory that holds the *.tar.gz study archives.
    "archive_path": "/srv/storage/atomizer-archive",
    # NOTE(review): no function in this module reads "ssh_port"; ssh and rsync
    # are invoked without a port option - confirm whether it should be wired in.
    "ssh_port": 22,
}
|
||||
|
||||
# Files to KEEP per trial (essential for analysis)
# Lower-case suffixes (with the leading dot). analyze_study adds the size of
# matching files to "essential_size", and print_analysis tags them "[KEEP]".
# cleanup_study does not consult this set; it deletes only suffixes listed in
# DELETABLE_EXTENSIONS, so a suffix in neither set is left untouched.
ESSENTIAL_EXTENSIONS = {
    '.op2',   # Nastran binary results (Zernike extraction)
    '.json',  # Parameters, results, metadata
    '.npz',   # Pre-computed Zernike coefficients
    '.html',  # Generated reports
    '.png',   # Visualization images
    '.csv',   # Exported data
}
|
||||
|
||||
# Files to DELETE per trial (regenerable from master + params)
# Lower-case suffixes (with the leading dot). Suffixes are compared after
# lower-casing, so matching is case-insensitive. cleanup_study removes files
# with these suffixes that sit directly inside each folder under
# "2_iterations". analyze_study adds their size to "deletable_size".
DELETABLE_EXTENSIONS = {
    '.prt',   # NX part files (copy of master)
    '.fem',   # FEM mesh files (copy of master)
    '.sim',   # Simulation files (copy of master)
    '.afm',   # Assembly FEM files
    '.dat',   # Solver input deck (can regenerate)
    '.f04',   # Nastran output log
    '.f06',   # Nastran printed output
    '.log',   # Generic log files
    '.diag',  # Diagnostic files
    '.txt',   # Temp text files
    '.exp',   # Expression files
    '.bak',   # Backup files
}
|
||||
|
||||
# Folders to always keep entirely
# NOTE(review): no function visible in this module reads KEEP_FOLDERS.
# cleanup_study only ever looks inside "2_iterations", so these folders are
# untouched by construction rather than through this set - confirm whether
# the set is meant to be enforced somewhere.
KEEP_FOLDERS = {
    '1_setup',              # Master model files (source of truth)
    '3_results',            # Final results, database, reports
    'best_design_archive',  # Archived best designs
}
|
||||
|
||||
|
||||
def analyze_study(study_path: Path) -> Dict:
    """Summarise the disk usage of one study folder.

    Every regular file below ``study_path`` is visited recursively. Its size
    is added to the grand total, to a per-suffix bucket (suffix lower-cased)
    and to a bucket named after the first path component relative to the
    study root. A file lying directly in the root is therefore bucketed under
    its own file name.

    Args:
        study_path: Study folder to inspect.

    Returns:
        Dict with the keys ``study_name``, ``total_size_bytes``,
        ``by_extension``, ``by_folder``, ``essential_size``,
        ``deletable_size`` and ``trial_count``. Sizes are in bytes.
    """
    root = Path(study_path)

    ext_sizes = {}
    folder_sizes = {}
    total = 0
    essential = 0
    deletable = 0

    for entry in root.rglob("*"):
        if not entry.is_file():
            continue

        nbytes = entry.stat().st_size
        suffix = entry.suffix.lower()

        total += nbytes
        ext_sizes[suffix] = ext_sizes.get(suffix, 0) + nbytes

        # The first component of the relative path names the top-level bucket.
        components = entry.relative_to(root).parts
        if components:
            top = components[0]
            folder_sizes[top] = folder_sizes.get(top, 0) + nbytes

        # A suffix found in neither set counts towards the total only.
        if suffix in ESSENTIAL_EXTENSIONS:
            essential += nbytes
        elif suffix in DELETABLE_EXTENSIONS:
            deletable += nbytes

    # Trials are the sub-folders of 2_iterations named "trial_*" or "iter*".
    trials = 0
    trials_root = root / "2_iterations"
    if trials_root.exists():
        trials = sum(
            1
            for child in trials_root.iterdir()
            if child.is_dir() and child.name.startswith(("trial_", "iter"))
        )

    return {
        "study_name": root.name,
        "total_size_bytes": total,
        "by_extension": ext_sizes,
        "by_folder": folder_sizes,
        "essential_size": essential,
        "deletable_size": deletable,
        "trial_count": trials,
    }
|
||||
|
||||
|
||||
def print_analysis(analysis: Dict):
    """Print formatted analysis results.

    Args:
        analysis: Dictionary in the shape returned by ``analyze_study``. The
            keys read are ``study_name``, ``total_size_bytes``,
            ``essential_size``, ``deletable_size``, ``trial_count``,
            ``by_folder`` and ``by_extension``. Sizes are in bytes.
    """
    total_gb = analysis["total_size_bytes"] / 1e9
    essential_gb = analysis["essential_size"] / 1e9
    deletable_gb = analysis["deletable_size"] / 1e9

    # An empty study totals 0 bytes; report 0% rather than dividing by zero.
    if total_gb:
        essential_pct = 100*essential_gb/total_gb
        deletable_pct = 100*deletable_gb/total_gb
    else:
        essential_pct = deletable_pct = 0.0

    print(f"\n{'='*60}")
    print(f"Study: {analysis['study_name']}")
    print(f"{'='*60}")
    print(f"Total size: {total_gb:8.2f} GB")
    print(f"Trials: {analysis['trial_count']:8d}")
    print(f"Essential: {essential_gb:8.2f} GB ({essential_pct:.1f}%)")
    print(f"Deletable: {deletable_gb:8.2f} GB ({deletable_pct:.1f}%)")
    print(f"Potential save: {deletable_gb:8.2f} GB")

    # Largest folders first.
    print(f"\nBy folder:")
    for folder, size in sorted(analysis["by_folder"].items(), key=lambda x: -x[1]):
        print(f" {folder:25} {size/1e9:8.2f} GB")

    # Ten largest suffixes, tagged by whether cleanup would keep or delete them.
    print(f"\nTop extensions:")
    for ext, size in sorted(analysis["by_extension"].items(), key=lambda x: -x[1])[:10]:
        status = "[KEEP]" if ext in ESSENTIAL_EXTENSIONS else "[DEL?]" if ext in DELETABLE_EXTENSIONS else "[ ]"
        print(f" {status} {ext:10} {size/1e9:8.2f} GB")
|
||||
|
||||
|
||||
def cleanup_study(study_path: Path, dry_run: bool = True) -> Tuple[int, int]:
    """
    Remove regenerable files from the trial folders of a completed study.

    Only files lying directly inside each sub-folder of ``2_iterations`` are
    considered, and only those whose lower-cased suffix is listed in
    DELETABLE_EXTENSIONS.

    Args:
        study_path: Path to study folder
        dry_run: If True, only report what would be deleted

    Returns:
        (files_deleted, bytes_freed); always (0, 0) for a dry run or when the
        study has no ``2_iterations`` folder.
    """
    study_path = Path(study_path)
    trials_root = study_path / "2_iterations"

    if not trials_root.exists():
        logger.warning(f"No iterations folder found in {study_path}")
        return 0, 0

    # Collect the candidates, one trial folder at a time.
    files_to_delete = []
    bytes_to_free = 0
    for trial_dir in trials_root.iterdir():
        if trial_dir.is_dir():
            for candidate in trial_dir.iterdir():
                if not candidate.is_file():
                    continue
                if candidate.suffix.lower() not in DELETABLE_EXTENSIONS:
                    continue
                files_to_delete.append(candidate)
                bytes_to_free += candidate.stat().st_size

    if dry_run:
        print(f"\n[DRY RUN] Would delete {len(files_to_delete)} files, freeing {bytes_to_free/1e9:.2f} GB")
        print("\nSample files to delete:")
        for f in files_to_delete[:10]:
            print(f" {f.relative_to(study_path)}")
        if len(files_to_delete) > 10:
            print(f" ... and {len(files_to_delete) - 10} more")
        return 0, 0

    # Real run: a failure on one file is logged and the rest are still processed.
    deleted = freed = 0
    for f in files_to_delete:
        try:
            sz = f.stat().st_size
            f.unlink()
        except Exception as e:
            logger.error(f"Failed to delete {f}: {e}")
        else:
            deleted += 1
            freed += sz

    print(f"Deleted {deleted} files, freed {freed/1e9:.2f} GB")
    return deleted, freed
|
||||
|
||||
|
||||
def archive_to_remote(
    study_path: Path,
    use_tailscale: bool = False,
    dry_run: bool = True
) -> bool:
    """
    Archive a study to the remote dalidou server.

    The study folder is packed into ``<study_name>_<YYYYMMDD>.tar.gz`` next to
    the study folder and uploaded with rsync. The local tarball is removed
    once the upload has succeeded; after a failed upload it is kept.

    Args:
        study_path: Path to study folder
        use_tailscale: Use Tailscale IP (for remote access)
        dry_run: If True, only report what would be done

    Returns:
        True if successful (always True for a dry run). False if ssh or rsync
        could not be started or exited with a non-zero status.
    """
    import shlex  # Local import: only the remote shell command needs quoting.

    study_path = Path(study_path)
    study_name = study_path.name

    host = REMOTE_CONFIG["host_tailscale"] if use_tailscale else REMOTE_CONFIG["host"]
    user = REMOTE_CONFIG["user"]
    remote_path = REMOTE_CONFIG["archive_path"]

    # Create compressed archive locally first
    archive_name = f"{study_name}_{datetime.now().strftime('%Y%m%d')}.tar.gz"
    local_archive = study_path.parent / archive_name

    if dry_run:
        print(f"\n[DRY RUN] Would archive {study_name}")
        print(f" 1. Create {archive_name}")
        print(f" 2. Upload to {user}@{host}:{remote_path}/")
        print(f" 3. Delete local archive")
        return True

    print(f"Creating archive: {archive_name}")
    try:
        with tarfile.open(local_archive, "w:gz") as tar:
            tar.add(study_path, arcname=study_name)
    except BaseException:
        # Do not leave a truncated tarball behind (e.g. after Ctrl-C or a full
        # disk); the original error is re-raised unchanged.
        if local_archive.exists():
            local_archive.unlink()
        raise

    archive_size = local_archive.stat().st_size
    print(f"Archive size: {archive_size/1e9:.2f} GB")

    # Upload via rsync (more reliable than scp for large files)
    print(f"Uploading to {host}...")

    # Commands are passed as argument lists (no local shell), so paths with
    # spaces or shell metacharacters cannot alter the command line.
    remote_login = f"{user}@{host}"
    try:
        # First ensure remote directory exists. The remote side still runs
        # this string through a shell, hence the quoting.
        mkdir = subprocess.run(
            ["ssh", remote_login, f"mkdir -p {shlex.quote(remote_path)}"]
        )
        if mkdir.returncode != 0:
            print(f"Could not create remote directory (ssh exit code {mkdir.returncode})")
            print(f"Local archive kept at {local_archive}")
            return False

        # Upload
        result = subprocess.run(
            ["rsync", "-avz", "--progress", str(local_archive),
             f"{remote_login}:{remote_path}/"]
        )
    except OSError as exc:
        # Raised when ssh/rsync is not installed or cannot be executed.
        print(f"Upload failed: {exc}")
        print(f"Local archive kept at {local_archive}")
        return False

    if result.returncode == 0:
        print("Upload successful!")
        # Clean up local archive
        local_archive.unlink()
        return True

    print(f"Upload failed with code {result.returncode}")
    print(f"Local archive kept at {local_archive}")
    return False
|
||||
|
||||
|
||||
def restore_from_remote(
    study_name: str,
    target_dir: Path,
    use_tailscale: bool = False
) -> bool:
    """
    Restore a study from the remote server.

    The first remote archive whose file name starts with ``study_name`` and
    ends in ``.tar.gz`` is downloaded into ``target_dir`` and extracted there.
    The downloaded tarball is removed afterwards.

    Args:
        study_name: Name of the study to restore
        target_dir: Where to extract the study (created if missing)
        use_tailscale: Use Tailscale IP

    Returns:
        True if successful. False if no matching archive was found, or if
        ssh/rsync could not be started or the download failed.

    Raises:
        tarfile.TarError / OSError: if the downloaded archive cannot be
            extracted.
    """
    import shlex  # Local import: only the remote shell command needs quoting.

    host = REMOTE_CONFIG["host_tailscale"] if use_tailscale else REMOTE_CONFIG["host"]
    user = REMOTE_CONFIG["user"]
    remote_path = REMOTE_CONFIG["archive_path"]

    target_dir = Path(target_dir)
    remote_login = f"{user}@{host}"

    # Find the archive on remote
    print(f"Looking for {study_name} on {host}...")

    # study_name comes from the command line. It is quoted for the remote
    # shell, and the command is passed as an argument list so that no local
    # shell interprets it. The trailing glob stays unquoted so it can expand.
    # NOTE(review): "head -1" picks the alphabetically first match, which is
    # the oldest date stamp when several archives of one study exist.
    pattern = f"{shlex.quote(remote_path)}/{shlex.quote(study_name)}*.tar.gz"
    try:
        result = subprocess.run(
            ["ssh", remote_login, f"ls {pattern} 2>/dev/null | head -1"],
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        # Raised when ssh is not installed or cannot be executed.
        print(f"Could not run ssh: {exc}")
        return False

    if not result.stdout.strip():
        print(f"No archive found for {study_name}")
        return False

    remote_archive = result.stdout.strip()

    # rsync and tarfile both need the destination folder to exist.
    target_dir.mkdir(parents=True, exist_ok=True)
    local_archive = target_dir / Path(remote_archive).name

    print(f"Downloading: {remote_archive}")
    try:
        result = subprocess.run(
            ["rsync", "-avz", "--progress",
             f"{remote_login}:{remote_archive}", str(local_archive)]
        )
    except OSError as exc:
        print(f"Download failed: {exc}")
        return False

    if result.returncode != 0:
        print("Download failed")
        return False

    print("Extracting...")
    try:
        with tarfile.open(local_archive, "r:gz") as tar:
            # The "data" filter rejects members that would land outside
            # target_dir (absolute paths, "..", unsafe links). It only exists
            # on Python versions that ship extraction filters.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(target_dir, filter="data")
            else:
                tar.extractall(target_dir)
    finally:
        # Clean up the downloaded tarball even when extraction fails.
        local_archive.unlink()

    print(f"Restored to {target_dir / study_name}")
    return True
|
||||
|
||||
|
||||
def list_remote_archives(use_tailscale: bool = False) -> List[Dict]:
    """Return one entry per ``*.tar.gz`` archive found on the remote server.

    The remote ``ls -lh`` output is split on whitespace. A row is used only
    if it mentions ``.tar.gz`` and has at least nine fields.

    Args:
        use_tailscale: Use the Tailscale address instead of the local one.

    Returns:
        List of dicts with the keys ``name`` (file name without directory),
        ``size`` (fifth ``ls`` column) and ``date`` (columns six to eight
        joined by spaces). Empty when the command prints nothing.
    """
    host = REMOTE_CONFIG["host_tailscale"] if use_tailscale else REMOTE_CONFIG["host"]
    user = REMOTE_CONFIG["user"]
    remote_path = REMOTE_CONFIG["archive_path"]

    ssh_cmd = f'ssh {user}@{host} "ls -lh {remote_path}/*.tar.gz 2>/dev/null"'
    listing = subprocess.run(ssh_cmd, shell=True, capture_output=True, text=True).stdout

    archives = []
    for row in listing.strip().split('\n'):
        # An empty row cannot contain the marker, so this also skips blanks.
        if '.tar.gz' not in row:
            continue
        fields = row.split()
        if len(fields) < 9:
            continue
        archives.append({
            "name": fields[-1].split('/')[-1],
            "size": fields[4],
            "date": " ".join(fields[5:8]),
        })

    return archives
|
||||
|
||||
|
||||
def analyze_all_studies(studies_dir: Path) -> Dict:
    """Run ``analyze_study`` on each study folder and total the results.

    Direct sub-folders of ``studies_dir`` are processed in sorted order.
    Plain files and folders whose name starts with a dot are skipped.

    Args:
        studies_dir: Folder containing one sub-folder per study.

    Returns:
        Dict with ``total_size``, ``total_essential`` and ``total_deletable``
        (bytes, summed over all studies) and ``studies`` (the per-study
        analysis dicts, in processing order).
    """
    root = Path(studies_dir)

    per_study = []
    for candidate in sorted(root.iterdir()):
        if not candidate.is_dir() or candidate.name.startswith('.'):
            continue
        per_study.append(analyze_study(candidate))

    return {
        "total_size": sum(item["total_size_bytes"] for item in per_study),
        "total_essential": sum(item["essential_size"] for item in per_study),
        "total_deletable": sum(item["deletable_size"] for item in per_study),
        "studies": per_study,
    }
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the study archiver.

    Commands: ``analyze``, ``cleanup``, ``archive``, ``restore``, ``list``.
    ``cleanup`` and ``archive`` only report what they would do unless
    ``--execute`` is given. When a required path/name argument is missing, a
    usage line is printed and the function returns without doing anything.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Atomizer Study Archiver")
    parser.add_argument("command", choices=["analyze", "cleanup", "archive", "restore", "list"])
    parser.add_argument("path", nargs="?", help="Study path or name")
    # Kept for backward compatibility only: dry-run is already the default and
    # the value of this flag is never read. Only --execute switches it off.
    parser.add_argument("--dry-run", action="store_true", default=True,
                        help="Don't actually delete/transfer (default: True)")
    parser.add_argument("--execute", action="store_true",
                        help="Actually perform the operation")
    parser.add_argument("--tailscale", action="store_true",
                        help="Use Tailscale IP for remote access")

    args = parser.parse_args()

    dry_run = not args.execute

    if args.command == "analyze":
        if not args.path:
            print("Usage: study_archiver analyze <path>")
            return

        path = Path(args.path)
        if not path.is_dir():
            # Previously this case printed nothing at all, which looked like
            # a successful run with no output.
            print(f"Not a directory: {path}")
            return

        # Check if it's a single study or a collection
        if (path / "optimization_config.json").exists() or (path / "1_setup").exists():
            # Single study
            analysis = analyze_study(path)
            print_analysis(analysis)
        else:
            # Collection of studies
            total = analyze_all_studies(path)
            print(f"\n{'='*60}")
            print(f"Summary: {len(total['studies'])} studies")
            print(f"{'='*60}")
            print(f"Total size: {total['total_size']/1e9:8.2f} GB")
            print(f"Essential: {total['total_essential']/1e9:8.2f} GB")
            print(f"Deletable: {total['total_deletable']/1e9:8.2f} GB")
            print(f"Potential save: {total['total_deletable']/1e9:8.2f} GB")
            print(f"\nPer study:")
            for s in total["studies"]:
                print(f" {s['study_name']:40} {s['total_size_bytes']/1e9:6.2f} GB ({s['trial_count']:3d} trials)")

    elif args.command == "cleanup":
        if not args.path:
            print("Usage: study_archiver cleanup <study_path> [--execute]")
            return
        cleanup_study(Path(args.path), dry_run=dry_run)

    elif args.command == "archive":
        if not args.path:
            print("Usage: study_archiver archive <study_path> [--execute] [--tailscale]")
            return
        archive_to_remote(Path(args.path), use_tailscale=args.tailscale, dry_run=dry_run)

    elif args.command == "restore":
        if not args.path:
            print("Usage: study_archiver restore <study_name> [--tailscale]")
            return
        # Studies are restored into ./studies relative to the current
        # working directory. Restore has no dry-run mode.
        target = Path.cwd() / "studies"
        restore_from_remote(args.path, target, use_tailscale=args.tailscale)

    elif args.command == "list":
        archives = list_remote_archives(use_tailscale=args.tailscale)
        if archives:
            print(f"\nArchived studies on dalidou:")
            print(f"{'='*60}")
            for a in archives:
                print(f" {a['name']:40} {a['size']:>8} {a['date']}")
        else:
            print("No archives found (or server not reachable)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user