# Atomizer/optimization_engine/nx/session_manager.py

"""
NX Session Manager - Prevents conflicts when multiple optimizations run concurrently.

This module ensures that NX sessions don't interfere with each other when:
1. Multiple optimizations are running simultaneously
2. User has NX open for manual work
3. Multiple Atomizer instances are running

Key Features:
- Session detection (running NX processes)
- File locking (prevents concurrent access to same model)
- Process queuing (waits if NX is busy with another optimization)
- Batch mode isolation (uses dedicated NX instances)
"""

import psutil
import time
import os
from pathlib import Path
from typing import Optional, List
from contextlib import contextmanager
from dataclasses import dataclass
import json

# Platform-specific imports
if os.name != 'nt':  # Unix/Linux/Mac
    import fcntl
else:  # Windows
    import msvcrt


@dataclass
class NXSessionInfo:
    """Information about a running NX session.

    Plain record describing one operating-system process that was recognized
    as an NX executable. Instances are built from the per-process info
    gathered through psutil.
    """
    # Operating-system process id.
    pid: int
    # Process name as reported by psutil (e.g. 'ugraf.exe').
    name: str
    # Command-line arguments of the process; an empty list when none were reported.
    cmdline: List[str]
    # Current working directory of the process, or None when it was not available.
    working_dir: Optional[str]
    # Process creation time as reported by psutil
    # (presumably seconds since the epoch - confirm against psutil docs).
    create_time: float


class NXSessionManager:
    """
    Manages NX sessions to prevent conflicts between concurrent optimizations.

    Strategy:
    1. Detect running NX processes
    2. Use file locks to ensure exclusive model access
    3. Queue optimization trials if NX is busy
    4. Isolate batch mode sessions from interactive sessions
    """

    def __init__(
        self,
        lock_dir: Optional[Path] = None,
        max_concurrent_sessions: int = 1,
        wait_timeout: int = 300,
        verbose: bool = True
    ):
        """
        Initialize session manager.

        Args:
            lock_dir: Directory for lock files (default: temp)
            max_concurrent_sessions: Maximum concurrent NX optimization sessions
            wait_timeout: Maximum wait time for NX to become available (seconds)
            verbose: Print session management info
        """
        self.lock_dir = Path(lock_dir) if lock_dir else Path.home() / ".atomizer" / "locks"
        self.lock_dir.mkdir(parents=True, exist_ok=True)

        self.max_concurrent = max_concurrent_sessions
        self.wait_timeout = wait_timeout
        self.verbose = verbose

        # Track active sessions
        self.session_lock_file = self.lock_dir / "nx_sessions.json"
        self.global_lock_file = self.lock_dir / "nx_global.lock"

    def get_running_nx_sessions(self) -> List[NXSessionInfo]:
        """
        Detect all running NX processes.

        Returns:
            List of NX session info objects
        """
        nx_sessions = []

        for proc in psutil.process_iter(['pid', 'name', 'cmdline', 'cwd', 'create_time']):
            try:
                name = proc.info['name']

                # Check if this is an NX process
                if name and any(nx_exe in name.lower() for nx_exe in ['ugraf.exe', 'nx.exe', 'run_journal.exe', 'nxmgr_inter.exe']):
                    session = NXSessionInfo(
                        pid=proc.info['pid'],
                        name=name,
                        cmdline=proc.info['cmdline'] or [],
                        working_dir=proc.info['cwd'],
                        create_time=proc.info['create_time']
                    )
                    nx_sessions.append(session)

            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue

        return nx_sessions

    def is_nx_interactive_session_running(self) -> bool:
        """
        Check if user has NX open interactively (not batch mode).

        Returns:
            True if interactive NX session detected
        """
        sessions = self.get_running_nx_sessions()

        for session in sessions:
            # Interactive sessions are typically ugraf.exe or nx.exe without -batch
            if 'ugraf.exe' in session.name.lower() or 'nx.exe' in session.name.lower():
                # Check if it's not a batch session
                if '-batch' not in ' '.join(session.cmdline).lower():
                    return True

        return False

    @contextmanager
    def acquire_model_lock(self, model_file: Path, study_name: str):
        """
        Acquire exclusive lock for a specific model file.

        This prevents two optimizations from modifying the same model simultaneously.

        Args:
            model_file: Path to the model file (.prt)
            study_name: Name of the study (for logging)

        Yields:
            Lock context

        Raises:
            TimeoutError: If lock cannot be acquired within timeout
        """
        # Create lock file for this specific model
        model_hash = str(abs(hash(str(model_file))))
        lock_file = self.lock_dir / f"model_{model_hash}.lock"

        if self.verbose:
            print(f"\n[SESSION MGR] Acquiring lock for model: {model_file.name}")

        lock_fd = None
        start_time = time.time()

        try:
            # Try to acquire lock with timeout
            while True:
                try:
                    lock_fd = open(lock_file, 'w')

                    # Try to acquire exclusive lock (non-blocking)
                    if os.name == 'nt':  # Windows
                        import msvcrt
                        msvcrt.locking(lock_fd.fileno(), msvcrt.LK_NBLCK, 1)
                    else:  # Unix
                        fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)

                    # Write lock info
                    lock_info = {
                        'study_name': study_name,
                        'model_file': str(model_file),
                        'pid': os.getpid(),
                        'timestamp': time.time()
                    }
                    lock_fd.write(json.dumps(lock_info, indent=2))
                    lock_fd.flush()

                    if self.verbose:
                        print(f"[SESSION MGR] Lock acquired successfully")

                    break  # Lock acquired!

                except (IOError, OSError):
                    # Lock is held by another process
                    elapsed = time.time() - start_time

                    if elapsed > self.wait_timeout:
                        raise TimeoutError(
                            f"Could not acquire model lock for {model_file.name} "
                            f"after {self.wait_timeout}s. Another optimization may be using this model."
                        )

                    if self.verbose and elapsed % 10 == 0:  # Print every 10 seconds
                        print(f"[SESSION MGR] Waiting for model lock... ({elapsed:.0f}s)")

                    time.sleep(1)

            yield  # Lock acquired, user code runs here

        finally:
            # Release lock
            if lock_fd:
                try:
                    if os.name == 'nt':
                        import msvcrt
                        msvcrt.locking(lock_fd.fileno(), msvcrt.LK_UNLCK, 1)
                    else:
                        fcntl.flock(lock_fd, fcntl.LOCK_UN)

                    lock_fd.close()

                    if self.verbose:
                        print(f"[SESSION MGR] Lock released for model: {model_file.name}")

                except Exception as e:
                    if self.verbose:
                        print(f"[SESSION MGR] Warning: Failed to release lock: {e}")

            # Clean up lock file
            try:
                if lock_file.exists():
                    lock_file.unlink()
            except:
                pass

    @contextmanager
    def acquire_nx_session(self, study_name: str):
        """
        Acquire permission to run an NX batch session.

        This ensures we don't exceed max_concurrent_sessions and
        don't interfere with interactive NX sessions.

        Args:
            study_name: Name of the study (for logging)

        Yields:
            Session context

        Raises:
            TimeoutError: If session cannot be acquired within timeout
        """
        if self.verbose:
            print(f"\n[SESSION MGR] Requesting NX batch session for study: {study_name}")

        # Check for interactive NX sessions
        if self.is_nx_interactive_session_running():
            if self.verbose:
                print(f"[SESSION MGR] WARNING: Interactive NX session detected!")
                print(f"[SESSION MGR] Batch operations may conflict with user's work.")
                print(f"[SESSION MGR] Recommend closing interactive NX before running optimization.")

        start_time = time.time()
        session_acquired = False

        try:
            # Wait for available session slot
            while True:
                active_sessions = self._count_active_sessions()

                if active_sessions < self.max_concurrent:
                    # Register this session
                    self._register_session(study_name)
                    session_acquired = True

                    if self.verbose:
                        print(f"[SESSION MGR] NX session acquired (active: {active_sessions + 1}/{self.max_concurrent})")

                    break

                # Check timeout
                elapsed = time.time() - start_time
                if elapsed > self.wait_timeout:
                    raise TimeoutError(
                        f"Could not acquire NX session after {self.wait_timeout}s. "
                        f"Max concurrent sessions ({self.max_concurrent}) reached."
                    )

                if self.verbose and elapsed % 10 == 0:
                    print(f"[SESSION MGR] Waiting for NX session... ({elapsed:.0f}s, active: {active_sessions})")

                time.sleep(2)

            yield  # Session acquired, user code runs here

        finally:
            # Unregister session
            if session_acquired:
                self._unregister_session(study_name)

                if self.verbose:
                    print(f"[SESSION MGR] NX session released for study: {study_name}")

    def _count_active_sessions(self) -> int:
        """Count active optimization sessions."""
        if not self.session_lock_file.exists():
            return 0

        try:
            with open(self.session_lock_file, 'r') as f:
                sessions = json.load(f)

            # Clean up stale sessions (processes that no longer exist)
            active_sessions = []
            for session in sessions:
                pid = session.get('pid')
                if pid and psutil.pid_exists(pid):
                    active_sessions.append(session)

            # Update file with only active sessions
            with open(self.session_lock_file, 'w') as f:
                json.dump(active_sessions, f, indent=2)

            return len(active_sessions)

        except Exception as e:
            if self.verbose:
                print(f"[SESSION MGR] Warning: Failed to count sessions: {e}")
            return 0

    def _register_session(self, study_name: str):
        """Register a new optimization session."""
        sessions = []

        if self.session_lock_file.exists():
            try:
                with open(self.session_lock_file, 'r') as f:
                    sessions = json.load(f)
            except:
                sessions = []

        # Add new session
        sessions.append({
            'study_name': study_name,
            'pid': os.getpid(),
            'start_time': time.time(),
            'timestamp': time.time()
        })

        # Save
        with open(self.session_lock_file, 'w') as f:
            json.dump(sessions, f, indent=2)

    def _unregister_session(self, study_name: str):
        """Unregister an optimization session."""
        if not self.session_lock_file.exists():
            return

        try:
            with open(self.session_lock_file, 'r') as f:
                sessions = json.load(f)

            # Remove this session
            pid = os.getpid()
            sessions = [s for s in sessions if s.get('pid') != pid]

            # Save
            with open(self.session_lock_file, 'w') as f:
                json.dump(sessions, f, indent=2)

        except Exception as e:
            if self.verbose:
                print(f"[SESSION MGR] Warning: Failed to unregister session: {e}")

    def cleanup_stale_locks(self):
        """Remove lock files from crashed processes."""
        if not self.lock_dir.exists():
            return

        cleaned = 0

        for lock_file in self.lock_dir.glob("*.lock"):
            try:
                # Try to read lock info
                with open(lock_file, 'r') as f:
                    lock_info = json.load(f)

                pid = lock_info.get('pid')

                # Check if process still exists
                if pid and not psutil.pid_exists(pid):
                    lock_file.unlink()
                    cleaned += 1

                    if self.verbose:
                        print(f"[SESSION MGR] Cleaned stale lock: {lock_file.name}")

            except Exception:
                # If we can't read lock file, it might be corrupted - remove it
                try:
                    lock_file.unlink()
                    cleaned += 1
                except:
                    pass

        if self.verbose and cleaned > 0:
            print(f"[SESSION MGR] Cleaned {cleaned} stale lock file(s)")

    def get_status_report(self) -> str:
        """Generate status report of NX sessions and locks."""
        report = "\n" + "="*70 + "\n"
        report += "  NX SESSION MANAGER STATUS\n"
        report += "="*70 + "\n"

        # Running NX sessions
        nx_sessions = self.get_running_nx_sessions()
        report += f"\n  Running NX Processes: {len(nx_sessions)}\n"

        for session in nx_sessions:
            report += f"    PID {session.pid}: {session.name}\n"

            if session.working_dir:
                report += f"      Working dir: {session.working_dir}\n"

        # Interactive session warning
        if self.is_nx_interactive_session_running():
            report += f"\n  WARNING: Interactive NX session detected!\n"
            report += f"           Batch operations may conflict with user's work.\n"

        # Active optimization sessions
        active_count = self._count_active_sessions()
        report += f"\n  Active Optimization Sessions: {active_count}/{self.max_concurrent}\n"

        if self.session_lock_file.exists():
            try:
                with open(self.session_lock_file, 'r') as f:
                    sessions = json.load(f)

                for session in sessions:
                    study = session.get('study_name', 'Unknown')
                    pid = session.get('pid', 'Unknown')
                    report += f"    {study} (PID {pid})\n"

            except:
                pass

        # Lock files
        lock_files = list(self.lock_dir.glob("*.lock"))
        report += f"\n  Active Lock Files: {len(lock_files)}\n"

        report += "="*70 + "\n"

        return report