# Atomizer/projects/hydrotech-beam/studies/01_doe_landscape/sampling.py

"""Latin Hypercube Sampling for Hydrotech Beam DoE.

Generates LHS sample points for the 4 design variables with:
- Maximin LHS for space-filling coverage
- Integer rounding for hole_count (DV4)
- Stratified integer sampling to ensure all 11 hole_count levels are covered
- Baseline (Trial 0) always included

References:
    OPTIMIZATION_STRATEGY.md §2.3 — Phase 1 configuration
    OPTIMIZATION_STRATEGY.md §1.4 — Integer handling
"""

from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import Any

import numpy as np
from scipy.stats.qmc import LatinHypercube

from geometric_checks import DesignPoint

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Design variable bounds
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class DVBounds:
    """Bounds and metadata for a single design variable.

    Instances are frozen, so a definition cannot be modified after it has
    been created.
    """

    # Identifier of the design variable.
    name: str
    # Expression name paired with this variable
    # (presumably the NX model expression it drives — confirm).
    nx_expression: str
    # Lower bound of the sampling range.
    lower: float
    # Upper bound of the sampling range.
    upper: float
    # Baseline (reference design) value.
    baseline: float
    # True when the variable may only take integer values.
    is_integer: bool = False


# One entry per design variable. The list order defines the column order of
# every sample array built from these definitions.
DV_DEFINITIONS: list[DVBounds] = [
    DVBounds(
        name="beam_half_core_thickness",
        nx_expression="beam_half_core_thickness",
        lower=10.0,
        upper=40.0,
        baseline=25.162,
    ),
    DVBounds(
        name="beam_face_thickness",
        nx_expression="beam_face_thickness",
        lower=10.0,
        upper=40.0,
        baseline=21.504,
    ),
    DVBounds(
        name="holes_diameter",
        nx_expression="holes_diameter",
        lower=150.0,
        upper=450.0,
        baseline=300.0,
    ),
    DVBounds(
        name="hole_count",
        nx_expression="hole_count",
        lower=5.0,
        upper=15.0,
        baseline=10.0,
        is_integer=True,
    ),
]

# Number of design variables, i.e. the dimension of the sampling space.
N_DVS = len(DV_DEFINITIONS)
# Baseline value of every design variable, in DV_DEFINITIONS order.
BASELINE_VALUES = [definition.baseline for definition in DV_DEFINITIONS]


def get_baseline_point() -> DesignPoint:
    """Return the baseline design point (Trial 0).

    The values are looked up in ``DV_DEFINITIONS`` by variable name, so the
    baseline is defined in exactly one place and cannot drift away from the
    design-variable table.

    Returns:
        DesignPoint holding the ``baseline`` of every design variable.
        Continuous variables are passed as ``float``; ``hole_count`` is
        passed as ``int``.

    Raises:
        KeyError: If one of the four expected design variables is missing
            from ``DV_DEFINITIONS``.
    """
    baseline = {dv.name: dv.baseline for dv in DV_DEFINITIONS}
    return DesignPoint(
        beam_half_core_thickness=float(baseline["beam_half_core_thickness"]),
        beam_face_thickness=float(baseline["beam_face_thickness"]),
        holes_diameter=float(baseline["holes_diameter"]),
        # The definition stores the baseline as a float (10.0); the design
        # point carries the hole count as an integer.
        hole_count=int(baseline["hole_count"]),
    )


def generate_lhs_samples(
    n_samples: int = 50,
    seed: int = 42,
    include_baseline: bool = True,
) -> list[DesignPoint]:
    """Generate LHS sample points for the DoE study.

    Strategy for integer coverage (hole_count = 5..15, 11 levels):
        1. Generate `n_samples` LHS points with a continuous DV4.
        2. Round every integer design variable to the nearest integer and
           clip it to its declared bounds.
        3. Check coverage. If any of the 11 levels is missing and there are
           at least 11 samples, `_ensure_integer_coverage` overwrites a
           randomly chosen trial of the most over-represented level with the
           missing level. The continuous DVs of that trial are redrawn
           uniformly within bounds, so patched rows no longer carry their
           LHS coordinates; all other rows are left untouched.

    The same `seed` drives both the LHS sampler and the patching RNG, so a
    given (n_samples, seed) pair always yields the same points.

    Args:
        n_samples: Number of LHS sample points (default: 50).
        seed: Random seed for reproducibility.
        include_baseline: If True, prepend baseline as Trial 0.

    Returns:
        List of DesignPoint instances. If include_baseline is True,
        the first element is the baseline (Trial 0).
    """
    logger.info(
        "Generating %d LHS samples (seed=%d, baseline=%s)",
        n_samples, seed, include_baseline,
    )

    # Unit hypercube LHS samples, shape (n_samples, N_DVS).
    sampler = LatinHypercube(d=N_DVS, seed=seed, optimization="random-cd")
    unit_samples = sampler.random(n=n_samples)

    # Scale every column to the bounds of its design variable.
    samples = _scale_samples(unit_samples)

    # Snap integer design variables onto whole numbers. The column index and
    # the clip limits come from DV_DEFINITIONS instead of literals so they
    # follow the design-variable table. The array stays floating point; the
    # conversion to int happens when the DesignPoint is built.
    # NOTE(review): rounding a value spread uniformly over [lower, upper]
    # gives the two end levels a half-width bin, so they are hit about half
    # as often as interior levels. Left as is to keep existing seeds
    # reproducible.
    for col, dv in enumerate(DV_DEFINITIONS):
        if dv.is_integer:
            samples[:, col] = np.clip(
                np.round(samples[:, col]), dv.lower, dv.upper
            )

    # Ensure full integer coverage for hole_count.
    samples = _ensure_integer_coverage(samples, rng=np.random.default_rng(seed))

    # Convert to DesignPoint list, baseline first when requested.
    points: list[DesignPoint] = [get_baseline_point()] if include_baseline else []
    points.extend(
        DesignPoint(
            beam_half_core_thickness=float(core),
            beam_face_thickness=float(face),
            holes_diameter=float(diameter),
            hole_count=int(holes),
        )
        for core, face, diameter, holes in samples
    )

    logger.info(
        "Generated %d total points (%d LHS + %s baseline)",
        len(points),
        n_samples,
        "1" if include_baseline else "0",
    )
    _log_coverage(points)

    return points


def _scale_samples(unit_samples: np.ndarray) -> np.ndarray:
    """Map unit-hypercube samples onto the design variable ranges.

    Column ``j`` is stretched from [0, 1] to the ``[lower, upper]`` interval
    of ``DV_DEFINITIONS[j]``.

    Args:
        unit_samples: Array of shape (n, 4) with values in [0, 1].

    Returns:
        New array of the same shape with every column expressed in the
        units of its design variable.
    """
    offset = np.array([definition.lower for definition in DV_DEFINITIONS])
    span = np.array([definition.upper for definition in DV_DEFINITIONS]) - offset
    return offset + unit_samples * span


def _ensure_integer_coverage(
    samples: np.ndarray,
    rng: np.random.Generator,
) -> np.ndarray:
    """Patch the samples so that every hole_count level 5..15 occurs.

    For each missing level (in ascending order) one row is picked at random
    from the level that currently has the most rows (ties go to the lowest
    such level). That row is overwritten in place: integer design variables
    get the missing level, continuous ones a fresh uniform draw within
    their bounds.

    Nothing is changed when all levels are already present, or when there
    are fewer rows than levels (full coverage is then impossible).

    Args:
        samples: Array of shape (n, 4) with rounded hole_count in col 3.
            Modified in place.
        rng: NumPy random generator for reproducibility.

    Returns:
        The same ``samples`` array, patched where necessary.
    """
    required_levels = set(range(5, 16))  # {5, 6, 7, ..., 15}
    absent = required_levels - {int(value) for value in samples[:, 3]}

    if len(absent) == 0:
        logger.info("All 11 hole_count levels represented ✓")
        return samples

    # With fewer rows than levels there is nothing sensible to patch.
    n_rows = len(samples)
    if n_rows < len(required_levels):
        logger.info(
            "Only %d samples — too few to cover all 11 hole_count levels "
            "(need ≥11). Skipping stratified patching.",
            n_rows,
        )
        return samples

    absent_sorted = sorted(absent)
    logger.warning(
        "Missing hole_count levels: %s — patching with replacements",
        absent_sorted,
    )

    for level in absent_sorted:
        # Recount on every pass: the previous replacement changed the tally.
        hole_counts = samples[:, 3].astype(int)
        levels, occurrences = np.unique(hole_counts, return_counts=True)
        donor_level = levels[np.argmax(occurrences)]

        # Pick one row of the donor level to give up.
        row = rng.choice(np.flatnonzero(hole_counts == donor_level))

        # Overwrite the whole row; continuous values are drawn in
        # DV_DEFINITIONS order so the RNG stream stays reproducible.
        for col, dv in enumerate(DV_DEFINITIONS):
            samples[row, col] = (
                level if dv.is_integer else rng.uniform(dv.lower, dv.upper)
            )

        logger.info(
            "  Replaced trial at idx %d: hole_count %d → %d",
            row, donor_level, level,
        )

    return samples


def _log_coverage(points: list[DesignPoint]) -> None:
    """Write one INFO line per hole_count level (5..15) with its trial count.

    Levels that do not occur in ``points`` are reported with a count of 0.
    """
    observed = [point.hole_count for point in points]

    logger.info("Hole count coverage:")
    for level in range(5, 16):
        logger.info("  hole_count=%2d: %d trials", level, observed.count(level))


def points_to_dicts(points: list[DesignPoint]) -> list[dict[str, Any]]:
    """Turn design points into plain dicts (for Optuna enqueue).

    Args:
        points: List of DesignPoint instances.

    Returns:
        One dict per point, in input order, keyed by design variable name
        and holding the value of the corresponding DesignPoint attribute.
    """
    dv_names = (
        "beam_half_core_thickness",
        "beam_face_thickness",
        "holes_diameter",
        "hole_count",
    )
    return [
        {name: getattr(point, name) for name in dv_names}
        for point in points
    ]


# ---------------------------------------------------------------------------
# Quick self-test
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Smoke test: generate the default study and check it against the
    # design-variable table rather than against repeated literal bounds.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    points = generate_lhs_samples(n_samples=50, seed=42)
    print(f"\nGenerated {len(points)} total points")
    print(f"  Trial 0 (baseline): {points[0]}")
    print(f"  Trial 1 (first LHS): {points[1]}")
    print(f"  Trial {len(points)-1} (last LHS): {points[-1]}")

    # Verify coverage: every integer between the declared hole_count bounds
    # must appear at least once.
    hole_dv = next(dv for dv in DV_DEFINITIONS if dv.name == "hole_count")
    expected = set(range(int(hole_dv.lower), int(hole_dv.upper) + 1))
    hole_counts = {p.hole_count for p in points}
    assert hole_counts == expected, (
        f"Missing hole_count levels: {expected - hole_counts}"
    )
    print(f"\nAll {len(expected)} hole_count levels covered ✓")

    # Verify bounds of every design variable on the sampled points.
    for p in points[1:]:  # skip baseline
        for dv in DV_DEFINITIONS:
            value = getattr(p, dv.name)
            assert dv.lower <= value <= dv.upper, (
                f"{dv.name}={value} outside [{dv.lower}, {dv.upper}]"
            )
    print("All samples within bounds ✓")