Files
Atomizer/studies/M1_Mirror/analyze_flatback_campaign.py
Anto01 b1ffc64407 feat: Implement SAT v3 achieving WS=205.58 (new campaign record)
Self-Aware Turbo v3 optimization validated on M1 Mirror flat back:
- Best WS: 205.58 (5.8% better than previous best 218.26)
- 100% feasibility rate, 100% unique designs
- Uses 556 training samples from V5-V8 campaign data

Key innovations in V9:
- Adaptive exploration schedule (15% → 8% → 3%)
- Mass threshold at 118 kg (optimal sweet spot)
- 70% exploitation near best design
- Seeded with best known design from V7
- Ensemble surrogate with R²=0.99

Updated documentation:
- SYS_16: SAT protocol updated to v3.0 VALIDATED
- Cheatsheet: Added SAT v3 as recommended method
- Context: Updated protocol overview

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 16:06:33 -05:00

214 lines
7.0 KiB
Python

#!/usr/bin/env python3
"""Analyze all flat back campaign data to design optimal SAT V9."""
import sqlite3
import json
import numpy as np
from pathlib import Path
STUDIES_DIR = Path(__file__).parent  # folder holding this script and the study subfolders
# All flat back databases: (label, sqlite path) pairs for each campaign
# iteration.  Paths that do not exist are silently skipped at load time,
# so stale entries here are harmless.
STUDIES = [
    ('V3', STUDIES_DIR / 'm1_mirror_cost_reduction_flat_back_V3' / '3_results' / 'study.db'),
    ('V4', STUDIES_DIR / 'm1_mirror_cost_reduction_flat_back_V4' / '3_results' / 'study.db'),
    ('V5', STUDIES_DIR / 'm1_mirror_cost_reduction_flat_back_V5' / '3_results' / 'study.db'),
    ('V6', STUDIES_DIR / 'm1_mirror_cost_reduction_flat_back_V6' / '3_results' / 'study.db'),
    ('V7', STUDIES_DIR / 'm1_mirror_cost_reduction_flat_back_V7' / '3_results' / 'study.db'),
    ('V8', STUDIES_DIR / 'm1_mirror_cost_reduction_flat_back_V8' / '3_results' / 'study.db'),
]
MAX_MASS = 120.0  # feasibility limit in kg; heavier designs are flagged infeasible
def load_all_data():
    """Load all completed trials from every study database in STUDIES.

    Returns:
        list[dict]: one record per COMPLETE trial with keys 'study',
        'trial_id', 'params', 'mass', 'wfe_40', 'wfe_60', 'mfg_90',
        'ws' (may be None if no objective value was stored) and
        'feasible' (mass <= MAX_MASS).  Trials missing any of the three
        objective attributes are skipped.  Missing databases are skipped.
    """
    all_data = []
    for name, db_path in STUDIES:
        if not db_path.exists():
            continue
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            # 'COMPLETE' is single-quoted: in SQL, double quotes delimit
            # identifiers; SQLite accepting "COMPLETE" as a string literal
            # is a compatibility quirk best not relied upon.
            cursor.execute("SELECT trial_id FROM trials WHERE state = 'COMPLETE'")
            trial_ids = [row[0] for row in cursor.fetchall()]
            for tid in trial_ids:
                # Parameters; names may carry a "[...]" prefix that is stripped.
                cursor.execute(
                    'SELECT param_name, param_value FROM trial_params WHERE trial_id = ?',
                    (tid,))
                params_raw = {row[0]: row[1] for row in cursor.fetchall()}
                params = {(k.split(']', 1)[1] if ']' in k else k): v
                          for k, v in params_raw.items()}
                # User attributes (JSON-encoded values).
                cursor.execute(
                    'SELECT key, value_json FROM trial_user_attributes WHERE trial_id = ?',
                    (tid,))
                attrs = {row[0]: json.loads(row[1]) for row in cursor.fetchall()}
                # Weighted-sum objective (None if the trial stored no value).
                cursor.execute('SELECT value FROM trial_values WHERE trial_id = ?', (tid,))
                ws_row = cursor.fetchone()
                ws = ws_row[0] if ws_row else None
                # 999.0 sentinel makes attribute-less trials infeasible below.
                mass = attrs.get('mass_kg', 999.0)
                # NOTE(review): the `or` fallback treats a stored 0.0 as
                # missing — presumably WFE/mfg values are strictly positive;
                # confirm before relying on zero-valued objectives.
                wfe_40 = attrs.get('obj_wfe_40_20') or attrs.get('wfe_40_20')
                wfe_60 = attrs.get('obj_wfe_60_20') or attrs.get('wfe_60_20')
                mfg_90 = attrs.get('obj_mfg_90') or attrs.get('mfg_90')
                if wfe_40 is None or wfe_60 is None or mfg_90 is None:
                    continue
                all_data.append({
                    'study': name,
                    'trial_id': tid,
                    'params': params,
                    'mass': mass,
                    'wfe_40': wfe_40,
                    'wfe_60': wfe_60,
                    'mfg_90': mfg_90,
                    'ws': ws,
                    'feasible': mass <= MAX_MASS
                })
        finally:
            # Close even if a query raises; the original leaked the handle
            # on any exception.
            conn.close()
    return all_data
def main():
    """Print a six-section analysis of all flat back campaign data.

    Sections: per-study inventory, top-10 designs, top-20 parameter
    ranges, mass-vs-WS binning, SAT V9 strategy recommendation, and the
    best design's parameters for seeding.  Lower WS is better throughout,
    hence ascending sorts and min() for "best".
    """
    data = load_all_data()
    print("=" * 70)
    print("FLAT BACK CAMPAIGN - COMPLETE DATA ANALYSIS")
    print("=" * 70)
    print()
    # Summary by study
    print("1. DATA INVENTORY BY STUDY")
    print("-" * 70)
    from collections import defaultdict
    by_study = defaultdict(list)
    for d in data:
        by_study[d['study']].append(d)
    total = 0
    total_feasible = 0
    for name in ['V3', 'V4', 'V5', 'V6', 'V7', 'V8']:
        trials = by_study.get(name, [])
        feasible = [t for t in trials if t['feasible']]
        # Skip None scores: load_all_data() allows ws=None, and min() over
        # a list containing None raises TypeError.
        scored = [t['ws'] for t in feasible if t['ws'] is not None]
        best = min(scored) if scored else None
        total += len(trials)
        total_feasible += len(feasible)
        # Explicit None check: a 0.0 score would be falsy but still real.
        if best is not None:
            print(f" {name}: {len(trials):4d} trials, {len(feasible):4d} feasible, best WS = {best:.2f}")
        else:
            print(f" {name}: {len(trials):4d} trials, {len(feasible):4d} feasible")
    print(f"\n TOTAL: {total} trials, {total_feasible} feasible")
    # Global best analysis
    print()
    print("2. TOP 10 DESIGNS (ALL STUDIES)")
    print("-" * 70)
    # Require a score so the sorts below cannot compare None with float.
    feasible_data = [d for d in data if d['feasible'] and d['ws'] is not None]
    if not feasible_data:
        # Without at least one scored feasible design, sections 2-6 would
        # crash on top10[0]/top20[0]; bail out with a clear message instead.
        print(" No scored feasible designs found - nothing to analyze.")
        return
    top10 = sorted(feasible_data, key=lambda x: x['ws'])[:10]
    print(f" {'Rank':<5} {'Study':<6} {'WS':<10} {'40-20':<8} {'60-20':<8} {'Mfg90':<8} {'Mass':<8}")
    print(" " + "-" * 60)
    for i, d in enumerate(top10, 1):
        print(f" {i:<5} {d['study']:<6} {d['ws']:<10.2f} {d['wfe_40']:<8.2f} {d['wfe_60']:<8.2f} {d['mfg_90']:<8.2f} {d['mass']:<8.2f}")
    # Analyze optimal region
    print()
    print("3. OPTIMAL PARAMETER REGION (Top 20 designs)")
    print("-" * 70)
    top20 = sorted(feasible_data, key=lambda x: x['ws'])[:20]
    # Parameter names taken from the best design; other designs may lack some.
    param_names = list(top20[0]['params'].keys())
    print(f"\n Parameter ranges in top 20 designs:")
    print(f" {'Parameter':<35} {'Min':<10} {'Max':<10} {'Mean':<10}")
    print(" " + "-" * 65)
    for pname in sorted(param_names):
        values = [d['params'].get(pname) for d in top20 if pname in d['params']]
        if values and all(v is not None for v in values):
            print(f" {pname:<35} {min(values):<10.2f} {max(values):<10.2f} {np.mean(values):<10.2f}")
    # Mass analysis
    print()
    print("4. MASS VS WS CORRELATION")
    print("-" * 70)
    # Bins chosen around the feasibility limit to locate the mass sweet spot.
    bins = [(105, 110), (110, 115), (115, 118), (118, 120)]
    print(f"\n {'Mass Range':<15} {'Count':<8} {'Best WS':<10} {'Mean WS':<10}")
    print(" " + "-" * 45)
    for low, high in bins:
        in_bin = [d for d in feasible_data if low <= d['mass'] < high]
        if in_bin:
            best = min(d['ws'] for d in in_bin)
            mean = np.mean([d['ws'] for d in in_bin])
            print(f" {low}-{high} kg{'':<5} {len(in_bin):<8} {best:<10.2f} {mean:<10.2f}")
    # Find sweet spot
    print()
    print("5. RECOMMENDED SAT V9 STRATEGY")
    print("-" * 70)
    best_design = top10[0]
    print(f"""
 A. USE ALL {total_feasible} FEASIBLE SAMPLES FOR TRAINING
 - V8 only used V6 data (196 samples)
 - With {total_feasible} samples, surrogate will be much more accurate
 B. FOCUS ON OPTIMAL MASS REGION
 - Best designs have mass 115-119 kg
 - V8's threshold at 115 kg was too conservative
 - Recommendation: soft threshold at 118 kg
 C. ADAPTIVE EXPLORATION SCHEDULE
 - Phase 1 (trials 1-30): exploration_weight = 0.2
 - Phase 2 (trials 31-80): exploration_weight = 0.1
 - Phase 3 (trials 81+): exploration_weight = 0.05 (pure exploitation)
 D. EXPLOIT BEST REGION
 - Best design: WS={best_design['ws']:.2f} from {best_design['study']}
 - Sample 70% of candidates within 5% of best params
 - Only 30% random exploration
 E. L-BFGS POLISH (last 10 trials)
 - Start from best found design
 - Trust region around current best
 - Gradient descent with surrogate
 """)
    # Output best params for V9 seeding
    print("6. BEST DESIGN PARAMS (FOR V9 SEEDING)")
    print("-" * 70)
    print()
    for pname, value in sorted(best_design['params'].items()):
        print(f" {pname}: {value}")
    print()
    print("=" * 70)
# Entry guard so the module can be imported (e.g. to reuse load_all_data)
# without triggering the full analysis.
if __name__ == "__main__":
    main()