# DOLPHIN/nautilus_dolphin/dvae/exp9c_overfitting_validation.py

"""
Exp 9c — Overfitting Validation for D_LIQ_GOLD

Battery of tests designed to expose any period-specific bias in the D_LIQ_GOLD result
(8x/9x + liquidation guard, exp9b: ROI=181.81%, DD=17.65%, Calmar=10.30).

Three test families:

1. TEMPORAL SPLIT (H1/H2)
   Same split as exp8 adaptive_beta validation (days 0-27 vs days 28-55).
   Each half: fresh engine, fresh capital=$25k, cold start.
   Pass criterion: Calmar(d_liq) > Calmar(adaptive_beta) in BOTH halves.
   If d_liq only wins in one half → period-specific, do NOT flip default.

2. QUARTERLY SPLIT (Q1/Q2/Q3/Q4)
   Four independent ~14-day windows.
   Finer-grained: reveals if any single quarter is carrying the full result.
   Pass criterion: d_liq Calmar consistently above adaptive_beta across quarters.

3. MARGIN BUFFER SENSITIVITY
   Test margin_buffer = 0.80, 0.90, 0.95 (gold), 1.00 on the full period.
   Confirms the specific 10.6% floor is not cherry-picked.
   Pass criterion: ROI range < 10pp and DD range < 2pp across the tested buffers (0.80–1.00).

Reference benchmarks:
  D_LIQ_GOLD (full period): ROI=181.81%, DD=17.65%, Calmar=10.30
  adaptive_beta (full):      ROI= 96.55%, DD=14.32%, Calmar= 6.74

Results → exp9c_overfitting_results.json
"""

import sys, time, json, math
# Switch stdout to UTF-8 and replace anything unencodable instead of raising;
# the report printed by this script contains non-ASCII glyphs (✓, ✗, Δ, ←).
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
from pathlib import Path
import numpy as np

# Directory holding this script; the results JSON is written next to it.
_HERE = Path(__file__).resolve().parent
# Put the parent directory first on the import path before the project
# imports below are resolved.
sys.path.insert(0, str(_HERE.parent))

from exp_shared import (
    ensure_jit, ENGINE_KWARGS, MC_BASE_CFG,
    load_data, load_forewarner, log_results,
)
from nautilus_dolphin.nautilus.proxy_boost_engine import (
    AdaptiveBoostEngine, LiquidationGuardEngine,
    DEFAULT_THRESHOLD, DEFAULT_ALPHA,
    D_LIQ_SOFT_CAP, D_LIQ_ABS_CAP, D_LIQ_MC_REF,
)
from nautilus_dolphin.nautilus.adaptive_circuit_breaker import AdaptiveCircuitBreaker

# Full-period reference benchmarks quoted in the module docstring
# (roi and dd are percentages).
_D_LIQ_FULL = {'roi': 181.81, 'dd': 17.65, 'calmar': 10.30, 'trades': 2155}
_ABETA_FULL = {'roi': 96.55, 'dd': 14.32, 'calmar': 6.74, 'trades': 2155}

# Proxy-boost keyword arguments shared by both engine factories: default
# threshold and alpha, adaptive_beta enabled, adaptive_alpha / adaptive_thr
# disabled.
_PROXY = {
    'threshold': DEFAULT_THRESHOLD,
    'alpha': DEFAULT_ALPHA,
    'adaptive_beta': True,
    'adaptive_alpha': False,
    'adaptive_thr': False,
}


# ── Engine factories ──────────────────────────────────────────────────────────

def _make_dliq(kw, margin_buffer=0.95):
    """Build a LiquidationGuardEngine with the D_LIQ caps and proxy settings.

    Args:
        kw: Base engine keyword arguments, forwarded unchanged.
        margin_buffer: Forwarded to the engine; 0.95 is the "gold" value
            named in the module docstring.

    Returns:
        A newly constructed LiquidationGuardEngine.
    """
    guard_kwargs = {
        'extended_soft_cap': D_LIQ_SOFT_CAP,
        'extended_abs_cap': D_LIQ_ABS_CAP,
        'mc_leverage_ref': D_LIQ_MC_REF,
        'margin_buffer': margin_buffer,
    }
    return LiquidationGuardEngine(**guard_kwargs, **_PROXY, **kw)

def _make_abeta(kw):
    """Build the adaptive_beta baseline: an AdaptiveBoostEngine with the shared proxy settings.

    Args:
        kw: Base engine keyword arguments, forwarded unchanged.
    """
    baseline_engine = AdaptiveBoostEngine(**_PROXY, **kw)
    return baseline_engine


# ── Run harness (window-aware) ────────────────────────────────────────────────

def _max_drawdown_pct(daily_caps, start_capital):
    """Return the largest peak-to-trough decline of a capital series, in percent.

    The running peak is seeded with ``start_capital`` so that a loss on the
    very first day already counts as drawdown.
    """
    peak, worst = start_capital, 0.0
    for cap in daily_caps:
        peak = max(peak, cap)
        worst = max(worst, (peak - cap) / peak * 100.0)
    return worst


def _run_window(engine_factory, name, d, fw, day_indices, *, initial_capital=25000.0):
    """Run a sub-period backtest over the given day indices on a fresh engine.

    Args:
        engine_factory: Callable taking a copy of ENGINE_KWARGS and returning
            an engine instance.
        name: Label stored in the result under ``'name'``.
        d: Dataset mapping; the keys read here are ``'date_strings'``,
            ``'ob_eng'``, ``'parquet_files'``, ``'pq_data'`` and ``'vol_p60'``.
        fw: MC forewarner, or None to leave it unset on the engine.
        day_indices: Indices into ``d['parquet_files']`` to replay, in order.
        initial_capital: Starting capital used as the ROI base and as the
            initial drawdown peak. It is NOT pushed into the engine, so it
            must match what the engine actually starts with.

    Returns:
        dict with keys ``name, roi, dd, calmar, trades, liq_stops, days,
        mc_red, mc_halted``. ``roi`` and ``dd`` are percentages. When the
        window produced no trades, ``dd`` and ``calmar`` are reported as 0.0.
    """
    kw = ENGINE_KWARGS.copy()
    acb = AdaptiveCircuitBreaker()
    # Preload the full date list for proper w750 context even in sub-period runs.
    acb.preload_w750(d['date_strings'])

    eng = engine_factory(kw)
    eng.set_ob_engine(d['ob_eng'])
    eng.set_acb(acb)
    if fw is not None:
        eng.set_mc_forewarner(fw, MC_BASE_CFG)
    eng.set_esoteric_hazard_multiplier(0.0)

    daily_caps = []
    pf_list = d['parquet_files']
    vol_p60 = d['vol_p60']

    for idx in day_indices:
        ds = pf_list[idx].stem
        df, acols, dvol = d['pq_data'][ds]
        # True only where the value is finite and above the vol_p60 threshold;
        # non-finite entries are forced to False.
        vol_ok = np.where(np.isfinite(dvol), dvol > vol_p60, False)
        eng.process_day(ds, df, acols, vol_regime_ok=vol_ok)
        daily_caps.append(eng.capital)

    n = len(eng.trade_history)
    roi = (eng.capital - initial_capital) / initial_capital * 100.0

    # Both attributes are optional on the engine; a missing (or None) monitor
    # is treated as an empty one.
    liq_stops = getattr(eng, 'liquidation_stops', 0)
    mc_mon = getattr(eng, 'mc_monitor', None) or {}

    if n:
        max_dd = _max_drawdown_pct(daily_caps, initial_capital)
        # Convention used throughout this script: zero drawdown reports
        # Calmar 0.0 rather than infinity.
        calmar = roi / max_dd if max_dd > 0 else 0.0
    else:
        max_dd = calmar = 0.0

    # Single return path so every result dict carries the same keys,
    # including the zero-trade case.
    return dict(
        name=name, roi=roi, dd=max_dd, calmar=calmar, trades=n,
        liq_stops=liq_stops, days=len(day_indices),
        mc_red=mc_mon.get('red', 0), mc_halted=mc_mon.get('halted', 0),
    )


def _compare(dliq_r, abeta_r, window_label):
    """Print one head-to-head row and report whether d_liq won the window.

    Args:
        dliq_r: Result dict for d_liq; reads ``roi``, ``dd``, ``calmar`` and
            the optional ``liq_stops`` (default 0).
        abeta_r: Result dict for adaptive_beta; reads ``roi``, ``dd``, ``calmar``.
        window_label: Text shown in the first column of the row.

    Returns:
        True when d_liq's Calmar is strictly greater than adaptive_beta's.
    """
    roi_gap = dliq_r['roi'] - abeta_r['roi']
    dd_gap = dliq_r['dd'] - abeta_r['dd']
    calmar_gap = dliq_r['calmar'] - abeta_r['calmar']
    liq_count = dliq_r.get('liq_stops', 0)

    # A tie counts as FAIL: the comparison is strict.
    if dliq_r['calmar'] > abeta_r['calmar']:
        tag = 'PASS'
    else:
        tag = 'FAIL'

    dliq_part = f"d_liq {dliq_r['roi']:>7.2f}% / {dliq_r['dd']:>5.2f}% cal={dliq_r['calmar']:.2f}"
    abeta_part = f"abeta {abeta_r['roi']:>7.2f}% / {abeta_r['dd']:>5.2f}% cal={abeta_r['calmar']:.2f}"
    delta_part = f"ΔROI={roi_gap:+.2f} ΔDD={dd_gap:+.2f} ΔCal={calmar_gap:+.2f} liq={liq_count}"
    print(f"  {window_label:<18}  {dliq_part}  |  {abeta_part}  |  {delta_part}  [{tag}]")
    return tag == 'PASS'


# ── Main ─────────────────────────────────────────────────────────────────────

def _family_verdict(passes, total):
    """Return 'PASS ✓' when every window passed, else a 'PARTIAL (p/t)' tag."""
    if passes == total:
        return 'PASS ✓'
    return f"PARTIAL ({passes}/{total})"


def _run_split_family(windows, d, fw, results_all):
    """Run d_liq vs adaptive_beta head-to-head on each window.

    Args:
        windows: Iterable of ``(label, tag, day_indices)``; ``label`` is
            printed, ``tag`` is the suffix of the stored result names.
        d: Dataset mapping passed through to ``_run_window``.
        fw: MC forewarner (or None) passed through to ``_run_window``.
        results_all: List extended in place with both result dicts per window.

    Returns:
        ``(passes, total)`` — windows where d_liq won on Calmar, and windows run.
    """
    passes = total = 0
    for label, tag, idx in windows:
        t0 = time.time()
        print(f"\n  {label}:")
        dliq_r  = _run_window(_make_dliq,  f'd_liq_{tag}',  d, fw, idx)
        abeta_r = _run_window(_make_abeta, f'abeta_{tag}', d, fw, idx)
        elapsed = time.time() - t0
        passed  = _compare(dliq_r, abeta_r, label)
        print(f"    trades: d_liq={dliq_r['trades']}  abeta={abeta_r['trades']}  ({elapsed:.0f}s)")
        results_all.extend([dliq_r, abeta_r])
        passes += int(passed)
        total  += 1
    return passes, total


def main():
    """Run the three overfitting test families, print a report, and log results.

    Family 1 splits the dataset into halves, family 2 into quarters (the last
    quarter absorbs any remainder), and family 3 re-runs d_liq on the full
    period for several margin_buffer values. All per-run result dicts plus a
    summary are written to ``exp9c_overfitting_results.json`` next to this
    script via ``log_results``.

    Raises:
        ValueError: If the dataset has fewer than 4 trading days, because the
            quarterly split would otherwise contain empty windows.
    """
    t_start = time.time()
    print("=" * 80)
    print("Exp 9c — D_LIQ_GOLD Overfitting Validation")
    print("=" * 80)

    ensure_jit()
    d  = load_data()
    fw = load_forewarner()

    n_days = len(d['parquet_files'])
    print(f"  Dataset: {n_days} trading days")
    # Fail before any (slow) backtest runs rather than with an IndexError
    # halfway through the quarterly family.
    if n_days < 4:
        raise ValueError(
            f"need at least 4 trading days for the quarterly split, got {n_days}")

    # Day index windows
    all_idx = list(range(n_days))
    mid     = n_days // 2
    h1_idx  = all_idx[:mid]
    h2_idx  = all_idx[mid:]
    q_size  = n_days // 4
    q_idx   = [all_idx[i*q_size : (i+1)*q_size] for i in range(4)]
    # Last quarter gets any remainder
    q_idx[3] = all_idx[3*q_size:]

    print(f"  H1: days 0–{mid-1} ({len(h1_idx)}d)  "
          f"H2: days {mid}–{n_days-1} ({len(h2_idx)}d)")
    print(f"  Q1:{len(q_idx[0])}d  Q2:{len(q_idx[1])}d  "
          f"Q3:{len(q_idx[2])}d  Q4:{len(q_idx[3])}d")

    results_all = []
    pass_counts  = {'split': 0, 'split_total': 0,
                    'quarter': 0, 'quarter_total': 0}

    # ── FAMILY 1: Temporal split H1/H2 ───────────────────────────────────────
    print(f"\n{'='*80}")
    print("FAMILY 1 — Temporal Split H1/H2")
    print(f"{'='*80}")

    # Labels come from the real split points so they stay correct for any
    # dataset length (on 56 days they read "days 0-27" / "days 28-55").
    h1_label = f'H1 (days 0-{mid-1})'
    h2_label = f'H2 (days {mid}-{n_days-1})'
    pass_counts['split'], pass_counts['split_total'] = _run_split_family(
        [(h1_label, h1_label, h1_idx), (h2_label, h2_label, h2_idx)],
        d, fw, results_all)

    split_verdict = _family_verdict(pass_counts['split'], pass_counts['split_total'])
    print(f"\n  H1/H2 SPLIT VERDICT: {split_verdict}")

    # ── FAMILY 2: Quarterly split ─────────────────────────────────────────────
    print(f"\n{'='*80}")
    print("FAMILY 2 — Quarterly Split (Q1/Q2/Q3/Q4)")
    print(f"{'='*80}")

    quarter_windows = [
        (f'Q{qi} (days {idx[0]}-{idx[-1]})', f'Q{qi}', idx)
        for qi, idx in enumerate(q_idx, 1)
    ]
    pass_counts['quarter'], pass_counts['quarter_total'] = _run_split_family(
        quarter_windows, d, fw, results_all)

    quarter_verdict = _family_verdict(pass_counts['quarter'], pass_counts['quarter_total'])
    print(f"\n  QUARTERLY VERDICT: {quarter_verdict}")

    # ── FAMILY 3: Margin buffer sensitivity (full period) ─────────────────────
    print(f"\n{'='*80}")
    print("FAMILY 3 — Margin Buffer Sensitivity (full period, d_liq only)")
    print(f"{'='*80}")
    # Print the cap actually used for the floor computation below.
    print(f"  Floor = (1/abs_cap) * buffer  |  abs_cap={D_LIQ_ABS_CAP}")
    print(f"  {'Buffer':>8}  {'Floor%':>7}  {'ROI%':>8}  {'DD%':>6}  {'Calmar':>7}  "
          f"{'liq_stops':>10}  {'ΔROI vs gold':>13}")

    buf_results = []
    for buf in [0.80, 0.90, 0.95, 1.00]:
        t0   = time.time()
        floor_pct = (1.0 / D_LIQ_ABS_CAP) * buf * 100
        # b=buf binds the current value; a plain closure would be late-bound.
        r    = _run_window(lambda kw, b=buf: _make_dliq(kw, margin_buffer=b),
                           f'd_liq_buf{buf:.2f}', d, fw, all_idx)
        elapsed = time.time() - t0
        roi_vs_gold = r['roi'] - _D_LIQ_FULL['roi']
        marker = ' ← GOLD' if abs(buf - 0.95) < 0.001 else ''
        print(f"  {buf:>8.2f}  {floor_pct:>6.1f}%  {r['roi']:>8.2f}  {r['dd']:>6.2f}  "
              f"{r['calmar']:>7.2f}  {r['liq_stops']:>10}  {roi_vs_gold:>+13.2f}pp  ({elapsed:.0f}s){marker}")
        r['margin_buffer'] = buf
        buf_results.append(r)
        results_all.append(r)

    # Stability check: spread of ROI and DD across the tested buffers must
    # stay inside these limits (percentage points).
    roi_limit_pp, dd_limit_pp = 10.0, 2.0
    buf_rois = [r['roi'] for r in buf_results]
    roi_range = max(buf_rois) - min(buf_rois)
    buf_dds  = [r['dd'] for r in buf_results]
    dd_range  = max(buf_dds) - min(buf_dds)
    buf_stable = bool(roi_range < roi_limit_pp and dd_range < dd_limit_pp)
    stability = 'STABLE ✓' if buf_stable else 'UNSTABLE ✗'
    print(f"\n  ROI range across buffers: {roi_range:.2f}pp  "
          f"DD range: {dd_range:.2f}pp  "
          f"[{stability}]")

    # ── SUMMARY ───────────────────────────────────────────────────────────────
    total_passes = pass_counts['split'] + pass_counts['quarter']
    total_tests  = pass_counts['split_total'] + pass_counts['quarter_total']

    print(f"\n{'='*80}")
    print("OVERFITTING VALIDATION SUMMARY")
    print(f"{'='*80}")
    print(f"  Temporal split  (H1/H2):  {pass_counts['split']}/{pass_counts['split_total']}  {split_verdict}")
    print(f"  Quarterly split (Q1-Q4):  {pass_counts['quarter']}/{pass_counts['quarter_total']}  {quarter_verdict}")
    print(f"  Margin buffer stability:  {stability}  "
          f"(ROI range={roi_range:.1f}pp, DD range={dd_range:.1f}pp)")
    print()

    all_pass = (total_passes == total_tests and buf_stable)
    if all_pass:
        print("  VERDICT: ALL TESTS PASS ✓")
        print("  D_LIQ_GOLD is robust. Calmar advantage holds across all time windows.")
        print("  Margin buffer choice is not critical. Safe to set as DEFAULT.")
    else:
        print("  VERDICT: SOME TESTS FAIL ✗")
        print(f"  {total_passes}/{total_tests} split windows passed, "
              f"buffer stable={buf_stable}.")
        print("  Do NOT flip default until failures are investigated.")

    outfile = _HERE / "exp9c_overfitting_results.json"
    log_results(results_all, outfile, meta={
        "exp": "exp9c",
        "question": "Is D_LIQ_GOLD robust across time windows and parameter perturbations?",
        "split_passes": f"{pass_counts['split']}/{pass_counts['split_total']}",
        "quarter_passes": f"{pass_counts['quarter']}/{pass_counts['quarter_total']}",
        "buf_roi_range_pp": round(roi_range, 3),
        "buf_dd_range_pp": round(dd_range, 3),
        "all_pass": all_pass,
        "total_elapsed_s": round(time.time() - t_start, 1),
    })

    print(f"\nTotal elapsed: {(time.time()-t_start)/60:.1f} min")
    print("Done.")


# Run the validation battery only when executed as a script, not on import.
if __name__ == "__main__":
    main()