""" exp13_model_sweep.py — Multi-model exp13 test harness. For each model in the registry: 1. Auto-identify proxy_B dim (highest |r| correlation with raw proxy_B signal) 2. Validate calibration (always-positive in 56-day window required for exp13 scaling) 3. Run exp13 Phase 1 (14-day screening) + Phase 2 (full 56-day, top-k configs) 4. Save per-model results to exp13_sweep__results.json 5. Print final comparison table across all models All tests use IDENTICAL configs/window/threshold to exp13 v2 (the CONFIRMED baseline). Threshold: Calmar > 7.83 (102% of D_LIQ_GOLD baseline 7.67 in the 56-day window) Reference: v2 BOB — 9/20 PASS, best dROI=+4.59pp, best Calmar=7.87 Usage (from nautilus_dolphin/ dir): python dvae/exp13_model_sweep.py # all available models in registry python dvae/exp13_model_sweep.py --models v4 # single model python dvae/exp13_model_sweep.py --models v4 v5 v6 # explicit list python dvae/exp13_model_sweep.py --probe_only # dim probe only, no backtest python dvae/exp13_model_sweep.py --subset 14 --top_k 20 # explicit Phase 1/2 params Adding new models (v5, v6, v7, v8): 1. Transfer model JSON from DOLPHIN Linux to models/convnext_dvae_ML/ 2. Uncomment (or add) the entry in MODEL_REGISTRY below 3. Re-run this script """ import sys, os, time, json, importlib, argparse import io sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace', line_buffering=True) import numpy as np import pandas as pd from pathlib import Path ROOT = Path(__file__).resolve().parent.parent.parent DVAE_DIR = ROOT / 'nautilus_dolphin' / 'dvae' sys.path.insert(0, str(ROOT / 'nautilus_dolphin')) # ── Model registry ───────────────────────────────────────────────────────────── # Slot in v5/v6/v7/v8 when transferred from DOLPHIN Linux — just uncomment. 
MODEL_REGISTRY = {
    'v2_bob': ROOT / 'nautilus_dolphin' / 'dvae' / 'convnext_model_v2.json',
    'v4': ROOT / 'models' / 'convnext_dvae_ML' / 'convnext_model_v4_ep22_best.json',
    'v5': ROOT / 'models' / 'dolphin_training' / 'winning_models' / 'v5_ep28_best_total_loss.json',
    'v6': ROOT / 'models' / 'dolphin_training' / 'winning_models' / 'v6_ep8_best_val_loss.json',
    'v7': ROOT / 'models' / 'dolphin_training' / 'winning_models' / 'v7_ep10_best_generalization.json',
    # v8: step=2 training (1.2M windows), only 2 epochs — val=34.92. Experimental.
    'v8': ROOT / 'models' / 'dolphin_training' / 'dvae' / 'v8_step2.json',
    # v6.5 (ANOMALOUS — DO NOT USE: broken during training per researcher note)
}

KLINES_DIR = ROOT / 'vbt_cache_klines'
DATE_START = '2025-12-31'
DATE_END = '2026-02-25'
CALMAR_THR = 7.83  # 102% of D_LIQ_GOLD baseline 7.67

# Raw feature columns read from the klines parquet files, in model channel order.
FEATURE_COLS = [
    'v50_lambda_max_velocity', 'v150_lambda_max_velocity',
    'v300_lambda_max_velocity', 'v750_lambda_max_velocity',
    'vel_div', 'instability_50', 'instability_150',
]
T_WIN = 32  # temporal window length fed to the encoder

# ── Proxy_B dim identification ─────────────────────────────────────────────────
from dvae.convnext_dvae import ConvNeXtVAE


def _load_model(path: Path):
    """Load a ConvNeXtVAE + its normalization stats from a model JSON file.

    Returns (model, norm_mean, norm_std, meta); norm_mean/norm_std are None
    when the JSON carries no normalization stats.
    """
    with open(path) as f:
        meta = json.load(f)
    arch = meta['architecture']
    m = ConvNeXtVAE(
        C_in=arch['C_in'], T_in=arch['T_in'], z_dim=arch['z_dim'],
        base_ch=arch['base_ch'], n_blocks=arch.get('n_blocks', 3), seed=42,
    )
    m.load(str(path))
    nm = np.array(meta['norm_mean']) if 'norm_mean' in meta else None
    ns = np.array(meta['norm_std']) if 'norm_std' in meta else None
    return m, nm, ns, meta


def _build_probe_set():
    """Sample probe windows from 56-day window; shared across all models.

    Returns (probes_raw, proxy_B_vals):
      probes_raw   — (N, 11, T_WIN) float64 stack of raw (un-normalized) windows
      proxy_B_vals — (N,) mean proxy_B value per window (probe target signal)

    Raises RuntimeError when no valid window could be sampled (so callers get
    a clear message instead of np.stack's opaque ValueError).
    """
    files = sorted(KLINES_DIR.glob('*.parquet'))
    # File stems start with an ISO date, so lexicographic compare == date compare.
    period = [f for f in files if DATE_START <= f.stem[:10] <= DATE_END]
    rng = np.random.default_rng(42)
    probes_raw, proxy_B_vals = [], []
    step = max(1, len(period) // 60)  # subsample to ~60 files across the window
    for f in period[::step]:
        try:
            df = pd.read_parquet(f, columns=FEATURE_COLS).dropna()
            if len(df) < T_WIN + 10:
                continue
            # Pick a window near the middle of the day (±30 rows of jitter).
            mid = len(df) // 2
            pos = int(rng.integers(max(0, mid - 30), min(len(df) - T_WIN, mid + 30)))
            arr = df[FEATURE_COLS].values[pos:pos + T_WIN].astype(np.float64)
            # proxy_B = instability_50 − v750_lambda_max_velocity (cols 5 and 3).
            proxy_B = (arr[:, 5] - arr[:, 3]).reshape(-1, 1)
            exf = np.zeros((T_WIN, 3), dtype=np.float64)  # 3 zero-padded extra channels
            arr11 = np.concatenate([arr, proxy_B, exf], axis=1).T  # (11, T)
            if not np.isfinite(arr11).all():
                continue
            probes_raw.append(arr11)
            proxy_B_vals.append(float(proxy_B.mean()))
        except Exception:
            # Best-effort sampling: a single unreadable/short file must not
            # abort the probe-set build.
            pass
    if not probes_raw:
        raise RuntimeError(
            f'No valid probe windows found in {KLINES_DIR} '
            f'for {DATE_START}..{DATE_END}')
    return np.stack(probes_raw), np.array(proxy_B_vals)


def probe_model(tag: str, path: Path, probes_raw: np.ndarray,
                proxy_B_arr: np.ndarray) -> dict:
    """Identify proxy_B dim and report calibration for one model.

    Encodes the shared probe set, correlates every active latent dim against
    the raw proxy_B signal, and reports the best |r| dim, its calibration
    (always-positive is required for exp13 scaling), and the hi/lo separation.
    """
    print(f"\n{'='*60}")
    print(f" PROBE: {tag} ({path.name})")
    print(f"{'='*60}")
    model, nm, ns, meta = _load_model(path)
    ep = meta.get('epoch', '?')
    val = meta.get('val_loss', 0.0)
    print(f" epoch={ep} val_loss={val:.5f}")
    probes = probes_raw.copy()
    # Fix: require BOTH stats — original guarded only on nm and would crash
    # on a model JSON that carries norm_mean without norm_std.
    if nm is not None and ns is not None:
        probes = (probes - nm[None, :, None]) / ns[None, :, None]
        np.clip(probes, -6., 6., out=probes)
    z_mu, z_logvar = model.encode(probes)
    z_std = z_mu.std(0)
    # Correlate each non-degenerate latent dim with the proxy_B target.
    corrs = []
    for d in range(z_mu.shape[1]):
        if z_std[d] > 0.01:  # skip collapsed (near-constant) dims
            r = float(np.corrcoef(z_mu[:, d], proxy_B_arr)[0, 1])
            if np.isfinite(r):
                corrs.append((abs(r), r, d))
    corrs.sort(reverse=True)
    best_abs_r, best_r, best_dim = corrs[0] if corrs else (0.0, 0.0, -1)
    # NOTE: when corrs is empty best_dim=-1 indexes the LAST dim; harmless
    # because best_abs_r=0.0 makes the model unusable below anyway.
    z_best = z_mu[:, best_dim]
    z_min, z_max = float(z_best.min()), float(z_best.max())
    always_pos = z_min > 0
    always_neg = z_max < 0
    if always_pos:
        calib = 'ALWAYS_POSITIVE'
    elif always_neg:
        calib = 'ALWAYS_NEGATIVE'
    else:
        calib = f'MIXED[{z_min:+.3f},{z_max:+.3f}]'
    # Separation: mean latent value on high-proxy_B windows vs low ones.
    q75, q25 = np.percentile(proxy_B_arr, 75), np.percentile(proxy_B_arr, 25)
    z_hi = float(z_best[proxy_B_arr >= q75].mean())
    z_lo = float(z_best[proxy_B_arr <= q25].mean())
    sep = abs(z_hi - z_lo)
    # Also find best POSITIVELY correlated dim (same sign as v2 z[13])
    pos_corrs = [(abs_r, r, d) for abs_r, r, d in corrs if r > 0]
    pos_dim = pos_corrs[0][2] if pos_corrs else best_dim
    pos_r = pos_corrs[0][1] if pos_corrs else best_r
    usable = always_pos and best_abs_r > 0.5
    print(f" proxy_B dim : z[{best_dim}] r={best_r:+.4f} "
          f"(best |r|) best positive: z[{pos_dim}] r={pos_r:+.4f}")
    print(f" Top-5 : " + ' '.join(f'z[{d}]={r:+.3f}' for _, r, d in corrs[:5]))
    print(f" Calibration : {calib} sep={sep:.4f}")
    print(f" Usable : {'YES ✓' if usable else 'CAUTION ⚠ (will skip exp13 sweep)'}")
    return {
        'tag': tag, 'path': str(path), 'epoch': ep, 'val_loss': val,
        'proxy_B_dim': best_dim, 'proxy_B_r': best_r,
        'proxy_B_dim_pos': pos_dim, 'proxy_B_r_pos': pos_r,
        'calibration': calib, 'always_positive': always_pos,
        'separation': sep, 'top5': [(r, d) for _, r, d in corrs[:5]],
    }


# ── exp13 runner per model ─────────────────────────────────────────────────────
def run_exp13_for_model(probe: dict, subset_days: int, top_k: int,
                        only_config: str = None) -> dict:
    """Patch MODEL_1M + PROXY_B_DIM into exp13, call main(), read results JSON."""
    import dvae.exp13_multiscale_sweep as e13
    importlib.reload(e13)  # fresh state: clears cached signals from prior model
    # Patch module-level constants AFTER reload
    e13.MODEL_1M = Path(probe['path'])
    e13.PROXY_B_DIM = probe['proxy_B_dim']
    out_file = ROOT / f"exp13_sweep_{probe['tag']}_results.json"
    e13.OUT_FILE = out_file
    # Patch sys.argv so argparse inside main() picks up our params.
    # Fix: save/restore argv so the patch cannot leak past this call.
    argv_backup = sys.argv
    sys.argv = [
        'exp13_multiscale_sweep.py',
        '--subset', str(subset_days),
        '--top_k', str(top_k),
        '--skip_sets', 'B,Bp',  # skip 5s sets (no 5s model per model variant)
    ]
    if only_config:
        sys.argv += ['--only_config', only_config, '--skip_5s']
    print(f"\n{'━'*60}")
    print(f" EXP13: {probe['tag']} z[{probe['proxy_B_dim']}] r={probe['proxy_B_r']:+.4f}")
    print(f" subset={subset_days}d top_k={top_k} out={out_file.name}")
    print(f"{'━'*60}")
    try:
        t0 = time.time()
        e13.main()
        elapsed = time.time() - t0
    finally:
        sys.argv = argv_backup
    # Read saved results
    if not out_file.exists():
        print(f"[ERROR] Results file not written: {out_file}")
        return {'tag': probe['tag'], 'error': 'no results file'}
    with open(out_file) as f:
        raw = json.load(f)
    # Fix: single baseline lookup (original computed an unused `baseline_full`
    # local and repeated the phase2/baseline_full dict walk three times).
    baseline = raw.get('phase2', {}).get('baseline_full') or {}
    p2_list = raw.get('phase2', {}).get('results', [])
    if not p2_list:
        # full run (subset=0): use phase1_results as p2
        p2_list = raw.get('phase1_results', [])
    baseline_cal = baseline.get('Calmar', 0.0)
    base_roi = baseline.get('ROI', 0.0)
    n_pass = sum(1 for r in p2_list if r.get('Calmar', 0) > CALMAR_THR)
    best = max(p2_list, key=lambda r: r.get('Calmar', 0)) if p2_list else {}
    return {
        'tag': probe['tag'], 'val_loss': probe['val_loss'],
        'proxy_B_dim': probe['proxy_B_dim'], 'proxy_B_r': probe['proxy_B_r'],
        'calibration': probe['calibration'],
        'baseline_calmar': baseline_cal, 'baseline_roi': base_roi,
        'n_phase2': len(p2_list), 'n_pass': n_pass,
        'best_config': best.get('name', '?'),
        'best_roi': best.get('ROI', 0.0),
        'best_calmar': best.get('Calmar', 0.0),
        'best_droi': best.get('ROI', 0.0) - base_roi,
        'best_ddd': best.get('DD', 0.0) - baseline.get('DD', 0.0),
        'elapsed_s': round(elapsed),
        'results_file': str(out_file),
    }


# ── Comparison tables ──────────────────────────────────────────────────────────
def _print_probe_table(probes: dict):
    """Print the per-model probe summary (dim, r, separation, calibration)."""
    print(f"\n{'='*72}")
    print(f" MODEL PROBE SUMMARY")
    print(f"{'='*72}")
    print(f" {'Tag':10s} {'ValLoss':>8s} {'Dim':>5s} {'r':>7s} {'Sep':>6s} {'Calibration':22s} OK?")
    print(f" {'-'*72}")
    for tag, p in probes.items():
        # Same usability rule as probe_model: always-positive AND |r| > 0.5.
        ok = '✓' if p['always_positive'] and abs(p['proxy_B_r']) > 0.5 else '⚠'
        print(f" {tag:10s} {p['val_loss']:8.4f} "
              f"z[{p['proxy_B_dim']:2d}] {p['proxy_B_r']:+7.4f} "
              f"{p['separation']:6.4f} {p['calibration']:22s} {ok}")


def _print_comparison_table(results: list):
    """Print the final cross-model exp13 comparison, best Calmar first."""
    if not results:
        return
    print(f"\n{'='*84}")
    print(f" EXP13 MULTI-MODEL FINAL COMPARISON")
    print(f" Threshold: Calmar > {CALMAR_THR} | Reference: v2_BOB — 9/20 PASS dROI=+4.59pp Cal=7.87")
    print(f"{'='*84}")
    print(f" {'Tag':10s} {'ValLoss':>8s} {'Dim':>5s} {'r':>7s} "
          f"{'Pass':>6s} {'BestCal':>8s} {'BestdROI':>9s} {'BestdDD':>8s} Best Config")
    print(f" {'-'*84}")
    for r in sorted(results, key=lambda x: x.get('best_calmar', 0), reverse=True):
        pass_str = f"{r['n_pass']:2d}/{r['n_phase2']}"
        flag = '✓' if r['n_pass'] > 0 else ' '
        print(f" {r['tag']:10s} {r['val_loss']:8.4f} "
              f"z[{r['proxy_B_dim']:2d}] {r['proxy_B_r']:+7.4f} "
              f"{pass_str:>6s} {flag} {r['best_calmar']:8.3f} "
              f"{r['best_droi']:+9.2f}pp {r['best_ddd']:+8.2f}pp {r['best_config']}")
    print(f" {'─'*84}")
    # Hard-coded v2_bob reference row (numbers from the confirmed v2 baseline run).
    print(f" {'v2_bob REF':10s} {'18.0024':>8s} z[13] {'+0.9332':>7s} "
          f"{'9/20':>6s} ✓ {'7.870':>8s} {'+4.59':>9s}pp {'0.00':>8s}pp A_P5_M2_W1_a0.5")


# ── Main ───────────────────────────────────────────────────────────────────────
def main():
    """Parse args, probe every available model, then run exp13 per usable model."""
    parser = argparse.ArgumentParser(description='Multi-model exp13 test harness')
    parser.add_argument('--models', nargs='+', default=None,
                        help='Model tags to run (default: all available)')
    parser.add_argument('--probe_only', action='store_true',
                        help='Dim probe only — skip exp13 sweep')
    parser.add_argument('--subset', type=int, default=14,
                        help='Phase 1 days (default 14)')
    parser.add_argument('--top_k', type=int, default=20,
                        help='Phase 2 top-k configs (default 20)')
    parser.add_argument('--fast_check', type=str, default='',
                        help='Skip Phase 1; run just this config on full window. '
                             'Default known-winner: A_P5_M2_W1_a0.5')
    args = parser.parse_args()
    if args.fast_check == 'winner':
        args.fast_check = 'A_P5_M2_W1_a0.5'  # shorthand
    # Select models: requested (or all), limited to registry entries whose file exists.
    tags = args.models or list(MODEL_REGISTRY.keys())
    available = {t: MODEL_REGISTRY[t] for t in tags
                 if t in MODEL_REGISTRY and Path(MODEL_REGISTRY[t]).exists()}
    skipped = [t for t in tags if t not in MODEL_REGISTRY]
    # Fix: reuse `tags` instead of re-evaluating `args.models or list(...)`.
    missing = [t for t in MODEL_REGISTRY if t not in available and t in tags]
    if skipped:
        print(f"[WARN] Unknown tags: {skipped}")
    if missing:
        print(f"[WARN] File not found (transfer from DOLPHIN Linux): {missing}")
    if not available:
        print("No model files found. Check MODEL_REGISTRY or --models flag.")
        return
    print(f"\n{'━'*60}")
    print(f" EXP13 MULTI-MODEL SWEEP")
    print(f" Models : {list(available.keys())}")
    print(f" Window : {DATE_START} → {DATE_END}")
    print(f" Threshold: Calmar > {CALMAR_THR}")
    print(f" Phase1 : {args.subset} days Phase2 top-k: {args.top_k}")
    print(f"{'━'*60}")
    # Build shared probe set
    print(f"\nBuilding probe set ({DATE_START}→{DATE_END})...")
    probes_raw, proxy_B_arr = _build_probe_set()
    print(f" {len(probes_raw)} windows proxy_B: μ={proxy_B_arr.mean():+.4f} σ={proxy_B_arr.std():.4f}")
    # Step 1: probe all models
    probe_reports = {tag: probe_model(tag, path, probes_raw, proxy_B_arr)
                     for tag, path in available.items()}
    _print_probe_table(probe_reports)
    if args.probe_only:
        return
    # Step 2: run exp13 for each usable model (always-positive calibration only)
    sweep_results = []
    for tag, probe in probe_reports.items():
        if not probe['always_positive']:
            print(f"\n[SKIP] {tag}: calib={probe['calibration']} — not always-positive")
            continue
        try:
            summary = run_exp13_for_model(probe, args.subset, args.top_k,
                                          only_config=args.fast_check or None)
            sweep_results.append(summary)
        except Exception as ex:
            # One broken model must not abort the whole sweep — log and continue.
            import traceback
            print(f"\n[ERROR] {tag}: {ex}")
            traceback.print_exc()
    _print_comparison_table(sweep_results)
    # Save combined summary (default=str covers Path and numpy scalar values)
    out = ROOT / 'exp13_model_sweep_results.json'
    with open(out, 'w') as f:
        json.dump({
            'probes': probe_reports,
            'sweep': sweep_results,
            'threshold': CALMAR_THR,
            'window': {'start': DATE_START, 'end': DATE_END},
            'ref_v2_bob': {'n_pass': 9, 'best_droi': 4.59, 'best_calmar': 7.87},
        }, f, indent=2, default=str)
    print(f"\nSummary → {out}")


if __name__ == '__main__':
    main()