"""vel_div Threshold Sweep — 5-Year Klines
==========================================
Raw edge sweep: no engine stack, no fees, no leverage.
Sweeps SHORT (vd <= T) and LONG (vd >= T) across 15 thresholds each.

Outcome definition (applied identically to signals and to the control):
  a "win" means the take-profit level (TP_BPS away from the entry price) is
  touched at least once in the MAX_HOLD bars following the entry bar.
  Anything else exits at the price MAX_HOLD bars after entry; that exit is
  booked as gross win or gross loss according to the sign of its return.

Output:
  1. Console: edge matrix (threshold × year) + ranked summary
  2. run_logs/vd_sweep_YYYYMMDD_HHMMSS.csv  — one row per (dir, T), per-year columns
  3. run_logs/vd_regime_YYYYMMDD_HHMMSS.csv — per (date, T, dir): regime ctx

WARNING: This is descriptive/diagnostic only. Selecting a threshold FROM this
sweep on the SAME dataset = data snooping. Use it to understand WHICH market
conditions produce edge, not to tune the system.

Vectorization: precomputes rolling fut_min/fut_max once per file
(sliding_window_view).
Runtime: ~5-10 min for 1710 files.
"""
import csv
import gc
import math
import sys
import time
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import sliding_window_view

# ── Config ────────────────────────────────────────────────────────────────────
VBT_DIR = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines")
LOG_DIR = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\nautilus_dolphin\run_logs")

TP_BPS = 95
MAX_HOLD = 120
tp_pct = TP_BPS / 10000.0

# The control samples one entry every CTRL_STRIDE bars.
CTRL_STRIDE = 60

# SHORT thresholds (vel_div <= T, T negative)
SHORT_T = [-0.005, -0.008, -0.010, -0.013, -0.015, -0.020, -0.025, -0.030,
           -0.040, -0.050, -0.070, -0.100, -0.140, -0.200, -0.300]
# LONG thresholds (vel_div >= T, T positive)
LONG_T = [+0.005, +0.008, +0.010, +0.013, +0.015, +0.020, +0.025, +0.030,
          +0.040, +0.050, +0.070, +0.100, +0.140, +0.200, +0.300]

THRESHOLDS = [('S', t) for t in SHORT_T] + [('L', t) for t in LONG_T]
N_T = len(THRESHOLDS)

# Years that get their own column group in the summary table / sweep CSV.
YEARS = ['2021', '2022', '2023', '2024', '2025', '2026']

# ── Accumulators ──────────────────────────────────────────────────────────────
# sweep_stats[(dir, T, year, quarter)] = {'wins', 'losses', 'n_sig',
#                                         'gross_win', 'gross_loss'}
sweep_stats = defaultdict(lambda: {'wins': 0, 'losses': 0, 'n_sig': 0,
                                   'gross_win': 0.0, 'gross_loss': 0.0})
# ctrl_stats[(year, quarter)] = {'up': 0, 'dn': 0, 'n': 0}
ctrl_stats = defaultdict(lambda: {'up': 0, 'dn': 0, 'n': 0})
# regime_rows: one row per (date, dir, T) for CSV — kept lightweight
regime_rows = []


# ── Per-file helpers ──────────────────────────────────────────────────────────
def parse_day(stem):
    """Split a "YYYY-MM-DD" file stem into ``(year, quarter)`` strings.

    Raises ValueError when characters 5..6 of *stem* are not an integer month.
    """
    month = int(stem[5:7])
    return stem[:4], f"Q{(month - 1) // 3 + 1}"


def score_signals(direction, ep_s, fmin_s, fmax_s, last_s, tp=None):
    """Score a batch of entries taken in one direction.

    Args:
        direction: 'S' for short; any other value is treated as long.
        ep_s: entry prices.
        fmin_s / fmax_s: min / max price over the holding window of each entry.
        last_s: price at the end of the holding window (time-out exit).
        tp: take-profit distance as a fraction; defaults to module ``tp_pct``.

    Returns:
        ``(wins, losses, gross_win, gross_loss)``. ``wins`` counts entries
        whose TP level was touched and ``losses`` counts the rest. Each win
        contributes ``tp`` to ``gross_win``. Each timed-out entry contributes
        its direction-adjusted exit return to ``gross_win`` when positive and
        its magnitude to ``gross_loss`` when negative. A non-finite exit price
        contributes nothing.
    """
    tp = tp_pct if tp is None else tp
    if direction == 'S':
        hit = fmin_s <= ep_s * (1.0 - tp)
        raw_ret = (ep_s - last_s) / ep_s
    else:
        hit = fmax_s >= ep_s * (1.0 + tp)
        raw_ret = (last_s - ep_s) / ep_s
    exit_ret = np.where(np.isfinite(last_s), raw_ret, 0.0)

    wins = int(np.count_nonzero(hit))
    losses = int(hit.size - wins)

    # A time-out is not automatically a loss: taking abs() of the return would
    # book profitable time-outs as losses and understate the profit factor.
    timed_out = exit_ret[~hit]
    gross_win = wins * tp + float(np.sum(timed_out[timed_out > 0]))
    gross_loss = float(-np.sum(timed_out[timed_out < 0]))
    return wins, losses, gross_win, gross_loss


def control_counts(ep_arr, fut_min, fut_max, valid, tp=None, stride=CTRL_STRIDE):
    """Unconditional baseline sampled every *stride*-th entry bar.

    Uses the same touch criterion as ``score_signals`` so that
    ``edge = WR − baseline`` compares like with like. A touch-based win rate
    measured against a close-at-expiry baseline is biased upward, because a
    touch happens at least as often as a close beyond the level.

    Returns:
        ``(up, dn, n)``: number of samples whose window touched +tp / -tp, and
        the number of usable samples (valid entry price and at least one
        finite price in the window).
    """
    tp = tp_pct if tp is None else tp
    idx = np.arange(0, len(ep_arr), stride)
    idx = idx[valid[idx] & np.isfinite(fut_min[idx])]
    ep = ep_arr[idx]
    up = int(np.count_nonzero(fut_max[idx] >= ep * (1.0 + tp)))
    dn = int(np.count_nonzero(fut_min[idx] <= ep * (1.0 - tp)))
    return up, dn, int(idx.size)


def _regime_context(btc, vd_finite):
    """Return the per-date regime columns (already rounded) for the regime CSV.

    *btc* is the sanitized price series (NaN where unusable); *vd_finite* holds
    only the vel_div values that were finite in the raw data.
    """
    btc_clean = btc[np.isfinite(btc)]
    if len(btc_clean) >= 2:
        btc_return = float((btc_clean[-1] - btc_clean[0]) / btc_clean[0])
        log_r = np.diff(np.log(btc_clean))
        realized_vol = float(np.std(log_r)) if len(log_r) > 1 else 0.0
    else:
        btc_return = 0.0
        realized_vol = 0.0

    has_vd = len(vd_finite) > 0
    has_pct = len(vd_finite) > 5
    return {
        'btc_return': round(btc_return, 6),
        'realized_vol': round(realized_vol, 8),
        'vd_median': round(float(np.median(vd_finite)) if has_vd else 0.0, 6),
        'vd_std': round(float(np.std(vd_finite)) if has_vd else 0.0, 6),
        'vd_p10': round(float(np.percentile(vd_finite, 10)) if has_pct else 0.0, 6),
        'vd_p90': round(float(np.percentile(vd_finite, 90)) if has_pct else 0.0, 6),
    }


def process_day(ds, vd_raw, btc_raw):
    """Fold one day of bars into ``sweep_stats``, ``ctrl_stats``, ``regime_rows``.

    Args:
        ds: date string "YYYY-MM-DD" (the parquet file stem).
        vd_raw: vel_div values, one per bar.
        btc_raw: BTCUSDT prices, one per bar.

    Returns:
        True when the day was processed, False when it has fewer than
        ``MAX_HOLD + 2`` bars and was skipped.

    Raises:
        ValueError: if *ds* does not carry an integer month at positions 5..6.
    """
    year, quarter = parse_day(ds)
    vd_raw = np.asarray(vd_raw, dtype=np.float64)
    btc = np.asarray(btc_raw, dtype=np.float64)

    # Sanitize. Keep the genuinely finite vel_div values aside first: the
    # zero-filled copy is only safe for threshold tests (0.0 never triggers a
    # threshold) and would distort the median/std/percentile columns.
    vd_ok = np.isfinite(vd_raw)
    vd_finite = vd_raw[vd_ok]
    vd = np.where(vd_ok, vd_raw, 0.0)
    btc = np.where(np.isfinite(btc) & (btc > 0), btc, np.nan)

    n = len(btc)
    if n < MAX_HOLD + 2:
        return False

    # ── Precompute rolling future min/max in one pass ─────────────────────────
    # windows shape: (n - MAX_HOLD, MAX_HOLD + 1) — column 0 is the entry bar
    windows = sliding_window_view(btc, MAX_HOLD + 1)
    ep_arr = windows[:, 0]                        # entry prices
    fut_min = np.nanmin(windows[:, 1:], axis=1)   # min of next MAX_HOLD bars
    fut_max = np.nanmax(windows[:, 1:], axis=1)   # max of next MAX_HOLD bars
    last_px = windows[:, -1]                      # exit price if no TP
    valid = np.isfinite(ep_arr) & (ep_arr > 0)

    # ── Regime metadata for this date ─────────────────────────────────────────
    ctx = _regime_context(btc, vd_finite)

    # ── Control: unconditional baseline ───────────────────────────────────────
    up, dn, n_ctrl = control_counts(ep_arr, fut_min, fut_max, valid)
    if n_ctrl:
        bucket = ctrl_stats[(year, quarter)]
        bucket['up'] += up
        bucket['dn'] += dn
        bucket['n'] += n_ctrl

    # ── Per-threshold sweep ───────────────────────────────────────────────────
    vd_entry = vd[:len(ep_arr)]
    for direction, T in THRESHOLDS:
        sig_mask = vd_entry <= T if direction == 'S' else vd_entry >= T
        sig_idx = np.flatnonzero(sig_mask & valid)

        row = {'date': ds, 'year': year, 'quarter': quarter,
               'direction': direction, 'threshold': T}
        if sig_idx.size == 0:
            row.update(n_sig=0, wins=0, losses=0, wr=np.nan,
                       gross_win=0.0, gross_loss=0.0)
        else:
            wins, losses, gross_win, gross_loss = score_signals(
                direction, ep_arr[sig_idx], fut_min[sig_idx],
                fut_max[sig_idx], last_px[sig_idx])
            n_sig = int(sig_idx.size)

            stats = sweep_stats[(direction, T, year, quarter)]
            stats['wins'] += wins
            stats['losses'] += losses
            stats['n_sig'] += n_sig
            stats['gross_win'] += gross_win
            stats['gross_loss'] += gross_loss

            row.update(n_sig=n_sig, wins=wins, losses=losses,
                       wr=round(wins / (wins + losses) * 100, 2),
                       gross_win=round(gross_win, 6),
                       gross_loss=round(gross_loss, 6))
        row.update(ctx)
        regime_rows.append(row)
    return True


# ── Control baselines ─────────────────────────────────────────────────────────
def ctrl_baseline(year, quarter, direction):
    """Control hit rate in percent for one (year, quarter); NaN if no samples.

    Returns the 'dn' rate for direction 'S' and the 'up' rate otherwise.
    """
    c = ctrl_stats.get((year, quarter), {'up': 0, 'dn': 0, 'n': 0})
    if c['n'] == 0:
        return float('nan')
    hits = c['dn'] if direction == 'S' else c['up']
    return hits / c['n'] * 100


# ── Summary table ─────────────────────────────────────────────────────────────
def _rates(wins, losses, gross_win, gross_loss, baseline):
    """Return ``(win_rate_pct, profit_factor, edge_pp)``; NaN where undefined.

    The profit factor is capped at 999.0 when there are wins but no losses.
    """
    n = wins + losses
    wr = wins / n * 100 if n > 0 else float('nan')
    if gross_loss > 0:
        pf = gross_win / gross_loss
    else:
        pf = 999.0 if gross_win > 0 else float('nan')
    edge = float('nan') if math.isnan(wr) else wr - baseline
    return wr, pf, edge


def _nan_round(value, ndigits):
    """Round *value*, passing NaN through as a plain float NaN."""
    return float('nan') if math.isnan(value) else round(value, ndigits)


def build_summary(ctrl_dn_pct, ctrl_up_pct):
    """Aggregate ``sweep_stats`` into one row per (direction, threshold).

    Each row carries ``{year}_sigs/_wr/_pf/_edge`` for every entry of ``YEARS``
    plus ``total_*`` columns. Edges are measured against the overall control
    rate of the matching direction (*ctrl_dn_pct* for 'S', *ctrl_up_pct*
    otherwise). Years outside ``YEARS`` are not included in the totals.
    """
    rows = []
    for direction, T in THRESHOLDS:
        baseline = ctrl_dn_pct if direction == 'S' else ctrl_up_pct
        row = {'direction': direction, 'threshold': T}
        tot_wins = tot_losses = tot_sigs = 0
        tot_gw = tot_gl = 0.0

        for yr in YEARS:
            yr_wins = yr_losses = yr_sigs = 0
            yr_gw = yr_gl = 0.0
            for q in ('Q1', 'Q2', 'Q3', 'Q4'):
                key = (direction, T, yr, q)
                if key in sweep_stats:  # membership test: do not create keys
                    s = sweep_stats[key]
                    yr_wins += s['wins']
                    yr_losses += s['losses']
                    yr_sigs += s['n_sig']
                    yr_gw += s['gross_win']
                    yr_gl += s['gross_loss']

            yr_wr, yr_pf, yr_edge = _rates(yr_wins, yr_losses, yr_gw, yr_gl, baseline)
            row[f'{yr}_sigs'] = yr_sigs
            row[f'{yr}_wr'] = _nan_round(yr_wr, 2)
            row[f'{yr}_pf'] = _nan_round(yr_pf, 3)
            row[f'{yr}_edge'] = _nan_round(yr_edge, 2)

            tot_wins += yr_wins
            tot_losses += yr_losses
            tot_sigs += yr_sigs
            tot_gw += yr_gw
            tot_gl += yr_gl

        tot_wr, tot_pf, tot_edge = _rates(tot_wins, tot_losses, tot_gw, tot_gl, baseline)
        row['total_sigs'] = tot_sigs
        row['total_wins'] = tot_wins
        row['total_losses'] = tot_losses
        row['total_wr'] = _nan_round(tot_wr, 2)
        row['total_pf'] = _nan_round(tot_pf, 3)
        row['total_edge_pp'] = _nan_round(tot_edge, 2)
        row['ctrl_baseline'] = round(baseline, 2)
        rows.append(row)
    return rows


# ── Console output ────────────────────────────────────────────────────────────
def fmt_edge(e):
    """Format an edge in percentage points; NaN renders as a dash placeholder."""
    if math.isnan(e):
        return ' --- '
    return f'{e:>+5.1f}pp' if abs(e) >= 0.1 else f' {e:>+4.2f}pp'


def _print_edge_matrix(summary_rows, direction, baseline_pct):
    """Print the threshold × year edge matrix for one direction ('S' or 'L')."""
    label, op, side = ('SHORT', '<=', 'down') if direction == 'S' else ('LONG', '>=', 'up')
    print(f"\n{'='*90}")
    print(f" EDGE MATRIX — {label} (vel_div {op} T) [edge = WR − unconditional_{side}_{TP_BPS}bps]")
    print(f" ctrl_baseline = {baseline_pct:.1f}%")
    print(f"{'='*90}")
    hdr = f" {'Threshold':>10} " + " ".join(f"{'Edge '+yr:>10}" for yr in YEARS) + " " + f"{'TOTAL':>10} {'n_sigs':>8}"
    print(hdr)
    print(f" {'-'*88}")
    for row in summary_rows:
        if row['direction'] != direction:
            continue
        cells = []
        for yr in YEARS:
            edge = row.get(yr + '_edge', float('nan'))
            cells.append(' ---' if math.isnan(edge) else f"{edge:>+8.1f}pp")
        yr_edges = " ".join(cells)
        # Comparisons with NaN are False, so a NaN total gets no tag.
        tag = '← EDGE' if row['total_edge_pp'] > 2 else ('← weak' if row['total_edge_pp'] > 0 else '')
        print(f" {row['threshold']:>10.3f} {yr_edges} {row['total_edge_pp']:>+8.1f}pp {row['total_sigs']:>8,} {tag}")


def _print_ranked(summary_rows, direction):
    """Print thresholds of one direction with positive total edge, best first."""
    label = 'SHORT' if direction == 'S' else 'LONG'
    print(f"\n{'='*70}")
    print(f" TOP {label} THRESHOLDS by total edge (>0 only)")
    print(f"{'='*70}")
    print(f" {'T':>8} {'n_sigs':>8} {'n_trades':>9} {'WR':>7} {'PF':>7} {'Edge':>8} {'2022':>8} {'2023':>8} {'2024':>8}")
    print(f" {'-'*68}")
    ranked = sorted(
        (r for r in summary_rows
         if r['direction'] == direction and r['total_edge_pp'] > 0),
        key=lambda r: r['total_edge_pp'], reverse=True)
    for r in ranked:
        print(f" {r['threshold']:>8.3f} {r['total_sigs']:>8,} "
              f"{r['total_wins']+r['total_losses']:>9,} "
              f"{r['total_wr']:>6.1f}% {r['total_pf']:>7.3f} {r['total_edge_pp']:>+7.1f}pp "
              f"{r.get('2022_edge', float('nan')):>+7.1f}pp "
              f"{r.get('2023_edge', float('nan')):>+7.1f}pp "
              f"{r.get('2024_edge', float('nan')):>+7.1f}pp")
    if not ranked:
        print(" (none — no positive edge found at any threshold)")


def _write_csv(path, fieldnames, rows):
    """Write *rows* (dicts) to *path*; keys not in *fieldnames* are ignored."""
    with open(path, 'w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        w.writeheader()
        w.writerows(rows)


# ── Entry point ───────────────────────────────────────────────────────────────
def main():
    """Run the sweep over every parquet file in VBT_DIR, print and save results."""
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8', errors='replace')

    # ── Enumerate files ───────────────────────────────────────────────────────
    parquet_files = [p for p in sorted(VBT_DIR.glob("*.parquet"))
                     if 'catalog' not in str(p)]
    total = len(parquet_files)
    print(f"Files: {total} TP={TP_BPS}bps MAX_HOLD={MAX_HOLD} bars")
    print(f"SHORT thresholds: {len(SHORT_T)} LONG thresholds: {len(LONG_T)}\n")

    t0 = time.time()
    for i, pf in enumerate(parquet_files):
        ds = pf.stem  # "YYYY-MM-DD"
        try:
            parse_day(ds)  # reject non-date file names before reading
            df = pd.read_parquet(pf)
        except Exception as exc:
            # Best effort: one bad file must not abort a multi-minute pass,
            # but the skip is reported instead of being silent.
            print(f" [skip] {pf.name}: {exc!r}", file=sys.stderr)
            continue

        if 'vel_div' not in df.columns or 'BTCUSDT' not in df.columns:
            continue

        processed = process_day(ds, df['vel_div'].values, df['BTCUSDT'].values)
        del df
        if not processed:
            continue

        if (i + 1) % 100 == 0:
            gc.collect()
            elapsed = time.time() - t0
            # Same value as remaining / rate, without dividing by elapsed.
            eta = (total - i - 1) * elapsed / (i + 1)
            print(f" [{i+1}/{total}] {ds} {elapsed/60:.1f}m elapsed {eta/60:.1f}m eta")

    elapsed = time.time() - t0
    print(f"\nPass complete in {elapsed:.0f}s ({elapsed/60:.1f}m)")

    # ── Overall control ───────────────────────────────────────────────────────
    ctrl_n_tot = sum(v['n'] for v in ctrl_stats.values())
    ctrl_dn_tot = sum(v['dn'] for v in ctrl_stats.values())
    ctrl_up_tot = sum(v['up'] for v in ctrl_stats.values())
    ctrl_dn_pct = ctrl_dn_tot / ctrl_n_tot * 100 if ctrl_n_tot else 0
    ctrl_up_pct = ctrl_up_tot / ctrl_n_tot * 100 if ctrl_n_tot else 0

    print(f"\nControl (unconditional {MAX_HOLD}-bar baseline):")
    print(f" DOWN {TP_BPS}bps: {ctrl_dn_pct:.1f}% UP {TP_BPS}bps: {ctrl_up_pct:.1f}% n={ctrl_n_tot:,}")

    # ── Summary + console report ──────────────────────────────────────────────
    summary_rows = build_summary(ctrl_dn_pct, ctrl_up_pct)

    _print_edge_matrix(summary_rows, 'S', ctrl_dn_pct)
    _print_edge_matrix(summary_rows, 'L', ctrl_up_pct)
    _print_ranked(summary_rows, 'S')
    _print_ranked(summary_rows, 'L')

    print(f"\n *** OVERFITTING CAVEAT: thresholds selected from this sweep ***")
    print(f" *** on this dataset constitute data snooping. Use for regime ***")
    print(f" *** correlation analysis only, not for system recalibration. ***")

    # ── Save CSVs ─────────────────────────────────────────────────────────────
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")

    # 1. Summary CSV
    sweep_path = LOG_DIR / f"vd_sweep_{ts}.csv"
    summary_fieldnames = ['direction', 'threshold', 'total_sigs', 'total_wins',
                          'total_losses', 'total_wr', 'total_pf',
                          'total_edge_pp', 'ctrl_baseline']
    for yr in YEARS:
        summary_fieldnames += [f'{yr}_sigs', f'{yr}_wr', f'{yr}_pf', f'{yr}_edge']
    _write_csv(sweep_path, summary_fieldnames, summary_rows)

    # 2. Regime correlation CSV
    regime_path = LOG_DIR / f"vd_regime_{ts}.csv"
    regime_fieldnames = ['date', 'year', 'quarter', 'direction', 'threshold',
                         'n_sig', 'wins', 'losses', 'wr', 'gross_win',
                         'gross_loss', 'btc_return', 'realized_vol',
                         'vd_median', 'vd_std', 'vd_p10', 'vd_p90']
    _write_csv(regime_path, regime_fieldnames, regime_rows)

    print(f"\n sweep → {sweep_path}")
    print(f" regime → {regime_path}")
    print(f" Runtime: {elapsed:.0f}s")
    print(f" Rows in regime CSV: {len(regime_rows):,}")


if __name__ == "__main__":
    main()