# Extracted from: DOLPHIN/nautilus_dolphin/vel_div_sweep_5y.py (382 lines, 18 KiB, Python)
"""vel_div Threshold Sweep — 5-Year Klines
==========================================
Raw edge sweep: no engine stack, no fees, no leverage.
Sweeps SHORT (vd <= T) and LONG (vd >= T) across 15 thresholds each.
Output:
1. Console: edge matrix (threshold × year) + ranked summary
2. run_logs/vd_sweep_YYYYMMDD_HHMMSS.csv per (T, dir, year, quarter)
3. run_logs/vd_regime_YYYYMMDD_HHMMSS.csv per (date, T, dir): regime ctx
WARNING: This is descriptive/diagnostic only.
Selecting a threshold FROM this sweep on the SAME dataset = data snooping.
Use it to understand WHICH market conditions produce edge, not to tune the system.
Vectorization: precomputes rolling fut_min/fut_max once per file (sliding_window_view).
Runtime: ~5-10 min for 1710 files.
"""
import csv
import gc
import math
import sys
import time

sys.stdout.reconfigure(encoding='utf-8', errors='replace')

from collections import defaultdict
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import sliding_window_view
# ── Config ────────────────────────────────────────────────────────────────────
# Input klines cache and output log directory (Windows absolute paths).
VBT_DIR = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines")
LOG_DIR = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\nautilus_dolphin\run_logs")
TP_BPS = 95       # take-profit distance in basis points
MAX_HOLD = 120    # max holding period in bars before forced exit
tp_pct = TP_BPS / 10000.0  # TP distance as a fraction (0.0095)
# SHORT thresholds (vel_div <= T, T negative)
SHORT_T = [-0.005, -0.008, -0.010, -0.013, -0.015, -0.020,
           -0.025, -0.030, -0.040, -0.050, -0.070, -0.100,
           -0.140, -0.200, -0.300]
# LONG thresholds (vel_div >= T, T positive)
LONG_T = [+0.005, +0.008, +0.010, +0.013, +0.015, +0.020,
          +0.025, +0.030, +0.040, +0.050, +0.070, +0.100,
          +0.140, +0.200, +0.300]
# Flattened (direction, threshold) pairs: 'S' = short, 'L' = long.
THRESHOLDS = [('S', t) for t in SHORT_T] + [('L', t) for t in LONG_T]
N_T = len(THRESHOLDS)  # total pair count; not referenced in the visible code — TODO confirm still needed
# ── Accumulators ──────────────────────────────────────────────────────────────
# Filled during the main pass, read by the summary/CSV sections below.
# sweep_stats[(dir, T, year, quarter)] = {'wins': 0, 'losses': 0, 'n_sig': 0,
#                                         'gross_win': 0.0, 'gross_loss': 0.0}
sweep_stats = defaultdict(lambda: {'wins': 0, 'losses': 0, 'n_sig': 0,
                                   'gross_win': 0.0, 'gross_loss': 0.0})
# ctrl_stats[(year, quarter)] = {'up': 0, 'dn': 0, 'n': 0} — unconditional baseline counts
ctrl_stats = defaultdict(lambda: {'up': 0, 'dn': 0, 'n': 0})
# regime_rows: one dict per (date, dir, T) for the regime CSV — kept lightweight
regime_rows = []
# ── Enumerate files ───────────────────────────────────────────────────────────
# One parquet per day; sorted by name so the pass runs chronologically.
parquet_files = sorted(VBT_DIR.glob("*.parquet"))
# Skip any path containing 'catalog' (presumably catalog/index artifacts — confirm).
parquet_files = [p for p in parquet_files if 'catalog' not in str(p)]
total = len(parquet_files)
print(f"Files: {total} TP={TP_BPS}bps MAX_HOLD={MAX_HOLD} bars")
print(f"SHORT thresholds: {len(SHORT_T)} LONG thresholds: {len(LONG_T)}\n")
# ── Main pass: one vectorized threshold sweep per daily parquet file ──────────
t0 = time.time()
for i, pf in enumerate(parquet_files):
    ds = pf.stem  # filename stem is the session date, "YYYY-MM-DD"
    year = ds[:4]
    month = int(ds[5:7])
    quarter = f"Q{(month-1)//3+1}"
    try:
        df = pd.read_parquet(pf)
    except Exception:
        # Best-effort diagnostic sweep: skip unreadable/corrupt files silently.
        continue
    if 'vel_div' not in df.columns or 'BTCUSDT' not in df.columns:
        continue
    vd = df['vel_div'].values.astype(np.float64)
    btc = df['BTCUSDT'].values.astype(np.float64)
    # Sanitize: non-finite vel_div -> 0.0 (zero can never cross a threshold);
    # non-finite or non-positive prices -> NaN (masked out via `valid` below).
    vd = np.where(np.isfinite(vd), vd, 0.0)
    btc = np.where(np.isfinite(btc) & (btc > 0), btc, np.nan)
    n = len(btc)
    if n < MAX_HOLD + 2:
        del df, vd, btc
        continue
    # ── Precompute rolling future min/max in one pass ─────────────────────────
    # windows shape: (n - MAX_HOLD, MAX_HOLD + 1); column 0 is bar i itself.
    windows = sliding_window_view(btc, MAX_HOLD + 1)
    ep_arr = windows[:, 0]                        # entry price at bar i
    fut_min = np.nanmin(windows[:, 1:], axis=1)   # min of the next MAX_HOLD bars
    fut_max = np.nanmax(windows[:, 1:], axis=1)   # max of the next MAX_HOLD bars
    last_px = windows[:, -1]                      # exit price at MAX_HOLD if no TP hit
    valid = np.isfinite(ep_arr) & (ep_arr > 0)
    # ── Regime metadata for this date (copied into every regime row) ──────────
    btc_clean = btc[np.isfinite(btc)]
    if len(btc_clean) >= 2:
        btc_return = (btc_clean[-1] - btc_clean[0]) / btc_clean[0]
        log_r = np.diff(np.log(btc_clean))
        realized_vol = float(np.std(log_r)) if len(log_r) > 1 else 0.0
    else:
        btc_return = 0.0
        realized_vol = 0.0
    vd_finite = vd[np.isfinite(vd)]  # vd is already NaN-sanitized above; kept for safety
    vd_median = float(np.median(vd_finite)) if len(vd_finite) > 0 else 0.0
    vd_std = float(np.std(vd_finite)) if len(vd_finite) > 0 else 0.0
    vd_p10 = float(np.percentile(vd_finite, 10)) if len(vd_finite) > 5 else 0.0
    vd_p90 = float(np.percentile(vd_finite, 90)) if len(vd_finite) > 5 else 0.0
    # ── Control: unconditional baseline sampled every 60th bar ────────────────
    ck = (year, quarter)
    for j in range(0, n - MAX_HOLD, 60):
        ep = btc[j]
        ex = btc[j + MAX_HOLD]
        if np.isfinite(ep) and np.isfinite(ex) and ep > 0:
            r = (ex - ep) / ep
            ctrl_stats[ck]['up'] += int(r >= tp_pct)
            ctrl_stats[ck]['dn'] += int(r <= -tp_pct)
            ctrl_stats[ck]['n'] += 1
    # ── Per-threshold sweep ───────────────────────────────────────────────────
    for (direction, T) in THRESHOLDS:
        if direction == 'S':
            sig_mask = vd[:n - MAX_HOLD] <= T
        else:
            sig_mask = vd[:n - MAX_HOLD] >= T
        sig_idx = np.where(sig_mask & valid)[0]
        if len(sig_idx) == 0:
            # Still emit a regime row so day-level coverage stays complete.
            regime_rows.append({
                'date': ds, 'year': year, 'quarter': quarter,
                'direction': direction, 'threshold': T,
                'n_sig': 0, 'wins': 0, 'losses': 0, 'wr': np.nan,
                'gross_win': 0.0, 'gross_loss': 0.0,
                'btc_return': round(btc_return, 6),
                'realized_vol': round(realized_vol, 8),
                'vd_median': round(vd_median, 6),
                'vd_std': round(vd_std, 6),
                'vd_p10': round(vd_p10, 6),
                'vd_p90': round(vd_p90, 6),
            })
            continue
        ep_s = ep_arr[sig_idx]
        fmin_s = fut_min[sig_idx]
        fmax_s = fut_max[sig_idx]
        last_s = last_px[sig_idx]
        if direction == 'S':
            tp_price = ep_s * (1.0 - tp_pct)
            hit = fmin_s <= tp_price
            # Losses exit at MAX_HOLD; return is sign-adjusted for the short side.
            loss_ret = np.where(np.isfinite(last_s), (ep_s - last_s) / ep_s, 0.0)
        else:
            tp_price = ep_s * (1.0 + tp_pct)
            hit = fmax_s >= tp_price
            loss_ret = np.where(np.isfinite(last_s), (last_s - ep_s) / ep_s, 0.0)
        wins_arr = hit
        loss_arr = ~hit
        n_sig = len(sig_idx)
        wins = int(np.sum(wins_arr))
        losses = int(np.sum(loss_arr))
        gross_win = wins * tp_pct  # every winner earns exactly the TP distance
        gross_loss = float(np.sum(np.abs(loss_ret[loss_arr])))
        sk = (direction, T, year, quarter)
        sweep_stats[sk]['wins'] += wins
        sweep_stats[sk]['losses'] += losses
        sweep_stats[sk]['n_sig'] += n_sig
        sweep_stats[sk]['gross_win'] += gross_win
        sweep_stats[sk]['gross_loss'] += gross_loss
        wr = wins / (wins + losses) * 100 if (wins + losses) > 0 else float('nan')
        regime_rows.append({
            'date': ds, 'year': year, 'quarter': quarter,
            'direction': direction, 'threshold': T,
            'n_sig': n_sig, 'wins': wins, 'losses': losses, 'wr': round(wr, 2),
            'gross_win': round(gross_win, 6), 'gross_loss': round(gross_loss, 6),
            'btc_return': round(btc_return, 6),
            'realized_vol': round(realized_vol, 8),
            'vd_median': round(vd_median, 6),
            'vd_std': round(vd_std, 6),
            'vd_p10': round(vd_p10, 6),
            'vd_p90': round(vd_p90, 6),
        })
    # Drop the per-file arrays; periodic gc keeps the long run's memory flat.
    del df, vd, btc, windows, ep_arr, fut_min, fut_max, last_px, valid
    if (i + 1) % 100 == 0:
        gc.collect()
        elapsed = time.time() - t0
        rate = (i + 1) / elapsed
        eta = (total - i - 1) / rate
        print(f" [{i+1}/{total}] {ds} {elapsed/60:.1f}m elapsed {eta/60:.1f}m eta")
elapsed = time.time() - t0
print(f"\nPass complete in {elapsed:.0f}s ({elapsed/60:.1f}m)")
# ── Compute per-quarter control baselines ─────────────────────────────────────
def ctrl_baseline(year, quarter, direction, stats=None):
    """Unconditional baseline hit-rate (%) for one (year, quarter) bucket.

    For direction 'S' this is the percentage of sampled bars whose
    MAX_HOLD-bar move went DOWN by at least the TP distance; for any other
    direction, the UP percentage. Returns NaN when the bucket has no samples.

    Parameters
    ----------
    year, quarter : str
        Bucket key, e.g. ('2022', 'Q3').
    direction : str
        'S' selects the down-move rate; anything else the up-move rate.
    stats : mapping, optional
        Control-stats mapping to read. Defaults to the module-level
        ``ctrl_stats`` accumulator; parameterized so the helper is testable
        without module state.
    """
    src = ctrl_stats if stats is None else stats
    c = src.get((year, quarter), {'up': 0, 'dn': 0, 'n': 0})
    if c['n'] == 0:
        return float('nan')
    hits = c['dn'] if direction == 'S' else c['up']
    return hits / c['n'] * 100
# Overall control
# Pooled unconditional baseline across every (year, quarter) bucket: the share
# of sampled bars whose next-120-bar move reached the TP distance down/up.
ctrl_n_tot = sum(v['n'] for v in ctrl_stats.values())
ctrl_dn_tot = sum(v['dn'] for v in ctrl_stats.values())
ctrl_up_tot = sum(v['up'] for v in ctrl_stats.values())
ctrl_dn_pct = ctrl_dn_tot / ctrl_n_tot * 100 if ctrl_n_tot else 0
ctrl_up_pct = ctrl_up_tot / ctrl_n_tot * 100 if ctrl_n_tot else 0
print(f"\nControl (unconditional 120-bar baseline):")
print(f" DOWN {TP_BPS}bps: {ctrl_dn_pct:.1f}% UP {TP_BPS}bps: {ctrl_up_pct:.1f}% n={ctrl_n_tot:,}")
# ── Build summary table ───────────────────────────────────────────────────────
# One row per (direction, threshold): per-year and total win-rate, profit
# factor, and edge vs the pooled unconditional baseline (in percentage points).
YEARS = ['2021', '2022', '2023', '2024', '2025', '2026']
summary_rows = []
for (direction, T) in THRESHOLDS:
    row = {'direction': direction, 'threshold': T}
    total_wins = total_losses = total_sigs = 0
    total_gw = total_gl = 0.0
    # NOTE(review): baseline is the pooled per-direction control rate; the
    # per-quarter ctrl_baseline() helper above is not used here — confirm intended.
    ctrl_bl = ctrl_dn_pct if direction == 'S' else ctrl_up_pct
    for yr in YEARS:
        yr_wins = yr_losses = yr_sigs = 0
        yr_gw = yr_gl = 0.0
        for q in ('Q1', 'Q2', 'Q3', 'Q4'):
            sk = (direction, T, yr, q)
            # Membership test (not indexing) avoids inserting empty entries
            # into the defaultdict.
            if sk in sweep_stats:
                s = sweep_stats[sk]
                yr_wins += s['wins']
                yr_losses += s['losses']
                yr_sigs += s['n_sig']
                yr_gw += s['gross_win']
                yr_gl += s['gross_loss']
        yr_n = yr_wins + yr_losses
        yr_wr = yr_wins / yr_n * 100 if yr_n > 0 else float('nan')
        # PF sentinel: 999 when there are wins but zero gross loss.
        yr_pf = yr_gw / yr_gl if yr_gl > 0 else (999.0 if yr_gw > 0 else float('nan'))
        yr_edge = float('nan') if math.isnan(yr_wr) else yr_wr - ctrl_bl
        row[f'{yr}_sigs'] = yr_sigs
        row[f'{yr}_wr'] = float('nan') if math.isnan(yr_wr) else round(yr_wr, 2)
        row[f'{yr}_pf'] = float('nan') if math.isnan(yr_pf) else round(yr_pf, 3)
        row[f'{yr}_edge'] = float('nan') if math.isnan(yr_edge) else round(yr_edge, 2)
        total_wins += yr_wins
        total_losses += yr_losses
        total_sigs += yr_sigs
        total_gw += yr_gw
        total_gl += yr_gl
    total_n = total_wins + total_losses
    total_wr = total_wins / total_n * 100 if total_n > 0 else float('nan')
    total_pf = total_gw / total_gl if total_gl > 0 else (999.0 if total_gw > 0 else float('nan'))
    total_edge = float('nan') if math.isnan(total_wr) else total_wr - ctrl_bl
    row['total_sigs'] = total_sigs
    row['total_wins'] = total_wins
    row['total_losses'] = total_losses
    row['total_wr'] = round(total_wr, 2)        # round(nan) stays nan
    row['total_pf'] = round(total_pf, 3)
    row['total_edge_pp'] = round(total_edge, 2)
    row['ctrl_baseline'] = round(ctrl_bl, 2)
    summary_rows.append(row)
# ── Print edge matrix (SHORT) ─────────────────────────────────────────────────
def fmt_edge(e):
    """Render an edge value (percentage points) for the console matrix.

    NaN renders as the placeholder ' --- '; magnitudes below 0.1pp get two
    decimals, larger ones a single decimal — both with an explicit sign.
    """
    if e != e:  # NaN never equals itself
        return ' --- '
    if abs(e) >= 0.1:
        return f'{e:>+5.1f}pp'
    return f' {e:>+4.2f}pp'
# Shared header line for both matrices: threshold column + one edge column per year.
hdr = f" {'Threshold':>10} " + " ".join(f"{'Edge '+yr:>10}" for yr in YEARS) + " " + f"{'TOTAL':>10} {'n_sigs':>8}"

def _print_edge_matrix(dir_code, title, ctrl_pct):
    """Print one edge matrix (threshold x year) for direction `dir_code` ('S'/'L')."""
    print(f"\n{'='*90}")
    print(title)
    print(f" ctrl_baseline = {ctrl_pct:.1f}%")
    print(f"{'='*90}")
    print(hdr)
    print(f" {'-'*88}")
    for row in summary_rows:
        if row['direction'] != dir_code:
            continue
        cells = []
        for yr in YEARS:
            e = row.get(yr + '_edge', float('nan'))
            cells.append(' ---' if math.isnan(e) else f"{e:>+8.1f}pp")
        yr_edges = " ".join(cells)
        # NaN total_edge_pp fails both comparisons, yielding an empty tag.
        tag = '← EDGE' if row['total_edge_pp'] > 2 else ('← weak' if row['total_edge_pp'] > 0 else '')
        print(f" {row['threshold']:>10.3f} {yr_edges} {row['total_edge_pp']:>+8.1f}pp {row['total_sigs']:>8,} {tag}")

_print_edge_matrix('S', f" EDGE MATRIX — SHORT (vel_div <= T) [edge = WR unconditional_down_{TP_BPS}bps]", ctrl_dn_pct)
_print_edge_matrix('L', f" EDGE MATRIX — LONG (vel_div >= T) [edge = WR unconditional_up_{TP_BPS}bps]", ctrl_up_pct)
# ── Ranked summaries: thresholds with positive total edge, best first ─────────
for label, dir_code in (('SHORT', 'S'), ('LONG', 'L')):
    print(f"\n{'='*70}")
    print(f" TOP {label} THRESHOLDS by total edge (>0 only)")
    print(f"{'='*70}")
    print(f" {'T':>8} {'n_sigs':>8} {'n_trades':>9} {'WR':>7} {'PF':>7} {'Edge':>8} {'2022':>8} {'2023':>8} {'2024':>8}")
    print(f" {'-'*68}")
    # NaN edges fail the > 0 comparison and are excluded.
    ranked = sorted((r for r in summary_rows
                     if r['direction'] == dir_code and r['total_edge_pp'] > 0),
                    key=lambda r: r['total_edge_pp'], reverse=True)
    for r in ranked:
        print(f" {r['threshold']:>8.3f} {r['total_sigs']:>8,} "
              f"{r['total_wins']+r['total_losses']:>9,} "
              f"{r['total_wr']:>6.1f}% {r['total_pf']:>7.3f} {r['total_edge_pp']:>+7.1f}pp "
              f"{r.get('2022_edge', float('nan')):>+7.1f}pp "
              f"{r.get('2023_edge', float('nan')):>+7.1f}pp "
              f"{r.get('2024_edge', float('nan')):>+7.1f}pp")
    if not ranked:
        print(" (none — no positive edge found at any threshold)")
print(f"\n *** OVERFITTING CAVEAT: thresholds selected from this sweep ***")
print(f" *** on this dataset constitute data snooping. Use for regime ***")
print(f" *** correlation analysis only, not for system recalibration. ***")
# ── Save CSVs ─────────────────────────────────────────────────────────────────
# parents=True: also create any missing parent directory, not just the leaf.
LOG_DIR.mkdir(parents=True, exist_ok=True)
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
# 1. Summary CSV — one row per (direction, threshold)
sweep_path = LOG_DIR / f"vd_sweep_{ts}.csv"
summary_fieldnames = ['direction', 'threshold', 'total_sigs', 'total_wins', 'total_losses',
                      'total_wr', 'total_pf', 'total_edge_pp', 'ctrl_baseline']
for yr in YEARS:
    summary_fieldnames += [f'{yr}_sigs', f'{yr}_wr', f'{yr}_pf', f'{yr}_edge']
with open(sweep_path, 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, fieldnames=summary_fieldnames, extrasaction='ignore')
    w.writeheader()
    w.writerows(summary_rows)
# 2. Regime correlation CSV — one row per (date, direction, threshold)
regime_path = LOG_DIR / f"vd_regime_{ts}.csv"
regime_fieldnames = ['date', 'year', 'quarter', 'direction', 'threshold',
                     'n_sig', 'wins', 'losses', 'wr',
                     'gross_win', 'gross_loss',
                     'btc_return', 'realized_vol',
                     'vd_median', 'vd_std', 'vd_p10', 'vd_p90']
with open(regime_path, 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, fieldnames=regime_fieldnames, extrasaction='ignore')
    w.writeheader()
    w.writerows(regime_rows)
print(f"\n sweep → {sweep_path}")
print(f" regime → {regime_path}")
print(f" Runtime: {elapsed:.0f}s")
print(f" Rows in regime CSV: {len(regime_rows):,}")