Files
DOLPHIN/nautilus_dolphin/vel_div_sweep_5y.py
hjnormey 01c19662cb initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
2026-04-21 16:58:38 +02:00

382 lines
18 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""vel_div Threshold Sweep — 5-Year Klines
==========================================
Raw edge sweep: no engine stack, no fees, no leverage.
Sweeps SHORT (vd <= T) and LONG (vd >= T) across 15 thresholds each.
Output:
1. Console: edge matrix (threshold × year) + ranked summary
2. run_logs/vd_sweep_YYYYMMDD_HHMMSS.csv — per (T, dir, year, quarter)
3. run_logs/vd_regime_YYYYMMDD_HHMMSS.csv — per (date, T, dir): regime ctx
WARNING: This is descriptive/diagnostic only.
Selecting a threshold FROM this sweep on the SAME dataset = data snooping.
Use it to understand WHICH market conditions produce edge, not to tune the system.
Vectorization: precomputes rolling fut_min/fut_max once per file (sliding_window_view).
Runtime: ~5-10 min for 1710 files.
"""
import sys, time, csv, gc
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
from pathlib import Path
from datetime import datetime
from collections import defaultdict
import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import sliding_window_view
# ── Config ────────────────────────────────────────────────────────────────────
# Input: per-day parquet kline caches; output: CSVs under run_logs/.
VBT_DIR = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines")
LOG_DIR = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\nautilus_dolphin\run_logs")
TP_BPS = 95        # take-profit distance in basis points
MAX_HOLD = 120     # max holding period in bars before forced exit at last bar
tp_pct = TP_BPS / 10000.0   # TP as a fraction (0.0095)
# SHORT thresholds (vel_div <= T, T negative)
SHORT_T = [-0.005, -0.008, -0.010, -0.013, -0.015, -0.020,
-0.025, -0.030, -0.040, -0.050, -0.070, -0.100,
-0.140, -0.200, -0.300]
# LONG thresholds (vel_div >= T, T positive)
LONG_T = [+0.005, +0.008, +0.010, +0.013, +0.015, +0.020,
+0.025, +0.030, +0.040, +0.050, +0.070, +0.100,
+0.140, +0.200, +0.300]
# Flattened sweep grid: ('S', T) / ('L', T) tuples, 30 combinations total.
THRESHOLDS = [('S', t) for t in SHORT_T] + [('L', t) for t in LONG_T]
N_T = len(THRESHOLDS)   # NOTE(review): appears unused in the rest of this script
# ── Accumulators ──────────────────────────────────────────────────────────────
# sweep_stats[(dir, T, year, quarter)] = {'wins': 0, 'losses': 0, 'n_sig': 0,
# 'gross_win': 0.0, 'gross_loss': 0.0}
sweep_stats = defaultdict(lambda: {'wins': 0, 'losses': 0, 'n_sig': 0,
'gross_win': 0.0, 'gross_loss': 0.0})
# control_stats[(year, quarter)] = {'up': 0, 'dn': 0, 'n': 0}
ctrl_stats = defaultdict(lambda: {'up': 0, 'dn': 0, 'n': 0})
# regime_rows: one row per (date, dir, T) for CSV — kept lightweight
regime_rows = []
# ── Enumerate files ───────────────────────────────────────────────────────────
parquet_files = sorted(VBT_DIR.glob("*.parquet"))
# Exclude anything living under a 'catalog' path — not daily kline caches.
parquet_files = [p for p in parquet_files if 'catalog' not in str(p)]
total = len(parquet_files)
print(f"Files: {total} TP={TP_BPS}bps MAX_HOLD={MAX_HOLD} bars")
print(f"SHORT thresholds: {len(SHORT_T)} LONG thresholds: {len(LONG_T)}\n")
t0 = time.time()
# Main pass: one iteration per daily parquet file. For each day, precompute
# forward-looking min/max windows once, then evaluate all 30 thresholds
# against them (vectorized) plus an unconditional control baseline.
# NOTE: indentation reconstructed — the scraped source had block bodies
# flattened to column 0, which is not valid Python.
for i, pf in enumerate(parquet_files):
    ds = pf.stem  # filename stem is the session date, "YYYY-MM-DD"
    year = ds[:4]
    month = int(ds[5:7])
    quarter = f"Q{(month-1)//3+1}"
    try:
        df = pd.read_parquet(pf)
    except Exception:
        # Unreadable/corrupt file: skip the day rather than abort the sweep.
        continue
    if 'vel_div' not in df.columns or 'BTCUSDT' not in df.columns:
        continue
    vd = df['vel_div'].values.astype(np.float64)
    btc = df['BTCUSDT'].values.astype(np.float64)
    # Sanitize: non-finite vel_div -> 0.0 (no signal); non-positive or
    # non-finite prices -> NaN so nanmin/nanmax skip them.
    vd = np.where(np.isfinite(vd), vd, 0.0)
    btc = np.where(np.isfinite(btc) & (btc > 0), btc, np.nan)
    n = len(btc)
    if n < MAX_HOLD + 2:
        # Too short to form even one full forward window.
        del df, vd, btc
        continue
    # ── Precompute rolling future min/max in one pass ─────────────────────────
    # windows shape: (n - MAX_HOLD, MAX_HOLD + 1) — includes bar i itself
    windows = sliding_window_view(btc, MAX_HOLD + 1)   # (n-MAX_HOLD, 121)
    ep_arr = windows[:, 0]                              # entry prices (bar i)
    fut_min = np.nanmin(windows[:, 1:], axis=1)         # min of next 120 bars
    fut_max = np.nanmax(windows[:, 1:], axis=1)         # max of next 120 bars
    last_px = windows[:, -1]                 # exit price at MAX_HOLD if no TP
    valid = np.isfinite(ep_arr) & (ep_arr > 0)
    # ── Regime metadata for this date ─────────────────────────────────────────
    btc_clean = btc[np.isfinite(btc)]
    if len(btc_clean) >= 2:
        btc_return = (btc_clean[-1] - btc_clean[0]) / btc_clean[0]
        log_r = np.diff(np.log(btc_clean))
        realized_vol = float(np.std(log_r)) if len(log_r) > 1 else 0.0
    else:
        btc_return = 0.0
        realized_vol = 0.0
    # vd was already made finite above, so this filter is a belt-and-braces pass.
    vd_finite = vd[np.isfinite(vd)]
    vd_median = float(np.median(vd_finite)) if len(vd_finite) > 0 else 0.0
    vd_std = float(np.std(vd_finite)) if len(vd_finite) > 0 else 0.0
    vd_p10 = float(np.percentile(vd_finite, 10)) if len(vd_finite) > 5 else 0.0
    vd_p90 = float(np.percentile(vd_finite, 90)) if len(vd_finite) > 5 else 0.0
    # ── Control: unconditional baseline every 60th bar ────────────────────────
    # Samples the same TP distance with no signal condition, so threshold WRs
    # can be compared against "how often does price just move that far anyway".
    ck = (year, quarter)
    for j in range(0, n - MAX_HOLD, 60):
        ep = btc[j]; ex = btc[j + MAX_HOLD]
        if np.isfinite(ep) and np.isfinite(ex) and ep > 0:
            r = (ex - ep) / ep
            ctrl_stats[ck]['up'] += int(r >= tp_pct)
            ctrl_stats[ck]['dn'] += int(r <= -tp_pct)
            ctrl_stats[ck]['n'] += 1
    # ── Per-threshold sweep ───────────────────────────────────────────────────
    for (direction, T) in THRESHOLDS:
        if direction == 'S':
            sig_mask = vd[:n - MAX_HOLD] <= T
        else:
            sig_mask = vd[:n - MAX_HOLD] >= T
        sig_idx = np.where(sig_mask & valid)[0]
        if len(sig_idx) == 0:
            # Emit a zero row so no-signal days remain visible in the regime CSV.
            regime_rows.append({
                'date': ds, 'year': year, 'quarter': quarter,
                'direction': direction, 'threshold': T,
                'n_sig': 0, 'wins': 0, 'losses': 0, 'wr': np.nan,
                'gross_win': 0.0, 'gross_loss': 0.0,
                'btc_return': round(btc_return, 6),
                'realized_vol': round(realized_vol, 8),
                'vd_median': round(vd_median, 6),
                'vd_std': round(vd_std, 6),
                'vd_p10': round(vd_p10, 6),
                'vd_p90': round(vd_p90, 6),
            })
            continue
        ep_s = ep_arr[sig_idx]
        fmin_s = fut_min[sig_idx]
        fmax_s = fut_max[sig_idx]
        last_s = last_px[sig_idx]
        if direction == 'S':
            tp_price = ep_s * (1.0 - tp_pct)
            hit = fmin_s <= tp_price
            # For losses: return at MAX_HOLD exit, sign-adjusted for short
            loss_ret = np.where(np.isfinite(last_s), (ep_s - last_s) / ep_s, 0.0)
        else:
            tp_price = ep_s * (1.0 + tp_pct)
            hit = fmax_s >= tp_price
            loss_ret = np.where(np.isfinite(last_s), (last_s - ep_s) / ep_s, 0.0)
        wins_arr = hit
        loss_arr = ~hit          # every non-TP signal counts as a "loss" trade
        n_sig = len(sig_idx)
        wins = int(np.sum(wins_arr))
        losses = int(np.sum(loss_arr))
        gross_win = wins * tp_pct   # each win realizes exactly the TP distance
        gross_loss = float(np.sum(np.abs(loss_ret[loss_arr])))
        sk = (direction, T, year, quarter)
        sweep_stats[sk]['wins'] += wins
        sweep_stats[sk]['losses'] += losses
        sweep_stats[sk]['n_sig'] += n_sig
        sweep_stats[sk]['gross_win'] += gross_win
        sweep_stats[sk]['gross_loss'] += gross_loss
        wr = wins / (wins + losses) * 100 if (wins + losses) > 0 else float('nan')
        regime_rows.append({
            'date': ds, 'year': year, 'quarter': quarter,
            'direction': direction, 'threshold': T,
            'n_sig': n_sig, 'wins': wins, 'losses': losses, 'wr': round(wr, 2),
            'gross_win': round(gross_win, 6), 'gross_loss': round(gross_loss, 6),
            'btc_return': round(btc_return, 6),
            'realized_vol': round(realized_vol, 8),
            'vd_median': round(vd_median, 6),
            'vd_std': round(vd_std, 6),
            'vd_p10': round(vd_p10, 6),
            'vd_p90': round(vd_p90, 6),
        })
    # Release the day's large arrays before loading the next file.
    del df, vd, btc, windows, ep_arr, fut_min, fut_max, last_px, valid
    if (i + 1) % 100 == 0:
        # Periodic GC + progress/ETA report every 100 files.
        gc.collect()
        elapsed = time.time() - t0
        rate = (i + 1) / elapsed
        eta = (total - i - 1) / rate
        print(f" [{i+1}/{total}] {ds} {elapsed/60:.1f}m elapsed {eta/60:.1f}m eta")
elapsed = time.time() - t0
print(f"\nPass complete in {elapsed:.0f}s ({elapsed/60:.1f}m)")
# ── Compute per-quarter control baselines ─────────────────────────────────────
def ctrl_baseline(year, quarter, direction):
    """Unconditional hit-rate (%) for one (year, quarter) and direction.

    'S' returns the fraction of control samples whose 120-bar forward move was
    <= -TP; 'L' the fraction >= +TP. Returns NaN when the quarter has no
    control samples. Reads the module-level ``ctrl_stats`` accumulator.

    NOTE(review): defined but not called anywhere in this script — the summary
    below uses the overall ctrl_dn_pct/ctrl_up_pct instead. Kept for ad-hoc
    per-quarter analysis.
    """
    ck = (year, quarter)
    # .get with a zeroed default avoids inserting entries into the defaultdict.
    c = ctrl_stats.get(ck, {'up': 0, 'dn': 0, 'n': 0})
    if c['n'] == 0:
        return float('nan')
    if direction == 'S':
        return c['dn'] / c['n'] * 100
    else:
        return c['up'] / c['n'] * 100
# Overall control: fold the per-quarter counters into a single baseline
# pair (down-hit %, up-hit %) used by the summary and matrix printers.
ctrl_n_tot = 0
ctrl_dn_tot = 0
ctrl_up_tot = 0
for c in ctrl_stats.values():
    ctrl_n_tot += c['n']
    ctrl_dn_tot += c['dn']
    ctrl_up_tot += c['up']
ctrl_dn_pct = (ctrl_dn_tot / ctrl_n_tot * 100) if ctrl_n_tot else 0
ctrl_up_pct = (ctrl_up_tot / ctrl_n_tot * 100) if ctrl_n_tot else 0
print(f"\nControl (unconditional 120-bar baseline):")
print(f" DOWN {TP_BPS}bps: {ctrl_dn_pct:.1f}% UP {TP_BPS}bps: {ctrl_up_pct:.1f}% n={ctrl_n_tot:,}")
# ── Build summary table ───────────────────────────────────────────────────────
# One summary row per (direction, threshold): per-year signal counts, win
# rate, profit factor and edge (WR minus the overall control baseline),
# plus dataset-wide totals. Indentation reconstructed from the flattened
# scrape; logic unchanged.
YEARS = ['2021', '2022', '2023', '2024', '2025', '2026']
summary_rows = []
for (direction, T) in THRESHOLDS:
    row = {'direction': direction, 'threshold': T}
    total_wins = total_losses = total_sigs = 0
    total_gw = total_gl = 0.0
    for yr in YEARS:
        yr_wins = yr_losses = yr_sigs = 0
        yr_gw = yr_gl = 0.0
        for q in ['Q1', 'Q2', 'Q3', 'Q4']:
            sk = (direction, T, yr, q)
            # Membership test first: avoids defaultdict auto-inserting
            # zero entries for quarters that never produced signals.
            if sk in sweep_stats:
                s = sweep_stats[sk]
                yr_wins += s['wins']; yr_losses += s['losses']
                yr_sigs += s['n_sig']; yr_gw += s['gross_win']
                yr_gl += s['gross_loss']
        yr_n = yr_wins + yr_losses
        yr_wr = yr_wins / yr_n * 100 if yr_n > 0 else float('nan')
        # PF capped at 999.0 when there are wins but zero gross loss.
        yr_pf = yr_gw / yr_gl if yr_gl > 0 else (999.0 if yr_gw > 0 else float('nan'))
        # NOTE(review): edge uses the OVERALL control baseline, not the
        # per-year one (ctrl_baseline() exists but is unused) — intentional?
        ctrl_bl = ctrl_dn_pct if direction == 'S' else ctrl_up_pct
        # x != x is the NaN test (NaN is the only value unequal to itself).
        yr_edge = yr_wr - ctrl_bl if not (yr_wr != yr_wr) else float('nan')
        row[f'{yr}_sigs'] = yr_sigs
        row[f'{yr}_wr'] = round(yr_wr, 2) if not (yr_wr != yr_wr) else float('nan')
        row[f'{yr}_pf'] = round(yr_pf, 3) if not (yr_pf != yr_pf) else float('nan')
        row[f'{yr}_edge'] = round(yr_edge, 2) if not (yr_edge != yr_edge) else float('nan')
        total_wins += yr_wins; total_losses += yr_losses
        total_sigs += yr_sigs; total_gw += yr_gw; total_gl += yr_gl
    total_n = total_wins + total_losses
    total_wr = total_wins / total_n * 100 if total_n > 0 else float('nan')
    total_pf = total_gw / total_gl if total_gl > 0 else (999.0 if total_gw > 0 else float('nan'))
    ctrl_bl = ctrl_dn_pct if direction == 'S' else ctrl_up_pct
    total_edge = total_wr - ctrl_bl if not (total_wr != total_wr) else float('nan')
    row['total_sigs'] = total_sigs
    row['total_wins'] = total_wins
    row['total_losses'] = total_losses
    row['total_wr'] = round(total_wr, 2)
    row['total_pf'] = round(total_pf, 3)
    row['total_edge_pp'] = round(total_edge, 2)
    row['ctrl_baseline'] = round(ctrl_bl, 2)
    summary_rows.append(row)
# ── Print edge matrix (SHORT) ─────────────────────────────────────────────────
def fmt_edge(e):
    """Format an edge value (percentage points) for console display.

    NaN renders as ' --- '; values with |e| >= 0.1 get one decimal, smaller
    values two decimals. Always signed.
    NOTE(review): defined but unused below — the matrix printers format inline.
    """
    if e != e:  # NaN check: NaN is the only value unequal to itself
        return ' --- '
    return f'{e:>+5.1f}pp' if abs(e) >= 0.1 else f' {e:>+4.2f}pp'
def _matrix_line(row):
    """Format one edge-matrix console line (shared by SHORT and LONG tables).

    Per-year edge cells are '+x.xpp' or a '---' placeholder padded to the same
    10-char column width when the year had no trades (NaN edge).
    """
    cells = []
    for yr in YEARS:
        e = row.get(yr + '_edge', float('nan'))
        cells.append(f"{e:>+8.1f}pp" if not (e != e) else '       ---')
    yr_edges = " ".join(cells)
    tag = '← EDGE' if row['total_edge_pp'] > 2 else ('← weak' if row['total_edge_pp'] > 0 else '')
    return f" {row['threshold']:>10.3f} {yr_edges} {row['total_edge_pp']:>+8.1f}pp {row['total_sigs']:>8,} {tag}"

print(f"\n{'='*90}")
print(f" EDGE MATRIX — SHORT (vel_div <= T) [edge = WR unconditional_down_{TP_BPS}bps]")
print(f" ctrl_baseline = {ctrl_dn_pct:.1f}%")
print(f"{'='*90}")
hdr = f" {'Threshold':>10} " + " ".join(f"{'Edge '+yr:>10}" for yr in YEARS) + " " + f"{'TOTAL':>10} {'n_sigs':>8}"
print(hdr)
print(f" {'-'*88}")
for row in summary_rows:
    if row['direction'] == 'S':
        print(_matrix_line(row))
print(f"\n{'='*90}")
print(f" EDGE MATRIX — LONG (vel_div >= T) [edge = WR unconditional_up_{TP_BPS}bps]")
print(f" ctrl_baseline = {ctrl_up_pct:.1f}%")
print(f"{'='*90}")
print(hdr)
print(f" {'-'*88}")
for row in summary_rows:
    if row['direction'] == 'L':
        print(_matrix_line(row))
# ── Ranked summary (positive edge, SHORT) ────────────────────────────────────
def _print_ranked(direction, label):
    """Print thresholds with positive total edge for one direction,
    sorted best-first. The SHORT and LONG tables are identical in layout,
    so both sections share this helper."""
    print(f"\n{'='*70}")
    print(f" TOP {label} THRESHOLDS by total edge (>0 only)")
    print(f"{'='*70}")
    print(f" {'T':>8} {'n_sigs':>8} {'n_trades':>9} {'WR':>7} {'PF':>7} {'Edge':>8} {'2022':>8} {'2023':>8} {'2024':>8}")
    print(f" {'-'*68}")
    ranked = sorted([r for r in summary_rows
                     if r['direction'] == direction and r['total_edge_pp'] > 0],
                    key=lambda r: r['total_edge_pp'], reverse=True)
    for r in ranked:
        print(f" {r['threshold']:>8.3f} {r['total_sigs']:>8,} "
              f"{r['total_wins']+r['total_losses']:>9,} "
              f"{r['total_wr']:>6.1f}% {r['total_pf']:>7.3f} {r['total_edge_pp']:>+7.1f}pp "
              f"{r.get('2022_edge', float('nan')):>+7.1f}pp "
              f"{r.get('2023_edge', float('nan')):>+7.1f}pp "
              f"{r.get('2024_edge', float('nan')):>+7.1f}pp")
    if not ranked:
        print(" (none — no positive edge found at any threshold)")

_print_ranked('S', 'SHORT')
_print_ranked('L', 'LONG')
print(f"\n *** OVERFITTING CAVEAT: thresholds selected from this sweep ***")
print(f" *** on this dataset constitute data snooping. Use for regime ***")
print(f" *** correlation analysis only, not for system recalibration. ***")
# ── Save CSVs ─────────────────────────────────────────────────────────────────
# Two outputs: a per-threshold summary and a per-(date, threshold) regime file.
# parents=True: also create missing parent directories instead of failing.
LOG_DIR.mkdir(parents=True, exist_ok=True)
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
# 1. Summary CSV — one row per (direction, threshold)
sweep_path = LOG_DIR / f"vd_sweep_{ts}.csv"
summary_fieldnames = ['direction', 'threshold', 'total_sigs', 'total_wins', 'total_losses',
                      'total_wr', 'total_pf', 'total_edge_pp', 'ctrl_baseline']
for yr in YEARS:
    summary_fieldnames += [f'{yr}_sigs', f'{yr}_wr', f'{yr}_pf', f'{yr}_edge']
# encoding pinned so output is platform-independent (script already forces
# utf-8 on stdout); newline='' is required by the csv module on Windows.
with open(sweep_path, 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, fieldnames=summary_fieldnames, extrasaction='ignore')
    w.writeheader()
    w.writerows(summary_rows)
# 2. Regime correlation CSV — one row per (date, direction, threshold)
regime_path = LOG_DIR / f"vd_regime_{ts}.csv"
regime_fieldnames = ['date', 'year', 'quarter', 'direction', 'threshold',
                     'n_sig', 'wins', 'losses', 'wr',
                     'gross_win', 'gross_loss',
                     'btc_return', 'realized_vol',
                     'vd_median', 'vd_std', 'vd_p10', 'vd_p90']
with open(regime_path, 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, fieldnames=regime_fieldnames, extrasaction='ignore')
    w.writeheader()
    w.writerows(regime_rows)
print(f"\n sweep → {sweep_path}")
print(f" regime → {regime_path}")
print(f" Runtime: {elapsed:.0f}s")   # elapsed: wall time of the main pass
print(f" Rows in regime CSV: {len(regime_rows):,}")