657 lines
26 KiB
Python
657 lines
26 KiB
Python
|
|
"""Entry-bar eigenvalue feature sweep — winner vs loser discrimination.
|
||
|
|
|
||
|
|
Loads the 55-day champion dataset, re-runs the full engine stack to collect
|
||
|
|
trade_history with entry_bar indices, then extracts a rich feature matrix
|
||
|
|
(raw + derivatives + cross-TF combos + historical context) at each entry bar.
|
||
|
|
|
||
|
|
Statistical analysis:
|
||
|
|
- Pearson r vs pnl_pct
|
||
|
|
- Point-biserial + KS stat vs winner binary
|
||
|
|
- ROC-AUC for winner / MAX_HOLD-loss / TP discrimination
|
||
|
|
|
||
|
|
Outputs:
|
||
|
|
run_logs/entry_quality_features_<ts>.csv — per-trade feature matrix
|
||
|
|
run_logs/entry_quality_sweep_<ts>.csv — per-feature analysis table
|
||
|
|
"""
|
||
|
|
import sys, time, csv
|
||
|
|
from pathlib import Path
|
||
|
|
from datetime import datetime
|
||
|
|
|
||
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
||
|
|
|
||
|
|
# ── Numba JIT warmup (must run BEFORE other imports that touch numpy internals) ──
print("Compiling numba kernels...")
t0c = time.time()
from nautilus_dolphin.nautilus.alpha_asset_selector import compute_irp_nb, compute_ars_nb, rank_assets_irp_nb
from nautilus_dolphin.nautilus.alpha_bet_sizer import compute_sizing_nb
from nautilus_dolphin.nautilus.alpha_signal_generator import check_dc_nb
from nautilus_dolphin.nautilus.ob_features import (
    OBFeatureEngine, compute_imbalance_nb, compute_depth_1pct_nb,
    compute_depth_quality_nb, compute_fill_probability_nb, compute_spread_proxy_nb,
    compute_depth_asymmetry_nb, compute_imbalance_persistence_nb,
    compute_withdrawal_velocity_nb, compute_market_agreement_nb, compute_cascade_signal_nb,
)
from nautilus_dolphin.nautilus.ob_provider import MockOBProvider
import numpy as np

# Call every numba kernel once with throwaway (type-correct) inputs so the
# JIT compilation cost is paid up-front rather than inside the engine run.
_p = np.array([1.0, 2.0, 3.0], dtype=np.float64)
compute_irp_nb(_p, -1); compute_ars_nb(1.0, 0.5, 0.01)
rank_assets_irp_nb(np.ones((10, 2), dtype=np.float64), 8, -1, 5, 500.0, 20, 0.20)
compute_sizing_nb(-0.03, -0.02, -0.05, 3.0, 0.5, 5.0, 0.20, True, True, 0.0,
                  np.zeros(4, dtype=np.int64), np.zeros(4, dtype=np.int64),
                  np.zeros(5, dtype=np.float64), 0, -1, 0.01, 0.04)
check_dc_nb(_p, 3, 1, 0.75)
# Synthetic bid/ask depth arrays for the order-book kernels.
_b = np.array([100.0, 200.0, 300.0, 400.0, 500.0], dtype=np.float64)
_a = np.array([110.0, 190.0, 310.0, 390.0, 510.0], dtype=np.float64)
compute_imbalance_nb(_b, _a); compute_depth_1pct_nb(_b, _a)
compute_depth_quality_nb(210.0, 200.0); compute_fill_probability_nb(1.0)
compute_spread_proxy_nb(_b, _a); compute_depth_asymmetry_nb(_b, _a)
compute_imbalance_persistence_nb(np.array([0.1, -0.1], dtype=np.float64), 2)
compute_withdrawal_velocity_nb(np.array([100.0, 110.0], dtype=np.float64), 1)
compute_market_agreement_nb(np.array([0.1, -0.05], dtype=np.float64), 2)
compute_cascade_signal_nb(np.array([-0.05, -0.15], dtype=np.float64), 2, -0.10)
print(f" JIT: {time.time() - t0c:.1f}s")
|
||
|
|
|
||
|
|
import pandas as pd
|
||
|
|
from nautilus_dolphin.nautilus.esf_alpha_orchestrator import NDAlphaEngine
|
||
|
|
from nautilus_dolphin.nautilus.adaptive_circuit_breaker import AdaptiveCircuitBreaker
|
||
|
|
from mc.mc_ml import DolphinForewarner
|
||
|
|
|
||
|
|
# ── Config (identical to test_pf_dynamic_beta_validate.py) ──────────────────────
# Absolute paths to the cached 55-day parquet dataset and MC-Forewarner models.
VBT_DIR = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache")
MC_MODELS_DIR = str(Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\nautilus_dolphin\mc_results\models"))

# Parquet columns that carry metadata / regime features rather than per-asset
# price series; every column NOT in this set is treated as an asset column.
META_COLS = {
    'timestamp', 'scan_number',
    'v50_lambda_max_velocity', 'v150_lambda_max_velocity',
    'v300_lambda_max_velocity', 'v750_lambda_max_velocity',
    'vel_div', 'instability_50', 'instability_150',
}
|
||
|
|
|
||
|
|
# Champion engine configuration. NOTE: the values here must stay in sync with
# the corresponding keys in MC_BASE_CFG below (same thresholds/fees/TP).
ENGINE_KWARGS = dict(
    initial_capital=25000.0, vel_div_threshold=-0.02, vel_div_extreme=-0.05,
    min_leverage=0.5, max_leverage=5.0, leverage_convexity=3.0,
    fraction=0.20, fixed_tp_pct=0.0099, stop_pct=1.0, max_hold_bars=120,
    use_direction_confirm=True, dc_lookback_bars=7, dc_min_magnitude_bps=0.75,
    dc_skip_contradicts=True, dc_leverage_boost=1.0, dc_leverage_reduce=0.5,
    use_asset_selection=True, min_irp_alignment=0.45,
    use_sp_fees=True, use_sp_slippage=True,
    sp_maker_entry_rate=0.62, sp_maker_exit_rate=0.50,
    use_ob_edge=True, ob_edge_bps=5.0, ob_confirm_rate=0.40,
    lookback=100, use_alpha_layers=True, use_dynamic_leverage=True, seed=42,
    tf_enabled=False,
)
|
||
|
|
|
||
|
|
# Base configuration dict handed to the MC-Forewarner (engine.set_mc_forewarner).
# Mirrors ENGINE_KWARGS above plus forewarner-specific keys (trial_id, ob_*
# biases, acb_* circuit-breaker thresholds).
MC_BASE_CFG = {
    'trial_id': 0, 'vel_div_threshold': -0.020, 'vel_div_extreme': -0.050,
    'use_direction_confirm': True, 'dc_lookback_bars': 7,
    'dc_min_magnitude_bps': 0.75, 'dc_skip_contradicts': True,
    'dc_leverage_boost': 1.00, 'dc_leverage_reduce': 0.50,
    'vd_trend_lookback': 10, 'min_leverage': 0.50, 'max_leverage': 5.00,
    'leverage_convexity': 3.00, 'fraction': 0.20,
    'use_alpha_layers': True, 'use_dynamic_leverage': True,
    'fixed_tp_pct': 0.0099, 'stop_pct': 1.00, 'max_hold_bars': 120,
    'use_sp_fees': True, 'use_sp_slippage': True,
    'sp_maker_entry_rate': 0.62, 'sp_maker_exit_rate': 0.50,
    'use_ob_edge': True, 'ob_edge_bps': 5.00, 'ob_confirm_rate': 0.40,
    'ob_imbalance_bias': -0.09, 'ob_depth_scale': 1.00,
    'use_asset_selection': True, 'min_irp_alignment': 0.45, 'lookback': 100,
    'acb_beta_high': 0.80, 'acb_beta_low': 0.20, 'acb_w750_threshold_pct': 60,
}
|
||
|
|
|
||
|
|
# Raw per-bar eigenvalue features, in the column order used throughout the
# feature matrix. Index positions matter: VEL_IDX below refers to slots 1..4
# (the four lambda_max velocities) and slots 5/6 are the instability pair.
FEAT_COLS = ['vel_div', 'v50', 'v150', 'v300', 'v750', 'inst50', 'inst150']
N_FEAT = len(FEAT_COLS)  # 7
|
||
|
|
|
||
|
|
# ── Step 1: Load data + build global bar feature arrays ──────────────────────────
print("\nLoading parquet files...")
parquet_files = sorted(VBT_DIR.glob("*.parquet"))
# Exclude nautilus catalog artifacts that live in the same cache dir.
parquet_files = [p for p in parquet_files if 'catalog' not in str(p)]
print(f" {len(parquet_files)} files")

# pq_data: date-stem → (DataFrame, asset column names, rolling BTC volatility).
pq_data = {}
for pf in parquet_files:
    df = pd.read_parquet(pf)
    ac = [c for c in df.columns if c not in META_COLS]
    bp = df['BTCUSDT'].values if 'BTCUSDT' in df.columns else None
    # dv[i] = std of BTC 1-bar returns over the trailing 50-bar window
    # (NaN for the first 50 bars, or everywhere if BTCUSDT is absent).
    dv = np.full(len(df), np.nan)
    if bp is not None:
        for i in range(50, len(bp)):
            seg = bp[max(0, i-50):i]
            if len(seg) >= 10:
                dv[i] = float(np.std(np.diff(seg) / seg[:-1]))
    pq_data[pf.stem] = (df, ac, dv)
|
||
|
|
|
||
|
|
# vol_p60 from first 2 files (matches reference)
# 60th percentile of trailing-50-bar BTC return volatility over the first two
# days; used below as the threshold for the per-bar volatility regime gate.
all_vols = []
for pf in parquet_files[:2]:
    df, _, _ = pq_data[pf.stem]
    if 'BTCUSDT' not in df.columns:
        continue
    pr = df['BTCUSDT'].values
    # NOTE: loop starts at 60 (not 50) while the window is still 50 bars —
    # intentional, to match the reference script exactly.
    for i in range(60, len(pr)):
        seg = pr[max(0, i-50):i]
        if len(seg) >= 10:
            v = float(np.std(np.diff(seg) / seg[:-1]))
            if v > 0:
                all_vols.append(v)
# NOTE(review): raises if all_vols is empty (e.g. first two files lack
# BTCUSDT) — presumably acceptable for this fixed dataset; confirm.
vol_p60 = float(np.percentile(all_vols, 60))
|
||
|
|
|
||
|
|
print(f"\nBuilding global bar feature arrays...")
# gbar_features: list accumulator → converted to np.ndarray after full pass
# Layout: [vel_div, v50, v150, v300, v750, inst50, inst150]
# gbar_valid_mask: array[bool] indexed by global_bar_idx
# gbar_to_rownum: global_bar_idx → index into gbar_features (only valid bars)
# Every bar (including NaN-vd bars) increments the global counter, matching engine.

feat_accum = []             # only valid bars appended
gbar_valid_mask_list = []   # one bool per global bar
gbar_to_rownum = {}         # global_bar_idx → row index in feat_accum

g = 0  # running global bar index across all files, in sorted file order
for pf in parquet_files:
    df, _, _ = pq_data[pf.stem]
    vd_col = df['vel_div'].values
    v50_col = df['v50_lambda_max_velocity'].values
    v150_col = df['v150_lambda_max_velocity'].values
    v300_col = df['v300_lambda_max_velocity'].values
    v750_col = df['v750_lambda_max_velocity'].values
    i50_col = df['instability_50'].values
    i150_col = df['instability_150'].values

    for ri in range(len(df)):
        vd = vd_col[ri]
        # A bar is "valid" iff vel_div is finite; other features may still
        # be NaN individually and are stored as NaN in that case.
        valid = np.isfinite(vd)
        if valid:
            row_idx = len(feat_accum)
            feat_accum.append([
                float(vd),
                float(v50_col[ri]) if np.isfinite(v50_col[ri]) else np.nan,
                float(v150_col[ri]) if np.isfinite(v150_col[ri]) else np.nan,
                float(v300_col[ri]) if np.isfinite(v300_col[ri]) else np.nan,
                float(v750_col[ri]) if np.isfinite(v750_col[ri]) else np.nan,
                float(i50_col[ri]) if np.isfinite(i50_col[ri]) else np.nan,
                float(i150_col[ri]) if np.isfinite(i150_col[ri]) else np.nan,
            ])
            gbar_to_rownum[g] = row_idx
            gbar_valid_mask_list.append(True)
        else:
            gbar_valid_mask_list.append(False)
        g += 1

gbar_features = np.array(feat_accum, dtype=np.float64)  # (N_valid, 7)
gbar_valid_mask = np.array(gbar_valid_mask_list, dtype=bool)
N_total_gbars = g

print(f" Total global bars: {N_total_gbars}")
print(f" Valid bars (non-NaN vel_div): {len(feat_accum)} ({len(feat_accum)/N_total_gbars*100:.1f}%)")
|
||
|
|
|
||
|
|
# ── Step 2: Run champion engine ──────────────────────────────────────────────────
print("\nLoading MC-Forewarner...")
forewarner = DolphinForewarner(models_dir=MC_MODELS_DIR)
print(" OK")

# Adaptive circuit breaker, with w750 percentile data preloaded for every day.
acb = AdaptiveCircuitBreaker()
date_strings = [pf.stem for pf in parquet_files]
acb.preload_w750(date_strings)

# Mock order-book provider: per-asset imbalance biases (majors slightly
# negative, BNB/SOL positive) feed the OB feature engine in place of live data.
OB_ASSETS = ["BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT"]
_mock_ob = MockOBProvider(
    imbalance_bias=-0.09, depth_scale=1.0, assets=OB_ASSETS,
    imbalance_biases={"BTCUSDT": -0.086, "ETHUSDT": -0.092,
                      "BNBUSDT": +0.05, "SOLUSDT": +0.05},
)
ob_eng = OBFeatureEngine(_mock_ob)
ob_eng.preload_date("mock", OB_ASSETS)
|
||
|
|
|
||
|
|
print(f"\n=== Running champion engine (55 days) ===")
t0 = time.time()
engine = NDAlphaEngine(**ENGINE_KWARGS)
engine.set_ob_engine(ob_eng)
engine.set_acb(acb)
engine.set_mc_forewarner(forewarner, MC_BASE_CFG)
engine.set_esoteric_hazard_multiplier(0.0)  # disable esoteric hazard layer for this sweep

# Replay each day in order; the engine accumulates trade_history with
# global entry_bar indices matching the g-counter built in Step 1.
for pf in parquet_files:
    ds = pf.stem
    df, acols, dvol = pq_data[ds]
    # Volatility regime gate: bar passes when trailing vol exceeds vol_p60.
    vol_ok = np.where(np.isfinite(dvol), dvol > vol_p60, False)
    engine.process_day(ds, df, acols, vol_regime_ok=vol_ok)

tr = engine.trade_history
roi = (engine.capital - 25000) / 25000 * 100
w_count = sum(1 for t in tr if t.pnl_absolute > 0)
# NOTE(review): divides by len(tr) — assumes the run produced at least one trade.
print(f" {time.time()-t0:.1f}s | {len(tr)} trades | ROI={roi:+.2f}% | WR={w_count/len(tr)*100:.1f}%")
|
||
|
|
|
||
|
|
# ── Step 3: Extract entry-bar features ──────────────────────────────────────────
print(f"\nExtracting entry-bar features for {len(tr)} trades...")


def _get_row(gbar_idx):
    """Return the 7-element feature vector for a global bar index.

    Returns an all-NaN vector when the index is out of range on either side,
    or when the bar is not in the valid-bar table (NaN vel_div).

    Parameters
    ----------
    gbar_idx : int
        Global bar index as recorded by the engine (0-based across all days).
    """
    # Guard BOTH ends: a negative index would silently wrap around in numpy
    # (returning the wrong bar's features), and an index at/past the last
    # global bar would raise IndexError. The original only guarded `< 0`.
    if (gbar_idx < 0 or gbar_idx >= len(gbar_valid_mask)
            or not gbar_valid_mask[gbar_idx]):
        return np.full(N_FEAT, np.nan)
    ri = gbar_to_rownum.get(gbar_idx)
    if ri is None:
        # Defensive: mask says valid but no row was recorded (should not happen).
        return np.full(N_FEAT, np.nan)
    return gbar_features[ri]
|
||
|
|
|
||
|
|
# Feature name catalogue — the order here MUST match the row-filling offsets
# in the extraction loop below (raw 0:7, d1 7:14, d2 14:21, d3 21:28,
# diffs 28:34, ratios 34:40, interactions 40:44, history 44:49).
feat_names = []
feat_names += FEAT_COLS  # 7 raw
feat_names += [f"d1_{c}" for c in FEAT_COLS]  # 7 first-deriv
feat_names += [f"d2_{c}" for c in FEAT_COLS]  # 7 second-deriv
feat_names += [f"d3_{c}" for c in FEAT_COLS]  # 7 third-deriv

VEL_NAMES = ['v50', 'v150', 'v300', 'v750']
VEL_IDX = [1, 2, 3, 4]  # indices in FEAT_COLS
# All 6 unordered timeframe pairs (upper triangle of the 4x4 velocity grid).
pairs = [(i, j) for i in range(4) for j in range(i+1, 4)]
for pi, pj in pairs:
    feat_names.append(f"diff_{VEL_NAMES[pi]}_{VEL_NAMES[pj]}")  # 6 diffs
for pi, pj in pairs:
    feat_names.append(f"ratio_{VEL_NAMES[pi]}_{VEL_NAMES[pj]}")  # 6 ratios

feat_names += [
    'inter_inst_ratio',   # inst50/inst150
    'inter_inst_prod',    # inst50*inst150
    'inter_vd_inst50',    # vel_div*inst50
    'inter_vd_inst150',   # vel_div*inst150
]  # 4 instability interactions

feat_names += [
    'hist_vd_mean3',      # mean vel_div last 3 bars
    'hist_vd_std3',       # std vel_div last 3 bars
    'hist_vd_min5',       # min vel_div last 5 bars
    'hist_v50_mean3',     # mean v50 last 3 bars
    'hist_v750_mean3',    # mean v750 last 3 bars
]  # 5 historical context

# 28 + 6 + 6 + 4 + 5 = 49 features (the original comment said 52 — wrong).
N_FEATURES = len(feat_names)
print(f" Feature count: {N_FEATURES}")
|
||
|
|
|
||
|
|
trade_feat_rows = []  # list of np.ndarray, each length N_FEATURES
outcome_rows = []     # list of per-trade outcome dicts, aligned with trade_feat_rows

# NOTE(review): NAN_VEC appears unused below — kept for compatibility in case
# another script imports it; candidate for removal.
NAN_VEC = np.full(N_FEATURES, np.nan)


# Helpers hoisted out of the per-trade loop (the original re-defined all
# three closures on every iteration — pure overhead, identical behavior).
def _safe_mean(arr):
    """Mean of the finite entries of arr, or NaN if none are finite."""
    valid = arr[np.isfinite(arr)]
    return float(np.mean(valid)) if len(valid) > 0 else np.nan


def _safe_std(arr):
    """Std of the finite entries of arr; NaN unless at least 2 are finite."""
    valid = arr[np.isfinite(arr)]
    return float(np.std(valid)) if len(valid) > 1 else np.nan


def _safe_min(arr):
    """Min of the finite entries of arr, or NaN if none are finite."""
    valid = arr[np.isfinite(arr)]
    return float(np.min(valid)) if len(valid) > 0 else np.nan


for t in tr:
    eb = t.entry_bar
    # Raw feature vectors at the entry bar and the 3 bars before it.
    f0 = _get_row(eb)
    f1 = _get_row(eb - 1)
    f2 = _get_row(eb - 2)
    f3 = _get_row(eb - 3)

    # Derivatives need all 4 bars valid; also guard eb against the array end
    # (robustness: an out-of-range entry_bar would otherwise raise IndexError).
    invalid_lookback = (
        eb - 3 < 0
        or eb >= len(gbar_valid_mask)
        or not gbar_valid_mask[eb]
        or not gbar_valid_mask[eb - 1]
        or not gbar_valid_mask[eb - 2]
        or not gbar_valid_mask[eb - 3]
    )

    row = np.empty(N_FEATURES, dtype=np.float64)

    # --- Raw (7) ---
    row[:7] = f0

    if invalid_lookback:
        # Derivatives and derived features all NaN; raw features still set above.
        row[7:] = np.nan
    else:
        # --- 1st derivatives (7): backward differences over 3 lags ---
        d1_0 = f0 - f1
        d1_1 = f1 - f2
        d1_2 = f2 - f3
        row[7:14] = d1_0

        # --- 2nd derivatives (7) ---
        d2_0 = d1_0 - d1_1
        d2_1 = d1_1 - d1_2
        row[14:21] = d2_0

        # --- 3rd derivatives (7) ---
        d3_0 = d2_0 - d2_1
        row[21:28] = d3_0

        # --- Cross-TF velocity at entry bar: 6 diffs then 6 ratios ---
        vi = VEL_IDX  # [1,2,3,4]
        offset = 28
        for pi, pj in pairs:  # 6 diffs
            row[offset] = f0[vi[pi]] - f0[vi[pj]]
            offset += 1
        for pi, pj in pairs:  # 6 ratios (NaN on zero/non-finite denominator)
            denom = f0[vi[pj]]
            row[offset] = f0[vi[pi]] / denom if (np.isfinite(denom) and denom != 0.0) else np.nan
            offset += 1

        # --- Instability interactions (4) ---
        inst50 = f0[5]
        inst150 = f0[6]
        vd = f0[0]
        row[offset] = inst50 / inst150 if (np.isfinite(inst150) and inst150 != 0.0) else np.nan
        row[offset + 1] = inst50 * inst150
        row[offset + 2] = vd * inst50
        row[offset + 3] = vd * inst150
        offset += 4

        # --- Historical context (5): vel_div over last 5 bars, v50/v750 over 3 ---
        # Use last 5 global bars [eb-4..eb] for vd_min5; [eb-2..eb] for mean3/std3
        vd_last5 = np.array([_get_row(eb - k)[0] for k in range(4, -1, -1)])
        vd_last3 = vd_last5[2:]  # [eb-2, eb-1, eb]
        v50_last3 = np.array([_get_row(eb - k)[1] for k in range(2, -1, -1)])
        v750_last3 = np.array([_get_row(eb - k)[4] for k in range(2, -1, -1)])

        row[offset] = _safe_mean(vd_last3)
        row[offset + 1] = _safe_std(vd_last3)
        row[offset + 2] = _safe_min(vd_last5)
        row[offset + 3] = _safe_mean(v50_last3)
        row[offset + 4] = _safe_mean(v750_last3)

    trade_feat_rows.append(row)

    # Outcome labels for this trade (winner/TP/MAX_HOLD-split binaries).
    pnl_abs = t.pnl_absolute
    outcome_rows.append({
        'trade_id': t.trade_id,
        'asset': t.asset,
        'direction': t.direction,
        'entry_bar': t.entry_bar,
        'exit_bar': t.exit_bar,
        'bars_held': t.bars_held,
        'exit_reason': t.exit_reason,
        'leverage': t.leverage,
        'notional': t.notional,
        'pnl_pct': t.pnl_pct,
        'pnl_absolute': pnl_abs,
        'winner': int(pnl_abs > 0),
        'is_tp': int(t.exit_reason == 'FIXED_TP'),
        'is_maxhold_loss': int(t.exit_reason == 'MAX_HOLD' and pnl_abs <= 0),
        'is_maxhold_win': int(t.exit_reason == 'MAX_HOLD' and pnl_abs > 0),
        'invalid_lookback': int(invalid_lookback),
    })

feat_matrix = np.array(trade_feat_rows, dtype=np.float64)  # (N_trades, N_FEATURES)
print(f" Feature matrix: {feat_matrix.shape}")
print(f" Trades with invalid lookback: {sum(o['invalid_lookback'] for o in outcome_rows)}")
|
||
|
|
|
||
|
|
# ── Step 4: Clip extreme ratios at 99th percentile ──────────────────────────────
print("\nClipping ratio features at 99th percentile...")
# Ratio features occupy columns [34, 40): 28 raw+derivative columns + 6 diffs.
ratio_start = 28 + 6  # after diffs
ratio_end = ratio_start + 6
for col_i in range(ratio_start, ratio_end):
    col = feat_matrix[:, col_i]
    valid = col[np.isfinite(col)]
    if len(valid) > 10:
        # Winsorize at the 1st/99th percentile to tame blow-ups from
        # near-zero denominators; NaNs pass through np.clip unchanged.
        lo, hi = np.percentile(valid, 1), np.percentile(valid, 99)
        feat_matrix[:, col_i] = np.clip(col, lo, hi)
|
||
|
|
|
||
|
|
# ── Step 5: Statistical analysis ────────────────────────────────────────────────
print("Running statistical analysis...")
# scipy is optional: without it, Pearson r falls back to a numpy estimate and
# the KS / point-biserial columns stay NaN.
try:
    from scipy.stats import ks_2samp, pearsonr, pointbiserialr
    HAS_SCIPY = True
except ImportError:
    HAS_SCIPY = False
    print(" WARNING: scipy not available — KS + point-biserial skipped")
|
||
|
|
|
||
|
|
try:
|
||
|
|
from sklearn.metrics import roc_auc_score as _sklearn_auc
|
||
|
|
def roc_auc_score(y_true, y_score):
|
||
|
|
return _sklearn_auc(y_true, y_score)
|
||
|
|
HAS_SKLEARN = True
|
||
|
|
except ImportError:
|
||
|
|
HAS_SKLEARN = False
|
||
|
|
def roc_auc_score(y_true, y_score):
|
||
|
|
"""Manual ROC-AUC via rank sum."""
|
||
|
|
y_true = np.asarray(y_true, dtype=np.float64)
|
||
|
|
y_score = np.asarray(y_score, dtype=np.float64)
|
||
|
|
n1 = int(np.sum(y_true == 1))
|
||
|
|
n0 = len(y_true) - n1
|
||
|
|
if n1 == 0 or n0 == 0:
|
||
|
|
return np.nan
|
||
|
|
order = np.argsort(y_score)
|
||
|
|
ranks = np.empty(len(y_score), dtype=np.float64)
|
||
|
|
ranks[order] = np.arange(1, len(y_score) + 1)
|
||
|
|
auc = (np.sum(ranks[y_true == 1]) - n1 * (n1 + 1) / 2) / (n1 * n0)
|
||
|
|
return float(auc)
|
||
|
|
|
||
|
|
# Outcome target vectors, aligned row-for-row with feat_matrix.
pnl_pct_arr = np.array([o['pnl_pct'] for o in outcome_rows])
winner_arr = np.array([o['winner'] for o in outcome_rows])
is_tp_arr = np.array([o['is_tp'] for o in outcome_rows])
is_mhl_arr = np.array([o['is_maxhold_loss'] for o in outcome_rows])

analysis_rows = []  # one stats dict per feature, filled by the loop below
|
||
|
|
|
||
|
|
# Per-feature statistics vs trade outcomes. Each feature column is evaluated
# only on rows where both the feature and pnl_pct are finite.
for fi, fname in enumerate(feat_names):
    col = feat_matrix[:, fi]
    valid_mask = np.isfinite(col) & np.isfinite(pnl_pct_arr)
    n_valid = int(np.sum(valid_mask))

    if n_valid < 10:
        # Too few observations: emit an all-NaN row so the table stays rectangular.
        analysis_rows.append({
            'feature': fname, 'n_valid': n_valid,
            'pearson_r': np.nan, 'pearson_p': np.nan, 'pb_corr': np.nan,
            'ks_stat_winner': np.nan, 'ks_pval_winner': np.nan,
            'roc_auc_winner': np.nan, 'roc_auc_maxhold_loss': np.nan,
            'roc_auc_tp': np.nan, 'winner_mean': np.nan, 'loser_mean': np.nan,
        })
        continue

    x = col[valid_mask]
    y_pnl = pnl_pct_arr[valid_mask]
    y_win = winner_arr[valid_mask]
    y_tp = is_tp_arr[valid_mask]
    y_mhl = is_mhl_arr[valid_mask]

    # Pearson r with pnl_pct
    if HAS_SCIPY:
        pr, pp = pearsonr(x, y_pnl)
    else:
        # BUGFIX: the old fallback divided np.cov (ddof=1) by np.std (ddof=0),
        # inflating r by n/(n-1) and allowing |r| > 1. np.corrcoef uses a
        # consistent ddof for covariance and variance.
        pr = float(np.corrcoef(x, y_pnl)[0, 1])
        pp = np.nan

    # Point-biserial correlation (needs both classes present)
    pb = np.nan
    if HAS_SCIPY and len(np.unique(y_win)) == 2:
        pb, _ = pointbiserialr(y_win, x)

    # Kolmogorov-Smirnov stat: winner vs loser feature distributions
    ks_stat, ks_pval = np.nan, np.nan
    if HAS_SCIPY and len(np.unique(y_win)) == 2:
        w_vals = x[y_win == 1]
        l_vals = x[y_win == 0]
        if len(w_vals) >= 2 and len(l_vals) >= 2:
            ks_stat, ks_pval = ks_2samp(w_vals, l_vals)

    # ROC-AUC: winner — direction-agnostic, take max of both orientations
    roc_w = np.nan
    if len(np.unique(y_win)) == 2:
        try:
            a1 = roc_auc_score(y_win, x)
            roc_w = max(a1, 1.0 - a1)
        except Exception:
            pass

    # ROC-AUC: MAX_HOLD loss
    roc_mhl = np.nan
    if len(np.unique(y_mhl)) == 2:
        try:
            a1 = roc_auc_score(y_mhl, x)
            roc_mhl = max(a1, 1.0 - a1)
        except Exception:
            pass

    # ROC-AUC: TP
    roc_tp = np.nan
    if len(np.unique(y_tp)) == 2:
        try:
            a1 = roc_auc_score(y_tp, x)
            roc_tp = max(a1, 1.0 - a1)
        except Exception:
            pass

    # Winner / loser group means
    w_mean = float(np.mean(x[y_win == 1])) if np.any(y_win == 1) else np.nan
    l_mean = float(np.mean(x[y_win == 0])) if np.any(y_win == 0) else np.nan

    analysis_rows.append({
        'feature': fname, 'n_valid': n_valid,
        'pearson_r': float(pr), 'pearson_p': float(pp) if not np.isnan(pp) else np.nan,
        'pb_corr': float(pb) if not np.isnan(pb) else np.nan,
        'ks_stat_winner': float(ks_stat) if not np.isnan(ks_stat) else np.nan,
        'ks_pval_winner': float(ks_pval) if not np.isnan(ks_pval) else np.nan,
        'roc_auc_winner': float(roc_w) if not np.isnan(roc_w) else np.nan,
        'roc_auc_maxhold_loss': float(roc_mhl) if not np.isnan(roc_mhl) else np.nan,
        'roc_auc_tp': float(roc_tp) if not np.isnan(roc_tp) else np.nan,
        'winner_mean': float(w_mean) if not np.isnan(w_mean) else np.nan,
        'loser_mean': float(l_mean) if not np.isnan(l_mean) else np.nan,
    })
|
||
|
|
|
||
|
|
# ── Step 6: Print sorted tables ──────────────────────────────────────────────────
|
||
|
|
def _fmt(v, fmt=".4f"):
|
||
|
|
return f"{v:{fmt}}" if (v is not None and not np.isnan(v)) else " nan"
|
||
|
|
|
||
|
|
def _print_table(rows, sort_key, title, n=20):
    """Print the top-n analysis rows as a fixed-width table, ranked by |sort_key|.

    Rows are re-sorted here by absolute value of sort_key (NaN/None rank
    last, treated as 0.0), so any pre-sorting by the caller only matters
    for tie order.
    """
    def _sk(r):
        v = r[sort_key]
        return float(abs(v)) if (v is not None and not np.isnan(v)) else 0.0
    sorted_rows = sorted(rows, key=_sk, reverse=True)[:n]
    print(f"\n{'═'*95}")
    print(f" {title}")
    print(f"{'─'*95}")
    print(f" {'feature':<30} {'n_valid':>7} {'pearson_r':>9} {'pb_corr':>7} "
          f"{'ks_stat':>7} {'roc_win':>7} {'roc_mhl':>7} {'roc_tp':>7} {'win_mean':>9} {'los_mean':>9}")
    print(f"{'─'*95}")
    for r in sorted_rows:
        print(f" {r['feature']:<30} {r['n_valid']:>7d} {_fmt(r['pearson_r']):>9} "
              f"{_fmt(r['pb_corr']):>7} {_fmt(r['ks_stat_winner']):>7} "
              f"{_fmt(r['roc_auc_winner']):>7} {_fmt(r['roc_auc_maxhold_loss']):>7} "
              f"{_fmt(r['roc_auc_tp']):>7} {_fmt(r['winner_mean']):>9} {_fmt(r['loser_mean']):>9}")
|
||
|
|
|
||
|
|
# Sort guards: handle nan safely
|
||
|
|
def _key_roc_win(r):
|
||
|
|
v = r['roc_auc_winner']
|
||
|
|
return float(v) if (v is not None and not np.isnan(v)) else 0.0
|
||
|
|
|
||
|
|
def _key_roc_mhl(r):
|
||
|
|
v = r['roc_auc_maxhold_loss']
|
||
|
|
return float(v) if (v is not None and not np.isnan(v)) else 0.0
|
||
|
|
|
||
|
|
def _key_pearson(r):
|
||
|
|
v = r['pearson_r']
|
||
|
|
return abs(float(v)) if (v is not None and not np.isnan(v)) else 0.0
|
||
|
|
|
||
|
|
# Three ranked views of the same analysis table. (_print_table re-sorts
# internally by |column|; the outer sorted() only fixes tie order before
# the n-row cut.)
_print_table(
    sorted(analysis_rows, key=_key_roc_win, reverse=True),
    'roc_auc_winner',
    "TOP 20 BY ROC-AUC (winner discrimination)", n=20
)

_print_table(
    sorted(analysis_rows, key=_key_roc_mhl, reverse=True),
    'roc_auc_maxhold_loss',
    "TOP 20 BY ROC-AUC (MAX_HOLD loss discrimination)", n=20
)

_print_table(
    sorted(analysis_rows, key=_key_pearson, reverse=True),
    'pearson_r',
    "TOP 20 BY |Pearson r| (pnl_pct correlation)", n=20
)

# ── ASCII histograms for top 5 winner-discriminating features ───────────────────
top5_feats = [r['feature'] for r in sorted(analysis_rows, key=_key_roc_win, reverse=True)[:5]]
|
||
|
|
|
||
|
|
def ascii_hist_pair(feat_name, fi):
    """Print side-by-side ASCII histograms of one feature: winners vs losers.

    Reads module-level feat_matrix / winner_arr. Bins span the 1st–99th
    percentile of all valid values; each population is normalized by its own
    count, then both are scaled so the tallest bar fills WIDTH characters.
    Silently returns when either population has fewer than 5 valid values.

    Parameters
    ----------
    feat_name : str
        Display name of the feature.
    fi : int
        Column index into feat_matrix.
    """
    col = feat_matrix[:, fi]
    w_vals = col[(winner_arr == 1) & np.isfinite(col)]
    l_vals = col[(winner_arr == 0) & np.isfinite(col)]
    if len(w_vals) < 5 or len(l_vals) < 5:
        return
    all_valid = col[np.isfinite(col)]
    lo = float(np.percentile(all_valid, 1))
    hi = float(np.percentile(all_valid, 99))
    BINS = 15
    edges = np.linspace(lo, hi, BINS + 1)

    def _hist_bar(vals, edges):
        counts, _ = np.histogram(vals, bins=edges)
        return counts

    wc = _hist_bar(np.clip(w_vals, lo, hi), edges)
    lc = _hist_bar(np.clip(l_vals, lo, hi), edges)
    w_tot = max(1, len(w_vals))
    l_tot = max(1, len(l_vals))
    # BUGFIX: the old `bar_max = max(1, max(...))` clamp always evaluated to 1
    # (bin fractions are <= 1), so bars were drawn on an absolute scale and
    # rarely came near WIDTH. A tiny epsilon floor keeps the division safe
    # while letting the tallest bar span the full width as intended.
    bar_max = max(np.max(wc) / w_tot, np.max(lc) / l_tot, 1e-12)
    WIDTH = 20

    print(f"\n {'─'*60}")
    print(f" {feat_name} (winners n={w_tot} losers n={l_tot} range=[{lo:.4f}, {hi:.4f}])")
    print(f" {'─'*60}")
    print(f" {'bin':>22} {'WINNER':>{WIDTH}} {'LOSER':>{WIDTH}}")
    for i in range(BINS):
        lbl = f"[{edges[i]:+.4f},{edges[i+1]:+.4f})"
        wbar = '█' * int(wc[i] / w_tot / bar_max * WIDTH)
        lbar = '█' * int(lc[i] / l_tot / bar_max * WIDTH)
        print(f" {lbl:>22} {wbar:<{WIDTH}} {lbar:<{WIDTH}}")
|
||
|
|
|
||
|
|
print(f"\n{'═'*95}")
print(f" ASCII HISTOGRAMS — TOP 5 WINNER-DISCRIMINATING FEATURES")

# Render the winner/loser distribution pair for each top feature.
for fname in top5_feats:
    fi = feat_names.index(fname)
    ascii_hist_pair(fname, fi)
|
||
|
|
|
||
|
|
# ── Step 7: Save outputs ─────────────────────────────────────────────────────────
LOG_DIR = Path(__file__).parent / "run_logs"
LOG_DIR.mkdir(exist_ok=True)
ts = datetime.now().strftime("%Y%m%d_%H%M%S")

# Per-trade feature matrix + outcomes
feat_csv = LOG_DIR / f"entry_quality_features_{ts}.csv"
with open(feat_csv, 'w', newline='') as f:
    cw = csv.writer(f)
    outcome_keys = ['trade_id', 'asset', 'direction', 'entry_bar', 'exit_bar',
                    'bars_held', 'exit_reason', 'leverage', 'notional',
                    'pnl_pct', 'pnl_absolute', 'winner', 'is_tp',
                    'is_maxhold_loss', 'is_maxhold_win', 'invalid_lookback']
    cw.writerow(outcome_keys + feat_names)
    # outcome_rows[i] and feat_matrix[i] describe the same trade; NaN/inf
    # feature values are written as empty cells.
    for i, o in enumerate(outcome_rows):
        out_vals = [o[k] for k in outcome_keys]
        feat_vals = [f"{v:.8g}" if np.isfinite(v) else '' for v in feat_matrix[i]]
        cw.writerow(out_vals + feat_vals)
|
||
|
|
|
||
|
|
# Feature analysis table (one row per feature, ranked by winner ROC-AUC).
sweep_csv = LOG_DIR / f"entry_quality_sweep_{ts}.csv"


def _sweep_cell(v):
    """CSV cell for a metric: 6-decimal float, empty string for None/NaN."""
    return f"{v:.6f}" if (v is not None and not np.isnan(v)) else ''


with open(sweep_csv, 'w', newline='') as f:
    cw = csv.writer(f)
    cw.writerow(['feature', 'n_valid', 'pearson_r', 'pearson_p', 'pb_corr',
                 'ks_stat_winner', 'ks_pval_winner',
                 'roc_auc_winner', 'roc_auc_maxhold_loss', 'roc_auc_tp',
                 'winner_mean', 'loser_mean'])
    # Formatter hoisted to module level: the original re-defined `fmtv`
    # inside the loop on every row — identical output, less overhead.
    for r in sorted(analysis_rows, key=_key_roc_win, reverse=True):
        cw.writerow([r['feature'], r['n_valid'],
                     _sweep_cell(r['pearson_r']), _sweep_cell(r['pearson_p']), _sweep_cell(r['pb_corr']),
                     _sweep_cell(r['ks_stat_winner']), _sweep_cell(r['ks_pval_winner']),
                     _sweep_cell(r['roc_auc_winner']), _sweep_cell(r['roc_auc_maxhold_loss']),
                     _sweep_cell(r['roc_auc_tp']),
                     _sweep_cell(r['winner_mean']), _sweep_cell(r['loser_mean'])])
|
||
|
|
|
||
|
|
# Final summary: where the two output CSVs were written.
print(f"\n{'═'*95}")
print(f" per-trade features → {feat_csv} ({len(outcome_rows)} rows, {N_FEATURES} features)")
print(f" feature analysis → {sweep_csv} ({len(analysis_rows)} rows)")
print(f"{'═'*95}")
|