578 lines
25 KiB
Python
578 lines
25 KiB
Python
|
|
"""
|
|||
|
|
1m Klines vs 5s NG5 — Overlapping Period Comparison Study
|
|||
|
|
===========================================================
|
|||
|
|
Overlapping window: 2026-01-01 to 2026-03-05 (64 days)
|
|||
|
|
- 1m klines data: vbt_cache_klines/2026-*.parquet (1439 rows/day, 1-min bars)
|
|||
|
|
- 5s NG5 data: vbt_cache_ng5/2026-*.parquet (~6154 rows/day, 5s bars)
|
|||
|
|
|
|||
|
|
Analyses:
|
|||
|
|
1. Signal distribution comparison (vel_div, v50, v150, v750, instability)
|
|||
|
|
2. Cross-correlation and lead-lag structure (1m vel_div vs 5s vel_div)
|
|||
|
|
3. PCA on both signal spaces (shared vs unique variance)
|
|||
|
|
4. Backtest performance comparison (same engine, same dates, both data sources)
|
|||
|
|
5. Signal alignment quantification (how often do both timescales agree?)
|
|||
|
|
6. Statistical characteristics: skew, kurtosis, autocorrelation, stationarity tests
|
|||
|
|
|
|||
|
|
Run: python test_1m_vs_5s_comparison.py
|
|||
|
|
Output: run_logs/1m_vs_5s_comparison_TIMESTAMP.json
|
|||
|
|
"""
|
|||
|
|
import sys, time, json, warnings
|
|||
|
|
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
|
|||
|
|
warnings.filterwarnings('ignore')
|
|||
|
|
from pathlib import Path
|
|||
|
|
from datetime import datetime
|
|||
|
|
import numpy as np
|
|||
|
|
import pandas as pd
|
|||
|
|
from scipy import stats
|
|||
|
|
from scipy.stats import spearmanr, pearsonr, ks_2samp
|
|||
|
|
|
|||
|
|
# Project root and the two cached signal stores being compared.
HCM = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict')
KLINES_DIR = HCM / 'vbt_cache_klines'  # 1-minute kline parquets, one file per day
NG5_DIR = HCM / 'vbt_cache_ng5'        # 5-second NG5 parquets, one file per day
LOGS_DIR = HCM / 'nautilus_dolphin' / 'run_logs'
# NOTE(review): mkdir without parents=True assumes 'nautilus_dolphin' already exists.
LOGS_DIR.mkdir(exist_ok=True)

# Inclusive date window where both data sources overlap.
OVERLAP_START = '2026-01-01'
OVERLAP_END = '2026-03-05'

# Signal columns expected in both parquet schemas; sections skip any that are missing.
SIGNAL_COLS = ['vel_div', 'v50_lambda_max_velocity', 'v150_lambda_max_velocity',
               'v300_lambda_max_velocity', 'v750_lambda_max_velocity',
               'instability_50', 'instability_150']

t0 = time.time()  # wall-clock start; total runtime is reported at the end
run_ts = datetime.now().strftime('%Y%m%d_%H%M%S')  # timestamp embedded in output filenames

print("=" * 70)
print("1m vs 5s Signal Comparison Study")
print(f"Overlap window: {OVERLAP_START} to {OVERLAP_END}")
print("=" * 70)
|
|||
|
|
|
|||
|
|
# ── 1. Load data ────────────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
def load_overlap(data_dir: Path, label: str) -> pd.DataFrame:
    """Concatenate one parquet per day across [OVERLAP_START, OVERLAP_END].

    Days without a parquet are silently skipped. Each loaded frame is tagged
    with a 'date_str' column so downstream sections can group rows by day.
    Returns an empty DataFrame (after printing an error) when nothing loads.
    """
    from datetime import datetime as dt, timedelta

    start = dt.strptime(OVERLAP_START, '%Y-%m-%d')
    stop = dt.strptime(OVERLAP_END, '%Y-%m-%d')
    frames = []
    for offset in range((stop - start).days + 1):
        ds = (start + timedelta(days=offset)).strftime('%Y-%m-%d')
        path = data_dir / f'{ds}.parquet'
        if not path.exists():
            continue
        day_df = pd.read_parquet(path)
        day_df['date_str'] = ds  # tag rows with their source date
        frames.append(day_df)
    if not frames:
        print(f" ERROR: No parquets found for {label} in {data_dir}")
        return pd.DataFrame()
    full = pd.concat(frames, ignore_index=True)
    print(f" {label}: {len(frames)} dates, {len(full):,} rows, cols={list(full.columns[:8])}")
    return full
|
|||
|
|
|
|||
|
|
print("\n--- Loading data ---")
df_1m = load_overlap(KLINES_DIR, '1m klines')  # 1-minute kline signals
df_5s = load_overlap(NG5_DIR, '5s NG5')        # 5-second NG5 signals

# Every comparison below needs both sources; bail out early if either is missing.
if df_1m.empty or df_5s.empty:
    print("ABORT: Missing data for one or both timescales")
    sys.exit(1)
|
|||
|
|
|
|||
|
|
# ── 2. Signal distribution comparison ─────────────────────────────────────────
|
|||
|
|
|
|||
|
|
print("\n" + "=" * 70)
print("SECTION 1: Signal Distribution Comparison")
print("=" * 70)

# Accumulator for everything written to the output JSON at the end of the run.
results = {'run_ts': run_ts, 'overlap': f'{OVERLAP_START}/{OVERLAP_END}'}
dist_results = {}

for col in SIGNAL_COLS:
    # Only compare columns present in BOTH schemas.
    if col not in df_1m.columns or col not in df_5s.columns:
        continue
    s1 = df_1m[col].dropna()
    s5 = df_5s[col].dropna()

    # KS test: are distributions different?
    ks_stat, ks_p = ks_2samp(s1.values, s5.values)

    dist_results[col] = {
        '1m_mean': float(s1.mean()), '1m_std': float(s1.std()),
        '1m_p5': float(s1.quantile(0.05)), '1m_p50': float(s1.median()),
        '1m_p95': float(s1.quantile(0.95)),
        '1m_skew': float(s1.skew()), '1m_kurt': float(s1.kurtosis()),
        '5s_mean': float(s5.mean()), '5s_std': float(s5.std()),
        '5s_p5': float(s5.quantile(0.05)), '5s_p50': float(s5.median()),
        '5s_p95': float(s5.quantile(0.95)),
        '5s_skew': float(s5.skew()), '5s_kurt': float(s5.kurtosis()),
        'ks_stat': float(ks_stat), 'ks_p': float(ks_p),
        # Dispersion ratio between timescales; None guards a zero-variance 5s column.
        'scale_ratio_std': float(s1.std() / s5.std()) if s5.std() > 0 else None,
    }

    print(f"\n{col}:")
    print(f" 1m: mean={s1.mean():.4f} std={s1.std():.4f} p5={s1.quantile(0.05):.4f} "
          f"p50={s1.median():.4f} p95={s1.quantile(0.95):.4f} "
          f"skew={s1.skew():.3f} kurt={s1.kurtosis():.3f}")
    print(f" 5s: mean={s5.mean():.4f} std={s5.std():.4f} p5={s5.quantile(0.05):.4f} "
          f"p50={s5.median():.4f} p95={s5.quantile(0.95):.4f} "
          f"skew={s5.skew():.3f} kurt={s5.kurtosis():.3f}")
    # NOTE(review): this print divides by s5.std() unguarded — a zero-variance
    # 5s column would raise ZeroDivisionError here even though the dict above guards it.
    print(f" Scale ratio (1m_std/5s_std): {s1.std()/s5.std():.2f}x | "
          f"KS stat={ks_stat:.4f} p={ks_p:.4f} ({'DIFFERENT' if ks_p < 0.05 else 'similar'})")

results['distributions'] = dist_results
|
|||
|
|
|
|||
|
|
# ── 3. Cross-correlation and lead-lag ─────────────────────────────────────────
|
|||
|
|
|
|||
|
|
# Section 2 banner — cross-correlation / lead-lag analysis follows.
print("\n" + "=" * 70)
print("SECTION 2: Cross-Correlation and Lead-Lag (1m vs 5s vel_div)")
print("=" * 70)
print("Methodology: resample both to common 5-min bars, compute cross-corr at lags 0-12")
|||
|
|
# Resample both to 5-minute bars (common time grid)
|
|||
|
|
def resample_to_5min(df: pd.DataFrame, col: str = 'vel_div') -> pd.Series:
    """Project one signal column of *df* onto a 5-minute grid.

    Takes the last observation in each 5-minute bucket (the freshest reading)
    and drops empty buckets. Returns an empty float Series when *df* has no
    'timestamp' column.
    """
    if 'timestamp' in df.columns:
        indexed = pd.Series(
            df[col].values,
            index=pd.to_datetime(df['timestamp']),
        )
        return indexed.resample('5min').last().dropna()
    return pd.Series(dtype=float)
|
|||
|
|
|
|||
|
|
vd_1m_5min = resample_to_5min(df_1m, 'vel_div')  # 1m vel_div on the 5-min grid
vd_5s_5min = resample_to_5min(df_5s, 'vel_div')  # 5s vel_div on the 5-min grid

# Align on common index
common_idx = vd_1m_5min.index.intersection(vd_5s_5min.index)
a1 = vd_1m_5min.reindex(common_idx)
a5 = vd_5s_5min.reindex(common_idx)
# Drop bars where either side is NaN so the two arrays pair one-to-one.
a1_clean = a1.dropna()
a5_clean = a5.reindex(a1_clean.index).dropna()
common_clean = a1_clean.index.intersection(a5_clean.index)
x1 = a1_clean.reindex(common_clean).values  # aligned 1m values
x5 = a5_clean.reindex(common_clean).values  # aligned 5s values

print(f" Common 5-min bars: {len(common_clean)}")
|
|||
|
|
|
|||
|
|
crosscorr = {}
print(f"\n {'Lag':>6} {'Pearson r':>10} {'p-value':>10} {'Spearman r':>11}")
print(f" {'-'*6} {'-'*10} {'-'*10} {'-'*11}")
# Lag convention: at lag k > 0 we pair x1[t] with x5[t+k], so a strong
# correlation at a positive lag means the 1m signal anticipates (LEADS) the
# 5s signal; at k < 0 we pair x5[t] with x1[t+|k|], i.e. 5s leads.
# (BUGFIX: the report text previously had the two directions swapped —
# the slicing below is unchanged, only the labels were wrong.)
for lag in range(-6, 13):  # -6 to +12 lags (positive = 1m leads 5s)
    if lag < 0:
        a_1m = x1[-lag:]
        a_5s = x5[:lag]
    elif lag == 0:
        a_1m = x1
        a_5s = x5
    else:
        a_1m = x1[:-lag]
        a_5s = x5[lag:]
    # Trim to equal length; skip lags with too few paired observations.
    n = min(len(a_1m), len(a_5s))
    a_1m, a_5s = a_1m[:n], a_5s[:n]
    if n < 10:
        continue
    pr, pp = pearsonr(a_1m, a_5s)
    sr, sp = spearmanr(a_1m, a_5s)
    crosscorr[lag] = {'pearson_r': float(pr), 'pearson_p': float(pp),
                      'spearman_r': float(sr), 'spearman_p': float(sp), 'n': n}
    marker = ' <-- PEAK' if lag == 0 else (' <-- 1m LEADS' if lag > 0 and abs(pr) > 0.3 else '')
    print(f" lag={lag:+3d} r={pr:+.4f} p={pp:.4f} rho={sr:+.4f}{marker}")

results['crosscorr_5min'] = crosscorr

# Find the lag with the largest |Pearson r|.
best_lag = max(crosscorr.items(), key=lambda x: abs(x[1]['pearson_r']))
print(f"\n Best lag: {best_lag[0]:+d} (r={best_lag[1]['pearson_r']:+.4f})")
if best_lag[0] > 0:
    print(f" INTERPRETATION: 1m signal LEADS 5s by {best_lag[0]} × 5min = {best_lag[0]*5} minutes")
elif best_lag[0] < 0:
    print(f" INTERPRETATION: 5s signal LEADS 1m by {abs(best_lag[0])} × 5min = {abs(best_lag[0])*5} minutes")
else:
    print(f" INTERPRETATION: Signals are contemporaneous (no lead-lag at 5-min resolution)")
|
|||
|
|
|
|||
|
|
# ── 4. PCA on both signal spaces ───────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
print("\n" + "=" * 70)
print("SECTION 3: PCA — Shared vs Unique Variance")
print("=" * 70)

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

pca_results = {}

# Fit an independent PCA per timescale on whatever signal columns exist there.
for label, df in [('1m', df_1m), ('5s', df_5s)]:
    cols_avail = [c for c in SIGNAL_COLS if c in df.columns]
    X = df[cols_avail].dropna()
    if len(X) < 100:  # too few complete rows for a meaningful fit
        continue
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)  # z-score so no column dominates by raw scale
    pca = PCA()
    pca.fit(Xs)
    evr = pca.explained_variance_ratio_
    cumvar = np.cumsum(evr)
    # First index where cumulative variance reaches 90%, converted to a component count.
    n_for_90 = int(np.searchsorted(cumvar, 0.90)) + 1

    pca_results[label] = {
        'n_features': len(cols_avail),
        'n_samples': len(X),
        'explained_variance_ratio': evr.tolist(),
        'cumulative_variance': cumvar.tolist(),
        'n_components_90pct': n_for_90,
        'components_top3': pca.components_[:3].tolist(),
    }

    print(f"\n{label} PCA ({len(X):,} samples, {len(cols_avail)} features):")
    for i, (ev, cv) in enumerate(zip(evr[:6], cumvar[:6])):
        print(f" PC{i+1}: var={ev:.4f} ({ev*100:.1f}%) cumul={cv*100:.1f}%")
    print(f" Components for 90% variance: {n_for_90}")
    print(f" PC1 loadings: " + ", ".join(f"{c}={v:.3f}" for c, v in zip(cols_avail, pca.components_[0])))
    print(f" PC2 loadings: " + ", ".join(f"{c}={v:.3f}" for c, v in zip(cols_avail, pca.components_[1])))
|
|||
|
|
|
|||
|
|
# Joint PCA on 5-min resampled aligned data
print("\n--- Joint PCA on aligned 5-min data ---")
joint_cols = [c for c in SIGNAL_COLS if c in df_1m.columns and c in df_5s.columns]
joint_frames = []


def _day_to_5min(sub: pd.DataFrame, cols) -> dict:
    """Map each signal column of one day's frame onto a 5-min grid (last value per bucket)."""
    ts = pd.to_datetime(sub['timestamp'])
    return {c: pd.Series(sub[c].values, index=ts).resample('5min').last() for c in cols}


for ds in sorted(df_1m['date_str'].unique()):
    sub_1m = df_1m[df_1m['date_str'] == ds]
    sub_5s = df_5s[df_5s['date_str'] == ds]
    # Skip thin days (partial sessions) on either timescale.
    if len(sub_1m) < 50 or len(sub_5s) < 100:
        continue
    if not joint_cols or 'timestamp' not in sub_1m.columns:
        continue
    # Resample each column ONCE per day. (Cleanup: the previous version
    # resampled every column three separate times and built an unused `row`
    # dict; it also read stale loop variables when joint_cols was empty.)
    res_1m = _day_to_5min(sub_1m, joint_cols)
    res_5s = _day_to_5min(sub_5s, joint_cols)
    # All columns of a day share the same timestamp grid, so one intersection
    # stands for all of them.
    idx = res_1m[joint_cols[0]].index.intersection(res_5s[joint_cols[0]].index)
    if len(idx) < 10:
        continue
    # Build the aligned per-day feature frame: 1m_* and 5s_* columns side by side.
    try:
        date_df = pd.DataFrame({f'1m_{c}': res_1m[c].reindex(idx).values for c in joint_cols} |
                               {f'5s_{c}': res_5s[c].reindex(idx).values for c in joint_cols})
        joint_frames.append(date_df)
    except Exception:
        pass  # a malformed day must not abort the whole study
|
|||
|
|
|
|||
|
|
if joint_frames:
    joint_df = pd.concat(joint_frames, ignore_index=True).dropna()
    print(f" Joint aligned data: {len(joint_df):,} 5-min bars × {len(joint_df.columns)} features")
    if len(joint_df) > 50:  # require a minimum sample before fitting
        Xj = StandardScaler().fit_transform(joint_df)
        pca_j = PCA()
        pca_j.fit(Xj)
        evr_j = pca_j.explained_variance_ratio_
        cumvar_j = np.cumsum(evr_j)
        # How much variance is "shared" (first PC explains variance from both 1m and 5s features)?
        pc1_load = pca_j.components_[0]
        cols_j = list(joint_df.columns)
        # Mean |loading| of PC1 split by timescale prefix — balanced loadings
        # indicate a genuinely shared component.
        pc1_1m_load = [abs(pc1_load[i]) for i, c in enumerate(cols_j) if c.startswith('1m_')]
        pc1_5s_load = [abs(pc1_load[i]) for i, c in enumerate(cols_j) if c.startswith('5s_')]
        shared_signal = evr_j[0]  # PC1 = shared component

        pca_results['joint'] = {
            'n_samples': len(joint_df),
            'n_features': len(joint_df.columns),
            'explained_variance_ratio': evr_j.tolist(),
            'pc1_variance': float(evr_j[0]),
            'pc1_1m_mean_loading': float(np.mean(pc1_1m_load)),
            'pc1_5s_mean_loading': float(np.mean(pc1_5s_load)),
        }

        print(f"\n Joint PCA variance explained:")
        for i, (ev, cv) in enumerate(zip(evr_j[:6], cumvar_j[:6])):
            print(f" PC{i+1}: {ev*100:.1f}% (cumul {cv*100:.1f}%)")
        print(f"\n PC1 (shared) explains {evr_j[0]*100:.1f}% of joint variance")
        print(f" PC1 mean |loading| — 1m features: {np.mean(pc1_1m_load):.4f}")
        print(f" PC1 mean |loading| — 5s features: {np.mean(pc1_5s_load):.4f}")
        if np.mean(pc1_1m_load) > 0.1 and np.mean(pc1_5s_load) > 0.1:
            print(f" INTERPRETATION: PC1 loads strongly on BOTH timescales -> genuine shared variance")
        else:
            print(f" INTERPRETATION: PC1 loads unevenly -> signals are largely independent")

results['pca'] = pca_results
|
|||
|
|
|
|||
|
|
# ── 5. Signal alignment quantification ────────────────────────────────────────
|
|||
|
|
|
|||
|
|
print("\n" + "=" * 70)
print("SECTION 4: Signal Alignment Quantification")
print("=" * 70)
print("How often does 1m vel_div < -0.50 align with 5s vel_div < -0.02?")

# Need daily-level alignment
# Timescale-specific entry thresholds (same values quoted in the backtest section).
VD_1M_THRESH = -0.50
VD_5S_THRESH = -0.02

align_results = []
for ds in sorted(df_1m['date_str'].unique()):
    sub_1m = df_1m[df_1m['date_str'] == ds]
    sub_5s = df_5s[df_5s['date_str'] == ds]
    # Skip days that are too thin on either timescale.
    if len(sub_1m) < 100 or len(sub_5s) < 100:
        continue
    # 1m signal: fraction of bars signaling
    frac_1m = (sub_1m['vel_div'] < VD_1M_THRESH).mean()
    # 5s signal: fraction of bars signaling
    frac_5s = (sub_5s['vel_div'] < VD_5S_THRESH).mean()
    # min daily vel_div (peak signal strength)
    min_1m = sub_1m['vel_div'].min()
    min_5s = sub_5s['vel_div'].min()
    align_results.append({
        'date': ds, 'frac_1m': frac_1m, 'frac_5s': frac_5s,
        'min_1m': min_1m, 'min_5s': min_5s,
        # Day-level contingency flags: which timescale(s) fired at all.
        'both_signal': (frac_1m > 0) and (frac_5s > 0),
        'only_1m': (frac_1m > 0) and (frac_5s == 0),
        'only_5s': (frac_1m == 0) and (frac_5s > 0),
        'neither': (frac_1m == 0) and (frac_5s == 0),
    })

align_df = pd.DataFrame(align_results)
n = len(align_df)
both = align_df['both_signal'].sum()
only_1m = align_df['only_1m'].sum()
only_5s = align_df['only_5s'].sum()
neither = align_df['neither'].sum()

# NOTE(review): the percentages below divide by n unguarded; zero qualifying
# days would raise ZeroDivisionError here.
print(f" Analysis over {n} overlapping days:")
print(f" Both signal : {both}/{n} ({both/n*100:.1f}%) — MTF alignment days")
print(f" Only 1m signals: {only_1m}/{n} ({only_1m/n*100:.1f}%)")
print(f" Only 5s signals: {only_5s}/{n} ({only_5s/n*100:.1f}%)")
print(f" Neither signals: {neither}/{n} ({neither/n*100:.1f}%)")

# Correlation between daily signal fractions
corr_frac, p_frac = pearsonr(align_df['frac_1m'], align_df['frac_5s'])
corr_min, p_min = pearsonr(align_df['min_1m'], align_df['min_5s'])
print(f"\n Pearson corr (daily signal fraction 1m vs 5s): r={corr_frac:.4f} p={p_frac:.4f}")
print(f" Pearson corr (daily min vel_div 1m vs 5s): r={corr_min:.4f} p={p_min:.4f}")

results['alignment'] = {
    'n_days': n,
    'both_signal_pct': float(both/n*100),
    'only_1m_pct': float(only_1m/n*100),
    'only_5s_pct': float(only_5s/n*100),
    'neither_pct': float(neither/n*100),
    'corr_daily_frac': float(corr_frac), 'p_daily_frac': float(p_frac),
    'corr_daily_min': float(corr_min), 'p_daily_min': float(p_min),
}
|
|||
|
|
|
|||
|
|
# ── 6. Autocorrelation structure ───────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
# Section 5 banner — ACF and stationarity diagnostics follow.
print("\n" + "=" * 70)
print("SECTION 5: Autocorrelation and Stationarity")
print("=" * 70)
|
|||
|
|
|
|||
|
|
def acf_lags(series, max_lag=20):
    """Sample autocorrelation of *series* at lags 1..max_lag.

    Values are demeaned once; each lag's autocovariance is normalised by the
    overall variance. Lags at or beyond the sample length are omitted.
    Returns {lag: acf} with float values (0.0 whenever the variance is zero).
    """
    centered = series.dropna().values
    centered = centered - centered.mean()
    total = len(centered)
    variance = np.mean(centered ** 2)  # loop-invariant, computed once
    acf = {}
    for k in range(1, max_lag + 1):
        if total <= k:
            break
        if variance > 0:
            acf[k] = float(np.mean(centered[k:] * centered[:-k]) / variance)
        else:
            acf[k] = 0.0
    return acf
|
|||
|
|
|
|||
|
|
print(f"\nACF (vel_div, lags 1-10):")
acf_1m = acf_lags(df_1m['vel_div'], 10)
acf_5s = acf_lags(df_5s['vel_div'], 10)
print(f" {'Lag':>4} {'1m ACF':>8} {'5s ACF':>8}")
for lag in range(1, 11):
    # .get with default 0 covers lags dropped because the sample was too short.
    print(f" {lag:>4} {acf_1m.get(lag, 0):>+8.4f} {acf_5s.get(lag, 0):>+8.4f}")

# ADF stationarity test
try:
    from statsmodels.tsa.stattools import adfuller
    for label, ser in [('1m', df_1m['vel_div']), ('5s', df_5s['vel_div'])]:
        # First 5000 points keep the test fast; maxlag=5 bounds the AR order.
        adf_stat, adf_p, _, _, crit, _ = adfuller(ser.dropna().values[:5000], maxlag=5)
        print(f"\n ADF test {label} vel_div: stat={adf_stat:.4f} p={adf_p:.6f} "
              f"{'STATIONARY' if adf_p < 0.05 else 'NON-STATIONARY'}")
        results[f'adf_{label}'] = {'stat': float(adf_stat), 'p': float(adf_p)}
except ImportError:
    # statsmodels is optional; the rest of the report does not depend on it.
    print(" statsmodels not available — skipping ADF test")

results['acf_1m'] = acf_1m
results['acf_5s'] = acf_5s
|
|||
|
|
|
|||
|
|
# ── 7. Backtest performance comparison ────────────────────────────────────────
|
|||
|
|
|
|||
|
|
print("\n" + "=" * 70)
print("SECTION 6: Backtest Performance — 1m vs 5s on Overlapping 64 Days")
print("=" * 70)
print("NOTE: 1m system uses VD_THRESHOLD=-0.50, 5s uses -0.02. Engine identical otherwise.")
print("Running 1m system on 64 overlap days...")

# Make the project package importable for the engine and its helpers.
sys.path.insert(0, str(HCM / 'nautilus_dolphin'))
|
|||
|
|
|
|||
|
|
# The whole backtest section is best-effort: any import or runtime failure is
# recorded into the results instead of killing the statistical sections above.
try:
    from nautilus_dolphin.nautilus.esf_alpha_orchestrator import NDAlphaEngine
    from nautilus_dolphin.nautilus.adaptive_circuit_breaker import AdaptiveCircuitBreaker, ACBConfig
    from mc.mc_ml import DolphinForewarner

    MC_MODELS_DIR = str(HCM / 'nautilus_dolphin' / 'mc_results' / 'models')
    # Base config handed to the MC forewarner (mirrors engine_kwargs in the
    # backtest runner below).
    MC_BASE_CFG = {'trial_id': 0, 'vel_div_threshold': -0.02, 'vel_div_extreme': -0.05,
                   'use_direction_confirm': True, 'dc_lookback_bars': 7,
                   'dc_min_magnitude_bps': 0.75, 'dc_skip_contradicts': True,
                   'dc_leverage_boost': 1.0, 'dc_leverage_reduce': 0.5,
                   'vd_trend_lookback': 10, 'min_leverage': 0.5, 'max_leverage': 5.0,
                   'leverage_convexity': 3.0, 'fraction': 0.2, 'use_alpha_layers': True,
                   'use_dynamic_leverage': True, 'fixed_tp_pct': 0.0099, 'stop_pct': 1.0,
                   'max_hold_bars': 120, 'use_sp_fees': True, 'use_sp_slippage': True,
                   'sp_maker_entry_rate': 0.62, 'sp_maker_exit_rate': 0.5,
                   'use_ob_edge': True, 'ob_edge_bps': 5.0, 'ob_confirm_rate': 0.4,
                   'ob_imbalance_bias': -0.09, 'ob_depth_scale': 1.0,
                   'use_asset_selection': True, 'min_irp_alignment': 0.45,
                   'lookback': 100, 'acb_beta_high': 0.8, 'acb_beta_low': 0.2,
                   'acb_w750_threshold_pct': 60}

    # Non-asset columns — every other parquet column is treated as a tradable asset series.
    META_COLS = {'timestamp', 'scan_number', 'v50_lambda_max_velocity', 'v150_lambda_max_velocity',
                 'v300_lambda_max_velocity', 'v750_lambda_max_velocity', 'vel_div',
                 'instability_50', 'instability_150'}
|
|||
|
|
|
|||
|
|
def run_overlap_backtest(data_dir, vd_thresh, vd_extreme, label):
    """Run the NDAlphaEngine over the overlap window on one data source.

    data_dir:   directory of per-day parquets (1m klines or 5s NG5)
    vd_thresh:  vel_div entry threshold for this timescale
    vd_extreme: vel_div extreme threshold for this timescale
    label:      display name used in the printed report

    Returns a dict of summary metrics (ROI, PF, max drawdown, WR, exit mix,
    average leverage) after printing the same figures.
    """
    # Engine configuration is identical for both timescales except the
    # vel_div thresholds, so any performance gap is attributable to the data.
    engine_kwargs = dict(
        initial_capital=25000.0,
        vel_div_threshold=vd_thresh, vel_div_extreme=vd_extreme,
        min_leverage=0.5, max_leverage=5.0, leverage_convexity=3.0,
        fraction=0.20, fixed_tp_pct=0.0099, stop_pct=1.0, max_hold_bars=120,
        use_direction_confirm=True, dc_lookback_bars=7, dc_min_magnitude_bps=0.75,
        dc_skip_contradicts=True, dc_leverage_boost=1.0, dc_leverage_reduce=0.5,
        use_asset_selection=True, min_irp_alignment=0.45,
        use_sp_fees=True, use_sp_slippage=True,
        sp_maker_entry_rate=0.62, sp_maker_exit_rate=0.50,
        use_ob_edge=True, ob_edge_bps=5.0, ob_confirm_rate=0.40,
        lookback=100, use_alpha_layers=True, use_dynamic_leverage=True, seed=42,
    )

    # Collect dates inside the overlap window that actually have a parquet.
    from datetime import datetime as dt, timedelta
    d = dt.strptime(OVERLAP_START, '%Y-%m-%d')
    end = dt.strptime(OVERLAP_END, '%Y-%m-%d')
    date_strings = []
    while d <= end:
        pf = data_dir / f'{d.strftime("%Y-%m-%d")}.parquet'
        if pf.exists():
            date_strings.append(d.strftime('%Y-%m-%d'))
        d += timedelta(days=1)

    # Preload each day's frame, its asset columns, and the volatility proxy array.
    pq_data = {}
    for ds in date_strings:
        df = pd.read_parquet(data_dir / f'{ds}.parquet')
        asset_cols = [c for c in df.columns if c not in META_COLS]
        dvol_arr = df['v50_lambda_max_velocity'].fillna(0).values
        pq_data[ds] = (df, asset_cols, dvol_arr)

    acb = AdaptiveCircuitBreaker(ACBConfig(W750_THRESHOLD_PCT=60, BETA_HIGH=0.8, BETA_LOW=0.2))
    acb.preload_w750(date_strings)
    # Populate w750 from parquet
    for ds, (df, _, _) in pq_data.items():
        if 'v750_lambda_max_velocity' in df.columns:
            v750 = df['v750_lambda_max_velocity'].dropna()
            if len(v750) > 0:
                acb._w750_vel_cache[ds] = float(v750.median())
    # Recompute the ACB threshold as the configured percentile of the observed medians.
    w750_vals = [v for v in acb._w750_vel_cache.values() if v != 0.0]
    if w750_vals:
        acb._w750_threshold = float(np.percentile(w750_vals, acb.config.W750_THRESHOLD_PCT))

    # The ML forewarner is optional — run without it if the models fail to load.
    try:
        fw = DolphinForewarner(MC_MODELS_DIR)
    except Exception:
        fw = None

    engine = NDAlphaEngine(**engine_kwargs)
    engine.set_acb(acb)
    if fw:
        engine.set_mc_forewarner(fw, MC_BASE_CFG)

    daily = []
    for ds in date_strings:
        df, asset_cols, dvol_arr = pq_data[ds]
        if len(df) < 200:  # skip partial / thin sessions
            continue
        # Volatility regime gate: only bars above the day's 60th percentile qualify.
        vol_p60 = np.nanpercentile(dvol_arr, 60)
        vol_ok = np.where(np.isfinite(dvol_arr), dvol_arr > vol_p60, False)
        result = engine.process_day(ds, df, asset_cols, vol_regime_ok=vol_ok)
        daily.append(result)

    all_trades = [{'pnl': t.pnl_absolute, 'pnl_pct': t.pnl_pct * 100,
                   'bars_held': t.bars_held, 'exit_reason': t.exit_reason,
                   'leverage': t.leverage}
                  for t in engine.trade_history]

    cap_series = [engine_kwargs['initial_capital']] + [r['capital'] for r in daily]
    # Max drawdown measured against the RUNNING peak. (BUGFIX: previously this
    # was (global min - global max) / global max, which can pair a trough with
    # a peak that occurs AFTER it and therefore mis-state the drawdown.)
    peak = cap_series[0]
    dd = 0.0
    for cap in cap_series:
        peak = max(peak, cap)
        dd = min(dd, (cap - peak) / peak * 100)

    roi = (cap_series[-1] / cap_series[0] - 1) * 100
    wins = [t for t in all_trades if t['pnl'] > 0]
    losses = [t for t in all_trades if t['pnl'] < 0]
    wr = len(wins) / len(all_trades) * 100 if all_trades else 0
    pf = sum(t['pnl'] for t in wins) / abs(sum(t['pnl'] for t in losses)) if losses else float('inf')
    tp_exits = sum(1 for t in all_trades if t['exit_reason'] == 'FIXED_TP')
    mh_exits = sum(1 for t in all_trades if t['exit_reason'] == 'MAX_HOLD')

    print(f"\n {label} ({OVERLAP_START} to {OVERLAP_END}, {len(daily)} days):")
    print(f" ROI: {roi:+.2f}%")
    print(f" PF: {pf:.4f}")
    print(f" Max DD: {dd:.2f}%")
    print(f" WR: {wr:.2f}%")
    # Guard against an empty day list (the return dict below already does).
    print(f" Trades: {len(all_trades)} ({len(all_trades)/max(1,len(daily)):.2f}/day)")
    print(f" TP exits: {tp_exits} ({tp_exits/max(1,len(all_trades))*100:.1f}%)")
    print(f" MH exits: {mh_exits} ({mh_exits/max(1,len(all_trades))*100:.1f}%)")
    print(f" Avg lev: {np.mean([t['leverage'] for t in all_trades]):.3f}x" if all_trades else "")

    return {
        'label': label, 'n_days': len(daily), 'n_trades': len(all_trades),
        'roi_pct': float(roi), 'pf': float(pf), 'max_dd_pct': float(dd),
        'wr_pct': float(wr), 'trades_per_day': float(len(all_trades)/max(1,len(daily))),
        'tp_exits_pct': float(tp_exits/max(1,len(all_trades))*100),
        'mh_exits_pct': float(mh_exits/max(1,len(all_trades))*100),
        'avg_leverage': float(np.mean([t['leverage'] for t in all_trades])) if all_trades else 0,
    }
|
|||
|
|
|
|||
|
|
    # Run the identical engine on both data sources; only the vel_div
    # thresholds differ (timescale-appropriate values).
    res_1m = run_overlap_backtest(KLINES_DIR, -0.50, -1.25, '1m klines (64d overlap)')
    res_5s = run_overlap_backtest(NG5_DIR, -0.02, -0.05, '5s NG5 (64d overlap)')

    results['backtest_comparison'] = {'1m': res_1m, '5s': res_5s}

    print(f"\n COMPARISON TABLE:")
    print(f" {'Metric':<20} {'1m klines':>12} {'5s NG5':>12} {'Delta':>10}")
    print(f" {'-'*56}")
    # Side-by-side metric table; Delta is 1m minus 5s.
    for k, label in [('roi_pct','ROI %'), ('pf','PF'), ('max_dd_pct','Max DD %'),
                     ('wr_pct','WR %'), ('trades_per_day','Trades/day'),
                     ('tp_exits_pct','TP exits %'), ('avg_leverage','Avg leverage')]:
        v1 = res_1m.get(k, 0)
        v5 = res_5s.get(k, 0)
        delta = v1 - v5
        print(f" {label:<20} {v1:>12.3f} {v5:>12.3f} {delta:>+10.3f}")

except Exception as e:
    # Backtest failures are recorded in the JSON rather than aborting the study.
    import traceback
    print(f" Backtest comparison failed: {e}")
    traceback.print_exc()
    results['backtest_comparison'] = {'error': str(e)}
|
|||
|
|
|
|||
|
|
# ── 8. Save results ────────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
elapsed = time.time() - t0
results['runtime_s'] = float(elapsed)

# Persist the full results dict; default=str stringifies any non-JSON-native values.
out_json = LOGS_DIR / f'1m_vs_5s_comparison_{run_ts}.json'
with open(out_json, 'w') as f:
    json.dump(results, f, indent=2, default=str)

print("\n" + "=" * 70)
print(f"COMPLETE in {elapsed:.1f}s")
print(f"Results saved: {out_json}")
print("=" * 70)
|