# Source: DOLPHIN/nautilus_dolphin/dvae/resolution_alignment_check.py
"""
Resolution Alignment Check
===========================
GATE TEST: Before training Titan VAE on 1m klines, verify that T0-T4 feature
distributions at 1m resolution are sufficiently aligned with 5s production data.
If they diverge, a 1m-trained model cannot be used at 5s inference — the latent
dims would represent different physics (50-min vs 4.2-min eigenvalue dynamics).
Tests:
1. Per-dim KS test (distribution shape match)
2. Per-dim mean/std ratio (scale match)
3. PCA cosine similarity (covariance structure match)
4. T1 feature correlation matrix alignment (Frobenius distance)
Pass criteria (all must hold):
- KS p > 0.05 for >= 70% of T1+T2 dims (distributions compatible)
- mean ratio within [0.1, 10x] and std ratio within [0.1, 10x] for >= 70% of dims
- PCA cosine(PC1, PC2) >= 0.80 (shared dominant variance axes)
Output: resolution_alignment_report.json
"""
import sys
sys.stdout.reconfigure(encoding='utf-8', errors='replace')  # tolerate non-UTF-8 consoles (Windows)
from pathlib import Path
import json
import numpy as np
import pandas as pd
from scipy import stats

# Project root (one level above dvae/) and the two data sources being compared.
ROOT = Path(__file__).parent.parent
VBT5s = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache")         # 5s production parquet
VBT1m = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines")  # 1m kline parquet
OUT = Path(__file__).parent / "resolution_alignment_report.json"  # JSON report destination

sys.path.insert(0, str(ROOT))  # make the dvae package importable when run as a script
from dvae.titan_sensor import build_feature_vector

# Non-asset columns in the daily parquet files; every other column is treated as an asset.
META_COLS = {'timestamp','scan_number','v50_lambda_max_velocity','v150_lambda_max_velocity',
             'v300_lambda_max_velocity','v750_lambda_max_velocity','vel_div',
             'instability_50','instability_150'}

N_5S_DAYS = 10  # use first N days from vbt_cache (5s data)
N_1M_DAYS = 60  # use N days from klines (1m data); pick older period if available
def extract_features(parquet_dir, n_days, label, date_before=None):
    """Build a feature matrix from daily parquet files.

    Args:
        parquet_dir: Directory containing one parquet file per day.
        n_days: Maximum number of day files to use (taken from the start of
            the sorted, catalog-filtered file list).
        label: Tag used in progress prints.
        date_before: Optional date string; when given, only files whose stem
            sorts strictly before it are kept (for out-of-sample selection).

    Returns:
        Tuple ``(X, day_stems)`` where ``X`` is an (n_samples, n_features)
        float64 matrix with NaN/inf replaced by 0, or ``(None, [])`` when no
        files — or no usable rows — are available.
    """
    files = sorted(parquet_dir.glob("*.parquet"))
    files = [f for f in files if 'catalog' not in str(f)]
    if date_before:
        files = [f for f in files if f.stem < date_before]
    files = files[:n_days]
    if not files:
        print(f" [{label}] No files found in {parquet_dir}")
        return None, []
    print(f" [{label}] {len(files)} days ({files[0].stem} to {files[-1].stem})")
    rows = []
    for pf in files:
        df = pd.read_parquet(pf)
        assets = [c for c in df.columns if c not in META_COLS]
        # Skip the first 50 rows (feature warmup); stride=10 keeps runtime low.
        for ri in range(50, len(df), 10):
            rows.append(build_feature_vector(df, ri, assets))
    if not rows:
        # Every file was shorter than the 50-row warmup; an empty np.array
        # would be shape-(0,) and crash downstream column slicing.
        print(f" [{label}] No usable rows in {parquet_dir}")
        return None, []
    X = np.array(rows, dtype=np.float64)
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
    print(f" [{label}] Feature matrix: {X.shape}")
    return X, [f.stem for f in files]
def ks_test_per_dim(X5, X1, dims):
    """Two-sample KS test for each feature dim.

    Dims that are (near-)constant in either sample carry no distribution
    information and are recorded as trivially passing (p=1, stat=0).
    Returns a dict: dim -> {"p", "stat", "trivial"}.
    """
    out = {}
    for dim in dims:
        col5 = X5[:, dim]
        col1 = X1[:, dim]
        if min(np.std(col5), np.std(col1)) < 1e-10:
            out[dim] = {"p": 1.0, "stat": 0.0, "trivial": True}
        else:
            stat, p = stats.ks_2samp(col5, col1)
            out[dim] = {"p": float(p), "stat": float(stat), "trivial": False}
    return out
def scale_check(X5, X1, dims):
    """Compare per-dim mean/std between the two samples.

    Ratios are 5s over 1m; a (near-)zero 1m denominator yields inf.
    Returns a dict: dim -> stats dict with means, stds, and both ratios.
    """
    out = {}
    for dim in dims:
        m5 = float(np.mean(X5[:, dim]))
        s5 = float(np.std(X5[:, dim]))
        m1 = float(np.mean(X1[:, dim]))
        s1 = float(np.std(X1[:, dim]))
        if abs(m1) > 1e-10:
            mean_ratio = abs(m5) / (abs(m1) + 1e-10)
        else:
            mean_ratio = float('inf')
        if s1 > 1e-10:
            std_ratio = s5 / (s1 + 1e-10)
        else:
            std_ratio = float('inf')
        out[dim] = {
            "mean_5s": m5, "mean_1m": m1, "mean_ratio": mean_ratio,
            "std_5s": s5, "std_1m": s1, "std_ratio": std_ratio,
        }
    return out
def pca_cosine(X5, X1, n_components=4):
    """Absolute cosine similarity between matched top principal components.

    PCs are computed per matrix via SVD of the mean-centred data; the abs()
    makes the comparison sign-invariant (SVD component signs are arbitrary).
    Returns a list of n_components cosines in [0, 1].
    """
    def principal_axes(X, k):
        centered = X - X.mean(axis=0)
        _, _, Vt = np.linalg.svd(centered, full_matrices=False)
        return Vt[:k]  # (k, d) rows are principal directions

    axes5 = principal_axes(X5, n_components)
    axes1 = principal_axes(X1, n_components)
    sims = []
    for v5, v1 in zip(axes5, axes1):
        denom = np.linalg.norm(v5) * np.linalg.norm(v1) + 1e-12
        sims.append(abs(float(np.dot(v5, v1) / denom)))
    return sims
def corr_frobenius(X5, X1, dims):
    """Frobenius distance between the correlation matrices of selected dims.

    Returns (raw, normalised) where normalised divides by sqrt(2*len(dims)),
    the upper bound attained by orthogonal correlation matrices.
    """
    corr5 = np.nan_to_num(np.corrcoef(X5[:, dims].T))
    corr1 = np.nan_to_num(np.corrcoef(X1[:, dims].T))
    dist = float(np.linalg.norm(corr5 - corr1, 'fro'))
    bound = float(np.sqrt(2 * len(dims)))
    return dist, dist / bound
def main():
    """Run all four alignment tests, print a verdict, and write the JSON report.

    Returns early (after printing an error) when either data source yields no
    feature matrix. The PROCEED verdict is gated on the T1 dims plus the joint
    PCA test; T2 statistics are computed and reported but do not gate it.
    """
    print("=== Resolution Alignment Check ===")
    print(f"5s source: {VBT5s}")
    print(f"1m source: {VBT1m}")
    # Inventory the available 1m kline days (informational print only).
    klines_files = sorted(VBT1m.glob("*.parquet"))
    klines_files = [f for f in klines_files if 'catalog' not in str(f)]
    print(f" 1m klines available: {len(klines_files)} days ({klines_files[0].stem if klines_files else '?'} to {klines_files[-1].stem if klines_files else '?'})")
    print("\nBuilding feature matrices...")
    X5, days5 = extract_features(VBT5s, N_5S_DAYS, "5s-prod")
    X1, days1 = extract_features(VBT1m, N_1M_DAYS, "1m-klines")
    if X5 is None or X1 is None:
        print("ERROR: could not load data from one or both sources.")
        return
    # Feature layout (per build_feature_vector):
    #   T1 dims 8-27:  eigenvalue velocity features
    #   T2 dims 28-77: per-asset z-scores
    T1_DIMS = list(range(8, 28))
    T2_DIMS = list(range(28, 78))
    report = {
        "n_5s_samples": len(X5),
        "n_1m_samples": len(X1),
        "5s_days": days5,
        "1m_days": days1,
    }

    print("\n--- T1 eigenvalue velocity features (dims 8-27) ---")
    ks_t1 = ks_test_per_dim(X5, X1, T1_DIMS)
    sc_t1 = scale_check(X5, X1, T1_DIMS)
    frob_t1, frob_t1_norm = corr_frobenius(X5, X1, T1_DIMS)
    pval_pass_t1 = sum(1 for v in ks_t1.values() if v['p'] > 0.05 or v.get('trivial'))
    # An inf ratio already fails the range test, so no separate inf check is needed.
    scale_pass_t1 = sum(1 for v in sc_t1.values() if 0.1 <= v['std_ratio'] <= 10)
    print(f" KS p>0.05: {pval_pass_t1}/{len(T1_DIMS)} dims")
    print(f" std_ratio in [0.1,10]: {scale_pass_t1}/{len(T1_DIMS)} dims")
    print(f" Correlation matrix Frobenius dist: {frob_t1:.3f} (normalised: {frob_t1_norm:.3f})")
    print(" Per-dim std ratios (5s/1m):")
    for d in T1_DIMS:
        r = sc_t1[d]['std_ratio']
        flag = "OK" if 0.1 <= r <= 10 else "MISMATCH"
        print(f" dim{d:3d}: std_5s={sc_t1[d]['std_5s']:.4f} std_1m={sc_t1[d]['std_1m']:.4f} ratio={r:.2f} {flag}")

    print("\n--- T2 per-asset z-score features (dims 28-77) ---")
    ks_t2 = ks_test_per_dim(X5, X1, T2_DIMS)
    sc_t2 = scale_check(X5, X1, T2_DIMS)
    pval_pass_t2 = sum(1 for v in ks_t2.values() if v['p'] > 0.05 or v.get('trivial'))
    scale_pass_t2 = sum(1 for v in sc_t2.values() if 0.1 <= v['std_ratio'] <= 10)
    print(f" KS p>0.05: {pval_pass_t2}/{len(T2_DIMS)} dims")
    print(f" std_ratio in [0.1,10]: {scale_pass_t2}/{len(T2_DIMS)} dims")

    print("\n--- PCA cosine similarity (T1+T2 joint, top-4 components) ---")
    joint_dims = T1_DIMS + T2_DIMS
    pca_cos = pca_cosine(X5[:, joint_dims], X1[:, joint_dims], n_components=4)
    for i, c in enumerate(pca_cos):
        flag = "OK" if c >= 0.80 else "DIVERGED"
        print(f" PC{i+1}: cosine={c:.3f} {flag}")

    # Pass/fail verdict.
    # NOTE(review): the module docstring says the KS/scale criteria cover
    # T1+T2 dims, but only T1 gates the verdict here -- confirm which is intended.
    ks_ok = (pval_pass_t1 / len(T1_DIMS)) >= 0.70
    scale_ok = (scale_pass_t1 / len(T1_DIMS)) >= 0.70
    pca_ok = pca_cos[0] >= 0.80 and pca_cos[1] >= 0.80
    frob_ok = frob_t1_norm < 0.40
    verdict = {
        "ks_pass": bool(ks_ok),
        "scale_pass": bool(scale_ok),
        "pca_pass": bool(pca_ok),
        "frob_pass": bool(frob_ok),
        "PROCEED": bool(ks_ok and scale_ok and pca_ok and frob_ok),
    }
    print("\n=== VERDICT ===")
    print(f" KS distribution match: {'PASS' if ks_ok else 'FAIL'} ({pval_pass_t1}/{len(T1_DIMS)} dims pass)")
    print(f" Scale match: {'PASS' if scale_ok else 'FAIL'} ({scale_pass_t1}/{len(T1_DIMS)} dims pass)")
    print(f" PCA structure match: {'PASS' if pca_ok else 'FAIL'} (PC1={pca_cos[0]:.3f} PC2={pca_cos[1]:.3f})")
    print(f" Corr matrix alignment: {'PASS' if frob_ok else 'FAIL'} (normalised Frobenius={frob_t1_norm:.3f} < 0.40)")
    print(f"\n PROCEED with 1m training: {'YES' if verdict['PROCEED'] else 'NO'}")
    if not verdict['PROCEED']:
        print(" --> Feature distributions diverge across resolutions.")
        print(" --> A 1m-trained model would learn different physics than 5s production.")
        print(" --> Training on 1m would NOT yield useful representations at 5s inference.")
        print(" --> Alternative: use time-normalised features (fixed-time windows, not bar-count windows)")
    else:
        print(" --> Feature distributions are compatible across resolutions.")
        print(" --> 1m training on 2021-2024 data is a valid OOS approach.")
    report.update({
        "t1_ks": {str(k): v for k, v in ks_t1.items()},
        "t1_scale": {str(k): v for k, v in sc_t1.items()},
        "t2_ks": {str(k): v for k, v in ks_t2.items()},
        "t2_scale": {str(k): v for k, v in sc_t2.items()},
        "pca_cosines": pca_cos,
        "t1_frob_norm": frob_t1_norm,
        "verdict": verdict,
    })
    # NOTE(review): scale_check can emit float('inf'), which json.dump writes
    # as the non-standard token `Infinity`; strict JSON parsers will reject
    # the report. Consider sanitising non-finite values before dumping.
    with open(OUT, 'w') as f:
        json.dump(report, f, indent=2)
    print(f"\nReport: {OUT}")
# Script entry point: run the gate test when executed directly.
if __name__ == '__main__':
    main()