""" Resolution Alignment Check =========================== GATE TEST: Before training Titan VAE on 1m klines, verify that T0-T4 feature distributions at 1m resolution are sufficiently aligned with 5s production data. If they diverge, a 1m-trained model cannot be used at 5s inference — the latent dims would represent different physics (50-min vs 4.2-min eigenvalue dynamics). Tests: 1. Per-dim KS test (distribution shape match) 2. Per-dim mean/std ratio (scale match) 3. PCA cosine similarity (covariance structure match) 4. T1 feature correlation matrix alignment (Frobenius distance) Pass criteria (all must hold): - KS p > 0.05 for >= 70% of T1+T2 dims (distributions compatible) - mean ratio within [0.1, 10x] and std ratio within [0.1, 10x] for >= 70% of dims - PCA cosine(PC1, PC2) >= 0.80 (shared dominant variance axes) Output: resolution_alignment_report.json """ import sys sys.stdout.reconfigure(encoding='utf-8', errors='replace') from pathlib import Path import json import numpy as np import pandas as pd from scipy import stats ROOT = Path(__file__).parent.parent VBT5s = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache") VBT1m = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines") OUT = Path(__file__).parent / "resolution_alignment_report.json" sys.path.insert(0, str(ROOT)) from dvae.titan_sensor import build_feature_vector META_COLS = {'timestamp','scan_number','v50_lambda_max_velocity','v150_lambda_max_velocity', 'v300_lambda_max_velocity','v750_lambda_max_velocity','vel_div', 'instability_50','instability_150'} N_5S_DAYS = 10 # use first N days from vbt_cache (5s data) N_1M_DAYS = 60 # use N days from klines (1m data); pick older period if available def extract_features(parquet_dir, n_days, label, date_before=None): """Build feature matrix from parquet files.""" files = sorted(parquet_dir.glob("*.parquet")) files = [f for f in files if 'catalog' not in str(f)] if date_before: files = [f for f in files if f.stem < date_before] files = files[:n_days] if not files: print(f" [{label}] No files found in {parquet_dir}") return None, [] print(f" [{label}] {len(files)} days ({files[0].stem} to {files[-1].stem})") rows = [] for pf in files: df = pd.read_parquet(pf) assets = [c for c in df.columns if c not in META_COLS] for ri in range(50, len(df), 10): # stride=10 to keep runtime low feat = build_feature_vector(df, ri, assets) rows.append(feat) X = np.array(rows, dtype=np.float64) X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) print(f" [{label}] Feature matrix: {X.shape}") return X, [f.stem for f in files] def ks_test_per_dim(X5, X1, dims): """KS test for each dim. Returns fraction with p>0.05.""" results = {} for d in dims: a, b = X5[:, d], X1[:, d] # skip if all-zero in either if np.std(a) < 1e-10 or np.std(b) < 1e-10: results[d] = {"p": 1.0, "stat": 0.0, "trivial": True} continue stat, p = stats.ks_2samp(a, b) results[d] = {"p": float(p), "stat": float(stat), "trivial": False} return results def scale_check(X5, X1, dims): """Check mean/std ratios.""" results = {} for d in dims: m5, s5 = float(np.mean(X5[:, d])), float(np.std(X5[:, d])) m1, s1 = float(np.mean(X1[:, d])), float(np.std(X1[:, d])) mean_ratio = abs(m5) / (abs(m1) + 1e-10) if abs(m1) > 1e-10 else float('inf') std_ratio = s5 / (s1 + 1e-10) if s1 > 1e-10 else float('inf') results[d] = { "mean_5s": m5, "mean_1m": m1, "mean_ratio": mean_ratio, "std_5s": s5, "std_1m": s1, "std_ratio": std_ratio, } return results def pca_cosine(X5, X1, n_components=4): """PCA on both matrices, return cosine sim of top components.""" from numpy.linalg import svd def top_pcs(X, k): Xc = X - X.mean(axis=0) _, _, Vt = svd(Xc, full_matrices=False) return Vt[:k] # (k, d) pcs5 = top_pcs(X5, n_components) pcs1 = top_pcs(X1, n_components) cosines = [] for i in range(n_components): c = abs(float(np.dot(pcs5[i], pcs1[i]) / (np.linalg.norm(pcs5[i]) * np.linalg.norm(pcs1[i]) + 1e-12))) cosines.append(c) return cosines def corr_frobenius(X5, X1, dims): """Frobenius distance between T1 correlation matrices.""" A = np.corrcoef(X5[:, dims].T) B = np.corrcoef(X1[:, dims].T) A = np.nan_to_num(A); B = np.nan_to_num(B) frob = float(np.linalg.norm(A - B, 'fro')) max_frob = float(np.sqrt(2 * len(dims))) # upper bound (orthogonal corr matrices) return frob, frob / max_frob # raw and normalised [0,1] def main(): print("=== Resolution Alignment Check ===") print(f"5s source: {VBT5s}") print(f"1m source: {VBT1m}") # Pick 1m days from OLDER period if possible (true OOS); otherwise use what's there klines_files = sorted(VBT1m.glob("*.parquet")) klines_files = [f for f in klines_files if 'catalog' not in str(f)] # try to use oldest N_1M_DAYS date_cut = klines_files[0].stem if klines_files else None print(f" 1m klines available: {len(klines_files)} days ({klines_files[0].stem if klines_files else '?'} to {klines_files[-1].stem if klines_files else '?'})") print("\nBuilding feature matrices...") X5, days5 = extract_features(VBT5s, N_5S_DAYS, "5s-prod") X1, days1 = extract_features(VBT1m, N_1M_DAYS, "1m-klines") if X5 is None or X1 is None: print("ERROR: could not load data from one or both sources.") return # T1 dims: 8-27 (eigenvalue velocity features) # T2 dims: 28-77 (per-asset z-scores) T1_DIMS = list(range(8, 28)) T2_DIMS = list(range(28, 78)) report = { "n_5s_samples": len(X5), "n_1m_samples": len(X1), "5s_days": days5, "1m_days": days1, } print("\n--- T1 eigenvalue velocity features (dims 8-27) ---") ks_t1 = ks_test_per_dim(X5, X1, T1_DIMS) sc_t1 = scale_check(X5, X1, T1_DIMS) frob_t1, frob_t1_norm = corr_frobenius(X5, X1, T1_DIMS) pval_pass_t1 = sum(1 for v in ks_t1.values() if v['p'] > 0.05 or v.get('trivial')) scale_pass_t1 = sum(1 for v in sc_t1.values() if 0.1 <= v['std_ratio'] <= 10 and v['std_ratio'] != float('inf')) print(f" KS p>0.05: {pval_pass_t1}/{len(T1_DIMS)} dims") print(f" std_ratio in [0.1,10]: {scale_pass_t1}/{len(T1_DIMS)} dims") print(f" Correlation matrix Frobenius dist: {frob_t1:.3f} (normalised: {frob_t1_norm:.3f})") print(" Per-dim std ratios (5s/1m):") for d in T1_DIMS: r = sc_t1[d]['std_ratio'] flag = "OK" if 0.1 <= r <= 10 else "MISMATCH" print(f" dim{d:3d}: std_5s={sc_t1[d]['std_5s']:.4f} std_1m={sc_t1[d]['std_1m']:.4f} ratio={r:.2f} {flag}") print("\n--- T2 per-asset z-score features (dims 28-77) ---") ks_t2 = ks_test_per_dim(X5, X1, T2_DIMS) sc_t2 = scale_check(X5, X1, T2_DIMS) pval_pass_t2 = sum(1 for v in ks_t2.values() if v['p'] > 0.05 or v.get('trivial')) scale_pass_t2 = sum(1 for v in sc_t2.values() if 0.1 <= v['std_ratio'] <= 10 and v['std_ratio'] != float('inf')) print(f" KS p>0.05: {pval_pass_t2}/{len(T2_DIMS)} dims") print(f" std_ratio in [0.1,10]: {scale_pass_t2}/{len(T2_DIMS)} dims") print("\n--- PCA cosine similarity (T1+T2 joint, top-4 components) ---") joint_dims = T1_DIMS + T2_DIMS pca_cos = pca_cosine(X5[:, joint_dims], X1[:, joint_dims], n_components=4) for i, c in enumerate(pca_cos): flag = "OK" if c >= 0.80 else "DIVERGED" print(f" PC{i+1}: cosine={c:.3f} {flag}") # Pass/fail verdict ks_ok = (pval_pass_t1 / len(T1_DIMS)) >= 0.70 scale_ok = (scale_pass_t1 / len(T1_DIMS)) >= 0.70 pca_ok = pca_cos[0] >= 0.80 and pca_cos[1] >= 0.80 frob_ok = frob_t1_norm < 0.40 verdict = { "ks_pass": bool(ks_ok), "scale_pass": bool(scale_ok), "pca_pass": bool(pca_ok), "frob_pass": bool(frob_ok), "PROCEED": bool(ks_ok and scale_ok and pca_ok and frob_ok), } print("\n=== VERDICT ===") print(f" KS distribution match: {'PASS' if ks_ok else 'FAIL'} ({pval_pass_t1}/{len(T1_DIMS)} dims pass)") print(f" Scale match: {'PASS' if scale_ok else 'FAIL'} ({scale_pass_t1}/{len(T1_DIMS)} dims pass)") print(f" PCA structure match: {'PASS' if pca_ok else 'FAIL'} (PC1={pca_cos[0]:.3f} PC2={pca_cos[1]:.3f})") print(f" Corr matrix alignment: {'PASS' if frob_ok else 'FAIL'} (normalised Frobenius={frob_t1_norm:.3f} < 0.40)") print(f"\n PROCEED with 1m training: {'YES' if verdict['PROCEED'] else 'NO'}") if not verdict['PROCEED']: print(" --> Feature distributions diverge across resolutions.") print(" --> A 1m-trained model would learn different physics than 5s production.") print(" --> Training on 1m would NOT yield useful representations at 5s inference.") print(" --> Alternative: use time-normalised features (fixed-time windows, not bar-count windows)") else: print(" --> Feature distributions are compatible across resolutions.") print(" --> 1m training on 2021-2024 data is a valid OOS approach.") report.update({ "t1_ks": {str(k): v for k,v in ks_t1.items()}, "t1_scale": {str(k): v for k,v in sc_t1.items()}, "t2_ks": {str(k): v for k,v in ks_t2.items()}, "t2_scale": {str(k): v for k,v in sc_t2.items()}, "pca_cosines": pca_cos, "t1_frob_norm": frob_t1_norm, "verdict": verdict, }) with open(OUT, 'w') as f: json.dump(report, f, indent=2) print(f"\nReport: {OUT}") if __name__ == '__main__': main()