"""
|
||
|
|
Resolution Alignment Check
|
||
|
|
===========================
|
||
|
|
GATE TEST: Before training Titan VAE on 1m klines, verify that T0-T4 feature
|
||
|
|
distributions at 1m resolution are sufficiently aligned with 5s production data.
|
||
|
|
|
||
|
|
If they diverge, a 1m-trained model cannot be used at 5s inference — the latent
|
||
|
|
dims would represent different physics (50-min vs 4.2-min eigenvalue dynamics).
|
||
|
|
|
||
|
|
Tests:
|
||
|
|
1. Per-dim KS test (distribution shape match)
|
||
|
|
2. Per-dim mean/std ratio (scale match)
|
||
|
|
3. PCA cosine similarity (covariance structure match)
|
||
|
|
4. T1 feature correlation matrix alignment (Frobenius distance)
|
||
|
|
|
||
|
|
Pass criteria (all must hold):
|
||
|
|
- KS p > 0.05 for >= 70% of T1+T2 dims (distributions compatible)
|
||
|
|
- mean ratio within [0.1, 10x] and std ratio within [0.1, 10x] for >= 70% of dims
|
||
|
|
- PCA cosine(PC1, PC2) >= 0.80 (shared dominant variance axes)
|
||
|
|
|
||
|
|
Output: resolution_alignment_report.json
|
||
|
|
"""
|
||
|
|
|
||
|
|
import sys
|
||
|
|
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
|
||
|
|
from pathlib import Path
|
||
|
|
import json
|
||
|
|
import numpy as np
|
||
|
|
import pandas as pd
|
||
|
|
from scipy import stats
|
||
|
|
|
||
|
|
ROOT = Path(__file__).parent.parent
|
||
|
|
VBT5s = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache")
|
||
|
|
VBT1m = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines")
|
||
|
|
OUT = Path(__file__).parent / "resolution_alignment_report.json"
|
||
|
|
|
||
|
|
sys.path.insert(0, str(ROOT))
|
||
|
|
from dvae.titan_sensor import build_feature_vector
|
||
|
|
|
||
|
|
META_COLS = {'timestamp','scan_number','v50_lambda_max_velocity','v150_lambda_max_velocity',
|
||
|
|
'v300_lambda_max_velocity','v750_lambda_max_velocity','vel_div',
|
||
|
|
'instability_50','instability_150'}
|
||
|
|
|
||
|
|
N_5S_DAYS = 10 # use first N days from vbt_cache (5s data)
|
||
|
|
N_1M_DAYS = 60 # use N days from klines (1m data); pick older period if available
|
||
|
|
|
||
|
|
|
||
|
|
def extract_features(parquet_dir, n_days, label, date_before=None, warmup=50, stride=10):
    """Build a feature matrix from daily parquet files.

    Args:
        parquet_dir: directory of per-day parquet files ("catalog" files skipped).
        n_days: max number of day files to use; files sort by stem, so the
            oldest days come first.
        label: tag used in progress prints.
        date_before: optional exclusive upper bound on the file stem
            (date-named files), to select an older out-of-sample period.
        warmup: first row index sampled within each day (earlier rows lack
            the lookback history build_feature_vector needs — TODO confirm).
        stride: row sampling stride within each day; keeps runtime low.

    Returns:
        (X, day_stems): X is an (n_samples, n_features) float64 matrix with
        NaN/inf replaced by 0.0, or (None, []) when no files matched.
    """
    files = sorted(parquet_dir.glob("*.parquet"))
    files = [f for f in files if 'catalog' not in str(f)]
    if date_before:
        files = [f for f in files if f.stem < date_before]
    files = files[:n_days]
    if not files:
        print(f" [{label}] No files found in {parquet_dir}")
        return None, []

    print(f" [{label}] {len(files)} days ({files[0].stem} to {files[-1].stem})")

    rows = []
    for pf in files:
        df = pd.read_parquet(pf)
        # Every non-metadata column is treated as one asset's series.
        assets = [c for c in df.columns if c not in META_COLS]
        for ri in range(warmup, len(df), stride):
            feat = build_feature_vector(df, ri, assets)
            rows.append(feat)

    X = np.array(rows, dtype=np.float64)
    # Features may contain NaN/inf (e.g. degenerate windows); zero them so
    # downstream statistics stay finite.
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
    print(f" [{label}] Feature matrix: {X.shape}")
    return X, [f.stem for f in files]
def ks_test_per_dim(X5, X1, dims):
    """Two-sample KS test for each selected feature dim.

    Returns {dim: {"p": float, "stat": float, "trivial": bool}}.  Dims that
    are (near-)constant in either sample get p=1.0 and trivial=True so they
    count as compatible.  (The original docstring claimed a pass *fraction*
    was returned; the caller derives that from this per-dim dict.)
    """
    results = {}
    for d in dims:
        a, b = X5[:, d], X1[:, d]
        # A constant column makes the KS statistic meaningless; mark trivial.
        if np.std(a) < 1e-10 or np.std(b) < 1e-10:
            results[d] = {"p": 1.0, "stat": 0.0, "trivial": True}
            continue
        stat, p = stats.ks_2samp(a, b)
        results[d] = {"p": float(p), "stat": float(stat), "trivial": False}
    return results
def scale_check(X5, X1, dims):
    """Compare per-dim location/scale between the two samples.

    Returns {dim: {mean_5s, mean_1m, mean_ratio, std_5s, std_1m, std_ratio}}.
    Ratios are 5s over 1m; a degenerate (zero-mean / zero-std) 1m dim yields
    an inf ratio.
    """
    out = {}
    for d in dims:
        col5 = X5[:, d]
        col1 = X1[:, d]
        m5 = float(np.mean(col5))
        s5 = float(np.std(col5))
        m1 = float(np.mean(col1))
        s1 = float(np.std(col1))

        if abs(m1) > 1e-10:
            mean_ratio = abs(m5) / (abs(m1) + 1e-10)
        else:
            mean_ratio = float('inf')

        if s1 > 1e-10:
            std_ratio = s5 / (s1 + 1e-10)
        else:
            std_ratio = float('inf')

        entry = {"mean_5s": m5, "mean_1m": m1, "mean_ratio": mean_ratio}
        entry.update({"std_5s": s5, "std_1m": s1, "std_ratio": std_ratio})
        out[d] = entry
    return out
def pca_cosine(X5, X1, n_components=4):
    """Absolute cosine similarity between matching top PCA axes of two samples.

    Each matrix is mean-centred and decomposed via SVD; the right singular
    vectors are the principal axes.  Returns a list of n_components |cos|
    values, one per component pair (sign-invariant by construction).
    """
    from numpy.linalg import svd

    def principal_axes(M, k):
        centred = M - M.mean(axis=0)
        _, _, Vt = svd(centred, full_matrices=False)
        return Vt[:k]  # (k, d)

    axes_a = principal_axes(X5, n_components)
    axes_b = principal_axes(X1, n_components)

    sims = []
    for i in range(n_components):
        u = axes_a[i]
        v = axes_b[i]
        denom = np.linalg.norm(u) * np.linalg.norm(v) + 1e-12
        sims.append(abs(float(np.dot(u, v) / denom)))
    return sims
def corr_frobenius(X5, X1, dims):
    """Frobenius distance between the correlation matrices of selected dims.

    Returns (raw, normalised): the raw Frobenius norm of the difference and
    the same value divided by sqrt(2 * len(dims)), the reference scale used
    for the pass threshold (so normalised lands in roughly [0, 1]).
    """
    C5 = np.nan_to_num(np.corrcoef(X5[:, dims].T))
    C1 = np.nan_to_num(np.corrcoef(X1[:, dims].T))
    raw = float(np.linalg.norm(C5 - C1, 'fro'))
    reference = float(np.sqrt(2 * len(dims)))
    return raw, raw / reference
def main():
    """Run the resolution-alignment gate test and write the JSON report.

    Builds feature matrices from the 5s and 1m parquet caches, compares T1/T2
    feature distributions (KS, scale ratios, PCA axes, correlation Frobenius),
    prints a PASS/FAIL verdict, and dumps everything to OUT.
    """
    print("=== Resolution Alignment Check ===")
    print(f"5s source: {VBT5s}")
    print(f"1m source: {VBT1m}")

    # Pick 1m days from OLDER period if possible (true OOS); otherwise use what's there
    klines_files = sorted(VBT1m.glob("*.parquet"))
    klines_files = [f for f in klines_files if 'catalog' not in str(f)]
    # sorted() puts the oldest stems first, and extract_features slices
    # files[:n_days], so the oldest N_1M_DAYS are used automatically.
    print(f" 1m klines available: {len(klines_files)} days ({klines_files[0].stem if klines_files else '?'} to {klines_files[-1].stem if klines_files else '?'})")

    print("\nBuilding feature matrices...")
    X5, days5 = extract_features(VBT5s, N_5S_DAYS, "5s-prod")
    X1, days1 = extract_features(VBT1m, N_1M_DAYS, "1m-klines")

    if X5 is None or X1 is None:
        print("ERROR: could not load data from one or both sources.")
        return

    # T1 dims: 8-27 (eigenvalue velocity features)
    # T2 dims: 28-77 (per-asset z-scores)
    T1_DIMS = list(range(8, 28))
    T2_DIMS = list(range(28, 78))

    report = {
        "n_5s_samples": len(X5),
        "n_1m_samples": len(X1),
        "5s_days": days5,
        "1m_days": days1,
    }

    print("\n--- T1 eigenvalue velocity features (dims 8-27) ---")
    ks_t1 = ks_test_per_dim(X5, X1, T1_DIMS)
    sc_t1 = scale_check(X5, X1, T1_DIMS)
    frob_t1, frob_t1_norm = corr_frobenius(X5, X1, T1_DIMS)

    # Trivial (constant) dims count as passing the KS criterion.
    pval_pass_t1 = sum(1 for v in ks_t1.values() if v['p'] > 0.05 or v.get('trivial'))
    scale_pass_t1 = sum(1 for v in sc_t1.values()
                        if 0.1 <= v['std_ratio'] <= 10 and v['std_ratio'] != float('inf'))
    print(f" KS p>0.05: {pval_pass_t1}/{len(T1_DIMS)} dims")
    print(f" std_ratio in [0.1,10]: {scale_pass_t1}/{len(T1_DIMS)} dims")
    print(f" Correlation matrix Frobenius dist: {frob_t1:.3f} (normalised: {frob_t1_norm:.3f})")
    print(" Per-dim std ratios (5s/1m):")
    for d in T1_DIMS:
        r = sc_t1[d]['std_ratio']
        flag = "OK" if 0.1 <= r <= 10 else "MISMATCH"
        print(f" dim{d:3d}: std_5s={sc_t1[d]['std_5s']:.4f} std_1m={sc_t1[d]['std_1m']:.4f} ratio={r:.2f} {flag}")

    print("\n--- T2 per-asset z-score features (dims 28-77) ---")
    ks_t2 = ks_test_per_dim(X5, X1, T2_DIMS)
    sc_t2 = scale_check(X5, X1, T2_DIMS)
    pval_pass_t2 = sum(1 for v in ks_t2.values() if v['p'] > 0.05 or v.get('trivial'))
    scale_pass_t2 = sum(1 for v in sc_t2.values()
                        if 0.1 <= v['std_ratio'] <= 10 and v['std_ratio'] != float('inf'))
    print(f" KS p>0.05: {pval_pass_t2}/{len(T2_DIMS)} dims")
    print(f" std_ratio in [0.1,10]: {scale_pass_t2}/{len(T2_DIMS)} dims")

    print("\n--- PCA cosine similarity (T1+T2 joint, top-4 components) ---")
    joint_dims = T1_DIMS + T2_DIMS
    pca_cos = pca_cosine(X5[:, joint_dims], X1[:, joint_dims], n_components=4)
    for i, c in enumerate(pca_cos):
        flag = "OK" if c >= 0.80 else "DIVERGED"
        print(f" PC{i+1}: cosine={c:.3f} {flag}")

    # Pass/fail verdict — note the T1 dims drive the KS/scale gates.
    ks_ok = (pval_pass_t1 / len(T1_DIMS)) >= 0.70
    scale_ok = (scale_pass_t1 / len(T1_DIMS)) >= 0.70
    pca_ok = pca_cos[0] >= 0.80 and pca_cos[1] >= 0.80
    frob_ok = frob_t1_norm < 0.40

    verdict = {
        "ks_pass": bool(ks_ok),
        "scale_pass": bool(scale_ok),
        "pca_pass": bool(pca_ok),
        "frob_pass": bool(frob_ok),
        "PROCEED": bool(ks_ok and scale_ok and pca_ok and frob_ok),
    }

    print("\n=== VERDICT ===")
    print(f" KS distribution match: {'PASS' if ks_ok else 'FAIL'} ({pval_pass_t1}/{len(T1_DIMS)} dims pass)")
    print(f" Scale match: {'PASS' if scale_ok else 'FAIL'} ({scale_pass_t1}/{len(T1_DIMS)} dims pass)")
    print(f" PCA structure match: {'PASS' if pca_ok else 'FAIL'} (PC1={pca_cos[0]:.3f} PC2={pca_cos[1]:.3f})")
    print(f" Corr matrix alignment: {'PASS' if frob_ok else 'FAIL'} (normalised Frobenius={frob_t1_norm:.3f} < 0.40)")
    print(f"\n PROCEED with 1m training: {'YES' if verdict['PROCEED'] else 'NO'}")
    if not verdict['PROCEED']:
        print(" --> Feature distributions diverge across resolutions.")
        print(" --> A 1m-trained model would learn different physics than 5s production.")
        print(" --> Training on 1m would NOT yield useful representations at 5s inference.")
        print(" --> Alternative: use time-normalised features (fixed-time windows, not bar-count windows)")
    else:
        print(" --> Feature distributions are compatible across resolutions.")
        print(" --> 1m training on 2021-2024 data is a valid OOS approach.")

    # JSON keys must be strings, so the int dim keys are stringified.
    report.update({
        "t1_ks": {str(k): v for k, v in ks_t1.items()},
        "t1_scale": {str(k): v for k, v in sc_t1.items()},
        "t2_ks": {str(k): v for k, v in ks_t2.items()},
        "t2_scale": {str(k): v for k, v in sc_t2.items()},
        "pca_cosines": pca_cos,
        "t1_frob_norm": frob_t1_norm,
        "verdict": verdict,
    })

    with open(OUT, 'w') as f:
        json.dump(report, f, indent=2)
    print(f"\nReport: {OUT}")
# Script entry point: run the alignment gate test when executed directly.
if __name__ == '__main__':
    main()