""" What did the encoder actually learn? Correlate z0/z1 latent dims with raw input features. """ import sys sys.stdout.reconfigure(encoding='utf-8', errors='replace') import numpy as np from pathlib import Path HERE = Path(__file__).parent sys.path.insert(0, str(HERE)) from corpus_builder import DolphinCorpus, WINDOWS from hierarchical_dvae import HierarchicalDVAE, T_OFF, TIER0_DIM, TIER1_DIM, TIER3_DIM # ── Feature names ───────────────────────────────────────────────────────── T0_NAMES = ['bull_pct', 'bear_pct', 'side_pct', 'sin_hour', 'cos_hour', 'sin_day', 'cos_day', 'has_eigen'] T1_NAMES = [] for w in WINDOWS: for feat in ['log_lmax', 'vel_norm', 'gap_ratio', 'instability', 'rtp']: T1_NAMES.append(f"w{w}_{feat}") EXF_FIELDS = [ 'dvol_btc', 'dvol_eth', 'fng', 'fng_prev', 'btc_dom', 'eth_dom', 'chg24_btc', 'chg24_eth', 'dispersion', 'correlation', 'imbal_btc', 'imbal_eth', 'funding_btc', 'funding_eth', 'mvrv', 'tvl', 'pcr_vol', 'pcr_oi', 'basis', 'liq_proxy', 'spread', 'vol24', 'hashrate', 'btc_price', 'fng_vol', ] # ── Load ─────────────────────────────────────────────────────────────────── print("Loading corpus...") corpus = DolphinCorpus.load(str(HERE / 'corpus_cache.npz')) idx = corpus.mask[:, 1] # 16K eigen samples X_e = corpus.X[idx] mask_e = corpus.mask[idx] print(f"Eigen subset: {len(X_e):,} samples") print("Loading model...") model = HierarchicalDVAE(hidden=128, beta=0.5, gamma=1.0, lam=1.0, seed=42) model.fit_normaliser(corpus.X, corpus.mask) # Load weights d = np.load(str(HERE / 'hdvae_checkpoint.npz'), allow_pickle=True) def load_enc(enc, name): for i, layer in enumerate(enc.mlp.layers): layer.W = d[f'{name}_mlp{i}_W']; layer.b = d[f'{name}_mlp{i}_b'] enc.mu_head.W = d[f'{name}_mu_W']; enc.mu_head.b = d[f'{name}_mu_b'] enc.lv_head.W = d[f'{name}_lv_W']; enc.lv_head.b = d[f'{name}_lv_b'] def load_dec(dec, name): for i, layer in enumerate(dec.mlp.layers): layer.W = d[f'{name}_mlp{i}_W']; layer.b = d[f'{name}_mlp{i}_b'] for n, e in [('enc0',model.enc0),('enc1',model.enc1),('enc2',model.enc2)]: load_enc(e, n) for n, dc in [('dec0',model.dec0),('dec1',model.dec1),('dec2',model.dec2)]: load_dec(dc, n) # ── Encode all 16K samples ───────────────────────────────────────────────── print("Encoding...") rng = np.random.RandomState(0) BATCH = 512 mu0_all, mu1_all = [], [] for start in range(0, len(X_e), BATCH): Xb = X_e[start:start+BATCH] mb = mask_e[start:start+BATCH] enc = model.encode(Xb, mb, rng) mu0_all.append(enc['mu0']) mu1_all.append(enc['mu1']) mu0 = np.concatenate(mu0_all) # (N, 4) mu1 = np.concatenate(mu1_all) # (N, 8) print(f"\nmu0 stats: mean={mu0.mean(0).round(4)} std={mu0.std(0).round(4)}") print(f"mu1 stats: mean={mu1.mean(0).round(4)} std={mu1.std(0).round(4)}") # ── Raw features ────────────────────────────────────────────────────────── t0_raw = X_e[:, T_OFF[0]:T_OFF[0]+TIER0_DIM] t1_raw = X_e[:, T_OFF[1]:T_OFF[1]+TIER1_DIM] t3_raw = X_e[:, T_OFF[3]:T_OFF[3]+TIER3_DIM] # ── Correlation: z1 dims vs T1 features ─────────────────────────────────── print("\n" + "="*70) print("z1 DIMS vs T1 FEATURES (top correlations per z1 dim)") print("="*70) for zd in range(8): corrs = [np.corrcoef(mu1[:,zd], t1_raw[:,fd])[0,1] for fd in range(TIER1_DIM)] corrs = np.array(corrs) top3 = np.argsort(np.abs(corrs))[-3:][::-1] var = mu1[:,zd].var() print(f"z1[{zd}] var={var:.4f}: " + " ".join(f"{T1_NAMES[i]}={corrs[i]:+.3f}" for i in top3)) # ── Correlation: z0 dims vs T0 features ─────────────────────────────────── print("\n" + "="*70) print("z0 DIMS vs T0 FEATURES (top correlations per z0 dim)") print("="*70) for zd in range(4): corrs = [np.corrcoef(mu0[:,zd], t0_raw[:,fd])[0,1] for fd in range(TIER0_DIM)] corrs = np.array(corrs) top3 = np.argsort(np.abs(corrs))[-3:][::-1] var = mu0[:,zd].var() print(f"z0[{zd}] var={var:.5f}: " + " ".join(f"{T0_NAMES[i]}={corrs[i]:+.3f}" for i in top3)) # ── What is z1 actually distinguishing? ─────────────────────────────────── print("\n" + "="*70) print("z1[4] (highest var=0.015): value distribution vs T1 raw ranges") print("="*70) z1_4 = mu1[:, 4] # dim with highest var (index 4 = z1[4]) # Actually find which z1 dim has highest variance best_z1 = np.argmax(mu1.var(0)) z1_best = mu1[:, best_z1] print(f"Best z1 dim: {best_z1}, var={mu1[:,best_z1].var():.4f}") # Split into top/bottom 20% by z1 value lo_mask = z1_best < np.percentile(z1_best, 20) hi_mask = z1_best > np.percentile(z1_best, 80) print(f"\nT1 feature means: LOW z1[{best_z1}] (bot20%) vs HIGH z1[{best_z1}] (top20%)") print(f"{'Feature':<20} {'LOW':>8} {'HIGH':>8} {'diff':>8}") for fd, name in enumerate(T1_NAMES): lo_val = t1_raw[lo_mask, fd].mean() hi_val = t1_raw[hi_mask, fd].mean() diff = hi_val - lo_val if abs(diff) > 0.02: print(f" {name:<18} {lo_val:8.4f} {hi_val:8.4f} {diff:+8.4f}") # ── ExF correlation check ───────────────────────────────────────────────── print("\n" + "="*70) print(f"z1[{best_z1}] vs ExF (T3) features") print("="*70) exf_corrs = [(np.corrcoef(z1_best, t3_raw[:,i])[0,1], EXF_FIELDS[i]) for i in range(min(25, t3_raw.shape[1]))] exf_corrs.sort(key=lambda x: abs(x[0]), reverse=True) for r, name in exf_corrs[:10]: print(f" {name:<20} r={r:+.4f}") # ── z0 clustering: what do the 7 clusters look like? ───────────────────── print("\n" + "="*70) print("z0 cluster analysis (k=7, what separates them?)") print("="*70) from scipy.cluster.vq import kmeans2 try: centroids, labels = kmeans2(mu0, 7, seed=42, minit='points') print(f"Cluster sizes: {np.bincount(labels)}") print(f"\nCluster centroids (z0 space):") for k in range(7): c = centroids[k] # What T0 features distinguish this cluster? mask_k = labels == k t0_k = t0_raw[mask_k].mean(0) t1_k = t1_raw[mask_k].mean(0) print(f"\n Cluster {k} (N={mask_k.sum():,}): z0={c.round(3)}") print(f" T0: bull={t0_k[0]:.3f} bear={t0_k[1]:.3f} side={t0_k[2]:.3f}") print(f" T1: log_lmax_w50={t1_k[0]:.3f} vel_norm_w50={t1_k[1]:+.3f} gap_ratio_w50={t1_k[2]:.3f} inst={t1_k[3]:.3f} rtp={t1_k[4]:.3f}") except Exception as e: print(f"Clustering failed: {e}") print("\nDone.")