"""
|
||
|
|
What did the encoder actually learn?
|
||
|
|
Correlate z0/z1 latent dims with raw input features.
|
||
|
|
"""
|
||
|
|
import sys

# Replace undecodable characters so the Unicode box-drawing headers below
# print safely even on consoles without UTF-8 (e.g. default Windows cmd).
sys.stdout.reconfigure(encoding='utf-8', errors='replace')

import numpy as np
from pathlib import Path

HERE = Path(__file__).parent
# Make the sibling project modules importable when this file runs as a script.
sys.path.insert(0, str(HERE))

from corpus_builder import DolphinCorpus, WINDOWS
from hierarchical_dvae import HierarchicalDVAE, T_OFF, TIER0_DIM, TIER1_DIM, TIER3_DIM
# ── Feature names ─────────────────────────────────────────────────────────
T0_NAMES = ['bull_pct', 'bear_pct', 'side_pct', 'sin_hour', 'cos_hour', 'sin_day', 'cos_day', 'has_eigen']

# Five per-window statistics, repeated once per rolling window, in window order.
_T1_FEATS = ['log_lmax', 'vel_norm', 'gap_ratio', 'instability', 'rtp']
T1_NAMES = [f"w{w}_{feat}" for w in WINDOWS for feat in _T1_FEATS]

# External-feature (T3) column labels, in checkpoint column order.
EXF_FIELDS = [
    'dvol_btc', 'dvol_eth', 'fng', 'fng_prev', 'btc_dom', 'eth_dom',
    'chg24_btc', 'chg24_eth', 'dispersion', 'correlation', 'imbal_btc', 'imbal_eth',
    'funding_btc', 'funding_eth', 'mvrv', 'tvl', 'pcr_vol', 'pcr_oi',
    'basis', 'liq_proxy', 'spread', 'vol24', 'hashrate', 'btc_price', 'fng_vol',
]
# ── Load ───────────────────────────────────────────────────────────────────
print("Loading corpus...")
corpus = DolphinCorpus.load(str(HERE / 'corpus_cache.npz'))
# Mask column 1 flags rows that carry eigen features — restrict to those.
idx = corpus.mask[:, 1]  # 16K eigen samples
X_e = corpus.X[idx]
mask_e = corpus.mask[idx]
print(f"Eigen subset: {len(X_e):,} samples")

print("Loading model...")
# NOTE(review): these hyperparameters presumably match the training run that
# produced hdvae_checkpoint.npz — confirm against the training script.
model = HierarchicalDVAE(hidden=128, beta=0.5, gamma=1.0, lam=1.0, seed=42)
# Normaliser statistics are fit on the FULL corpus, not just the eigen subset.
model.fit_normaliser(corpus.X, corpus.mask)

# Load weights: flat npz mapping '<prefix>_<layer>_W/b' names to arrays
# (see load_enc/load_dec below for the key scheme).
d = np.load(str(HERE / 'hdvae_checkpoint.npz'), allow_pickle=True)
def load_enc(enc, name, weights=None):
    """Restore one encoder's parameters from a checkpoint mapping.

    Args:
        enc: encoder with ``.mlp.layers`` (each exposing ``W``/``b``) plus
            ``.mu_head`` and ``.lv_head`` linear heads.
        name: key prefix the checkpoint was saved under, e.g. ``'enc0'``.
        weights: mapping of array-name -> array.  Defaults to the module-level
            checkpoint ``d`` so existing two-argument call sites are unchanged;
            passing it explicitly removes the hidden-global dependency.
    """
    w = d if weights is None else weights
    for i, layer in enumerate(enc.mlp.layers):
        layer.W = w[f'{name}_mlp{i}_W']
        layer.b = w[f'{name}_mlp{i}_b']
    enc.mu_head.W = w[f'{name}_mu_W']
    enc.mu_head.b = w[f'{name}_mu_b']
    enc.lv_head.W = w[f'{name}_lv_W']
    enc.lv_head.b = w[f'{name}_lv_b']
def load_dec(dec, name, weights=None):
    """Restore one decoder MLP's parameters from a checkpoint mapping.

    Args:
        dec: decoder with ``.mlp.layers`` (each exposing ``W``/``b``).
        name: key prefix the checkpoint was saved under, e.g. ``'dec0'``.
        weights: mapping of array-name -> array.  Defaults to the module-level
            checkpoint ``d`` so existing two-argument call sites are unchanged.
    """
    w = d if weights is None else weights
    for i, layer in enumerate(dec.mlp.layers):
        layer.W = w[f'{name}_mlp{i}_W']
        layer.b = w[f'{name}_mlp{i}_b']
# Restore every encoder/decoder tier from the checkpoint arrays.
load_enc(model.enc0, 'enc0')
load_enc(model.enc1, 'enc1')
load_enc(model.enc2, 'enc2')
load_dec(model.dec0, 'dec0')
load_dec(model.dec1, 'dec1')
load_dec(model.dec2, 'dec2')
# ── Encode all 16K samples ─────────────────────────────────────────────────
print("Encoding...")
rng = np.random.RandomState(0)
BATCH = 512
chunks0, chunks1 = [], []
for lo in range(0, len(X_e), BATCH):
    out = model.encode(X_e[lo:lo + BATCH], mask_e[lo:lo + BATCH], rng)
    chunks0.append(out['mu0'])
    chunks1.append(out['mu1'])
mu0 = np.concatenate(chunks0)  # (N, 4) posterior means, tier-0 latent
mu1 = np.concatenate(chunks1)  # (N, 8) posterior means, tier-1 latent

print(f"\nmu0 stats: mean={mu0.mean(0).round(4)} std={mu0.std(0).round(4)}")
print(f"mu1 stats: mean={mu1.mean(0).round(4)} std={mu1.std(0).round(4)}")
# ── Raw features ──────────────────────────────────────────────────────────
# Slice the un-normalised input matrix into its tier segments using the
# offsets/widths exported by hierarchical_dvae.
t0_raw = X_e[:, T_OFF[0]:T_OFF[0]+TIER0_DIM]
t1_raw = X_e[:, T_OFF[1]:T_OFF[1]+TIER1_DIM]
t3_raw = X_e[:, T_OFF[3]:T_OFF[3]+TIER3_DIM]  # ExF / external (T3) features
# ── Correlation: z1 dims vs T1 features ───────────────────────────────────
print("\n" + "="*70)
print("z1 DIMS vs T1 FEATURES (top correlations per z1 dim)")
print("="*70)
for dim in range(8):
    z_col = mu1[:, dim]
    r = np.array([np.corrcoef(z_col, t1_raw[:, f])[0, 1] for f in range(TIER1_DIM)])
    # Three strongest correlations by |r|, strongest first.
    strongest = np.argsort(np.abs(r))[::-1][:3]
    summary = " ".join(f"{T1_NAMES[i]}={r[i]:+.3f}" for i in strongest)
    print(f"z1[{dim}] var={z_col.var():.4f}: " + summary)
# ── Correlation: z0 dims vs T0 features ───────────────────────────────────
print("\n" + "="*70)
print("z0 DIMS vs T0 FEATURES (top correlations per z0 dim)")
print("="*70)
for dim in range(4):
    z_col = mu0[:, dim]
    r = np.array([np.corrcoef(z_col, t0_raw[:, f])[0, 1] for f in range(TIER0_DIM)])
    # Three strongest correlations by |r|, strongest first.
    strongest = np.argsort(np.abs(r))[::-1][:3]
    summary = " ".join(f"{T0_NAMES[i]}={r[i]:+.3f}" for i in strongest)
    print(f"z0[{dim}] var={z_col.var():.5f}: " + summary)
# ── What is z1 actually distinguishing? ───────────────────────────────────
# Fix: the original hard-coded "z1[4] (highest var=0.015)" in the header and
# left a dead `z1_4 = mu1[:, 4]` local, then recomputed the true argmax two
# lines later.  Compute the highest-variance dim first and report only that.
best_z1 = np.argmax(mu1.var(0))
z1_best = mu1[:, best_z1]
print("\n" + "="*70)
print(f"z1[{best_z1}] (highest var={z1_best.var():.4f}): value distribution vs T1 raw ranges")
print("="*70)
print(f"Best z1 dim: {best_z1}, var={mu1[:,best_z1].var():.4f}")
# Split into top/bottom 20% by z1 value
lo_mask = z1_best < np.percentile(z1_best, 20)
hi_mask = z1_best > np.percentile(z1_best, 80)
print(f"\nT1 feature means: LOW z1[{best_z1}] (bot20%) vs HIGH z1[{best_z1}] (top20%)")
print(f"{'Feature':<20} {'LOW':>8} {'HIGH':>8} {'diff':>8}")
for col, label in enumerate(T1_NAMES):
    mean_lo = t1_raw[lo_mask, col].mean()
    mean_hi = t1_raw[hi_mask, col].mean()
    delta = mean_hi - mean_lo
    # Only report features that shift non-trivially between the two tails.
    if abs(delta) <= 0.02:
        continue
    print(f"  {label:<18} {mean_lo:8.4f} {mean_hi:8.4f} {delta:+8.4f}")
# ── ExF correlation check ─────────────────────────────────────────────────
print("\n" + "="*70)
print(f"z1[{best_z1}] vs ExF (T3) features")
print("="*70)
pairs = [
    (np.corrcoef(z1_best, t3_raw[:, col])[0, 1], EXF_FIELDS[col])
    for col in range(min(25, t3_raw.shape[1]))
]
# Ten strongest correlations by |r|, strongest first.
ranked = sorted(pairs, key=lambda rv: abs(rv[0]), reverse=True)
for r, field in ranked[:10]:
    print(f"  {field:<20} r={r:+.4f}")
# ── z0 clustering: what do the 7 clusters look like? ─────────────────────
print("\n" + "="*70)
print("z0 cluster analysis (k=7, what separates them?)")
print("="*70)
from scipy.cluster.vq import kmeans2
try:
    centroids, labels = kmeans2(mu0, 7, seed=42, minit='points')
    print(f"Cluster sizes: {np.bincount(labels)}")
    print(f"\nCluster centroids (z0 space):")
    for cid in range(7):
        center = centroids[cid]
        members = labels == cid
        # Mean raw feature values over this cluster's members.
        t0_mean = t0_raw[members].mean(0)
        t1_mean = t1_raw[members].mean(0)
        print(f"\n  Cluster {cid} (N={members.sum():,}): z0={center.round(3)}")
        print(f"    T0: bull={t0_mean[0]:.3f} bear={t0_mean[1]:.3f} side={t0_mean[2]:.3f}")
        print(f"    T1: log_lmax_w50={t1_mean[0]:.3f} vel_norm_w50={t1_mean[1]:+.3f} gap_ratio_w50={t1_mean[2]:.3f} inst={t1_mean[3]:.3f} rtp={t1_mean[4]:.3f}")
except Exception as e:
    print(f"Clustering failed: {e}")

print("\nDone.")