DOLPHIN/nautilus_dolphin/dvae/diagnose_latents.py

"""
What did the encoder actually learn?
Correlate z0/z1 latent dims with raw input features.
"""
import sys
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
import numpy as np
from pathlib import Path

HERE = Path(__file__).parent
sys.path.insert(0, str(HERE))

from corpus_builder import DolphinCorpus, WINDOWS
from hierarchical_dvae import HierarchicalDVAE, T_OFF, TIER0_DIM, TIER1_DIM, TIER3_DIM

# ── Feature names ─────────────────────────────────────────────────────────
T0_NAMES = ['bull_pct', 'bear_pct', 'side_pct', 'sin_hour', 'cos_hour', 'sin_day', 'cos_day', 'has_eigen']
T1_NAMES = []
for w in WINDOWS:
    for feat in ['log_lmax', 'vel_norm', 'gap_ratio', 'instability', 'rtp']:
        T1_NAMES.append(f"w{w}_{feat}")

EXF_FIELDS = [
    'dvol_btc', 'dvol_eth', 'fng', 'fng_prev', 'btc_dom', 'eth_dom',
    'chg24_btc', 'chg24_eth', 'dispersion', 'correlation', 'imbal_btc', 'imbal_eth',
    'funding_btc', 'funding_eth', 'mvrv', 'tvl', 'pcr_vol', 'pcr_oi',
    'basis', 'liq_proxy', 'spread', 'vol24', 'hashrate', 'btc_price', 'fng_vol',
]

# ── Load ───────────────────────────────────────────────────────────────────
print("Loading corpus...")
corpus = DolphinCorpus.load(str(HERE / 'corpus_cache.npz'))
idx = corpus.mask[:, 1]   # 16K eigen samples
X_e = corpus.X[idx]
mask_e = corpus.mask[idx]
print(f"Eigen subset: {len(X_e):,} samples")

print("Loading model...")
model = HierarchicalDVAE(hidden=128, beta=0.5, gamma=1.0, lam=1.0, seed=42)
model.fit_normaliser(corpus.X, corpus.mask)

# Load weights
d = np.load(str(HERE / 'hdvae_checkpoint.npz'), allow_pickle=True)
def load_enc(enc, name):
    for i, layer in enumerate(enc.mlp.layers):
        layer.W = d[f'{name}_mlp{i}_W']; layer.b = d[f'{name}_mlp{i}_b']
    enc.mu_head.W = d[f'{name}_mu_W']; enc.mu_head.b = d[f'{name}_mu_b']
    enc.lv_head.W = d[f'{name}_lv_W']; enc.lv_head.b = d[f'{name}_lv_b']
def load_dec(dec, name):
    for i, layer in enumerate(dec.mlp.layers):
        layer.W = d[f'{name}_mlp{i}_W']; layer.b = d[f'{name}_mlp{i}_b']
for n, e in [('enc0',model.enc0),('enc1',model.enc1),('enc2',model.enc2)]:
    load_enc(e, n)
for n, dc in [('dec0',model.dec0),('dec1',model.dec1),('dec2',model.dec2)]:
    load_dec(dc, n)

# ── Encode all 16K samples ─────────────────────────────────────────────────
print("Encoding...")
rng = np.random.RandomState(0)
BATCH = 512
mu0_all, mu1_all = [], []
for start in range(0, len(X_e), BATCH):
    Xb = X_e[start:start+BATCH]
    mb = mask_e[start:start+BATCH]
    enc = model.encode(Xb, mb, rng)
    mu0_all.append(enc['mu0'])
    mu1_all.append(enc['mu1'])
mu0 = np.concatenate(mu0_all)   # (N, 4)
mu1 = np.concatenate(mu1_all)   # (N, 8)

print(f"\nmu0 stats: mean={mu0.mean(0).round(4)}  std={mu0.std(0).round(4)}")
print(f"mu1 stats: mean={mu1.mean(0).round(4)}  std={mu1.std(0).round(4)}")

# ── Raw features ──────────────────────────────────────────────────────────
t0_raw = X_e[:, T_OFF[0]:T_OFF[0]+TIER0_DIM]
t1_raw = X_e[:, T_OFF[1]:T_OFF[1]+TIER1_DIM]
t3_raw = X_e[:, T_OFF[3]:T_OFF[3]+TIER3_DIM]

# ── Correlation: z1 dims vs T1 features ───────────────────────────────────
print("\n" + "="*70)
print("z1 DIMS vs T1 FEATURES (top correlations per z1 dim)")
print("="*70)
for zd in range(8):
    corrs = [np.corrcoef(mu1[:,zd], t1_raw[:,fd])[0,1] for fd in range(TIER1_DIM)]
    corrs = np.array(corrs)
    top3 = np.argsort(np.abs(corrs))[-3:][::-1]
    var = mu1[:,zd].var()
    print(f"z1[{zd}] var={var:.4f}: " + "  ".join(f"{T1_NAMES[i]}={corrs[i]:+.3f}" for i in top3))

# ── Correlation: z0 dims vs T0 features ───────────────────────────────────
print("\n" + "="*70)
print("z0 DIMS vs T0 FEATURES (top correlations per z0 dim)")
print("="*70)
for zd in range(4):
    corrs = [np.corrcoef(mu0[:,zd], t0_raw[:,fd])[0,1] for fd in range(TIER0_DIM)]
    corrs = np.array(corrs)
    top3 = np.argsort(np.abs(corrs))[-3:][::-1]
    var = mu0[:,zd].var()
    print(f"z0[{zd}] var={var:.5f}: " + "  ".join(f"{T0_NAMES[i]}={corrs[i]:+.3f}" for i in top3))

# ── What is z1 actually distinguishing? ───────────────────────────────────
print("\n" + "="*70)
print("z1[4] (highest var=0.015): value distribution vs T1 raw ranges")
print("="*70)
z1_4 = mu1[:, 4]   # dim with highest var (index 4 = z1[4])
# Actually find which z1 dim has highest variance
best_z1 = np.argmax(mu1.var(0))
z1_best = mu1[:, best_z1]
print(f"Best z1 dim: {best_z1}, var={mu1[:,best_z1].var():.4f}")

# Split into top/bottom 20% by z1 value
lo_mask = z1_best < np.percentile(z1_best, 20)
hi_mask = z1_best > np.percentile(z1_best, 80)
print(f"\nT1 feature means: LOW z1[{best_z1}] (bot20%) vs HIGH z1[{best_z1}] (top20%)")
print(f"{'Feature':<20} {'LOW':>8} {'HIGH':>8} {'diff':>8}")
for fd, name in enumerate(T1_NAMES):
    lo_val = t1_raw[lo_mask, fd].mean()
    hi_val = t1_raw[hi_mask, fd].mean()
    diff = hi_val - lo_val
    if abs(diff) > 0.02:
        print(f"  {name:<18} {lo_val:8.4f} {hi_val:8.4f} {diff:+8.4f}")

# ── ExF correlation check ─────────────────────────────────────────────────
print("\n" + "="*70)
print(f"z1[{best_z1}] vs ExF (T3) features")
print("="*70)
exf_corrs = [(np.corrcoef(z1_best, t3_raw[:,i])[0,1], EXF_FIELDS[i]) for i in range(min(25, t3_raw.shape[1]))]
exf_corrs.sort(key=lambda x: abs(x[0]), reverse=True)
for r, name in exf_corrs[:10]:
    print(f"  {name:<20} r={r:+.4f}")

# ── z0 clustering: what do the 7 clusters look like? ─────────────────────
print("\n" + "="*70)
print("z0 cluster analysis (k=7, what separates them?)")
print("="*70)
from scipy.cluster.vq import kmeans2
try:
    centroids, labels = kmeans2(mu0, 7, seed=42, minit='points')
    print(f"Cluster sizes: {np.bincount(labels)}")
    print(f"\nCluster centroids (z0 space):")
    for k in range(7):
        c = centroids[k]
        # What T0 features distinguish this cluster?
        mask_k = labels == k
        t0_k = t0_raw[mask_k].mean(0)
        t1_k = t1_raw[mask_k].mean(0)
        print(f"\n  Cluster {k} (N={mask_k.sum():,}): z0={c.round(3)}")
        print(f"    T0: bull={t0_k[0]:.3f} bear={t0_k[1]:.3f} side={t0_k[2]:.3f}")
        print(f"    T1: log_lmax_w50={t1_k[0]:.3f} vel_norm_w50={t1_k[1]:+.3f} gap_ratio_w50={t1_k[2]:.3f} inst={t1_k[3]:.3f} rtp={t1_k[4]:.3f}")
except Exception as e:
    print(f"Clustering failed: {e}")

print("\nDone.")
initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore. 2026-04-21 16:58:38 +02:00			`"""`
			`What did the encoder actually learn?`
			`Correlate z0/z1 latent dims with raw input features.`
			`"""`
			`import sys`
			`sys.stdout.reconfigure(encoding='utf-8', errors='replace')`
			`import numpy as np`
			`from pathlib import Path`

			`HERE = Path(__file__).parent`
			`sys.path.insert(0, str(HERE))`

			`from corpus_builder import DolphinCorpus, WINDOWS`
			`from hierarchical_dvae import HierarchicalDVAE, T_OFF, TIER0_DIM, TIER1_DIM, TIER3_DIM`

			`# ── Feature names ─────────────────────────────────────────────────────────`
			`T0_NAMES = ['bull_pct', 'bear_pct', 'side_pct', 'sin_hour', 'cos_hour', 'sin_day', 'cos_day', 'has_eigen']`
			`T1_NAMES = []`
			`for w in WINDOWS:`
			`for feat in ['log_lmax', 'vel_norm', 'gap_ratio', 'instability', 'rtp']:`
			`T1_NAMES.append(f"w{w}_{feat}")`

			`EXF_FIELDS = [`
			`'dvol_btc', 'dvol_eth', 'fng', 'fng_prev', 'btc_dom', 'eth_dom',`
			`'chg24_btc', 'chg24_eth', 'dispersion', 'correlation', 'imbal_btc', 'imbal_eth',`
			`'funding_btc', 'funding_eth', 'mvrv', 'tvl', 'pcr_vol', 'pcr_oi',`
			`'basis', 'liq_proxy', 'spread', 'vol24', 'hashrate', 'btc_price', 'fng_vol',`
			`]`

			`# ── Load ───────────────────────────────────────────────────────────────────`
			`print("Loading corpus...")`
			`corpus = DolphinCorpus.load(str(HERE / 'corpus_cache.npz'))`
			`idx = corpus.mask[:, 1] # 16K eigen samples`
			`X_e = corpus.X[idx]`
			`mask_e = corpus.mask[idx]`
			`print(f"Eigen subset: {len(X_e):,} samples")`

			`print("Loading model...")`
			`model = HierarchicalDVAE(hidden=128, beta=0.5, gamma=1.0, lam=1.0, seed=42)`
			`model.fit_normaliser(corpus.X, corpus.mask)`

			`# Load weights`
			`d = np.load(str(HERE / 'hdvae_checkpoint.npz'), allow_pickle=True)`
			`def load_enc(enc, name):`
			`for i, layer in enumerate(enc.mlp.layers):`
			`layer.W = d[f'{name}_mlp{i}_W']; layer.b = d[f'{name}_mlp{i}_b']`
			`enc.mu_head.W = d[f'{name}_mu_W']; enc.mu_head.b = d[f'{name}_mu_b']`
			`enc.lv_head.W = d[f'{name}_lv_W']; enc.lv_head.b = d[f'{name}_lv_b']`
			`def load_dec(dec, name):`
			`for i, layer in enumerate(dec.mlp.layers):`
			`layer.W = d[f'{name}_mlp{i}_W']; layer.b = d[f'{name}_mlp{i}_b']`
			`for n, e in [('enc0',model.enc0),('enc1',model.enc1),('enc2',model.enc2)]:`
			`load_enc(e, n)`
			`for n, dc in [('dec0',model.dec0),('dec1',model.dec1),('dec2',model.dec2)]:`
			`load_dec(dc, n)`

			`# ── Encode all 16K samples ─────────────────────────────────────────────────`
			`print("Encoding...")`
			`rng = np.random.RandomState(0)`
			`BATCH = 512`
			`mu0_all, mu1_all = [], []`
			`for start in range(0, len(X_e), BATCH):`
			`Xb = X_e[start:start+BATCH]`
			`mb = mask_e[start:start+BATCH]`
			`enc = model.encode(Xb, mb, rng)`
			`mu0_all.append(enc['mu0'])`
			`mu1_all.append(enc['mu1'])`
			`mu0 = np.concatenate(mu0_all) # (N, 4)`
			`mu1 = np.concatenate(mu1_all) # (N, 8)`

			`print(f"\nmu0 stats: mean={mu0.mean(0).round(4)} std={mu0.std(0).round(4)}")`
			`print(f"mu1 stats: mean={mu1.mean(0).round(4)} std={mu1.std(0).round(4)}")`

			`# ── Raw features ──────────────────────────────────────────────────────────`
			`t0_raw = X_e[:, T_OFF[0]:T_OFF[0]+TIER0_DIM]`
			`t1_raw = X_e[:, T_OFF[1]:T_OFF[1]+TIER1_DIM]`
			`t3_raw = X_e[:, T_OFF[3]:T_OFF[3]+TIER3_DIM]`

			`# ── Correlation: z1 dims vs T1 features ───────────────────────────────────`
			`print("\n" + "="*70)`
			`print("z1 DIMS vs T1 FEATURES (top correlations per z1 dim)")`
			`print("="*70)`
			`for zd in range(8):`
			`corrs = [np.corrcoef(mu1[:,zd], t1_raw[:,fd])[0,1] for fd in range(TIER1_DIM)]`
			`corrs = np.array(corrs)`
			`top3 = np.argsort(np.abs(corrs))[-3:][::-1]`
			`var = mu1[:,zd].var()`
			`print(f"z1[{zd}] var={var:.4f}: " + " ".join(f"{T1_NAMES[i]}={corrs[i]:+.3f}" for i in top3))`

			`# ── Correlation: z0 dims vs T0 features ───────────────────────────────────`
			`print("\n" + "="*70)`
			`print("z0 DIMS vs T0 FEATURES (top correlations per z0 dim)")`
			`print("="*70)`
			`for zd in range(4):`
			`corrs = [np.corrcoef(mu0[:,zd], t0_raw[:,fd])[0,1] for fd in range(TIER0_DIM)]`
			`corrs = np.array(corrs)`
			`top3 = np.argsort(np.abs(corrs))[-3:][::-1]`
			`var = mu0[:,zd].var()`
			`print(f"z0[{zd}] var={var:.5f}: " + " ".join(f"{T0_NAMES[i]}={corrs[i]:+.3f}" for i in top3))`

			`# ── What is z1 actually distinguishing? ───────────────────────────────────`
			`print("\n" + "="*70)`
			`print("z1[4] (highest var=0.015): value distribution vs T1 raw ranges")`
			`print("="*70)`
			`z1_4 = mu1[:, 4] # dim with highest var (index 4 = z1[4])`
			`# Actually find which z1 dim has highest variance`
			`best_z1 = np.argmax(mu1.var(0))`
			`z1_best = mu1[:, best_z1]`
			`print(f"Best z1 dim: {best_z1}, var={mu1[:,best_z1].var():.4f}")`

			`# Split into top/bottom 20% by z1 value`
			`lo_mask = z1_best < np.percentile(z1_best, 20)`
			`hi_mask = z1_best > np.percentile(z1_best, 80)`
			`print(f"\nT1 feature means: LOW z1[{best_z1}] (bot20%) vs HIGH z1[{best_z1}] (top20%)")`
			`print(f"{'Feature':<20} {'LOW':>8} {'HIGH':>8} {'diff':>8}")`
			`for fd, name in enumerate(T1_NAMES):`
			`lo_val = t1_raw[lo_mask, fd].mean()`
			`hi_val = t1_raw[hi_mask, fd].mean()`
			`diff = hi_val - lo_val`
			`if abs(diff) > 0.02:`
			`print(f" {name:<18} {lo_val:8.4f} {hi_val:8.4f} {diff:+8.4f}")`

			`# ── ExF correlation check ─────────────────────────────────────────────────`
			`print("\n" + "="*70)`
			`print(f"z1[{best_z1}] vs ExF (T3) features")`
			`print("="*70)`
			`exf_corrs = [(np.corrcoef(z1_best, t3_raw[:,i])[0,1], EXF_FIELDS[i]) for i in range(min(25, t3_raw.shape[1]))]`
			`exf_corrs.sort(key=lambda x: abs(x[0]), reverse=True)`
			`for r, name in exf_corrs[:10]:`
			`print(f" {name:<20} r={r:+.4f}")`

			`# ── z0 clustering: what do the 7 clusters look like? ─────────────────────`
			`print("\n" + "="*70)`
			`print("z0 cluster analysis (k=7, what separates them?)")`
			`print("="*70)`
			`from scipy.cluster.vq import kmeans2`
			`try:`
			`centroids, labels = kmeans2(mu0, 7, seed=42, minit='points')`
			`print(f"Cluster sizes: {np.bincount(labels)}")`
			`print(f"\nCluster centroids (z0 space):")`
			`for k in range(7):`
			`c = centroids[k]`
			`# What T0 features distinguish this cluster?`
			`mask_k = labels == k`
			`t0_k = t0_raw[mask_k].mean(0)`
			`t1_k = t1_raw[mask_k].mean(0)`
			`print(f"\n Cluster {k} (N={mask_k.sum():,}): z0={c.round(3)}")`
			`print(f" T0: bull={t0_k[0]:.3f} bear={t0_k[1]:.3f} side={t0_k[2]:.3f}")`
			`print(f" T1: log_lmax_w50={t1_k[0]:.3f} vel_norm_w50={t1_k[1]:+.3f} gap_ratio_w50={t1_k[2]:.3f} inst={t1_k[3]:.3f} rtp={t1_k[4]:.3f}")`
			`except Exception as e:`
			`print(f"Clustering failed: {e}")`

			`print("\nDone.")`