"""
What did the encoder actually learn?
Correlate z0/z1 latent dims with raw input features.
"""
import sys
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
import numpy as np
from pathlib import Path

HERE = Path(__file__).parent
sys.path.insert(0, str(HERE))

from corpus_builder import DolphinCorpus, WINDOWS
from hierarchical_dvae import HierarchicalDVAE, T_OFF, TIER0_DIM, TIER1_DIM, TIER3_DIM

# ── Feature names ─────────────────────────────────────────────────────────
# Labels for the tier-0 columns of X, in column order (used to annotate t0_raw).
T0_NAMES = [
    'bull_pct', 'bear_pct', 'side_pct',
    'sin_hour', 'cos_hour', 'sin_day', 'cos_day',
    'has_eigen',
]

# Labels for the tier-1 columns: one "w<window>_<stat>" entry per window in
# WINDOWS, window-major, five statistics per window.
T1_NAMES = [
    f"w{w}_{feat}"
    for w in WINDOWS
    for feat in ('log_lmax', 'vel_norm', 'gap_ratio', 'instability', 'rtp')
]

# Labels for the ExF (tier-3) columns.
# NOTE(review): presumably in the same order as the tier-3 slice of X — confirm
# against the corpus builder.
EXF_FIELDS = [
    'dvol_btc', 'dvol_eth', 'fng', 'fng_prev', 'btc_dom',
    'eth_dom', 'chg24_btc', 'chg24_eth', 'dispersion', 'correlation',
    'imbal_btc', 'imbal_eth', 'funding_btc', 'funding_eth', 'mvrv',
    'tvl', 'pcr_vol', 'pcr_oi', 'basis', 'liq_proxy',
    'spread', 'vol24', 'hashrate', 'btc_price', 'fng_vol',
]

# ── Load ───────────────────────────────────────────────────────────────────
print("Loading corpus...")
corpus_path = HERE / 'corpus_cache.npz'
corpus = DolphinCorpus.load(str(corpus_path))
# Column 1 of the corpus mask selects the "eigen" rows (about 16K samples
# according to the original note); the same selector filters X and the mask.
idx = corpus.mask[:, 1]
X_e, mask_e = corpus.X[idx], corpus.mask[idx]
print(f"Eigen subset: {len(X_e):,} samples")

print("Loading model...")
model_kwargs = dict(hidden=128, beta=0.5, gamma=1.0, lam=1.0, seed=42)
model = HierarchicalDVAE(**model_kwargs)
# The normaliser is fitted on the full corpus, not only on the eigen subset.
model.fit_normaliser(corpus.X, corpus.mask)

# Load weights
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which is
# unsafe on untrusted files. The loaders below only read W/b arrays by key, so
# allow_pickle=False may be sufficient — confirm the checkpoint holds no object
# arrays before tightening. The archive handle stays open for the script's
# lifetime because entries are read lazily by key.
checkpoint_path = HERE / 'hdvae_checkpoint.npz'
d = np.load(str(checkpoint_path), allow_pickle=True)
def load_enc(enc, name, weights=None):
    """Copy checkpoint parameters into an encoder in place.

    Assigns ``W`` and ``b`` on every layer of ``enc.mlp.layers`` and on the
    ``enc.mu_head`` / ``enc.lv_head`` heads.

    Args:
        enc: Encoder object exposing ``mlp.layers``, ``mu_head`` and ``lv_head``.
        name: Key prefix in the checkpoint (e.g. ``'enc0'``). Keys read are
            ``{name}_mlp{i}_W``/``_b``, ``{name}_mu_W``/``_b`` and
            ``{name}_lv_W``/``_b``.
        weights: Mapping from key to array. Defaults to the module-level
            checkpoint ``d`` so existing two-argument calls keep working.

    Raises:
        KeyError: If an expected key is missing from ``weights``.
    """
    if weights is None:
        weights = d  # fall back to the checkpoint opened at module level
    for i, layer in enumerate(enc.mlp.layers):
        layer.W = weights[f'{name}_mlp{i}_W']
        layer.b = weights[f'{name}_mlp{i}_b']
    for head, tag in ((enc.mu_head, 'mu'), (enc.lv_head, 'lv')):
        head.W = weights[f'{name}_{tag}_W']
        head.b = weights[f'{name}_{tag}_b']
def load_dec(dec, name, weights=None):
    """Copy checkpoint parameters into a decoder in place.

    Assigns ``W`` and ``b`` on every layer of ``dec.mlp.layers``.

    Args:
        dec: Decoder object exposing ``mlp.layers``.
        name: Key prefix in the checkpoint (e.g. ``'dec0'``). Keys read are
            ``{name}_mlp{i}_W`` and ``{name}_mlp{i}_b``.
        weights: Mapping from key to array. Defaults to the module-level
            checkpoint ``d`` so existing two-argument calls keep working.

    Raises:
        KeyError: If an expected key is missing from ``weights``.
    """
    if weights is None:
        weights = d  # fall back to the checkpoint opened at module level
    for i, layer in enumerate(dec.mlp.layers):
        layer.W = weights[f'{name}_mlp{i}_W']
        layer.b = weights[f'{name}_mlp{i}_b']
# Restore the three encoders first, then the three decoders; checkpoint key
# prefixes match the attribute names on the model (enc0..enc2, dec0..dec2).
for tier in range(3):
    load_enc(getattr(model, f'enc{tier}'), f'enc{tier}')
for tier in range(3):
    load_dec(getattr(model, f'dec{tier}'), f'dec{tier}')

# ── Encode all 16K samples ─────────────────────────────────────────────────
print("Encoding...")
rng = np.random.RandomState(0)
BATCH = 512
# Encode the eigen subset in consecutive BATCH-row chunks; the same rng is
# handed to every call, in order, exactly as a sequential loop would.
encoded = [
    model.encode(X_e[lo:lo + BATCH], mask_e[lo:lo + BATCH], rng)
    for lo in range(0, len(X_e), BATCH)
]
mu0_all = [out['mu0'] for out in encoded]
mu1_all = [out['mu1'] for out in encoded]
mu0 = np.concatenate(mu0_all)   # original note: (N, 4) — TODO confirm width
mu1 = np.concatenate(mu1_all)   # original note: (N, 8) — TODO confirm width

# Per-dimension summary of the posterior means.
mu0_mean, mu0_std = mu0.mean(0).round(4), mu0.std(0).round(4)
mu1_mean, mu1_std = mu1.mean(0).round(4), mu1.std(0).round(4)
print(f"\nmu0 stats: mean={mu0_mean}  std={mu0_std}")
print(f"mu1 stats: mean={mu1_mean}  std={mu1_std}")

# ── Raw features ──────────────────────────────────────────────────────────
# Column slices of the eigen subset for tiers 0, 1 and 3: each starts at
# T_OFF[tier] and spans that tier's declared width.
t0_raw, t1_raw, t3_raw = (
    X_e[:, T_OFF[tier]:T_OFF[tier] + width]
    for tier, width in ((0, TIER0_DIM), (1, TIER1_DIM), (3, TIER3_DIM))
)

# ── Correlation: latent dims vs raw features ──────────────────────────────
def _top_correlations(z, feats, k=3):
    """Return Pearson r of `z` against each column of `feats` and the top-k indices.

    Zero-variance inputs make np.corrcoef return NaN. np.argsort places NaN
    last, so a plain "take the last k" ranking would report NaN entries as the
    strongest correlations. Non-finite values are therefore excluded from the
    ranking, and the divide/invalid warnings they trigger are silenced.

    Returns:
        (corrs, top): `corrs` holds one coefficient per feature column (NaN
        where undefined); `top` holds up to `k` column indices ordered by
        descending |r|.
    """
    with np.errstate(invalid='ignore', divide='ignore'):
        corrs = np.array([np.corrcoef(z, feats[:, fd])[0, 1]
                          for fd in range(feats.shape[1])])
    finite = np.flatnonzero(np.isfinite(corrs))
    strongest_first = finite[np.argsort(np.abs(corrs[finite]))[::-1]]
    return corrs, strongest_first[:k]


def _print_top_correlations(label, mu, feats, names, var_fmt):
    """Print one line per latent dim of `mu`: its variance and top correlations.

    The number of latent dims is taken from `mu.shape[1]` rather than being
    hard-coded. `var_fmt` is the format spec used for the variance.
    """
    for zd in range(mu.shape[1]):
        z = mu[:, zd]
        corrs, top = _top_correlations(z, feats)
        shown = "  ".join(f"{names[i]}={corrs[i]:+.3f}" for i in top)
        print(f"{label}[{zd}] var={z.var():{var_fmt}}: "
              + (shown or "(no finite correlations)"))


print("\n" + "="*70)
print("z1 DIMS vs T1 FEATURES (top correlations per z1 dim)")
print("="*70)
_print_top_correlations("z1", mu1, t1_raw, T1_NAMES, ".4f")

print("\n" + "="*70)
print("z0 DIMS vs T0 FEATURES (top correlations per z0 dim)")
print("="*70)
_print_top_correlations("z0", mu0, t0_raw, T0_NAMES, ".5f")

# ── What is z1 actually distinguishing? ───────────────────────────────────
print("\n" + "="*70)
# Choose the highest-variance z1 dimension from the data so the banner always
# names the dimension that is actually analysed below.
best_z1 = np.argmax(mu1.var(0))
z1_best = mu1[:, best_z1]
best_var = z1_best.var()
print(f"z1[{best_z1}] (highest var={best_var:.3f}): value distribution vs T1 raw ranges")
print("="*70)
print(f"Best z1 dim: {best_z1}, var={best_var:.4f}")

# Split into top/bottom 20% by z1 value (strict inequalities, so ties at the
# percentile boundaries fall in neither group).
p20, p80 = np.percentile(z1_best, [20, 80])
lo_mask = z1_best < p20
hi_mask = z1_best > p80
print(f"\nT1 feature means: LOW z1[{best_z1}] (bot20%) vs HIGH z1[{best_z1}] (top20%)")
print(f"{'Feature':<20} {'LOW':>8} {'HIGH':>8} {'diff':>8}")
if lo_mask.any() and hi_mask.any():
    for fd, name in enumerate(T1_NAMES):
        lo_val = t1_raw[lo_mask, fd].mean()
        hi_val = t1_raw[hi_mask, fd].mean()
        diff = hi_val - lo_val
        # Only report features whose group means differ noticeably.
        if abs(diff) > 0.02:
            print(f"  {name:<18} {lo_val:8.4f} {hi_val:8.4f} {diff:+8.4f}")
else:
    # A (near-)constant latent leaves one of the groups empty; averaging an
    # empty selection would only produce NaN and a runtime warning.
    print("  (z1 values too concentrated to form bottom/top 20% groups)")

# ── ExF correlation check ─────────────────────────────────────────────────
print("\n" + "="*70)
print(f"z1[{best_z1}] vs ExF (T3) features")
print("="*70)
# Pair each ExF label with its column; zip stops at the shorter of the label
# list and the available columns, replacing the hard-coded count of 25.
with np.errstate(invalid='ignore', divide='ignore'):
    all_exf = [(np.corrcoef(z1_best, col)[0, 1], name)
               for name, col in zip(EXF_FIELDS, t3_raw.T)]
# Constant columns give NaN correlations. NaN compares False against
# everything, so leaving it in the sort key makes the ordering undefined.
exf_corrs = [(r, name) for r, name in all_exf if np.isfinite(r)]
exf_corrs.sort(key=lambda x: abs(x[0]), reverse=True)
for r, name in exf_corrs[:10]:
    print(f"  {name:<20} r={r:+.4f}")
n_undefined = len(all_exf) - len(exf_corrs)
if n_undefined:
    print(f"  ({n_undefined} ExF features skipped: correlation undefined)")

# ── z0 clustering: what do the 7 clusters look like? ─────────────────────
N_CLUSTERS = 7  # single source of truth for the cluster count used below
print("\n" + "="*70)
print(f"z0 cluster analysis (k={N_CLUSTERS}, what separates them?)")
print("="*70)
try:
    # Imported inside the best-effort block so a missing SciPy is reported
    # like any other clustering failure instead of aborting the script.
    from scipy.cluster.vq import kmeans2
    centroids, labels = kmeans2(mu0, N_CLUSTERS, seed=42, minit='points')
    # minlength keeps one count per cluster even when trailing clusters are empty.
    print(f"Cluster sizes: {np.bincount(labels, minlength=N_CLUSTERS)}")
    print(f"\nCluster centroids (z0 space):")
    for k in range(N_CLUSTERS):
        c = centroids[k]
        mask_k = labels == k
        if not mask_k.any():
            # kmeans2 may leave a cluster empty; there is nothing to average.
            print(f"\n  Cluster {k} (N=0): z0={c.round(3)}  (empty)")
            continue
        # Mean raw T0/T1 features of the members, to show what sets the cluster apart.
        t0_k = t0_raw[mask_k].mean(0)
        t1_k = t1_raw[mask_k].mean(0)
        print(f"\n  Cluster {k} (N={mask_k.sum():,}): z0={c.round(3)}")
        print(f"    T0: bull={t0_k[0]:.3f} bear={t0_k[1]:.3f} side={t0_k[2]:.3f}")
        # NOTE(review): the "w50" labels assume the first five T1 columns belong
        # to a 50-sample window, i.e. WINDOWS[0] == 50 — confirm.
        print(f"    T1: log_lmax_w50={t1_k[0]:.3f} vel_norm_w50={t1_k[1]:+.3f} gap_ratio_w50={t1_k[2]:.3f} inst={t1_k[3]:.3f} rtp={t1_k[4]:.3f}")
except Exception as e:
    # Clustering is an optional extra at the end of the report: say why it
    # failed and carry on.
    print(f"Clustering failed: {e}")

print("\nDone.")