Files
DOLPHIN/nautilus_dolphin/dvae/exp5_dvae_twopass.py

213 lines
9.4 KiB
Python
Raw Normal View History

"""
Exp 5 Two-pass β VAE training.
The question: does a high-β pass (β=4) to "map features", followed by a low-β
pass (β=0.1) for "fidelity", outperform a single-pass β=0.1?
Theory:
Pass 1 (high β): forces the encoder to compress hard; ideally this clusters
similar market states together, even at the cost of reconstruction quality.
Acts as a structured initializer.
Pass 2 (low β): fine-tunes with more fidelity, starting from the structured
initializer rather than random weights.
We test three variants:
A. Single-pass β=0.1 (baseline, AUC 0.6918 from flint_precursor_sweep)
B. Two-pass sequential: β=4 (20ep) → β=0.1 (20ep) on same model
C. Two-pass sequential: β=2 (20ep) → β=0.1 (20ep) (softer first pass)
D. Dual encoder: β=4 encoder + β=0.1 encoder, z concatenated (16-dim total)
Metric: OOS AUC for eigenspace stress prediction (K=5, same as e2e_precursor_auc.py).
Gate: if two-pass AUC > single-pass AUC + 0.02 → meaningful improvement.
Note on β=12 (the user's original suggestion):
β=12 would cause complete posterior collapse even with warmup (β=6 collapsed at 0/20 dims).
β=4 is the practical upper bound where some structure survives.
We test β=2 and β=4 to find the sweet spot.
"""
import sys
# Replace unencodable characters so β/→/◄ glyphs don't crash on narrow consoles.
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
from pathlib import Path
import numpy as np
# Make sibling modules (corpus_builder, flint_hd_vae) importable when this
# script is run from another working directory.
_HERE = Path(__file__).resolve().parent
sys.path.insert(0, str(_HERE))
_CORPUS_PATH = str(_HERE / 'corpus_cache.npz')
# ── Load T1 corpus ────────────────────────────────────────────────────────────
print("Loading 16K eigen corpus...")
from corpus_builder import DolphinCorpus, OFF, T1 as T1_DIM
corpus = DolphinCorpus.load(_CORPUS_PATH)
# Column 1 of the mask selects rows valid for the T1 feature family.
mask = corpus.mask[:, 1]
X_e = corpus.X[mask]
# Slice out the T1 feature columns; .copy() detaches from the big corpus array.
T1_data = X_e[:, OFF[1]:OFF[1]+T1_DIM].copy() # (16607, 20)
N = len(T1_data)
print(f" N={N} T1 shape={T1_data.shape}")
# ── Stress labels (K=5) ───────────────────────────────────────────────────────
# A bar i is labelled "stress" (1.0) when, somewhere in the next K bars, the
# instability feature spikes above its 90th percentile AND the gap feature
# drops below its 10th percentile. Percentiles are computed over the full
# series (NOTE(review): includes the test segment — confirm this leakage is
# acceptable for a threshold-only statistic).
K = 5  # forward-looking label horizon, in bars
inst_w50 = T1_data[:, 3]   # instability feature (col 3)
gap_w50 = T1_data[:, 2]    # gap feature (col 2)
vel_w750 = T1_data[:, 16]  # velocity feature (col 16); NOTE(review): unused below — confirm intentional
inst_p90 = np.percentile(inst_w50, 90)
gap_p10 = np.percentile(gap_w50, 10)
labels = np.zeros(N, dtype=np.float32)
for i in range(N - K):
    # Window [i+1, i+K]: any instability spike AND any gap collapse ahead.
    if np.any(inst_w50[i+1:i+1+K] > inst_p90) and np.any(gap_w50[i+1:i+1+K] < gap_p10):
        labels[i] = 1.0
print(f" Stress labels: {labels.mean()*100:.1f}% positive")
# Chronological split — no shuffling, so the probe never trains on the future.
n_test = N // 4
idx_tr = slice(0, N - n_test)  # NOTE(review): idx_tr/idx_te are not referenced later — confirm
idx_te = slice(N - n_test, N)
# ── AUC helpers ───────────────────────────────────────────────────────────────
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
def eval_auc(z_all, labels, n_test):
    """Out-of-sample AUC of a logistic probe trained on latent codes.

    The last K rows are dropped (their labels would look past the end of the
    series), NaN/inf rows are filtered, and the remainder is split
    chronologically 75/25. A balanced, L2-regularized (C=0.1) logistic
    regression is fit on the first 75% and scored on the last 25%.

    Parameters:
        z_all  : (N, d) latent codes for the whole series.
        labels : (N,) binary stress labels.
        n_test : kept for interface compatibility but NOT used — the holdout
                 size is recomputed as len(valid rows) // 4 after filtering.
    Returns:
        max(auc, 1 - auc), so an anti-correlated probe still scores >= 0.5.
    """
    X_lr = z_all[:-K]    # K is the module-level label horizon
    y_lr = labels[:-K]
    # Drop rows with non-finite latents or labels before fitting.
    valid = np.isfinite(X_lr).all(1) & np.isfinite(y_lr)
    X_lr, y_lr = X_lr[valid], y_lr[valid]
    n = len(X_lr) // 4   # chronological 25% holdout
    X_tr, X_te = X_lr[:-n], X_lr[-n:]
    y_tr, y_te = y_lr[:-n], y_lr[-n:]
    clf = LogisticRegression(class_weight='balanced', max_iter=500, C=0.1)
    clf.fit(X_tr, y_tr)
    preds = clf.predict_proba(X_te)[:, 1]
    auc = roc_auc_score(y_te, preds)
    return max(auc, 1 - auc)
# ── Import FlintHDVAE ─────────────────────────────────────────────────────────
from flint_hd_vae import FlintHDVAE
def build_model(seed=42):
    """Fresh FlintHDVAE with the exp-5 architecture (20 → 512 HD → 8 latent).

    β defaults to 0.1; the two-pass variants override `model.beta` after
    construction for their high-β first pass.
    """
    return FlintHDVAE(input_dim=20, hd_dim=512, latent_dim=8,
                      beta=0.1, seed=seed, use_flint_norm=False)
# VAE training sees only the first 80% of the series; the logistic probe
# applies its own chronological split inside eval_auc.
n_vae_train = int(N * 0.8)
T1_train = T1_data[:n_vae_train]
results = {}
# ── Variant A: Single-pass β=0.1 (baseline) ──────────────────────────────────
print("\n" + "="*55)
print("A. SINGLE-PASS β=0.1 (baseline)")
print("="*55)
m_a = build_model(seed=42)
# 40 epochs — same total budget as the 20+20 two-pass variants, for fairness.
m_a.fit(T1_train, epochs=40, lr=1e-3, batch_size=256, verbose=True, warmup_frac=0.3)
# Encode the FULL series; the probe split happens inside eval_auc.
z_a = m_a.encode(T1_data)
print(f" z var per dim: {z_a.var(0).round(3)}")
# A latent dim with variance > 0.1 counts as "active" (not posterior-collapsed).
print(f" Active dims (var>0.1): {int((z_a.var(0)>0.1).sum())}/8")
auc_a = eval_auc(z_a, labels, n_test)
print(f" OOS AUC = {auc_a:.4f}")
results['A_single_pass_b0.1'] = dict(auc=auc_a, active_dims=int((z_a.var(0)>0.1).sum()),
z_var=z_a.var(0).tolist())
# ── Variant B: Two-pass β=4 → β=0.1 ─────────────────────────────────────────
print("\n" + "="*55)
print("B. TWO-PASS β=4 (20ep) → β=0.1 (20ep)")
print("="*55)
m_b = build_model(seed=42)
print(" Pass 1: β=4, 20 epochs")
# High-β pass: heavy KL pressure to act as a structured initializer.
m_b.beta = 4.0
m_b.fit(T1_train, epochs=20, lr=1e-3, batch_size=256, verbose=True, warmup_frac=0.3)
print(" Pass 2: β=0.1, 20 epochs (continuing from Pass 1 weights)")
# Fidelity pass: lower lr (5e-4) and shorter warmup to fine-tune, not restart.
m_b.beta = 0.1
m_b.fit(T1_train, epochs=20, lr=5e-4, batch_size=256, verbose=True, warmup_frac=0.1)
z_b = m_b.encode(T1_data)
print(f" z var per dim: {z_b.var(0).round(3)}")
print(f" Active dims (var>0.1): {int((z_b.var(0)>0.1).sum())}/8")
auc_b = eval_auc(z_b, labels, n_test)
print(f" OOS AUC = {auc_b:.4f} (vs A: {auc_b-auc_a:+.4f})")
results['B_twopass_b4_b0.1'] = dict(auc=auc_b, active_dims=int((z_b.var(0)>0.1).sum()),
z_var=z_b.var(0).tolist())
# ── Variant C: Two-pass β=2 → β=0.1 ─────────────────────────────────────────
print("\n" + "="*55)
print("C. TWO-PASS β=2 (20ep) → β=0.1 (20ep)")
print("="*55)
m_c = build_model(seed=42)
print(" Pass 1: β=2, 20 epochs")
# Softer first pass than B — probes where the β sweet spot lies.
m_c.beta = 2.0
m_c.fit(T1_train, epochs=20, lr=1e-3, batch_size=256, verbose=True, warmup_frac=0.3)
print(" Pass 2: β=0.1, 20 epochs")
m_c.beta = 0.1
m_c.fit(T1_train, epochs=20, lr=5e-4, batch_size=256, verbose=True, warmup_frac=0.1)
z_c = m_c.encode(T1_data)
print(f" z var per dim: {z_c.var(0).round(3)}")
print(f" Active dims (var>0.1): {int((z_c.var(0)>0.1).sum())}/8")
auc_c = eval_auc(z_c, labels, n_test)
print(f" OOS AUC = {auc_c:.4f} (vs A: {auc_c-auc_a:+.4f})")
results['C_twopass_b2_b0.1'] = dict(auc=auc_c, active_dims=int((z_c.var(0)>0.1).sum()),
z_var=z_c.var(0).tolist())
# ── Variant D: Dual encoder (β=4 ‖ β=0.1, z concatenated) ───────────────────
print("\n" + "="*55)
print("D. DUAL ENCODER: β=4 encoder ‖ β=0.1 encoder (z concat → 16-dim)")
print("="*55)
# Two independently trained encoders; their latents are concatenated so the
# probe sees both the "structure" view (β=4) and the "fidelity" view (β=0.1).
m_d_hi = build_model(seed=42)
m_d_hi.beta = 4.0
print(" Training β=4 encoder (20 epochs)...")
m_d_hi.fit(T1_train, epochs=20, lr=1e-3, batch_size=256, verbose=False, warmup_frac=0.3)
# Different seed so the two encoders don't start from identical weights.
m_d_lo = build_model(seed=123)
m_d_lo.beta = 0.1
print(" Training β=0.1 encoder (40 epochs)...")
m_d_lo.fit(T1_train, epochs=40, lr=1e-3, batch_size=256, verbose=False, warmup_frac=0.3)
z_hi = m_d_hi.encode(T1_data) # (N, 8)
z_lo = m_d_lo.encode(T1_data) # (N, 8)
z_d = np.concatenate([z_hi, z_lo], axis=1) # (N, 16)
print(f" β=4 z var: {z_hi.var(0).round(3)}")
print(f" β=0.1 z var: {z_lo.var(0).round(3)}")
print(f" Combined z shape: {z_d.shape}")
auc_d = eval_auc(z_d, labels, n_test)
print(f" OOS AUC = {auc_d:.4f} (vs A: {auc_d-auc_a:+.4f})")
results['D_dual_b4_b0.1'] = dict(auc=auc_d,
active_dims_hi=int((z_hi.var(0)>0.1).sum()),
active_dims_lo=int((z_lo.var(0)>0.1).sum()),
z_var_hi=z_hi.var(0).tolist(), z_var_lo=z_lo.var(0).tolist())
# ── Summary ───────────────────────────────────────────────────────────────────
GATE = 0.02  # minimum AUC gain over baseline A to count as meaningful
print("\n" + "="*55)
print("EXP 5 — TWO-PASS β SUMMARY")
print("="*55)
print(f"{'Variant':<35} {'AUC':>8} {'vs A':>8} {'ActiveDims':>11}")
print('-'*65)
for k, v in results.items():
    # Variant D stores per-encoder counts; fall back to the low-β encoder's.
    ad = v.get('active_dims', v.get('active_dims_lo', '?'))
    delta = v['auc'] - auc_a
    # (Original had a redundant `'' if delta > 0 else ''` branch — both arms empty.)
    flag = ' ◄ GAIN' if delta >= GATE else ''
    print(f" {k:<33} {v['auc']:>8.4f} {delta:>+8.4f} {str(ad):>11}{flag}")
best = max(results, key=lambda k: results[k]['auc'])
best_auc = results[best]['auc']
print(f"\n Best: {best} AUC={best_auc:.4f}")
if best_auc - auc_a >= GATE:
    # BUG FIX: the "≥" separator was missing, fusing the delta and GATE values.
    print(f" GATE PASS: improvement {best_auc-auc_a:+.4f}{GATE}")
    print(f" → Two-pass training IS beneficial. Adopt for FlintHDVAE.")
else:
    print(f" GATE FAIL: best improvement {best_auc-auc_a:+.4f} < {GATE}")
    print(f" → Two-pass training offers NO meaningful gain on this dataset.")
# Save machine-readable results next to the script for downstream analysis.
import json
out = _HERE / 'exp5_dvae_twopass_results.json'
with open(out, 'w', encoding='utf-8') as f:
    json.dump({'results': results, 'baseline_auc': float(auc_a),
               'gate_threshold': GATE, 'winner': best,
               'note': 'beta=12 not tested (collapses; beta=6 already showed 0/20 active dims)'}, f, indent=2)
print(f"\n Logged → {out}")