"""
Exp 5 — Two-pass β VAE training.

The question: does a high-β pass (β=4) to "map features" followed by a
low-β pass (β=0.1) for "fidelity" outperform single-pass β=0.1?

Theory:
Pass 1 (high β): forces the encoder to compress — ideally clusters similar
market states together, even at the cost of reconstruction quality.
Acts as a structured initializer.
Pass 2 (low β): fine-tunes with more fidelity, starting from the structured
initializer rather than from random weights.

We test four variants:
A. Single-pass β=0.1 (baseline, AUC≈0.6918 from flint_precursor_sweep)
B. Two-pass sequential: β=4 (20ep) → β=0.1 (20ep) on the same model
C. Two-pass sequential: β=2 (20ep) → β=0.1 (20ep) (softer first pass)
D. Dual encoder: β=4 encoder + β=0.1 encoder, z concatenated (16-dim total)

Metric: OOS AUC for eigenspace stress prediction (K=5, same as
e2e_precursor_auc.py).
Gate: if two-pass AUC > single-pass AUC + 0.02 → meaningful improvement.

Note on β=12 (the user's original suggestion):
β=12 would cause complete posterior collapse even with warmup (β=6 already
collapsed to 0/20 active dims). β=4 is the practical upper bound where some
structure survives. We test β=2 and β=4 to find the sweet spot.
"""
import sys

# This script prints β / arrow glyphs; some consoles (notably Windows cp1252)
# can't encode them, so force UTF-8 on stdout and replace anything unencodable.
sys.stdout.reconfigure(encoding='utf-8', errors='replace')

from pathlib import Path
import numpy as np

# Make sibling modules (corpus_builder, flint_hd_vae) importable regardless of
# the working directory the script is launched from.
_HERE = Path(__file__).resolve().parent
sys.path.insert(0, str(_HERE))

# Cached eigen corpus produced elsewhere — assumed to exist; no existence
# check is performed here (DolphinCorpus.load will raise otherwise).
_CORPUS_PATH = str(_HERE / 'corpus_cache.npz')
|
# ── Load T1 corpus ────────────────────────────────────────────────────────────
print("Loading 16K eigen corpus...")
from corpus_builder import DolphinCorpus, OFF, T1 as T1_DIM
corpus = DolphinCorpus.load(_CORPUS_PATH)
# NOTE(review): column 1 of the mask presumably selects rows where the T1
# feature tier is valid — confirm against corpus_builder.
mask = corpus.mask[:, 1]
X_e = corpus.X[mask]
# Slice the T1 feature tier out of the concatenated feature matrix using the
# tier offsets exported by corpus_builder.
T1_data = X_e[:, OFF[1]:OFF[1]+T1_DIM].copy() # (16607, 20)
N = len(T1_data)
print(f" N={N} T1 shape={T1_data.shape}")
|
# ── Stress labels (K=5) ───────────────────────────────────────────────────────
# A row is labeled "stress" (1.0) when, within the next K steps, instability
# exceeds its 90th percentile AND the gap drops below its 10th percentile.
# NOTE(review): percentiles are computed over the full series, including the
# evaluation period — a mild lookahead in the label definition; confirm this
# matches e2e_precursor_auc.py before comparing AUCs across scripts.
K = 5
inst_w50 = T1_data[:, 3]   # instability, 50-step window — TODO confirm column map
gap_w50 = T1_data[:, 2]    # eigen gap, 50-step window — TODO confirm column map
# (removed dead code: vel_w750 = T1_data[:, 16] was never used anywhere)
inst_p90 = np.percentile(inst_w50, 90)
gap_p10 = np.percentile(gap_w50, 10)

labels = np.zeros(N, dtype=np.float32)
# Look at the forward window [i+1, i+K]; the last K rows keep label 0 because
# their forward window is incomplete (eval_auc drops them anyway).
for i in range(N - K):
    if np.any(inst_w50[i+1:i+1+K] > inst_p90) and np.any(gap_w50[i+1:i+1+K] < gap_p10):
        labels[i] = 1.0
print(f" Stress labels: {labels.mean()*100:.1f}% positive")
|
# Chronological split: reserve the final quarter of the series as the
# out-of-sample period. Only n_test is consumed downstream (it is passed to
# eval_auc). The original idx_tr/idx_te slice objects were dead code — never
# referenced anywhere in this file — and have been removed.
n_test = N // 4
|
# ── AUC helpers ───────────────────────────────────────────────────────────────
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

def eval_auc(z_all, labels, n_test):
    """Chronological logistic-regression probe over a latent matrix.

    Fits a balanced L2 logistic regression on the first three quarters of the
    finite rows and returns the out-of-sample ROC AUC on the final quarter,
    folded around 0.5 (an anti-predictive probe is treated as predictive).

    NOTE(review): `n_test` is accepted for interface compatibility but is not
    used — the holdout size is derived from the number of valid rows. Uses the
    module-level constant K.
    """
    # Drop the trailing K rows: their forward-looking labels are undefined.
    feats = z_all[:-K]
    targets = labels[:-K]
    # Keep only rows that are finite in every feature and in the label.
    keep = np.isfinite(feats).all(1) & np.isfinite(targets)
    feats = feats[keep]
    targets = targets[keep]
    # Final quarter (chronologically) is the holdout.
    holdout = len(feats) // 4
    probe = LogisticRegression(class_weight='balanced', max_iter=500, C=0.1)
    probe.fit(feats[:-holdout], targets[:-holdout])
    scores = probe.predict_proba(feats[-holdout:])[:, 1]
    auc = roc_auc_score(targets[-holdout:], scores)
    # Fold: report max(auc, 1-auc) so sign-flipped latents still score.
    return max(auc, 1 - auc)
|
# ── Import FlintHDVAE ─────────────────────────────────────────────────────────
from flint_hd_vae import FlintHDVAE

def build_model(seed=42):
    """Fresh 20 → 512 → 8 FlintHDVAE.

    β starts at the low-pass value (0.1); variants mutate `.beta` before
    fitting. Flint normalization is disabled for all variants.
    """
    config = dict(input_dim=20, hd_dim=512, latent_dim=8,
                  beta=0.1, seed=seed, use_flint_norm=False)
    return FlintHDVAE(**config)
|
# VAEs are trained on the first 80% of the series (chronological); encoding
# and the AUC probe run over all rows.
n_vae_train = int(N * 0.8)
T1_train = T1_data[:n_vae_train]

# Per-variant metrics, keyed by variant id; dumped to JSON at the end.
results = {}
|
# ── Variant A: Single-pass β=0.1 (baseline) ──────────────────────────────────
|
||
|
|
print("\n" + "="*55)
|
||
|
|
print("A. SINGLE-PASS β=0.1 (baseline)")
|
||
|
|
print("="*55)
|
||
|
|
m_a = build_model(seed=42)
|
||
|
|
m_a.fit(T1_train, epochs=40, lr=1e-3, batch_size=256, verbose=True, warmup_frac=0.3)
|
||
|
|
z_a = m_a.encode(T1_data)
|
||
|
|
print(f" z var per dim: {z_a.var(0).round(3)}")
|
||
|
|
print(f" Active dims (var>0.1): {int((z_a.var(0)>0.1).sum())}/8")
|
||
|
|
auc_a = eval_auc(z_a, labels, n_test)
|
||
|
|
print(f" OOS AUC = {auc_a:.4f}")
|
||
|
|
results['A_single_pass_b0.1'] = dict(auc=auc_a, active_dims=int((z_a.var(0)>0.1).sum()),
|
||
|
|
z_var=z_a.var(0).tolist())
|
||
|
|
|
||
|
|
# ── Variant B: Two-pass β=4 → β=0.1 ─────────────────────────────────────────
|
||
|
|
print("\n" + "="*55)
|
||
|
|
print("B. TWO-PASS β=4 (20ep) → β=0.1 (20ep)")
|
||
|
|
print("="*55)
|
||
|
|
m_b = build_model(seed=42)
|
||
|
|
|
||
|
|
print(" Pass 1: β=4, 20 epochs")
|
||
|
|
m_b.beta = 4.0
|
||
|
|
m_b.fit(T1_train, epochs=20, lr=1e-3, batch_size=256, verbose=True, warmup_frac=0.3)
|
||
|
|
|
||
|
|
print(" Pass 2: β=0.1, 20 epochs (continuing from Pass 1 weights)")
|
||
|
|
m_b.beta = 0.1
|
||
|
|
m_b.fit(T1_train, epochs=20, lr=5e-4, batch_size=256, verbose=True, warmup_frac=0.1)
|
||
|
|
|
||
|
|
z_b = m_b.encode(T1_data)
|
||
|
|
print(f" z var per dim: {z_b.var(0).round(3)}")
|
||
|
|
print(f" Active dims (var>0.1): {int((z_b.var(0)>0.1).sum())}/8")
|
||
|
|
auc_b = eval_auc(z_b, labels, n_test)
|
||
|
|
print(f" OOS AUC = {auc_b:.4f} (vs A: {auc_b-auc_a:+.4f})")
|
||
|
|
results['B_twopass_b4_b0.1'] = dict(auc=auc_b, active_dims=int((z_b.var(0)>0.1).sum()),
|
||
|
|
z_var=z_b.var(0).tolist())
|
||
|
|
|
||
|
|
# ── Variant C: Two-pass β=2 → β=0.1 ─────────────────────────────────────────
|
||
|
|
print("\n" + "="*55)
|
||
|
|
print("C. TWO-PASS β=2 (20ep) → β=0.1 (20ep)")
|
||
|
|
print("="*55)
|
||
|
|
m_c = build_model(seed=42)
|
||
|
|
|
||
|
|
print(" Pass 1: β=2, 20 epochs")
|
||
|
|
m_c.beta = 2.0
|
||
|
|
m_c.fit(T1_train, epochs=20, lr=1e-3, batch_size=256, verbose=True, warmup_frac=0.3)
|
||
|
|
|
||
|
|
print(" Pass 2: β=0.1, 20 epochs")
|
||
|
|
m_c.beta = 0.1
|
||
|
|
m_c.fit(T1_train, epochs=20, lr=5e-4, batch_size=256, verbose=True, warmup_frac=0.1)
|
||
|
|
|
||
|
|
z_c = m_c.encode(T1_data)
|
||
|
|
print(f" z var per dim: {z_c.var(0).round(3)}")
|
||
|
|
print(f" Active dims (var>0.1): {int((z_c.var(0)>0.1).sum())}/8")
|
||
|
|
auc_c = eval_auc(z_c, labels, n_test)
|
||
|
|
print(f" OOS AUC = {auc_c:.4f} (vs A: {auc_c-auc_a:+.4f})")
|
||
|
|
results['C_twopass_b2_b0.1'] = dict(auc=auc_c, active_dims=int((z_c.var(0)>0.1).sum()),
|
||
|
|
z_var=z_c.var(0).tolist())
|
||
|
|
|
||
|
|
# ── Variant D: Dual encoder (β=4 ‖ β=0.1, z concatenated) ───────────────────
|
||
|
|
print("\n" + "="*55)
|
||
|
|
print("D. DUAL ENCODER: β=4 encoder ‖ β=0.1 encoder (z concat → 16-dim)")
|
||
|
|
print("="*55)
|
||
|
|
m_d_hi = build_model(seed=42)
|
||
|
|
m_d_hi.beta = 4.0
|
||
|
|
print(" Training β=4 encoder (20 epochs)...")
|
||
|
|
m_d_hi.fit(T1_train, epochs=20, lr=1e-3, batch_size=256, verbose=False, warmup_frac=0.3)
|
||
|
|
|
||
|
|
m_d_lo = build_model(seed=123)
|
||
|
|
m_d_lo.beta = 0.1
|
||
|
|
print(" Training β=0.1 encoder (40 epochs)...")
|
||
|
|
m_d_lo.fit(T1_train, epochs=40, lr=1e-3, batch_size=256, verbose=False, warmup_frac=0.3)
|
||
|
|
|
||
|
|
z_hi = m_d_hi.encode(T1_data) # (N, 8)
|
||
|
|
z_lo = m_d_lo.encode(T1_data) # (N, 8)
|
||
|
|
z_d = np.concatenate([z_hi, z_lo], axis=1) # (N, 16)
|
||
|
|
|
||
|
|
print(f" β=4 z var: {z_hi.var(0).round(3)}")
|
||
|
|
print(f" β=0.1 z var: {z_lo.var(0).round(3)}")
|
||
|
|
print(f" Combined z shape: {z_d.shape}")
|
||
|
|
auc_d = eval_auc(z_d, labels, n_test)
|
||
|
|
print(f" OOS AUC = {auc_d:.4f} (vs A: {auc_d-auc_a:+.4f})")
|
||
|
|
results['D_dual_b4_b0.1'] = dict(auc=auc_d,
|
||
|
|
active_dims_hi=int((z_hi.var(0)>0.1).sum()),
|
||
|
|
active_dims_lo=int((z_lo.var(0)>0.1).sum()),
|
||
|
|
z_var_hi=z_hi.var(0).tolist(), z_var_lo=z_lo.var(0).tolist())
|
||
|
|
|
||
|
|
# ── Summary ───────────────────────────────────────────────────────────────────
GATE = 0.02 # improvement threshold
print("\n" + "="*55)
print("EXP 5 — TWO-PASS β SUMMARY")
print("="*55)
print(f"{'Variant':<35} {'AUC':>8} {'vs A':>8} {'ActiveDims':>11}")
print('-'*65)
for k, v in results.items():
    # Variant D stores per-encoder counts ('active_dims_hi'/'_lo') instead of
    # a single 'active_dims'; fall back to the low-β encoder's count.
    ad = v.get('active_dims', v.get('active_dims_lo', '?'))
    delta = v['auc'] - auc_a
    # ◄ GAIN = passes the gate; △ = positive but below the gate.
    flag = ' ◄ GAIN' if delta >= GATE else (' △' if delta > 0 else '')
    print(f" {k:<33} {v['auc']:>8.4f} {delta:>+8.4f} {str(ad):>11}{flag}")

best = max(results, key=lambda k: results[k]['auc'])
best_auc = results[best]['auc']
print(f"\n Best: {best} AUC={best_auc:.4f}")
# Gate verdict: note that if baseline A itself is the best, delta is 0 and
# the gate fails, which is the intended reading (two-pass gained nothing).
if best_auc - auc_a >= GATE:
    print(f" GATE PASS: improvement {best_auc-auc_a:+.4f} ≥ {GATE}")
    print(f" → Two-pass training IS beneficial. Adopt for FlintHDVAE.")
else:
    print(f" GATE FAIL: best improvement {best_auc-auc_a:+.4f} < {GATE}")
    print(f" → Two-pass training offers NO meaningful gain on this dataset.")
|
# Save results next to the script so the experiment log stays self-contained.
import json
out = _HERE / 'exp5_dvae_twopass_results.json'
with open(out, 'w', encoding='utf-8') as f:
    # float() on auc_a ensures a plain JSON number (numpy scalars are not
    # JSON-serializable by default).
    json.dump({'results': results, 'baseline_auc': float(auc_a),
               'gate_threshold': GATE, 'winner': best,
               'note': 'beta=12 not tested (collapses; beta=6 already showed 0/20 active dims)'}, f, indent=2)
print(f"\n Logged → {out}")