# (removed extraction-artifact header: duplicated "152 lines / 6.1 KiB / Python" metadata)
"""
|
||
|
|
Task 3: E2E Precursor AUC Test.
|
||
|
|
|
||
|
|
Train FlintHDVAE (beta=0.1) on 80% of 16K T1 corpus.
|
||
|
|
Encode all samples → z (8-dim latent).
|
||
|
|
Build eigenspace stress labels at K=5 scans (25s): inst>p90 AND gap<p10.
|
||
|
|
Test logistic regression on z → stress labels (chronological OOS split).
|
||
|
|
Compare against proxy_B baseline (AUC=0.715 from flint_precursor_sweep.py).
|
||
|
|
|
||
|
|
Gate: AUC ≥ 0.65 → proceed to Task 4.
|
||
|
|
"""
|
||
|
|
import sys, os
|
||
|
|
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
|
||
|
|
sys.path.insert(0, os.path.dirname(__file__))
|
||
|
|
sys.path.insert(0, r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict")
|
||
|
|
|
||
|
|
import numpy as np
|
||
|
|
from pathlib import Path
|
||
|
|
from sklearn.linear_model import LogisticRegression
|
||
|
|
from sklearn.metrics import roc_auc_score, average_precision_score
|
||
|
|
|
||
|
|
HERE = Path(__file__).parent
|
||
|
|
|
||
|
|
# ── Load 16K eigen corpus ─────────────────────────────────────────
print("Loading 16K eigen corpus...")
from corpus_builder import DolphinCorpus, OFF, T1 as T1_DIM

corpus = DolphinCorpus.load(str(HERE / 'corpus_cache.npz'))
# Keep only rows where the tier-1 feature block is marked valid
# (mask column 1), then slice out the T1 feature columns.
tier1_cols = slice(OFF[1], OFF[1] + T1_DIM)
T1 = corpus.X[corpus.mask[:, 1]][:, tier1_cols].copy()   # expected (16607, 20)
N = len(T1)
print(f" N={N} T1 shape={T1.shape}")
|
||
|
|
|
||
|
|
# ── Feature shortcuts for proxy_B baseline ────────────────────────
# Column positions inside the 20-dim T1 block — presumably matching the
# layout used by flint_precursor_sweep.py (TODO confirm against that file).
_COL_GAP_W50, _COL_INST_W50, _COL_VEL_W750 = 2, 3, 16

inst_w50 = T1[:, _COL_INST_W50]
vel_w750 = T1[:, _COL_VEL_W750]
gap_w50 = T1[:, _COL_GAP_W50]
# proxy_B: instability minus long-window velocity (the baseline precursor score).
proxy_B = inst_w50 - vel_w750
|
||
|
|
|
||
|
|
# ── Build stress labels: K=5 scans (25s), inst>p90, gap<p10 ──────
print("\nBuilding stress labels (K=5, inst>p90, gap<p10)...")
K = 5
inst_p90 = np.percentile(inst_w50, 90)
gap_p10 = np.percentile(gap_w50, 10)
print(f" inst_p90={inst_p90:.4f} gap_p10={gap_p10:.4f}")

# Sample i is "stressed" if, anywhere in the next K scans, instability
# spikes above its 90th percentile AND the eigen-gap drops below its 10th.
# Threshold comparisons are hoisted out of the loop; the windowed any()
# over the boolean arrays gives the same labels as the original per-window
# comparisons.
hot_inst = inst_w50 > inst_p90
low_gap = gap_w50 < gap_p10

labels = np.zeros(N, dtype=np.float32)
for i in range(N - K):
    future = slice(i + 1, i + 1 + K)
    if hot_inst[future].any() and low_gap[future].any():
        labels[i] = 1.0

pos_rate = labels.mean()
print(f" Positive rate: {pos_rate*100:.1f}% Positive count: {labels.sum():.0f}")
|
||
|
|
|
||
|
|
# ── Proxy B baseline (no model) ───────────────────────────────────
print("\n" + "="*55)
print("PROXY_B BASELINE (direct, no model)")
print("="*55)
# Drop the last K samples: their label windows extend past the end of the data.
pB_vals = proxy_B[:-K]
y_vals = labels[:-K]
# BUG FIX: the finite-value mask was computed but never applied in the
# original, so any NaN/inf in proxy_B would reach roc_auc_score (which
# rejects NaN inputs).  Filter both arrays here.
valid = np.isfinite(pB_vals) & np.isfinite(y_vals)
pB_vals, y_vals = pB_vals[valid], y_vals[valid]

# Chronological split (same as Task 3 below): last quarter is out-of-sample.
n_test = len(pB_vals) // 4
pB_test = pB_vals[-n_test:]
y_test = y_vals[-n_test:]
auc_pB = roc_auc_score(y_test, pB_test)
# Direction-agnostic: an anti-correlated score is equally informative.
auc_pB = max(auc_pB, 1 - auc_pB)
print(f" proxy_B OOS AUC = {auc_pB:.4f}")
|
||
|
|
|
||
|
|
# ── Train FlintHDVAE ──────────────────────────────────────────────
print("\n" + "="*55)
print("TRAINING FlintHDVAE (beta=0.1, 40 epochs)")
print("="*55)
from flint_hd_vae import FlintHDVAE

# Chronological 80/20 train split for the VAE itself
n_vae_train = int(N * 0.8)
T1_vae_train = T1[:n_vae_train]

model = FlintHDVAE(
    input_dim=20,     # T1 feature width
    hd_dim=512,
    latent_dim=8,
    beta=0.1,
    seed=42,
    use_flint_norm=False,
)
model.fit(
    T1_vae_train,
    epochs=40,
    lr=1e-3,
    batch_size=256,
    verbose=True,
    warmup_frac=0.3,  # fraction of training spent warming up (presumably the KL term — confirm in flint_hd_vae)
)
|
||
|
|
|
||
|
|
# ── Encode full corpus → z ────────────────────────────────────────
print("\nEncoding full 16K corpus → z (8-dim)...")
z_all = model.encode(T1)   # one 8-dim latent row per sample; expected (16607, 8)
z_lo, z_hi = z_all.min(), z_all.max()
print(f" z shape: {z_all.shape}")
print(f" z range: [{z_lo:.3f}, {z_hi:.3f}]")
print(f" z var per dim: {z_all.var(0).round(3)}")
|
||
|
|
|
||
|
|
# ── Logistic regression: z → stress labels ───────────────────────
print("\n" + "="*55)
print("LOGISTIC REGRESSION: z_regime → stress labels (K=5)")
print("="*55)

# Trim the last K rows (labels undefined there), then drop non-finite rows.
X_lr = z_all[:-K]
y_lr = labels[:-K]
valid_lr = np.isfinite(X_lr).all(1) & np.isfinite(y_lr)
X_lr, y_lr = X_lr[valid_lr], y_lr[valid_lr]

# Chronological out-of-sample split: final quarter is the test set.
n_test_lr = len(X_lr) // 4
cut = len(X_lr) - n_test_lr
X_train_lr, X_test_lr = X_lr[:cut], X_lr[cut:]
y_train_lr, y_test_lr = y_lr[:cut], y_lr[cut:]

print(f" Train: {len(X_train_lr)} Test: {len(X_test_lr)}")
print(f" Test pos rate: {y_test_lr.mean()*100:.1f}%")

# Balanced class weights compensate for the low positive rate; C=0.1 keeps
# the 8-dim model strongly regularized.
lr_clf = LogisticRegression(class_weight='balanced', max_iter=500, C=0.1)
lr_clf.fit(X_train_lr, y_train_lr)
preds = lr_clf.predict_proba(X_test_lr)[:, 1]

auc_z = roc_auc_score(y_test_lr, preds)
auc_z = max(auc_z, 1 - auc_z)   # direction-agnostic AUC
ap_z = average_precision_score(y_test_lr, preds)
print(f" z-regime LogReg OOS AUC={auc_z:.4f} AvgPrecision={ap_z:.4f}")
|
||
|
|
|
||
|
|
# ── Combined: z + proxy_B ─────────────────────────────────────────
print("\n" + "="*55)
print("COMBINED: z_regime + proxy_B")
print("="*55)
# 8 latent dims + 1 proxy column, filtered by the same finite-row mask as
# the z-only model so the train/test indices line up exactly.
# (np.column_stack promotes the 1-D proxy to a column automatically.)
X_comb = np.column_stack([z_all[:-K], proxy_B[:-K]])[valid_lr]
X_c_train, X_c_test = X_comb[:-n_test_lr], X_comb[-n_test_lr:]

lr_comb = LogisticRegression(class_weight='balanced', max_iter=500, C=0.1)
lr_comb.fit(X_c_train, y_train_lr)
preds_c = lr_comb.predict_proba(X_c_test)[:, 1]

auc_c = roc_auc_score(y_test_lr, preds_c)
auc_c = max(auc_c, 1 - auc_c)   # direction-agnostic AUC
print(f" Combined OOS AUC={auc_c:.4f}")
|
||
|
|
|
||
|
|
# ── Summary and Gate ──────────────────────────────────────────────
print("\n" + "="*55)
print("SUMMARY")
print("="*55)
print(f" proxy_B direct: AUC = {auc_pB:.4f}")
print(f" z_regime (VAE): AUC = {auc_z:.4f}")
print(f" z + proxy_B: AUC = {auc_c:.4f}")

# Gate decision: best of the three approaches must clear the threshold
# before Task 4 integration is allowed.
GATE_AUC = 0.65
best_auc = max(auc_pB, auc_z, auc_c)
gate_passed = best_auc >= GATE_AUC

print(f"\n Gate threshold: AUC ≥ {GATE_AUC}")
if gate_passed:
    print(f" GATE PASS: best AUC={best_auc:.4f} ≥ {GATE_AUC}")
    print(" → Proceed to Task 4: fork AlphaSignalGenerator with proxy_B gate")
else:
    print(f" GATE FAIL: best AUC={best_auc:.4f} < {GATE_AUC}")
    print(" → Do NOT proceed with gate integration")
|