# flint_precursor_sweep.py — DOLPHIN/nautilus_dolphin/dvae (web-viewer header removed)
"""
SILOQY 550-bit Precursor Sweep NO MODIFICATIONS TO UPSTREAM CODE.
Runs on full 16K eigen corpus, tests multiple:
- Precursor label thresholds (rare extreme events)
- Horizons (K=5, 10, 20, 50 scans ahead)
- ML approaches: Logistic, Ridge, k-NN, threshold-only baseline
Reports AUC, Precision@TopDecile, and direct proxy predictivity.
"""
import sys, os
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
sys.path.insert(0, os.path.dirname(__file__))
sys.path.insert(0, r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict")
import numpy as np
from pathlib import Path
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import TimeSeriesSplit
HERE = Path(__file__).parent
# ── Load corpus ────────────────────────────────────────────────────────────
print("Loading corpus (16K eigen samples)...")
# Deferred project import: corpus_builder sits next to this script (its
# directory was prepended to sys.path above).
from corpus_builder import DolphinCorpus, OFF, T1 as T1_DIM
corpus = DolphinCorpus.load(str(HERE / 'corpus_cache.npz'))
# Column 1 of the mask selects rows with a valid T1 feature block —
# NOTE(review): inferred from the slice below; confirm mask column semantics.
idx_mask = corpus.mask[:, 1]
X_e = corpus.X[idx_mask]
# Slice out the T1 feature group; .copy() detaches it from the corpus array.
t1 = X_e[:, OFF[1]:OFF[1]+T1_DIM].copy()
N = len(t1)  # number of usable samples
print(f"N={N} samples")
# ── Feature extraction ─────────────────────────────────────────────────────
# Named views into the T1 feature block. Column indices are fixed by the
# upstream layout: per-window eigen velocity, instability, gap, lambda-max.
(vel_w50, vel_w150, vel_w300, vel_w750) = (t1[:, c] for c in (1, 6, 11, 16))
(inst_w50, inst_w150, inst_w300) = (t1[:, c] for c in (3, 8, 13))
gap_w50, gap_w300 = t1[:, 2], t1[:, 12]
lmax_w50 = t1[:, 0]
# Candidate precursor proxies built from the raw features.
proxy_A = -0.674*vel_w750 - 0.357*vel_w300 + 0.421*inst_w50  # fitted linear combo
proxy_B = inst_w50 - vel_w750
proxy_C = vel_w50 - vel_w750
proxy_D = inst_w50 * (-vel_w750)
proxy_E = (inst_w50 - inst_w300) - (vel_w50 - vel_w750)
X_proxies = np.column_stack([proxy_A, proxy_B, proxy_C, proxy_D, proxy_E])
proxy_names = ['A(linear)', 'B(inst-vel750)', 'C(vel50-vel750,k=3798)', 'D(inst*-vel750)', 'E(dinst-dvel)']
# ── 550-bit MCDAIN normalization (from flint_dvae_kernel.py, read-only) ────
print("\nApplying 550-bit MCDAIN normalization to proxies...")
from SILOQY_NN_Kernel_COMPLETE6 import arb, safe_float, FLINT_AVAILABLE, with_precision
def mcdain_550bit(X_raw):
    """Re-derive the MCDAIN column normalization analytically at 550-bit
    precision (read-only port of the logic in flint_dvae_kernel.py).

    Per column: estimate the mean absolute magnitude over finite entries,
    then build a shift (m), a log-based scale (s) and a sigmoid gate (g)
    using FLINT arbs, mapping the column through clip((x - m)*s*g, -10, 10).
    Columns that are empty or ~zero-magnitude are left at 0.
    """
    n_rows, n_cols = X_raw.shape
    out = np.zeros_like(X_raw, dtype=np.float64)
    with with_precision(550):
        for col_idx in range(n_cols):
            column = X_raw[:, col_idx]
            finite_abs = np.abs(column[np.isfinite(column)])
            # Skip degenerate columns (all-NaN or effectively zero).
            if len(finite_abs) == 0 or finite_abs.mean() < 1e-12:
                continue
            magnitude = arb(str(float(finite_abs.mean())))
            log_mag = magnitude.log()
            shift = safe_float(magnitude * arb("0.1"))
            scale = safe_float(arb("1.0") / (log_mag + arb("1e-8")))
            gate = safe_float(arb("1.0") / (arb("1.0") + (-log_mag).exp()))
            out[:, col_idx] = np.clip((column - shift) * scale * gate, -10, 10)
    # Defensive: replace any residual non-finite values before returning.
    return np.nan_to_num(out, nan=0.0, posinf=5.0, neginf=-5.0)
# Normalize the 5 proxy columns and report dispersion / tail behavior.
X_norm = mcdain_550bit(X_proxies)
print(f" Normalized. std per proxy: {X_norm.std(0).round(4)}")
# Fourth standardized moment E[(x-mu)^4]/sigma^4 per column (3.0 == Gaussian).
print(f" Kurtosis after normalization: {[round(float(((X_norm[:,j]-X_norm[:,j].mean())**4).mean()/(X_norm[:,j].std()**4+1e-8)),2) for j in range(5)]}")
# ── Build precursor labels at multiple thresholds and horizons ─────────────
print("\n" + "="*65)
print("PRECURSOR LABEL SWEEP")
print("="*65)
# Stress thresholds on inst_w50 (lenient / moderate / strict percentiles)
# and gap-collapse thresholds on gap_w50 (a LOW gap signals collapse).
inst_p80, inst_p90, inst_p95 = np.percentile(inst_w50, [80, 90, 95])
gap_p20, gap_p10 = np.percentile(gap_w50, [20, 10])
print(f"inst_w50 thresholds: p80={inst_p80:.4f} p90={inst_p90:.4f} p95={inst_p95:.4f}")
print(f"gap_w50 thresholds: p20={gap_p20:.4f} p10={gap_p10:.4f}")
def build_labels(horizon, inst_thresh, gap_thresh, inst_series=None, gap_series=None):
    """Label sample i as 1.0 if eigenspace stress occurs within the next
    `horizon` scans: some future inst value exceeds `inst_thresh` AND some
    future gap value drops below `gap_thresh` (not necessarily the same scan).

    Args:
        horizon: number of scans to look ahead (K >= 1).
        inst_thresh: instability spike threshold (strict greater-than).
        gap_thresh: gap collapse threshold (strict less-than).
        inst_series / gap_series: optional 1-D arrays to label; default to
            the module-level inst_w50 / gap_w50 corpus features.

    Returns:
        float32 0/1 label array, same length as the series. The last
        `horizon` entries are always 0 (their future is unobserved).
    """
    inst = inst_w50 if inst_series is None else np.asarray(inst_series)
    gap = gap_w50 if gap_series is None else np.asarray(gap_series)
    n = len(inst)
    labels = np.zeros(n, dtype=np.float32)
    if horizon <= 0 or n <= horizon:
        return labels
    # Vectorized forward scan: row i of each window view covers scans
    # i+1 .. i+horizon (replaces the original O(N*K) Python loop).
    win_inst = np.lib.stride_tricks.sliding_window_view(inst, horizon)[1:]
    win_gap = np.lib.stride_tricks.sliding_window_view(gap, horizon)[1:]
    hit = (win_inst > inst_thresh).any(axis=1) & (win_gap < gap_thresh).any(axis=1)
    labels[:n - horizon] = hit.astype(np.float32)
    return labels
configs = [
    ('K=10 lenient', 10, inst_p80, gap_p20),
    ('K=10 moderate', 10, inst_p90, gap_p10),
    ('K=20 moderate', 20, inst_p90, gap_p10),
    ('K=20 strict', 20, inst_p95, gap_p10),
    ('K=50 strict', 50, inst_p95, gap_p10),
]
results = []
for cfg_name, K, it, gt in configs:
    y = build_labels(K, it, gt)
    pos_rate = y.mean()
    print(f"\n [{cfg_name}] K={K} inst>{it:.3f} gap<{gt:.3f} pos_rate={pos_rate*100:.1f}%")
    # Skip degenerate label sets (too rare or too common to learn from).
    if pos_rate < 0.02 or pos_rate > 0.60:
        print(f" Skipping (pos_rate out of range)")
        continue
    # BUG FIX: the original recorded `auc_lr if 'auc_lr' in dir() else 0`,
    # which silently reused STALE values from a previous config whenever a
    # model failed (and could leak a previous X_train into the k-NN fit).
    # Reset everything explicitly per iteration instead.
    auc_lr = 0.0
    auc_knn = 0.0
    X_train = X_val = y_train = y_val = None
    # ── Evaluate each proxy directly ─────────────────────────────────────
    print(f" Direct proxy AUC (no model):")
    best_proxy_auc = 0
    for j, pname in enumerate(proxy_names):
        px = X_norm[:-K, j] if K > 0 else X_norm[:, j]
        yy = y[:-K] if K > 0 else y
        valid = np.isfinite(px) & np.isfinite(yy)
        if valid.sum() < 100:
            continue
        try:
            auc = roc_auc_score(yy[valid], px[valid])
            auc = max(auc, 1-auc)  # direction-agnostic: flip if < 0.5
            best_proxy_auc = max(best_proxy_auc, auc)
            if auc > 0.52:
                print(f" {pname:<30} AUC={auc:.4f} *")
            else:
                print(f" {pname:<30} AUC={auc:.4f}")
        except Exception:
            pass
    # ── Logistic regression on all proxies ───────────────────────────────
    Xf = X_norm[:-K]
    yf = y[:-K]
    valid = np.isfinite(Xf).all(1) & np.isfinite(yf)
    Xf, yf = Xf[valid], yf[valid]
    if len(Xf) < 200:
        continue
    try:
        # Chronological split: last 25% held out (no shuffling — time series).
        n_val = len(Xf) // 4
        X_train, X_val = Xf[:-n_val], Xf[-n_val:]
        y_train, y_val = yf[:-n_val], yf[-n_val:]
        lr = LogisticRegression(class_weight='balanced', max_iter=500, C=0.1)
        lr.fit(X_train, y_train)
        preds = lr.predict_proba(X_val)[:, 1]
        auc_lr = roc_auc_score(y_val, preds)
        auc_lr = max(auc_lr, 1-auc_lr)
        ap_lr = average_precision_score(y_val, preds)
        print(f" LogReg (OOS): AUC={auc_lr:.4f} AvgPrecision={ap_lr:.4f}")
    except Exception as ex:
        auc_lr = 0.0  # do not report a partial/stale score
        print(f" LogReg failed: {ex}")
    # ── k-NN (captures non-linear manifold structure) ─────────────────────
    if X_train is not None:
        try:
            knn = KNeighborsClassifier(n_neighbors=15, metric='euclidean')
            knn.fit(X_train, y_train)
            preds_knn = knn.predict_proba(X_val)[:, 1]
            auc_knn = roc_auc_score(y_val, preds_knn)
            auc_knn = max(auc_knn, 1-auc_knn)
            print(f" k-NN (k=15): AUC={auc_knn:.4f}")
        except Exception as ex:
            auc_knn = 0.0
            print(f" kNN failed: {ex}")
    results.append((cfg_name, K, pos_rate, best_proxy_auc, auc_lr, auc_knn))
# ── Temporal structure: HOW MANY SCANS AHEAD does the signal lead? ─────────
print("\n" + "="*65)
print("TEMPORAL LEAD STRUCTURE: proxy_B vs future inst/gap (by horizon)")
print("="*65)
print(f" {'Horizon':>10} {'AUC(B)':>8} {'AUC(C)':>8} {'pos_rate':>9}")
for K in [1, 2, 5, 10, 20, 30, 50, 100]:
    y_k = build_labels(K, inst_p90, gap_p10)
    pos = y_k.mean()
    # Only evaluate horizons with a non-degenerate positive rate.
    if not (0.01 <= pos <= 0.80):
        continue
    # Align proxies at scan i with labels describing scans i+1 .. i+K.
    pB, pC = X_norm[:-K, 1], X_norm[:-K, 2]  # normalized proxies B and C
    yy = y_k[:-K]
    valid = np.isfinite(pB) & np.isfinite(pC) & np.isfinite(yy)
    if valid.sum() < 100:
        continue
    try:
        aB = roc_auc_score(yy[valid], pB[valid])
        aC = roc_auc_score(yy[valid], pC[valid])
        # Direction-agnostic: flip any AUC below 0.5.
        aB, aC = max(aB, 1 - aB), max(aC, 1 - aC)
        print(f" K={K:>3} scans ahead: AUC(B)={aB:.4f} AUC(C)={aC:.4f} pos={pos*100:.1f}%")
    except Exception:
        pass
# ── 512-bit DVAE question: variance per proxy before/after normalization ───
print("\n" + "="*65)
print("550-BIT FLINT EFFECT: variance recovery in heavy-tailed proxies")
print("="*65)
# Walk raw and normalized proxy columns in lockstep.
for pname, raw, norm in zip(proxy_names, X_proxies.T, X_norm.T):
    # Fourth standardized moment (kurtosis); large values => heavy tails.
    kurt_raw = float(((raw - raw.mean())**4).mean() / (raw.std()**4 + 1e-8))
    kurt_norm = float(((norm - norm.mean())**4).mean() / (norm.std()**4 + 1e-8))
    # Share of samples that a plain float64 z-score would push past +/-3 sigma.
    z64 = (raw - raw.mean()) / (raw.std() + 1e-8)
    clip_pct = (np.abs(z64) > 3).mean() * 100
    print(f" {pname:<32} kurt_raw={kurt_raw:8.1f} kurt_norm={kurt_norm:6.2f} "
          f"tail_samples={clip_pct:.1f}%_beyond_3sigma")
print("\nDone.")