# SILOQY 550-bit precursor sweep (standalone analysis script).
"""
|
|||
|
|
SILOQY 550-bit Precursor Sweep — NO MODIFICATIONS TO UPSTREAM CODE.
|
|||
|
|
Runs on full 16K eigen corpus, tests multiple:
|
|||
|
|
- Precursor label thresholds (rare extreme events)
|
|||
|
|
- Horizons (K=5, 10, 20, 50 scans ahead)
|
|||
|
|
- ML approaches: Logistic, Ridge, k-NN, threshold-only baseline
|
|||
|
|
Reports AUC, Precision@TopDecile, and direct proxy predictivity.
|
|||
|
|
"""
|
|||
|
|
import sys, os

# Force UTF-8 console output so the box-drawing characters in the report
# survive on Windows legacy code pages.
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
# Make sibling modules (corpus_builder, SILOQY_NN_Kernel_COMPLETE6) importable.
sys.path.insert(0, os.path.dirname(__file__))
# NOTE(review): hard-coded absolute path to the upstream DOLPHIN project —
# breaks on any other machine; consider an environment variable instead.
sys.path.insert(0, r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict")

import numpy as np
from pathlib import Path
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import TimeSeriesSplit

# Directory containing this script; used to locate the cached corpus.
HERE = Path(__file__).parent
# ── Load corpus ────────────────────────────────────────────────────────────
print("Loading corpus (16K eigen samples)...")
from corpus_builder import DolphinCorpus, OFF, T1 as T1_DIM
corpus = DolphinCorpus.load(str(HERE / 'corpus_cache.npz'))
# Select samples where the tier-1 mask column is set — presumably "tier-1
# features present"; confirm against corpus_builder.
idx_mask = corpus.mask[:, 1]
X_e = corpus.X[idx_mask]
# Tier-1 feature slab: columns [OFF[1], OFF[1]+T1_DIM). Copy so later
# per-column operations do not alias the corpus array.
t1 = X_e[:, OFF[1]:OFF[1]+T1_DIM].copy()
N = len(t1)  # number of usable samples; referenced by build_labels below
print(f"N={N} samples")
# ── Feature extraction ─────────────────────────────────────────────────────
# Fixed column offsets into the tier-1 slab. Assumed layout: each rolling
# window (w50, w150, w300, w750) contributes a 5-feature group in order
# (lmax, vel, gap, inst, ...) — TODO confirm against corpus_builder.
vel_w50 = t1[:, 1]
vel_w150 = t1[:, 6]
vel_w300 = t1[:, 11]
vel_w750 = t1[:, 16]
inst_w50 = t1[:, 3]
inst_w150= t1[:, 8]
inst_w300= t1[:, 13]
gap_w50 = t1[:, 2]
gap_w300 = t1[:, 12]
lmax_w50 = t1[:, 0]

# Hand-crafted precursor proxies. The proxy_A coefficients presumably come
# from a prior linear fit — verify their provenance before reuse.
proxy_A = -0.674*vel_w750 - 0.357*vel_w300 + 0.421*inst_w50
proxy_B = inst_w50 - vel_w750
proxy_C = vel_w50 - vel_w750
proxy_D = inst_w50 * (-vel_w750)
proxy_E = (inst_w50 - inst_w300) - (vel_w50 - vel_w750)

# Column order here must match proxy_names below (index 1 = B, 2 = C are
# reused by the temporal-lead analysis at the bottom of the file).
X_proxies = np.column_stack([proxy_A, proxy_B, proxy_C, proxy_D, proxy_E])
proxy_names = ['A(linear)', 'B(inst-vel750)', 'C(vel50-vel750,k=3798)', 'D(inst*-vel750)', 'E(dinst-dvel)']
# ── 550-bit MCDAIN normalization (from flint_dvae_kernel.py, read-only) ────
print("\nApplying 550-bit MCDAIN normalization to proxies...")
# High-precision arbitrary-float helpers from the upstream kernel; used
# read-only — this script does not modify upstream code.
from SILOQY_NN_Kernel_COMPLETE6 import arb, safe_float, FLINT_AVAILABLE, with_precision
def mcdain_550bit(X_raw):
    """Read-only implementation of MCDAIN analytical logic at 550-bit.

    For each column, derives mean/scale/gate statistics from the column's
    mean absolute magnitude using 550-bit ``arb`` arithmetic, then applies
    an affine normalization clipped to [-10, 10].

    Parameters
    ----------
    X_raw : np.ndarray, shape (rows, cols)
        Raw proxy matrix. Non-finite entries are ignored when computing
        the per-column magnitude statistic.

    Returns
    -------
    np.ndarray
        float64 array of the same shape; NaN/inf sanitized. Degenerate
        columns (all non-finite, or near-zero magnitude) are left as zeros.
    """
    cols = X_raw.shape[1]  # fix: `rows` was unpacked but never used
    X_norm = np.zeros_like(X_raw, dtype=np.float64)
    with with_precision(550):
        for j in range(cols):
            col = X_raw[:, j]
            col_abs = np.abs(col[np.isfinite(col)])
            # Degenerate column: keep the zero-filled default.
            if len(col_abs) == 0 or col_abs.mean() < 1e-12:
                continue
            # All derived statistics flow from the mean |magnitude|,
            # computed at 550-bit precision.
            magnitude = arb(str(float(col_abs.mean())))
            log_mag = magnitude.log()
            mean_val = magnitude * arb("0.1")
            scale_val = arb("1.0") / (log_mag + arb("1e-8"))
            # sigmoid(log_mag): gates small-magnitude columns toward zero.
            gate_val = arb("1.0") / (arb("1.0") + (-log_mag).exp())
            m = safe_float(mean_val)
            s = safe_float(scale_val)
            g = safe_float(gate_val)
            X_norm[:, j] = np.clip((X_raw[:, j] - m) * s * g, -10, 10)
    X_norm = np.nan_to_num(X_norm, nan=0.0, posinf=5.0, neginf=-5.0)
    return X_norm
# Normalize the proxy matrix and report post-normalization spread and
# tail behavior (kurtosis) as a sanity check.
X_norm = mcdain_550bit(X_proxies)
print(f" Normalized. std per proxy: {X_norm.std(0).round(4)}")
# Fourth standardized moment per column (epsilon guards a zero std).
print(f" Kurtosis after normalization: {[round(float(((X_norm[:,j]-X_norm[:,j].mean())**4).mean()/(X_norm[:,j].std()**4+1e-8)),2) for j in range(5)]}")
# ── Build precursor labels at multiple thresholds and horizons ─────────────
print("\n" + "="*65)
print("PRECURSOR LABEL SWEEP")
print("="*65)

# inst_w50 thresholds (what percentile constitutes "stress"?)
inst_p80 = np.percentile(inst_w50, 80)  # lenient
inst_p90 = np.percentile(inst_w50, 90)  # moderate
inst_p95 = np.percentile(inst_w50, 95)  # strict
gap_p20 = np.percentile(gap_w50, 20)  # gap collapse (low = collapse)
gap_p10 = np.percentile(gap_w50, 10)  # strict gap collapse

print(f"inst_w50 thresholds: p80={inst_p80:.4f} p90={inst_p90:.4f} p95={inst_p95:.4f}")
print(f"gap_w50 thresholds: p20={gap_p20:.4f} p10={gap_p10:.4f}")
def build_labels(horizon, inst_thresh, gap_thresh, inst=None, gap=None):
    """Did eigenspace stress (inst spike AND gap collapse) occur in next K scans?

    Parameters
    ----------
    horizon : int
        Number of future scans to examine (exclusive of the current scan).
    inst_thresh : float
        Instability level above which a future scan counts as a spike.
    gap_thresh : float
        Gap level below which a future scan counts as a collapse.
    inst, gap : np.ndarray, optional
        Series to label. Default to the module-level inst_w50 / gap_w50,
        preserving the original call signature.

    Returns
    -------
    np.ndarray of float32
        1.0 where both events occur within the future window; the trailing
        `horizon` entries are always 0 (no complete future window exists).
    """
    if inst is None:
        inst = inst_w50
    if gap is None:
        gap = gap_w50
    n = len(inst)
    labels = np.zeros(n, dtype=np.float32)
    # Hoist the threshold comparisons out of the window loop — one vectorized
    # pass instead of O(n * horizon) scalar comparisons.
    inst_hit = inst > inst_thresh
    gap_hit = gap < gap_thresh
    for i in range(n - horizon):
        if inst_hit[i+1:i+1+horizon].any() and gap_hit[i+1:i+1+horizon].any():
            labels[i] = 1.0
    return labels
# Sweep configurations: (label, horizon K, inst threshold, gap threshold).
configs = [
    ('K=10 lenient', 10, inst_p80, gap_p20),
    ('K=10 moderate', 10, inst_p90, gap_p10),
    ('K=20 moderate', 20, inst_p90, gap_p10),
    ('K=20 strict', 20, inst_p95, gap_p10),
    ('K=50 strict', 50, inst_p95, gap_p10),
]
# Main sweep: for each config, score every proxy directly, then fit
# LogReg and k-NN on all proxies with a chronological train/val split.
results = []
for cfg_name, K, it, gt in configs:
    y = build_labels(K, it, gt)
    pos_rate = y.mean()
    print(f"\n [{cfg_name}] K={K} inst>{it:.3f} gap<{gt:.3f} pos_rate={pos_rate*100:.1f}%")

    # Skip degenerate label sets (too rare or too common to learn from).
    if pos_rate < 0.02 or pos_rate > 0.60:
        print(f" Skipping (pos_rate out of range)")
        continue

    # ── Evaluate each proxy directly ─────────────────────────────────────
    print(f" Direct proxy AUC (no model):")
    best_proxy_auc = 0
    for j, pname in enumerate(proxy_names):
        # Drop the trailing K samples, whose labels are always 0 by
        # construction (no complete future window).
        px = X_norm[:-K, j] if K > 0 else X_norm[:, j]
        yy = y[:-K] if K > 0 else y
        valid = np.isfinite(px) & np.isfinite(yy)
        if valid.sum() < 100:
            continue
        try:
            auc = roc_auc_score(yy[valid], px[valid])
            auc = max(auc, 1-auc)  # flip if < 0.5 (sign-agnostic ranking)
            best_proxy_auc = max(best_proxy_auc, auc)
            if auc > 0.52:
                print(f" {pname:<30} AUC={auc:.4f} *")
            else:
                print(f" {pname:<30} AUC={auc:.4f}")
        except Exception:
            pass  # e.g. a single-class label slice; skip silently

    # ── Logistic regression on all proxies ───────────────────────────────
    Xf = X_norm[:-K]
    yf = y[:-K]
    valid = np.isfinite(Xf).all(1) & np.isfinite(yf)
    Xf, yf = Xf[valid], yf[valid]
    if len(Xf) < 200:
        continue

    # Fix: initialize per-config. The original recorded
    # `auc_lr if 'auc_lr' in dir() else 0`, which reused a STALE value from
    # an earlier config whenever a later model fit failed.
    auc_lr = 0.0
    auc_knn = 0.0
    try:
        # Chronological hold-out: last quarter is the validation set.
        n_val = len(Xf) // 4
        X_train, X_val = Xf[:-n_val], Xf[-n_val:]
        y_train, y_val = yf[:-n_val], yf[-n_val:]
        lr = LogisticRegression(class_weight='balanced', max_iter=500, C=0.1)
        lr.fit(X_train, y_train)
        preds = lr.predict_proba(X_val)[:, 1]
        auc_lr = roc_auc_score(y_val, preds)
        auc_lr = max(auc_lr, 1-auc_lr)
        ap_lr = average_precision_score(y_val, preds)
        print(f" LogReg (OOS): AUC={auc_lr:.4f} AvgPrecision={ap_lr:.4f}")
    except Exception as ex:
        print(f" LogReg failed: {ex}")

    # ── k-NN (captures non-linear manifold structure) ─────────────────────
    try:
        knn = KNeighborsClassifier(n_neighbors=15, metric='euclidean')
        knn.fit(X_train, y_train)
        preds_knn = knn.predict_proba(X_val)[:, 1]
        auc_knn = roc_auc_score(y_val, preds_knn)
        auc_knn = max(auc_knn, 1-auc_knn)
        print(f" k-NN (k=15): AUC={auc_knn:.4f}")
    except Exception as ex:
        print(f" kNN failed: {ex}")

    results.append((cfg_name, K, pos_rate, best_proxy_auc, auc_lr, auc_knn))
# ── Temporal structure: HOW MANY SCANS AHEAD does the signal lead? ─────────
# Sweep the horizon K and track how the single-proxy AUC of B and C decays
# as the prediction target moves further into the future.
print("\n" + "="*65)
print("TEMPORAL LEAD STRUCTURE: proxy_B vs future inst/gap (by horizon)")
print("="*65)
print(f" {'Horizon':>10} {'AUC(B)':>8} {'AUC(C)':>8} {'pos_rate':>9}")
for K in [1, 2, 5, 10, 20, 30, 50, 100]:
    y_k = build_labels(K, inst_p90, gap_p10)
    rate = y_k.mean()
    # Skip horizons whose labels are degenerate (too rare or near-universal).
    if not (0.01 <= rate <= 0.80):
        continue
    score_b = X_norm[:-K, 1]  # proxy_B normalized
    score_c = X_norm[:-K, 2]  # proxy_C normalized
    target = y_k[:-K]
    usable = np.isfinite(score_b) & np.isfinite(score_c) & np.isfinite(target)
    if usable.sum() < 100:
        continue
    try:
        aB = roc_auc_score(target[usable], score_b[usable])
        aC = roc_auc_score(target[usable], score_c[usable])
        # Sign-agnostic: report whichever direction ranks better.
        aB = max(aB, 1 - aB)
        aC = max(aC, 1 - aC)
        print(f" K={K:>3} scans ahead: AUC(B)={aB:.4f} AUC(C)={aC:.4f} pos={rate*100:.1f}%")
    except Exception:
        pass
# ── 550-bit DVAE question: variance per proxy before/after normalization ───
print("\n" + "="*65)
print("550-BIT FLINT EFFECT: variance recovery in heavy-tailed proxies")
print("="*65)


def _kurtosis(series):
    """Fourth standardized moment, with an epsilon guard against zero std."""
    centered = series - series.mean()
    return float((centered**4).mean() / (series.std()**4 + 1e-8))


for j, pname in enumerate(proxy_names):
    raw_col = X_proxies[:, j]
    norm_col = X_norm[:, j]
    kurt_raw = _kurtosis(raw_col)
    kurt_norm = _kurtosis(norm_col)
    # Fraction of samples that a plain float64 z-score would push past ±3σ.
    z_scores = (raw_col - raw_col.mean()) / (raw_col.std() + 1e-8)
    clip_pct = (np.abs(z_scores) > 3).mean() * 100
    print(f" {pname:<32} kurt_raw={kurt_raw:8.1f} kurt_norm={kurt_norm:6.2f} "
          f"tail_samples={clip_pct:.1f}%_beyond_3sigma")

print("\nDone.")