"""
SILOQY 550-bit Precursor Sweep — NO MODIFICATIONS TO UPSTREAM CODE.

Runs on full 16K eigen corpus, tests multiple:
  - Precursor label thresholds (rare extreme events)
  - Horizons (K=5, 10, 20, 50 scans ahead)
  - ML approaches: Logistic, Ridge, k-NN, threshold-only baseline

Reports AUC, Precision@TopDecile, and direct proxy predictivity.
"""
import os
import sys

sys.stdout.reconfigure(encoding='utf-8', errors='replace')
# Both path entries must be registered before the project-local imports below.
sys.path.insert(0, os.path.dirname(__file__))
sys.path.insert(0, r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict")

import numpy as np
from pathlib import Path
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import TimeSeriesSplit

HERE = Path(__file__).parent

# ── Load corpus ────────────────────────────────────────────────────────────
print("Loading corpus (16K eigen samples)...")
from corpus_builder import DolphinCorpus, OFF, T1 as T1_DIM

corpus = DolphinCorpus.load(str(HERE / 'corpus_cache.npz'))
# Keep only the rows flagged in mask column 1, then cut out the T1 feature
# slice (T1_DIM columns starting at OFF[1]).
idx_mask = corpus.mask[:, 1]
X_e = corpus.X[idx_mask]
t1 = X_e[:, OFF[1]:OFF[1] + T1_DIM].copy()
N = len(t1)
print(f"N={N} samples")

# ── Feature extraction ─────────────────────────────────────────────────────
# Column indices into the T1 slice; the names follow the original script.
vel_w50 = t1[:, 1]
vel_w150 = t1[:, 6]
vel_w300 = t1[:, 11]
vel_w750 = t1[:, 16]
inst_w50 = t1[:, 3]
inst_w150 = t1[:, 8]
inst_w300 = t1[:, 13]
gap_w50 = t1[:, 2]
gap_w300 = t1[:, 12]
lmax_w50 = t1[:, 0]

proxy_A = -0.674*vel_w750 - 0.357*vel_w300 + 0.421*inst_w50
proxy_B = inst_w50 - vel_w750
proxy_C = vel_w50 - vel_w750
proxy_D = inst_w50 * (-vel_w750)
proxy_E = (inst_w50 - inst_w300) - (vel_w50 - vel_w750)

X_proxies = np.column_stack([proxy_A, proxy_B, proxy_C, proxy_D, proxy_E])
proxy_names = [
    'A(linear)',
    'B(inst-vel750)',
    'C(vel50-vel750,k=3798)',
    'D(inst*-vel750)',
    'E(dinst-dvel)',
]

# ── 550-bit MCDAIN normalization (from flint_dvae_kernel.py, read-only) ────
print("\nApplying 550-bit MCDAIN normalization to proxies...")
from SILOQY_NN_Kernel_COMPLETE6 import arb, safe_float, FLINT_AVAILABLE, with_precision


def mcdain_550bit(X_raw):
    """Read-only implementation of MCDAIN analytical logic at 550-bit.

    Each column of ``X_raw`` is normalized independently from a single
    statistic, ``magnitude`` = mean absolute value of its finite entries:

        mean  = 0.1 * magnitude
        scale = 1 / (log(magnitude) + 1e-8)
        gate  = 1 / (1 + exp(-log(magnitude)))
        out   = clip((x - mean) * scale * gate, -10, 10)

    The three coefficients are evaluated with ``arb`` inside
    ``with_precision(550)`` and converted back with ``safe_float``; the
    per-sample arithmetic itself is ordinary float64.

    Args:
        X_raw: 2-D float array, one column per feature.

    Returns:
        float64 array of the same shape. Columns that have no finite values
        or whose mean magnitude is below 1e-12 are left as all zeros.
        Remaining NaN/+inf/-inf entries are mapped to 0.0 / 5.0 / -5.0.

    NOTE(review): ``scale`` is negative whenever ``magnitude`` < 1 (its log is
    negative), which flips the sign of the column, and it becomes very large
    when ``magnitude`` is close to 1, saturating the output at the clip
    bounds. This mirrors the upstream formula and is left unchanged here.
    """
    n_cols = X_raw.shape[1]
    X_norm = np.zeros_like(X_raw, dtype=np.float64)

    with with_precision(550):
        for j in range(n_cols):
            col = X_raw[:, j]
            col_abs = np.abs(col[np.isfinite(col)])
            if len(col_abs) == 0:
                continue
            col_mag = float(col_abs.mean())
            if col_mag < 1e-12:
                # Degenerate (all ~0) column: leave the zero fill in place.
                continue

            magnitude = arb(str(col_mag))
            log_mag = magnitude.log()
            mean_val = magnitude * arb("0.1")
            scale_val = arb("1.0") / (log_mag + arb("1e-8"))
            gate_val = arb("1.0") / (arb("1.0") + (-log_mag).exp())

            m = safe_float(mean_val)
            s = safe_float(scale_val)
            g = safe_float(gate_val)
            X_norm[:, j] = np.clip((col - m) * s * g, -10, 10)

    # np.clip propagates NaN, so non-finite inputs are scrubbed last.
    X_norm = np.nan_to_num(X_norm, nan=0.0, posinf=5.0, neginf=-5.0)
    return X_norm


X_norm = mcdain_550bit(X_proxies)
print(f" Normalized. std per proxy: {X_norm.std(0).round(4)}")
# Non-excess kurtosis (fourth central moment / std**4) for every proxy column.
kurt_after = [
    round(
        float(
            ((X_norm[:, j] - X_norm[:, j].mean())**4).mean()
            / (X_norm[:, j].std()**4 + 1e-8)
        ),
        2,
    )
    for j in range(X_norm.shape[1])
]
print(f" Kurtosis after normalization: {kurt_after}")

# ── Build precursor labels at multiple thresholds and horizons ─────────────
print("\n" + "="*65)
print("PRECURSOR LABEL SWEEP")
print("="*65)

# inst_w50 thresholds (what percentile constitutes "stress"?)
inst_p80 = np.percentile(inst_w50, 80)  # lenient
inst_p90 = np.percentile(inst_w50, 90)  # moderate
inst_p95 = np.percentile(inst_w50, 95)  # strict
gap_p20 = np.percentile(gap_w50, 20)  # gap collapse (low = collapse)
gap_p10 = np.percentile(gap_w50, 10)  # strict gap collapse
print(f"inst_w50 thresholds: p80={inst_p80:.4f} p90={inst_p90:.4f} p95={inst_p95:.4f}")
print(f"gap_w50 thresholds: p20={gap_p20:.4f} p10={gap_p10:.4f}")


def build_labels(horizon, inst_thresh, gap_thresh):
    """Did eigenspace stress (inst spike AND gap collapse) occur in next K scans?

    Sample ``i`` is labelled 1.0 when, within scans ``i+1 .. i+horizon``,
    some ``inst_w50`` value exceeds ``inst_thresh`` and some ``gap_w50``
    value falls below ``gap_thresh``. The two conditions only have to occur
    inside the same window, not on the same scan.

    Reads the module-level ``N``, ``inst_w50`` and ``gap_w50``.

    Args:
        horizon: Number of future scans (K) in the look-ahead window.
        inst_thresh: Upper threshold on ``inst_w50``.
        gap_thresh: Lower threshold on ``gap_w50``.

    Returns:
        float32 array of length ``N``. The last ``horizon`` entries have no
        complete window and stay 0.0; callers slice them off with ``[:-K]``.
    """
    labels = np.zeros(N, dtype=np.float32)
    for i in range(N - horizon):
        window = slice(i + 1, i + 1 + horizon)
        spike = np.any(inst_w50[window] > inst_thresh)
        collapse = np.any(gap_w50[window] < gap_thresh)
        if spike and collapse:
            labels[i] = 1.0
    return labels


configs = [
    ('K=10 lenient', 10, inst_p80, gap_p20),
    ('K=10 moderate', 10, inst_p90, gap_p10),
    ('K=20 moderate', 20, inst_p90, gap_p10),
    ('K=20 strict', 20, inst_p95, gap_p10),
    ('K=50 strict', 50, inst_p95, gap_p10),
]

results = []
for cfg_name, K, it, gt in configs:
    y = build_labels(K, it, gt)
    pos_rate = y.mean()
    print(f"\n [{cfg_name}] K={K} inst>{it:.3f} gap<{gt:.3f} pos_rate={pos_rate*100:.1f}%")

    # Skip degenerate
    if pos_rate < 0.02 or pos_rate > 0.60:
        print(" Skipping (pos_rate out of range)")
        continue

    # Reset per-config metrics so a failed fit can never report the value
    # left over from an earlier configuration.
    auc_lr = 0.0
    auc_knn = 0.0

    # Drop the trailing K samples, which have no complete look-ahead window.
    X_head = X_norm[:-K] if K > 0 else X_norm
    y_head = y[:-K] if K > 0 else y

    # ── Evaluate each proxy directly ─────────────────────────────────────
    print(" Direct proxy AUC (no model):")
    best_proxy_auc = 0
    for j, pname in enumerate(proxy_names):
        px = X_head[:, j]
        valid = np.isfinite(px) & np.isfinite(y_head)
        if valid.sum() < 100:
            continue
        try:
            auc = roc_auc_score(y_head[valid], px[valid])
        except ValueError as ex:
            # Raised e.g. when only one class is present among valid rows.
            print(f" {pname:<30} AUC skipped: {ex}")
            continue
        auc = max(auc, 1-auc)  # flip if < 0.5
        best_proxy_auc = max(best_proxy_auc, auc)
        if auc > 0.52:
            print(f" {pname:<30} AUC={auc:.4f} *")
        else:
            print(f" {pname:<30} AUC={auc:.4f}")

    # ── Logistic regression on all proxies ───────────────────────────────
    valid = np.isfinite(X_head).all(1) & np.isfinite(y_head)
    Xf, yf = X_head[valid], y_head[valid]
    if len(Xf) < 200:
        continue

    # Single chronological hold-out: first 75% train, last 25% validation.
    # Done outside the try blocks so both models always see this config's split.
    n_val = len(Xf) // 4
    X_train, X_val = Xf[:-n_val], Xf[-n_val:]
    y_train, y_val = yf[:-n_val], yf[-n_val:]

    try:
        lr = LogisticRegression(class_weight='balanced', max_iter=500, C=0.1)
        lr.fit(X_train, y_train)
        preds = lr.predict_proba(X_val)[:, 1]
        lr_auc = roc_auc_score(y_val, preds)
        lr_auc = max(lr_auc, 1-lr_auc)
        ap_lr = average_precision_score(y_val, preds)
        print(f" LogReg (OOS): AUC={lr_auc:.4f} AvgPrecision={ap_lr:.4f}")
        auc_lr = lr_auc
    except Exception as ex:  # best-effort sweep: report and move on
        print(f" LogReg failed: {ex}")

    # ── k-NN (captures non-linear manifold structure) ─────────────────────
    try:
        knn = KNeighborsClassifier(n_neighbors=15, metric='euclidean')
        knn.fit(X_train, y_train)
        preds_knn = knn.predict_proba(X_val)[:, 1]
        knn_auc = roc_auc_score(y_val, preds_knn)
        knn_auc = max(knn_auc, 1-knn_auc)
        print(f" k-NN (k=15): AUC={knn_auc:.4f}")
        auc_knn = knn_auc
    except Exception as ex:  # best-effort sweep: report and move on
        print(f" kNN failed: {ex}")

    results.append((cfg_name, K, pos_rate, best_proxy_auc, auc_lr, auc_knn))

# ── Temporal structure: HOW MANY SCANS AHEAD does the signal lead? ─────────
print("\n" + "="*65)
print("TEMPORAL LEAD STRUCTURE: proxy_B vs future inst/gap (by horizon)")
print("="*65)
print(f" {'Horizon':>10} {'AUC(B)':>8} {'AUC(C)':>8} {'pos_rate':>9}")
for K in [1, 2, 5, 10, 20, 30, 50, 100]:
    y_k = build_labels(K, inst_p90, gap_p10)
    pos_k = y_k.mean()
    if pos_k < 0.01 or pos_k > 0.80:
        continue
    pB = X_norm[:-K, 1]  # proxy_B normalized
    pC = X_norm[:-K, 2]  # proxy_C normalized
    yy = y_k[:-K]
    valid = np.isfinite(pB) & np.isfinite(pC) & np.isfinite(yy)
    if valid.sum() < 100:
        continue
    try:
        aB = roc_auc_score(yy[valid], pB[valid])
        aC = roc_auc_score(yy[valid], pC[valid])
    except ValueError as ex:
        # Raised e.g. when only one class is present among valid rows.
        print(f" K={K:>3} scans ahead: AUC skipped: {ex}")
        continue
    aB = max(aB, 1-aB)
    aC = max(aC, 1-aC)
    print(f" K={K:>3} scans ahead: AUC(B)={aB:.4f} AUC(C)={aC:.4f} pos={pos_k*100:.1f}%")

# ── 512-bit DVAE question: variance per proxy before/after normalization ───
# NOTE(review): this section title says 512-bit while the banner printed below
# (and the rest of the script) says 550-bit — confirm which is intended.
print("\n" + "="*65)
print("550-BIT FLINT EFFECT: variance recovery in heavy-tailed proxies")
print("="*65)
for j, pname in enumerate(proxy_names):
    raw = X_proxies[:, j]
    norm = X_norm[:, j]
    # Non-excess kurtosis: fourth central moment over std**4.
    kurt_raw = float(((raw-raw.mean())**4).mean() / (raw.std()**4 + 1e-8))
    kurt_norm = float(((norm-norm.mean())**4).mean() / (norm.std()**4 + 1e-8))
    # Fraction of samples that would be clipped at ±3σ in float64 z-score
    z64 = (raw - raw.mean()) / (raw.std() + 1e-8)
    clip_pct = (np.abs(z64) > 3).mean() * 100
    print(f" {pname:<32} kurt_raw={kurt_raw:8.1f} kurt_norm={kurt_norm:6.2f} "
          f"tail_samples={clip_pct:.1f}%_beyond_3sigma")

print("\nDone.")