# flint_precursor_sweep.py — DOLPHIN/nautilus_dolphin/dvae (web-viewer header removed)
"""
SILOQY 550-bit Precursor Sweep NO MODIFICATIONS TO UPSTREAM CODE.
Runs on full 16K eigen corpus, tests multiple:
- Precursor label thresholds (rare extreme events)
- Horizons (K=5, 10, 20, 50 scans ahead)
- ML approaches: Logistic, Ridge, k-NN, threshold-only baseline
Reports AUC, Precision@TopDecile, and direct proxy predictivity.
"""
import sys, os
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
sys.path.insert(0, os.path.dirname(__file__))
sys.path.insert(0, r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict")
import numpy as np
from pathlib import Path
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import TimeSeriesSplit
HERE = Path(__file__).parent
# ── Load corpus ────────────────────────────────────────────────────────────
print("Loading corpus (16K eigen samples)...")
# Deferred project import: corpus_builder sits next to this script (its
# directory was prepended to sys.path above).
from corpus_builder import DolphinCorpus, OFF, T1 as T1_DIM
corpus = DolphinCorpus.load(str(HERE / 'corpus_cache.npz'))
# Column 1 of the mask selects rows with a valid T1 feature block —
# NOTE(review): inferred from the slice below; confirm mask column semantics.
idx_mask = corpus.mask[:, 1]
X_e = corpus.X[idx_mask]
# Slice out the T1 feature group; .copy() detaches it from the corpus array.
t1 = X_e[:, OFF[1]:OFF[1]+T1_DIM].copy()
N = len(t1)  # number of usable samples
print(f"N={N} samples")
# ── Feature extraction ─────────────────────────────────────────────────────
# Named views into the T1 feature block. Column indices are fixed by the
# upstream layout: per-window eigen velocity, instability, gap, lambda-max.
(vel_w50, vel_w150, vel_w300, vel_w750) = (t1[:, c] for c in (1, 6, 11, 16))
(inst_w50, inst_w150, inst_w300) = (t1[:, c] for c in (3, 8, 13))
gap_w50, gap_w300 = t1[:, 2], t1[:, 12]
lmax_w50 = t1[:, 0]
# Candidate precursor proxies built from the raw features.
proxy_A = -0.674*vel_w750 - 0.357*vel_w300 + 0.421*inst_w50  # fitted linear combo
proxy_B = inst_w50 - vel_w750
proxy_C = vel_w50 - vel_w750
proxy_D = inst_w50 * (-vel_w750)
proxy_E = (inst_w50 - inst_w300) - (vel_w50 - vel_w750)
X_proxies = np.column_stack([proxy_A, proxy_B, proxy_C, proxy_D, proxy_E])
proxy_names = ['A(linear)', 'B(inst-vel750)', 'C(vel50-vel750,k=3798)', 'D(inst*-vel750)', 'E(dinst-dvel)']
# ── 550-bit MCDAIN normalization (from flint_dvae_kernel.py, read-only) ────
print("\nApplying 550-bit MCDAIN normalization to proxies...")
from SILOQY_NN_Kernel_COMPLETE6 import arb, safe_float, FLINT_AVAILABLE, with_precision
def mcdain_550bit(X_raw):
    """Re-derive the MCDAIN column normalization analytically at 550-bit
    precision (read-only port of the logic in flint_dvae_kernel.py).

    Per column: estimate the mean absolute magnitude over finite entries,
    then build a shift (m), a log-based scale (s) and a sigmoid gate (g)
    using FLINT arbs, mapping the column through clip((x - m)*s*g, -10, 10).
    Columns that are empty or ~zero-magnitude are left at 0.
    """
    n_rows, n_cols = X_raw.shape
    out = np.zeros_like(X_raw, dtype=np.float64)
    with with_precision(550):
        for col_idx in range(n_cols):
            column = X_raw[:, col_idx]
            finite_abs = np.abs(column[np.isfinite(column)])
            # Skip degenerate columns (all-NaN or effectively zero).
            if len(finite_abs) == 0 or finite_abs.mean() < 1e-12:
                continue
            magnitude = arb(str(float(finite_abs.mean())))
            log_mag = magnitude.log()
            shift = safe_float(magnitude * arb("0.1"))
            scale = safe_float(arb("1.0") / (log_mag + arb("1e-8")))
            gate = safe_float(arb("1.0") / (arb("1.0") + (-log_mag).exp()))
            out[:, col_idx] = np.clip((column - shift) * scale * gate, -10, 10)
    # Defensive: replace any residual non-finite values before returning.
    return np.nan_to_num(out, nan=0.0, posinf=5.0, neginf=-5.0)
# Normalize the 5 proxy columns and report dispersion / tail behavior.
X_norm = mcdain_550bit(X_proxies)
print(f" Normalized. std per proxy: {X_norm.std(0).round(4)}")
# Fourth standardized moment E[(x-mu)^4]/sigma^4 per column (3.0 == Gaussian).
print(f" Kurtosis after normalization: {[round(float(((X_norm[:,j]-X_norm[:,j].mean())**4).mean()/(X_norm[:,j].std()**4+1e-8)),2) for j in range(5)]}")
# ── Build precursor labels at multiple thresholds and horizons ─────────────
print("\n" + "="*65)
print("PRECURSOR LABEL SWEEP")
print("="*65)
# Stress thresholds on inst_w50 (lenient / moderate / strict percentiles)
# and gap-collapse thresholds on gap_w50 (a LOW gap signals collapse).
inst_p80, inst_p90, inst_p95 = np.percentile(inst_w50, [80, 90, 95])
gap_p20, gap_p10 = np.percentile(gap_w50, [20, 10])
print(f"inst_w50 thresholds: p80={inst_p80:.4f} p90={inst_p90:.4f} p95={inst_p95:.4f}")
print(f"gap_w50 thresholds: p20={gap_p20:.4f} p10={gap_p10:.4f}")
def build_labels(horizon, inst_thresh, gap_thresh, inst_series=None, gap_series=None):
    """Label sample i as 1.0 if eigenspace stress occurs within the next
    `horizon` scans: some future inst value exceeds `inst_thresh` AND some
    future gap value drops below `gap_thresh` (not necessarily the same scan).

    Args:
        horizon: number of scans to look ahead (K >= 1).
        inst_thresh: instability spike threshold (strict greater-than).
        gap_thresh: gap collapse threshold (strict less-than).
        inst_series / gap_series: optional 1-D arrays to label; default to
            the module-level inst_w50 / gap_w50 corpus features.

    Returns:
        float32 0/1 label array, same length as the series. The last
        `horizon` entries are always 0 (their future is unobserved).
    """
    inst = inst_w50 if inst_series is None else np.asarray(inst_series)
    gap = gap_w50 if gap_series is None else np.asarray(gap_series)
    n = len(inst)
    labels = np.zeros(n, dtype=np.float32)
    if horizon <= 0 or n <= horizon:
        return labels
    # Vectorized forward scan: row i of each window view covers scans
    # i+1 .. i+horizon (replaces the original O(N*K) Python loop).
    win_inst = np.lib.stride_tricks.sliding_window_view(inst, horizon)[1:]
    win_gap = np.lib.stride_tricks.sliding_window_view(gap, horizon)[1:]
    hit = (win_inst > inst_thresh).any(axis=1) & (win_gap < gap_thresh).any(axis=1)
    labels[:n - horizon] = hit.astype(np.float32)
    return labels
configs = [
    ('K=10 lenient', 10, inst_p80, gap_p20),
    ('K=10 moderate', 10, inst_p90, gap_p10),
    ('K=20 moderate', 20, inst_p90, gap_p10),
    ('K=20 strict', 20, inst_p95, gap_p10),
    ('K=50 strict', 50, inst_p95, gap_p10),
]
results = []
for cfg_name, K, it, gt in configs:
    y = build_labels(K, it, gt)
    pos_rate = y.mean()
    print(f"\n [{cfg_name}] K={K} inst>{it:.3f} gap<{gt:.3f} pos_rate={pos_rate*100:.1f}%")
    # Skip degenerate label sets (too rare or too common to learn from).
    if pos_rate < 0.02 or pos_rate > 0.60:
        print(f" Skipping (pos_rate out of range)")
        continue
    # BUG FIX: the original recorded `auc_lr if 'auc_lr' in dir() else 0`,
    # which silently reused STALE values from a previous config whenever a
    # model failed (and could leak a previous X_train into the k-NN fit).
    # Reset everything explicitly per iteration instead.
    auc_lr = 0.0
    auc_knn = 0.0
    X_train = X_val = y_train = y_val = None
    # ── Evaluate each proxy directly ─────────────────────────────────────
    print(f" Direct proxy AUC (no model):")
    best_proxy_auc = 0
    for j, pname in enumerate(proxy_names):
        px = X_norm[:-K, j] if K > 0 else X_norm[:, j]
        yy = y[:-K] if K > 0 else y
        valid = np.isfinite(px) & np.isfinite(yy)
        if valid.sum() < 100:
            continue
        try:
            auc = roc_auc_score(yy[valid], px[valid])
            auc = max(auc, 1-auc)  # direction-agnostic: flip if < 0.5
            best_proxy_auc = max(best_proxy_auc, auc)
            if auc > 0.52:
                print(f" {pname:<30} AUC={auc:.4f} *")
            else:
                print(f" {pname:<30} AUC={auc:.4f}")
        except Exception:
            pass
    # ── Logistic regression on all proxies ───────────────────────────────
    Xf = X_norm[:-K]
    yf = y[:-K]
    valid = np.isfinite(Xf).all(1) & np.isfinite(yf)
    Xf, yf = Xf[valid], yf[valid]
    if len(Xf) < 200:
        continue
    try:
        # Chronological split: last 25% held out (no shuffling — time series).
        n_val = len(Xf) // 4
        X_train, X_val = Xf[:-n_val], Xf[-n_val:]
        y_train, y_val = yf[:-n_val], yf[-n_val:]
        lr = LogisticRegression(class_weight='balanced', max_iter=500, C=0.1)
        lr.fit(X_train, y_train)
        preds = lr.predict_proba(X_val)[:, 1]
        auc_lr = roc_auc_score(y_val, preds)
        auc_lr = max(auc_lr, 1-auc_lr)
        ap_lr = average_precision_score(y_val, preds)
        print(f" LogReg (OOS): AUC={auc_lr:.4f} AvgPrecision={ap_lr:.4f}")
    except Exception as ex:
        auc_lr = 0.0  # do not report a partial/stale score
        print(f" LogReg failed: {ex}")
    # ── k-NN (captures non-linear manifold structure) ─────────────────────
    if X_train is not None:
        try:
            knn = KNeighborsClassifier(n_neighbors=15, metric='euclidean')
            knn.fit(X_train, y_train)
            preds_knn = knn.predict_proba(X_val)[:, 1]
            auc_knn = roc_auc_score(y_val, preds_knn)
            auc_knn = max(auc_knn, 1-auc_knn)
            print(f" k-NN (k=15): AUC={auc_knn:.4f}")
        except Exception as ex:
            auc_knn = 0.0
            print(f" kNN failed: {ex}")
    results.append((cfg_name, K, pos_rate, best_proxy_auc, auc_lr, auc_knn))
# ── Temporal structure: HOW MANY SCANS AHEAD does the signal lead? ─────────
print("\n" + "="*65)
print("TEMPORAL LEAD STRUCTURE: proxy_B vs future inst/gap (by horizon)")
print("="*65)
print(f" {'Horizon':>10} {'AUC(B)':>8} {'AUC(C)':>8} {'pos_rate':>9}")
for K in [1, 2, 5, 10, 20, 30, 50, 100]:
    y_k = build_labels(K, inst_p90, gap_p10)
    pos = y_k.mean()
    # Only evaluate horizons with a non-degenerate positive rate.
    if not (0.01 <= pos <= 0.80):
        continue
    # Align proxies at scan i with labels describing scans i+1 .. i+K.
    pB, pC = X_norm[:-K, 1], X_norm[:-K, 2]  # normalized proxies B and C
    yy = y_k[:-K]
    valid = np.isfinite(pB) & np.isfinite(pC) & np.isfinite(yy)
    if valid.sum() < 100:
        continue
    try:
        aB = roc_auc_score(yy[valid], pB[valid])
        aC = roc_auc_score(yy[valid], pC[valid])
        # Direction-agnostic: flip any AUC below 0.5.
        aB, aC = max(aB, 1 - aB), max(aC, 1 - aC)
        print(f" K={K:>3} scans ahead: AUC(B)={aB:.4f} AUC(C)={aC:.4f} pos={pos*100:.1f}%")
    except Exception:
        pass
# ── 512-bit DVAE question: variance per proxy before/after normalization ───
print("\n" + "="*65)
print("550-BIT FLINT EFFECT: variance recovery in heavy-tailed proxies")
print("="*65)
# Walk raw and normalized proxy columns in lockstep.
for pname, raw, norm in zip(proxy_names, X_proxies.T, X_norm.T):
    # Fourth standardized moment (kurtosis); large values => heavy tails.
    kurt_raw = float(((raw - raw.mean())**4).mean() / (raw.std()**4 + 1e-8))
    kurt_norm = float(((norm - norm.mean())**4).mean() / (norm.std()**4 + 1e-8))
    # Share of samples that a plain float64 z-score would push past +/-3 sigma.
    z64 = (raw - raw.mean()) / (raw.std() + 1e-8)
    clip_pct = (np.abs(z64) > 3).mean() * 100
    print(f" {pname:<32} kurt_raw={kurt_raw:8.1f} kurt_norm={kurt_norm:6.2f} "
          f"tail_samples={clip_pct:.1f}%_beyond_3sigma")
print("\nDone.")