Files
DOLPHIN/nautilus_dolphin/dvae/flint_precursor_sweep.py
hjnormey 01c19662cb initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
2026-04-21 16:58:38 +02:00

226 lines
9.4 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
SILOQY 550-bit Precursor Sweep — NO MODIFICATIONS TO UPSTREAM CODE.
Runs on full 16K eigen corpus, tests multiple:
- Precursor label thresholds (rare extreme events)
- Horizons (K=5, 10, 20, 50 scans ahead)
- ML approaches: Logistic, Ridge, k-NN, threshold-only baseline
Reports AUC, Precision@TopDecile, and direct proxy predictivity.
"""
import sys, os
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
sys.path.insert(0, os.path.dirname(__file__))
sys.path.insert(0, r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict")
import numpy as np
from pathlib import Path
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import TimeSeriesSplit
HERE = Path(__file__).parent
# ── Load corpus ────────────────────────────────────────────────────────────
print("Loading corpus (16K eigen samples)...")
from corpus_builder import DolphinCorpus, OFF, T1 as T1_DIM
corpus = DolphinCorpus.load(str(HERE / 'corpus_cache.npz'))
idx_mask = corpus.mask[:, 1]
X_e = corpus.X[idx_mask]
t1 = X_e[:, OFF[1]:OFF[1]+T1_DIM].copy()
N = len(t1)
print(f"N={N} samples")
# ── Feature extraction ─────────────────────────────────────────────────────
# Named column views into the T1 slab. Column indices follow the upstream
# layout; w50/w150/w300/w750 are presumably rolling-window lengths — confirm
# against the corpus builder.
lmax_w50 = t1[:, 0]
vel_w50 = t1[:, 1]
gap_w50 = t1[:, 2]
inst_w50 = t1[:, 3]
vel_w150 = t1[:, 6]
inst_w150= t1[:, 8]
vel_w300 = t1[:, 11]
gap_w300 = t1[:, 12]
inst_w300= t1[:, 13]
vel_w750 = t1[:, 16]
# Candidate precursor proxies. Expressions are kept exactly as tuned —
# including operand order — so float results are bit-identical.
proxy_A = -0.674*vel_w750 - 0.357*vel_w300 + 0.421*inst_w50
proxy_B = inst_w50 - vel_w750
proxy_C = vel_w50 - vel_w750
proxy_D = inst_w50 * (-vel_w750)
proxy_E = (inst_w50 - inst_w300) - (vel_w50 - vel_w750)
# Stack into an (N, 5) matrix, one column per proxy, parallel to proxy_names.
X_proxies = np.column_stack([proxy_A, proxy_B, proxy_C, proxy_D, proxy_E])
proxy_names = ['A(linear)', 'B(inst-vel750)', 'C(vel50-vel750,k=3798)', 'D(inst*-vel750)', 'E(dinst-dvel)']
# ── 550-bit MCDAIN normalization (from flint_dvae_kernel.py, read-only) ────
print("\nApplying 550-bit MCDAIN normalization to proxies...")
# Upstream kernel supplies the arb bignum type, a safe float coercion,
# a FLINT-availability flag, and the precision context manager.
# Imported read-only — this script never modifies upstream code.
from SILOQY_NN_Kernel_COMPLETE6 import arb, safe_float, FLINT_AVAILABLE, with_precision
def mcdain_550bit(X_raw):
    """Read-only implementation of MCDAIN analytical logic at 550-bit.

    Per column: derive mean/scale/gate terms from the column's mean absolute
    magnitude using 550-bit ``arb`` arithmetic, apply the affine squash
    ``(x - m) * s * g`` and clip to [-10, 10]. Columns that are empty after
    finite-filtering, or whose mean magnitude is below 1e-12, stay all-zero.

    Parameters
    ----------
    X_raw : np.ndarray, shape (n_samples, n_features)

    Returns
    -------
    np.ndarray (float64, same shape) — normalized, with residual NaN/inf
    replaced by 0.0 / +5.0 / -5.0.
    """
    # Only the column count is needed (original also unpacked an unused `rows`).
    cols = X_raw.shape[1]
    X_norm = np.zeros_like(X_raw, dtype=np.float64)
    with with_precision(550):
        for j in range(cols):
            col = X_raw[:, j]
            col_abs = np.abs(col[np.isfinite(col)])
            if len(col_abs) == 0:
                continue
            # Hoisted: the original evaluated col_abs.mean() twice.
            abs_mean = col_abs.mean()
            if abs_mean < 1e-12:
                continue
            magnitude = arb(str(float(abs_mean)))
            log_mag = magnitude.log()
            mean_val = magnitude * arb("0.1")
            # 1e-8 guards against log(1) = 0 blowing up the scale.
            scale_val = arb("1.0") / (log_mag + arb("1e-8"))
            # Logistic gate: sigmoid(log_mag).
            gate_val = arb("1.0") / (arb("1.0") + (-log_mag).exp())
            m = safe_float(mean_val)
            s = safe_float(scale_val)
            g = safe_float(gate_val)
            X_norm[:, j] = np.clip((X_raw[:, j] - m) * s * g, -10, 10)
    # Final sanitize: collapse any residual NaN/inf to finite sentinels.
    X_norm = np.nan_to_num(X_norm, nan=0.0, posinf=5.0, neginf=-5.0)
    return X_norm
# Normalize the five proxy columns through the 550-bit pipeline.
X_norm = mcdain_550bit(X_proxies)
print(f" Normalized. std per proxy: {X_norm.std(0).round(4)}")
# Raw (non-excess) kurtosis per normalized proxy — quick tail-heaviness check.
print(f" Kurtosis after normalization: {[round(float(((X_norm[:,j]-X_norm[:,j].mean())**4).mean()/(X_norm[:,j].std()**4+1e-8)),2) for j in range(5)]}")
# ── Build precursor labels at multiple thresholds and horizons ─────────────
print("\n" + "="*65)
print("PRECURSOR LABEL SWEEP")
print("="*65)
# inst_w50 thresholds (what percentile constitutes "stress"?)
inst_p80 = np.percentile(inst_w50, 80) # lenient
inst_p90 = np.percentile(inst_w50, 90) # moderate
inst_p95 = np.percentile(inst_w50, 95) # strict
# gap thresholds are LOWER percentiles: a collapsing gap means a small value.
gap_p20 = np.percentile(gap_w50, 20) # gap collapse (low = collapse)
gap_p10 = np.percentile(gap_w50, 10) # strict gap collapse
print(f"inst_w50 thresholds: p80={inst_p80:.4f} p90={inst_p90:.4f} p95={inst_p95:.4f}")
print(f"gap_w50 thresholds: p20={gap_p20:.4f} p10={gap_p10:.4f}")
def build_labels(horizon, inst_thresh, gap_thresh):
    """Binary precursor labels over the module-level inst_w50/gap_w50 series.

    labels[i] = 1.0 when, within the next `horizon` scans (i+1 .. i+horizon),
    inst_w50 exceeds inst_thresh AND gap_w50 drops below gap_thresh — the two
    events need not fall on the same future scan. The trailing `horizon`
    entries have no full look-ahead window and stay 0.
    """
    # Precompute per-scan event flags once; the loop then only slices them.
    inst_spike = inst_w50 > inst_thresh
    gap_collapse = gap_w50 < gap_thresh
    labels = np.zeros(N, dtype=np.float32)
    for i in range(N - horizon):
        win = slice(i + 1, i + 1 + horizon)
        if inst_spike[win].any() and gap_collapse[win].any():
            labels[i] = 1.0
    return labels
# Sweep grid: (name, horizon K in scans, inst_w50 threshold, gap_w50 threshold).
configs = [
    ('K=10 lenient', 10, inst_p80, gap_p20),
    ('K=10 moderate', 10, inst_p90, gap_p10),
    ('K=20 moderate', 20, inst_p90, gap_p10),
    ('K=20 strict', 20, inst_p95, gap_p10),
    ('K=50 strict', 50, inst_p95, gap_p10),
]
results = []
for cfg_name, K, it, gt in configs:
    y = build_labels(K, it, gt)
    pos_rate = y.mean()
    print(f"\n [{cfg_name}] K={K} inst>{it:.3f} gap<{gt:.3f} pos_rate={pos_rate*100:.1f}%")
    # Skip degenerate label sets: too rare to learn from, or not "rare events".
    if pos_rate < 0.02 or pos_rate > 0.60:
        print(f" Skipping (pos_rate out of range)")
        continue
    # BUGFIX: reset per-config metrics here. The original recorded
    # `auc_lr if 'auc_lr' in dir() else 0` (likewise auc_knn), which silently
    # carried STALE values from an earlier config into `results` whenever a
    # model failed or this config bailed out partway.
    best_proxy_auc = 0.0
    auc_lr = 0.0
    auc_knn = 0.0
    # ── Evaluate each proxy directly ─────────────────────────────────────
    print(f" Direct proxy AUC (no model):")
    for j, pname in enumerate(proxy_names):
        # Drop the last K rows — their labels have no full look-ahead window.
        px = X_norm[:-K, j] if K > 0 else X_norm[:, j]
        yy = y[:-K] if K > 0 else y
        valid = np.isfinite(px) & np.isfinite(yy)
        if valid.sum() < 100:
            continue
        try:
            auc = roc_auc_score(yy[valid], px[valid])
            auc = max(auc, 1-auc) # flip if < 0.5 (direction-free score)
            best_proxy_auc = max(best_proxy_auc, auc)
            if auc > 0.52:
                print(f" {pname:<30} AUC={auc:.4f} *")
            else:
                print(f" {pname:<30} AUC={auc:.4f}")
        except Exception:
            # roc_auc_score raises on single-class slices; treat as no signal.
            pass
    # ── Logistic regression on all proxies ───────────────────────────────
    Xf = X_norm[:-K]
    yf = y[:-K]
    valid = np.isfinite(Xf).all(1) & np.isfinite(yf)
    Xf, yf = Xf[valid], yf[valid]
    if len(Xf) < 200:
        continue
    # Chronological hold-out: train on the first 75%, validate on the last 25%
    # (the original comment said "3-fold split", which it never was).
    # BUGFIX: split computed OUTSIDE the LogReg try-block so the k-NN section
    # can never train on a stale X_train/X_val left over from a previous config.
    n_val = len(Xf) // 4
    X_train, X_val = Xf[:-n_val], Xf[-n_val:]
    y_train, y_val = yf[:-n_val], yf[-n_val:]
    try:
        lr = LogisticRegression(class_weight='balanced', max_iter=500, C=0.1)
        lr.fit(X_train, y_train)
        preds = lr.predict_proba(X_val)[:, 1]
        auc_lr = roc_auc_score(y_val, preds)
        auc_lr = max(auc_lr, 1-auc_lr)
        ap_lr = average_precision_score(y_val, preds)
        print(f" LogReg (OOS): AUC={auc_lr:.4f} AvgPrecision={ap_lr:.4f}")
    except Exception as ex:
        print(f" LogReg failed: {ex}")
        auc_lr = 0.0
    # ── k-NN (captures non-linear manifold structure) ─────────────────────
    try:
        knn = KNeighborsClassifier(n_neighbors=15, metric='euclidean')
        knn.fit(X_train, y_train)
        preds_knn = knn.predict_proba(X_val)[:, 1]
        auc_knn = roc_auc_score(y_val, preds_knn)
        auc_knn = max(auc_knn, 1-auc_knn)
        print(f" k-NN (k=15): AUC={auc_knn:.4f}")
    except Exception as ex:
        print(f" kNN failed: {ex}")
        auc_knn = 0.0
    results.append((cfg_name, K, pos_rate, best_proxy_auc, auc_lr, auc_knn))
# ── Temporal structure: HOW MANY SCANS AHEAD does the signal lead? ─────────
print("\n" + "="*65)
print("TEMPORAL LEAD STRUCTURE: proxy_B vs future inst/gap (by horizon)")
print("="*65)
print(f" {'Horizon':>10} {'AUC(B)':>8} {'AUC(C)':>8} {'pos_rate':>9}")
for K in (1, 2, 5, 10, 20, 30, 50, 100):
    y_k = build_labels(K, inst_p90, gap_p10)
    rate = y_k.mean()
    # Only horizons whose positive rate is neither vanishing nor saturated.
    if not (0.01 <= rate <= 0.80):
        continue
    scores_b = X_norm[:-K, 1] # proxy_B normalized
    scores_c = X_norm[:-K, 2] # proxy_C normalized
    target = y_k[:-K]
    ok = np.isfinite(scores_b) & np.isfinite(scores_c) & np.isfinite(target)
    if ok.sum() < 100:
        continue
    try:
        aB = roc_auc_score(target[ok], scores_b[ok])
        aB = max(aB, 1-aB)
        aC = roc_auc_score(target[ok], scores_c[ok])
        aC = max(aC, 1-aC)
        print(f" K={K:>3} scans ahead: AUC(B)={aB:.4f} AUC(C)={aC:.4f} pos={y_k.mean()*100:.1f}%")
    except Exception:
        # Single-class slice makes AUC undefined; skip this horizon quietly.
        pass
# ── 550-bit DVAE question: variance per proxy before/after normalization ───
print("\n" + "="*65)
print("550-BIT FLINT EFFECT: variance recovery in heavy-tailed proxies")
print("="*65)

def _kurtosis(v):
    """Raw (non-excess) kurtosis with a 1e-8 guard on the std denominator."""
    centred = v - v.mean()
    return float((centred**4).mean() / (v.std()**4 + 1e-8))

for j, pname in enumerate(proxy_names):
    raw = X_proxies[:, j]
    norm = X_norm[:, j]
    kurt_raw = _kurtosis(raw)
    kurt_norm = _kurtosis(norm)
    # Fraction of samples that would be clipped at ±3σ in float64 z-score
    z64 = (raw - raw.mean()) / (raw.std() + 1e-8)
    clip_pct = (np.abs(z64) > 3).mean() * 100
    print(f" {pname:<32} kurt_raw={kurt_raw:8.1f} kurt_norm={kurt_norm:6.2f} "
          f"tail_samples={clip_pct:.1f}%_beyond_3sigma")
print("\nDone.")