194 lines
8.0 KiB
Python
194 lines
8.0 KiB
Python
|
|
"""
|
|||
|
|
proto_v2_query.py — Back-of-envelope z-space probe for convnext_model_v2.json
|
|||
|
|
|
|||
|
|
Queries the current best checkpoint against the 56-day backtest period
|
|||
|
|
(Dec 31 2025 – Feb 25 2026) to assess signal quality vs the ep=17 baseline.
|
|||
|
|
|
|||
|
|
Reports:
|
|||
|
|
1. z_active, z_post_std (latent health)
|
|||
|
|
2. proxy_B dim + r (encoding quality)
|
|||
|
|
3. Calibration: is z_proxy_B still always negative for this period?
|
|||
|
|
4. Split test: top-25% vs bottom-25% proxy_B days — does z separate them?
|
|||
|
|
|
|||
|
|
ExF columns (dvol_btc, fng, funding_btc) are zero-filled — same as exp13 fallback.
|
|||
|
|
Safe to run while training is still in progress (read-only, no GPU).
|
|||
|
|
"""
|
|||
|
|
import os, sys, json, glob
|
|||
|
|
import io
|
|||
|
|
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
|
|||
|
|
|
|||
|
|
import numpy as np
|
|||
|
|
import pandas as pd
|
|||
|
|
|
|||
|
|
# Project root: three directory levels up from this file.
ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
DVAE_DIR = os.path.join(ROOT, 'nautilus_dolphin', 'dvae')
sys.path.insert(0, DVAE_DIR)  # make `convnext_dvae` importable below

# Checkpoints: v2 = current best under evaluation; ep17 = production baseline.
MODEL_V2 = os.path.join(DVAE_DIR, 'convnext_model_v2.json')
MODEL_EP17 = os.path.join(DVAE_DIR, 'convnext_model.json')
KLINES_DIR = os.path.join(ROOT, 'vbt_cache_klines')  # per-day parquet files

# Feature columns read from each parquet file. Order matters: the probe
# builder uses column index 5 (instability_50) minus index 3
# (v750_lambda_max_velocity) to form the proxy_B channel.
FEATURE_COLS = [
    'v50_lambda_max_velocity', 'v150_lambda_max_velocity',
    'v300_lambda_max_velocity', 'v750_lambda_max_velocity',
    'vel_div', 'instability_50', 'instability_150',
]
EXF_COLS = ['dvol_btc', 'fng', 'funding_btc']  # zero-filled
T_WIN = 32  # window length in timesteps fed to the encoder

# 56-day backtest window
DATE_START = '2025-12-31'
DATE_END = '2026-02-25'
|
|||
|
|
|
|||
|
|
# ── load model ───────────────────────────────────────────────────────────────
|
|||
|
|
from convnext_dvae import ConvNeXtVAE
|
|||
|
|
|
|||
|
|
def load_model(path):
    """Instantiate a ConvNeXtVAE from a JSON checkpoint file.

    Returns a 4-tuple ``(model, norm_mean, norm_std, meta)``; the norm
    arrays are ``None`` when the checkpoint carries no normalisation stats.
    """
    with open(path) as fh:
        meta = json.load(fh)

    arch = meta.get('architecture', {})
    model = ConvNeXtVAE(
        C_in=arch['C_in'],
        T_in=arch['T_in'],
        z_dim=arch['z_dim'],
        base_ch=arch['base_ch'],
        n_blocks=arch.get('n_blocks', 3),
        seed=42,
    )
    model.load(path)

    def _stat(key):
        # Normalisation stats are optional in older checkpoints.
        return np.array(meta[key]) if key in meta else None

    return model, _stat('norm_mean'), _stat('norm_std'), meta
|
|||
|
|
|
|||
|
|
print(f"Loading v2 checkpoint...")
|
|||
|
|
model_v2, nm_v2, ns_v2, meta_v2 = load_model(MODEL_V2)
|
|||
|
|
print(f" ep={meta_v2.get('epoch')} val_loss={meta_v2.get('val_loss',0):.5f}")
|
|||
|
|
|
|||
|
|
ep17_exists = os.path.exists(MODEL_EP17)
|
|||
|
|
if ep17_exists:
|
|||
|
|
print(f"Loading ep=17 baseline for comparison...")
|
|||
|
|
model_17, nm_17, ns_17, meta_17 = load_model(MODEL_EP17)
|
|||
|
|
print(f" ep={meta_17.get('epoch')} val_loss={meta_17.get('val_loss',0):.5f}")
|
|||
|
|
|
|||
|
|
# ── build probe set from 56-day window ───────────────────────────────────────
|
|||
|
|
print(f"\nBuilding probe windows from {DATE_START} to {DATE_END}...")
|
|||
|
|
files = sorted(f for f in glob.glob(os.path.join(KLINES_DIR, '*.parquet')))
|
|||
|
|
# filter to date range
|
|||
|
|
period_files = [f for f in files
|
|||
|
|
if DATE_START <= os.path.basename(f)[:10] <= DATE_END]
|
|||
|
|
print(f" {len(period_files)} klines files in period")
|
|||
|
|
|
|||
|
|
rng = np.random.default_rng(42)
|
|||
|
|
probes_raw, proxy_B_vals, file_dates = [], [], []
|
|||
|
|
|
|||
|
|
step = max(1, len(period_files) // 60) # ~60 probes across period
|
|||
|
|
for f in period_files[::step]:
|
|||
|
|
try:
|
|||
|
|
df = pd.read_parquet(f, columns=FEATURE_COLS).dropna()
|
|||
|
|
if len(df) < T_WIN + 10: continue
|
|||
|
|
# sample from middle of each day (avoid open/close noise)
|
|||
|
|
mid = len(df) // 2
|
|||
|
|
pos = int(rng.integers(max(0, mid - 30), min(len(df) - T_WIN, mid + 30)))
|
|||
|
|
arr = df[FEATURE_COLS].values[pos:pos+T_WIN].astype(np.float64) # (T, 7)
|
|||
|
|
proxy_B = (arr[:, 5] - arr[:, 3]).reshape(-1, 1) # instability_50 - v750
|
|||
|
|
exf = np.zeros((T_WIN, 3), dtype=np.float64) # zero-fill ExF
|
|||
|
|
arr11 = np.concatenate([arr, proxy_B, exf], axis=1).T # (11, T)
|
|||
|
|
if not np.isfinite(arr11).all(): continue
|
|||
|
|
probes_raw.append(arr11)
|
|||
|
|
proxy_B_vals.append(float(proxy_B.mean()))
|
|||
|
|
file_dates.append(os.path.basename(f)[:10])
|
|||
|
|
except Exception as e:
|
|||
|
|
pass
|
|||
|
|
|
|||
|
|
probes_raw = np.stack(probes_raw) # (N, 11, T)
|
|||
|
|
proxy_B_arr = np.array(proxy_B_vals)
|
|||
|
|
print(f" Probe set: {probes_raw.shape} ({len(probes_raw)} windows)")
|
|||
|
|
|
|||
|
|
def normalise(probes, nm, ns):
    """Channel-wise standardise (N, C, T) probes and clamp to ±6 sigma.

    Returns the input unchanged when no normalisation stats are available
    (``nm is None``).
    """
    if nm is None:
        return probes
    centred = probes - nm[None, :, None]
    scaled = centred / ns[None, :, None]
    return np.clip(scaled, -6., 6.)
|
|||
|
|
|
|||
|
|
def run_query(model, nm, ns, label):
    """Probe one checkpoint on the module-level probe set and print a report.

    Reads globals ``probes_raw`` / ``proxy_B_arr``. Reports latent health,
    median reconstruction error, the latent dim best correlated with
    proxy_B, that dim's sign calibration over the period, and a quartile
    split test.

    Returns ``(z_mu, corrs)`` where ``corrs`` is ``[(abs_r, r, dim), ...]``
    sorted by |r| descending over the active latent dims.
    """
    probes = normalise(probes_raw, nm, ns)
    z_mu, z_logvar = model.encode(probes)
    x_recon = model.decode(z_mu)

    # 1. Latent health: dims with non-trivial variance, mean posterior std.
    z_std_per_dim = z_mu.std(0)
    z_active = int((z_std_per_dim > 0.01).sum())
    z_post_std = float(np.exp(0.5 * z_logvar).mean())

    # 2. Reconstruction quality: median per-window MSE.
    recon_err = ((probes - x_recon) ** 2).mean(axis=(-1, -2))
    recon_p50 = float(np.median(recon_err))

    # 3. proxy_B correlation — rank active dims by |r|.
    corrs = []
    for d in range(z_mu.shape[1]):
        if z_std_per_dim[d] > 0.01:
            r = float(np.corrcoef(z_mu[:, d], proxy_B_arr)[0, 1])
            if np.isfinite(r): corrs.append((abs(r), r, d))
    corrs.sort(reverse=True)

    print(f"\n{'='*60}")
    print(f" {label}")
    print(f"{'='*60}")
    print(f" z_active : {z_active} / {z_mu.shape[1]}")
    print(f" z_post_std : {z_post_std:.4f} (healthy: 0.6–1.2)")
    print(f" recon_p50 : {recon_p50:.4f} (ep17 baseline: 0.2999)")

    # Guard: previously an empty `corrs` yielded best_dim = -1, which
    # silently analysed the LAST latent dim and printed misleading z[-1]
    # stats. Bail out explicitly instead.
    if not corrs:
        print("\n No active latent dim correlates with proxy_B — collapsed model?")
        return z_mu, corrs

    _, best_r, best_dim = corrs[0]

    # 4. Calibration: sign behaviour of the best dim over this period.
    z_best = z_mu[:, best_dim]
    z_min, z_max = float(z_best.min()), float(z_best.max())
    if z_max < 0:
        calib = "ALWAYS NEGATIVE"
    elif z_min > 0:
        calib = "ALWAYS POSITIVE"
    else:
        calib = f"MIXED [{z_min:+.3f}, {z_max:+.3f}]"

    # 5. Split test: does z_best separate top-25% vs bottom-25% proxy_B days?
    q75 = np.percentile(proxy_B_arr, 75)
    q25 = np.percentile(proxy_B_arr, 25)
    hi_mask = proxy_B_arr >= q75
    lo_mask = proxy_B_arr <= q25
    # Require >2 samples per side for a meaningful mean; NaN otherwise.
    z_hi = float(z_best[hi_mask].mean()) if hi_mask.sum() > 2 else float('nan')
    z_lo = float(z_best[lo_mask].mean()) if lo_mask.sum() > 2 else float('nan')
    sep = abs(z_hi - z_lo)

    print(f"\n proxy_B dim : z[{best_dim}] r={best_r:+.4f} (ep17 had z[10] r=+0.973)")
    print(f" Top-5 z×proxy_B corrs:")
    for _, r, d in corrs[:5]:
        bar = '#' * int(abs(r) * 30)
        print(f" z[{d:2d}] r={r:+.4f} {bar}")
    print(f"\n Calibration : {calib}")
    print(f" z[{best_dim}] range : [{z_min:+.4f}, {z_max:+.4f}]")
    print(f" z[{best_dim}] mean : {z_best.mean():+.4f}")
    print(f"\n Split test (proxy_B quartiles):")
    print(f" top-25% proxy_B → z[{best_dim}] mean = {z_hi:+.4f}")
    print(f" bot-25% proxy_B → z[{best_dim}] mean = {z_lo:+.4f}")
    print(f" separation = {sep:.4f} (>0.3 useful, >0.6 good)")

    return z_mu, corrs
|
|||
|
|
|
|||
|
|
print("\n" + "="*60)
|
|||
|
|
print(f"proxy_B stats over {len(probes_raw)} probes:")
|
|||
|
|
print(f" mean={proxy_B_arr.mean():+.4f} std={proxy_B_arr.std():.4f} "
|
|||
|
|
f"min={proxy_B_arr.min():+.4f} max={proxy_B_arr.max():+.4f}")
|
|||
|
|
|
|||
|
|
z_v2, corrs_v2 = run_query(model_v2, nm_v2, ns_v2,
|
|||
|
|
f"v2 ep={meta_v2.get('epoch')} val={meta_v2.get('val_loss',0):.5f} [CURRENT BEST]")
|
|||
|
|
|
|||
|
|
if ep17_exists:
|
|||
|
|
z_17, corrs_17 = run_query(model_17, nm_17, ns_17,
|
|||
|
|
f"ep17 val={meta_17.get('val_loss',0):.5f} [PRODUCTION BASELINE]")
|
|||
|
|
|
|||
|
|
# Side-by-side summary
|
|||
|
|
print(f"\n{'='*60}")
|
|||
|
|
print(" COMPARISON SUMMARY")
|
|||
|
|
print(f"{'='*60}")
|
|||
|
|
r_v2 = corrs_v2[0][1] if corrs_v2 else 0
|
|||
|
|
r_17 = corrs_17[0][1] if corrs_17 else 0
|
|||
|
|
print(f" proxy_B r : v2={r_v2:+.4f} vs ep17={r_17:+.4f} "
|
|||
|
|
f"({'BETTER' if abs(r_v2) > abs(r_17) else 'WORSE' if abs(r_v2) < abs(r_17) else 'SAME'})")
|
|||
|
|
|
|||
|
|
print(f"\nDone.")
|