"""ExF NPZ Patcher — Supplemental Historical Backfill ====================================================== The initial backfill got ~41/85 indicators. This script patches the existing NPZ files with real historical values for indicators that were failing: Priority 1 — fng (Alternative.me): one API call returns 2000+ days. EASY. Priority 2 — oi_btc/eth, ls_btc/eth, ls_top, taker (Binance hist endpoints) Priority 3 — vix, sp500, gold, dxy, us10y, ycurve, fedfunds (FRED — needs key) Priority 4 — mvrv, nvt, addr_btc (CoinMetrics community API) Strategy: load each NPZ, replace failing indicator values with fetched historical data, re-save. Idempotent — re-run any time. Usage: python backfill_patch_npz.py # patch all dates python backfill_patch_npz.py --dry-run # show what would change python backfill_patch_npz.py --fred-key YOUR_KEY_HERE # enable FRED python backfill_patch_npz.py --skip-binance # skip Binance OI/LS/taker """ import sys, time, argparse, json sys.stdout.reconfigure(encoding='utf-8', errors='replace') from pathlib import Path from datetime import datetime, timezone, date, timedelta import numpy as np try: import requests HAS_REQUESTS = True except ImportError: HAS_REQUESTS = False print("WARNING: requests not installed. 
Install with: pip install requests") import sys as _sys EIGENVALUES_PATH = (Path(r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues") if _sys.platform == 'win32' else Path('/mnt/ng6_data/eigenvalues')) KLINES_DIR = (Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines") if _sys.platform == 'win32' else Path('/mnt/dolphin/vbt_cache_klines')) NPZ_FILENAME = "scan_000001__Indicators.npz" REQUEST_TIMEOUT = 20 def parse_args(): p = argparse.ArgumentParser() p.add_argument("--dry-run", action="store_true") p.add_argument("--fred-key", default="", help="FRED API key (free: fred.stlouisfed.org)") p.add_argument("--skip-binance", action="store_true") p.add_argument("--skip-fred", action="store_true") p.add_argument("--skip-fng", action="store_true") p.add_argument("--start", default=None, help="Start date YYYY-MM-DD") p.add_argument("--end", default=None, help="End date YYYY-MM-DD") return p.parse_args() # ── FNG (Alternative.me) — one call, all history ───────────────────────────── def fetch_fng_history(): """Returns dict: date_str -> fng_value (int).""" url = "https://api.alternative.me/fng/?limit=2000&format=json&date_format=us" try: r = requests.get(url, timeout=REQUEST_TIMEOUT) r.raise_for_status() data = r.json() result = {} for entry in data.get('data', []): # date_format=us gives MM/DD/YYYY raw_date = entry.get('timestamp') or entry.get('time_until_update', '') # Try two formats the API uses ts_str = str(entry.get('timestamp', '')) parsed = False for fmt in ('%m-%d-%Y', '%m/%d/%Y', '%Y-%m-%d'): try: dt = datetime.strptime(ts_str, fmt) key = dt.strftime('%Y-%m-%d') result[key] = int(entry['value']) parsed = True break except ValueError: pass if not parsed: try: ts = int(ts_str) dt = datetime.utcfromtimestamp(ts) key = dt.strftime('%Y-%m-%d') result[key] = int(entry['value']) except Exception: pass return result except Exception as e: print(f" FNG fetch failed: {e}") return {} # ── Binance historical OI / LS / 
# ── Binance historical OI / LS / taker ───────────────────────────────────────
def fetch_binance_hist(url_template, symbol, date_str):
    """Fetch a single data point from a Binance hist endpoint for a date.

    Samples a one-hour window starting at noon UTC of `date_str`
    ('YYYY-MM-DD'). Returns the first record (a dict) or None when the
    request fails or data is unavailable (HTTP 400 means the date is older
    than the endpoint's retention window).
    """
    yr, mo, dy = int(date_str[:4]), int(date_str[5:7]), int(date_str[8:10])
    noon_utc = datetime(yr, mo, dy, 12, 0, 0, tzinfo=timezone.utc)
    start_ms = int(noon_utc.timestamp() * 1000)
    end_ms = start_ms + 3_600_000  # +1 hour window
    url = url_template.format(SYMBOL=symbol, start_ms=start_ms, end_ms=end_ms)
    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT)
        if r.status_code == 400:
            return None  # data too old for this endpoint
        r.raise_for_status()
        data = r.json()
        if isinstance(data, list) and len(data) > 0:
            return data[0]
        return None
    except Exception:
        return None


OI_URL = "https://fapi.binance.com/futures/data/openInterestHist?symbol={SYMBOL}&period=1h&startTime={start_ms}&endTime={end_ms}&limit=1"
LS_URL = "https://fapi.binance.com/futures/data/globalLongShortAccountRatio?symbol={SYMBOL}&period=1h&startTime={start_ms}&endTime={end_ms}&limit=1"
LS_TOP = "https://fapi.binance.com/futures/data/topLongShortAccountRatio?symbol={SYMBOL}&period=1h&startTime={start_ms}&endTime={end_ms}&limit=1"
TAKER_URL = "https://fapi.binance.com/futures/data/takerlongshortRatio?symbol={SYMBOL}&period=1h&startTime={start_ms}&endTime={end_ms}&limit=1"


def get_binance_indicators(date_str):
    """Returns dict of indicator_name -> value (or None on failure)."""
    results = {}
    for name, url, sym, field in [
        ('oi_btc', OI_URL,   'BTCUSDT', 'sumOpenInterest'),
        ('oi_eth', OI_URL,   'ETHUSDT', 'sumOpenInterest'),
        ('ls_btc', LS_URL,   'BTCUSDT', 'longShortRatio'),
        ('ls_eth', LS_URL,   'ETHUSDT', 'longShortRatio'),
        ('ls_top', LS_TOP,   'BTCUSDT', 'longShortRatio'),
        ('taker',  TAKER_URL, 'BTCUSDT', 'buySellRatio'),
    ]:
        rec = fetch_binance_hist(url, sym, date_str)
        if rec is not None and field in rec:
            try:
                results[name] = float(rec[field])
            except (TypeError, ValueError):
                results[name] = None
        else:
            results[name] = None
        time.sleep(0.05)  # light rate limiting
    return results


# ── FRED ─────────────────────────────────────────────────────────────────────
FRED_SERIES = {
    'vix':       'VIXCLS',
    'sp500':     'SP500',
    'gold':      'GOLDAMGBD228NLBM',
    'dxy':       'DTWEXBGS',
    'us10y':     'DGS10',
    'us2y':      'DGS2',
    'ycurve':    'T10Y2Y',
    'fedfunds':  'DFF',
    'hy_spread': 'BAMLH0A0HYM2',
    'be5y':      'T5YIE',
    'm2':        'WM2NS',
}

_fred_cache = {}  # series_id -> {date_str -> value}


def fetch_fred_series(series_id, fred_key, lookback_years=6):
    """Fetch a FRED series for the last `lookback_years` years. Cached.

    Missing observations ('.', '', 'nd') are forward-filled with the last
    numeric value. Returns {date_str -> float}; an empty dict on failure
    (failures are cached too, so each series is requested at most once).
    """
    if series_id in _fred_cache:
        return _fred_cache[series_id]
    start = (date.today() - timedelta(days=lookback_years*366)).strftime('%Y-%m-%d')
    url = (f"https://api.stlouisfed.org/fred/series/observations"
           f"?series_id={series_id}&api_key={fred_key}&file_type=json"
           f"&observation_start={start}")
    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        data = r.json()
        result = {}
        prev = None
        for obs in data.get('observations', []):
            v = obs.get('value', '.')
            if v not in ('.', '', 'nd'):
                try:
                    prev = float(v)
                except ValueError:
                    pass
            if prev is not None:
                result[obs['date']] = prev  # forward-fill
        _fred_cache[series_id] = result
        return result
    except Exception as e:
        print(f" FRED {series_id} failed: {e}")
        _fred_cache[series_id] = {}
        return {}


def get_fred_indicators(date_str, fred_key):
    """Return {indicator_name -> value} using the most recent observation
    on or before `date_str` (forward-fill); None when none exists."""
    results = {}
    for name, series_id in FRED_SERIES.items():
        series = fetch_fred_series(series_id, fred_key)
        # Latest key <= date_str via one O(n) scan. (The original sorted
        # the whole key set on every call — O(n log n) per lookup, done
        # per-series per-date in the main loop.)
        best = max((d for d in series if d <= date_str), default=None)
        results[name] = series[best] if best is not None else None
    return results
f"&start_time=2021-01-01T00:00:00Z") try: r = requests.get(url, timeout=30) r.raise_for_status() data = r.json() result = {} for row in data.get('data', []): d = row.get('time', '')[:10] v = row.get(metric) if v is not None: try: result[d] = float(v) except (TypeError, ValueError): pass _cm_cache[key] = result except Exception as e: print(f" CoinMetrics {asset}/{metric} failed: {e}") _cm_cache[key] = {} cache = _cm_cache.get(key, {}) return cache.get(date_str) CM_INDICATORS = [ # Only include metrics confirmed as accessible on community API ('mvrv', 'btc', 'CapMVRVCur'), # works (200 OK) ('addr_btc', 'btc', 'AdrActCnt'), # works ('txcnt', 'btc', 'TxCnt'), # works ] # ── Main patcher ────────────────────────────────────────────────────────────── def patch_npz(npz_path, updates, dry_run=False): """Load NPZ, apply updates dict {name -> value}, save in-place.""" data = np.load(str(npz_path), allow_pickle=True) names = list(data['api_names']) vals = data['api_indicators'].copy() success = data['api_success'].copy() changed = [] for name, value in updates.items(): if value is None or not np.isfinite(float(value)): continue if name not in names: continue idx = names.index(name) old = float(vals[idx]) old_ok = bool(success[idx]) new_val = float(value) if not old_ok or abs(old - new_val) > 1e-9: vals[idx] = new_val success[idx] = True changed.append(f"{name}: {old:.4f}→{new_val:.4f}") if not changed: return 0 if not dry_run: ind_names = np.array(names, dtype=object) np.savez_compressed( str(npz_path), api_names = ind_names, api_indicators = vals, api_success = success, ) return len(changed) def main(): args = parse_args() if not HAS_REQUESTS: print("ERROR: requests required. 
pip install requests"); return # Enumerate dates dates = sorted(p.stem for p in KLINES_DIR.glob("*.parquet") if 'catalog' not in p.name) if args.start: dates = [d for d in dates if d >= args.start] if args.end: dates = [d for d in dates if d <= args.end] total = len(dates) print(f"Dates to patch: {total}") print(f"Dry run: {args.dry_run}") print(f"FNG: {'skip' if args.skip_fng else 'YES'}") print(f"Binance: {'skip' if args.skip_binance else 'YES'}") print(f"FRED: {'skip (no key)' if (args.skip_fred or not args.fred_key) else f'YES (key={args.fred_key[:6]}...)'}") print() # ── Fetch FNG all-history up front (one call) ───────────────────────────── fng_hist = {} if not args.skip_fng: print("Fetching FNG full history (one call)...") fng_hist = fetch_fng_history() print(f" Got {len(fng_hist)} dates " f"range={min(fng_hist) if fng_hist else 'n/a'} → {max(fng_hist) if fng_hist else 'n/a'}") if fng_hist: sample = {k: v for k, v in list(fng_hist.items())[:3]} print(f" Sample: {sample}") # ── Fetch FRED all-series up front ─────────────────────────────────────── if args.fred_key and not args.skip_fred: print(f"\nPre-fetching FRED series ({len(FRED_SERIES)} series)...") for name, sid in FRED_SERIES.items(): series = fetch_fred_series(sid, args.fred_key) print(f" {name:<12} ({sid}): {len(series)} observations") time.sleep(0.6) # FRED rate limit: 120/min # ── Fetch CoinMetrics up front ──────────────────────────────────────────── print(f"\nPre-fetching CoinMetrics ({len(CM_INDICATORS)} metrics)...") for cm_name, asset, metric in CM_INDICATORS: fetch_coinmetrics(asset, metric, '2023-01-01') # warms cache for all dates n = len(_cm_cache.get((asset, metric), {})) print(f" {cm_name:<12}: {n} dates") time.sleep(0.8) # ── Per-date loop ───────────────────────────────────────────────────────── print(f"\nPatching NPZ files...") total_changed = 0 binance_fail_streak = 0 t0 = time.time() for i, ds in enumerate(dates): npz_path = EIGENVALUES_PATH / ds / NPZ_FILENAME if not 
npz_path.exists(): continue updates = {} # FNG if not args.skip_fng and ds in fng_hist: updates['fng'] = float(fng_hist[ds]) # Also try to get sub-components from same entry if available # (fng_prev is previous day's value) prev_day = (datetime.strptime(ds, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d') if prev_day in fng_hist: updates['fng_prev'] = float(fng_hist[prev_day]) # FRED if args.fred_key and not args.skip_fred: fred_vals = get_fred_indicators(ds, args.fred_key) for name, val in fred_vals.items(): if val is not None: updates[name] = val # CoinMetrics for cm_name, asset, metric in CM_INDICATORS: val = fetch_coinmetrics(asset, metric, ds) # hits cache if val is not None: updates[cm_name] = val # Binance OI/LS/taker (network call per date — slowest) if not args.skip_binance and binance_fail_streak < 10: # Only call if these are currently failing in the NPZ d = np.load(str(npz_path), allow_pickle=True) names_in_npz = list(d['api_names']) ok_in_npz = d['api_success'] taker_idx = names_in_npz.index('taker') if 'taker' in names_in_npz else -1 taker_ok = bool(ok_in_npz[taker_idx]) if taker_idx >= 0 else False if not taker_ok: # proxy check: if taker failing, all Binance hist likely failing binance_vals = get_binance_indicators(ds) n_binance_ok = sum(1 for v in binance_vals.values() if v is not None) if n_binance_ok == 0: binance_fail_streak += 1 else: binance_fail_streak = 0 updates.update({k: v for k, v in binance_vals.items() if v is not None}) # Patch n_changed = patch_npz(npz_path, updates, dry_run=args.dry_run) total_changed += n_changed if (i + 1) % 50 == 0 or n_changed > 0: elapsed = time.time() - t0 rate = (i + 1) / elapsed eta = (total - i - 1) / rate if rate > 0 else 0 tag = f" +{n_changed} fields" if n_changed else "" print(f" [{i+1}/{total}] {ds} {elapsed/60:.1f}m eta={eta/60:.1f}m{tag}") elapsed = time.time() - t0 print(f"\n{'='*60}") print(f" Patch complete in {elapsed/60:.1f}m") print(f" Total fields updated: {total_changed}") print(f" {'DRY 
RUN — no files written' if args.dry_run else 'Files patched in-place'}") print(f"{'='*60}") if not args.fred_key: print(f"\n *** FRED indicators (vix, sp500, gold, dxy, us10y, ycurve, fedfunds)") print(f" *** were SKIPPED. Get a free API key at: https://fred.stlouisfed.org/docs/api/api_key.html") print(f" *** Then re-run with: --fred-key YOUR_KEY_HERE") if binance_fail_streak >= 10: print(f"\n *** Binance hist endpoints failed consistently.") print(f" *** OI data before 2020-09 is not available via Binance API.") print(f" *** Dates before that will remain FAIL for oi_btc, ls_btc, taker.") if __name__ == "__main__": main()