initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
398
external_factors/backfill_patch_npz.py
Executable file
398
external_factors/backfill_patch_npz.py
Executable file
@@ -0,0 +1,398 @@
|
||||
"""ExF NPZ Patcher — Supplemental Historical Backfill
|
||||
======================================================
|
||||
The initial backfill got ~41/85 indicators. This script patches the existing
|
||||
NPZ files with real historical values for indicators that were failing:
|
||||
|
||||
Priority 1 — fng (Alternative.me): one API call returns 2000+ days. EASY.
|
||||
Priority 2 — oi_btc/eth, ls_btc/eth, ls_top, taker (Binance hist endpoints)
|
||||
Priority 3 — vix, sp500, gold, dxy, us10y, ycurve, fedfunds (FRED — needs key)
|
||||
Priority 4 — mvrv, nvt, addr_btc (CoinMetrics community API)
|
||||
|
||||
Strategy: load each NPZ, replace failing indicator values with fetched historical
|
||||
data, re-save. Idempotent — re-run any time.
|
||||
|
||||
Usage:
|
||||
python backfill_patch_npz.py # patch all dates
|
||||
python backfill_patch_npz.py --dry-run # show what would change
|
||||
python backfill_patch_npz.py --fred-key YOUR_KEY_HERE # enable FRED
|
||||
python backfill_patch_npz.py --skip-binance # skip Binance OI/LS/taker
|
||||
"""
|
||||
import sys, time, argparse, json

# Force UTF-8 stdout so the arrow/box-drawing characters in progress output
# don't crash on Windows cp1252 consoles; undecodable chars are replaced.
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
from pathlib import Path
from datetime import datetime, timezone, date, timedelta
import numpy as np

# `requests` is the only non-stdlib dependency; degrade gracefully when it is
# missing (main() aborts with a clear message instead of an ImportError).
try:
    import requests
    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False
    print("WARNING: requests not installed. Install with: pip install requests")
|
||||
|
||||
# NOTE: `sys` is already imported at the top of the file; the former
# redundant `import sys as _sys` re-import has been removed.
# Platform-dependent data roots: Windows dev box vs Linux deployment host.
EIGENVALUES_PATH = (Path(r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues")
                    if sys.platform == 'win32' else Path('/mnt/ng6_data/eigenvalues'))
# Kline parquet cache; main() treats file stems as per-date keys (YYYY-MM-DD
# assumed — the stems are compared lexically against --start/--end).
KLINES_DIR = (Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines")
              if sys.platform == 'win32' else Path('/mnt/dolphin/vbt_cache_klines'))
# Per-date NPZ file holding the api_names / api_indicators / api_success arrays.
NPZ_FILENAME = "scan_000001__Indicators.npz"
# Seconds before any single HTTP request is abandoned.
REQUEST_TIMEOUT = 20
|
||||
|
||||
def parse_args():
    """Build and parse the CLI options for the NPZ patcher."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--fred-key", default="", help="FRED API key (free: fred.stlouisfed.org)")
    parser.add_argument("--skip-binance", action="store_true")
    parser.add_argument("--skip-fred", action="store_true")
    parser.add_argument("--skip-fng", action="store_true")
    parser.add_argument("--start", default=None, help="Start date YYYY-MM-DD")
    parser.add_argument("--end", default=None, help="End date YYYY-MM-DD")
    return parser.parse_args()
|
||||
|
||||
# ── FNG (Alternative.me) — one call, all history ─────────────────────────────
|
||||
|
||||
def fetch_fng_history():
    """Fetch the full Fear & Greed index history from Alternative.me.

    One API call returns 2000+ days. Returns dict: date_str ('YYYY-MM-DD')
    -> fng_value (int); {} on any network or parse failure.
    """
    url = "https://api.alternative.me/fng/?limit=2000&format=json&date_format=us"
    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        data = r.json()
        result = {}
        for entry in data.get('data', []):
            # date_format=us should give MM/DD/YYYY, but try each shape the
            # API has been seen to return.  (The old unused `raw_date`
            # lookup was dead code and has been removed.)
            ts_str = str(entry.get('timestamp', ''))
            parsed = False
            for fmt in ('%m-%d-%Y', '%m/%d/%Y', '%Y-%m-%d'):
                try:
                    dt = datetime.strptime(ts_str, fmt)
                    key = dt.strftime('%Y-%m-%d')
                    result[key] = int(entry['value'])
                    parsed = True
                    break
                except ValueError:
                    pass
            if not parsed:
                # Fallback: a raw unix epoch timestamp.  Use the tz-aware
                # form; datetime.utcfromtimestamp is deprecated (3.12+) and
                # fromtimestamp(..., tz=utc) yields the same UTC date.
                try:
                    ts = int(ts_str)
                    dt = datetime.fromtimestamp(ts, tz=timezone.utc)
                    key = dt.strftime('%Y-%m-%d')
                    result[key] = int(entry['value'])
                except Exception:
                    pass
        return result
    except Exception as e:
        print(f" FNG fetch failed: {e}")
        return {}
|
||||
|
||||
# ── Binance historical OI / LS / taker ───────────────────────────────────────
|
||||
|
||||
def fetch_binance_hist(url_template, symbol, date_str):
    """Fetch one record from a Binance futures-data hist endpoint.

    Queries a one-hour window starting at noon UTC on *date_str*
    ('YYYY-MM-DD').  Returns the first record dict, or None when the
    endpoint has no data (HTTP 400 = date too old) or the request fails.
    """
    year = int(date_str[:4])
    month = int(date_str[5:7])
    day = int(date_str[8:10])
    window_start = datetime(year, month, day, 12, 0, 0, tzinfo=timezone.utc)
    start_ms = int(window_start.timestamp() * 1000)
    end_ms = start_ms + 3_600_000  # one-hour query window
    url = url_template.format(SYMBOL=symbol, start_ms=start_ms, end_ms=end_ms)
    try:
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
        if resp.status_code == 400:
            # Binance answers 400 when the date predates endpoint coverage.
            return None
        resp.raise_for_status()
        payload = resp.json()
        if isinstance(payload, list) and len(payload) > 0:
            return payload[0]
        return None
    except Exception:
        return None
|
||||
|
||||
# Binance futures "hist" data endpoint templates (1h period, limit=1 record).
# Placeholders {SYMBOL}, {start_ms}, {end_ms} are filled by fetch_binance_hist.
OI_URL = "https://fapi.binance.com/futures/data/openInterestHist?symbol={SYMBOL}&period=1h&startTime={start_ms}&endTime={end_ms}&limit=1"
# Global (all-account) long/short account ratio.
LS_URL = "https://fapi.binance.com/futures/data/globalLongShortAccountRatio?symbol={SYMBOL}&period=1h&startTime={start_ms}&endTime={end_ms}&limit=1"
# Top-trader long/short account ratio.
LS_TOP = "https://fapi.binance.com/futures/data/topLongShortAccountRatio?symbol={SYMBOL}&period=1h&startTime={start_ms}&endTime={end_ms}&limit=1"
# Taker buy/sell volume ratio.
TAKER_URL = "https://fapi.binance.com/futures/data/takerlongshortRatio?symbol={SYMBOL}&period=1h&startTime={start_ms}&endTime={end_ms}&limit=1"
|
||||
|
||||
def get_binance_indicators(date_str):
    """Return {indicator_name: value} for the Binance hist metrics.

    Each value is a float, or None when that endpoint/field could not be
    fetched for *date_str*.
    """
    specs = (
        ('oi_btc', OI_URL, 'BTCUSDT', 'sumOpenInterest'),
        ('oi_eth', OI_URL, 'ETHUSDT', 'sumOpenInterest'),
        ('ls_btc', LS_URL, 'BTCUSDT', 'longShortRatio'),
        ('ls_eth', LS_URL, 'ETHUSDT', 'longShortRatio'),
        ('ls_top', LS_TOP, 'BTCUSDT', 'longShortRatio'),
        ('taker', TAKER_URL, 'BTCUSDT', 'buySellRatio'),
    )
    out = {}
    for name, url, sym, field in specs:
        record = fetch_binance_hist(url, sym, date_str)
        value = None
        if record is not None and field in record:
            try:
                value = float(record[field])
            except (TypeError, ValueError):
                value = None
        out[name] = value
        time.sleep(0.05)  # light rate limiting between endpoint calls
    return out
|
||||
|
||||
# ── FRED ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
# NPZ indicator name -> FRED series ID (free API key: fred.stlouisfed.org).
FRED_SERIES = {
    'vix': 'VIXCLS',
    'sp500': 'SP500',
    'gold': 'GOLDAMGBD228NLBM',
    'dxy': 'DTWEXBGS',
    'us10y': 'DGS10',
    'us2y': 'DGS2',
    'ycurve': 'T10Y2Y',
    'fedfunds': 'DFF',
    'hy_spread': 'BAMLH0A0HYM2',
    'be5y': 'T5YIE',
    'm2': 'WM2NS',
}

# Memoized FRED fetches: at most one HTTP call per series per run.
_fred_cache = {} # series_id -> {date_str -> value}
|
||||
|
||||
def fetch_fred_series(series_id, fred_key, lookback_years=6):
    """Fetch one FRED series over the last *lookback_years* years, cached.

    Returns {date_str: float}; dates with a missing observation ('.', '',
    'nd') carry the last known value forward.  Results are memoized in
    _fred_cache; a failed fetch caches {} so the series is not retried.
    """
    cached = _fred_cache.get(series_id)
    if cached is not None:
        return cached
    start = (date.today() - timedelta(days=lookback_years * 366)).strftime('%Y-%m-%d')
    url = (f"https://api.stlouisfed.org/fred/series/observations"
           f"?series_id={series_id}&api_key={fred_key}&file_type=json"
           f"&observation_start={start}")
    try:
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        payload = resp.json()
        series = {}
        last_value = None
        for obs in payload.get('observations', []):
            raw = obs.get('value', '.')
            if raw not in ('.', '', 'nd'):
                try:
                    last_value = float(raw)
                except ValueError:
                    pass
            if last_value is not None:
                series[obs['date']] = last_value  # forward-fill
        _fred_cache[series_id] = series
        return series
    except Exception as e:
        print(f" FRED {series_id} failed: {e}")
        _fred_cache[series_id] = {}
        return {}
|
||||
|
||||
def get_fred_indicators(date_str, fred_key):
    """Return {indicator_name: value} for every FRED series on *date_str*.

    Uses the most recent observation on or before the date (forward-fill);
    None when a series has no observation that early.
    """
    out = {}
    for name, series_id in FRED_SERIES.items():
        series = fetch_fred_series(series_id, fred_key)
        # Latest observation date <= date_str, if any (lexical order is
        # chronological for YYYY-MM-DD strings).
        eligible = [d for d in series if d <= date_str]
        out[name] = series[max(eligible)] if eligible else None
    return out
|
||||
|
||||
# ── CoinMetrics community ─────────────────────────────────────────────────────
|
||||
|
||||
# Memoized CoinMetrics fetches: the whole daily series per (asset, metric).
_cm_cache = {} # (asset, metric) -> {date_str -> value}
|
||||
|
||||
def fetch_coinmetrics(asset, metric, date_str):
    """Return the CoinMetrics community daily value for (asset, metric, date).

    The full daily series since 2021-01-01 is fetched once per (asset,
    metric) pair and memoized in _cm_cache; subsequent calls hit the cache.
    Returns a float, or None when the date is absent or the fetch failed.
    """
    cache_key = (asset, metric)
    if cache_key not in _cm_cache:
        url = (f"https://community-api.coinmetrics.io/v4/timeseries/asset-metrics"
               f"?assets={asset}&metrics={metric}&frequency=1d"
               f"&start_time=2021-01-01T00:00:00Z")
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            payload = resp.json()
            series = {}
            for row in payload.get('data', []):
                day = row.get('time', '')[:10]
                raw = row.get(metric)
                if raw is None:
                    continue
                try:
                    series[day] = float(raw)
                except (TypeError, ValueError):
                    pass
            _cm_cache[cache_key] = series
        except Exception as e:
            print(f" CoinMetrics {asset}/{metric} failed: {e}")
            _cm_cache[cache_key] = {}
    return _cm_cache.get(cache_key, {}).get(date_str)
|
||||
|
||||
# (npz_indicator_name, coinmetrics_asset, coinmetrics_metric) triples.
CM_INDICATORS = [
    # Only include metrics confirmed as accessible on community API
    ('mvrv', 'btc', 'CapMVRVCur'), # works (200 OK)
    ('addr_btc', 'btc', 'AdrActCnt'), # works
    ('txcnt', 'btc', 'TxCnt'), # works
]
|
||||
|
||||
# ── Main patcher ──────────────────────────────────────────────────────────────
|
||||
|
||||
def patch_npz(npz_path, updates, dry_run=False):
    """Load an NPZ, apply ``updates`` ({name -> value}), save in-place.

    Only indicators present in the file's ``api_names`` are touched; a value
    is applied when the stored reading previously failed (``api_success``
    False) or differs from the new one by more than 1e-9.  None and
    non-finite values are skipped, making re-runs idempotent.

    Returns the number of fields changed (0 when nothing to do).  With
    ``dry_run=True`` the count is returned but nothing is written.
    """
    data = np.load(str(npz_path), allow_pickle=True)
    names = list(data['api_names'])
    vals = data['api_indicators'].copy()
    success = data['api_success'].copy()
    # O(1) lookups instead of a list.index() scan per update.
    index_of = {n: i for i, n in enumerate(names)}

    changed = []
    for name, value in updates.items():
        if value is None:
            continue
        new_val = float(value)
        if not np.isfinite(new_val):
            continue
        idx = index_of.get(name)
        if idx is None:
            continue  # indicator not tracked in this NPZ
        old = float(vals[idx])
        old_ok = bool(success[idx])
        if not old_ok or abs(old - new_val) > 1e-9:
            vals[idx] = new_val
            success[idx] = True
            changed.append(f"{name}: {old:.4f}→{new_val:.4f}")

    if not changed:
        return 0

    if not dry_run:
        # Preserve any extra arrays the NPZ may carry: the previous re-save
        # wrote only the three api_* arrays, silently dropping other keys.
        out = {k: data[k] for k in data.files}
        out['api_names'] = np.array(names, dtype=object)
        out['api_indicators'] = vals
        out['api_success'] = success
        np.savez_compressed(str(npz_path), **out)
    return len(changed)
|
||||
|
||||
def main():
    """Patch every date's NPZ with real historical indicator values.

    Flow: parse CLI args -> enumerate dates from the kline parquet cache ->
    pre-fetch bulk histories (FNG, FRED, CoinMetrics) once -> per-date loop
    that assembles an updates dict and applies it via patch_npz().
    Honors --dry-run (nothing written) and the --skip-* flags.
    """
    args = parse_args()
    if not HAS_REQUESTS:
        print("ERROR: requests required. pip install requests"); return

    # Enumerate dates: one parquet per date; stems are compared lexically
    # against --start/--end (correct for YYYY-MM-DD stems); 'catalog' files
    # are excluded as non-date entries.
    dates = sorted(p.stem for p in KLINES_DIR.glob("*.parquet") if 'catalog' not in p.name)
    if args.start: dates = [d for d in dates if d >= args.start]
    if args.end: dates = [d for d in dates if d <= args.end]
    total = len(dates)
    print(f"Dates to patch: {total}")
    print(f"Dry run: {args.dry_run}")
    print(f"FNG: {'skip' if args.skip_fng else 'YES'}")
    print(f"Binance: {'skip' if args.skip_binance else 'YES'}")
    print(f"FRED: {'skip (no key)' if (args.skip_fred or not args.fred_key) else f'YES (key={args.fred_key[:6]}...)'}")
    print()

    # ── Fetch FNG all-history up front (one call) ─────────────────────────────
    fng_hist = {}
    if not args.skip_fng:
        print("Fetching FNG full history (one call)...")
        fng_hist = fetch_fng_history()
        print(f" Got {len(fng_hist)} dates "
              f"range={min(fng_hist) if fng_hist else 'n/a'} → {max(fng_hist) if fng_hist else 'n/a'}")
        if fng_hist:
            sample = {k: v for k, v in list(fng_hist.items())[:3]}
            print(f" Sample: {sample}")

    # ── Fetch FRED all-series up front ───────────────────────────────────────
    if args.fred_key and not args.skip_fred:
        print(f"\nPre-fetching FRED series ({len(FRED_SERIES)} series)...")
        for name, sid in FRED_SERIES.items():
            # fetch_fred_series memoizes, so the per-date loop hits the cache
            series = fetch_fred_series(sid, args.fred_key)
            print(f" {name:<12} ({sid}): {len(series)} observations")
            time.sleep(0.6) # FRED rate limit: 120/min

    # ── Fetch CoinMetrics up front ────────────────────────────────────────────
    print(f"\nPre-fetching CoinMetrics ({len(CM_INDICATORS)} metrics)...")
    for cm_name, asset, metric in CM_INDICATORS:
        fetch_coinmetrics(asset, metric, '2023-01-01') # warms cache for all dates
        n = len(_cm_cache.get((asset, metric), {}))
        print(f" {cm_name:<12}: {n} dates")
        time.sleep(0.8)

    # ── Per-date loop ─────────────────────────────────────────────────────────
    print(f"\nPatching NPZ files...")
    total_changed = 0
    # Counts consecutive dates where every Binance fetch failed; at 10 the
    # Binance calls are abandoned for the rest of the run (circuit breaker).
    binance_fail_streak = 0

    t0 = time.time()
    for i, ds in enumerate(dates):
        npz_path = EIGENVALUES_PATH / ds / NPZ_FILENAME
        if not npz_path.exists():
            continue

        # indicator_name -> new value; applied in one shot by patch_npz below
        updates = {}

        # FNG
        if not args.skip_fng and ds in fng_hist:
            updates['fng'] = float(fng_hist[ds])
            # Also try to get sub-components from same entry if available
            # (fng_prev is previous day's value)
            prev_day = (datetime.strptime(ds, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d')
            if prev_day in fng_hist:
                updates['fng_prev'] = float(fng_hist[prev_day])

        # FRED
        if args.fred_key and not args.skip_fred:
            fred_vals = get_fred_indicators(ds, args.fred_key)
            for name, val in fred_vals.items():
                if val is not None:
                    updates[name] = val

        # CoinMetrics
        for cm_name, asset, metric in CM_INDICATORS:
            val = fetch_coinmetrics(asset, metric, ds) # hits cache
            if val is not None:
                updates[cm_name] = val

        # Binance OI/LS/taker (network call per date — slowest)
        if not args.skip_binance and binance_fail_streak < 10:
            # Only call if these are currently failing in the NPZ
            d = np.load(str(npz_path), allow_pickle=True)
            names_in_npz = list(d['api_names'])
            ok_in_npz = d['api_success']
            taker_idx = names_in_npz.index('taker') if 'taker' in names_in_npz else -1
            taker_ok = bool(ok_in_npz[taker_idx]) if taker_idx >= 0 else False

            if not taker_ok: # proxy check: if taker failing, all Binance hist likely failing
                binance_vals = get_binance_indicators(ds)
                n_binance_ok = sum(1 for v in binance_vals.values() if v is not None)
                if n_binance_ok == 0:
                    binance_fail_streak += 1
                else:
                    binance_fail_streak = 0
                updates.update({k: v for k, v in binance_vals.items() if v is not None})

        # Patch
        n_changed = patch_npz(npz_path, updates, dry_run=args.dry_run)
        total_changed += n_changed

        # Progress line every 50 dates, or whenever something changed.
        if (i + 1) % 50 == 0 or n_changed > 0:
            elapsed = time.time() - t0
            rate = (i + 1) / elapsed
            eta = (total - i - 1) / rate if rate > 0 else 0
            tag = f" +{n_changed} fields" if n_changed else ""
            print(f" [{i+1}/{total}] {ds} {elapsed/60:.1f}m eta={eta/60:.1f}m{tag}")

    elapsed = time.time() - t0
    print(f"\n{'='*60}")
    print(f" Patch complete in {elapsed/60:.1f}m")
    print(f" Total fields updated: {total_changed}")
    print(f" {'DRY RUN — no files written' if args.dry_run else 'Files patched in-place'}")
    print(f"{'='*60}")

    # Post-run hints for the operator.
    if not args.fred_key:
        print(f"\n *** FRED indicators (vix, sp500, gold, dxy, us10y, ycurve, fedfunds)")
        print(f" *** were SKIPPED. Get a free API key at: https://fred.stlouisfed.org/docs/api/api_key.html")
        print(f" *** Then re-run with: --fred-key YOUR_KEY_HERE")
    if binance_fail_streak >= 10:
        print(f"\n *** Binance hist endpoints failed consistently.")
        print(f" *** OI data before 2020-09 is not available via Binance API.")
        print(f" *** Dates before that will remain FAIL for oi_btc, ls_btc, taker.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user