#!/usr/bin/env python3
"""
test_exf_correlation.py
=======================
Fetch historical data for all ExF indicators with FULL/PARTIAL history,
align to 55 trading days (2025-12-31 → 2026-02-25), and test each for
predictiveness against daily trading returns.

Usage:
    python test_exf_correlation.py [--no-cache]
"""

import sys
import os
import time
import json
import csv
import math
import argparse
import requests
from pathlib import Path
from datetime import datetime, timezone, timedelta, date
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy import stats

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
BASE_DIR = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict")
VBT_DIR = BASE_DIR / "vbt_cache"           # per-day parquet files, stems = YYYY-MM-DD
EXF_DIR = BASE_DIR / "external_factors"
NAUTILUS_DIR = BASE_DIR / "nautilus_dolphin"
RUN_LOGS = NAUTILUS_DIR / "run_logs"       # inputs (daily_*.csv) and all outputs/caches
RUN_LOGS.mkdir(parents=True, exist_ok=True)

FRED_KEY = "c16a9cde3e3bb5bb972bb9283485f202"

# ---------------------------------------------------------------------------
# Date range
# ---------------------------------------------------------------------------
START_DT = datetime(2025, 12, 31, tzinfo=timezone.utc)
END_DT = datetime(2026, 2, 25, 23, 59, 59, tzinfo=timezone.utc)
START_MS = int(START_DT.timestamp() * 1000)   # Binance/Deribit use epoch milliseconds
END_MS = int(END_DT.timestamp() * 1000)
START_STR = "2025-12-31"
END_STR = "2026-02-26"  # exclusive for CoinMetrics end_time
TS_NOW = datetime.now().strftime("%Y%m%d_%H%M%S")  # timestamp suffix for output files

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _date_range_str(start: datetime, end: datetime):
    """Yield YYYY-MM-DD strings for every calendar day from start to end inclusive."""
    cur = start.date()
    stop = end.date()
    while cur <= stop:
        yield cur.strftime("%Y-%m-%d")
        cur += timedelta(days=1)


def _ms_to_date(ms: int) -> str:
    """Epoch milliseconds -> UTC YYYY-MM-DD string."""
    return datetime.fromtimestamp(ms / 1000, tz=timezone.utc).strftime("%Y-%m-%d")


def _unix_to_date(ts: int) -> str:
    """Epoch seconds -> UTC YYYY-MM-DD string."""
    return datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%d")


def _safe_float(v):
    """Coerce v to a finite float; return NaN for None, non-numeric, inf, or NaN input."""
    if v is None:
        return float("nan")
    try:
        f = float(v)
        if math.isfinite(f):
            return f
        return float("nan")
    except (TypeError, ValueError):
        return float("nan")


def _fill_forward(series: dict, date_list: list) -> dict:
    """Fill-forward a {date_str: float} dict over date_list.

    Dates before the first observation stay NaN; later gaps carry the
    last seen value forward.
    """
    result = {}
    last = float("nan")
    for d in date_list:
        v = series.get(d, float("nan"))
        if not math.isnan(v):
            last = v
        result[d] = last
    return result


def _cache_path(source: str) -> Path:
    """Path of the raw-response JSON cache for one source (e.g. 'fred')."""
    return RUN_LOGS / f"exf_raw_cache_{source}.json"


def _load_cache(source: str):
    """Load a source cache; None if missing or unreadable (best-effort)."""
    p = _cache_path(source)
    if p.exists():
        try:
            with open(p, "r") as f:
                return json.load(f)
        except Exception:
            return None
    return None


def _save_cache(source: str, data):
    """Persist a source's fetched data as JSON for future runs."""
    p = _cache_path(source)
    with open(p, "w") as f:
        json.dump(data, f)


def _get(url: str, params=None, timeout=20, headers=None) -> dict | list | None:
    """HTTP GET returning parsed JSON, or None on any failure (logged, never raised)."""
    try:
        h = {"User-Agent": "Mozilla/5.0"}
        if headers:
            h.update(headers)
        r = requests.get(url, params=params, timeout=timeout, headers=h)
        r.raise_for_status()
        return r.json()
    except Exception as e:
        print(f" [HTTP ERROR] {url[:80]} -> {e}")
        return None


# ---------------------------------------------------------------------------
# Step 1: Get trading day list from VBT parquet filenames
# ---------------------------------------------------------------------------
def get_trading_days() -> list:
    """Return sorted YYYY-MM-DD trading days derived from vbt_cache/*.parquet stems.

    Only stems parsing as dates within [2025-12-31, 2026-02-25] are kept;
    non-date filenames are ignored.
    """
    files = sorted(VBT_DIR.glob("*.parquet"))
    days = []
    for f in files:
        stem = f.stem  # YYYY-MM-DD
        try:
            dt = datetime.strptime(stem, "%Y-%m-%d").date()
            if date(2025, 12, 31) <= dt <= date(2026, 2, 25):
                days.append(stem)
        except ValueError:
            pass
    # Fix: guard the summary print — days[0]/days[-1] raised IndexError on an
    # empty cache dir before main()'s own empty-list check could run.
    if days:
        print(f"[TRADING DAYS] Found {len(days)} days: {days[0]} → {days[-1]}")
    else:
        print("[TRADING DAYS] Found 0 days")
    return days
# ---------------------------------------------------------------------------
# Step 2: Load daily returns
# ---------------------------------------------------------------------------
def load_daily_returns(trading_days: list) -> dict:
    """
    Returns {date_str: pnl_normalized} where
    pnl_normalized = pnl / capital_start_of_day.
    Tries to load from most recent run_logs/daily_*.csv first.
    """
    csvs = sorted(RUN_LOGS.glob("daily_*.csv"), reverse=True)
    if csvs:
        latest = csvs[0]
        print(f"[RETURNS] Loading from {latest.name}")
        df = pd.read_csv(latest)
        df["date"] = df["date"].astype(str)
        # CSV header: date,pnl,capital,dd_pct,...
        # 'capital' is END-of-day capital, so start-of-day capital is
        # reconstructed as capital - pnl and the normalized return is
        # pnl / start_cap.
        returns = {}
        for _, row in df.iterrows():
            d = str(row["date"])
            if d in trading_days:
                pnl = float(row["pnl"])
                cap = float(row["capital"])
                start_cap = cap - pnl  # end_cap = start_cap + pnl ← reconstruct
                if start_cap > 0:
                    returns[d] = pnl / start_cap
                else:
                    # Non-positive reconstructed capital: return is undefined.
                    returns[d] = float("nan")
        print(f"[RETURNS] Loaded {len(returns)} dates. Sample: {list(returns.items())[:3]}")
        return returns
    else:
        raise RuntimeError(
            "No run_logs/daily_*.csv found. Run test_pf_dynamic_beta_validate.py first."
        )


# ---------------------------------------------------------------------------
# Step 3: Fetch functions — one call per source
# ---------------------------------------------------------------------------

# ----- FRED ----------------------------------------------------------------
# Map of local indicator name -> FRED series id.
FRED_SERIES = {
    "dxy": "DTWEXBGS",
    "us10y": "DGS10",
    "us2y": "DGS2",
    "ycurve": "T10Y2Y",
    "vix": "VIXCLS",
    "fedfunds": "DFF",
    "m2": "WM2NS",
    "cpi": "CPIAUCSL",
    "sp500": "SP500",
    # gold: KO — GOLDAMGBD228NLBM/GOLDPMGBD228NLBM/GOLD all return HTTP 400 (series licensing/discontinued)
    "hy_spread": "BAMLH0A0HYM2",
    "be5y": "T5YIE",
    "nfci": "NFCI",
    "claims": "ICSA",
}


def fetch_fred(no_cache=False) -> dict:
    """Returns {indicator_name: {date_str: float}} for all FRED indicators.

    Observations are requested over a slightly wider window than the study
    range, then fill-forwarded over every calendar day so weekly/monthly
    series (M2, CPI, NFCI, claims) cover all trading days.
    """
    cache = None if no_cache else _load_cache("fred")
    if cache is not None:
        print("[FRED] Using cache")
        return cache
    result = {}
    all_dates = list(_date_range_str(START_DT, END_DT))
    for name, series_id in FRED_SERIES.items():
        url = (
            f"https://api.stlouisfed.org/fred/series/observations"
            f"?series_id={series_id}"
            f"&api_key={FRED_KEY}"
            f"&file_type=json"
            f"&observation_start=2025-12-01"
            f"&observation_end=2026-03-01"
            f"&sort_order=asc"
        )
        d = _get(url)
        series_raw = {}
        if d and "observations" in d:
            for obs in d["observations"]:
                dt = obs.get("date", "")
                v = obs.get("value", ".")
                # FRED encodes missing observations as ".".
                if v != ".":
                    fv = _safe_float(v)
                    if not math.isnan(fv):
                        series_raw[dt] = fv
        series_ff = _fill_forward(series_raw, all_dates)
        result[name] = series_ff
        n_ok = sum(1 for v in series_ff.values() if not math.isnan(v))
        print(f" [FRED] {name:12s} ({series_id:20s}): {n_ok}/{len(all_dates)} obs")
        time.sleep(0.15)  # stay well under FRED rate limits
    _save_cache("fred", result)
    return result


# ----- BINANCE DERIVATIVES -------------------------------------------------
def _bin_hourly_to_daily(records: list, key: str, agg="mean") -> dict:
    """Group hourly Binance records to daily. agg='mean' or 'last'.

    Records carry either a 'timestamp' or a 'fundingTime' field (epoch ms);
    values are read from `key` and non-finite entries are dropped.
    """
    by_day = defaultdict(list)
    for r in records:
        ts = int(r.get("timestamp", r.get("fundingTime", 0)))
        d = _ms_to_date(ts)
        v = _safe_float(r.get(key))
        if not math.isnan(v):
            by_day[d].append(v)
    result = {}
    for d, vals in by_day.items():
        if vals:
            result[d] = vals[-1] if agg == "last" else float(np.mean(vals))
    return result


def fetch_binance(no_cache=False) -> dict:
    """Returns {indicator_name: {date_str: float}}"""
    cache = None if no_cache else _load_cache("binance")
    if cache is not None:
        print("[BINANCE] Using cache")
        return cache
    result = {}

    # --- Funding BTC
    def _funding(symbol):
        # Paginate forward in chunks of 500 records; funding is 8-hourly,
        # so 500 * 8h covers one chunk's time span.
        all_recs = []
        cur_start = START_MS
        while cur_start < END_MS:
            chunk_end = min(cur_start + 500 * 8 * 3600 * 1000, END_MS)
            url = f"https://fapi.binance.com/fapi/v1/fundingRate?symbol={symbol}&startTime={cur_start}&endTime={chunk_end}&limit=500"
            d = _get(url)
            if d and isinstance(d, list):
                all_recs.extend(d)
                if len(d) < 500:
                    break  # short page ⇒ no more data in range
                cur_start = int(d[-1]["fundingTime"]) + 1
            else:
                break
        return _bin_hourly_to_daily(all_recs, "fundingRate", agg="last")

    # --- OI / LS / Taker: KO — Binance Futures stats endpoints retain only ~30 days of history.
    # Requesting data from 2025-12-31 (63 days ago) returns HTTP 400 -1130 "startTime invalid".
    # These require live collection; cannot be retrofitted historically.
    def _oi(symbol):
        print(f" [KO] openInterestHist({symbol}): 30-day retention wall — returning empty")
        return {}

    def _ls(endpoint, symbol):
        print(f" [KO] {endpoint}({symbol}): 30-day retention wall — returning empty")
        return {}

    def _taker():
        print(f" [KO] takerlongshortRatio: 30-day retention wall — returning empty")
        return {}

    # --- Volume (spot klines daily)
    def _vol24():
        url = f"https://api.binance.com/api/v3/klines?symbol=BTCUSDT&interval=1d&startTime={START_MS}&endTime={END_MS}&limit=100"
        d = _get(url)
        result_v = {}
        if d and isinstance(d, list):
            for bar in d:
                ts = int(bar[0])
                date_s = _ms_to_date(ts)
                v = _safe_float(bar[7])  # quote volume
                if not math.isnan(v):
                    result_v[date_s] = v
        return result_v

    print("[BINANCE] Fetching funding_btc ...")
    result["funding_btc"] = _funding("BTCUSDT")
    print("[BINANCE] Fetching funding_eth ...")
    result["funding_eth"] = _funding("ETHUSDT")
    print("[BINANCE] Fetching oi_btc ...")
    result["oi_btc"] = _oi("BTCUSDT")
    print("[BINANCE] Fetching oi_eth ...")
    result["oi_eth"] = _oi("ETHUSDT")
    print("[BINANCE] Fetching ls_btc ...")
    result["ls_btc"] = _ls("globalLongShortAccountRatio", "BTCUSDT")
    print("[BINANCE] Fetching ls_eth ...")
    result["ls_eth"] = _ls("globalLongShortAccountRatio", "ETHUSDT")
    print("[BINANCE] Fetching ls_top ...")
    result["ls_top"] = _ls("topLongShortAccountRatio", "BTCUSDT")
    print("[BINANCE] Fetching taker ...")
    result["taker"] = _taker()
    print("[BINANCE] Fetching vol24 ...")
    result["vol24"] = _vol24()
    for k, v in result.items():
        print(f" [BINANCE] {k}: {len(v)} days")
    _save_cache("binance", result)
    return result


# ----- DERIBIT -------------------------------------------------------------
def fetch_deribit(no_cache=False) -> dict:
    """Returns {indicator_name: {date_str: float}} for DVOL and perp funding."""
    cache = None if no_cache else _load_cache("deribit")
    if cache is not None:
        print("[DERIBIT] Using cache")
        return cache
    result = {}

    def _dvol(currency):
        # Hourly DVOL candles, averaged per day.
        url = (
            f"https://www.deribit.com/api/v2/public/get_volatility_index_data"
            f"?currency={currency}&resolution=3600&start_timestamp={START_MS}&end_timestamp={END_MS}"
        )
        d = _get(url)
        by_day = defaultdict(list)
        if d and "result" in d and isinstance(d["result"], dict):
            for row in d["result"].get("data", []):
                # row = [timestamp_ms, open, high, low, close]
                ts = int(row[0])
                close = _safe_float(row[4]) if len(row) > 4 else float("nan")
                if not math.isnan(close):
                    by_day[_ms_to_date(ts)].append(close)
        return {d: float(np.mean(v)) for d, v in by_day.items()}

    def _deribit_funding(instrument):
        # Paginate backward: Deribit returns at most ~744 records per call (count=1000 cap).
        # With hourly data, 744 records ≈ 31 days. Need two pages to cover 55 days.
        by_day = defaultdict(list)
        cur_end = END_MS
        pages = 0
        while cur_end > START_MS and pages < 10:
            url = (
                f"https://www.deribit.com/api/v2/public/get_funding_rate_history"
                f"?instrument_name={instrument}&start_timestamp={START_MS}&end_timestamp={cur_end}&count=1000"
            )
            d = _get(url)
            if not (d and "result" in d and isinstance(d["result"], list)):
                break
            rows = d["result"]
            if not rows:
                break
            for row in rows:
                ts = int(row.get("timestamp", 0))
                v = _safe_float(row.get("interest_8h"))
                if not math.isnan(v):
                    by_day[_ms_to_date(ts)].append(v)
            # Move the window back past the oldest record seen; bail if no progress.
            oldest_ts = min(int(r.get("timestamp", cur_end)) for r in rows)
            if oldest_ts >= cur_end:
                break
            cur_end = oldest_ts - 1
            pages += 1
            time.sleep(0.3)
        return {d: float(np.mean(v)) for d, v in by_day.items()}

    print("[DERIBIT] Fetching dvol_btc ...")
    result["dvol_btc"] = _dvol("BTC")
    time.sleep(0.5)
    print("[DERIBIT] Fetching dvol_eth ...")
    result["dvol_eth"] = _dvol("ETH")
    time.sleep(0.5)
    print("[DERIBIT] Fetching fund_dbt_btc ...")
    result["fund_dbt_btc"] = _deribit_funding("BTC-PERPETUAL")
    time.sleep(0.5)
    print("[DERIBIT] Fetching fund_dbt_eth ...")
    result["fund_dbt_eth"] = _deribit_funding("ETH-PERPETUAL")
    for k, v in result.items():
        print(f" [DERIBIT] {k}: {len(v)} days")
    _save_cache("deribit", result)
    return result
# ----- COINMETRICS ---------------------------------------------------------
def fetch_coinmetrics(no_cache=False) -> dict:
    """Returns {indicator_name: {date_str: float}} for CoinMetrics on-chain metrics."""
    cache = None if no_cache else _load_cache("coinmetrics")
    if cache is not None:
        print("[COINMETRICS] Using cache")
        return cache
    result = {}
    # CoinMetrics community API: KO permanently — HTTP 403 Forbidden as of 2026-03.
    # Community endpoint is now paywalled. All 12 CoinMetrics indicators are dead.
    BASE = "https://community-api.coinmetrics.io/v4/timeseries/asset-metrics"

    def _cm_fetch(asset, metrics_str):
        # One batched request per asset; returns the raw row list (possibly empty).
        url = (
            f"{BASE}?assets={asset}&metrics={metrics_str}"
            f"&frequency=1d&start_time={START_STR}&end_time={END_STR}&page_size=100"
        )
        d = _get(url, timeout=30)
        rows = []
        if d and "data" in d:
            rows = d["data"]
        return rows

    def _rows_to_series(rows, metric_key):
        # Extract one metric column as {YYYY-MM-DD: float}.
        out = {}
        for row in rows:
            t = row.get("time", "")
            date_s = t[:10]  # YYYY-MM-DD
            v = _safe_float(row.get(metric_key))
            if not math.isnan(v):
                out[date_s] = v
        return out

    # BTC batch
    print("[COINMETRICS] Fetching BTC batch ...")
    btc_metrics = "CapRealUSD,CapMrktCurUSD,AdrActCnt,TxCnt,FeeTotUSD,NVTAdj,VelCur1yr,SplyAct1yr"
    btc_rows = _cm_fetch("btc", btc_metrics)
    time.sleep(7)  # community rate limit
    result["rcap_btc"] = _rows_to_series(btc_rows, "CapRealUSD")
    result["addr_btc"] = _rows_to_series(btc_rows, "AdrActCnt")
    result["txcnt"] = _rows_to_series(btc_rows, "TxCnt")
    result["fees_btc"] = _rows_to_series(btc_rows, "FeeTotUSD")
    result["nvt"] = _rows_to_series(btc_rows, "NVTAdj")
    result["velocity"] = _rows_to_series(btc_rows, "VelCur1yr")
    result["sply_act"] = _rows_to_series(btc_rows, "SplyAct1yr")

    # MVRV and NUPL derived from CapMrktCurUSD + CapRealUSD
    mvrv_d = {}
    nupl_d = {}
    for row in btc_rows:
        t = row.get("time", "")[:10]
        m = _safe_float(row.get("CapMrktCurUSD"))
        rc = _safe_float(row.get("CapRealUSD"))
        if not math.isnan(m) and not math.isnan(rc) and rc > 0:
            mvrv_d[t] = m / rc
            nupl_d[t] = (m - rc) / m if m > 0 else float("nan")
    result["mvrv"] = mvrv_d
    result["nupl"] = nupl_d

    # ETH batch
    print("[COINMETRICS] Fetching ETH batch ...")
    eth_metrics = "CapRealUSD,AdrActCnt,FeeTotUSD"
    eth_rows = _cm_fetch("eth", eth_metrics)
    time.sleep(7)
    result["rcap_eth"] = _rows_to_series(eth_rows, "CapRealUSD")
    result["addr_eth"] = _rows_to_series(eth_rows, "AdrActCnt")
    result["fees_eth"] = _rows_to_series(eth_rows, "FeeTotUSD")

    for k, v in result.items():
        print(f" [COINMETRICS] {k}: {len(v)} days")
    _save_cache("coinmetrics", result)
    return result


# ----- DEFILLAMA -----------------------------------------------------------
def fetch_defillama(no_cache=False) -> dict:
    """Returns {indicator_name: {date_str: float}} for TVL and stablecoin supply."""
    cache = None if no_cache else _load_cache("defillama")
    if cache is not None:
        print("[DEFILLAMA] Using cache")
        return cache
    result = {}
    all_dates_set = set(_date_range_str(START_DT, END_DT))

    def _dl_tvl(url):
        # historicalChainTvl returns [{date: unix_s, tvl: float}, ...]
        d = _get(url)
        out = {}
        if d and isinstance(d, list):
            for entry in d:
                ts = int(entry.get("date", 0))
                ds = _unix_to_date(ts)
                v = _safe_float(entry.get("tvl"))
                if ds in all_dates_set and not math.isnan(v):
                    out[ds] = v
        return out

    def _dl_stables_chart(url):
        """stablecoincharts/all returns array of {date, totalCirculatingUSD: {peggedUSD: N}}"""
        d = _get(url)
        out = {}
        if d and isinstance(d, list):
            for entry in d:
                ts = int(entry.get("date", 0))
                ds = _unix_to_date(ts)
                if ds not in all_dates_set:
                    continue
                circ = entry.get("totalCirculatingUSD")
                # Per-stablecoin responses may return a plain number instead of a dict.
                if isinstance(circ, dict):
                    v = _safe_float(circ.get("peggedUSD"))
                else:
                    v = _safe_float(circ)
                if not math.isnan(v):
                    out[ds] = v
        return out

    print("[DEFILLAMA] Fetching TVL (all chains) ...")
    result["tvl"] = _dl_tvl("https://api.llama.fi/v2/historicalChainTvl")
    time.sleep(1)
    print("[DEFILLAMA] Fetching TVL Ethereum ...")
    result["tvl_eth"] = _dl_tvl("https://api.llama.fi/v2/historicalChainTvl/Ethereum")
    time.sleep(1)
    print("[DEFILLAMA] Fetching stables total (all) ...")
    result["stables"] = _dl_stables_chart("https://stablecoins.llama.fi/stablecoincharts/all")
    time.sleep(1)
    print("[DEFILLAMA] Fetching USDT ...")
    result["usdt"] = _dl_stables_chart("https://stablecoins.llama.fi/stablecoincharts/all?stablecoin=1")
    time.sleep(1)
    print("[DEFILLAMA] Fetching USDC ...")
    result["usdc"] = _dl_stables_chart("https://stablecoins.llama.fi/stablecoincharts/all?stablecoin=2")
    for k, v in result.items():
        print(f" [DEFILLAMA] {k}: {len(v)} days")
    _save_cache("defillama", result)
    return result


# ----- ALTERNATIVE.ME (Fear & Greed) ---------------------------------------
def fetch_fng(no_cache=False) -> dict:
    """Returns fng / fng_prev (lag-1) / fng_week (7-day mean) series."""
    cache = None if no_cache else _load_cache("fng")
    if cache is not None:
        print("[FNG] Using cache")
        return cache
    print("[FNG] Fetching F&G history ...")
    # date_format=us changes timestamp to MM/DD/YYYY strings — int() parse fails silently.
    # Without date_format, timestamp is returned as Unix seconds string — works correctly.
    url = "https://api.alternative.me/fng/?limit=1000"
    d = _get(url)
    fng_raw = {}
    if d and "data" in d:
        for entry in d["data"]:
            ts = entry.get("timestamp")  # Unix seconds as str or int
            v = _safe_float(entry.get("value"))
            if ts is None:
                continue
            try:
                ts_int = int(ts)
                ds = _unix_to_date(ts_int)
            except (TypeError, ValueError):
                continue
            if not math.isnan(v):
                fng_raw[ds] = v
    all_dates = list(_date_range_str(START_DT, END_DT))
    fng_ff = _fill_forward(fng_raw, all_dates)

    # Derived: fng_prev (lag-1), fng_week (7-day rolling mean)
    sorted_dates = sorted(fng_ff.keys())
    fng_prev = {}
    fng_week = {}
    for i, d_str in enumerate(sorted_dates):
        if i >= 1:
            fng_prev[d_str] = fng_ff.get(sorted_dates[i - 1], float("nan"))
        else:
            fng_prev[d_str] = float("nan")
        window = [fng_ff.get(sorted_dates[j], float("nan")) for j in range(max(0, i - 6), i + 1)]
        valid = [x for x in window if not math.isnan(x)]
        fng_week[d_str] = float(np.mean(valid)) if valid else float("nan")

    result = {
        "fng": fng_ff,
        "fng_prev": fng_prev,
        "fng_week": fng_week,
    }
    for k, v in result.items():
        n = sum(1 for x in v.values() if not math.isnan(x))
        print(f" [FNG] {k}: {n} valid days")
    _save_cache("fng", result)
    return result


# ----- BLOCKCHAIN.INFO -----------------------------------------------------
def fetch_blockchain(no_cache=False) -> dict:
    """Returns {indicator_name: {date_str: float}} from blockchain.info charts."""
    cache = None if no_cache else _load_cache("blockchain")
    if cache is not None:
        print("[BLOCKCHAIN] Using cache")
        return cache
    result = {}
    all_dates_set = set(_date_range_str(START_DT, END_DT))
    # name -> (chart endpoint, multiplicative scale applied to the raw y value).
    # total-bitcoins is reported in satoshi, hence the 1e-8 scale.
    CHARTS = {
        "hashrate": ("https://api.blockchain.info/charts/hash-rate", 1.0),
        "difficulty": ("https://api.blockchain.info/charts/difficulty", 1.0),
        "mcap_bc": ("https://api.blockchain.info/charts/market-cap", 1.0),
        "tx_blk": ("https://api.blockchain.info/charts/n-transactions-per-block", 1.0),
        "total_btc": ("https://api.blockchain.info/charts/total-bitcoins", 1e-8),
    }
    for name, (base_url, scale) in CHARTS.items():
        url = f"{base_url}?timespan=90days&start=2025-12-01&format=json"
        print(f"[BLOCKCHAIN] Fetching {name} ...")
        d = _get(url, timeout=30)
        out = {}
        if d and "values" in d:
            for entry in d["values"]:
                ts = int(entry.get("x", 0))
                ds = _unix_to_date(ts)
                v = _safe_float(entry.get("y"))
                if ds in all_dates_set and not math.isnan(v):
                    out[ds] = v * scale
        result[name] = out
        print(f" [BLOCKCHAIN] {name}: {len(out)} days")
        time.sleep(1)
    _save_cache("blockchain", result)
    return result


# ----- COINGECKO -----------------------------------------------------------
def fetch_coingecko(no_cache=False) -> dict:
    """Returns daily-close BTC/ETH USD prices from CoinGecko market_chart."""
    cache = None if no_cache else _load_cache("coingecko")
    if cache is not None:
        print("[COINGECKO] Using cache")
        return cache
    result = {"btc_price": {}, "eth_price": {}}
    all_dates_set = set(_date_range_str(START_DT, END_DT))

    # Use market_chart for a bulk fetch (avoids per-day calls + rate limits)
    def _market_chart(coin_id):
        start_unix = int(START_DT.timestamp())
        end_unix = int(END_DT.timestamp())
        url = f"https://api.coingecko.com/api/v3/coins/{coin_id}/market_chart/range?vs_currency=usd&from={start_unix}&to={end_unix}"
        d = _get(url, timeout=30)
        out = {}
        if d and "prices" in d:
            by_day = defaultdict(list)
            for ts_ms, price in d["prices"]:
                ds = _ms_to_date(int(ts_ms))
                if ds in all_dates_set:
                    by_day[ds].append(_safe_float(price))
            for ds, vals in by_day.items():
                valid = [x for x in vals if not math.isnan(x)]
                if valid:
                    out[ds] = valid[-1]  # last price of the day
        return out

    print("[COINGECKO] Fetching BTC price ...")
    result["btc_price"] = _market_chart("bitcoin")
    time.sleep(2)
    print("[COINGECKO] Fetching ETH price ...")
    result["eth_price"] = _market_chart("ethereum")
    for k, v in result.items():
        print(f" [COINGECKO] {k}: {len(v)} days")
    _save_cache("coingecko", result)
    return result


# ---------------------------------------------------------------------------
# Step 4: Assemble all indicator data
# ---------------------------------------------------------------------------
def assemble_indicators(
    fred_data: dict,
    binance_data: dict,
    deribit_data: dict,
    cm_data: dict,
    dl_data: dict,
    fng_data: dict,
    bc_data: dict,
    cg_data: dict,
) -> dict:
    """
    Merge all source dicts into a single {indicator_name: {date_str: float}} dict.

    On a name collision the series with more observations wins.
    """
    merged = {}
    sources = [
        fred_data, binance_data, deribit_data, cm_data,
        dl_data, fng_data, bc_data, cg_data,
    ]
    for src in sources:
        for name, daily in src.items():
            if name in merged:
                # Prefer whichever has more data
                if len(daily) > len(merged[name]):
                    merged[name] = daily
            else:
                merged[name] = daily
    return merged


# ---------------------------------------------------------------------------
# Step 5: Correlation analysis
# ---------------------------------------------------------------------------
LAGS = [0, 1, 2, 3, 5, 7]


def compute_correlations(
    trading_days: list,
    returns: dict,
    indicators: dict,
) -> list:
    """
    For each indicator, test multiple lags (trading-day offsets in LAGS).

    For each (indicator, lag) pair, computes Pearson and Spearman correlations
    of the lagged indicator vs. daily returns, plus a point-biserial
    correlation against a "stress day" flag (return < -1%).

    Returns a flat list of row dicts for the correlation table.
    """
    # Aligned return vector over trading days (NaN where missing).
    ret_vec = np.array([returns.get(d, float("nan")) for d in trading_days])
    rows = []
    for ind_name, daily in indicators.items():
        # Pre-build the per-trading-day indicator value, looking back up to
        # 5 calendar days (fill-forward) for sources that skip weekends.
        # (Removed dead locals all_cal/cal_set — they were never used.)
        ind_series = {}
        for d in trading_days:
            v = daily.get(d)
            if v is None:
                dt = datetime.strptime(d, "%Y-%m-%d")
                for back in range(1, 6):
                    candidate = (dt - timedelta(days=back)).strftime("%Y-%m-%d")
                    if candidate in daily:
                        v = daily[candidate]
                        break
            ind_series[d] = v if v is not None else float("nan")

        for lag in LAGS:
            # indicator at trading_day[i - lag] predicts return at trading_day[i]
            n_td = len(trading_days)
            x_vals = []
            y_vals = []
            for i in range(n_td):
                y = ret_vec[i]
                if lag == 0:
                    src_day = trading_days[i]
                else:
                    src_idx = i - lag
                    if src_idx < 0:
                        # No history this early; pair is dropped by the NaN mask.
                        x_vals.append(float("nan"))
                        y_vals.append(y)
                        continue
                    src_day = trading_days[src_idx]
                x = ind_series.get(src_day, float("nan"))
                x_vals.append(x)
                y_vals.append(y)

            x_arr = np.array(x_vals, dtype=float)
            y_arr = np.array(y_vals, dtype=float)
            # Drop NaN pairs
            valid = ~(np.isnan(x_arr) | np.isnan(y_arr))
            xv = x_arr[valid]
            yv = y_arr[valid]
            n = int(valid.sum())
            if n < 10:
                rows.append({
                    "name": ind_name, "lag": lag, "n_days": n,
                    "pearson_r": float("nan"), "pearson_p": float("nan"),
                    "spearman_r": float("nan"), "spearman_p": float("nan"),
                    "stress_r": float("nan"), "stress_p": float("nan"),
                    "note": "insufficient data",
                })
                continue

            # Pearson (raises on constant input — treated as no result)
            try:
                pr, pp = stats.pearsonr(xv, yv)
            except Exception:
                pr, pp = float("nan"), float("nan")
            # Spearman
            try:
                sr, sp = stats.spearmanr(xv, yv)
            except Exception:
                sr, sp = float("nan"), float("nan")
            # Point-biserial: stress day flag (return < -0.01); undefined
            # when all days are on one side of the threshold.
            stress_flag = (yv < -0.01).astype(float)
            try:
                if stress_flag.std() > 0:
                    pbr, pbp = stats.pointbiserialr(stress_flag.astype(bool), xv)
                else:
                    pbr, pbp = float("nan"), float("nan")
            except Exception:
                pbr, pbp = float("nan"), float("nan")

            rows.append({
                "name": ind_name, "lag": lag, "n_days": n,
                "pearson_r": float(pr), "pearson_p": float(pp),
                "spearman_r": float(sr), "spearman_p": float(sp),
                "stress_r": float(pbr), "stress_p": float(pbp),
                "note": "ok",
            })
    return rows


# ---------------------------------------------------------------------------
# Step 6: Build summary (best lag per indicator)
# ---------------------------------------------------------------------------
def build_summary(corr_rows: list, indicators: dict, trading_days: list) -> list:
    """Best lag per indicator, sorted by |pearson_r|.

    Indicators with no computable correlation get a placeholder row
    (lag=-1) so every indicator appears exactly once in the table.
    """
    # (Removed redundant `from collections import defaultdict` — it was
    # unused here and defaultdict is already imported at module level.)
    best = {}
    for row in corr_rows:
        name = row["name"]
        r = row["pearson_r"]
        if math.isnan(r):
            continue
        if name not in best or abs(r) > abs(best[name]["pearson_r"]):
            best[name] = row
    # Add indicators that had no valid data
    for name in indicators:
        if name not in best:
            # Count non-NaN days
            n_ok = sum(1 for d in trading_days if not math.isnan(
                _safe_float(indicators[name].get(d))
            ))
            best[name] = {
                "name": name, "lag": -1, "n_days": n_ok,
                "pearson_r": float("nan"), "pearson_p": float("nan"),
                "spearman_r": float("nan"), "spearman_p": float("nan"),
                "stress_r": float("nan"), "stress_p": float("nan"),
                "note": "no valid correlation",
            }

    # Assign source labels
    SOURCE_MAP = {
        **{k: "fred" for k in FRED_SERIES},
        "funding_btc": "binance", "funding_eth": "binance",
        "oi_btc": "binance", "oi_eth": "binance",
        "ls_btc": "binance", "ls_eth": "binance",
        "ls_top": "binance", "taker": "binance", "vol24": "binance",
        "dvol_btc": "deribit", "dvol_eth": "deribit",
        "fund_dbt_btc": "deribit", "fund_dbt_eth": "deribit",
        "rcap_btc": "coinmetrics", "mvrv": "coinmetrics", "nupl": "coinmetrics",
        "addr_btc": "coinmetrics", "addr_eth": "coinmetrics", "txcnt": "coinmetrics",
        "fees_btc": "coinmetrics", "fees_eth": "coinmetrics", "nvt": "coinmetrics",
        "velocity": "coinmetrics", "sply_act": "coinmetrics", "rcap_eth": "coinmetrics",
        "tvl": "defillama", "tvl_eth": "defillama", "stables": "defillama",
        "usdt": "defillama", "usdc": "defillama",
        "fng": "alternative", "fng_prev": "alternative", "fng_week": "alternative",
        "hashrate": "blockchain", "difficulty": "blockchain", "mcap_bc": "blockchain",
        "tx_blk": "blockchain", "total_btc": "blockchain",
        "btc_price": "coingecko", "eth_price": "coingecko",
    }

    rows_sorted = sorted(
        best.values(),
        key=lambda x: abs(x["pearson_r"]) if not math.isnan(x["pearson_r"]) else -1,
        reverse=True,
    )
    summary = []
    for rank, row in enumerate(rows_sorted, 1):
        name = row["name"]
        # Significance stars: "*" for p<0.05, "**" for p<0.01.
        sig = ""
        if not math.isnan(row.get("pearson_p", float("nan"))):
            sig = "*" if row["pearson_p"] < 0.05 else ""
            sig += "*" if row["pearson_p"] < 0.01 else ""
        summary.append({
            "rank": rank,
            "name": name,
            "source": SOURCE_MAP.get(name, "?"),
            "best_lag": row["lag"],
            "best_r": row["pearson_r"],
            "best_p": row["pearson_p"],
            "spearman": row["spearman_r"],
            "stress_r": row["stress_r"],
            "n_days": row["n_days"],
            "sig": sig,
            "note": row.get("note", ""),
        })
    return summary


# ---------------------------------------------------------------------------
# Step 7: Save outputs
# ---------------------------------------------------------------------------
def save_daily_csv(trading_days: list, indicators: dict, returns: dict):
    """Write one row per trading day with the return and every indicator value."""
    path = RUN_LOGS / f"exf_daily_{TS_NOW}.csv"
    cols = ["date", "daily_return"] + sorted(indicators.keys())
    with open(path, "w", newline="") as f:
        w = csv.DictWriter(f, fieldnames=cols)
        w.writeheader()
        for d in trading_days:
            row = {"date": d, "daily_return": returns.get(d, "")}
            for name, series in indicators.items():
                v = series.get(d, float("nan"))
                row[name] = "" if math.isnan(v) else v
            w.writerow(row)
    print(f"[SAVE] Daily values: {path}")


def save_corr_csv(corr_rows: list):
    """Write the full per-(indicator, lag) correlation table; no-op when empty."""
    path = RUN_LOGS / f"exf_correlation_{TS_NOW}.csv"
    if not corr_rows:
        return
    cols = list(corr_rows[0].keys())
    with open(path, "w", newline="") as f:
        w = csv.DictWriter(f, fieldnames=cols)
        w.writeheader()
        w.writerows(corr_rows)
    print(f"[SAVE] Correlation table: {path}")


def print_summary(summary: list):
    """Pretty-print the ranked summary table plus the significant-predictor digest."""
    print()
    print("=" * 110)
    print(f"{'rank':>4} {'name':<14} {'source':<12} {'best_lag':>8} {'best_r':>8} {'best_p':>8} "
          f"{'spearman':>8} {'stress_r':>8} {'n_days':>6} {'sig':>4}")
    print("=" * 110)
    for row in summary:
        r_str = f"{row['best_r']:+.4f}" if not math.isnan(row.get("best_r", float("nan"))) else " n/a "
        p_str = f"{row['best_p']:.4f}" if not math.isnan(row.get("best_p", float("nan"))) else " n/a "
        sp_str = f"{row['spearman']:+.4f}" if not math.isnan(row.get("spearman", float("nan"))) else " n/a "
        sr_str = f"{row['stress_r']:+.4f}" if not math.isnan(row.get("stress_r", float("nan"))) else " n/a "
        lag_str = f"lag={row['best_lag']}" if row['best_lag'] >= 0 else " - "
        print(
            f"{row['rank']:>4} {row['name']:<14} {row['source']:<12} {lag_str:>8} "
            f"{r_str:>8} {p_str:>8} {sp_str:>8} {sr_str:>8} {row['n_days']:>6} {row['sig']:>4}"
        )
    print("=" * 110)
    # Top 10 highlight
    sig_rows = [r for r in summary if not math.isnan(r.get("best_p", float("nan"))) and r["best_p"] < 0.05]
    print(f"\n[RESULT] {len(sig_rows)} indicators significant at p<0.05 (any lag)")
    if sig_rows:
        print("TOP PREDICTORS:")
        for r in sig_rows[:15]:
            print(f" {r['rank']:>3}. {r['name']:<14} lag={r['best_lag']} "
                  f"r={r['best_r']:+.4f} p={r['best_p']:.4f} spearman={r['spearman']:+.4f} "
                  f"n={r['n_days']}")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def _fetch_safely(header: str, label: str, fetch_fn, no_cache: bool) -> dict:
    """Run one source fetcher behind a section banner; return {} on any failure.

    Factors out the identical try/print pattern that was repeated for all
    eight sources in main(); output strings are unchanged.
    """
    print(f"\n--- {header} ---")
    try:
        data = fetch_fn(no_cache)
        print(f" {label}: {len(data)} indicators OK")
        return data
    except Exception as e:
        print(f" {label} FAILED: {e}")
        return {}


def main():
    parser = argparse.ArgumentParser(description="ExF historical correlation tester")
    parser.add_argument("--no-cache", action="store_true", help="Ignore and overwrite all caches")
    args = parser.parse_args()
    no_cache = args.no_cache

    print("=" * 70)
    print(" DOLPHIN ExF Correlation Tester")
    print(f" Date range: {START_STR} → 2026-02-25")
    print(f" Cache: {'DISABLED (--no-cache)' if no_cache else 'ENABLED'}")
    print("=" * 70)

    # --- Trading days
    trading_days = get_trading_days()
    if len(trading_days) == 0:
        print("[ERROR] No parquet files found in vbt_cache. Abort.")
        sys.exit(1)

    # --- Daily returns
    print("\n[STEP 1] Loading daily returns ...")
    try:
        returns = load_daily_returns(trading_days)
    except RuntimeError as e:
        print(f"[ERROR] {e}")
        sys.exit(1)
    valid_ret = sum(1 for v in returns.values() if not math.isnan(v))
    print(f" Returns: {valid_ret}/{len(trading_days)} valid")

    # --- Fetch all sources (each failure degrades to an empty dict)
    print("\n[STEP 2] Fetching external factor data ...")
    fred_data = _fetch_safely("FRED", "FRED", fetch_fred, no_cache)
    binance_data = _fetch_safely("BINANCE DERIVATIVES", "BINANCE", fetch_binance, no_cache)
    deribit_data = _fetch_safely("DERIBIT", "DERIBIT", fetch_deribit, no_cache)
    cm_data = _fetch_safely("COINMETRICS", "COINMETRICS", fetch_coinmetrics, no_cache)
    dl_data = _fetch_safely("DEFILLAMA", "DEFILLAMA", fetch_defillama, no_cache)
    fng_data = _fetch_safely("FEAR & GREED (alternative.me)", "FNG", fetch_fng, no_cache)
    bc_data = _fetch_safely("BLOCKCHAIN.INFO", "BLOCKCHAIN", fetch_blockchain, no_cache)
    cg_data = _fetch_safely("COINGECKO", "COINGECKO", fetch_coingecko, no_cache)

    # --- Assemble
    print("\n[STEP 3] Assembling indicators ...")
    indicators = assemble_indicators(
        fred_data, binance_data, deribit_data, cm_data,
        dl_data, fng_data, bc_data, cg_data,
    )
    print(f" Total indicators assembled: {len(indicators)}")

    # Coverage report
    print("\n Coverage per indicator over trading days:")
    for name in sorted(indicators.keys()):
        series = indicators[name]
        n_ok = sum(
            1 for d in trading_days
            if not math.isnan(_safe_float(series.get(d)))
        )
        pct = 100 * n_ok / len(trading_days)
        status = "OK" if n_ok >= 40 else ("PARTIAL" if n_ok >= 10 else "FAIL")
        print(f" {name:<18} {n_ok:>3}/{len(trading_days)} ({pct:5.1f}%) [{status}]")

    # --- Correlation analysis
    print("\n[STEP 4] Computing correlations (lags 0-7) ...")
    corr_rows = compute_correlations(trading_days, returns, indicators)
    print(f" Computed {len(corr_rows)} rows ({len(indicators)} indicators × {len(LAGS)} lags)")

    # --- Summary
    summary = build_summary(corr_rows, indicators, trading_days)

    # --- Save
    print("\n[STEP 5] Saving outputs ...")
    save_daily_csv(trading_days, indicators, returns)
    save_corr_csv(corr_rows)

    # --- Print table
    print("\n[STEP 6] Results")
    print_summary(summary)

    # Also print status for each source
    print("\n[SOURCE STATUS]")
    source_ok = {
        "fred": "OK" if fred_data else "FAIL",
        "binance": "OK" if binance_data else "FAIL",
        "deribit": "OK" if deribit_data else "FAIL",
        "coinmetrics": "OK" if cm_data else "FAIL",
        "defillama": "OK" if dl_data else "FAIL",
        "alternative": "OK" if fng_data else "FAIL",
        "blockchain": "OK" if bc_data else "FAIL",
        "coingecko": "OK" if cg_data else "FAIL",
    }
    for src, status in source_ok.items():
        print(f" {src:<14} {status}")

    print("\n[DONE] ExF correlation analysis complete.")


if __name__ == "__main__":
    main()