# Source: DOLPHIN/nautilus_dolphin/test_exf_correlation.py

#!/usr/bin/env python3
"""
test_exf_correlation.py
=======================
Fetch historical data for all ExF indicators with FULL/PARTIAL history,
align to 55 trading days (2025-12-31 → 2026-02-25), and test each for
predictiveness against daily trading returns.
Usage:
python test_exf_correlation.py [--no-cache]
"""
import sys
import os
import time
import json
import csv
import math
import argparse
import requests
from pathlib import Path
from datetime import datetime, timezone, timedelta, date
from collections import defaultdict
import numpy as np
import pandas as pd
from scipy import stats
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# Project layout roots. NOTE: hard-coded absolute Windows path — adjust per machine.
BASE_DIR = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict")
VBT_DIR = BASE_DIR / "vbt_cache"
EXF_DIR = BASE_DIR / "external_factors"
NAUTILUS_DIR = BASE_DIR / "nautilus_dolphin"
RUN_LOGS = NAUTILUS_DIR / "run_logs"
RUN_LOGS.mkdir(parents=True, exist_ok=True)
# SECURITY: the FRED API key was committed in plain text. Prefer the
# FRED_API_KEY environment variable; fall back to the embedded key so
# existing setups keep working unchanged.
FRED_KEY = os.environ.get("FRED_API_KEY", "c16a9cde3e3bb5bb972bb9283485f202")
# ---------------------------------------------------------------------------
# Date range
# ---------------------------------------------------------------------------
# UTC study window, inclusive of both endpoints.
START_DT = datetime(2025, 12, 31, tzinfo=timezone.utc)
END_DT = datetime(2026, 2, 25, 23, 59, 59, tzinfo=timezone.utc)
# Millisecond epoch bounds for APIs that take ms timestamps (Binance, Deribit).
START_MS = int(START_DT.timestamp() * 1000)
END_MS = int(END_DT.timestamp() * 1000)
START_STR = "2025-12-31"
END_STR = "2026-02-26" # exclusive for CoinMetrics end_time
# Local-time tag appended to every output file of this run.
TS_NOW = datetime.now().strftime("%Y%m%d_%H%M%S")
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _date_range_str(start: datetime, end: datetime):
"""Yield YYYY-MM-DD strings for every calendar day from start to end inclusive."""
cur = start.date()
stop = end.date()
while cur <= stop:
yield cur.strftime("%Y-%m-%d")
cur += timedelta(days=1)
def _ms_to_date(ms: int) -> str:
return datetime.fromtimestamp(ms / 1000, tz=timezone.utc).strftime("%Y-%m-%d")
def _unix_to_date(ts: int) -> str:
return datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%d")
def _safe_float(v):
if v is None:
return float("nan")
try:
f = float(v)
if math.isfinite(f):
return f
return float("nan")
except (TypeError, ValueError):
return float("nan")
def _fill_forward(series: dict, date_list: list) -> dict:
"""Fill-forward a {date_str: float} dict over date_list."""
result = {}
last = float("nan")
for d in date_list:
v = series.get(d, float("nan"))
if not math.isnan(v):
last = v
result[d] = last
return result
def _cache_path(source: str) -> Path:
    """Return the path of the raw-response JSON cache for a source name."""
    filename = f"exf_raw_cache_{source}.json"
    return RUN_LOGS / filename
def _load_cache(source: str):
    """Return the cached JSON payload for source, or None if absent/unreadable."""
    path = _cache_path(source)
    if not path.exists():
        return None
    try:
        with open(path, "r") as fh:
            return json.load(fh)
    except Exception:
        # Corrupt/partial cache file — treat as a cache miss.
        return None
def _save_cache(source: str, data):
    """Write data as JSON to the source's cache file, overwriting any old copy."""
    with open(_cache_path(source), "w") as fh:
        json.dump(data, fh)
def _get(url: str, params=None, timeout=20, headers=None) -> dict | list | None:
    """HTTP GET returning the parsed JSON body, or None on any error (logged)."""
    try:
        # Default UA avoids naive bot-blocking; caller headers take precedence.
        merged = {"User-Agent": "Mozilla/5.0"}
        if headers:
            merged.update(headers)
        resp = requests.get(url, params=params, timeout=timeout, headers=merged)
        resp.raise_for_status()
        return resp.json()
    except Exception as e:
        print(f" [HTTP ERROR] {url[:80]} -> {e}")
        return None
# ---------------------------------------------------------------------------
# Step 1: Get trading day list from VBT parquet filenames
# ---------------------------------------------------------------------------
def get_trading_days() -> list:
    """List trading days (YYYY-MM-DD strings) from vbt_cache parquet filenames.

    Keeps only stems that parse as dates within 2025-12-31 .. 2026-02-25
    inclusive; non-date filenames are silently skipped.

    Returns:
        Sorted list of date strings (possibly empty).
    """
    days = []
    for f in sorted(VBT_DIR.glob("*.parquet")):
        stem = f.stem  # expected YYYY-MM-DD
        try:
            dt = datetime.strptime(stem, "%Y-%m-%d").date()
        except ValueError:
            continue  # not a date-named file; ignore
        if date(2025, 12, 31) <= dt <= date(2026, 2, 25):
            days.append(stem)
    # BUG FIX: the original printed days[0]/days[-1] unconditionally and
    # raised IndexError on an empty vbt_cache before main() could report the
    # condition cleanly; also restores the "→" separator between endpoints.
    if days:
        print(f"[TRADING DAYS] Found {len(days)} days: {days[0]} → {days[-1]}")
    else:
        print("[TRADING DAYS] Found 0 days")
    return days
# ---------------------------------------------------------------------------
# Step 2: Load daily returns
# ---------------------------------------------------------------------------
def load_daily_returns(trading_days: list) -> dict:
    """
    Returns {date_str: pnl_normalized} where pnl_normalized = pnl / capital_start_of_day.
    Tries to load from most recent run_logs/daily_*.csv first.

    Raises:
        RuntimeError: if no run_logs/daily_*.csv file exists.
    """
    csvs = sorted(RUN_LOGS.glob("daily_*.csv"), reverse=True)
    if not csvs:
        raise RuntimeError(
            "No run_logs/daily_*.csv found. Run test_pf_dynamic_beta_validate.py first."
        )
    latest = csvs[0]
    print(f"[RETURNS] Loading from {latest.name}")
    df = pd.read_csv(latest)
    df["date"] = df["date"].astype(str)
    # FIX: the original comment claimed "start capital = capital + pnl",
    # contradicting the code below. The CSV `capital` column is end-of-day
    # capital, so start-of-day capital is capital - pnl and
    # pnl_normalized = pnl / start_capital.
    wanted = set(trading_days)  # PERF: O(1) membership vs. list scan per CSV row
    returns = {}
    for _, row in df.iterrows():
        d = str(row["date"])
        if d not in wanted:
            continue
        pnl = float(row["pnl"])
        cap = float(row["capital"])
        start_cap = cap - pnl  # reconstruct start-of-day capital
        returns[d] = pnl / start_cap if start_cap > 0 else float("nan")
    print(f"[RETURNS] Loaded {len(returns)} dates. Sample: {list(returns.items())[:3]}")
    return returns
# ---------------------------------------------------------------------------
# Step 3: Fetch functions — one call per source
# ---------------------------------------------------------------------------
# ----- FRED ----------------------------------------------------------------
# FRED series IDs keyed by the short indicator name used throughout the outputs.
FRED_SERIES = {
    "dxy": "DTWEXBGS",       # trade-weighted USD index
    "us10y": "DGS10",        # US 10-year yield
    "us2y": "DGS2",          # US 2-year yield
    "ycurve": "T10Y2Y",      # 10y-2y yield-curve spread
    "vix": "VIXCLS",         # VIX close
    "fedfunds": "DFF",       # fed funds rate
    "m2": "WM2NS",           # M2 money stock
    "cpi": "CPIAUCSL",       # CPI
    "sp500": "SP500",        # S&P 500
    # gold: KO — GOLDAMGBD228NLBM/GOLDPMGBD228NLBM/GOLD all return HTTP 400 (series licensing/discontinued)
    "hy_spread":"BAMLH0A0HYM2",  # high-yield spread
    "be5y": "T5YIE",         # 5-year breakeven inflation
    "nfci": "NFCI",          # financial conditions index
    "claims": "ICSA",        # initial jobless claims
}
def fetch_fred(no_cache=False) -> dict:
    """Returns {indicator_name: {date_str: float}} for all FRED indicators."""
    cached = None if no_cache else _load_cache("fred")
    if cached is not None:
        print("[FRED] Using cache")
        return cached
    all_dates = list(_date_range_str(START_DT, END_DT))
    result = {}
    for name, series_id in FRED_SERIES.items():
        url = (
            f"https://api.stlouisfed.org/fred/series/observations"
            f"?series_id={series_id}"
            f"&api_key={FRED_KEY}"
            f"&file_type=json"
            f"&observation_start=2025-12-01"
            f"&observation_end=2026-03-01"
            f"&sort_order=asc"
        )
        payload = _get(url)
        raw = {}
        if payload and "observations" in payload:
            for obs in payload["observations"]:
                value = obs.get("value", ".")
                if value == ".":
                    continue  # FRED encodes missing observations as "."
                fv = _safe_float(value)
                if not math.isnan(fv):
                    raw[obs.get("date", "")] = fv
        # Forward-fill so weekend/holiday gaps carry the last observation.
        filled = _fill_forward(raw, all_dates)
        result[name] = filled
        n_ok = sum(1 for v in filled.values() if not math.isnan(v))
        print(f" [FRED] {name:12s} ({series_id:20s}): {n_ok}/{len(all_dates)} obs")
        time.sleep(0.15)
    _save_cache("fred", result)
    return result
# ----- BINANCE DERIVATIVES -------------------------------------------------
def _bin_hourly_to_daily(records: list, key: str, agg="mean") -> dict:
    """Group hourly Binance records to daily. agg='mean' or 'last'."""
    grouped = defaultdict(list)
    for record in records:
        # Binance payloads carry either "timestamp" or "fundingTime" (ms).
        stamp = int(record.get("timestamp", record.get("fundingTime", 0)))
        value = _safe_float(record.get(key))
        if math.isnan(value):
            continue
        grouped[_ms_to_date(stamp)].append(value)
    daily = {}
    for day, values in grouped.items():
        if values:
            daily[day] = values[-1] if agg == "last" else float(np.mean(values))
    return daily
def fetch_binance(no_cache=False) -> dict:
    """Fetch Binance derivatives/spot indicators.

    Returns {indicator_name: {date_str: float}}. Funding and 24h volume are
    fetched live; the OI/long-short/taker series return empty dicts (see the
    retention-wall note below).
    """
    cache = None if no_cache else _load_cache("binance")
    if cache is not None:
        print("[BINANCE] Using cache")
        return cache
    result = {}
    # --- Funding BTC
    def _funding(symbol):
        # Forward-paginate funding prints in 500-record chunks
        # (500 events * 8h per funding interval, expressed in ms).
        all_recs = []
        cur_start = START_MS
        while cur_start < END_MS:
            chunk_end = min(cur_start + 500 * 8 * 3600 * 1000, END_MS)
            url = f"https://fapi.binance.com/fapi/v1/fundingRate?symbol={symbol}&startTime={cur_start}&endTime={chunk_end}&limit=500"
            d = _get(url)
            if d and isinstance(d, list):
                all_recs.extend(d)
                if len(d) < 500:
                    break  # short page -> exhausted the range
                cur_start = int(d[-1]["fundingTime"]) + 1  # resume just past last record
            else:
                break  # HTTP error or unexpected payload
        # Keep the last funding print of each day.
        return _bin_hourly_to_daily(all_recs, "fundingRate", agg="last")
    # --- OI / LS / Taker: KO — Binance Futures stats endpoints retain only ~30 days of history.
    # Requesting data from 2025-12-31 (63 days ago) returns HTTP 400 -1130 "startTime invalid".
    # These require live collection; cannot be retrofitted historically.
    def _oi(symbol):
        print(f" [KO] openInterestHist({symbol}): 30-day retention wall — returning empty")
        return {}
    def _ls(endpoint, symbol):
        print(f" [KO] {endpoint}({symbol}): 30-day retention wall — returning empty")
        return {}
    def _taker():
        print(f" [KO] takerlongshortRatio: 30-day retention wall — returning empty")
        return {}
    # --- Volume (spot klines daily)
    def _vol24():
        url = f"https://api.binance.com/api/v3/klines?symbol=BTCUSDT&interval=1d&startTime={START_MS}&endTime={END_MS}&limit=100"
        d = _get(url)
        result_v = {}
        if d and isinstance(d, list):
            for bar in d:
                ts = int(bar[0])  # kline open time (ms)
                date_s = _ms_to_date(ts)
                v = _safe_float(bar[7])  # quote volume
                if not math.isnan(v):
                    result_v[date_s] = v
        return result_v
    print("[BINANCE] Fetching funding_btc ...")
    result["funding_btc"] = _funding("BTCUSDT")
    print("[BINANCE] Fetching funding_eth ...")
    result["funding_eth"] = _funding("ETHUSDT")
    print("[BINANCE] Fetching oi_btc ...")
    result["oi_btc"] = _oi("BTCUSDT")
    print("[BINANCE] Fetching oi_eth ...")
    result["oi_eth"] = _oi("ETHUSDT")
    print("[BINANCE] Fetching ls_btc ...")
    result["ls_btc"] = _ls("globalLongShortAccountRatio", "BTCUSDT")
    print("[BINANCE] Fetching ls_eth ...")
    result["ls_eth"] = _ls("globalLongShortAccountRatio", "ETHUSDT")
    print("[BINANCE] Fetching ls_top ...")
    result["ls_top"] = _ls("topLongShortAccountRatio", "BTCUSDT")
    print("[BINANCE] Fetching taker ...")
    result["taker"] = _taker()
    print("[BINANCE] Fetching vol24 ...")
    result["vol24"] = _vol24()
    for k, v in result.items():
        print(f" [BINANCE] {k}: {len(v)} days")
    _save_cache("binance", result)
    return result
# ----- DERIBIT -------------------------------------------------------------
def fetch_deribit(no_cache=False) -> dict:
    """Fetch Deribit DVOL indices and perpetual funding for BTC/ETH.

    Returns {indicator_name: {date_str: float}} where each value is the daily
    mean of the hourly observations.
    """
    cache = None if no_cache else _load_cache("deribit")
    if cache is not None:
        print("[DERIBIT] Using cache")
        return cache
    result = {}
    def _dvol(currency):
        # Hourly volatility-index candles over the window, averaged per day.
        url = (
            f"https://www.deribit.com/api/v2/public/get_volatility_index_data"
            f"?currency={currency}&resolution=3600&start_timestamp={START_MS}&end_timestamp={END_MS}"
        )
        d = _get(url)
        by_day = defaultdict(list)
        if d and "result" in d and isinstance(d["result"], dict):
            for row in d["result"].get("data", []):
                # row = [timestamp_ms, open, high, low, close]
                ts = int(row[0])
                close = _safe_float(row[4]) if len(row) > 4 else float("nan")
                if not math.isnan(close):
                    by_day[_ms_to_date(ts)].append(close)
        return {d: float(np.mean(v)) for d, v in by_day.items()}
    def _deribit_funding(instrument):
        # Paginate backward: Deribit returns at most ~744 records per call (count=1000 cap).
        # With hourly data, 744 records ≈ 31 days. Need two pages to cover 55 days.
        by_day = defaultdict(list)
        cur_end = END_MS
        pages = 0  # hard page cap guards against endless pagination
        while cur_end > START_MS and pages < 10:
            url = (
                f"https://www.deribit.com/api/v2/public/get_funding_rate_history"
                f"?instrument_name={instrument}&start_timestamp={START_MS}&end_timestamp={cur_end}&count=1000"
            )
            d = _get(url)
            if not (d and "result" in d and isinstance(d["result"], list)):
                break
            rows = d["result"]
            if not rows:
                break
            for row in rows:
                ts = int(row.get("timestamp", 0))
                v = _safe_float(row.get("interest_8h"))
                if not math.isnan(v):
                    by_day[_ms_to_date(ts)].append(v)
            # Step the window back to just before the oldest record of this page.
            oldest_ts = min(int(r.get("timestamp", cur_end)) for r in rows)
            if oldest_ts >= cur_end:
                break  # no progress — avoid an infinite loop
            cur_end = oldest_ts - 1
            pages += 1
            time.sleep(0.3)
        return {d: float(np.mean(v)) for d, v in by_day.items()}
    print("[DERIBIT] Fetching dvol_btc ...")
    result["dvol_btc"] = _dvol("BTC")
    time.sleep(0.5)
    print("[DERIBIT] Fetching dvol_eth ...")
    result["dvol_eth"] = _dvol("ETH")
    time.sleep(0.5)
    print("[DERIBIT] Fetching fund_dbt_btc ...")
    result["fund_dbt_btc"] = _deribit_funding("BTC-PERPETUAL")
    time.sleep(0.5)
    print("[DERIBIT] Fetching fund_dbt_eth ...")
    result["fund_dbt_eth"] = _deribit_funding("ETH-PERPETUAL")
    for k, v in result.items():
        print(f" [DERIBIT] {k}: {len(v)} days")
    _save_cache("deribit", result)
    return result
# ----- COINMETRICS ---------------------------------------------------------
def fetch_coinmetrics(no_cache=False) -> dict:
    """Fetch CoinMetrics community asset metrics for BTC and ETH.

    Returns {indicator_name: {date_str: float}}. NOTE: per the comment below
    the community endpoint is paywalled, so series may come back empty.
    """
    cache = None if no_cache else _load_cache("coinmetrics")
    if cache is not None:
        print("[COINMETRICS] Using cache")
        return cache
    result = {}
    # CoinMetrics community API: KO permanently — HTTP 403 Forbidden as of 2026-03.
    # Community endpoint is now paywalled. All 12 CoinMetrics indicators are dead.
    BASE = "https://community-api.coinmetrics.io/v4/timeseries/asset-metrics"
    def _cm_fetch(asset, metrics_str):
        # One batched request covering every requested metric of one asset.
        url = (
            f"{BASE}?assets={asset}&metrics={metrics_str}"
            f"&frequency=1d&start_time={START_STR}&end_time={END_STR}&page_size=100"
        )
        d = _get(url, timeout=30)
        rows = []
        if d and "data" in d:
            rows = d["data"]
        return rows
    def _rows_to_series(rows, metric_key):
        # Project a single metric column out of the batched rows.
        out = {}
        for row in rows:
            t = row.get("time", "")
            date_s = t[:10]  # YYYY-MM-DD
            v = _safe_float(row.get(metric_key))
            if not math.isnan(v):
                out[date_s] = v
        return out
    # BTC batch
    print("[COINMETRICS] Fetching BTC batch ...")
    btc_metrics = "CapRealUSD,CapMrktCurUSD,AdrActCnt,TxCnt,FeeTotUSD,NVTAdj,VelCur1yr,SplyAct1yr"
    btc_rows = _cm_fetch("btc", btc_metrics)
    time.sleep(7)  # long pause between batches (rate limiting)
    result["rcap_btc"] = _rows_to_series(btc_rows, "CapRealUSD")
    result["addr_btc"] = _rows_to_series(btc_rows, "AdrActCnt")
    result["txcnt"] = _rows_to_series(btc_rows, "TxCnt")
    result["fees_btc"] = _rows_to_series(btc_rows, "FeeTotUSD")
    result["nvt"] = _rows_to_series(btc_rows, "NVTAdj")
    result["velocity"] = _rows_to_series(btc_rows, "VelCur1yr")
    result["sply_act"] = _rows_to_series(btc_rows, "SplyAct1yr")
    # MVRV and NUPL from CapMrktCurUSD + CapRealUSD
    mvrv_d = {}
    nupl_d = {}
    for row in btc_rows:
        t = row.get("time", "")[:10]
        m = _safe_float(row.get("CapMrktCurUSD"))
        rc = _safe_float(row.get("CapRealUSD"))
        if not math.isnan(m) and not math.isnan(rc) and rc > 0:
            mvrv_d[t] = m / rc
            nupl_d[t] = (m - rc) / m if m > 0 else float("nan")
    result["mvrv"] = mvrv_d
    result["nupl"] = nupl_d
    # ETH batch
    print("[COINMETRICS] Fetching ETH batch ...")
    eth_metrics = "CapRealUSD,AdrActCnt,FeeTotUSD"
    eth_rows = _cm_fetch("eth", eth_metrics)
    time.sleep(7)
    result["rcap_eth"] = _rows_to_series(eth_rows, "CapRealUSD")
    result["addr_eth"] = _rows_to_series(eth_rows, "AdrActCnt")
    result["fees_eth"] = _rows_to_series(eth_rows, "FeeTotUSD")
    for k, v in result.items():
        print(f" [COINMETRICS] {k}: {len(v)} days")
    _save_cache("coinmetrics", result)
    return result
# ----- DEFILLAMA -----------------------------------------------------------
def fetch_defillama(no_cache=False) -> dict:
    """Fetch DefiLlama TVL and stablecoin-supply series.

    Returns {indicator_name: {date_str: float}} restricted to the study
    window; serves from the on-disk cache unless no_cache is True.
    """
    cached = None if no_cache else _load_cache("defillama")
    if cached is not None:
        print("[DEFILLAMA] Using cache")
        return cached
    valid_days = set(_date_range_str(START_DT, END_DT))

    def _tvl_series(url):
        # historicalChainTvl returns a list of {date: unix_s, tvl: float}.
        payload = _get(url)
        series = {}
        if payload and isinstance(payload, list):
            for entry in payload:
                day = _unix_to_date(int(entry.get("date", 0)))
                value = _safe_float(entry.get("tvl"))
                if day in valid_days and not math.isnan(value):
                    series[day] = value
        return series

    def _stables_series(url):
        """stablecoincharts/all returns array of {date, totalCirculatingUSD: {peggedUSD: N}}"""
        payload = _get(url)
        series = {}
        if payload and isinstance(payload, list):
            for entry in payload:
                day = _unix_to_date(int(entry.get("date", 0)))
                if day not in valid_days:
                    continue
                circ = entry.get("totalCirculatingUSD")
                value = _safe_float(circ.get("peggedUSD")) if isinstance(circ, dict) else _safe_float(circ)
                if not math.isnan(value):
                    series[day] = value
        return series

    plan = [
        ("tvl", "TVL (all chains)", _tvl_series, "https://api.llama.fi/v2/historicalChainTvl"),
        ("tvl_eth", "TVL Ethereum", _tvl_series, "https://api.llama.fi/v2/historicalChainTvl/Ethereum"),
        ("stables", "stables total (all)", _stables_series, "https://stablecoins.llama.fi/stablecoincharts/all"),
        ("usdt", "USDT", _stables_series, "https://stablecoins.llama.fi/stablecoincharts/all?stablecoin=1"),
        ("usdc", "USDC", _stables_series, "https://stablecoins.llama.fi/stablecoincharts/all?stablecoin=2"),
    ]
    result = {}
    for idx, (key, label, fetcher, url) in enumerate(plan):
        if idx:
            time.sleep(1)  # pause between consecutive requests
        print(f"[DEFILLAMA] Fetching {label} ...")
        result[key] = fetcher(url)
    for k, v in result.items():
        print(f" [DEFILLAMA] {k}: {len(v)} days")
    _save_cache("defillama", result)
    return result
# ----- ALTERNATIVE.ME (Fear & Greed) ---------------------------------------
def fetch_fng(no_cache=False) -> dict:
    """Fetch the alternative.me Fear & Greed index plus derived series.

    Returns {"fng": ..., "fng_prev": ..., "fng_week": ...}, each a
    {date_str: float} dict forward-filled over the full calendar window.
    fng_prev is the 1-day lag and fng_week the trailing 7-day mean.
    """
    cache = None if no_cache else _load_cache("fng")
    if cache is not None:
        print("[FNG] Using cache")
        return cache
    print("[FNG] Fetching F&G history ...")
    # date_format=us changes timestamp to MM/DD/YYYY strings — int() parse fails silently.
    # Without date_format, timestamp is returned as Unix seconds string — works correctly.
    url = "https://api.alternative.me/fng/?limit=1000"
    d = _get(url)
    fng_raw = {}
    if d and "data" in d:
        for entry in d["data"]:
            # timestamp field is a unix timestamp (str or int)
            ts = entry.get("timestamp")
            v = _safe_float(entry.get("value"))
            if ts is None:
                continue
            try:
                ts_int = int(ts)
                ds = _unix_to_date(ts_int)
            except (TypeError, ValueError):
                continue  # unparsable timestamp — skip the entry
            if not math.isnan(v):
                fng_raw[ds] = v
    all_dates = list(_date_range_str(START_DT, END_DT))
    fng_ff = _fill_forward(fng_raw, all_dates)
    # Derived: fng_prev (lag-1), fng_week (7-day rolling mean)
    sorted_dates = sorted(fng_ff.keys())
    fng_prev = {}
    fng_week = {}
    for i, d_str in enumerate(sorted_dates):
        if i >= 1:
            fng_prev[d_str] = fng_ff.get(sorted_dates[i - 1], float("nan"))
        else:
            fng_prev[d_str] = float("nan")  # no prior day for the first date
        # Trailing window of up to 7 days, inclusive of the current day.
        window = [fng_ff.get(sorted_dates[j], float("nan")) for j in range(max(0, i - 6), i + 1)]
        valid = [x for x in window if not math.isnan(x)]
        fng_week[d_str] = float(np.mean(valid)) if valid else float("nan")
    result = {
        "fng": fng_ff,
        "fng_prev": fng_prev,
        "fng_week": fng_week,
    }
    for k, v in result.items():
        n = sum(1 for x in v.values() if not math.isnan(x))
        print(f" [FNG] {k}: {n} valid days")
    _save_cache("fng", result)
    return result
# ----- BLOCKCHAIN.INFO -----------------------------------------------------
def fetch_blockchain(no_cache=False) -> dict:
    """Fetch blockchain.info chart series (hashrate, difficulty, mcap, ...).

    Returns {indicator_name: {date_str: float}}; each chart value is
    multiplied by a per-chart divisor (1e-8 for total-bitcoins — presumably
    satoshi -> BTC, to be confirmed against the API).
    """
    cached = None if no_cache else _load_cache("blockchain")
    if cached is not None:
        print("[BLOCKCHAIN] Using cache")
        return cached
    valid_days = set(_date_range_str(START_DT, END_DT))
    charts = {
        "hashrate": ("https://api.blockchain.info/charts/hash-rate", 1.0),
        "difficulty": ("https://api.blockchain.info/charts/difficulty", 1.0),
        "mcap_bc": ("https://api.blockchain.info/charts/market-cap", 1.0),
        "tx_blk": ("https://api.blockchain.info/charts/n-transactions-per-block", 1.0),
        "total_btc": ("https://api.blockchain.info/charts/total-bitcoins", 1e-8),
    }
    result = {}
    for name, (base_url, divisor) in charts.items():
        print(f"[BLOCKCHAIN] Fetching {name} ...")
        payload = _get(f"{base_url}?timespan=90days&start=2025-12-01&format=json", timeout=30)
        series = {}
        if payload and "values" in payload:
            for point in payload["values"]:
                day = _unix_to_date(int(point.get("x", 0)))
                value = _safe_float(point.get("y"))
                if day in valid_days and not math.isnan(value):
                    series[day] = value * divisor
        result[name] = series
        print(f" [BLOCKCHAIN] {name}: {len(series)} days")
        time.sleep(1)
    _save_cache("blockchain", result)
    return result
# ----- COINGECKO -----------------------------------------------------------
def fetch_coingecko(no_cache=False) -> dict:
    """Fetch daily BTC and ETH USD prices from CoinGecko market_chart.

    Returns {"btc_price": {date_str: float}, "eth_price": {date_str: float}}
    using the last observed price of each day.
    """
    cache = None if no_cache else _load_cache("coingecko")
    if cache is not None:
        print("[COINGECKO] Using cache")
        return cache
    result = {"btc_price": {}, "eth_price": {}}
    all_dates_set = set(_date_range_str(START_DT, END_DT))
    # Use market_chart for a bulk fetch (avoids per-day calls + rate limits)
    def _market_chart(coin_id):
        # from=unix start, to=unix end
        start_unix = int(START_DT.timestamp())
        end_unix = int(END_DT.timestamp())
        url = f"https://api.coingecko.com/api/v3/coins/{coin_id}/market_chart/range?vs_currency=usd&from={start_unix}&to={end_unix}"
        d = _get(url, timeout=30)
        out = {}
        if d and "prices" in d:
            # Payload "prices" is a list of [timestamp_ms, price] pairs.
            by_day = defaultdict(list)
            for ts_ms, price in d["prices"]:
                ds = _ms_to_date(int(ts_ms))
                if ds in all_dates_set:
                    by_day[ds].append(_safe_float(price))
            for ds, vals in by_day.items():
                valid = [x for x in vals if not math.isnan(x)]
                if valid:
                    out[ds] = valid[-1]  # last price of the day
        return out
    print("[COINGECKO] Fetching BTC price ...")
    result["btc_price"] = _market_chart("bitcoin")
    time.sleep(2)  # pause between requests (rate limits, per note above)
    print("[COINGECKO] Fetching ETH price ...")
    result["eth_price"] = _market_chart("ethereum")
    for k, v in result.items():
        print(f" [COINGECKO] {k}: {len(v)} days")
    _save_cache("coingecko", result)
    return result
# ---------------------------------------------------------------------------
# Step 4: Assemble all indicator data
# ---------------------------------------------------------------------------
def assemble_indicators(
    fred_data: dict,
    binance_data: dict,
    deribit_data: dict,
    cm_data: dict,
    dl_data: dict,
    fng_data: dict,
    bc_data: dict,
    cg_data: dict,
) -> dict:
    """
    Merge all source dicts into a single {indicator_name: {date_str: float}} dict.

    On a name collision the series with more observations wins; ties keep
    the earlier source (FRED first, CoinGecko last).
    """
    merged = {}
    ordered_sources = (
        fred_data,
        binance_data,
        deribit_data,
        cm_data,
        dl_data,
        fng_data,
        bc_data,
        cg_data,
    )
    for source in ordered_sources:
        for name, daily in source.items():
            current = merged.get(name)
            if current is None or len(daily) > len(current):
                merged[name] = daily
    return merged
# ---------------------------------------------------------------------------
# Step 5: Correlation analysis
# ---------------------------------------------------------------------------
LAGS = [0, 1, 2, 3, 5, 7]
def compute_correlations(
trading_days: list,
returns: dict,
indicators: dict,
) -> list:
"""
For each indicator, test multiple lags.
Returns a flat list of row dicts for the correlation table.
"""
# Build aligned return vector
ret_vec = np.array([returns.get(d, float("nan")) for d in trading_days])
rows = []
for ind_name, daily in indicators.items():
# Raw vector for this indicator over all calendar days (for lag computation)
# We need to look up indicator values at trading_day - lag (in calendar days)
# Build a sorted lookup from the full daily dict
all_cal = sorted(daily.keys())
cal_set = set(all_cal)
# Pre-build array: for each trading day, what is the indicator value?
ind_series = {}
for d in trading_days:
v = daily.get(d)
if v is None:
# Look backwards up to 5 calendar days (fill-forward)
dt = datetime.strptime(d, "%Y-%m-%d")
for back in range(1, 6):
candidate = (dt - timedelta(days=back)).strftime("%Y-%m-%d")
if candidate in daily:
v = daily[candidate]
break
ind_series[d] = v if v is not None else float("nan")
for lag in LAGS:
# indicator at trading_day[i - lag] predicts return at trading_day[i]
n_td = len(trading_days)
x_vals = []
y_vals = []
for i in range(n_td):
y = ret_vec[i]
if lag == 0:
src_day = trading_days[i]
else:
src_idx = i - lag
if src_idx < 0:
x_vals.append(float("nan"))
y_vals.append(y)
continue
src_day = trading_days[src_idx]
x = ind_series.get(src_day, float("nan"))
x_vals.append(x)
y_vals.append(y)
x_arr = np.array(x_vals, dtype=float)
y_arr = np.array(y_vals, dtype=float)
# Drop NaN pairs
valid = ~(np.isnan(x_arr) | np.isnan(y_arr))
xv = x_arr[valid]
yv = y_arr[valid]
n = int(valid.sum())
if n < 10:
rows.append({
"name": ind_name,
"lag": lag,
"n_days": n,
"pearson_r": float("nan"),
"pearson_p": float("nan"),
"spearman_r": float("nan"),
"spearman_p": float("nan"),
"stress_r": float("nan"),
"stress_p": float("nan"),
"note": "insufficient data",
})
continue
# Pearson
try:
pr, pp = stats.pearsonr(xv, yv)
except Exception:
pr, pp = float("nan"), float("nan")
# Spearman
try:
sr, sp = stats.spearmanr(xv, yv)
except Exception:
sr, sp = float("nan"), float("nan")
# Point-biserial: stress day flag (return < -0.01)
stress_flag = (yv < -0.01).astype(float)
try:
if stress_flag.std() > 0:
pbr, pbp = stats.pointbiserialr(stress_flag.astype(bool), xv)
else:
pbr, pbp = float("nan"), float("nan")
except Exception:
pbr, pbp = float("nan"), float("nan")
rows.append({
"name": ind_name,
"lag": lag,
"n_days": n,
"pearson_r": float(pr),
"pearson_p": float(pp),
"spearman_r": float(sr),
"spearman_p": float(sp),
"stress_r": float(pbr),
"stress_p": float(pbp),
"note": "ok",
})
return rows
# ---------------------------------------------------------------------------
# Step 6: Build summary (best lag per indicator)
# ---------------------------------------------------------------------------
def build_summary(corr_rows: list, indicators: dict, trading_days: list) -> list:
    """Pick the best lag per indicator and rank indicators by |pearson_r|.

    Args:
        corr_rows: flat rows from compute_correlations (one per indicator x lag).
        indicators: {indicator_name: {date_str: float}} — used to report
            coverage for indicators that produced no valid correlation.
        trading_days: list of YYYY-MM-DD trading-day strings.

    Returns:
        List of summary dicts sorted by descending |best_r| and ranked from 1.
    """
    # FIX: removed an unused in-function `from collections import defaultdict`
    # (never referenced here; defaultdict is already imported at module level).
    best = {}
    for row in corr_rows:
        name = row["name"]
        r = row["pearson_r"]
        if math.isnan(r):
            continue
        # Keep the lag with the largest absolute Pearson correlation.
        if name not in best or abs(r) > abs(best[name]["pearson_r"]):
            best[name] = row
    # Add placeholder rows for indicators that had no valid correlation at any lag.
    for name in indicators:
        if name not in best:
            # Count non-NaN days so coverage is still reported.
            n_ok = sum(1 for d in trading_days if not math.isnan(
                _safe_float(indicators[name].get(d))
            ))
            best[name] = {
                "name": name,
                "lag": -1,  # sentinel: no usable lag
                "n_days": n_ok,
                "pearson_r": float("nan"),
                "pearson_p": float("nan"),
                "spearman_r": float("nan"),
                "spearman_p": float("nan"),
                "stress_r": float("nan"),
                "stress_p": float("nan"),
                "note": "no valid correlation",
            }
    # Assign source labels
    SOURCE_MAP = {
        **{k: "fred" for k in FRED_SERIES},
        "funding_btc": "binance", "funding_eth": "binance",
        "oi_btc": "binance", "oi_eth": "binance",
        "ls_btc": "binance", "ls_eth": "binance",
        "ls_top": "binance", "taker": "binance",
        "vol24": "binance",
        "dvol_btc": "deribit", "dvol_eth": "deribit",
        "fund_dbt_btc": "deribit", "fund_dbt_eth": "deribit",
        "rcap_btc": "coinmetrics", "mvrv": "coinmetrics",
        "nupl": "coinmetrics", "addr_btc": "coinmetrics",
        "addr_eth": "coinmetrics", "txcnt": "coinmetrics",
        "fees_btc": "coinmetrics", "fees_eth": "coinmetrics",
        "nvt": "coinmetrics", "velocity": "coinmetrics",
        "sply_act": "coinmetrics", "rcap_eth": "coinmetrics",
        "tvl": "defillama", "tvl_eth": "defillama",
        "stables": "defillama", "usdt": "defillama",
        "usdc": "defillama",
        "fng": "alternative", "fng_prev": "alternative",
        "fng_week": "alternative",
        "hashrate": "blockchain", "difficulty": "blockchain",
        "mcap_bc": "blockchain", "tx_blk": "blockchain",
        "total_btc": "blockchain",
        "btc_price": "coingecko", "eth_price": "coingecko",
    }
    # NaN correlations sort to the bottom via the -1 key.
    rows_sorted = sorted(
        best.values(),
        key=lambda x: abs(x["pearson_r"]) if not math.isnan(x["pearson_r"]) else -1,
        reverse=True,
    )
    summary = []
    for rank, row in enumerate(rows_sorted, 1):
        name = row["name"]
        # Significance stars: "*" for p<0.05, "**" for p<0.01.
        sig = ""
        if not math.isnan(row.get("pearson_p", float("nan"))):
            sig = "*" if row["pearson_p"] < 0.05 else ""
            sig += "*" if row["pearson_p"] < 0.01 else ""
        summary.append({
            "rank": rank,
            "name": name,
            "source": SOURCE_MAP.get(name, "?"),
            "best_lag": row["lag"],
            "best_r": row["pearson_r"],
            "best_p": row["pearson_p"],
            "spearman": row["spearman_r"],
            "stress_r": row["stress_r"],
            "n_days": row["n_days"],
            "sig": sig,
        "note": row.get("note", ""),
        })
    return summary
# ---------------------------------------------------------------------------
# Step 7: Save outputs
# ---------------------------------------------------------------------------
def save_daily_csv(trading_days: list, indicators: dict, returns: dict):
    """Write one CSV row per trading day: date, daily_return, then every indicator.

    NaN indicator values are written as empty cells.
    """
    path = RUN_LOGS / f"exf_daily_{TS_NOW}.csv"
    cols = ["date", "daily_return"] + sorted(indicators.keys())
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=cols)
        writer.writeheader()
        for day in trading_days:
            record = {"date": day, "daily_return": returns.get(day, "")}
            for name, series in indicators.items():
                value = series.get(day, float("nan"))
                record[name] = "" if math.isnan(value) else value
            writer.writerow(record)
    print(f"[SAVE] Daily values: {path}")
def save_corr_csv(corr_rows: list):
    """Write the full per-(indicator, lag) correlation table; no-op when empty."""
    path = RUN_LOGS / f"exf_correlation_{TS_NOW}.csv"
    if not corr_rows:
        return
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=list(corr_rows[0].keys()))
        writer.writeheader()
        writer.writerows(corr_rows)
    print(f"[SAVE] Correlation table: {path}")
def print_summary(summary: list):
print()
print("=" * 110)
print(f"{'rank':>4} {'name':<14} {'source':<12} {'best_lag':>8} {'best_r':>8} {'best_p':>8} "
f"{'spearman':>8} {'stress_r':>8} {'n_days':>6} {'sig':>4}")
print("=" * 110)
for row in summary:
r_str = f"{row['best_r']:+.4f}" if not math.isnan(row.get("best_r", float("nan"))) else " n/a "
p_str = f"{row['best_p']:.4f}" if not math.isnan(row.get("best_p", float("nan"))) else " n/a "
sp_str = f"{row['spearman']:+.4f}" if not math.isnan(row.get("spearman", float("nan"))) else " n/a "
sr_str = f"{row['stress_r']:+.4f}" if not math.isnan(row.get("stress_r", float("nan"))) else " n/a "
lag_str = f"lag={row['best_lag']}" if row['best_lag'] >= 0 else " - "
print(
f"{row['rank']:>4} {row['name']:<14} {row['source']:<12} {lag_str:>8} "
f"{r_str:>8} {p_str:>8} {sp_str:>8} {sr_str:>8} {row['n_days']:>6} {row['sig']:>4}"
)
print("=" * 110)
# Top 10 highlight
sig_rows = [r for r in summary if not math.isnan(r.get("best_p", float("nan"))) and r["best_p"] < 0.05]
print(f"\n[RESULT] {len(sig_rows)} indicators significant at p<0.05 (any lag)")
if sig_rows:
print("TOP PREDICTORS:")
for r in sig_rows[:15]:
print(f" {r['rank']:>3}. {r['name']:<14} lag={r['best_lag']} "
f"r={r['best_r']:+.4f} p={r['best_p']:.4f} spearman={r['spearman']:+.4f} "
f"n={r['n_days']}")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: load returns, fetch all sources, correlate, save, report."""
    parser = argparse.ArgumentParser(description="ExF historical correlation tester")
    parser.add_argument("--no-cache", action="store_true", help="Ignore and overwrite all caches")
    args = parser.parse_args()
    no_cache = args.no_cache
    print("=" * 70)
    print(" DOLPHIN ExF Correlation Tester")
    print(f" Date range: {START_STR} → 2026-02-25")
    print(f" Cache: {'DISABLED (--no-cache)' if no_cache else 'ENABLED'}")
    print("=" * 70)
    # --- Trading days
    trading_days = get_trading_days()
    if len(trading_days) == 0:
        print("[ERROR] No parquet files found in vbt_cache. Abort.")
        sys.exit(1)
    # --- Daily returns
    print("\n[STEP 1] Loading daily returns ...")
    try:
        returns = load_daily_returns(trading_days)
    except RuntimeError as e:
        print(f"[ERROR] {e}")
        sys.exit(1)
    valid_ret = sum(1 for v in returns.values() if not math.isnan(v))
    print(f" Returns: {valid_ret}/{len(trading_days)} valid")
    # --- Fetch all sources
    # Each source is fetched independently; a failure degrades to an empty
    # dict so the remaining sources are still analyzed.
    print("\n[STEP 2] Fetching external factor data ...")
    print("\n--- FRED ---")
    try:
        fred_data = fetch_fred(no_cache)
        print(f" FRED: {len(fred_data)} indicators OK")
    except Exception as e:
        print(f" FRED FAILED: {e}")
        fred_data = {}
    print("\n--- BINANCE DERIVATIVES ---")
    try:
        binance_data = fetch_binance(no_cache)
        print(f" BINANCE: {len(binance_data)} indicators OK")
    except Exception as e:
        print(f" BINANCE FAILED: {e}")
        binance_data = {}
    print("\n--- DERIBIT ---")
    try:
        deribit_data = fetch_deribit(no_cache)
        print(f" DERIBIT: {len(deribit_data)} indicators OK")
    except Exception as e:
        print(f" DERIBIT FAILED: {e}")
        deribit_data = {}
    print("\n--- COINMETRICS ---")
    try:
        cm_data = fetch_coinmetrics(no_cache)
        print(f" COINMETRICS: {len(cm_data)} indicators OK")
    except Exception as e:
        print(f" COINMETRICS FAILED: {e}")
        cm_data = {}
    print("\n--- DEFILLAMA ---")
    try:
        dl_data = fetch_defillama(no_cache)
        print(f" DEFILLAMA: {len(dl_data)} indicators OK")
    except Exception as e:
        print(f" DEFILLAMA FAILED: {e}")
        dl_data = {}
    print("\n--- FEAR & GREED (alternative.me) ---")
    try:
        fng_data = fetch_fng(no_cache)
        print(f" FNG: {len(fng_data)} indicators OK")
    except Exception as e:
        print(f" FNG FAILED: {e}")
        fng_data = {}
    print("\n--- BLOCKCHAIN.INFO ---")
    try:
        bc_data = fetch_blockchain(no_cache)
        print(f" BLOCKCHAIN: {len(bc_data)} indicators OK")
    except Exception as e:
        print(f" BLOCKCHAIN FAILED: {e}")
        bc_data = {}
    print("\n--- COINGECKO ---")
    try:
        cg_data = fetch_coingecko(no_cache)
        print(f" COINGECKO: {len(cg_data)} indicators OK")
    except Exception as e:
        print(f" COINGECKO FAILED: {e}")
        cg_data = {}
    # --- Assemble
    print("\n[STEP 3] Assembling indicators ...")
    indicators = assemble_indicators(
        fred_data, binance_data, deribit_data,
        cm_data, dl_data, fng_data, bc_data, cg_data,
    )
    print(f" Total indicators assembled: {len(indicators)}")
    # Coverage report
    print("\n Coverage per indicator over trading days:")
    for name in sorted(indicators.keys()):
        series = indicators[name]
        n_ok = sum(
            1 for d in trading_days
            if not math.isnan(_safe_float(series.get(d)))
        )
        pct = 100 * n_ok / len(trading_days)
        # Coverage buckets: >=40 trading days OK, >=10 PARTIAL, else FAIL.
        status = "OK" if n_ok >= 40 else ("PARTIAL" if n_ok >= 10 else "FAIL")
        print(f" {name:<18} {n_ok:>3}/{len(trading_days)} ({pct:5.1f}%) [{status}]")
    # --- Correlation analysis
    print("\n[STEP 4] Computing correlations (lags 0-7) ...")
    corr_rows = compute_correlations(trading_days, returns, indicators)
    print(f" Computed {len(corr_rows)} rows ({len(indicators)} indicators × {len(LAGS)} lags)")
    # --- Summary
    summary = build_summary(corr_rows, indicators, trading_days)
    # --- Save
    print("\n[STEP 5] Saving outputs ...")
    save_daily_csv(trading_days, indicators, returns)
    save_corr_csv(corr_rows)
    # --- Print table
    print("\n[STEP 6] Results")
    print_summary(summary)
    # Also print status for each source
    print("\n[SOURCE STATUS]")
    source_ok = {
        "fred": "OK" if fred_data else "FAIL",
        "binance": "OK" if binance_data else "FAIL",
        "deribit": "OK" if deribit_data else "FAIL",
        "coinmetrics": "OK" if cm_data else "FAIL",
        "defillama": "OK" if dl_data else "FAIL",
        "alternative": "OK" if fng_data else "FAIL",
        "blockchain": "OK" if bc_data else "FAIL",
        "coingecko": "OK" if cg_data else "FAIL",
    }
    for src, status in source_ok.items():
        print(f" {src:<14} {status}")
    print("\n[DONE] ExF correlation analysis complete.")
# Run the full analysis only when executed as a script (not on import).
if __name__ == "__main__":
    main()