# Source: DOLPHIN/nautilus_dolphin/test_exf_correlation.py

#!/usr/bin/env python3
"""
test_exf_correlation.py
=======================
Fetch historical data for all ExF indicators with FULL/PARTIAL history,
align to 55 trading days (2025-12-31 → 2026-02-25), and test each for
predictiveness against daily trading returns.
Usage:
python test_exf_correlation.py [--no-cache]
"""
import sys
import os
import time
import json
import csv
import math
import argparse
import requests
from pathlib import Path
from datetime import datetime, timezone, timedelta, date
from collections import defaultdict
import numpy as np
import pandas as pd
from scipy import stats
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# Project layout roots. NOTE: hard-coded absolute Windows path — adjust per machine.
BASE_DIR = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict")
VBT_DIR = BASE_DIR / "vbt_cache"
EXF_DIR = BASE_DIR / "external_factors"
NAUTILUS_DIR = BASE_DIR / "nautilus_dolphin"
RUN_LOGS = NAUTILUS_DIR / "run_logs"
RUN_LOGS.mkdir(parents=True, exist_ok=True)
# SECURITY: the FRED API key was committed in plain text. Prefer the
# FRED_API_KEY environment variable; fall back to the embedded key so
# existing setups keep working unchanged.
FRED_KEY = os.environ.get("FRED_API_KEY", "c16a9cde3e3bb5bb972bb9283485f202")
# ---------------------------------------------------------------------------
# Date range
# ---------------------------------------------------------------------------
# UTC study window, inclusive of both endpoints.
START_DT = datetime(2025, 12, 31, tzinfo=timezone.utc)
END_DT = datetime(2026, 2, 25, 23, 59, 59, tzinfo=timezone.utc)
# Millisecond epoch bounds for APIs that take ms timestamps (Binance, Deribit).
START_MS = int(START_DT.timestamp() * 1000)
END_MS = int(END_DT.timestamp() * 1000)
START_STR = "2025-12-31"
END_STR = "2026-02-26" # exclusive for CoinMetrics end_time
# Local-time tag appended to every output file of this run.
TS_NOW = datetime.now().strftime("%Y%m%d_%H%M%S")
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _date_range_str(start: datetime, end: datetime):
"""Yield YYYY-MM-DD strings for every calendar day from start to end inclusive."""
cur = start.date()
stop = end.date()
while cur <= stop:
yield cur.strftime("%Y-%m-%d")
cur += timedelta(days=1)
def _ms_to_date(ms: int) -> str:
return datetime.fromtimestamp(ms / 1000, tz=timezone.utc).strftime("%Y-%m-%d")
def _unix_to_date(ts: int) -> str:
return datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%d")
def _safe_float(v):
if v is None:
return float("nan")
try:
f = float(v)
if math.isfinite(f):
return f
return float("nan")
except (TypeError, ValueError):
return float("nan")
def _fill_forward(series: dict, date_list: list) -> dict:
"""Fill-forward a {date_str: float} dict over date_list."""
result = {}
last = float("nan")
for d in date_list:
v = series.get(d, float("nan"))
if not math.isnan(v):
last = v
result[d] = last
return result
def _cache_path(source: str) -> Path:
    """Return the path of the raw-response JSON cache for a source name."""
    filename = f"exf_raw_cache_{source}.json"
    return RUN_LOGS / filename
def _load_cache(source: str):
    """Return the cached JSON payload for source, or None if absent/unreadable."""
    path = _cache_path(source)
    if not path.exists():
        return None
    try:
        with open(path, "r") as fh:
            return json.load(fh)
    except Exception:
        # Corrupt/partial cache file — treat as a cache miss.
        return None
def _save_cache(source: str, data):
    """Write data as JSON to the source's cache file, overwriting any old copy."""
    with open(_cache_path(source), "w") as fh:
        json.dump(data, fh)
def _get(url: str, params=None, timeout=20, headers=None) -> dict | list | None:
    """HTTP GET returning the parsed JSON body, or None on any error (logged)."""
    try:
        # Default UA avoids naive bot-blocking; caller headers take precedence.
        merged = {"User-Agent": "Mozilla/5.0"}
        if headers:
            merged.update(headers)
        resp = requests.get(url, params=params, timeout=timeout, headers=merged)
        resp.raise_for_status()
        return resp.json()
    except Exception as e:
        print(f" [HTTP ERROR] {url[:80]} -> {e}")
        return None
# ---------------------------------------------------------------------------
# Step 1: Get trading day list from VBT parquet filenames
# ---------------------------------------------------------------------------
def get_trading_days() -> list:
    """List trading days (YYYY-MM-DD strings) from vbt_cache parquet filenames.

    Keeps only stems that parse as dates within 2025-12-31 .. 2026-02-25
    inclusive; non-date filenames are silently skipped.

    Returns:
        Sorted list of date strings (possibly empty).
    """
    days = []
    for f in sorted(VBT_DIR.glob("*.parquet")):
        stem = f.stem  # expected YYYY-MM-DD
        try:
            dt = datetime.strptime(stem, "%Y-%m-%d").date()
        except ValueError:
            continue  # not a date-named file; ignore
        if date(2025, 12, 31) <= dt <= date(2026, 2, 25):
            days.append(stem)
    # BUG FIX: the original printed days[0]/days[-1] unconditionally and
    # raised IndexError on an empty vbt_cache before main() could report the
    # condition cleanly; also restores the "→" separator between endpoints.
    if days:
        print(f"[TRADING DAYS] Found {len(days)} days: {days[0]} → {days[-1]}")
    else:
        print("[TRADING DAYS] Found 0 days")
    return days
# ---------------------------------------------------------------------------
# Step 2: Load daily returns
# ---------------------------------------------------------------------------
def load_daily_returns(trading_days: list) -> dict:
    """
    Returns {date_str: pnl_normalized} where pnl_normalized = pnl / capital_start_of_day.
    Tries to load from most recent run_logs/daily_*.csv first.

    Raises:
        RuntimeError: if no run_logs/daily_*.csv file exists.
    """
    csvs = sorted(RUN_LOGS.glob("daily_*.csv"), reverse=True)
    if not csvs:
        raise RuntimeError(
            "No run_logs/daily_*.csv found. Run test_pf_dynamic_beta_validate.py first."
        )
    latest = csvs[0]
    print(f"[RETURNS] Loading from {latest.name}")
    df = pd.read_csv(latest)
    df["date"] = df["date"].astype(str)
    # FIX: the original comment claimed "start capital = capital + pnl",
    # contradicting the code below. The CSV `capital` column is end-of-day
    # capital, so start-of-day capital is capital - pnl and
    # pnl_normalized = pnl / start_capital.
    wanted = set(trading_days)  # PERF: O(1) membership vs. list scan per CSV row
    returns = {}
    for _, row in df.iterrows():
        d = str(row["date"])
        if d not in wanted:
            continue
        pnl = float(row["pnl"])
        cap = float(row["capital"])
        start_cap = cap - pnl  # reconstruct start-of-day capital
        returns[d] = pnl / start_cap if start_cap > 0 else float("nan")
    print(f"[RETURNS] Loaded {len(returns)} dates. Sample: {list(returns.items())[:3]}")
    return returns
# ---------------------------------------------------------------------------
# Step 3: Fetch functions — one call per source
# ---------------------------------------------------------------------------
# ----- FRED ----------------------------------------------------------------
# FRED series IDs keyed by the short indicator name used throughout the outputs.
FRED_SERIES = {
    "dxy": "DTWEXBGS",       # trade-weighted USD index
    "us10y": "DGS10",        # US 10-year yield
    "us2y": "DGS2",          # US 2-year yield
    "ycurve": "T10Y2Y",      # 10y-2y yield-curve spread
    "vix": "VIXCLS",         # VIX close
    "fedfunds": "DFF",       # fed funds rate
    "m2": "WM2NS",           # M2 money stock
    "cpi": "CPIAUCSL",       # CPI
    "sp500": "SP500",        # S&P 500
    # gold: KO — GOLDAMGBD228NLBM/GOLDPMGBD228NLBM/GOLD all return HTTP 400 (series licensing/discontinued)
    "hy_spread":"BAMLH0A0HYM2",  # high-yield spread
    "be5y": "T5YIE",         # 5-year breakeven inflation
    "nfci": "NFCI",          # financial conditions index
    "claims": "ICSA",        # initial jobless claims
}
def fetch_fred(no_cache=False) -> dict:
    """Returns {indicator_name: {date_str: float}} for all FRED indicators."""
    cached = None if no_cache else _load_cache("fred")
    if cached is not None:
        print("[FRED] Using cache")
        return cached
    all_dates = list(_date_range_str(START_DT, END_DT))
    result = {}
    for name, series_id in FRED_SERIES.items():
        url = (
            f"https://api.stlouisfed.org/fred/series/observations"
            f"?series_id={series_id}"
            f"&api_key={FRED_KEY}"
            f"&file_type=json"
            f"&observation_start=2025-12-01"
            f"&observation_end=2026-03-01"
            f"&sort_order=asc"
        )
        payload = _get(url)
        raw = {}
        if payload and "observations" in payload:
            for obs in payload["observations"]:
                value = obs.get("value", ".")
                if value == ".":
                    continue  # FRED encodes missing observations as "."
                fv = _safe_float(value)
                if not math.isnan(fv):
                    raw[obs.get("date", "")] = fv
        # Forward-fill so weekend/holiday gaps carry the last observation.
        filled = _fill_forward(raw, all_dates)
        result[name] = filled
        n_ok = sum(1 for v in filled.values() if not math.isnan(v))
        print(f" [FRED] {name:12s} ({series_id:20s}): {n_ok}/{len(all_dates)} obs")
        time.sleep(0.15)
    _save_cache("fred", result)
    return result
# ----- BINANCE DERIVATIVES -------------------------------------------------
def _bin_hourly_to_daily(records: list, key: str, agg="mean") -> dict:
    """Group hourly Binance records to daily. agg='mean' or 'last'."""
    grouped = defaultdict(list)
    for record in records:
        # Binance payloads carry either "timestamp" or "fundingTime" (ms).
        stamp = int(record.get("timestamp", record.get("fundingTime", 0)))
        value = _safe_float(record.get(key))
        if math.isnan(value):
            continue
        grouped[_ms_to_date(stamp)].append(value)
    daily = {}
    for day, values in grouped.items():
        if values:
            daily[day] = values[-1] if agg == "last" else float(np.mean(values))
    return daily
def fetch_binance(no_cache=False) -> dict:
    """Fetch Binance derivatives/spot indicators.

    Returns {indicator_name: {date_str: float}}. Funding and 24h volume are
    fetched live; the OI/long-short/taker series return empty dicts (see the
    retention-wall note below).
    """
    cache = None if no_cache else _load_cache("binance")
    if cache is not None:
        print("[BINANCE] Using cache")
        return cache
    result = {}
    # --- Funding BTC
    def _funding(symbol):
        # Forward-paginate funding prints in 500-record chunks
        # (500 events * 8h per funding interval, expressed in ms).
        all_recs = []
        cur_start = START_MS
        while cur_start < END_MS:
            chunk_end = min(cur_start + 500 * 8 * 3600 * 1000, END_MS)
            url = f"https://fapi.binance.com/fapi/v1/fundingRate?symbol={symbol}&startTime={cur_start}&endTime={chunk_end}&limit=500"
            d = _get(url)
            if d and isinstance(d, list):
                all_recs.extend(d)
                if len(d) < 500:
                    break  # short page -> exhausted the range
                cur_start = int(d[-1]["fundingTime"]) + 1  # resume just past last record
            else:
                break  # HTTP error or unexpected payload
        # Keep the last funding print of each day.
        return _bin_hourly_to_daily(all_recs, "fundingRate", agg="last")
    # --- OI / LS / Taker: KO — Binance Futures stats endpoints retain only ~30 days of history.
    # Requesting data from 2025-12-31 (63 days ago) returns HTTP 400 -1130 "startTime invalid".
    # These require live collection; cannot be retrofitted historically.
    def _oi(symbol):
        print(f" [KO] openInterestHist({symbol}): 30-day retention wall — returning empty")
        return {}
    def _ls(endpoint, symbol):
        print(f" [KO] {endpoint}({symbol}): 30-day retention wall — returning empty")
        return {}
    def _taker():
        print(f" [KO] takerlongshortRatio: 30-day retention wall — returning empty")
        return {}
    # --- Volume (spot klines daily)
    def _vol24():
        url = f"https://api.binance.com/api/v3/klines?symbol=BTCUSDT&interval=1d&startTime={START_MS}&endTime={END_MS}&limit=100"
        d = _get(url)
        result_v = {}
        if d and isinstance(d, list):
            for bar in d:
                ts = int(bar[0])  # kline open time (ms)
                date_s = _ms_to_date(ts)
                v = _safe_float(bar[7])  # quote volume
                if not math.isnan(v):
                    result_v[date_s] = v
        return result_v
    print("[BINANCE] Fetching funding_btc ...")
    result["funding_btc"] = _funding("BTCUSDT")
    print("[BINANCE] Fetching funding_eth ...")
    result["funding_eth"] = _funding("ETHUSDT")
    print("[BINANCE] Fetching oi_btc ...")
    result["oi_btc"] = _oi("BTCUSDT")
    print("[BINANCE] Fetching oi_eth ...")
    result["oi_eth"] = _oi("ETHUSDT")
    print("[BINANCE] Fetching ls_btc ...")
    result["ls_btc"] = _ls("globalLongShortAccountRatio", "BTCUSDT")
    print("[BINANCE] Fetching ls_eth ...")
    result["ls_eth"] = _ls("globalLongShortAccountRatio", "ETHUSDT")
    print("[BINANCE] Fetching ls_top ...")
    result["ls_top"] = _ls("topLongShortAccountRatio", "BTCUSDT")
    print("[BINANCE] Fetching taker ...")
    result["taker"] = _taker()
    print("[BINANCE] Fetching vol24 ...")
    result["vol24"] = _vol24()
    for k, v in result.items():
        print(f" [BINANCE] {k}: {len(v)} days")
    _save_cache("binance", result)
    return result
# ----- DERIBIT -------------------------------------------------------------
def fetch_deribit(no_cache=False) -> dict:
    """Fetch Deribit DVOL indices and perpetual funding for BTC/ETH.

    Returns {indicator_name: {date_str: float}} where each value is the daily
    mean of the hourly observations.
    """
    cache = None if no_cache else _load_cache("deribit")
    if cache is not None:
        print("[DERIBIT] Using cache")
        return cache
    result = {}
    def _dvol(currency):
        # Hourly volatility-index candles over the window, averaged per day.
        url = (
            f"https://www.deribit.com/api/v2/public/get_volatility_index_data"
            f"?currency={currency}&resolution=3600&start_timestamp={START_MS}&end_timestamp={END_MS}"
        )
        d = _get(url)
        by_day = defaultdict(list)
        if d and "result" in d and isinstance(d["result"], dict):
            for row in d["result"].get("data", []):
                # row = [timestamp_ms, open, high, low, close]
                ts = int(row[0])
                close = _safe_float(row[4]) if len(row) > 4 else float("nan")
                if not math.isnan(close):
                    by_day[_ms_to_date(ts)].append(close)
        return {d: float(np.mean(v)) for d, v in by_day.items()}
    def _deribit_funding(instrument):
        # Paginate backward: Deribit returns at most ~744 records per call (count=1000 cap).
        # With hourly data, 744 records ≈ 31 days. Need two pages to cover 55 days.
        by_day = defaultdict(list)
        cur_end = END_MS
        pages = 0  # hard page cap guards against endless pagination
        while cur_end > START_MS and pages < 10:
            url = (
                f"https://www.deribit.com/api/v2/public/get_funding_rate_history"
                f"?instrument_name={instrument}&start_timestamp={START_MS}&end_timestamp={cur_end}&count=1000"
            )
            d = _get(url)
            if not (d and "result" in d and isinstance(d["result"], list)):
                break
            rows = d["result"]
            if not rows:
                break
            for row in rows:
                ts = int(row.get("timestamp", 0))
                v = _safe_float(row.get("interest_8h"))
                if not math.isnan(v):
                    by_day[_ms_to_date(ts)].append(v)
            # Step the window back to just before the oldest record of this page.
            oldest_ts = min(int(r.get("timestamp", cur_end)) for r in rows)
            if oldest_ts >= cur_end:
                break  # no progress — avoid an infinite loop
            cur_end = oldest_ts - 1
            pages += 1
            time.sleep(0.3)
        return {d: float(np.mean(v)) for d, v in by_day.items()}
    print("[DERIBIT] Fetching dvol_btc ...")
    result["dvol_btc"] = _dvol("BTC")
    time.sleep(0.5)
    print("[DERIBIT] Fetching dvol_eth ...")
    result["dvol_eth"] = _dvol("ETH")
    time.sleep(0.5)
    print("[DERIBIT] Fetching fund_dbt_btc ...")
    result["fund_dbt_btc"] = _deribit_funding("BTC-PERPETUAL")
    time.sleep(0.5)
    print("[DERIBIT] Fetching fund_dbt_eth ...")
    result["fund_dbt_eth"] = _deribit_funding("ETH-PERPETUAL")
    for k, v in result.items():
        print(f" [DERIBIT] {k}: {len(v)} days")
    _save_cache("deribit", result)
    return result
# ----- COINMETRICS ---------------------------------------------------------
def fetch_coinmetrics(no_cache=False) -> dict:
    """Fetch CoinMetrics community asset metrics for BTC and ETH.

    Returns {indicator_name: {date_str: float}}. NOTE: per the comment below
    the community endpoint is paywalled, so series may come back empty.
    """
    cache = None if no_cache else _load_cache("coinmetrics")
    if cache is not None:
        print("[COINMETRICS] Using cache")
        return cache
    result = {}
    # CoinMetrics community API: KO permanently — HTTP 403 Forbidden as of 2026-03.
    # Community endpoint is now paywalled. All 12 CoinMetrics indicators are dead.
    BASE = "https://community-api.coinmetrics.io/v4/timeseries/asset-metrics"
    def _cm_fetch(asset, metrics_str):
        # One batched request covering every requested metric of one asset.
        url = (
            f"{BASE}?assets={asset}&metrics={metrics_str}"
            f"&frequency=1d&start_time={START_STR}&end_time={END_STR}&page_size=100"
        )
        d = _get(url, timeout=30)
        rows = []
        if d and "data" in d:
            rows = d["data"]
        return rows
    def _rows_to_series(rows, metric_key):
        # Project a single metric column out of the batched rows.
        out = {}
        for row in rows:
            t = row.get("time", "")
            date_s = t[:10]  # YYYY-MM-DD
            v = _safe_float(row.get(metric_key))
            if not math.isnan(v):
                out[date_s] = v
        return out
    # BTC batch
    print("[COINMETRICS] Fetching BTC batch ...")
    btc_metrics = "CapRealUSD,CapMrktCurUSD,AdrActCnt,TxCnt,FeeTotUSD,NVTAdj,VelCur1yr,SplyAct1yr"
    btc_rows = _cm_fetch("btc", btc_metrics)
    time.sleep(7)  # long pause between batches (rate limiting)
    result["rcap_btc"] = _rows_to_series(btc_rows, "CapRealUSD")
    result["addr_btc"] = _rows_to_series(btc_rows, "AdrActCnt")
    result["txcnt"] = _rows_to_series(btc_rows, "TxCnt")
    result["fees_btc"] = _rows_to_series(btc_rows, "FeeTotUSD")
    result["nvt"] = _rows_to_series(btc_rows, "NVTAdj")
    result["velocity"] = _rows_to_series(btc_rows, "VelCur1yr")
    result["sply_act"] = _rows_to_series(btc_rows, "SplyAct1yr")
    # MVRV and NUPL from CapMrktCurUSD + CapRealUSD
    mvrv_d = {}
    nupl_d = {}
    for row in btc_rows:
        t = row.get("time", "")[:10]
        m = _safe_float(row.get("CapMrktCurUSD"))
        rc = _safe_float(row.get("CapRealUSD"))
        if not math.isnan(m) and not math.isnan(rc) and rc > 0:
            mvrv_d[t] = m / rc
            nupl_d[t] = (m - rc) / m if m > 0 else float("nan")
    result["mvrv"] = mvrv_d
    result["nupl"] = nupl_d
    # ETH batch
    print("[COINMETRICS] Fetching ETH batch ...")
    eth_metrics = "CapRealUSD,AdrActCnt,FeeTotUSD"
    eth_rows = _cm_fetch("eth", eth_metrics)
    time.sleep(7)
    result["rcap_eth"] = _rows_to_series(eth_rows, "CapRealUSD")
    result["addr_eth"] = _rows_to_series(eth_rows, "AdrActCnt")
    result["fees_eth"] = _rows_to_series(eth_rows, "FeeTotUSD")
    for k, v in result.items():
        print(f" [COINMETRICS] {k}: {len(v)} days")
    _save_cache("coinmetrics", result)
    return result
# ----- DEFILLAMA -----------------------------------------------------------
def fetch_defillama(no_cache=False) -> dict:
    """Fetch DefiLlama TVL and stablecoin-supply series.

    Returns {indicator_name: {date_str: float}} restricted to the study
    window; serves from the on-disk cache unless no_cache is True.
    """
    cached = None if no_cache else _load_cache("defillama")
    if cached is not None:
        print("[DEFILLAMA] Using cache")
        return cached
    valid_days = set(_date_range_str(START_DT, END_DT))

    def _tvl_series(url):
        # historicalChainTvl returns a list of {date: unix_s, tvl: float}.
        payload = _get(url)
        series = {}
        if payload and isinstance(payload, list):
            for entry in payload:
                day = _unix_to_date(int(entry.get("date", 0)))
                value = _safe_float(entry.get("tvl"))
                if day in valid_days and not math.isnan(value):
                    series[day] = value
        return series

    def _stables_series(url):
        """stablecoincharts/all returns array of {date, totalCirculatingUSD: {peggedUSD: N}}"""
        payload = _get(url)
        series = {}
        if payload and isinstance(payload, list):
            for entry in payload:
                day = _unix_to_date(int(entry.get("date", 0)))
                if day not in valid_days:
                    continue
                circ = entry.get("totalCirculatingUSD")
                value = _safe_float(circ.get("peggedUSD")) if isinstance(circ, dict) else _safe_float(circ)
                if not math.isnan(value):
                    series[day] = value
        return series

    plan = [
        ("tvl", "TVL (all chains)", _tvl_series, "https://api.llama.fi/v2/historicalChainTvl"),
        ("tvl_eth", "TVL Ethereum", _tvl_series, "https://api.llama.fi/v2/historicalChainTvl/Ethereum"),
        ("stables", "stables total (all)", _stables_series, "https://stablecoins.llama.fi/stablecoincharts/all"),
        ("usdt", "USDT", _stables_series, "https://stablecoins.llama.fi/stablecoincharts/all?stablecoin=1"),
        ("usdc", "USDC", _stables_series, "https://stablecoins.llama.fi/stablecoincharts/all?stablecoin=2"),
    ]
    result = {}
    for idx, (key, label, fetcher, url) in enumerate(plan):
        if idx:
            time.sleep(1)  # pause between consecutive requests
        print(f"[DEFILLAMA] Fetching {label} ...")
        result[key] = fetcher(url)
    for k, v in result.items():
        print(f" [DEFILLAMA] {k}: {len(v)} days")
    _save_cache("defillama", result)
    return result
# ----- ALTERNATIVE.ME (Fear & Greed) ---------------------------------------
def fetch_fng(no_cache=False) -> dict:
    """Fetch the alternative.me Fear & Greed index plus derived series.

    Returns {"fng": ..., "fng_prev": ..., "fng_week": ...}, each a
    {date_str: float} dict forward-filled over the full calendar window.
    fng_prev is the 1-day lag and fng_week the trailing 7-day mean.
    """
    cache = None if no_cache else _load_cache("fng")
    if cache is not None:
        print("[FNG] Using cache")
        return cache
    print("[FNG] Fetching F&G history ...")
    # date_format=us changes timestamp to MM/DD/YYYY strings — int() parse fails silently.
    # Without date_format, timestamp is returned as Unix seconds string — works correctly.
    url = "https://api.alternative.me/fng/?limit=1000"
    d = _get(url)
    fng_raw = {}
    if d and "data" in d:
        for entry in d["data"]:
            # timestamp field is a unix timestamp (str or int)
            ts = entry.get("timestamp")
            v = _safe_float(entry.get("value"))
            if ts is None:
                continue
            try:
                ts_int = int(ts)
                ds = _unix_to_date(ts_int)
            except (TypeError, ValueError):
                continue  # unparsable timestamp — skip the entry
            if not math.isnan(v):
                fng_raw[ds] = v
    all_dates = list(_date_range_str(START_DT, END_DT))
    fng_ff = _fill_forward(fng_raw, all_dates)
    # Derived: fng_prev (lag-1), fng_week (7-day rolling mean)
    sorted_dates = sorted(fng_ff.keys())
    fng_prev = {}
    fng_week = {}
    for i, d_str in enumerate(sorted_dates):
        if i >= 1:
            fng_prev[d_str] = fng_ff.get(sorted_dates[i - 1], float("nan"))
        else:
            fng_prev[d_str] = float("nan")  # no prior day for the first date
        # Trailing window of up to 7 days, inclusive of the current day.
        window = [fng_ff.get(sorted_dates[j], float("nan")) for j in range(max(0, i - 6), i + 1)]
        valid = [x for x in window if not math.isnan(x)]
        fng_week[d_str] = float(np.mean(valid)) if valid else float("nan")
    result = {
        "fng": fng_ff,
        "fng_prev": fng_prev,
        "fng_week": fng_week,
    }
    for k, v in result.items():
        n = sum(1 for x in v.values() if not math.isnan(x))
        print(f" [FNG] {k}: {n} valid days")
    _save_cache("fng", result)
    return result
# ----- BLOCKCHAIN.INFO -----------------------------------------------------
def fetch_blockchain(no_cache=False) -> dict:
    """Fetch blockchain.info chart series (hashrate, difficulty, mcap, ...).

    Returns {indicator_name: {date_str: float}}; each chart value is
    multiplied by a per-chart divisor (1e-8 for total-bitcoins — presumably
    satoshi -> BTC, to be confirmed against the API).
    """
    cached = None if no_cache else _load_cache("blockchain")
    if cached is not None:
        print("[BLOCKCHAIN] Using cache")
        return cached
    valid_days = set(_date_range_str(START_DT, END_DT))
    charts = {
        "hashrate": ("https://api.blockchain.info/charts/hash-rate", 1.0),
        "difficulty": ("https://api.blockchain.info/charts/difficulty", 1.0),
        "mcap_bc": ("https://api.blockchain.info/charts/market-cap", 1.0),
        "tx_blk": ("https://api.blockchain.info/charts/n-transactions-per-block", 1.0),
        "total_btc": ("https://api.blockchain.info/charts/total-bitcoins", 1e-8),
    }
    result = {}
    for name, (base_url, divisor) in charts.items():
        print(f"[BLOCKCHAIN] Fetching {name} ...")
        payload = _get(f"{base_url}?timespan=90days&start=2025-12-01&format=json", timeout=30)
        series = {}
        if payload and "values" in payload:
            for point in payload["values"]:
                day = _unix_to_date(int(point.get("x", 0)))
                value = _safe_float(point.get("y"))
                if day in valid_days and not math.isnan(value):
                    series[day] = value * divisor
        result[name] = series
        print(f" [BLOCKCHAIN] {name}: {len(series)} days")
        time.sleep(1)
    _save_cache("blockchain", result)
    return result
# ----- COINGECKO -----------------------------------------------------------
def fetch_coingecko(no_cache=False) -> dict:
    """Fetch daily BTC and ETH USD prices from CoinGecko market_chart.

    Returns {"btc_price": {date_str: float}, "eth_price": {date_str: float}}
    using the last observed price of each day.
    """
    cache = None if no_cache else _load_cache("coingecko")
    if cache is not None:
        print("[COINGECKO] Using cache")
        return cache
    result = {"btc_price": {}, "eth_price": {}}
    all_dates_set = set(_date_range_str(START_DT, END_DT))
    # Use market_chart for a bulk fetch (avoids per-day calls + rate limits)
    def _market_chart(coin_id):
        # from=unix start, to=unix end
        start_unix = int(START_DT.timestamp())
        end_unix = int(END_DT.timestamp())
        url = f"https://api.coingecko.com/api/v3/coins/{coin_id}/market_chart/range?vs_currency=usd&from={start_unix}&to={end_unix}"
        d = _get(url, timeout=30)
        out = {}
        if d and "prices" in d:
            # Payload "prices" is a list of [timestamp_ms, price] pairs.
            by_day = defaultdict(list)
            for ts_ms, price in d["prices"]:
                ds = _ms_to_date(int(ts_ms))
                if ds in all_dates_set:
                    by_day[ds].append(_safe_float(price))
            for ds, vals in by_day.items():
                valid = [x for x in vals if not math.isnan(x)]
                if valid:
                    out[ds] = valid[-1]  # last price of the day
        return out
    print("[COINGECKO] Fetching BTC price ...")
    result["btc_price"] = _market_chart("bitcoin")
    time.sleep(2)  # pause between requests (rate limits, per note above)
    print("[COINGECKO] Fetching ETH price ...")
    result["eth_price"] = _market_chart("ethereum")
    for k, v in result.items():
        print(f" [COINGECKO] {k}: {len(v)} days")
    _save_cache("coingecko", result)
    return result
# ---------------------------------------------------------------------------
# Step 4: Assemble all indicator data
# ---------------------------------------------------------------------------
def assemble_indicators(
    fred_data: dict,
    binance_data: dict,
    deribit_data: dict,
    cm_data: dict,
    dl_data: dict,
    fng_data: dict,
    bc_data: dict,
    cg_data: dict,
) -> dict:
    """
    Merge all source dicts into a single {indicator_name: {date_str: float}} dict.

    On a name collision the series with more observations wins; ties keep
    the earlier source (FRED first, CoinGecko last).
    """
    merged = {}
    ordered_sources = (
        fred_data,
        binance_data,
        deribit_data,
        cm_data,
        dl_data,
        fng_data,
        bc_data,
        cg_data,
    )
    for source in ordered_sources:
        for name, daily in source.items():
            current = merged.get(name)
            if current is None or len(daily) > len(current):
                merged[name] = daily
    return merged
# ---------------------------------------------------------------------------
# Step 5: Correlation analysis
# ---------------------------------------------------------------------------
LAGS = [0, 1, 2, 3, 5, 7]
def compute_correlations(
trading_days: list,
returns: dict,
indicators: dict,
) -> list:
"""
For each indicator, test multiple lags.
Returns a flat list of row dicts for the correlation table.
"""
# Build aligned return vector
ret_vec = np.array([returns.get(d, float("nan")) for d in trading_days])
rows = []
for ind_name, daily in indicators.items():
# Raw vector for this indicator over all calendar days (for lag computation)
# We need to look up indicator values at trading_day - lag (in calendar days)
# Build a sorted lookup from the full daily dict
all_cal = sorted(daily.keys())
cal_set = set(all_cal)
# Pre-build array: for each trading day, what is the indicator value?
ind_series = {}
for d in trading_days:
v = daily.get(d)
if v is None:
# Look backwards up to 5 calendar days (fill-forward)
dt = datetime.strptime(d, "%Y-%m-%d")
for back in range(1, 6):
candidate = (dt - timedelta(days=back)).strftime("%Y-%m-%d")
if candidate in daily:
v = daily[candidate]
break
ind_series[d] = v if v is not None else float("nan")
for lag in LAGS:
# indicator at trading_day[i - lag] predicts return at trading_day[i]
n_td = len(trading_days)
x_vals = []
y_vals = []
for i in range(n_td):
y = ret_vec[i]
if lag == 0:
src_day = trading_days[i]
else:
src_idx = i - lag
if src_idx < 0:
x_vals.append(float("nan"))
y_vals.append(y)
continue
src_day = trading_days[src_idx]
x = ind_series.get(src_day, float("nan"))
x_vals.append(x)
y_vals.append(y)
x_arr = np.array(x_vals, dtype=float)
y_arr = np.array(y_vals, dtype=float)
# Drop NaN pairs
valid = ~(np.isnan(x_arr) | np.isnan(y_arr))
xv = x_arr[valid]
yv = y_arr[valid]
n = int(valid.sum())
if n < 10:
rows.append({
"name": ind_name,
"lag": lag,
"n_days": n,
"pearson_r": float("nan"),
"pearson_p": float("nan"),
"spearman_r": float("nan"),
"spearman_p": float("nan"),
"stress_r": float("nan"),
"stress_p": float("nan"),
"note": "insufficient data",
})
continue
# Pearson
try:
pr, pp = stats.pearsonr(xv, yv)
except Exception:
pr, pp = float("nan"), float("nan")
# Spearman
try:
sr, sp = stats.spearmanr(xv, yv)
except Exception:
sr, sp = float("nan"), float("nan")
# Point-biserial: stress day flag (return < -0.01)
stress_flag = (yv < -0.01).astype(float)
try:
if stress_flag.std() > 0:
pbr, pbp = stats.pointbiserialr(stress_flag.astype(bool), xv)
else:
pbr, pbp = float("nan"), float("nan")
except Exception:
pbr, pbp = float("nan"), float("nan")
rows.append({
"name": ind_name,
"lag": lag,
"n_days": n,
"pearson_r": float(pr),
"pearson_p": float(pp),
"spearman_r": float(sr),
"spearman_p": float(sp),
"stress_r": float(pbr),
"stress_p": float(pbp),
"note": "ok",
})
return rows
# ---------------------------------------------------------------------------
# Step 6: Build summary (best lag per indicator)
# ---------------------------------------------------------------------------
def build_summary(corr_rows: list, indicators: dict, trading_days: list) -> list:
    """Pick the best lag per indicator and rank indicators by |pearson_r|.

    Args:
        corr_rows: flat rows from compute_correlations (one per indicator x lag).
        indicators: {indicator_name: {date_str: float}} — used to report
            coverage for indicators that produced no valid correlation.
        trading_days: list of YYYY-MM-DD trading-day strings.

    Returns:
        List of summary dicts sorted by descending |best_r| and ranked from 1.
    """
    # FIX: removed an unused in-function `from collections import defaultdict`
    # (never referenced here; defaultdict is already imported at module level).
    best = {}
    for row in corr_rows:
        name = row["name"]
        r = row["pearson_r"]
        if math.isnan(r):
            continue
        # Keep the lag with the largest absolute Pearson correlation.
        if name not in best or abs(r) > abs(best[name]["pearson_r"]):
            best[name] = row
    # Add placeholder rows for indicators that had no valid correlation at any lag.
    for name in indicators:
        if name not in best:
            # Count non-NaN days so coverage is still reported.
            n_ok = sum(1 for d in trading_days if not math.isnan(
                _safe_float(indicators[name].get(d))
            ))
            best[name] = {
                "name": name,
                "lag": -1,  # sentinel: no usable lag
                "n_days": n_ok,
                "pearson_r": float("nan"),
                "pearson_p": float("nan"),
                "spearman_r": float("nan"),
                "spearman_p": float("nan"),
                "stress_r": float("nan"),
                "stress_p": float("nan"),
                "note": "no valid correlation",
            }
    # Assign source labels
    SOURCE_MAP = {
        **{k: "fred" for k in FRED_SERIES},
        "funding_btc": "binance", "funding_eth": "binance",
        "oi_btc": "binance", "oi_eth": "binance",
        "ls_btc": "binance", "ls_eth": "binance",
        "ls_top": "binance", "taker": "binance",
        "vol24": "binance",
        "dvol_btc": "deribit", "dvol_eth": "deribit",
        "fund_dbt_btc": "deribit", "fund_dbt_eth": "deribit",
        "rcap_btc": "coinmetrics", "mvrv": "coinmetrics",
        "nupl": "coinmetrics", "addr_btc": "coinmetrics",
        "addr_eth": "coinmetrics", "txcnt": "coinmetrics",
        "fees_btc": "coinmetrics", "fees_eth": "coinmetrics",
        "nvt": "coinmetrics", "velocity": "coinmetrics",
        "sply_act": "coinmetrics", "rcap_eth": "coinmetrics",
        "tvl": "defillama", "tvl_eth": "defillama",
        "stables": "defillama", "usdt": "defillama",
        "usdc": "defillama",
        "fng": "alternative", "fng_prev": "alternative",
        "fng_week": "alternative",
        "hashrate": "blockchain", "difficulty": "blockchain",
        "mcap_bc": "blockchain", "tx_blk": "blockchain",
        "total_btc": "blockchain",
        "btc_price": "coingecko", "eth_price": "coingecko",
    }
    # NaN correlations sort to the bottom via the -1 key.
    rows_sorted = sorted(
        best.values(),
        key=lambda x: abs(x["pearson_r"]) if not math.isnan(x["pearson_r"]) else -1,
        reverse=True,
    )
    summary = []
    for rank, row in enumerate(rows_sorted, 1):
        name = row["name"]
        # Significance stars: "*" for p<0.05, "**" for p<0.01.
        sig = ""
        if not math.isnan(row.get("pearson_p", float("nan"))):
            sig = "*" if row["pearson_p"] < 0.05 else ""
            sig += "*" if row["pearson_p"] < 0.01 else ""
        summary.append({
            "rank": rank,
            "name": name,
            "source": SOURCE_MAP.get(name, "?"),
            "best_lag": row["lag"],
            "best_r": row["pearson_r"],
            "best_p": row["pearson_p"],
            "spearman": row["spearman_r"],
            "stress_r": row["stress_r"],
            "n_days": row["n_days"],
            "sig": sig,
        "note": row.get("note", ""),
        })
    return summary
# ---------------------------------------------------------------------------
# Step 7: Save outputs
# ---------------------------------------------------------------------------
def save_daily_csv(trading_days: list, indicators: dict, returns: dict):
    """Write one CSV row per trading day: date, daily_return, then every indicator.

    NaN indicator values are written as empty cells.
    """
    path = RUN_LOGS / f"exf_daily_{TS_NOW}.csv"
    cols = ["date", "daily_return"] + sorted(indicators.keys())
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=cols)
        writer.writeheader()
        for day in trading_days:
            record = {"date": day, "daily_return": returns.get(day, "")}
            for name, series in indicators.items():
                value = series.get(day, float("nan"))
                record[name] = "" if math.isnan(value) else value
            writer.writerow(record)
    print(f"[SAVE] Daily values: {path}")
def save_corr_csv(corr_rows: list):
    """Write the full per-(indicator, lag) correlation table; no-op when empty."""
    path = RUN_LOGS / f"exf_correlation_{TS_NOW}.csv"
    if not corr_rows:
        return
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=list(corr_rows[0].keys()))
        writer.writeheader()
        writer.writerows(corr_rows)
    print(f"[SAVE] Correlation table: {path}")
def print_summary(summary: list):
print()
print("=" * 110)
print(f"{'rank':>4} {'name':<14} {'source':<12} {'best_lag':>8} {'best_r':>8} {'best_p':>8} "
f"{'spearman':>8} {'stress_r':>8} {'n_days':>6} {'sig':>4}")
print("=" * 110)
for row in summary:
r_str = f"{row['best_r']:+.4f}" if not math.isnan(row.get("best_r", float("nan"))) else " n/a "
p_str = f"{row['best_p']:.4f}" if not math.isnan(row.get("best_p", float("nan"))) else " n/a "
sp_str = f"{row['spearman']:+.4f}" if not math.isnan(row.get("spearman", float("nan"))) else " n/a "
sr_str = f"{row['stress_r']:+.4f}" if not math.isnan(row.get("stress_r", float("nan"))) else " n/a "
lag_str = f"lag={row['best_lag']}" if row['best_lag'] >= 0 else " - "
print(
f"{row['rank']:>4} {row['name']:<14} {row['source']:<12} {lag_str:>8} "
f"{r_str:>8} {p_str:>8} {sp_str:>8} {sr_str:>8} {row['n_days']:>6} {row['sig']:>4}"
)
print("=" * 110)
# Top 10 highlight
sig_rows = [r for r in summary if not math.isnan(r.get("best_p", float("nan"))) and r["best_p"] < 0.05]
print(f"\n[RESULT] {len(sig_rows)} indicators significant at p<0.05 (any lag)")
if sig_rows:
print("TOP PREDICTORS:")
for r in sig_rows[:15]:
print(f" {r['rank']:>3}. {r['name']:<14} lag={r['best_lag']} "
f"r={r['best_r']:+.4f} p={r['best_p']:.4f} spearman={r['spearman']:+.4f} "
f"n={r['n_days']}")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: load returns, fetch all sources, correlate, save, report."""
    parser = argparse.ArgumentParser(description="ExF historical correlation tester")
    parser.add_argument("--no-cache", action="store_true", help="Ignore and overwrite all caches")
    args = parser.parse_args()
    no_cache = args.no_cache
    print("=" * 70)
    print(" DOLPHIN ExF Correlation Tester")
    print(f" Date range: {START_STR} → 2026-02-25")
    print(f" Cache: {'DISABLED (--no-cache)' if no_cache else 'ENABLED'}")
    print("=" * 70)
    # --- Trading days
    trading_days = get_trading_days()
    if len(trading_days) == 0:
        print("[ERROR] No parquet files found in vbt_cache. Abort.")
        sys.exit(1)
    # --- Daily returns
    print("\n[STEP 1] Loading daily returns ...")
    try:
        returns = load_daily_returns(trading_days)
    except RuntimeError as e:
        print(f"[ERROR] {e}")
        sys.exit(1)
    valid_ret = sum(1 for v in returns.values() if not math.isnan(v))
    print(f" Returns: {valid_ret}/{len(trading_days)} valid")
    # --- Fetch all sources
    # Each source is fetched independently; a failure degrades to an empty
    # dict so the remaining sources are still analyzed.
    print("\n[STEP 2] Fetching external factor data ...")
    print("\n--- FRED ---")
    try:
        fred_data = fetch_fred(no_cache)
        print(f" FRED: {len(fred_data)} indicators OK")
    except Exception as e:
        print(f" FRED FAILED: {e}")
        fred_data = {}
    print("\n--- BINANCE DERIVATIVES ---")
    try:
        binance_data = fetch_binance(no_cache)
        print(f" BINANCE: {len(binance_data)} indicators OK")
    except Exception as e:
        print(f" BINANCE FAILED: {e}")
        binance_data = {}
    print("\n--- DERIBIT ---")
    try:
        deribit_data = fetch_deribit(no_cache)
        print(f" DERIBIT: {len(deribit_data)} indicators OK")
    except Exception as e:
        print(f" DERIBIT FAILED: {e}")
        deribit_data = {}
    print("\n--- COINMETRICS ---")
    try:
        cm_data = fetch_coinmetrics(no_cache)
        print(f" COINMETRICS: {len(cm_data)} indicators OK")
    except Exception as e:
        print(f" COINMETRICS FAILED: {e}")
        cm_data = {}
    print("\n--- DEFILLAMA ---")
    try:
        dl_data = fetch_defillama(no_cache)
        print(f" DEFILLAMA: {len(dl_data)} indicators OK")
    except Exception as e:
        print(f" DEFILLAMA FAILED: {e}")
        dl_data = {}
    print("\n--- FEAR & GREED (alternative.me) ---")
    try:
        fng_data = fetch_fng(no_cache)
        print(f" FNG: {len(fng_data)} indicators OK")
    except Exception as e:
        print(f" FNG FAILED: {e}")
        fng_data = {}
    print("\n--- BLOCKCHAIN.INFO ---")
    try:
        bc_data = fetch_blockchain(no_cache)
        print(f" BLOCKCHAIN: {len(bc_data)} indicators OK")
    except Exception as e:
        print(f" BLOCKCHAIN FAILED: {e}")
        bc_data = {}
    print("\n--- COINGECKO ---")
    try:
        cg_data = fetch_coingecko(no_cache)
        print(f" COINGECKO: {len(cg_data)} indicators OK")
    except Exception as e:
        print(f" COINGECKO FAILED: {e}")
        cg_data = {}
    # --- Assemble
    print("\n[STEP 3] Assembling indicators ...")
    indicators = assemble_indicators(
        fred_data, binance_data, deribit_data,
        cm_data, dl_data, fng_data, bc_data, cg_data,
    )
    print(f" Total indicators assembled: {len(indicators)}")
    # Coverage report
    print("\n Coverage per indicator over trading days:")
    for name in sorted(indicators.keys()):
        series = indicators[name]
        n_ok = sum(
            1 for d in trading_days
            if not math.isnan(_safe_float(series.get(d)))
        )
        pct = 100 * n_ok / len(trading_days)
        # Coverage buckets: >=40 trading days OK, >=10 PARTIAL, else FAIL.
        status = "OK" if n_ok >= 40 else ("PARTIAL" if n_ok >= 10 else "FAIL")
        print(f" {name:<18} {n_ok:>3}/{len(trading_days)} ({pct:5.1f}%) [{status}]")
    # --- Correlation analysis
    print("\n[STEP 4] Computing correlations (lags 0-7) ...")
    corr_rows = compute_correlations(trading_days, returns, indicators)
    print(f" Computed {len(corr_rows)} rows ({len(indicators)} indicators × {len(LAGS)} lags)")
    # --- Summary
    summary = build_summary(corr_rows, indicators, trading_days)
    # --- Save
    print("\n[STEP 5] Saving outputs ...")
    save_daily_csv(trading_days, indicators, returns)
    save_corr_csv(corr_rows)
    # --- Print table
    print("\n[STEP 6] Results")
    print_summary(summary)
    # Also print status for each source
    print("\n[SOURCE STATUS]")
    source_ok = {
        "fred": "OK" if fred_data else "FAIL",
        "binance": "OK" if binance_data else "FAIL",
        "deribit": "OK" if deribit_data else "FAIL",
        "coinmetrics": "OK" if cm_data else "FAIL",
        "defillama": "OK" if dl_data else "FAIL",
        "alternative": "OK" if fng_data else "FAIL",
        "blockchain": "OK" if bc_data else "FAIL",
        "coingecko": "OK" if cg_data else "FAIL",
    }
    for src, status in source_ok.items():
        print(f" {src:<14} {status}")
    print("\n[DONE] ExF correlation analysis complete.")
# Run the full analysis only when executed as a script (not on import).
if __name__ == "__main__":
    main()