# DOLPHIN/external_factors/backfill_liquidations_exf.py
"""
backfill_liquidations_exf.py Backfill liquidation ExF channels for 5y klines dates.
Fetches aggregate BTC liquidation data from Coinglass historical API and appends
4 new channels (liq_vol_24h, liq_long_ratio, liq_z_score, liq_percentile) to the
existing scan_000001__Indicators.npz files under EIGENVALUES_PATH.
Usage (from external_factors/ dir):
python backfill_liquidations_exf.py
python backfill_liquidations_exf.py --dry-run
python backfill_liquidations_exf.py --start 2023-01-01 --end 2023-12-31
python backfill_liquidations_exf.py --mode standalone
Output: each NPZ gains 4 new channels. Log ../../backfill_liquidations.log
"""
import sys
import time
import argparse
import asyncio
import math
import logging
from pathlib import Path
from datetime import datetime, timezone
import numpy as np
import aiohttp
# --- Paths (same as backfill_klines_exf.py) ---
HCM_DIR = Path(__file__).parent.parent  # two levels above this script
KLINES_DIR = HCM_DIR / "vbt_cache_klines"  # parquet klines; file stems supply the dates to backfill
# Absolute path to the per-date directories holding the indicator NPZ files.
EIGENVALUES_PATH = Path(
    r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues"
)
NPZ_FILENAME = "scan_000001__Indicators.npz"  # target file in --mode append
LIQ_NPZ_FILENAME = "scan_000001__Liq_Indicators.npz" # for --mode standalone
LOG_PATH = HCM_DIR / "backfill_liquidations.log"
# The 4 channel names appended per date, in this fixed order.
LIQ_KEYS = ["liq_vol_24h", "liq_long_ratio", "liq_z_score", "liq_percentile"]
# --- Coinglass endpoint ---
# Coinglass API v4 requires CG-API-KEY header
CG_URL_V4 = "https://open-api-v4.coinglass.com/api/futures/liquidation/aggregated-history"
RATE_DELAY = 2.0 # seconds between requests
# NOTE(review): RATE_DELAY appears unused — pacing is driven by the --delay CLI option; confirm before removing.
# Configure logging: every message goes to both the log file and stdout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[
        logging.FileHandler(str(LOG_PATH), encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
)
log = logging.getLogger(__name__)
def parse_args():
    """Parse the command-line options for a liquidation-backfill run."""
    parser = argparse.ArgumentParser(description="Backfill liquidation ExF channels")
    parser.add_argument("--start", default=None, help="Start date YYYY-MM-DD (inclusive)")
    parser.add_argument("--end", default=None, help="End date YYYY-MM-DD (inclusive)")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--delay", type=float, default=2.0)
    parser.add_argument("--overwrite", action="store_true")
    parser.add_argument("--mode", default="append", choices=["append", "standalone"])
    parser.add_argument(
        "--api-key",
        default=None,
        help="Coinglass API key (or set COINGLASS_API_KEY env var)",
    )
    return parser.parse_args()
def get_api_key(args) -> str:
    """Resolve the Coinglass API key: CLI flag wins, then COINGLASS_API_KEY env var, else ''."""
    import os  # local import kept — os is not imported at module level

    return args.api_key or os.environ.get("COINGLASS_API_KEY", "")
async def fetch_coinglass_day(
    session: aiohttp.ClientSession, ds: str, api_key: str
) -> tuple:
    """
    Fetch liquidation bars for date string 'YYYY-MM-DD'.
    Returns (liq_vol_log, liq_long_ratio, success: bool).
    Uses Coinglass API v4 which requires CG-API-KEY header.

    liq_vol_log is log10(total USD liquidated + 1) for the UTC day; liq_long_ratio
    is the long share of that total (0.5 when nothing was liquidated). On any
    failure the neutral fallback (0.0, 0.5, False) is returned.
    """
    if not api_key:
        log.error(f" {ds}: No Coinglass API key provided")
        return (0.0, 0.5, False)
    # Coinglass v4 uses different time format (Unix seconds, not ms)
    yr, mo, dy = int(ds[:4]), int(ds[5:7]), int(ds[8:10])
    start_ts = int(datetime(yr, mo, dy, 0, 0, 0, tzinfo=timezone.utc).timestamp())
    end_ts = int(datetime(yr, mo, dy, 23, 59, 59, tzinfo=timezone.utc).timestamp())
    # v4 API params - uses 'startTime' and 'endTime' in seconds
    params = {
        "symbol": "BTC",
        "interval": "1h",
        "startTime": start_ts,
        "endTime": end_ts,
    }
    headers = {
        "CG-API-KEY": api_key,
        "Accept": "application/json",
    }
    # Up to 3 attempts: 429 sleeps 30s and retries; timeouts/errors sleep 10s.
    # Non-retryable statuses (403, other non-200) return the fallback immediately.
    for attempt in range(3):
        try:
            async with session.get(
                CG_URL_V4,
                params=params,
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=15),
            ) as resp:
                if resp.status == 429:
                    log.warning(f" {ds}: rate limited (429) — sleeping 30s")
                    await asyncio.sleep(30)
                    continue
                if resp.status == 403:
                    log.error(f" {ds}: HTTP 403 - Invalid or missing API key")
                    return (0.0, 0.5, False)
                if resp.status != 200:
                    log.warning(f" {ds}: HTTP {resp.status}")
                    return (0.0, 0.5, False)
                # content_type=None accepts any Content-Type the API sends back.
                data = await resp.json(content_type=None)
                # Parse v4 response
                # Response: {"code":"0","msg":"success","data": [{"t":1234567890, "longLiquidationUsd":123.0, "shortLiquidationUsd":456.0}, ...]}
                if data.get("code") != "0":
                    log.warning(f" {ds}: API error: {data.get('msg', 'unknown')}")
                    return (0.0, 0.5, False)
                bars = data.get("data", [])
                if not bars:
                    log.warning(f" {ds}: empty liquidation data")
                    return (0.0, 0.5, False)
                # Sum long/short USD liquidation totals across the day's hourly bars.
                long_total = sum(float(b.get("longLiquidationUsd", 0)) for b in bars)
                short_total = sum(float(b.get("shortLiquidationUsd", 0)) for b in bars)
                total = long_total + short_total
                liq_vol_log = math.log10(total + 1.0)  # +1 guards log10(0)
                liq_long_ratio = (long_total / total) if total > 0 else 0.5
                return (liq_vol_log, liq_long_ratio, True)
        except asyncio.TimeoutError:
            log.warning(f" {ds}: timeout (attempt {attempt+1}/3)")
            await asyncio.sleep(10)
        except Exception as e:
            log.warning(f" {ds}: error {e} (attempt {attempt+1}/3)")
            await asyncio.sleep(10)
    # All attempts exhausted without a usable response.
    return (0.0, 0.5, False)
def compute_derived_metrics(dates, raw_vols, raw_success):
    """Derive rolling z-scores and expanding percentiles from the raw volume series.

    ``dates`` is an iterable of 'YYYY-MM-DD' strings; ``raw_vols``/``raw_success``
    map each date to its fetched value / success flag. Returns two dicts keyed by
    date: ``{date: (z_score, ok)}`` and ``{date: (percentile, ok)}``, where neutral
    defaults (0.0 / 0.5, ok=False) stand in when there is not enough clean history.
    """
    ordered = sorted(dates)
    vol_arr = np.array([raw_vols.get(d, 0.0) for d in ordered])
    ok_arr = np.array([raw_success.get(d, False) for d in ordered])

    window = 30  # rolling lookback (days) for the z-score
    z_out = {}
    pct_out = {}

    for idx, day in enumerate(ordered):
        if not ok_arr[idx]:
            # Fetch failed for this day: neutral values, flagged unsuccessful.
            z_out[day] = (0.0, False)
            pct_out[day] = (0.5, False)
            continue

        # z-score vs the successful values in the preceding (up to) 30 days.
        lo = max(0, idx - window)
        prior = vol_arr[lo:idx][ok_arr[lo:idx]]
        if len(prior) >= 5:
            z = float((vol_arr[idx] - prior.mean()) / (prior.std() + 1e-8))
            z_out[day] = (z, True)
        else:
            z_out[day] = (0.0, False)

        # Percentile rank vs all successful history up to and including today.
        hist = vol_arr[: idx + 1][ok_arr[: idx + 1]]
        if len(hist) >= 10:
            pct_out[day] = (float((hist < vol_arr[idx]).sum()) / len(hist), True)
        else:
            pct_out[day] = (0.5, False)

    return z_out, pct_out
def append_liq_to_npz(npz_path, liq_values, overwrite, dry_run):
    """Append the 4 liquidation channels to an indicator NPZ file.

    Parameters
    ----------
    npz_path : Path
        Target NPZ; created from scratch if it does not exist.
    liq_values : dict
        ``{channel_name: (value, success_flag)}`` covering every key in LIQ_KEYS.
    overwrite : bool
        Replace existing liq_* channels instead of skipping the file.
    dry_run : bool
        Compute everything but skip the final write.

    Returns
    -------
    bool
        True if the file was (or, under dry_run, would be) written;
        False on idempotent skip (channels already present, no overwrite).
    """
    if not npz_path.exists():
        # Create minimal NPZ holding only the liq channels (rare case).
        names = np.array(LIQ_KEYS, dtype=object)
        inds = np.array([liq_values[k][0] for k in LIQ_KEYS], dtype=np.float64)
        succ = np.array([liq_values[k][1] for k in LIQ_KEYS], dtype=np.bool_)
    else:
        # Bug fix: np.load keeps the .npz archive open; the original never closed
        # it before np.savez_compressed rewrote the SAME path, which can fail on
        # Windows and leaks the handle. Extract the arrays inside a `with` block
        # so the file is closed before any write.
        with np.load(str(npz_path), allow_pickle=True) as data:
            existing_names = [str(n) for n in data["api_names"]]
            api_indicators = data["api_indicators"]
            api_success = data["api_success"]
        if "liq_vol_24h" in existing_names and not overwrite:
            return False  # idempotent skip — channels already present
        if overwrite and "liq_vol_24h" in existing_names:
            # Strip any previous liq_* channels before appending fresh ones.
            keep = [
                i
                for i, n in enumerate(existing_names)
                if not n.startswith("liq_")
            ]
            existing_names = [existing_names[i] for i in keep]
            ex_inds = api_indicators[keep]
            ex_succ = api_success[keep]
        else:
            ex_inds = api_indicators
            ex_succ = api_success
        names = np.array(existing_names + LIQ_KEYS, dtype=object)
        inds = np.concatenate(
            [
                ex_inds.astype(np.float64),
                np.array([liq_values[k][0] for k in LIQ_KEYS], dtype=np.float64),
            ]
        )
        succ = np.concatenate(
            [
                ex_succ.astype(np.bool_),
                np.array([liq_values[k][1] for k in LIQ_KEYS], dtype=np.bool_),
            ]
        )
    if not dry_run:
        np.savez_compressed(
            str(npz_path), api_names=names, api_indicators=inds, api_success=succ
        )
    return True
async def main_async(args):
    """Run the three backfill phases: fetch raw data, derive metrics, write NPZs."""
    # Enumerate klines dates — parquet file stems are compared lexicographically
    # against --start/--end, so they are presumably 'YYYY-MM-DD' strings (TODO confirm).
    parquet_files = sorted(KLINES_DIR.glob("*.parquet"))
    parquet_files = [p for p in parquet_files if "catalog" not in str(p)]
    dates = [p.stem for p in parquet_files]
    if args.start:
        dates = [d for d in dates if d >= args.start]
    if args.end:
        dates = [d for d in dates if d <= args.end]
    total = len(dates)
    log.info(f"Dates to process: {total}")
    log.info(f"Mode: {args.mode} Dry-run: {args.dry_run} Overwrite: {args.overwrite}")
    raw_vols = {}
    raw_ratios = {}
    raw_success = {}
    # Get API key (missing key is a warning only — fetches will then fail per-day)
    api_key = get_api_key(args)
    if not api_key:
        log.warning("No Coinglass API key provided! Use --api-key or set COINGLASS_API_KEY env var.")
        log.warning("Get a free API key at: https://www.coinglass.com/pricing")
    # Phase 1: Fetch raw data from Coinglass, one date at a time, paced by --delay.
    log.info("=== PHASE 1: Fetching Coinglass liquidation data ===")
    t0 = time.time()
    async with aiohttp.ClientSession() as session:
        for i, ds in enumerate(sorted(dates)):
            vol, ratio, ok = await fetch_coinglass_day(session, ds, api_key)
            raw_vols[ds] = vol
            raw_ratios[ds] = ratio
            raw_success[ds] = ok
            if (i + 1) % 10 == 0:
                # Every 10th date additionally reports elapsed time and an ETA.
                elapsed = time.time() - t0
                eta = (total - i - 1) * args.delay  # NOTE(review): ETA ignores per-request time
                log.info(
                    f" [{i+1}/{total}] {ds} vol={vol:.3f} ratio={ratio:.3f} ok={ok}"
                    f" elapsed={elapsed/60:.1f}m eta={eta/60:.1f}m"
                )
            else:
                log.info(f" {ds} vol={vol:.3f} ratio={ratio:.3f} ok={ok}")
            await asyncio.sleep(args.delay)  # rate-limit between Coinglass requests
    # Phase 2: Compute derived metrics over the full fetched series.
    log.info("=== PHASE 2: Computing z_score and percentile ===")
    z_scores, percentiles = compute_derived_metrics(dates, raw_vols, raw_success)
    # Phase 3: Append the 4 channels to one NPZ per date directory.
    log.info(f"=== PHASE 3: Appending to NPZ files (mode={args.mode}) ===")
    written = skipped = errors = 0
    for ds in sorted(dates):
        # Neutral fallbacks (0.0 / 0.5, success=False) cover dates with no data.
        liq_values = {
            "liq_vol_24h": (raw_vols.get(ds, 0.0), raw_success.get(ds, False)),
            "liq_long_ratio": (raw_ratios.get(ds, 0.5), raw_success.get(ds, False)),
            "liq_z_score": z_scores.get(ds, (0.0, False)),
            "liq_percentile": percentiles.get(ds, (0.5, False)),
        }
        out_dir = EIGENVALUES_PATH / ds
        if args.mode == "append":
            npz_path = out_dir / NPZ_FILENAME
        else: # standalone
            npz_path = out_dir / LIQ_NPZ_FILENAME
        out_dir.mkdir(parents=True, exist_ok=True)
        try:
            did_write = append_liq_to_npz(npz_path, liq_values, args.overwrite, args.dry_run)
            if did_write:
                written += 1
                log.debug(f" {ds}: written")
            else:
                skipped += 1
        except Exception as e:
            log.error(f" {ds}: NPZ write error — {e}")
            errors += 1
    # Final summary (runtime measured from the start of Phase 1).
    elapsed_total = time.time() - t0
    log.info(f"{'='*60}")
    log.info(f"Liquidation ExF Backfill COMPLETE")
    log.info(f"Written: {written}")
    log.info(f"Skipped: {skipped} (already had liq channels)")
    log.info(f"Errors: {errors}")
    log.info(f"Runtime: {elapsed_total/60:.1f}m")
    log.info(f"{'='*60}")
def main():
    """CLI entry point: parse arguments and drive the async backfill."""
    asyncio.run(main_async(parse_args()))


if __name__ == "__main__":
    main()