"""
backfill_liquidations_exf.py — Backfill liquidation ExF channels for 5y klines dates.

Fetches aggregate BTC liquidation data from the Coinglass historical API and
appends 4 new channels (liq_vol_24h, liq_long_ratio, liq_z_score,
liq_percentile) to the existing scan_000001__Indicators.npz files under
EIGENVALUES_PATH.

Usage (from external_factors/ dir):
    python backfill_liquidations_exf.py
    python backfill_liquidations_exf.py --dry-run
    python backfill_liquidations_exf.py --start 2023-01-01 --end 2023-12-31
    python backfill_liquidations_exf.py --mode standalone

Output: each NPZ gains 4 new channels. Log → ../../backfill_liquidations.log
"""

import argparse
import asyncio
import logging
import math
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

import aiohttp
import numpy as np

# --- Paths (same as backfill_klines_exf.py) ---
HCM_DIR = Path(__file__).parent.parent
KLINES_DIR = HCM_DIR / "vbt_cache_klines"
EIGENVALUES_PATH = Path(
    r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues"
)
NPZ_FILENAME = "scan_000001__Indicators.npz"
LIQ_NPZ_FILENAME = "scan_000001__Liq_Indicators.npz"  # for --mode standalone
LOG_PATH = HCM_DIR / "backfill_liquidations.log"

# Channel names appended to each NPZ, in this fixed order.
LIQ_KEYS = ["liq_vol_24h", "liq_long_ratio", "liq_z_score", "liq_percentile"]

# --- Coinglass endpoint ---
# Coinglass API v4 requires CG-API-KEY header
CG_URL_V4 = "https://open-api-v4.coinglass.com/api/futures/liquidation/aggregated-history"
RATE_DELAY = 2.0  # seconds between requests (default for --delay)

# Configure logging: mirror everything to the log file and stdout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[
        logging.FileHandler(str(LOG_PATH), encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
)
log = logging.getLogger(__name__)


def parse_args():
    """Parse CLI arguments for the backfill run."""
    p = argparse.ArgumentParser(description="Backfill liquidation ExF channels")
    p.add_argument("--start", default=None, help="Start date YYYY-MM-DD (inclusive)")
    p.add_argument("--end", default=None, help="End date YYYY-MM-DD (inclusive)")
    p.add_argument("--dry-run", action="store_true")
    # Default tied to RATE_DELAY so the constant and the CLI stay in sync.
    p.add_argument("--delay", type=float, default=RATE_DELAY)
    p.add_argument("--overwrite", action="store_true")
    p.add_argument("--mode", default="append", choices=["append", "standalone"])
    p.add_argument(
        "--api-key",
        default=None,
        help="Coinglass API key (or set COINGLASS_API_KEY env var)",
    )
    return p.parse_args()


def get_api_key(args) -> str:
    """Get Coinglass API key from args or environment ('' if neither is set)."""
    return args.api_key or os.environ.get("COINGLASS_API_KEY", "")


async def fetch_coinglass_day(
    session: aiohttp.ClientSession, ds: str, api_key: str
) -> tuple:
    """
    Fetch liquidation bars for date string 'YYYY-MM-DD'.

    Returns (liq_vol_log, liq_long_ratio, success: bool), where liq_vol_log is
    log10(total_liquidation_usd + 1) over the UTC day and liq_long_ratio is the
    long share of that total (0.5 neutral fallback on failure).

    Uses Coinglass API v4 which requires a CG-API-KEY header. Retries up to 3
    times on timeout/transient errors; waits 30s on HTTP 429.
    """
    if not api_key:
        log.error(f" {ds}: No Coinglass API key provided")
        return (0.0, 0.5, False)

    # Coinglass v4 uses different time format (Unix seconds, not ms)
    yr, mo, dy = int(ds[:4]), int(ds[5:7]), int(ds[8:10])
    start_ts = int(datetime(yr, mo, dy, 0, 0, 0, tzinfo=timezone.utc).timestamp())
    end_ts = int(datetime(yr, mo, dy, 23, 59, 59, tzinfo=timezone.utc).timestamp())

    # v4 API params - uses 'startTime' and 'endTime' in seconds
    params = {
        "symbol": "BTC",
        "interval": "1h",
        "startTime": start_ts,
        "endTime": end_ts,
    }
    headers = {
        "CG-API-KEY": api_key,
        "Accept": "application/json",
    }

    for attempt in range(3):
        try:
            async with session.get(
                CG_URL_V4,
                params=params,
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=15),
            ) as resp:
                if resp.status == 429:
                    log.warning(f" {ds}: rate limited (429) — sleeping 30s")
                    await asyncio.sleep(30)
                    continue
                if resp.status == 403:
                    log.error(f" {ds}: HTTP 403 - Invalid or missing API key")
                    return (0.0, 0.5, False)
                if resp.status != 200:
                    log.warning(f" {ds}: HTTP {resp.status}")
                    return (0.0, 0.5, False)

                data = await resp.json(content_type=None)
                # Parse v4 response
                # Response: {"code":"0","msg":"success","data": [{"t":1234567890,
                # "longLiquidationUsd":123.0, "shortLiquidationUsd":456.0}, ...]}
                # str() guards against the API returning the code as int 0.
                if str(data.get("code")) != "0":
                    log.warning(f" {ds}: API error: {data.get('msg', 'unknown')}")
                    return (0.0, 0.5, False)

                bars = data.get("data", [])
                if not bars:
                    log.warning(f" {ds}: empty liquidation data")
                    return (0.0, 0.5, False)

                long_total = sum(float(b.get("longLiquidationUsd", 0)) for b in bars)
                short_total = sum(float(b.get("shortLiquidationUsd", 0)) for b in bars)
                total = long_total + short_total
                # +1 keeps log10 finite when the day had zero liquidations.
                liq_vol_log = math.log10(total + 1.0)
                liq_long_ratio = (long_total / total) if total > 0 else 0.5
                return (liq_vol_log, liq_long_ratio, True)
        except asyncio.TimeoutError:
            log.warning(f" {ds}: timeout (attempt {attempt+1}/3)")
            await asyncio.sleep(10)
        except Exception as e:
            log.warning(f" {ds}: error {e} (attempt {attempt+1}/3)")
            await asyncio.sleep(10)

    return (0.0, 0.5, False)


def compute_derived_metrics(dates, raw_vols, raw_success):
    """Compute z_score and percentile across the full series.

    Args:
        dates: iterable of 'YYYY-MM-DD' strings.
        raw_vols: {date: liq_vol_log float}.
        raw_success: {date: bool} fetch-success flags.

    Returns:
        (z_scores, percentiles) — each a {date: (value, success)} dict.
        Failed-fetch days get neutral values with success=False.
    """
    dates_sorted = sorted(dates)
    vols = np.array([raw_vols.get(d, 0.0) for d in dates_sorted])
    success = np.array([raw_success.get(d, False) for d in dates_sorted])

    z_scores = {}
    percentiles = {}
    WINDOW = 30
    for i, ds in enumerate(dates_sorted):
        if not success[i]:
            z_scores[ds] = (0.0, False)
            percentiles[ds] = (0.5, False)
            continue

        # z_score vs 30d rolling window (window excludes the current day).
        start = max(0, i - WINDOW)
        w_vals = vols[start:i][success[start:i]]
        if len(w_vals) >= 5:
            # epsilon avoids div-by-zero on a constant window
            z = float((vols[i] - w_vals.mean()) / (w_vals.std() + 1e-8))
            z_scores[ds] = (z, True)
        else:
            z_scores[ds] = (0.0, False)

        # percentile vs full history to date (inclusive of today; strict <,
        # so the result is always < 1.0).
        hist = vols[: i + 1][success[: i + 1]]
        if len(hist) >= 10:
            pct = float((hist < vols[i]).sum()) / len(hist)
            percentiles[ds] = (pct, True)
        else:
            percentiles[ds] = (0.5, False)

    return z_scores, percentiles


def append_liq_to_npz(npz_path, liq_values, overwrite, dry_run):
    """Append 4 liq channels to existing NPZ. liq_values = {key: (float, bool)}.

    Returns True when the file was (or would be, under dry_run) written,
    False for an idempotent skip (channels already present, no --overwrite).
    """
    if not npz_path.exists():
        # Create minimal NPZ (rare case: no indicators file for this date yet)
        names = np.array(LIQ_KEYS, dtype=object)
        inds = np.array([liq_values[k][0] for k in LIQ_KEYS], dtype=np.float64)
        succ = np.array([liq_values[k][1] for k in LIQ_KEYS], dtype=np.bool_)
    else:
        # Context-manage the NpzFile so its handle is closed before
        # savez_compressed overwrites the same path (Windows would otherwise
        # risk failing on the still-open file).
        with np.load(str(npz_path), allow_pickle=True) as data:
            existing_names = [str(n) for n in data["api_names"]]
            if "liq_vol_24h" in existing_names and not overwrite:
                return False  # idempotent skip
            # Strip old liq channels if overwriting
            if overwrite and "liq_vol_24h" in existing_names:
                keep = [
                    i for i, n in enumerate(existing_names)
                    if not n.startswith("liq_")
                ]
                existing_names = [existing_names[i] for i in keep]
                ex_inds = data["api_indicators"][keep]
                ex_succ = data["api_success"][keep]
            else:
                ex_inds = data["api_indicators"]
                ex_succ = data["api_success"]
            # Materialize before the archive closes.
            ex_inds = np.asarray(ex_inds)
            ex_succ = np.asarray(ex_succ)

        names = np.array(existing_names + LIQ_KEYS, dtype=object)
        inds = np.concatenate(
            [
                ex_inds.astype(np.float64),
                np.array([liq_values[k][0] for k in LIQ_KEYS], dtype=np.float64),
            ]
        )
        succ = np.concatenate(
            [
                ex_succ.astype(np.bool_),
                np.array([liq_values[k][1] for k in LIQ_KEYS], dtype=np.bool_),
            ]
        )

    if not dry_run:
        np.savez_compressed(
            str(npz_path), api_names=names, api_indicators=inds, api_success=succ
        )
    return True


async def main_async(args):
    """Run the three backfill phases: fetch, derive, write."""
    # Enumerate klines dates from the parquet cache (one file per date).
    parquet_files = sorted(KLINES_DIR.glob("*.parquet"))
    parquet_files = [p for p in parquet_files if "catalog" not in str(p)]
    dates = [p.stem for p in parquet_files]
    if args.start:
        dates = [d for d in dates if d >= args.start]
    if args.end:
        dates = [d for d in dates if d <= args.end]

    total = len(dates)
    log.info(f"Dates to process: {total}")
    log.info(f"Mode: {args.mode} Dry-run: {args.dry_run} Overwrite: {args.overwrite}")

    raw_vols = {}
    raw_ratios = {}
    raw_success = {}

    # Get API key
    api_key = get_api_key(args)
    if not api_key:
        log.warning("No Coinglass API key provided! Use --api-key or set COINGLASS_API_KEY env var.")
        log.warning("Get a free API key at: https://www.coinglass.com/pricing")

    # Phase 1: Fetch raw data from Coinglass
    log.info("=== PHASE 1: Fetching Coinglass liquidation data ===")
    t0 = time.time()
    async with aiohttp.ClientSession() as session:
        for i, ds in enumerate(sorted(dates)):
            vol, ratio, ok = await fetch_coinglass_day(session, ds, api_key)
            raw_vols[ds] = vol
            raw_ratios[ds] = ratio
            raw_success[ds] = ok
            if (i + 1) % 10 == 0:
                elapsed = time.time() - t0
                # ETA from observed throughput (includes fetch + sleep time),
                # not just the configured delay.
                eta = (total - i - 1) * (elapsed / (i + 1))
                log.info(
                    f" [{i+1}/{total}] {ds} vol={vol:.3f} ratio={ratio:.3f} ok={ok}"
                    f" elapsed={elapsed/60:.1f}m eta={eta/60:.1f}m"
                )
            else:
                log.info(f" {ds} vol={vol:.3f} ratio={ratio:.3f} ok={ok}")
            await asyncio.sleep(args.delay)

    # Phase 2: Compute derived metrics
    log.info("=== PHASE 2: Computing z_score and percentile ===")
    z_scores, percentiles = compute_derived_metrics(dates, raw_vols, raw_success)

    # Phase 3: Append to NPZ files
    log.info(f"=== PHASE 3: Appending to NPZ files (mode={args.mode}) ===")
    written = skipped = errors = 0
    for ds in sorted(dates):
        liq_values = {
            "liq_vol_24h": (raw_vols.get(ds, 0.0), raw_success.get(ds, False)),
            "liq_long_ratio": (raw_ratios.get(ds, 0.5), raw_success.get(ds, False)),
            "liq_z_score": z_scores.get(ds, (0.0, False)),
            "liq_percentile": percentiles.get(ds, (0.5, False)),
        }
        out_dir = EIGENVALUES_PATH / ds
        if args.mode == "append":
            npz_path = out_dir / NPZ_FILENAME
        else:  # standalone
            npz_path = out_dir / LIQ_NPZ_FILENAME
            out_dir.mkdir(parents=True, exist_ok=True)
        try:
            did_write = append_liq_to_npz(npz_path, liq_values, args.overwrite, args.dry_run)
            if did_write:
                written += 1
                log.debug(f" {ds}: written")
            else:
                skipped += 1
        except Exception as e:
            log.error(f" {ds}: NPZ write error — {e}")
            errors += 1

    elapsed_total = time.time() - t0
    log.info(f"{'='*60}")
    log.info("Liquidation ExF Backfill COMPLETE")
    log.info(f"Written: {written}")
    log.info(f"Skipped: {skipped} (already had liq channels)")
    log.info(f"Errors: {errors}")
    log.info(f"Runtime: {elapsed_total/60:.1f}m")
    log.info(f"{'='*60}")


def main():
    """CLI entry point."""
    args = parse_args()
    asyncio.run(main_async(args))


if __name__ == "__main__":
    main()