#!/usr/bin/env python3 """ DOLPHIN OBF-Universe — Lightweight L2 health monitor for all USDT perps ======================================================================== Subscribes to @depth5@500ms for all active USDT perpetual pairs (~400 assets), computes OB health metrics every 500 ms (matching WS push cadence), and serves them via: - Hazelcast key : obf_universe_latest (JSON dict keyed by symbol) - Parquet archive: /mnt/ng6_data/ob_universe/ (Hive partitioned, no pruning) Used by Asset Picker to score tradable universe on real-time liquidity/health. Architecture: - Two asyncio WS connections (Binance cap: 300 streams each) - Subscribe to {symbol}@depth5@500ms — absolute snapshots, no book maintenance - In-memory state: Dict[symbol → {bids, asks, ts}] (lock-protected) - Snapshot thread: every SNAPSHOT_INTERVAL_S compute health + push HZ - Flush thread : every FLUSH_INTERVAL_S write Parquet Binance rate limits: - WebSocket push streams carry ZERO REST weight - depth5@500ms: ~2 msg/s per stream, 800 msg/s for 400 assets — trivial Usage: python prod/obf_universe_service.py python prod/obf_universe_service.py --snapshot-interval 30 --dry-run """ import argparse import asyncio import hashlib import json import logging import sys import threading import time from collections import deque from datetime import datetime, timezone from pathlib import Path from typing import Dict, List, Optional, Tuple import pyarrow as pa import pyarrow.parquet as pq import requests HCM_DIR = Path(__file__).parent.parent sys.path.insert(0, str(HCM_DIR)) sys.path.insert(0, str(HCM_DIR / "prod")) logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s — %(message)s", ) logger = logging.getLogger("obf_universe") # ── Constants ────────────────────────────────────────────────────────────────── SNAPSHOT_INTERVAL_S = 0.5 # HZ publish cadence — matches @depth5@500ms WS push PARQUET_BUFFER_EVERY_N = 20 # buffer to Parquet every Nth snapshot (= every 10s) 
FLUSH_INTERVAL_S = 300 # parquet flush cadence (5 min) FIRST_FLUSH_S = 60 # first flush fires early STALE_THRESHOLD_S = 10.0 # skip asset if book not updated within N s MAX_STREAMS_PER_CONN = 200 # conservative (<300 Binance limit); two connections PARQUET_BASE_DIR = Path("/mnt/ng6_data/ob_universe") HZ_KEY = "obf_universe_latest" WS_URL = "wss://fstream.binance.com/ws" EXCHANGE_INFO_URL = "https://fapi.binance.com/fapi/v1/exchangeInfo" # ── Parquet Schema (lightweight — health metrics only) ───────────────────────── OB_UNIVERSE_SCHEMA = pa.schema([ pa.field("local_ts", pa.float64()), pa.field("asset", pa.string()), pa.field("best_bid", pa.float64()), pa.field("best_ask", pa.float64()), pa.field("spread_bps", pa.float32()), pa.field("depth_1pct_usd", pa.float64()), pa.field("depth_quality", pa.float32()), pa.field("fill_probability", pa.float32()), pa.field("imbalance", pa.float32()), pa.field("n_bid_levels", pa.int32()), pa.field("n_ask_levels", pa.int32()), ]) def _compute_health(bids: list, asks: list) -> Optional[dict]: """Compute OB health metrics from raw depth5 bid/ask levels. bids/asks: [[price_str, qty_str], ...] (up to 5 levels, best first) Returns None if book is empty. 
""" if not bids or not asks: return None try: bid0 = float(bids[0][0]) ask0 = float(asks[0][0]) if bid0 <= 0 or ask0 <= 0: return None mid = (bid0 + ask0) / 2.0 spread_bps = (ask0 - bid0) / mid * 10_000.0 threshold_1pct = mid * 0.01 bid_notional = sum( float(p) * float(q) for p, q in bids if float(p) >= mid - threshold_1pct ) ask_notional = sum( float(p) * float(q) for p, q in asks if float(p) <= mid + threshold_1pct ) depth_1pct_usd = bid_notional + ask_notional depth_quality = float(min(1.0, depth_1pct_usd / 1_000_000.0)) fill_probability = float(max(0.0, min(1.0, 1.0 - spread_bps / 20.0))) total = bid_notional + ask_notional imbalance = float((bid_notional - ask_notional) / max(total, 1e-9)) return { "best_bid": bid0, "best_ask": ask0, "spread_bps": round(spread_bps, 4), "depth_1pct_usd": round(depth_1pct_usd, 2), "depth_quality": round(depth_quality, 4), "fill_probability": round(fill_probability, 4), "imbalance": round(imbalance, 4), "n_bid_levels": len(bids), "n_ask_levels": len(asks), } except (ValueError, IndexError, ZeroDivisionError): return None class OBUniverseService: """ Two-connection WS health monitor for the full USDT perp universe. 
class OBUniverseService:
    """
    Multi-connection WS health monitor for the full USDT perp universe.

    The symbol universe is split into batches of MAX_STREAMS_PER_CONN, one
    WebSocket connection (and background thread) per batch.  A snapshot thread
    turns the latest books into health metrics and publishes them; a flush
    thread periodically writes the buffered rows to Parquet.

    Thread safety:
      _state is protected by _state_lock (updated by WS threads, read by snapshot thread)
      _buf   is protected by _buf_lock   (filled by snapshot thread, drained by flush)
    """

    # Minimum spacing between Hazelcast reconnect attempts while disconnected.
    HZ_RECONNECT_INTERVAL_S = 5.0

    def __init__(self, hz_host: str = "localhost:5701", hz_cluster: str = "dolphin",
                 snapshot_interval_s: float = SNAPSHOT_INTERVAL_S,
                 dry_run: bool = False):
        """Store configuration and create empty state; no I/O happens here.

        Args:
            hz_host: Hazelcast member address passed to the client.
            hz_cluster: Hazelcast cluster name.
            snapshot_interval_s: Seconds between health snapshots.
            dry_run: When true, nothing is written to Hazelcast or Parquet.
        """
        self.hz_host = hz_host
        self.hz_cluster = hz_cluster
        self.snapshot_interval = snapshot_interval_s
        self.dry_run = dry_run

        self._state: Dict[str, dict] = {}     # symbol → {bids, asks, ts}
        self._state_lock = threading.Lock()

        # Parquet row buffer per asset: deque of health dicts
        self._buf: Dict[str, deque] = {}
        self._buf_lock = threading.Lock()

        self._running = False
        self._symbols: List[str] = []
        self._hz_client = None
        self._imap = None
        # Monotonic time before which no new Hazelcast reconnect is attempted.
        self._hz_retry_at = 0.0

        # Background threads, populated by start().
        self._ws_threads: List[threading.Thread] = []
        self._snapshot_thread: Optional[threading.Thread] = None
        self._flush_thread: Optional[threading.Thread] = None

        # Stats
        self.stats = {"snapshots": 0, "hz_pushes": 0, "hz_failures": 0,
                      "parquet_files": 0, "parquet_rows": 0, "ws_msgs": 0}

    # ── Universe fetch ─────────────────────────────────────────────────────────

    def _fetch_universe(self) -> List[str]:
        """Fetch all active USDT perpetual symbols from Binance.

        Returns:
            Sorted symbol list; on any failure a minimal three-symbol fallback
            so the service can still start.
        """
        try:
            resp = requests.get(EXCHANGE_INFO_URL, timeout=10)
            resp.raise_for_status()
            data = resp.json()
            symbols = [
                s["symbol"]
                for s in data["symbols"]
                if s.get("quoteAsset") == "USDT"
                and s.get("contractType") == "PERPETUAL"
                and s.get("status") == "TRADING"
            ]
            logger.info("Universe: %d active USDT perps fetched", len(symbols))
            return sorted(symbols)
        except Exception as e:
            # Deliberately broad: network, HTTP and payload-shape errors all
            # degrade to the fallback universe instead of aborting startup.
            logger.error("Failed to fetch universe: %s", e)
            return ["BTCUSDT", "ETHUSDT", "SOLUSDT"]  # minimal fallback

    # ── Hazelcast ─────────────────────────────────────────────────────────────

    def _connect_hz(self) -> bool:
        """(Re)create the Hazelcast client and the blocking map proxy.

        Any existing client is shut down first.

        Returns:
            True on success; False on failure, in which case ``_hz_client``
            and ``_imap`` are reset to None.
        """
        try:
            import hazelcast
            if self._hz_client:
                try:
                    self._hz_client.shutdown()
                except Exception:
                    pass  # best-effort cleanup of a possibly dead client
            self._hz_client = hazelcast.HazelcastClient(
                cluster_name=self.hz_cluster,
                cluster_members=[self.hz_host],
            )
            self._imap = self._hz_client.get_map("DOLPHIN_FEATURES").blocking()
            logger.info("HZ connected")
            return True
        except Exception as e:
            logger.warning("HZ connect failed: %s", e)
            self._hz_client = None
            self._imap = None
            return False

    def _maybe_reconnect_hz(self):
        """Retry the Hazelcast connection, at most once per HZ_RECONNECT_INTERVAL_S.

        Without this, a failed (re)connect would leave ``_imap`` as None and
        publishing would never resume.
        """
        now = time.monotonic()
        if now < self._hz_retry_at:
            return
        self._hz_retry_at = now + self.HZ_RECONNECT_INTERVAL_S
        self._connect_hz()

    # ── WebSocket connections ─────────────────────────────────────────────────

    def _update_state(self, symbol: str, bids: list, asks: list, ts: float):
        """Thread-safe state update from WS thread."""
        with self._state_lock:
            self._state[symbol] = {"bids": bids, "asks": asks, "ts": ts}
            self.stats["ws_msgs"] += 1

    async def _ws_connection(self, symbols: List[str], conn_id: int):
        """Async WS loop for one connection. Runs in a background thread's event loop.

        Subscribes to ``{symbol}@depth5@500ms`` for every symbol in the batch
        and stores each snapshot via ``_update_state``.  Reconnects until
        ``_running`` is cleared.
        """
        import websockets

        streams = [f"{s.lower()}@depth5@500ms" for s in symbols]
        logger.info("WS conn %d: subscribing to %d streams", conn_id, len(streams))

        while self._running:
            try:
                async with websockets.connect(
                    WS_URL,
                    ping_interval=20,
                    ping_timeout=30,
                    max_size=2**23,  # 8 MB — handle bursts
                ) as ws:
                    await ws.send(json.dumps({
                        "method": "SUBSCRIBE",
                        "params": streams,
                        "id": conn_id,
                    }))
                    logger.info("WS conn %d: subscribed", conn_id)

                    async for raw in ws:
                        if not self._running:
                            break
                        try:
                            msg = json.loads(raw)
                        except json.JSONDecodeError:
                            continue

                        # Skip subscription ack / errors
                        if "result" in msg or "error" in msg:
                            continue

                        symbol = msg.get("s")
                        if not symbol:
                            continue

                        bids = msg.get("b", [])
                        asks = msg.get("a", [])
                        # "T" is in milliseconds; fall back to the local clock.
                        ts = msg.get("T", time.time() * 1000) / 1000.0
                        self._update_state(symbol, bids, asks, ts)

            except Exception as e:
                if self._running:
                    logger.warning("WS conn %d error: %s — reconnecting in 5s", conn_id, e)

            # Pause before every reconnect (error or clean close) so a server
            # that keeps closing the socket cannot cause a tight loop.
            if self._running:
                await asyncio.sleep(5)

    def _run_ws_thread(self, symbols: List[str], conn_id: int):
        """Entrypoint for a WS background thread (owns its own event loop)."""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(self._ws_connection(symbols, conn_id))
        finally:
            loop.close()

    # ── Snapshot loop ─────────────────────────────────────────────────────────

    def _snapshot_loop(self):
        """Every snapshot interval: compute health for all assets, push to HZ."""
        next_tick = time.monotonic() + self.snapshot_interval
        while self._running:
            now_mono = time.monotonic()
            if now_mono < next_tick:
                time.sleep(min(1.0, next_tick - now_mono))
                continue
            next_tick = now_mono + self.snapshot_interval
            try:
                self._snapshot_once()
            except Exception:
                # An unexpected error must not kill this daemon thread:
                # nothing would restart it and publishing would stop silently.
                logger.exception("Snapshot tick failed")

    def _snapshot_once(self):
        """Run one snapshot tick: compute health, buffer rows, publish, log."""
        now_ts = time.time()
        cutoff = now_ts - STALE_THRESHOLD_S

        with self._state_lock:
            state_copy = dict(self._state)

        health_map: Dict[str, dict] = {}
        n_stale = 0
        for symbol, book in state_copy.items():
            if book["ts"] < cutoff:
                n_stale += 1
                continue
            h = _compute_health(book["bids"], book["asks"])
            if h is not None:
                health_map[symbol] = h

        self.stats["snapshots"] += 1

        # Buffer rows for Parquet flush every Nth snapshot (decoupled from HZ rate)
        if self.stats["snapshots"] % PARQUET_BUFFER_EVERY_N == 0:
            with self._buf_lock:
                for symbol, h in health_map.items():
                    row = {"local_ts": now_ts, "asset": symbol, **h}
                    self._buf.setdefault(symbol, deque()).append(row)

        payload = {
            "_snapshot_utc": datetime.now(timezone.utc).isoformat(),
            "_n_assets": len(health_map),
            "_n_stale": n_stale,
            "_n_total": len(state_copy),
            **health_map,
        }

        if self.dry_run:
            self.stats["hz_pushes"] += 1
        elif self._imap is not None:
            self._publish_to_hz(payload, health_map, state_copy, now_ts, cutoff)
        else:
            self._maybe_reconnect_hz()

        if self.stats["snapshots"] % 120 == 1:
            logger.info(
                "Snapshot #%d: health=%d/%d stale=%d hz_ok=%d",
                self.stats["snapshots"], len(health_map), len(state_copy),
                n_stale, self.stats["hz_pushes"],
            )

    def _publish_to_hz(self, payload: dict, health_map: Dict[str, dict],
                       state_copy: Dict[str, dict], now_ts: float, cutoff: float):
        """Push the universe payload and the per-asset book entries to Hazelcast.

        Any failure is counted in ``stats["hz_failures"]`` and triggers an
        immediate reconnect attempt.
        """
        try:
            self._imap.put(HZ_KEY, json.dumps(payload))
            self.stats["hz_pushes"] += 1
            self._forward_to_ch_writer(health_map, now_ts)

            # Write per-asset OB snapshots for HZOBProvider cache.
            # Format: asset_{symbol}_ob → {bid_notional[5], ask_notional[5],
            #                               bid_depth[5], ask_depth[5], timestamp}
            per_asset_kv: Dict[str, str] = {}
            for symbol, book in state_copy.items():
                if book["ts"] < cutoff:
                    continue
                try:
                    bn, bd = self._book_levels(book["bids"])
                    an, ad = self._book_levels(book["asks"])
                except (ValueError, TypeError, IndexError):
                    # A malformed book affects this symbol only; it is not a
                    # Hazelcast failure and must not force a reconnect.
                    continue
                per_asset_kv[f"asset_{symbol}_ob"] = json.dumps({
                    "timestamp": book["ts"],
                    "bid_notional": bn, "ask_notional": an,
                    "bid_depth": bd, "ask_depth": ad,
                })
            if per_asset_kv:
                self._imap.put_all(per_asset_kv)
        except Exception as e:
            self.stats["hz_failures"] += 1
            logger.warning("HZ push failed: %s", e)
            # Throttle follow-up attempts in case this reconnect fails too.
            self._hz_retry_at = time.monotonic() + self.HZ_RECONNECT_INTERVAL_S
            self._connect_hz()

    @staticmethod
    def _forward_to_ch_writer(health_map: Dict[str, dict], now_ts: float):
        """Best-effort: hand every health row to ``ch_writer.ch_put``.

        The module is optional (presumably a ClickHouse writer — confirm);
        an import or write failure is logged at debug level and ignored.
        """
        try:
            from ch_writer import ch_put
            snap_ts_ms = int(now_ts * 1_000)
            for sym, h in health_map.items():
                ch_put("obf_universe", {
                    "ts": snap_ts_ms,
                    "symbol": sym,
                    "spread_bps": float(h.get("spread_bps", 0)),
                    "depth_1pct_usd": float(h.get("depth_1pct_usd", 0)),
                    "depth_quality": float(h.get("depth_quality", 0)),
                    "fill_probability": float(h.get("fill_probability", 0)),
                    "imbalance": float(h.get("imbalance", 0)),
                    "best_bid": float(h.get("best_bid", 0)),
                    "best_ask": float(h.get("best_ask", 0)),
                    "n_bid_levels": int(h.get("n_bid_levels", 0)),
                    "n_ask_levels": int(h.get("n_ask_levels", 0)),
                })
        except Exception as e:
            logger.debug("ch_writer forward skipped: %s", e)

    @staticmethod
    def _book_levels(side: list, n: int = 5) -> Tuple[List[float], List[float]]:
        """Return ``(notional, depth)`` lists of length ``n`` for one book side.

        Levels beyond those present in ``side`` are zero-filled.  Raises
        ValueError / TypeError / IndexError on a malformed level.
        """
        notional = [0.0] * n
        depth = [0.0] * n
        for i, level in enumerate(side[:n]):
            price, qty = float(level[0]), float(level[1])
            notional[i] = round(price * qty, 2)
            depth[i] = round(qty, 6)
        return notional, depth

    # ── Parquet flush ─────────────────────────────────────────────────────────

    def _flush_loop(self):
        """After FIRST_FLUSH_S, then every FLUSH_INTERVAL_S: write buffered rows to Parquet."""
        time.sleep(FIRST_FLUSH_S)
        while self._running:
            try:
                self._flush()
            except Exception as e:
                logger.error("Parquet flush error: %s", e)
            time.sleep(FLUSH_INTERVAL_S)

    def _flush(self):
        """Drain the row buffer and write one Parquet file (plus .md5) per asset.

        Files land under
        ``PARQUET_BASE_DIR/exchange=binance/symbol=<S>/date=<YYYY-MM-DD>/``.
        In dry-run mode rows are only counted.  A failed write for one asset
        is logged and does not stop the others (its rows are dropped).
        """
        with self._buf_lock:
            snapshot = {s: list(q) for s, q in self._buf.items() if q}
            for q in self._buf.values():
                q.clear()

        if not snapshot:
            return

        now = datetime.now(timezone.utc)
        date_str = now.strftime("%Y-%m-%d")
        ts_str = now.strftime("%H-%M-%S-%f")

        # Fill-in values for any field missing from a buffered row.
        sentinel = {"local_ts": 0.0, "asset": "", "best_bid": 0.0, "best_ask": 0.0,
                    "spread_bps": float("nan"), "depth_1pct_usd": 0.0,
                    "depth_quality": float("nan"), "fill_probability": float("nan"),
                    "imbalance": float("nan"), "n_bid_levels": 0, "n_ask_levels": 0}

        total = 0
        for symbol, rows in snapshot.items():
            if self.dry_run:
                total += len(rows)
                continue
            try:
                out_dir = (
                    PARQUET_BASE_DIR
                    / "exchange=binance"
                    / f"symbol={symbol}"
                    / f"date={date_str}"
                )
                out_dir.mkdir(parents=True, exist_ok=True)
                filename = f"part-{ts_str}.parquet"
                out_path = out_dir / filename
                tmp_path = out_path.with_suffix(".tmp")

                arrays = {
                    f.name: pa.array(
                        [row.get(f.name, sentinel[f.name]) for row in rows],
                        type=f.type,
                    )
                    for f in OB_UNIVERSE_SCHEMA
                }
                table = pa.table(arrays, schema=OB_UNIVERSE_SCHEMA)
                # Write to a temp name and rename so readers never see a
                # partially written .parquet file.
                pq.write_table(table, tmp_path, compression="snappy")
                tmp_path.rename(out_path)

                digest = hashlib.md5(out_path.read_bytes()).hexdigest()
                (out_dir / f"{filename}.md5").write_text(digest)

                self.stats["parquet_files"] += 1
                total += len(rows)
            except Exception as e:
                logger.error("Parquet write failed for %s: %s", symbol, e)

        self.stats["parquet_rows"] += total
        if total:
            logger.info("Parquet flush: %d rows, %d assets", total, len(snapshot))

    # ── Lifecycle ─────────────────────────────────────────────────────────────

    def start(self, dry_run: bool = False):
        """Fetch the universe, connect Hazelcast and launch all background threads.

        Args:
            dry_run: When true, forces dry-run mode regardless of the
                constructor argument.
        """
        if dry_run:
            self.dry_run = True
        self._running = True

        self._symbols = self._fetch_universe()
        if not self.dry_run:
            self._connect_hz()

        # Initialise state dict for all symbols (ts=0.0 marks them stale
        # until the first WS message arrives)
        for s in self._symbols:
            self._state[s] = {"bids": [], "asks": [], "ts": 0.0}

        # Partition symbols across WS connections
        batches: List[List[str]] = [
            self._symbols[i:i + MAX_STREAMS_PER_CONN]
            for i in range(0, len(self._symbols), MAX_STREAMS_PER_CONN)
        ]

        self._ws_threads = []
        for conn_id, batch in enumerate(batches):
            t = threading.Thread(
                target=self._run_ws_thread,
                args=(batch, conn_id),
                daemon=True,
                name=f"obf-universe-ws-{conn_id}",
            )
            t.start()
            self._ws_threads.append(t)

        logger.info(
            "Started %d WS connections for %d symbols", len(batches), len(self._symbols)
        )

        self._snapshot_thread = threading.Thread(
            target=self._snapshot_loop, daemon=True, name="obf-universe-snapshot"
        )
        self._snapshot_thread.start()

        self._flush_thread = threading.Thread(
            target=self._flush_loop, daemon=True, name="obf-universe-flush"
        )
        self._flush_thread.start()

    def stop(self):
        """Signal all loops to exit, flush buffered rows and close Hazelcast."""
        logger.info(
            "Stopping — snapshots=%d hz=%d parquet_rows=%d ws_msgs=%d",
            self.stats["snapshots"], self.stats["hz_pushes"],
            self.stats["parquet_rows"], self.stats["ws_msgs"],
        )
        self._running = False

        # Final flush: the flush thread is a daemon and may be mid-sleep, so
        # rows buffered since its last run would otherwise be lost on exit.
        try:
            self._flush()
        except Exception as e:
            logger.error("Parquet flush error: %s", e)

        if self._hz_client:
            try:
                self._hz_client.shutdown()
            except Exception:
                pass  # already shutting down; nothing useful to do

    def run_forever(self):
        """Block main thread until KeyboardInterrupt / SIGTERM."""
        import signal

        def _sig(signum, frame):
            # Route SIGTERM through the same path as Ctrl-C.
            raise KeyboardInterrupt

        signal.signal(signal.SIGTERM, _sig)
        try:
            while self._running:
                time.sleep(5)
        except KeyboardInterrupt:
            pass
        finally:
            self.stop()
# ── Entry point ────────────────────────────────────────────────────────────────

def main():
    """CLI entry point: parse arguments, start the service, block until stopped."""
    parser = argparse.ArgumentParser(description="DOLPHIN OBF-Universe service")
    parser.add_argument("--snapshot-interval", type=float, default=SNAPSHOT_INTERVAL_S,
                        # %(default)s keeps the help text in sync with the constant
                        # (it previously claimed a default of 60).
                        help="Health snapshot interval in seconds (default %(default)s)")
    parser.add_argument("--hz-host", default="localhost:5701")
    parser.add_argument("--hz-cluster", default="dolphin")
    parser.add_argument("--dry-run", action="store_true",
                        help="Skip HZ writes and Parquet writes (testing)")
    args = parser.parse_args()

    svc = OBUniverseService(
        hz_host=args.hz_host,
        hz_cluster=args.hz_cluster,
        snapshot_interval_s=args.snapshot_interval,
    )
    svc.start(dry_run=args.dry_run)
    svc.run_forever()


if __name__ == "__main__":
    main()