Files
DOLPHIN/prod/obf_universe_service.py

561 lines
22 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
DOLPHIN OBF-Universe — Lightweight L2 health monitor for all USDT perps
========================================================================
Subscribes to @depth5@500ms for all active USDT perpetual pairs (~400 assets),
computes OB health metrics every 500 ms (matching WS push cadence), and serves them via:
- Hazelcast key : obf_universe_latest (JSON dict keyed by symbol)
- Parquet archive: /mnt/ng6_data/ob_universe/ (Hive partitioned, no pruning)
Used by Asset Picker to score tradable universe on real-time liquidity/health.
Architecture:
- Two asyncio WS connections (Binance cap: 300 streams each)
- Subscribe to {symbol}@depth5@500ms absolute snapshots, no book maintenance
- In-memory state: Dict[symbol → {bids, asks, ts}] (lock-protected)
- Snapshot thread: every SNAPSHOT_INTERVAL_S compute health + push HZ
- Flush thread : every FLUSH_INTERVAL_S write Parquet
Binance rate limits:
- WebSocket push streams carry ZERO REST weight
- depth5@500ms: ~2 msg/s per stream, i.e. ~800 msg/s for 400 assets — a trivial load
Usage:
python prod/obf_universe_service.py
python prod/obf_universe_service.py --snapshot-interval 30 --dry-run
"""
import argparse
import asyncio
import hashlib
import json
import logging
import sys
import threading
import time
from collections import deque
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import pyarrow as pa
import pyarrow.parquet as pq
import requests
# Two levels above this file; both that directory and its prod/ subdirectory
# are pushed to the front of sys.path so modules living there can be imported
# by bare name.
HCM_DIR = Path(__file__).parent.parent
sys.path.insert(0, str(HCM_DIR))
sys.path.insert(0, str(HCM_DIR / "prod"))

# The ": " between logger name and message keeps the two from running together
# (e.g. "obf_universe: HZ connected" rather than "obf_universeHZ connected").
LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
)
logger = logging.getLogger("obf_universe")
# ── Constants ──────────────────────────────────────────────────────────────────
# Cadence
SNAPSHOT_INTERVAL_S = 0.5       # seconds between health snapshots / HZ publishes (matches the 500 ms WS push)
PARQUET_BUFFER_EVERY_N = 20     # only every Nth snapshot is buffered for Parquet (10 s at the default interval)
FLUSH_INTERVAL_S = 5 * 60       # seconds between Parquet flushes
FIRST_FLUSH_S = 60              # the first flush fires early, after this many seconds
STALE_THRESHOLD_S = 10.0        # a book older than this many seconds is skipped as stale

# Connections and destinations
MAX_STREAMS_PER_CONN = 200      # streams per WS connection; kept conservatively below the 300 Binance limit
PARQUET_BASE_DIR = Path("/mnt/ng6_data/ob_universe")
HZ_KEY = "obf_universe_latest"
WS_URL = "wss://fstream.binance.com/ws"
EXCHANGE_INFO_URL = "https://fapi.binance.com/fapi/v1/exchangeInfo"

# ── Parquet Schema (lightweight — health metrics only) ─────────────────────────
# One row per (snapshot, asset). Column order here is the column order on disk.
OB_UNIVERSE_SCHEMA = pa.schema([
    ("local_ts", pa.float64()),
    ("asset", pa.string()),
    ("best_bid", pa.float64()),
    ("best_ask", pa.float64()),
    ("spread_bps", pa.float32()),
    ("depth_1pct_usd", pa.float64()),
    ("depth_quality", pa.float32()),
    ("fill_probability", pa.float32()),
    ("imbalance", pa.float32()),
    ("n_bid_levels", pa.int32()),
    ("n_ask_levels", pa.int32()),
])
def _compute_health(bids: list, asks: list) -> Optional[dict]:
"""Compute OB health metrics from raw depth5 bid/ask levels.
bids/asks: [[price_str, qty_str], ...] (up to 5 levels, best first)
Returns None if book is empty.
"""
if not bids or not asks:
return None
try:
bid0 = float(bids[0][0])
ask0 = float(asks[0][0])
if bid0 <= 0 or ask0 <= 0:
return None
mid = (bid0 + ask0) / 2.0
spread_bps = (ask0 - bid0) / mid * 10_000.0
threshold_1pct = mid * 0.01
bid_notional = sum(
float(p) * float(q)
for p, q in bids
if float(p) >= mid - threshold_1pct
)
ask_notional = sum(
float(p) * float(q)
for p, q in asks
if float(p) <= mid + threshold_1pct
)
depth_1pct_usd = bid_notional + ask_notional
depth_quality = float(min(1.0, depth_1pct_usd / 1_000_000.0))
fill_probability = float(max(0.0, min(1.0, 1.0 - spread_bps / 20.0)))
total = bid_notional + ask_notional
imbalance = float((bid_notional - ask_notional) / max(total, 1e-9))
return {
"best_bid": bid0,
"best_ask": ask0,
"spread_bps": round(spread_bps, 4),
"depth_1pct_usd": round(depth_1pct_usd, 2),
"depth_quality": round(depth_quality, 4),
"fill_probability": round(fill_probability, 4),
"imbalance": round(imbalance, 4),
"n_bid_levels": len(bids),
"n_ask_levels": len(asks),
}
except (ValueError, IndexError, ZeroDivisionError):
return None
class OBUniverseService:
    """
    WebSocket-driven health monitor for the full USDT perp universe.

    Symbols are split into batches of MAX_STREAMS_PER_CONN; each batch gets its
    own WS connection running in a dedicated daemon thread. A snapshot thread
    turns the latest books into health metrics and publishes them to Hazelcast;
    a flush thread archives buffered rows to Parquet.

    Thread safety:
        _state is protected by _state_lock (written by WS threads, copied by
        the snapshot thread).
        _buf is protected by _buf_lock (snapshot thread appends, flush drains).
        Hazelcast reconnect bookkeeping is protected by _hz_lock.
    """

    # Minimum spacing between background Hazelcast reconnect attempts.
    _HZ_RETRY_INTERVAL_S = 5.0

    # Per-column fallback used when a buffered row lacks a schema field.
    _ROW_DEFAULTS = {
        "local_ts": 0.0, "asset": "", "best_bid": 0.0, "best_ask": 0.0,
        "spread_bps": float("nan"), "depth_1pct_usd": 0.0,
        "depth_quality": float("nan"), "fill_probability": float("nan"),
        "imbalance": float("nan"), "n_bid_levels": 0, "n_ask_levels": 0,
    }

    def __init__(self, hz_host: str = "localhost:5701", hz_cluster: str = "dolphin",
                 snapshot_interval_s: float = SNAPSHOT_INTERVAL_S,
                 dry_run: bool = False):
        """
        Args:
            hz_host: Hazelcast member address ("host:port").
            hz_cluster: Hazelcast cluster name.
            snapshot_interval_s: Seconds between health snapshots.
            dry_run: When True, skip Hazelcast and Parquet writes.
        """
        self.hz_host = hz_host
        self.hz_cluster = hz_cluster
        self.snapshot_interval = snapshot_interval_s
        self.dry_run = dry_run

        self._state: Dict[str, dict] = {}  # symbol → {bids, asks, ts}
        self._state_lock = threading.Lock()

        # Parquet row buffer per asset: deque of health dicts
        self._buf: Dict[str, deque] = {}
        self._buf_lock = threading.Lock()

        self._running = False
        self._symbols: List[str] = []

        self._hz_client = None
        self._imap = None
        self._hz_lock = threading.Lock()
        self._hz_reconnecting = False   # a background reconnect is in flight
        self._hz_next_retry = 0.0       # time.monotonic() before which no retry starts

        self._ws_threads: List[threading.Thread] = []
        self._snapshot_thread: Optional[threading.Thread] = None
        self._flush_thread: Optional[threading.Thread] = None

        # Stats
        self.stats = {"snapshots": 0, "hz_pushes": 0, "hz_failures": 0,
                      "parquet_files": 0, "parquet_rows": 0, "ws_msgs": 0}

    # ── Universe fetch ─────────────────────────────────────────────────────────
    def _fetch_universe(self) -> List[str]:
        """Fetch all active USDT perpetual symbols from Binance.

        Returns the sorted symbol list, or a three-symbol fallback if the
        request or the response parsing fails for any reason.
        """
        try:
            resp = requests.get(EXCHANGE_INFO_URL, timeout=10)
            resp.raise_for_status()
            data = resp.json()
            symbols = [
                s["symbol"] for s in data["symbols"]
                if s.get("quoteAsset") == "USDT"
                and s.get("contractType") == "PERPETUAL"
                and s.get("status") == "TRADING"
            ]
            logger.info("Universe: %d active USDT perps fetched", len(symbols))
            return sorted(symbols)
        except Exception as e:
            logger.error("Failed to fetch universe: %s", e)
            return ["BTCUSDT", "ETHUSDT", "SOLUSDT"]  # minimal fallback

    # ── Hazelcast ─────────────────────────────────────────────────────────────
    def _connect_hz(self) -> bool:
        """(Re)create the Hazelcast client and the blocking map proxy.

        Any previous client is shut down first. Returns True on success; on
        failure both the client and the map are reset to None.
        """
        try:
            import hazelcast
            if self._hz_client:
                try:
                    self._hz_client.shutdown()
                except Exception:
                    pass
            self._hz_client = hazelcast.HazelcastClient(
                cluster_name=self.hz_cluster,
                cluster_members=[self.hz_host],
            )
            self._imap = self._hz_client.get_map("DOLPHIN_FEATURES").blocking()
            logger.info("HZ connected")
            return True
        except Exception as e:
            logger.warning("HZ connect failed: %s", e)
            self._hz_client = None
            self._imap = None
            return False

    def _request_hz_reconnect(self) -> None:
        """Start a background reconnect unless one is running or was tried recently.

        Connecting can block for a long time when the cluster is unreachable,
        so it runs off the snapshot thread; snapshots and Parquet buffering
        keep their cadence during a Hazelcast outage.
        """
        with self._hz_lock:
            if self._hz_reconnecting or time.monotonic() < self._hz_next_retry:
                return
            self._hz_reconnecting = True
        threading.Thread(
            target=self._hz_reconnect_worker,
            daemon=True,
            name="obf-universe-hz-reconnect",
        ).start()

    def _hz_reconnect_worker(self) -> None:
        """Body of the background reconnect thread."""
        try:
            if not self._running:
                return
            self._connect_hz()
            if not self._running and self._hz_client:
                # stop() ran while we were connecting; don't leak the new client.
                try:
                    self._hz_client.shutdown()
                except Exception:
                    pass
        finally:
            with self._hz_lock:
                # Space retries from the END of the attempt, however long it took.
                self._hz_next_retry = time.monotonic() + self._HZ_RETRY_INTERVAL_S
                self._hz_reconnecting = False

    # ── WebSocket connections ─────────────────────────────────────────────────
    def _update_state(self, symbol: str, bids: list, asks: list, ts: float):
        """Thread-safe state update from a WS thread."""
        with self._state_lock:
            self._state[symbol] = {"bids": bids, "asks": asks, "ts": ts}
            # Counted under the lock: several WS threads write this counter.
            self.stats["ws_msgs"] += 1

    async def _ws_connection(self, symbols: List[str], conn_id: int):
        """Async WS loop for one connection. Runs in a background thread's event loop.

        Subscribes to ``{symbol}@depth5@500ms`` for every symbol in the batch
        and stores each message's book via _update_state. Reconnects for as
        long as the service is running.
        """
        import websockets

        streams = [f"{s.lower()}@depth5@500ms" for s in symbols]
        logger.info("WS conn %d: subscribing to %d streams", conn_id, len(streams))

        while self._running:
            try:
                async with websockets.connect(
                    WS_URL,
                    ping_interval=20,
                    ping_timeout=30,
                    max_size=2**23,  # 8 MB — handle bursts
                ) as ws:
                    await ws.send(json.dumps({
                        "method": "SUBSCRIBE",
                        "params": streams,
                        "id": conn_id,
                    }))
                    logger.info("WS conn %d: subscribed", conn_id)

                    async for raw in ws:
                        if not self._running:
                            break
                        try:
                            msg = json.loads(raw)
                        except json.JSONDecodeError:
                            continue
                        # One odd frame must not tear down the whole connection.
                        if not isinstance(msg, dict):
                            continue
                        # Skip subscription ack / errors
                        if "result" in msg or "error" in msg:
                            continue
                        symbol = msg.get("s")
                        if not symbol:
                            continue
                        bids = msg.get("b", [])
                        asks = msg.get("a", [])
                        # "T" is taken as milliseconds; fall back to local
                        # time when it is missing or not numeric.
                        t_ms = msg.get("T")
                        if isinstance(t_ms, (int, float)):
                            ts = t_ms / 1000.0
                        else:
                            ts = time.time()
                        self._update_state(symbol, bids, asks, ts)
            except Exception as e:
                if self._running:
                    logger.warning("WS conn %d error: %s — reconnecting in 5s", conn_id, e)
                    await asyncio.sleep(5)
            else:
                # Clean close: pause briefly so a server that keeps closing
                # on us does not turn this into a hot reconnect loop.
                if self._running:
                    logger.warning("WS conn %d closed — reconnecting in 1s", conn_id)
                    await asyncio.sleep(1)

    def _run_ws_thread(self, symbols: List[str], conn_id: int):
        """Entrypoint for a WS background thread (owns its own event loop)."""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(self._ws_connection(symbols, conn_id))
        finally:
            loop.close()

    # ── Snapshot loop ─────────────────────────────────────────────────────────
    def _snapshot_loop(self):
        """Every snapshot_interval seconds: compute health for all assets, push to HZ."""
        next_tick = time.monotonic() + self.snapshot_interval
        while self._running:
            now_mono = time.monotonic()
            if now_mono < next_tick:
                # Capped at 1 s so a stop request is noticed promptly.
                time.sleep(min(1.0, next_tick - now_mono))
                continue
            next_tick = now_mono + self.snapshot_interval
            try:
                self._snapshot_once()
            except Exception:
                # An unexpected error must not silently end the snapshot thread.
                logger.exception("Snapshot iteration failed")

    def _snapshot_once(self) -> None:
        """Take one snapshot: compute health, buffer for Parquet, publish to HZ."""
        now_ts = time.time()
        cutoff = now_ts - STALE_THRESHOLD_S

        # Shallow copy is enough: _update_state replaces book dicts wholesale.
        with self._state_lock:
            state_copy = dict(self._state)

        health_map: Dict[str, dict] = {}
        fresh_books: Dict[str, dict] = {}
        n_stale = 0
        for symbol, book in state_copy.items():
            if book["ts"] < cutoff:
                n_stale += 1
                continue
            fresh_books[symbol] = book
            h = _compute_health(book["bids"], book["asks"])
            if h is not None:
                health_map[symbol] = h

        self.stats["snapshots"] += 1

        # Buffer rows for Parquet flush every Nth snapshot (decoupled from HZ rate)
        if self.stats["snapshots"] % PARQUET_BUFFER_EVERY_N == 0:
            with self._buf_lock:
                for symbol, h in health_map.items():
                    row = {"local_ts": now_ts, "asset": symbol, **h}
                    self._buf.setdefault(symbol, deque()).append(row)

        payload = {
            "_snapshot_utc": datetime.now(timezone.utc).isoformat(),
            "_n_assets": len(health_map),
            "_n_stale": n_stale,
            "_n_total": len(state_copy),
            **health_map,
        }

        if self.dry_run:
            self.stats["hz_pushes"] += 1
        else:
            imap = self._imap
            if imap is None:
                # Covers a failed initial connect and failed reconnects alike.
                self._request_hz_reconnect()
            else:
                try:
                    self._publish_hz(imap, payload, health_map, fresh_books, now_ts)
                except Exception as e:
                    self.stats["hz_failures"] += 1
                    logger.warning("HZ push failed: %s", e)
                    # Drop the broken proxy (unless a reconnect already replaced
                    # it) so we stop hammering it, then reconnect in background.
                    if self._imap is imap:
                        self._imap = None
                    self._request_hz_reconnect()

        if self.stats["snapshots"] % 120 == 1:
            logger.info(
                "Snapshot #%d: health=%d/%d stale=%d hz_ok=%d",
                self.stats["snapshots"], len(health_map),
                len(state_copy), n_stale, self.stats["hz_pushes"],
            )

    def _publish_hz(self, imap, payload: dict, health_map: Dict[str, dict],
                    fresh_books: Dict[str, dict], now_ts: float) -> None:
        """Write the universe payload and per-asset OB entries to Hazelcast.

        Raises whatever the Hazelcast calls raise; the caller counts the failure.
        """
        imap.put(HZ_KEY, json.dumps(payload))
        self.stats["hz_pushes"] += 1

        self._mirror_to_ch_writer(health_map, now_ts)

        per_asset_kv = self._build_ob_cache_entries(fresh_books)
        if per_asset_kv:
            imap.put_all(per_asset_kv)

    def _mirror_to_ch_writer(self, health_map: Dict[str, dict], now_ts: float) -> None:
        """Best-effort mirror of the health rows through ch_writer.ch_put.

        Failures (including ch_writer being absent) never affect the snapshot;
        they are only logged at debug level.
        """
        try:
            from ch_writer import ch_put
            snap_ts_ms = int(now_ts * 1_000)
            for sym, h in health_map.items():
                ch_put("obf_universe", {
                    "ts": snap_ts_ms,
                    "symbol": sym,
                    "spread_bps": float(h.get("spread_bps", 0)),
                    "depth_1pct_usd": float(h.get("depth_1pct_usd", 0)),
                    "depth_quality": float(h.get("depth_quality", 0)),
                    "fill_probability": float(h.get("fill_probability", 0)),
                    "imbalance": float(h.get("imbalance", 0)),
                    "best_bid": float(h.get("best_bid", 0)),
                    "best_ask": float(h.get("best_ask", 0)),
                    "n_bid_levels": int(h.get("n_bid_levels", 0)),
                    "n_ask_levels": int(h.get("n_ask_levels", 0)),
                })
        except Exception as e:
            logger.debug("ch_writer mirror failed: %s", e)

    @staticmethod
    def _book_levels(side: list, n: int = 5) -> Tuple[List[float], List[float]]:
        """Return (notional[n], depth[n]) for one book side, zero-padded to n levels."""
        out_notional: List[float] = []
        out_depth: List[float] = []
        for i in range(n):
            if i < len(side):
                p, q = float(side[i][0]), float(side[i][1])
                out_notional.append(round(p * q, 2))
                out_depth.append(round(q, 6))
            else:
                out_notional.append(0.0)
                out_depth.append(0.0)
        return out_notional, out_depth

    def _build_ob_cache_entries(self, fresh_books: Dict[str, dict]) -> Dict[str, str]:
        """Build per-asset OB snapshots for the HZOBProvider cache.

        Format: asset_{symbol}_ob → {bid_notional[5], ask_notional[5],
        bid_depth[5], ask_depth[5], timestamp}. A book with malformed levels
        is skipped rather than failing the whole batch.
        """
        entries: Dict[str, str] = {}
        for symbol, book in fresh_books.items():
            try:
                bn, bd = self._book_levels(book["bids"])
                an, ad = self._book_levels(book["asks"])
            except (ValueError, TypeError, IndexError):
                continue
            entries[f"asset_{symbol}_ob"] = json.dumps({
                "timestamp": book["ts"],
                "bid_notional": bn,
                "ask_notional": an,
                "bid_depth": bd,
                "ask_depth": ad,
            })
        return entries

    # ── Parquet flush ─────────────────────────────────────────────────────────
    def _flush_loop(self):
        """Every FLUSH_INTERVAL_S (first time after FIRST_FLUSH_S): write buffered rows to Parquet."""
        time.sleep(FIRST_FLUSH_S)
        while self._running:
            try:
                self._flush()
            except Exception as e:
                logger.error("Parquet flush error: %s", e)
            time.sleep(FLUSH_INTERVAL_S)

    def _flush(self):
        """Drain the row buffer and write one Parquet file (plus .md5) per asset.

        Files go to PARQUET_BASE_DIR/exchange=binance/symbol=…/date=…/. Each
        file is written under a .tmp name and renamed into place. In dry-run
        mode rows are only counted. A failed write is logged and its rows are
        dropped; other assets are unaffected.
        """
        # Hold the lock only long enough to take ownership of the rows.
        with self._buf_lock:
            snapshot = {s: list(q) for s, q in self._buf.items() if q}
            for q in self._buf.values():
                q.clear()
        if not snapshot:
            return

        now = datetime.now(timezone.utc)
        date_str = now.strftime("%Y-%m-%d")
        ts_str = now.strftime("%H-%M-%S-%f")
        defaults = self._ROW_DEFAULTS

        total = 0
        for symbol, rows in snapshot.items():
            if not rows:
                continue
            if self.dry_run:
                total += len(rows)
                continue
            try:
                out_dir = (
                    PARQUET_BASE_DIR
                    / "exchange=binance"
                    / f"symbol={symbol}"
                    / f"date={date_str}"
                )
                out_dir.mkdir(parents=True, exist_ok=True)
                filename = f"part-{ts_str}.parquet"
                out_path = out_dir / filename
                tmp_path = out_path.with_suffix(".tmp")

                arrays = {
                    f.name: pa.array(
                        [row.get(f.name, defaults[f.name]) for row in rows],
                        type=f.type,
                    )
                    for f in OB_UNIVERSE_SCHEMA
                }
                table = pa.table(arrays, schema=OB_UNIVERSE_SCHEMA)
                pq.write_table(table, tmp_path, compression="snappy")
                tmp_path.rename(out_path)

                # Integrity checksum stored next to the data file.
                digest = hashlib.md5(out_path.read_bytes()).hexdigest()
                (out_dir / f"{filename}.md5").write_text(digest)

                self.stats["parquet_files"] += 1
                total += len(rows)
            except Exception as e:
                logger.error("Parquet write failed for %s: %s", symbol, e)

        self.stats["parquet_rows"] += total
        if total:
            logger.info("Parquet flush: %d rows, %d assets", total, len(snapshot))

    # ── Lifecycle ─────────────────────────────────────────────────────────────
    def start(self, dry_run: bool = False):
        """Fetch the universe, connect to HZ, and launch all worker threads.

        Args:
            dry_run: When True, forces dry-run mode on (it never turns it off).
        """
        if dry_run:
            self.dry_run = True
        self._running = True

        self._symbols = self._fetch_universe()
        if not self.dry_run:
            # A failure here is not fatal: the snapshot loop keeps retrying.
            self._connect_hz()

        # Initialise state dict for all symbols (ts=0.0 reads as stale until
        # the first WS message arrives).
        for s in self._symbols:
            self._state[s] = {"bids": [], "asks": [], "ts": 0.0}

        # Partition symbols across WS connections
        batches: List[List[str]] = [
            self._symbols[i:i + MAX_STREAMS_PER_CONN]
            for i in range(0, len(self._symbols), MAX_STREAMS_PER_CONN)
        ]

        self._ws_threads = []
        for conn_id, batch in enumerate(batches):
            t = threading.Thread(
                target=self._run_ws_thread,
                args=(batch, conn_id),
                daemon=True,
                name=f"obf-universe-ws-{conn_id}",
            )
            t.start()
            self._ws_threads.append(t)
        logger.info(
            "Started %d WS connections for %d symbols", len(batches), len(self._symbols)
        )

        self._snapshot_thread = threading.Thread(
            target=self._snapshot_loop, daemon=True, name="obf-universe-snapshot"
        )
        self._snapshot_thread.start()

        self._flush_thread = threading.Thread(
            target=self._flush_loop, daemon=True, name="obf-universe-flush"
        )
        self._flush_thread.start()

    def stop(self):
        """Signal all loops to stop, flush buffered rows, and close Hazelcast."""
        self._running = False

        # Rows buffered since the last periodic flush would otherwise be lost.
        try:
            self._flush()
        except Exception as e:
            logger.error("Final Parquet flush error: %s", e)

        logger.info(
            "Stopping — snapshots=%d hz=%d parquet_rows=%d ws_msgs=%d",
            self.stats["snapshots"], self.stats["hz_pushes"],
            self.stats["parquet_rows"], self.stats["ws_msgs"],
        )
        if self._hz_client:
            try:
                self._hz_client.shutdown()
            except Exception:
                pass

    def run_forever(self):
        """Block main thread until KeyboardInterrupt / SIGTERM."""
        import signal

        def _sig(signum, frame):
            # Route SIGTERM through the same shutdown path as Ctrl-C.
            raise KeyboardInterrupt

        signal.signal(signal.SIGTERM, _sig)
        try:
            while self._running:
                time.sleep(5)
        except KeyboardInterrupt:
            pass
        finally:
            self.stop()
# ── Entry point ────────────────────────────────────────────────────────────────
def main():
    """Parse CLI flags, start the service, and block until interrupted."""
    parser = argparse.ArgumentParser(description="DOLPHIN OBF-Universe service")
    parser.add_argument(
        "--snapshot-interval", type=float, default=SNAPSHOT_INTERVAL_S,
        # %(default)s keeps the help text in sync with the actual default.
        help="Health snapshot interval in seconds (default: %(default)s)",
    )
    parser.add_argument("--hz-host", default="localhost:5701")
    parser.add_argument("--hz-cluster", default="dolphin")
    parser.add_argument("--dry-run", action="store_true",
                        help="Skip HZ writes and Parquet writes (testing)")
    args = parser.parse_args()

    # A zero or negative interval would make the snapshot loop spin without sleeping.
    if args.snapshot_interval <= 0:
        parser.error("--snapshot-interval must be greater than 0")

    svc = OBUniverseService(
        hz_host=args.hz_host,
        hz_cluster=args.hz_cluster,
        snapshot_interval_s=args.snapshot_interval,
    )
    svc.start(dry_run=args.dry_run)
    svc.run_forever()


if __name__ == "__main__":
    main()