Files
DOLPHIN/prod/obf_prefect_flow.py

590 lines
25 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
DOLPHIN OBF Order Book Feature Subsystem under Prefect
=========================================================
Architecture mirrors ExF/EsoF flows exactly:
- Async OBStreamService runs in a background thread (its own event loop).
- LiveOBFeatureEngine computes all 4 sub-systems incrementally.
- Per-asset raw OB pushed to HZ as asset_{ASSET}_ob.
- Consolidated features pushed to HZ as obf_latest.
- Local JSON cache written atomically for DOLPHIN scanner to read.
- Parquet persistence via OBFPersistenceService (5-min flush).
Rate limits respected:
- Binance Futures WS @depth@100ms: push stream, no REST rate limit concern.
- Binance REST depth limit=1000: weight=20, used ONLY on init/reconnect.
Timing:
- WS stream updates internal book at 100 ms granularity.
- Feature extraction + HZ push: every HZ_PUSH_INTERVAL_S (0.5 s).
- Parquet flush: every 300 s (background thread).
Fixes applied:
P0-1 HZ circuit breaker opens after N failures, resets after cooldown
P0-2 WS stall watchdog warns if OBStreamService.is_stale() > 30 s
P0-4 Per-asset dark-streak counter logs immediately after 5 consecutive None
P1-1 Per-asset HZ pushes are fire-and-forget (no .result() block)
P1-6 push_errors is Dict[key, int] per-key breakdown in status log
P1-7 _write_local_cache logs failures (not silent pass)
P1-8 HZ connectivity probe before entering hot loop
P2-5 AsyncOBThread exposes stop() for clean shutdown + is_stale() passthrough
P3-3 Dead-man's switch — CRITICAL log if all assets dark > 60 s
Launch:
cd /mnt/dolphinng5_predict/prod
PREFECT_API_URL=http://localhost:4200/api \\
nohup python3 obf_prefect_flow.py > /tmp/obf_prefect.log 2>&1 &
"""
import asyncio
import collections
import json
import os
import sys
import threading
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional
from prefect import flow, task, get_run_logger
from prefect.cache_policies import NO_CACHE
_HERE = Path(__file__).parent
sys.path.insert(0, str(_HERE))
sys.path.insert(0, str(_HERE.parent))
from _hz_push import make_hz_client, hz_push
# ===========================================================================
# CONSTANTS
# ===========================================================================
HZ_KEY_CONSOLIDATED = "obf_latest"  # authoritative consolidated-features HZ key
HZ_KEY_PER_ASSET = "asset_{asset}_ob"  # template; formatted with the symbol as-is (e.g. asset_BTCUSDT_ob)
HZ_MAP = "DOLPHIN_FEATURES"  # Hazelcast map name shared with the other DOLPHIN subsystems
HZ_PUSH_INTERVAL_S = 0.5  # hot-loop cadence: 2 Hz feature extraction + push
WARMUP_S = 8  # seconds to let WS + REST book sync settle before the hot loop
LOG_STATUS_EVERY = 120 # every 60 s at 2 Hz
ASSETS = ["BTCUSDT", "ETHUSDT", "SOLUSDT"]  # default symbols; flow accepts an override
OB_CACHE_DIR = _HERE.parent / "ob_cache"  # local JSON cache dir read by the DOLPHIN scanner
OB_CACHE_FILE = OB_CACHE_DIR / "latest_ob_features.json"
MAX_DEPTH_PCT = 5  # depth-bucket range (± pct from mid) requested from OBStreamService
# P0-1: circuit breaker thresholds
_HZ_CIRCUIT_OPEN_AFTER = 5 # consecutive failures before opening
_HZ_CIRCUIT_RESET_AFTER = 30 # cycles before trying again (~15 s)
# P0-4 / P3-3: dark-streak thresholds
_DARK_WARN_AFTER = 5 # log warning after 5 dark cycles (2.5 s)
_DARK_CRITICAL_AFTER = 120 # log CRITICAL if ALL assets dark 60 s (P3-3)
# P1-7: local cache failure logging
_CACHE_LOG_EVERY = 60 # log once per 30 s at 2 Hz
# ===========================================================================
# ASYNC OB STREAM THREAD
# ===========================================================================
class AsyncOBThread(threading.Thread):
    """
    Runs OBStreamService in a background asyncio event loop.

    Exposes a sync API safe to call from the Prefect flow's synchronous context.

    Fixes applied:
        P0-2  is_stale() passthrough to OBStreamService.is_stale()
        P2-5  stop() for clean shutdown; thread parks on a future that stop() resolves
    """

    def __init__(self, assets: List[str], max_depth_pct: int = MAX_DEPTH_PCT):
        super().__init__(daemon=True, name="ob-stream-thread")
        # Imported lazily so this module stays importable even when the
        # external_factors package is not on sys.path yet.
        from external_factors.ob_stream_service import OBStreamService
        self.service = OBStreamService(assets=assets, max_depth_pct=max_depth_pct)
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self._ready = threading.Event()
        # The park future must be created inside the thread's own loop
        # (in _run_forever).  Pre-declare it here so stop() can be called
        # at any time without hitting an AttributeError race when the
        # thread has not reached _run_forever yet.
        self._stop_future: Optional[asyncio.Future] = None

    def run(self):
        # Dedicated event loop for this thread; the flow's thread stays sync.
        self._loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self._loop)
        try:
            self._loop.run_until_complete(self._run_forever())
        finally:
            # Release loop resources once the park future is resolved.
            self._loop.close()

    async def _run_forever(self):
        # Launch the WS stream as a background task, then park forever.
        asyncio.create_task(self.service.stream())
        self._ready.set()
        # P2-5: park on a future that stop() resolves from the flow thread.
        self._stop_future = self._loop.create_future()
        try:
            await self._stop_future
        except asyncio.CancelledError:
            pass

    def wait_ready(self, timeout: float = 10.0) -> bool:
        """Block until the event loop is up (or timeout); True if ready."""
        return self._ready.wait(timeout=timeout)

    def stop(self, timeout: float = 5.0):
        """P2-5: request clean shutdown and join the thread."""
        if self._loop and self._loop.is_running():
            def _release():
                # Runs inside the loop thread; guard against the future not
                # existing yet or having already been resolved.
                fut = self._stop_future
                if fut is not None and not fut.done():
                    fut.set_result(None)
            self._loop.call_soon_threadsafe(_release)
        self.join(timeout=timeout)

    def is_stale(self, threshold_s: float = 30.0) -> bool:
        """P0-2: True if OBStreamService has received no WS events for threshold_s."""
        return self.service.is_stale(threshold_s=threshold_s)

    def get_depth_buckets_sync(self, asset: str) -> Optional[dict]:
        """Fetch the current depth buckets for *asset* from the loop thread.

        Returns None if the loop is not running or the fetch fails/times out
        within HZ_PUSH_INTERVAL_S — callers treat None as "book dark".
        """
        if self._loop is None or not self._loop.is_running():
            return None
        try:
            future = asyncio.run_coroutine_threadsafe(
                self.service.get_depth_buckets(asset), self._loop
            )
            return future.result(timeout=HZ_PUSH_INTERVAL_S)
        except Exception:
            # Best-effort by design: a missed snapshot is reported as dark,
            # not raised into the hot loop.
            return None

    @property
    def initialized(self) -> Dict[str, bool]:
        """Snapshot copy of the per-asset book-initialization flags."""
        return dict(self.service.initialized)
# ===========================================================================
# HZ PUSH TASK
# ===========================================================================
@task(name="hz_push_obf", retries=3, retry_delay_seconds=2, cache_policy=NO_CACHE)
def hz_push_obf_task(client, key: str, payload: dict) -> bool:
    """
    Push *payload* (stamped with push metadata) to the HZ feature map at *key*.

    Failures PROPAGATE instead of being swallowed into a False return:
    Prefect only honors retries=3 when the task raises, and the flow's P0-1
    circuit breaker counts failures via the exception surfaced by
    ``.submit(...).result()``.  The previous catch-and-return-False made both
    mechanisms dead code — ``.result()`` returned False without raising, so
    the breaker could never open.  Callers that need best-effort semantics
    already wrap the call in try/except.

    Returns True on success.
    """
    data = dict(payload)  # shallow copy: never mutate the caller's dict
    data["_pushed_at"] = datetime.now(timezone.utc).isoformat()
    data["_push_seq"] = int(time.time() * 1000)  # ms-epoch sequence stamp
    client.get_map(HZ_MAP).blocking().put(key, json.dumps(data))
    return True
# ===========================================================================
# LOCAL CACHE WRITE (P1-7: logs failures instead of silent pass)
# ===========================================================================
_cache_write_failures = 0  # process-lifetime failure counter (throttles logging)
def _write_local_cache(payload: dict) -> bool:
    """
    Atomically write *payload* as JSON to OB_CACHE_FILE.

    Writes to a per-pid temp file in the same directory, then swaps it into
    place in a single step so the DOLPHIN scanner can never observe a partial
    file.  Returns True on success.  Failures are logged at a throttled rate
    instead of passing silently (P1-7).
    """
    global _cache_write_failures
    try:
        OB_CACHE_DIR.mkdir(parents=True, exist_ok=True)
        tmp = OB_CACHE_DIR / f".ob_cache_{os.getpid()}.tmp"
        tmp.write_text(json.dumps(payload, default=str), encoding="utf-8")
        # Path.replace (os.replace) atomically overwrites an existing target
        # on all platforms; Path.rename raises FileExistsError on Windows.
        tmp.replace(OB_CACHE_FILE)
        return True
    except Exception as exc:
        _cache_write_failures += 1
        # Throttle: log only the 1st, (1+N)th, (1+2N)th, ... failure.
        if _cache_write_failures % _CACHE_LOG_EVERY == 1:
            import logging
            logging.getLogger(__name__).warning(
                "OBF local cache write failed (%d times): %s", _cache_write_failures, exc
            )
        return False
# ===========================================================================
# HZ PRE-FLIGHT CHECK (P1-8)
# ===========================================================================
def _hz_preflight(client, log, retries: int = 5, delay_s: float = 3.0) -> bool:
    """P1-8: probe Hazelcast connectivity before entering the hot loop.

    Makes up to *retries* heartbeat puts into the feature map, sleeping
    *delay_s* seconds between failed attempts.  Returns True as soon as one
    put succeeds; False once every attempt has failed.
    """
    attempt = 0
    while attempt < retries:
        attempt += 1
        heartbeat = json.dumps({"ts": time.time(), "source": "obf_preflight"})
        try:
            target_map = client.get_map(HZ_MAP).blocking()
            target_map.put("_obf_heartbeat", heartbeat)
        except Exception as e:
            log.warning("HZ preflight failed (attempt %d/%d): %s", attempt, retries, e)
            # No sleep after the final attempt — fail fast back to the flow.
            if attempt < retries:
                time.sleep(delay_s)
            continue
        log.info("HZ connectivity verified (attempt %d)", attempt)
        return True
    return False
# ===========================================================================
# PREFECT FLOW
# ===========================================================================
@flow(name="obf-prefect-flow", log_prints=True)
def obf_prefect_flow(
    warmup_s: float = float(WARMUP_S),
    poll_interval_s: float = float(HZ_PUSH_INTERVAL_S),
    assets: Optional[list] = None,
):
    """
    Order Book Feature subsystem daemon.

    Runs indefinitely under Prefect:
        1. Start AsyncOBThread (WS stream + REST sync).
        2. Warm up for warmup_s seconds.
        3. Start OBFPersistenceService.
        4. HZ connectivity pre-flight check.
        5. Loop at poll_interval_s:
            a. Extract depth buckets per asset.
            b. Compute 4-subsystem OB features.
            c. Push per-asset raw OB to HZ (fire-and-forget).
            d. Push consolidated features to HZ (blocking authoritative key).
            e. Write local JSON cache.
            f. Feed persistence buffer.
            g. Stall watchdog + dark-streak + dead-man's switch checks.

    Args:
        warmup_s: seconds to sleep after starting the WS thread so book
            init (REST snapshot + WS diffs) can settle.
        poll_interval_s: hot-loop period in seconds (0.5 s -> 2 Hz).
        assets: symbols to track; defaults to module-level ASSETS.
    """
    log = get_run_logger()
    _assets = list(assets or ASSETS)  # copy: never mutate the caller's list
    log.info("=== OBF PREFECT FLOW STARTING ===")
    log.info("Assets: %s | push_interval=%.1fs | warmup=%ss", _assets, poll_interval_s, warmup_s)
    # -----------------------------------------------------------------------
    # 1. Start WebSocket stream thread
    # -----------------------------------------------------------------------
    ob_thread = AsyncOBThread(assets=_assets, max_depth_pct=MAX_DEPTH_PCT)
    ob_thread.start()
    if not ob_thread.wait_ready(timeout=10.0):
        log.error("AsyncOBThread event loop did not start in time — aborting")
        return
    log.info("OB WebSocket stream thread started")
    # -----------------------------------------------------------------------
    # 2. Warm up
    # -----------------------------------------------------------------------
    log.info("Warming up for %.0f s (WS + REST book sync)...", warmup_s)
    time.sleep(warmup_s)
    init_status = ob_thread.initialized
    log.info("Book init status after warmup: %s", init_status)
    n_ready = sum(v for v in init_status.values())  # bools sum to count of True
    if n_ready == 0:
        # Not fatal: the hot loop's dark-streak logic keeps retrying.
        log.warning("No assets initialized after warmup — will retry in hot loop")
    # -----------------------------------------------------------------------
    # 3. Start persistence service
    # -----------------------------------------------------------------------
    # Imported here (not at module top) so the WS thread is already running
    # before the heavier persistence/feature machinery loads.
    from obf_persistence import OBFPersistenceService, LiveOBFeatureEngine
    persist = OBFPersistenceService(assets=_assets, flush_interval_s=300)
    persist.start()
    log.info("OBFPersistenceService started")
    # -----------------------------------------------------------------------
    # 4. Initialize LiveOBFeatureEngine
    # -----------------------------------------------------------------------
    feature_engine = LiveOBFeatureEngine(assets=_assets)
    log.info("LiveOBFeatureEngine initialized")
    # -----------------------------------------------------------------------
    # 5. Connect to Hazelcast + pre-flight check (P1-8)
    # -----------------------------------------------------------------------
    log.info("Connecting to Hazelcast...")
    client = make_hz_client()
    if not _hz_preflight(client, log):
        log.error("HZ preflight failed after retries — aborting flow")
        # Tear down what was already started before bailing out.
        persist.stop()
        ob_thread.stop()
        return
    log.info("Hazelcast ready")
    # -----------------------------------------------------------------------
    # 6. Hot loop state
    # -----------------------------------------------------------------------
    pushes = 0
    push_errors: Dict[str, int] = collections.defaultdict(int) # P1-6
    # P0-1: circuit breaker state
    _hz_consec_failures = 0
    _hz_circuit_open = False
    _hz_cooldown = 0
    # P0-4: per-asset dark streak
    _none_streak: Dict[str, int] = {a: 0 for a in _assets}
    # P3-3: all-assets-dark counter
    _all_dark_cycles = 0
    # P0-2: stale watchdog — check every LOG_STATUS_EVERY cycles
    _stale_logged = False
    last_lag_s = 0.0
    _push_seq = 0
    log.info("=== OBF HOT LOOP STARTED ===")
    try:
        while True:
            t0_mono = time.monotonic()  # monotonic: immune to wall-clock jumps
            t0_wall = time.time()
            # ------------------------------------------------------------------
            # A. Extract current OB snapshots
            # ------------------------------------------------------------------
            # None entries mean the asset's book is "dark" this cycle.
            raw_snaps: Dict[str, Optional[dict]] = {}
            for asset in _assets:
                raw_snaps[asset] = ob_thread.get_depth_buckets_sync(asset)
            local_ts = time.time()
            # ------------------------------------------------------------------
            # P0-4: per-asset dark streak detection
            # ------------------------------------------------------------------
            n_init = 0
            for asset in _assets:
                if raw_snaps[asset] is None:
                    _none_streak[asset] += 1
                    # == (not >=) so the warning fires exactly once per streak.
                    if _none_streak[asset] == _DARK_WARN_AFTER:
                        log.warning(
                            "OBF: %s book dark for %d consecutive cycles (%.1f s)",
                            asset, _none_streak[asset],
                            _none_streak[asset] * poll_interval_s,
                        )
                else:
                    if _none_streak[asset] >= _DARK_WARN_AFTER:
                        log.info(
                            "OBF: %s book restored after %d dark cycles",
                            asset, _none_streak[asset],
                        )
                    _none_streak[asset] = 0
                    n_init += 1
            # ------------------------------------------------------------------
            # P3-3: dead-man's switch — all assets dark
            # ------------------------------------------------------------------
            if n_init == 0:
                _all_dark_cycles += 1
                # == again: one CRITICAL per outage, not one per cycle.
                if _all_dark_cycles == _DARK_CRITICAL_AFTER:
                    log.critical(
                        "OBF DEAD-MAN: ALL %d assets dark for %.0f s — "
                        "alpha engine is receiving neutral OB features",
                        len(_assets), _all_dark_cycles * poll_interval_s,
                    )
            else:
                _all_dark_cycles = 0
            # ------------------------------------------------------------------
            # B. Compute 4-subsystem OB features
            # ------------------------------------------------------------------
            # Engine receives None for dark assets and is expected to handle it
            # (per-asset features absent from its result in that case).
            feature_snaps = {a: raw_snaps[a] for a in _assets}
            feature_result = feature_engine.update(feature_snaps)
            per_asset_feat = feature_result["per_asset"]
            market_feat = feature_result["market"]
            macro_feat = feature_result["macro"]
            compute_ts = time.time()
            _push_seq += 1
            # ------------------------------------------------------------------
            # C. Build consolidated HZ payload
            # ------------------------------------------------------------------
            consolidated: dict = {
                "timestamp": datetime.fromtimestamp(local_ts, tz=timezone.utc).isoformat(),
                "local_ts": local_ts,
                "compute_ts": compute_ts,
                "assets": _assets,
                "_push_seq": _push_seq,
                **{f"market_{k}": v for k, v in market_feat.items()},
                **{f"macro_{k}": v for k, v in macro_feat.items()},
            }
            for asset in _assets:
                paf = per_asset_feat.get(asset)
                pfx = f"{asset.lower()}_"
                if paf:
                    # Flatten per-asset features under a lower-cased prefix.
                    for k, v in paf.items():
                        consolidated[f"{pfx}{k}"] = v
                else:
                    consolidated[f"{pfx}initialized"] = False
            consolidated["_n_assets_live"] = n_init
            consolidated["_n_assets_total"] = len(_assets)
            consolidated["_all_live"] = n_init == len(_assets)
            # ------------------------------------------------------------------
            # D. HZ push (P0-1: circuit breaker; P1-1: per-asset fire-and-forget)
            # ------------------------------------------------------------------
            if _hz_circuit_open:
                # Circuit open — skip HZ pushes, still persist locally
                _hz_cooldown -= 1
                if _hz_cooldown <= 0:
                    log.info("OBF circuit breaker resetting — retrying HZ")
                    _hz_circuit_open = False
                    _hz_consec_failures = 0
            else:
                # P1-1: per-asset pushes are fire-and-forget (no .result() block)
                for asset in _assets:
                    snap = raw_snaps.get(asset)
                    if snap is None:
                        continue
                    # Explicit key list (not a blind copy) keeps the HZ payload
                    # schema stable even if the snapshot dict grows fields.
                    asset_payload = {
                        "timestamp": snap["timestamp"],
                        "asset": snap["asset"],
                        "bid_notional": list(snap["bid_notional"]),
                        "ask_notional": list(snap["ask_notional"]),
                        "bid_depth": list(snap["bid_depth"]),
                        "ask_depth": list(snap["ask_depth"]),
                        "best_bid": snap["best_bid"],
                        "best_ask": snap["best_ask"],
                        "spread_bps": snap["spread_bps"],
                    }
                    key = HZ_KEY_PER_ASSET.format(asset=asset)
                    try:
                        hz_push_obf_task.submit(client, key, asset_payload)
                        # fire-and-forget: no .result() — don't block hot loop
                    except Exception:
                        push_errors[key] += 1
                # Consolidated push blocks (authoritative key — consumers depend on it)
                # NOTE(review): consol_ok is assigned but never read afterwards —
                # candidate for removal.
                consol_ok = False
                try:
                    hz_push_obf_task.submit(
                        client, HZ_KEY_CONSOLIDATED, consolidated
                    ).result(timeout=1.5)
                    consol_ok = True
                    _hz_consec_failures = 0
                    pushes += 1
                except Exception:
                    push_errors[HZ_KEY_CONSOLIDATED] += 1
                    _hz_consec_failures += 1
                    if _hz_consec_failures >= _HZ_CIRCUIT_OPEN_AFTER:
                        log.error(
                            "OBF HZ CIRCUIT OPEN after %d consecutive failures — "
                            "skipping HZ pushes for %d cycles",
                            _hz_consec_failures, _HZ_CIRCUIT_RESET_AFTER,
                        )
                        _hz_circuit_open = True
                        _hz_cooldown = _HZ_CIRCUIT_RESET_AFTER
            # ------------------------------------------------------------------
            # E. Write local JSON cache (P1-7: logs failures)
            # ------------------------------------------------------------------
            # Written even when the HZ circuit is open: local consumers keep data.
            _write_local_cache(consolidated)
            # ------------------------------------------------------------------
            # F. Feed persistence buffers
            # ------------------------------------------------------------------
            pushed_at_ts = time.time()
            last_lag_s = pushed_at_ts - local_ts  # snapshot-to-push latency
            for asset in _assets:
                snap = raw_snaps.get(asset)
                paf = per_asset_feat.get(asset)
                if snap is None or paf is None:
                    continue
                # Flat row for parquet: first 5 notional buckets per side
                # plus per-asset, market and macro features.
                bid_n = snap["bid_notional"]
                ask_n = snap["ask_notional"]
                row = {
                    "exchange_ts": float(snap["timestamp"]),
                    "local_ts": local_ts,
                    "pushed_at": pushed_at_ts,
                    "lag_s": float(last_lag_s),
                    "asset": asset,
                    "best_bid": float(snap["best_bid"]),
                    "best_ask": float(snap["best_ask"]),
                    "spread_bps": float(snap["spread_bps"]),
                    "bid_notional_0": float(bid_n[0]),
                    "bid_notional_1": float(bid_n[1]),
                    "bid_notional_2": float(bid_n[2]),
                    "bid_notional_3": float(bid_n[3]),
                    "bid_notional_4": float(bid_n[4]),
                    "ask_notional_0": float(ask_n[0]),
                    "ask_notional_1": float(ask_n[1]),
                    "ask_notional_2": float(ask_n[2]),
                    "ask_notional_3": float(ask_n[3]),
                    "ask_notional_4": float(ask_n[4]),
                    "depth_1pct_usd": paf["depth_1pct_usd"],
                    "depth_quality": paf["depth_quality"],
                    "fill_probability": paf["fill_probability"],
                    "spread_proxy_bps": paf["spread_proxy_bps"],
                    "imbalance": paf["imbalance"],
                    "imbalance_ma5": paf["imbalance_ma5"],
                    "imbalance_persistence": paf["imbalance_persistence"],
                    "depth_asymmetry": paf["depth_asymmetry"],
                    "withdrawal_velocity": paf["withdrawal_velocity"],
                    "median_imbalance": market_feat["median_imbalance"],
                    "agreement_pct": market_feat["agreement_pct"],
                    "depth_pressure": market_feat["depth_pressure"],
                    "depth_velocity": macro_feat["depth_velocity"],
                    "cascade_count": macro_feat["cascade_count"],
                    "acceleration": macro_feat["acceleration"],
                    "regime_signal": macro_feat["regime_signal"],
                }
                persist.update_snapshot(asset, row)
            # ------------------------------------------------------------------
            # G. Periodic status log + watchdogs
            # ------------------------------------------------------------------
            if pushes % LOG_STATUS_EVERY == 0 and pushes > 0:
                stats = persist.get_stats()
                # P1-6: report top error keys
                top_errs = sorted(push_errors.items(), key=lambda x: -x[1])[:3]
                log.info(
                    "OBF status | pushes=%d lag_s=%.3f assets_live=%d/%d "
                    "files=%d hz_circuit=%s top_errors=%s",
                    pushes, last_lag_s, n_init, len(_assets),
                    stats.get("files_written", 0),
                    "OPEN" if _hz_circuit_open else "closed",
                    top_errs,
                )
                if last_lag_s > poll_interval_s * 2:
                    log.warning(
                        "OBF LAG DRIFT: lag_s=%.3f > 2×poll=%.3f",
                        last_lag_s, poll_interval_s * 2,
                    )
                # P0-2: WS stall watchdog
                if ob_thread.is_stale(threshold_s=30.0):
                    # _stale_logged latches so the stall is logged once per episode.
                    if not _stale_logged:
                        log.error(
                            "OBF WS STALL: no events received for > 30 s — "
                            "book data may be frozen"
                        )
                        _stale_logged = True
                else:
                    _stale_logged = False
            # ------------------------------------------------------------------
            # H. Maintain push interval (monotonic clock)
            # ------------------------------------------------------------------
            elapsed = time.monotonic() - t0_mono
            sleep_time = max(0.0, poll_interval_s - elapsed)
            if sleep_time > 0:
                time.sleep(sleep_time)
    except KeyboardInterrupt:
        log.info("OBF flow interrupted by KeyboardInterrupt")
    finally:
        # Shutdown order: stop flushing first, then tear down the WS thread.
        persist.stop()
        ob_thread.stop() # P2-5: clean shutdown
        total_errors = sum(push_errors.values())
        log.info(
            "OBF flow done — pushes=%d total_errors=%d error_breakdown=%s",
            pushes, total_errors, dict(push_errors),
        )
# ===========================================================================
# ENTRY POINT
# ===========================================================================
if __name__ == "__main__":
    # Run the flow in-process; blocks until interrupted (see module docstring
    # for the nohup launch command).
    obf_prefect_flow()