"""
OBF Persistence Service
=======================
Persists live order-book feature snapshots to Arrow/Parquet for backtesting.
Storage layout (Hive partitioning, Spark-compatible):
/mnt/ng6_data/ob_features/
exchange=binance/
symbol=BTCUSDT/
date=YYYY-MM-DD/
part-HH-MM-SS-ffffff.parquet microsecond precision (P1-10)
part-HH-MM-SS-ffffff.parquet.md5
Schema: one row per OB snapshot per asset.
Fixes applied:
P0-5 HISTORY_MAXLEN now covers the full flush window (no data loss on crash)
P1-9 First flush fires after FIRST_FLUSH_S (60 s) not flush_interval_s (5 min)
P1-10 Filenames include microseconds no same-second collision
P2-2 _write_parquet uses vectorised column transpose (inspired by realtime_exf_service.py)
P2-8 _imb_hist deque sized to actual IMBALANCE_LOOKBACK, not DEPTH_LOOKBACK
P3-2 _cleanup_old_partitions() removes date dirs older than MAX_FILE_AGE_DAYS
"""
import hashlib
import logging
import shutil
import threading
import time
from collections import deque
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Dict, List, Optional
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
logger = logging.getLogger(__name__)
# ===========================================================================
# CONSTANTS
# ===========================================================================
PARQUET_BASE_DIR = Path("/mnt/ng6_data/ob_features")
EXCHANGE = "binance"
FLUSH_INTERVAL_S = 300 # 5 minutes between regular flushes
FIRST_FLUSH_S = 60 # P1-9: first flush after 60 s (not 5 min)
POLL_INTERVAL_S = 0.5 # must match obf_prefect_flow.poll_interval_s
# P0-5: buffer must cover the full flush window so no rows are evicted before flush
HISTORY_MAXLEN = int(FLUSH_INTERVAL_S / POLL_INTERVAL_S) + 100 # = 700
MAX_FILE_AGE_DAYS = 0 # 0 = never prune (accumulate for backtesting); set >0 to enable rolling window
DEFAULT_ASSETS = ["BTCUSDT", "ETHUSDT", "SOLUSDT"]
# ===========================================================================
# PARQUET SCHEMA — complete OB feature record per snapshot per asset
# ===========================================================================
OB_SCHEMA = pa.schema([
    # Timing
    pa.field("exchange_ts", pa.float64()),
    pa.field("local_ts", pa.float64()),
    pa.field("pushed_at", pa.float64()),
    pa.field("lag_s", pa.float32()),
    # Identity
    pa.field("asset", pa.string()),
    # Top-of-book
    pa.field("best_bid", pa.float64()),
    pa.field("best_ask", pa.float64()),
    pa.field("spread_bps", pa.float32()),
    # Depth vectors (5 levels: 0=1%, 1=2%, ..., 4=5% from mid)
    pa.field("bid_notional_0", pa.float64()),
    pa.field("bid_notional_1", pa.float64()),
    pa.field("bid_notional_2", pa.float64()),
    pa.field("bid_notional_3", pa.float64()),
    pa.field("bid_notional_4", pa.float64()),
    pa.field("ask_notional_0", pa.float64()),
    pa.field("ask_notional_1", pa.float64()),
    pa.field("ask_notional_2", pa.float64()),
    pa.field("ask_notional_3", pa.float64()),
    pa.field("ask_notional_4", pa.float64()),
    # Sub-system 1 — Per-asset placement
    pa.field("depth_1pct_usd", pa.float64()),
    pa.field("depth_quality", pa.float32()),
    pa.field("fill_probability", pa.float32()),
    pa.field("spread_proxy_bps", pa.float32()),
    # Sub-system 2 — Per-asset signal
    pa.field("imbalance", pa.float32()),
    pa.field("imbalance_ma5", pa.float32()),
    pa.field("imbalance_persistence", pa.float32()),
    pa.field("depth_asymmetry", pa.float32()),
    pa.field("withdrawal_velocity", pa.float32()),
    # Sub-system 3 — Cross-asset market signal
    pa.field("median_imbalance", pa.float32()),
    pa.field("agreement_pct", pa.float32()),
    pa.field("depth_pressure", pa.float32()),
    # Sub-system 4 — Macro regime
    pa.field("depth_velocity", pa.float32()),
    pa.field("cascade_count", pa.int32()),
    pa.field("acceleration", pa.float32()),
    pa.field("regime_signal", pa.int32()),
])
# Pre-build sentinel values per field type (used in vectorised write, P2-2)
def _sentinel(field: pa.Field):
    if pa.types.is_string(field.type):
        return ""
    if pa.types.is_integer(field.type):
        return 0
    return float("nan")

_SENTINELS: Dict[str, object] = {f.name: _sentinel(f) for f in OB_SCHEMA}
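# For this schema the sentinels work out to: "" for the string "asset" column,
# 0 for the int32 counters (cascade_count, regime_signal), and NaN for every
# float column.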
# ===========================================================================
# LIVE OB FEATURE ENGINE (incremental, no preload)
# ===========================================================================
class LiveOBFeatureEngine:
    """
    Streaming, incremental version of OBFeatureEngine for production use.

    Fixes applied:
        P2-8  _imb_hist deque maxlen = IMBALANCE_LOOKBACK + 5, not DEPTH_LOOKBACK
    """

    IMBALANCE_LOOKBACK = 10   # snapshots → 5 s window at 0.5 s polling
    DEPTH_LOOKBACK = 60       # snapshots → 30 s velocity window
    CASCADE_THRESHOLD = -0.10
    MEDIAN_CALIB_N = 60       # first N samples to build median depth ref

    def __init__(self, assets: List[str]):
        self.assets = list(assets)
        # P2-8: size each deque to its actual lookback, not the larger DEPTH_LOOKBACK
        self._imb_hist: Dict[str, deque] = {
            a: deque(maxlen=self.IMBALANCE_LOOKBACK + 5) for a in assets
        }
        self._dep_hist: Dict[str, deque] = {
            a: deque(maxlen=self.DEPTH_LOOKBACK) for a in assets
        }
        self._median_ref: Dict[str, float] = {a: 0.0 for a in assets}
        self._calib_buf: Dict[str, list] = {a: [] for a in assets}
        self._calibrated: Dict[str, bool] = {a: False for a in assets}
        self._market_dep: deque = deque(maxlen=self.DEPTH_LOOKBACK)
    def update(self, snaps: dict) -> dict:
        """
        Process one OBSnapshot per asset (may be None if not yet initialized).

        Args:
            snaps: Dict[asset_str → OBSnapshot-like dict | None]

        Returns:
            Feature dict with 'per_asset', 'market', 'macro' sub-keys.
        """
        try:
            from nautilus_dolphin.nautilus_dolphin.nautilus.ob_features import (
                compute_imbalance_nb, compute_depth_1pct_nb, compute_depth_quality_nb,
                compute_fill_probability_nb, compute_spread_proxy_nb,
                compute_depth_asymmetry_nb, compute_imbalance_persistence_nb,
                compute_withdrawal_velocity_nb, compute_market_agreement_nb,
                compute_cascade_signal_nb,
            )
        except (ImportError, RuntimeError):
            # Fallback: load ob_features straight from the source tree under a
            # synthetic package, for runs outside the installed environment.
            import importlib.util
            import sys
            import types

            _root = Path(__file__).parent.parent
            _naut_dir = _root / "nautilus_dolphin" / "nautilus_dolphin" / "nautilus"
            _pkg = "_naut_ob"
            if _pkg not in sys.modules:
                _p = types.ModuleType(_pkg)
                _p.__path__ = [str(_naut_dir)]
                _p.__package__ = _pkg
                sys.modules[_pkg] = _p

            def _lm(name):
                # Load <name>.py from _naut_dir into the synthetic package (once).
                fn = f"{_pkg}.{name}"
                if fn not in sys.modules:
                    sp = importlib.util.spec_from_file_location(fn, _naut_dir / f"{name}.py")
                    m = importlib.util.module_from_spec(sp)
                    m.__package__ = _pkg
                    sys.modules[fn] = m
                    sp.loader.exec_module(m)
                return sys.modules[fn]

            _lm("ob_provider")
            _obf = _lm("ob_features")
            compute_imbalance_nb = _obf.compute_imbalance_nb
            compute_depth_1pct_nb = _obf.compute_depth_1pct_nb
            compute_depth_quality_nb = _obf.compute_depth_quality_nb
            compute_fill_probability_nb = _obf.compute_fill_probability_nb
            compute_spread_proxy_nb = _obf.compute_spread_proxy_nb
            compute_depth_asymmetry_nb = _obf.compute_depth_asymmetry_nb
            compute_imbalance_persistence_nb = _obf.compute_imbalance_persistence_nb
            compute_withdrawal_velocity_nb = _obf.compute_withdrawal_velocity_nb
            compute_market_agreement_nb = _obf.compute_market_agreement_nb
            compute_cascade_signal_nb = _obf.compute_cascade_signal_nb
        per_asset: dict = {}
        asset_imbalances: list = []
        asset_velocities: list = []
        market_total_depth = 0.0

        # Sub-systems 1 & 2: per-asset placement and signal features
        for asset in self.assets:
            snap = snaps.get(asset)
            if snap is None:
                per_asset[asset] = None
                continue
            bid_n = np.asarray(snap.get("bid_notional", [0.0] * 5), dtype=np.float64)
            ask_n = np.asarray(snap.get("ask_notional", [0.0] * 5), dtype=np.float64)
            imb = compute_imbalance_nb(bid_n, ask_n)
            d1pct = compute_depth_1pct_nb(bid_n, ask_n)

            # Build the per-asset median depth reference from the first
            # MEDIAN_CALIB_N samples, then freeze it.
            if not self._calibrated[asset]:
                self._calib_buf[asset].append(d1pct)
                if len(self._calib_buf[asset]) >= self.MEDIAN_CALIB_N:
                    self._median_ref[asset] = float(np.median(self._calib_buf[asset]))
                    self._calibrated[asset] = True
            med_ref = self._median_ref[asset] if self._calibrated[asset] else max(d1pct, 1.0)

            dq = compute_depth_quality_nb(d1pct, med_ref)
            fp = compute_fill_probability_nb(dq)
            sp = compute_spread_proxy_nb(bid_n, ask_n)
            da = compute_depth_asymmetry_nb(bid_n, ask_n)

            self._imb_hist[asset].append(imb)
            self._dep_hist[asset].append(d1pct)
            market_total_depth += d1pct

            imb_arr = np.array(list(self._imb_hist[asset]), dtype=np.float64)
            ma5_n = min(5, len(imb_arr))
            imb_ma5 = float(np.mean(imb_arr[-ma5_n:])) if ma5_n > 0 else float(imb)
            persistence = float(compute_imbalance_persistence_nb(imb_arr, self.IMBALANCE_LOOKBACK))

            dep_arr = np.array(list(self._dep_hist[asset]), dtype=np.float64)
            lb_actual = min(self.DEPTH_LOOKBACK, len(dep_arr) - 1)
            velocity = float(compute_withdrawal_velocity_nb(dep_arr, lb_actual))

            per_asset[asset] = {
                "depth_1pct_usd": float(d1pct),
                "depth_quality": float(dq),
                "fill_probability": float(fp),
                "spread_proxy_bps": float(sp),
                "imbalance": float(imb),
                "imbalance_ma5": float(imb_ma5),
                "imbalance_persistence": float(persistence),
                "depth_asymmetry": float(da),
                "withdrawal_velocity": float(velocity),
            }
            asset_imbalances.append(float(imb))
            asset_velocities.append(float(velocity))

        if market_total_depth > 0:
            self._market_dep.append(market_total_depth)

        # Sub-system 3: cross-asset market signal
        n_valid = len(asset_imbalances)
        if n_valid > 0:
            imb_np = np.array(asset_imbalances, dtype=np.float64)
            med_imb, agreement = compute_market_agreement_nb(imb_np, n_valid)
            market_features = {
                "median_imbalance": float(med_imb),
                "agreement_pct": float(agreement),
                "depth_pressure": float(np.mean(imb_np)),
            }
        else:
            market_features = {"median_imbalance": 0.0, "agreement_pct": 0.5, "depth_pressure": 0.0}

        # Sub-system 4: macro regime
        if n_valid > 0:
            vel_np = np.array(asset_velocities, dtype=np.float64)
            cascade, mean_vel, regime = compute_cascade_signal_nb(
                vel_np, n_valid, self.CASCADE_THRESHOLD
            )
            # Acceleration: change in the market-depth growth rate between the
            # last two 10-snapshot windows.
            dep_hist_list = list(self._market_dep)
            accel = 0.0
            if len(dep_hist_list) >= 20:
                rv = (dep_hist_list[-1] - dep_hist_list[-10]) / max(1e-9, dep_hist_list[-10])
                pv = (dep_hist_list[-10] - dep_hist_list[-20]) / max(1e-9, dep_hist_list[-20])
                accel = rv - pv
            macro_features = {
                "depth_velocity": float(mean_vel),
                "cascade_count": int(cascade),
                "acceleration": float(accel),
                "regime_signal": int(regime),
            }
        else:
            macro_features = {"depth_velocity": 0.0, "cascade_count": 0,
                              "acceleration": 0.0, "regime_signal": 0}

        return {"per_asset": per_asset, "market": market_features, "macro": macro_features}
# ===========================================================================
# PERSISTENCE SERVICE
# ===========================================================================
class OBFPersistenceService:
    """
    Background thread that flushes OB feature snapshots to Parquet.

    Fixes applied:
        P0-5  HISTORY_MAXLEN covers full flush window (no silent row eviction)
        P1-9  First flush after FIRST_FLUSH_S (60 s), not flush_interval_s (5 min)
        P1-10 Filenames include microseconds → collision-proof
        P2-2  Vectorised column transpose in _write_parquet (O(fields), not O(N×fields))
        P3-2  _cleanup_old_partitions() prunes dirs older than MAX_FILE_AGE_DAYS
    """

    def __init__(
        self,
        assets: Optional[List[str]] = None,
        flush_interval_s: float = FLUSH_INTERVAL_S,
        base_dir: Path = PARQUET_BASE_DIR,
        history_maxlen: int = HISTORY_MAXLEN,
    ):
        self.assets = list(assets or DEFAULT_ASSETS)
        self.flush_interval = flush_interval_s
        self.base_dir = Path(base_dir)
        self.history_maxlen = history_maxlen
        self._history: Dict[str, deque] = {
            a: deque(maxlen=history_maxlen) for a in self.assets
        }
        self._lock = threading.Lock()
        self._files_written = 0
        self._rows_written = 0
        self._last_flush_ts: Optional[float] = None
        self._thread: Optional[threading.Thread] = None
        self._running = False
    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def start(self):
        self._running = True
        self._thread = threading.Thread(
            target=self._run, daemon=True, name="obf-persist"
        )
        self._thread.start()
        logger.info(
            "OBFPersistenceService started (assets=%s, flush=%ss, maxlen=%d)",
            self.assets, self.flush_interval, self.history_maxlen,
        )

    def stop(self):
        self._running = False
        if self._thread:
            self._thread.join(timeout=10)
        logger.info("OBFPersistenceService stopped")

    def update_snapshot(self, asset: str, row: dict):
        """Append one feature row. O(1), lock-protected, non-blocking."""
        if asset not in self._history:
            return
        with self._lock:
            self._history[asset].append(row)

    def force_flush(self):
        """Synchronous flush (testing)."""
        self._flush()

    def get_stats(self) -> dict:
        with self._lock:
            sizes = {a: len(q) for a, q in self._history.items()}
        return {
            "files_written": self._files_written,
            "rows_written": self._rows_written,
            "last_flush_ts": self._last_flush_ts,
            "buffer_sizes": sizes,
        }
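
    # Example get_stats() return value (values illustrative):
    #   {"files_written": 12, "rows_written": 8400, "last_flush_ts": 1700000000.0,
    #    "buffer_sizes": {"BTCUSDT": 700, "ETHUSDT": 700, "SOLUSDT": 700}}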
    # ------------------------------------------------------------------
    # Background thread
    # ------------------------------------------------------------------
    def _run(self):
        # P1-9: first flush after FIRST_FLUSH_S, not flush_interval_s
        time.sleep(FIRST_FLUSH_S)
        while self._running:
            try:
                self._flush()
            except Exception as exc:
                logger.error("OBF flush error: %s", exc)
            time.sleep(self.flush_interval)
    # ------------------------------------------------------------------
    # Flush
    # ------------------------------------------------------------------
    def _flush(self):
        now = datetime.now(timezone.utc)
        date_str = now.strftime("%Y-%m-%d")
        # P1-10: microsecond-precision filename — no same-second collision
        ts_str = now.strftime("%H-%M-%S-%f")
        with self._lock:
            snapshots = {a: list(q) for a, q in self._history.items()}
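        # Note: buffers are intentionally not cleared here (crash-safety per
        # P0-5), so with maxlen = window + 100 consecutive files can overlap
        # by up to ~100 rows; readers should dedupe, e.g. on (asset, local_ts).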
        total_rows = 0
        for asset, rows in snapshots.items():
            if not rows:
                continue
            try:
                self._write_parquet(asset, rows, date_str, ts_str)
                total_rows += len(rows)
            except Exception as exc:
                logger.error("OBF write failed for %s: %s", asset, exc)
        self._last_flush_ts = time.time()
        if total_rows > 0:
            logger.info("OBF flush: %d rows across %d assets", total_rows, len(self.assets))
        # P3-2: prune old partitions after each flush
        try:
            self._cleanup_old_partitions()
        except Exception as exc:
            logger.warning("OBF cleanup error: %s", exc)
    # ------------------------------------------------------------------
    # Parquet write (P2-2: vectorised column transpose)
    # ------------------------------------------------------------------
    def _write_parquet(self, asset: str, rows: List[dict], date_str: str, ts_str: str):
        """Write one Parquet file (Hive partitioned) + MD5 checksum.

        Column building uses a single vectorised transpose pass (O(fields))
        rather than the original N×fields Python loop. Pattern inspired by
        prod/realtime_exf_service.py and backfill_ng3_to_arrow.py.
        """
        out_dir = (
            self.base_dir
            / f"exchange={EXCHANGE}"
            / f"symbol={asset}"
            / f"date={date_str}"
        )
        out_dir.mkdir(parents=True, exist_ok=True)
        filename = f"part-{ts_str}.parquet"
        out_path = out_dir / filename
        tmp_path = out_path.with_suffix(".tmp")

        # P2-2: single pass — transpose list-of-dicts → dict-of-lists
        sentinel = _SENTINELS
        arrays = {
            f.name: pa.array(
                [row.get(f.name, sentinel[f.name]) for row in rows],
                type=f.type,
            )
            for f in OB_SCHEMA
        }
        table = pa.table(arrays, schema=OB_SCHEMA)

        # Atomic write: tmp → rename
        pq.write_table(table, tmp_path, compression="snappy")
        tmp_path.rename(out_path)

        # MD5 checksum alongside
        digest = hashlib.md5(out_path.read_bytes()).hexdigest()
        (out_dir / f"{filename}.md5").write_text(digest)

        self._files_written += 1
        self._rows_written += len(rows)
        logger.debug("OBF wrote %d rows → %s", len(rows), out_path)
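
    # Checksum verification sketch for readers (illustrative):
    #   assert hashlib.md5(path.read_bytes()).hexdigest() == \
    #       path.with_name(path.name + ".md5").read_text()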
    # ------------------------------------------------------------------
    # P3-2: old partition cleanup
    # ------------------------------------------------------------------
    def _cleanup_old_partitions(self):
        """Remove date partition directories older than MAX_FILE_AGE_DAYS. 0 = disabled."""
        if not MAX_FILE_AGE_DAYS or not self.base_dir.exists():
            return
        cutoff = datetime.now(timezone.utc).date() - timedelta(days=MAX_FILE_AGE_DAYS)
        # Match directories explicitly (trailing-slash glob patterns behave
        # differently across Python versions).
        for date_dir in self.base_dir.glob("**/date=*"):
            if not date_dir.is_dir():
                continue
            try:
                date_str = date_dir.name.split("=", 1)[1]
                dir_date = datetime.strptime(date_str, "%Y-%m-%d").date()
                if dir_date < cutoff:
                    shutil.rmtree(date_dir)
                    logger.info("OBF pruned old partition: %s", date_dir)
            except (IndexError, ValueError):
                pass  # malformed dir name — skip
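
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; in production the obf_prefect_flow poll
# loop drives update_snapshot(). The row dict below is a stub, not a full
# OB_SCHEMA record — missing fields are filled with sentinels on write.)
# ---------------------------------------------------------------------------
#   svc = OBFPersistenceService(assets=["BTCUSDT"], flush_interval_s=5.0,
#                               base_dir=Path("/tmp/ob_features"))
#   svc.start()
#   svc.update_snapshot("BTCUSDT", {"asset": "BTCUSDT", "local_ts": time.time(),
#                                   "best_bid": 50_000.0, "best_ask": 50_001.0})
#   svc.force_flush()
#   svc.stop()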