""" OBF Persistence Service ======================= Persists live order-book feature snapshots to Arrow/Parquet for backtesting. Storage layout (Hive partitioning, Spark-compatible): /mnt/ng6_data/ob_features/ exchange=binance/ symbol=BTCUSDT/ date=YYYY-MM-DD/ part-HH-MM-SS-ffffff.parquet ← microsecond precision (P1-10) part-HH-MM-SS-ffffff.parquet.md5 Schema: one row per OB snapshot per asset. Fixes applied: P0-5 HISTORY_MAXLEN now covers the full flush window (no data loss on crash) P1-9 First flush fires after FIRST_FLUSH_S (60 s) not flush_interval_s (5 min) P1-10 Filenames include microseconds — no same-second collision P2-2 _write_parquet uses vectorised column transpose (inspired by realtime_exf_service.py) P2-8 _imb_hist deque sized to actual IMBALANCE_LOOKBACK, not DEPTH_LOOKBACK P3-2 _cleanup_old_partitions() removes date dirs older than MAX_FILE_AGE_DAYS """ import hashlib import logging import shutil import threading import time from collections import deque from datetime import datetime, timedelta, timezone from pathlib import Path from typing import Dict, List, Optional import numpy as np import pyarrow as pa import pyarrow.parquet as pq logger = logging.getLogger(__name__) # =========================================================================== # CONSTANTS # =========================================================================== PARQUET_BASE_DIR = Path("/mnt/ng6_data/ob_features") EXCHANGE = "binance" FLUSH_INTERVAL_S = 300 # 5 minutes between regular flushes FIRST_FLUSH_S = 60 # P1-9: first flush after 60 s (not 5 min) POLL_INTERVAL_S = 0.5 # must match obf_prefect_flow.poll_interval_s # P0-5: buffer must cover the full flush window so no rows are evicted before flush HISTORY_MAXLEN = int(FLUSH_INTERVAL_S / POLL_INTERVAL_S) + 100 # = 700 MAX_FILE_AGE_DAYS = 0 # 0 = never prune (accumulate for backtesting); set >0 to enable rolling window DEFAULT_ASSETS = ["BTCUSDT", "ETHUSDT", "SOLUSDT"] # =========================================================================== # PARQUET SCHEMA — complete OB feature record per snapshot per asset # =========================================================================== OB_SCHEMA = pa.schema([ # Timing pa.field("exchange_ts", pa.float64()), pa.field("local_ts", pa.float64()), pa.field("pushed_at", pa.float64()), pa.field("lag_s", pa.float32()), # Identity pa.field("asset", pa.string()), # Top-of-book pa.field("best_bid", pa.float64()), pa.field("best_ask", pa.float64()), pa.field("spread_bps", pa.float32()), # Depth vectors (5 levels: 0=1%, 1=2%, ..., 4=5% from mid) pa.field("bid_notional_0", pa.float64()), pa.field("bid_notional_1", pa.float64()), pa.field("bid_notional_2", pa.float64()), pa.field("bid_notional_3", pa.float64()), pa.field("bid_notional_4", pa.float64()), pa.field("ask_notional_0", pa.float64()), pa.field("ask_notional_1", pa.float64()), pa.field("ask_notional_2", pa.float64()), pa.field("ask_notional_3", pa.float64()), pa.field("ask_notional_4", pa.float64()), # Sub-system 1 — Per-asset placement pa.field("depth_1pct_usd", pa.float64()), pa.field("depth_quality", pa.float32()), pa.field("fill_probability", pa.float32()), pa.field("spread_proxy_bps", pa.float32()), # Sub-system 2 — Per-asset signal pa.field("imbalance", pa.float32()), pa.field("imbalance_ma5", pa.float32()), pa.field("imbalance_persistence", pa.float32()), pa.field("depth_asymmetry", pa.float32()), pa.field("withdrawal_velocity", pa.float32()), # Sub-system 3 — Cross-asset market signal pa.field("median_imbalance", pa.float32()), pa.field("agreement_pct", pa.float32()), pa.field("depth_pressure", pa.float32()), # Sub-system 4 — Macro regime pa.field("depth_velocity", pa.float32()), pa.field("cascade_count", pa.int32()), pa.field("acceleration", pa.float32()), pa.field("regime_signal", pa.int32()), ]) # Pre-build sentinel values per field type (used in vectorised write, P2-2) def _sentinel(field: pa.Field): if pa.types.is_string(field.type): return "" if pa.types.is_integer(field.type): return 0 return float("nan") _SENTINELS: Dict[str, object] = {f.name: _sentinel(f) for f in OB_SCHEMA} # =========================================================================== # LIVE OB FEATURE ENGINE (incremental, no preload) # =========================================================================== class LiveOBFeatureEngine: """ Streaming, incremental version of OBFeatureEngine for production use. Fixes applied: P2-8 _imb_hist deque maxlen = IMBALANCE_LOOKBACK+5, not DEPTH_LOOKBACK """ IMBALANCE_LOOKBACK = 10 # snapshots → 5 s window at 0.5 s polling DEPTH_LOOKBACK = 60 # snapshots → 30 s velocity window CASCADE_THRESHOLD = -0.10 MEDIAN_CALIB_N = 60 # first N samples to build median depth ref def __init__(self, assets: List[str]): self.assets = list(assets) # P2-8: size each deque to its actual lookback, not the larger DEPTH_LOOKBACK self._imb_hist: Dict[str, deque] = { a: deque(maxlen=self.IMBALANCE_LOOKBACK + 5) for a in assets } self._dep_hist: Dict[str, deque] = { a: deque(maxlen=self.DEPTH_LOOKBACK) for a in assets } self._median_ref: Dict[str, float] = {a: 0.0 for a in assets} self._calib_buf: Dict[str, list] = {a: [] for a in assets} self._calibrated: Dict[str, bool] = {a: False for a in assets} self._market_dep: deque = deque(maxlen=self.DEPTH_LOOKBACK) def update(self, snaps: dict) -> dict: """ Process one OBSnapshot per asset (may be None if not yet initialized). Args: snaps: Dict[asset_str → OBSnapshot-like dict | None] Returns: Feature dict with 'per_asset', 'market', 'macro' sub-keys. """ try: from nautilus_dolphin.nautilus_dolphin.nautilus.ob_features import ( compute_imbalance_nb, compute_depth_1pct_nb, compute_depth_quality_nb, compute_fill_probability_nb, compute_spread_proxy_nb, compute_depth_asymmetry_nb, compute_imbalance_persistence_nb, compute_withdrawal_velocity_nb, compute_market_agreement_nb, compute_cascade_signal_nb, ) except (ImportError, RuntimeError): import importlib.util, types, sys _root = Path(__file__).parent.parent _naut_dir = _root / "nautilus_dolphin" / "nautilus_dolphin" / "nautilus" _pkg = "_naut_ob" if _pkg not in sys.modules: _p = types.ModuleType(_pkg) _p.__path__ = [str(_naut_dir)] _p.__package__ = _pkg sys.modules[_pkg] = _p def _lm(name): fn = f"{_pkg}.{name}" if fn not in sys.modules: sp = importlib.util.spec_from_file_location(fn, _naut_dir / f"{name}.py") m = importlib.util.module_from_spec(sp) m.__package__ = _pkg sys.modules[fn] = m sp.loader.exec_module(m) return sys.modules[fn] _lm("ob_provider") _obf = _lm("ob_features") compute_imbalance_nb = _obf.compute_imbalance_nb compute_depth_1pct_nb = _obf.compute_depth_1pct_nb compute_depth_quality_nb = _obf.compute_depth_quality_nb compute_fill_probability_nb = _obf.compute_fill_probability_nb compute_spread_proxy_nb = _obf.compute_spread_proxy_nb compute_depth_asymmetry_nb = _obf.compute_depth_asymmetry_nb compute_imbalance_persistence_nb = _obf.compute_imbalance_persistence_nb compute_withdrawal_velocity_nb = _obf.compute_withdrawal_velocity_nb compute_market_agreement_nb = _obf.compute_market_agreement_nb compute_cascade_signal_nb = _obf.compute_cascade_signal_nb per_asset: dict = {} asset_imbalances: list = [] asset_velocities: list = [] market_total_depth = 0.0 for asset in self.assets: snap = snaps.get(asset) if snap is None: per_asset[asset] = None continue bid_n = np.asarray(snap.get("bid_notional", [0.0] * 5), dtype=np.float64) ask_n = np.asarray(snap.get("ask_notional", [0.0] * 5), dtype=np.float64) imb = compute_imbalance_nb(bid_n, ask_n) d1pct = compute_depth_1pct_nb(bid_n, ask_n) if not self._calibrated[asset]: self._calib_buf[asset].append(d1pct) if len(self._calib_buf[asset]) >= self.MEDIAN_CALIB_N: self._median_ref[asset] = float(np.median(self._calib_buf[asset])) self._calibrated[asset] = True med_ref = self._median_ref[asset] if self._calibrated[asset] else max(d1pct, 1.0) dq = compute_depth_quality_nb(d1pct, med_ref) fp = compute_fill_probability_nb(dq) sp = compute_spread_proxy_nb(bid_n, ask_n) da = compute_depth_asymmetry_nb(bid_n, ask_n) self._imb_hist[asset].append(imb) self._dep_hist[asset].append(d1pct) market_total_depth += d1pct imb_arr = np.array(list(self._imb_hist[asset]), dtype=np.float64) ma5_n = min(5, len(imb_arr)) imb_ma5 = float(np.mean(imb_arr[-ma5_n:])) if ma5_n > 0 else float(imb) persistence = float(compute_imbalance_persistence_nb(imb_arr, self.IMBALANCE_LOOKBACK)) dep_arr = np.array(list(self._dep_hist[asset]), dtype=np.float64) lb_actual = min(self.DEPTH_LOOKBACK, len(dep_arr) - 1) velocity = float(compute_withdrawal_velocity_nb(dep_arr, lb_actual)) per_asset[asset] = { "depth_1pct_usd": float(d1pct), "depth_quality": float(dq), "fill_probability": float(fp), "spread_proxy_bps": float(sp), "imbalance": float(imb), "imbalance_ma5": float(imb_ma5), "imbalance_persistence": float(persistence), "depth_asymmetry": float(da), "withdrawal_velocity": float(velocity), } asset_imbalances.append(float(imb)) asset_velocities.append(float(velocity)) if market_total_depth > 0: self._market_dep.append(market_total_depth) n_valid = len(asset_imbalances) if n_valid > 0: imb_np = np.array(asset_imbalances, dtype=np.float64) med_imb, agreement = compute_market_agreement_nb(imb_np, n_valid) market_features = { "median_imbalance": float(med_imb), "agreement_pct": float(agreement), "depth_pressure": float(np.mean(imb_np)), } else: market_features = {"median_imbalance": 0.0, "agreement_pct": 0.5, "depth_pressure": 0.0} if n_valid > 0: vel_np = np.array(asset_velocities, dtype=np.float64) cascade, mean_vel, regime = compute_cascade_signal_nb( vel_np, n_valid, self.CASCADE_THRESHOLD ) dep_hist_list = list(self._market_dep) accel = 0.0 if len(dep_hist_list) >= 20: rv = (dep_hist_list[-1] - dep_hist_list[-10]) / max(1e-9, dep_hist_list[-10]) pv = (dep_hist_list[-10] - dep_hist_list[-20]) / max(1e-9, dep_hist_list[-20]) accel = rv - pv macro_features = { "depth_velocity": float(mean_vel), "cascade_count": int(cascade), "acceleration": float(accel), "regime_signal": int(regime), } else: macro_features = {"depth_velocity": 0.0, "cascade_count": 0, "acceleration": 0.0, "regime_signal": 0} return {"per_asset": per_asset, "market": market_features, "macro": macro_features} # =========================================================================== # PERSISTENCE SERVICE # =========================================================================== class OBFPersistenceService: """ Background thread that flushes OB feature snapshots to Parquet. Fixes applied: P0-5 HISTORY_MAXLEN covers full flush window (no silent row eviction) P1-9 First flush after FIRST_FLUSH_S (60 s), not flush_interval_s (5 min) P1-10 Filenames include microseconds — collision-proof P2-2 Vectorised column transpose in _write_parquet (O(fields) not O(N×fields)) P3-2 _cleanup_old_partitions() prunes dirs older than MAX_FILE_AGE_DAYS """ def __init__( self, assets: List[str] = None, flush_interval_s: float = FLUSH_INTERVAL_S, base_dir: Path = PARQUET_BASE_DIR, history_maxlen: int = HISTORY_MAXLEN, ): self.assets = list(assets or DEFAULT_ASSETS) self.flush_interval = flush_interval_s self.base_dir = Path(base_dir) self.history_maxlen = history_maxlen self._history: Dict[str, deque] = { a: deque(maxlen=history_maxlen) for a in self.assets } self._lock = threading.Lock() self._files_written = 0 self._rows_written = 0 self._last_flush_ts: Optional[float] = None self._thread: Optional[threading.Thread] = None self._running = False # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ def start(self): self._running = True self._thread = threading.Thread( target=self._run, daemon=True, name="obf-persist" ) self._thread.start() logger.info( "OBFPersistenceService started (assets=%s, flush=%ss, maxlen=%d)", self.assets, self.flush_interval, self.history_maxlen, ) def stop(self): self._running = False if self._thread: self._thread.join(timeout=10) logger.info("OBFPersistenceService stopped") def update_snapshot(self, asset: str, row: dict): """Append one feature row. O(1), lock-protected, non-blocking.""" if asset not in self._history: return with self._lock: self._history[asset].append(row) def force_flush(self): """Synchronous flush (testing).""" self._flush() def get_stats(self) -> dict: with self._lock: sizes = {a: len(q) for a, q in self._history.items()} return { "files_written": self._files_written, "rows_written": self._rows_written, "last_flush_ts": self._last_flush_ts, "buffer_sizes": sizes, } # ------------------------------------------------------------------ # Background thread # ------------------------------------------------------------------ def _run(self): # P1-9: first flush after FIRST_FLUSH_S, not flush_interval_s time.sleep(FIRST_FLUSH_S) while self._running: try: self._flush() except Exception as exc: logger.error("OBF flush error: %s", exc) time.sleep(self.flush_interval) # ------------------------------------------------------------------ # Flush # ------------------------------------------------------------------ def _flush(self): now = datetime.now(timezone.utc) date_str = now.strftime("%Y-%m-%d") # P1-10: microsecond-precision filename — no same-second collision ts_str = now.strftime("%H-%M-%S-%f") with self._lock: snapshots = {a: list(q) for a, q in self._history.items()} total_rows = 0 for asset, rows in snapshots.items(): if not rows: continue try: self._write_parquet(asset, rows, date_str, ts_str) total_rows += len(rows) except Exception as exc: logger.error("OBF write failed for %s: %s", asset, exc) self._last_flush_ts = time.time() if total_rows > 0: logger.info("OBF flush: %d rows across %d assets", total_rows, len(self.assets)) # P3-2: prune old partitions after each flush try: self._cleanup_old_partitions() except Exception as exc: logger.warning("OBF cleanup error: %s", exc) # ------------------------------------------------------------------ # Parquet write (P2-2: vectorised column transpose) # ------------------------------------------------------------------ def _write_parquet(self, asset: str, rows: List[dict], date_str: str, ts_str: str): """Write one Parquet file (Hive partitioned) + MD5 checksum. Column building uses a single vectorised transpose pass (O(fields)) rather than the original N×fields Python loop. Pattern inspired by prod/realtime_exf_service.py and backfill_ng3_to_arrow.py. """ out_dir = ( self.base_dir / f"exchange={EXCHANGE}" / f"symbol={asset}" / f"date={date_str}" ) out_dir.mkdir(parents=True, exist_ok=True) filename = f"part-{ts_str}.parquet" out_path = out_dir / filename tmp_path = out_path.with_suffix(".tmp") # P2-2: single pass — transpose list-of-dicts → dict-of-lists sentinel = _SENTINELS arrays = { f.name: pa.array( [row.get(f.name, sentinel[f.name]) for row in rows], type=f.type, ) for f in OB_SCHEMA } table = pa.table(arrays, schema=OB_SCHEMA) # Atomic write: tmp → rename pq.write_table(table, tmp_path, compression="snappy") tmp_path.rename(out_path) # MD5 checksum alongside digest = hashlib.md5(out_path.read_bytes()).hexdigest() (out_dir / f"{filename}.md5").write_text(digest) self._files_written += 1 self._rows_written += len(rows) logger.debug("OBF wrote %d rows → %s", len(rows), out_path) # ------------------------------------------------------------------ # P3-2: old partition cleanup # ------------------------------------------------------------------ def _cleanup_old_partitions(self): """Remove date partition directories older than MAX_FILE_AGE_DAYS. 0 = disabled.""" if not MAX_FILE_AGE_DAYS or not self.base_dir.exists(): return cutoff = datetime.now(timezone.utc).date() - timedelta(days=MAX_FILE_AGE_DAYS) for date_dir in self.base_dir.glob("**/date=*/"): try: date_str = date_dir.name.split("=", 1)[1] dir_date = datetime.strptime(date_str, "%Y-%m-%d").date() if dir_date < cutoff: shutil.rmtree(date_dir) logger.info("OBF pruned old partition: %s", date_dir) except (IndexError, ValueError): pass # malformed dir name — skip