485 lines
20 KiB
Python
485 lines
20 KiB
Python
|
|
"""
|
|||
|
|
OBF Persistence Service
|
|||
|
|
=======================
|
|||
|
|
|
|||
|
|
Persists live order-book feature snapshots to Arrow/Parquet for backtesting.
|
|||
|
|
|
|||
|
|
Storage layout (Hive partitioning, Spark-compatible):
|
|||
|
|
/mnt/ng6_data/ob_features/
|
|||
|
|
exchange=binance/
|
|||
|
|
symbol=BTCUSDT/
|
|||
|
|
date=YYYY-MM-DD/
|
|||
|
|
part-HH-MM-SS-ffffff.parquet ← microsecond precision (P1-10)
|
|||
|
|
part-HH-MM-SS-ffffff.parquet.md5
|
|||
|
|
|
|||
|
|
Schema: one row per OB snapshot per asset.
|
|||
|
|
|
|||
|
|
Fixes applied:
|
|||
|
|
P0-5 HISTORY_MAXLEN now covers the full flush window (no data loss on crash)
|
|||
|
|
P1-9 First flush fires after FIRST_FLUSH_S (60 s) not flush_interval_s (5 min)
|
|||
|
|
P1-10 Filenames include microseconds — no same-second collision
|
|||
|
|
P2-2 _write_parquet uses vectorised column transpose (inspired by realtime_exf_service.py)
|
|||
|
|
P2-8 _imb_hist deque sized to actual IMBALANCE_LOOKBACK, not DEPTH_LOOKBACK
|
|||
|
|
P3-2 _cleanup_old_partitions() removes date dirs older than MAX_FILE_AGE_DAYS
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import hashlib
|
|||
|
|
import logging
|
|||
|
|
import shutil
|
|||
|
|
import threading
|
|||
|
|
import time
|
|||
|
|
from collections import deque
|
|||
|
|
from datetime import datetime, timedelta, timezone
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Dict, List, Optional
|
|||
|
|
|
|||
|
|
import numpy as np
|
|||
|
|
import pyarrow as pa
|
|||
|
|
import pyarrow.parquet as pq
|
|||
|
|
|
|||
|
|
logger = logging.getLogger(__name__)

# ===========================================================================
# CONSTANTS
# ===========================================================================

# Root directory of the Hive-partitioned Parquet tree.
PARQUET_BASE_DIR = Path("/mnt/ng6_data/ob_features")

# Value written into the ``exchange=`` partition key.
EXCHANGE = "binance"

# Seconds between regular flushes (5 minutes).
FLUSH_INTERVAL_S = 300

# P1-9: the very first flush fires after 60 s instead of a full flush interval.
FIRST_FLUSH_S = 60

# Snapshot polling period in seconds; must match obf_prefect_flow.poll_interval_s.
POLL_INTERVAL_S = 0.5

# P0-5: the per-asset buffer must hold a full flush window, plus 100 rows of
# slack, so that no row is evicted before it is flushed (300 / 0.5 + 100 = 700).
HISTORY_MAXLEN = 100 + int(FLUSH_INTERVAL_S / POLL_INTERVAL_S)

# Retention for date partitions, in days.  0 = never prune (accumulate for
# backtesting); set > 0 to enable a rolling window.
MAX_FILE_AGE_DAYS = 0

# Symbols persisted when the caller does not supply its own asset list.
DEFAULT_ASSETS = ["BTCUSDT", "ETHUSDT", "SOLUSDT"]
|
|||
|
|
|
|||
|
|
# ===========================================================================
# PARQUET SCHEMA — complete OB feature record per snapshot per asset
# ===========================================================================
# The table below lists (column name, Arrow type) pairs in on-disk column order.
OB_SCHEMA = pa.schema(
    [
        pa.field(column, dtype)
        for column, dtype in (
            # Timing
            ("exchange_ts", pa.float64()),
            ("local_ts", pa.float64()),
            ("pushed_at", pa.float64()),
            ("lag_s", pa.float32()),
            # Identity
            ("asset", pa.string()),
            # Top-of-book
            ("best_bid", pa.float64()),
            ("best_ask", pa.float64()),
            ("spread_bps", pa.float32()),
            # Depth vectors (5 levels: 0=1%, 1=2%, ..., 4=5% from mid)
            *((f"bid_notional_{level}", pa.float64()) for level in range(5)),
            *((f"ask_notional_{level}", pa.float64()) for level in range(5)),
            # Sub-system 1 — Per-asset placement
            ("depth_1pct_usd", pa.float64()),
            ("depth_quality", pa.float32()),
            ("fill_probability", pa.float32()),
            ("spread_proxy_bps", pa.float32()),
            # Sub-system 2 — Per-asset signal
            ("imbalance", pa.float32()),
            ("imbalance_ma5", pa.float32()),
            ("imbalance_persistence", pa.float32()),
            ("depth_asymmetry", pa.float32()),
            ("withdrawal_velocity", pa.float32()),
            # Sub-system 3 — Cross-asset market signal
            ("median_imbalance", pa.float32()),
            ("agreement_pct", pa.float32()),
            ("depth_pressure", pa.float32()),
            # Sub-system 4 — Macro regime
            ("depth_velocity", pa.float32()),
            ("cascade_count", pa.int32()),
            ("acceleration", pa.float32()),
            ("regime_signal", pa.int32()),
        )
    ]
)
|
|||
|
|
|
|||
|
|
# Placeholder values per field type, pre-built once for the Parquet write (P2-2).
def _sentinel(field: pa.Field):
    """Return the placeholder stored when a row has no value for *field*.

    String columns get ``""``, integer columns get ``0`` and every other
    column type gets ``NaN``.
    """
    dtype = field.type
    if pa.types.is_string(dtype):
        placeholder = ""
    elif pa.types.is_integer(dtype):
        placeholder = 0
    else:
        placeholder = float("nan")
    return placeholder
|
|||
|
|
|
|||
|
|
# Column name → placeholder value, computed once at import time.  _write_parquet
# uses it as the default for any row dict that lacks that column's key.
_SENTINELS: Dict[str, object] = {f.name: _sentinel(f) for f in OB_SCHEMA}
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ===========================================================================
|
|||
|
|
# LIVE OB FEATURE ENGINE (incremental, no preload)
|
|||
|
|
# ===========================================================================
|
|||
|
|
|
|||
|
|
class LiveOBFeatureEngine:
    """
    Streaming, incremental version of OBFeatureEngine for production use.

    One call to :meth:`update` consumes the latest snapshot of every asset and
    returns per-asset, cross-asset ("market") and macro features.  Rolling
    state (imbalance / depth histories, median-depth calibration) is kept per
    asset on the instance.

    Fixes applied:
      P2-8  _imb_hist deque maxlen = IMBALANCE_LOOKBACK+5, not DEPTH_LOOKBACK
      -     per-asset state is keyed from the materialised asset list, so a
            generator/iterator passed as ``assets`` no longer yields empty state
      -     the feature kernels are resolved once and cached on the class
            instead of re-running the import/fallback logic on every update
    """

    IMBALANCE_LOOKBACK = 10    # snapshots → 5 s window at 0.5 s polling
    DEPTH_LOOKBACK = 60        # snapshots → 30 s velocity window
    CASCADE_THRESHOLD = -0.10
    MEDIAN_CALIB_N = 60        # first N samples to build median depth ref

    # Dotted path of the module that provides the feature kernels.
    _KERNEL_MODULE = "nautilus_dolphin.nautilus_dolphin.nautilus.ob_features"
    # Kernel functions this engine calls; all must exist in the kernel module.
    _KERNEL_NAMES = (
        "compute_imbalance_nb",
        "compute_depth_1pct_nb",
        "compute_depth_quality_nb",
        "compute_fill_probability_nb",
        "compute_spread_proxy_nb",
        "compute_depth_asymmetry_nb",
        "compute_imbalance_persistence_nb",
        "compute_withdrawal_velocity_nb",
        "compute_market_agreement_nb",
        "compute_cascade_signal_nb",
    )
    # Namespace of resolved kernels; filled lazily by _load_kernels().
    _kernels = None

    def __init__(self, assets: List[str]):
        """Create empty rolling state for every symbol in *assets*."""
        # Materialise first: every dict below must be keyed from the SAME list,
        # otherwise a one-shot iterable would already be exhausted.
        self.assets = list(assets)
        tracked = self.assets
        # P2-8: size each deque to its actual lookback, not the larger DEPTH_LOOKBACK
        self._imb_hist: Dict[str, deque] = {
            a: deque(maxlen=self.IMBALANCE_LOOKBACK + 5) for a in tracked
        }
        self._dep_hist: Dict[str, deque] = {
            a: deque(maxlen=self.DEPTH_LOOKBACK) for a in tracked
        }
        self._median_ref: Dict[str, float] = {a: 0.0 for a in tracked}
        self._calib_buf: Dict[str, list] = {a: [] for a in tracked}
        self._calibrated: Dict[str, bool] = {a: False for a in tracked}
        # Summed 1%-depth across assets, one entry per update with depth > 0.
        self._market_dep: deque = deque(maxlen=self.DEPTH_LOOKBACK)

    # ------------------------------------------------------------------
    # Kernel resolution
    # ------------------------------------------------------------------

    @staticmethod
    def _load_ob_features_from_source():
        """Load ``ob_features.py`` straight from the source tree.

        Used when the regular package import fails.  ``ob_provider`` and
        ``ob_features`` are registered under the synthetic package
        ``_naut_ob`` and the ``ob_features`` module is returned.
        """
        import importlib.util
        import sys
        import types

        root = Path(__file__).parent.parent
        naut_dir = root / "nautilus_dolphin" / "nautilus_dolphin" / "nautilus"
        pkg = "_naut_ob"
        if pkg not in sys.modules:
            shim = types.ModuleType(pkg)
            shim.__path__ = [str(naut_dir)]
            shim.__package__ = pkg
            sys.modules[pkg] = shim

        def load(name):
            full_name = f"{pkg}.{name}"
            if full_name not in sys.modules:
                spec = importlib.util.spec_from_file_location(
                    full_name, naut_dir / f"{name}.py"
                )
                module = importlib.util.module_from_spec(spec)
                module.__package__ = pkg
                # Registered before execution, as the original loader did.
                sys.modules[full_name] = module
                try:
                    spec.loader.exec_module(module)
                except BaseException:
                    # Do not leave a half-initialised module behind: the next
                    # call would silently reuse it.
                    sys.modules.pop(full_name, None)
                    raise
            return sys.modules[full_name]

        load("ob_provider")
        return load("ob_features")

    @classmethod
    def _load_kernels(cls):
        """Resolve the feature kernels once and cache them on the class.

        The package import is tried first; on ImportError/RuntimeError, or if
        any required kernel is missing from the imported module, the module is
        loaded from the source tree instead.
        """
        cached = cls._kernels
        if cached is not None:
            return cached

        import importlib
        import types

        try:
            module = importlib.import_module(cls._KERNEL_MODULE)
        except (ImportError, RuntimeError):
            module = None
        if module is None or not all(hasattr(module, n) for n in cls._KERNEL_NAMES):
            module = cls._load_ob_features_from_source()

        kernels = types.SimpleNamespace(
            **{name: getattr(module, name) for name in cls._KERNEL_NAMES}
        )
        cls._kernels = kernels
        return kernels

    # ------------------------------------------------------------------
    # Feature computation
    # ------------------------------------------------------------------

    def _update_asset(self, asset: str, snap, k) -> dict:
        """Fold one snapshot of *asset* into its rolling state; return its features."""
        bid_n = np.asarray(snap.get("bid_notional", [0.0] * 5), dtype=np.float64)
        ask_n = np.asarray(snap.get("ask_notional", [0.0] * 5), dtype=np.float64)

        imb = k.compute_imbalance_nb(bid_n, ask_n)
        d1pct = k.compute_depth_1pct_nb(bid_n, ask_n)

        # Median-depth reference: collect the first MEDIAN_CALIB_N samples,
        # then freeze their median.  Until then the current depth (floored at
        # 1.0) stands in as the reference.
        if not self._calibrated[asset]:
            samples = self._calib_buf[asset]
            samples.append(d1pct)
            if len(samples) >= self.MEDIAN_CALIB_N:
                self._median_ref[asset] = float(np.median(samples))
                self._calibrated[asset] = True
        if self._calibrated[asset]:
            med_ref = self._median_ref[asset]
        else:
            med_ref = max(d1pct, 1.0)

        dq = k.compute_depth_quality_nb(d1pct, med_ref)
        fp = k.compute_fill_probability_nb(dq)
        sp = k.compute_spread_proxy_nb(bid_n, ask_n)
        da = k.compute_depth_asymmetry_nb(bid_n, ask_n)

        self._imb_hist[asset].append(imb)
        self._dep_hist[asset].append(d1pct)

        # The history always holds at least the value just appended, so the
        # trailing slice is never empty.
        imb_arr = np.array(list(self._imb_hist[asset]), dtype=np.float64)
        imb_ma5 = float(np.mean(imb_arr[-5:]))
        persistence = float(
            k.compute_imbalance_persistence_nb(imb_arr, self.IMBALANCE_LOOKBACK)
        )

        dep_arr = np.array(list(self._dep_hist[asset]), dtype=np.float64)
        # Lookback is capped by the samples available (0 on the first sample).
        lb_actual = min(self.DEPTH_LOOKBACK, len(dep_arr) - 1)
        velocity = float(k.compute_withdrawal_velocity_nb(dep_arr, lb_actual))

        return {
            "depth_1pct_usd": float(d1pct),
            "depth_quality": float(dq),
            "fill_probability": float(fp),
            "spread_proxy_bps": float(sp),
            "imbalance": float(imb),
            "imbalance_ma5": float(imb_ma5),
            "imbalance_persistence": float(persistence),
            "depth_asymmetry": float(da),
            "withdrawal_velocity": float(velocity),
        }

    def _market_acceleration(self) -> float:
        """Change in relative market-depth growth between the last two 10-sample spans.

        Returns 0.0 until at least 20 market-depth samples have been recorded.
        """
        hist = self._market_dep
        if len(hist) < 20:
            return 0.0
        latest, middle, oldest = hist[-1], hist[-10], hist[-20]
        recent = (latest - middle) / max(1e-9, middle)
        prior = (middle - oldest) / max(1e-9, oldest)
        return recent - prior

    def update(self, snaps: dict) -> dict:
        """
        Process one OBSnapshot per asset (may be None if not yet initialized).

        Args:
            snaps: Dict[asset_str → OBSnapshot-like dict | None]

        Returns:
            Feature dict with 'per_asset', 'market', 'macro' sub-keys.
            ``per_asset[asset]`` is None for assets without a snapshot; when no
            asset has a snapshot, 'market' and 'macro' carry neutral defaults.
        """
        k = self._load_kernels()

        per_asset: dict = {}
        asset_imbalances: list = []
        asset_velocities: list = []
        market_total_depth = 0.0

        for asset in self.assets:
            snap = snaps.get(asset)
            if snap is None:
                per_asset[asset] = None
                continue
            features = self._update_asset(asset, snap, k)
            per_asset[asset] = features
            asset_imbalances.append(features["imbalance"])
            asset_velocities.append(features["withdrawal_velocity"])
            market_total_depth += features["depth_1pct_usd"]

        if market_total_depth > 0:
            self._market_dep.append(market_total_depth)

        n_valid = len(asset_imbalances)
        if n_valid == 0:
            market_features = {
                "median_imbalance": 0.0,
                "agreement_pct": 0.5,
                "depth_pressure": 0.0,
            }
            macro_features = {
                "depth_velocity": 0.0,
                "cascade_count": 0,
                "acceleration": 0.0,
                "regime_signal": 0,
            }
            return {"per_asset": per_asset, "market": market_features, "macro": macro_features}

        imb_np = np.array(asset_imbalances, dtype=np.float64)
        med_imb, agreement = k.compute_market_agreement_nb(imb_np, n_valid)
        market_features = {
            "median_imbalance": float(med_imb),
            "agreement_pct": float(agreement),
            "depth_pressure": float(np.mean(imb_np)),
        }

        vel_np = np.array(asset_velocities, dtype=np.float64)
        cascade, mean_vel, regime = k.compute_cascade_signal_nb(
            vel_np, n_valid, self.CASCADE_THRESHOLD
        )
        macro_features = {
            "depth_velocity": float(mean_vel),
            "cascade_count": int(cascade),
            "acceleration": float(self._market_acceleration()),
            "regime_signal": int(regime),
        }

        return {"per_asset": per_asset, "market": market_features, "macro": macro_features}
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ===========================================================================
|
|||
|
|
# PERSISTENCE SERVICE
|
|||
|
|
# ===========================================================================
|
|||
|
|
|
|||
|
|
class OBFPersistenceService:
    """
    Background thread that flushes OB feature snapshots to Parquet.

    Rows are appended per asset with :meth:`update_snapshot`, buffered in
    bounded deques, and written by a daemon thread: first after FIRST_FLUSH_S
    seconds, then every ``flush_interval_s`` seconds.

    Fixes applied:
      P0-5  HISTORY_MAXLEN covers full flush window (no silent row eviction)
      P1-9  First flush after FIRST_FLUSH_S (60 s), not flush_interval_s (5 min)
      P1-10 Filenames include microseconds — collision-proof
      P2-2  Column transpose in _write_parquet (one list per schema field)
      P3-2  _cleanup_old_partitions() prunes dirs older than MAX_FILE_AGE_DAYS
      -     Each flush drains the buffers, so a row is written to exactly one
            file; rows of a failed write are put back and retried next flush
      -     stop() interrupts the worker's wait and triggers a final flush
      -     A failed write no longer leaves its ``.tmp`` file behind
    """

    def __init__(
        self,
        assets: Optional[List[str]] = None,
        flush_interval_s: float = FLUSH_INTERVAL_S,
        base_dir: Path = PARQUET_BASE_DIR,
        history_maxlen: int = HISTORY_MAXLEN,
    ):
        """
        Args:
            assets: Symbols to persist; DEFAULT_ASSETS when None or empty.
            flush_interval_s: Seconds between regular flushes.
            base_dir: Root directory of the partitioned Parquet tree.
            history_maxlen: Maximum rows buffered per asset between flushes;
                the oldest rows are dropped once the bound is reached.
        """
        self.assets = list(assets or DEFAULT_ASSETS)
        self.flush_interval = flush_interval_s
        self.base_dir = Path(base_dir)
        self.history_maxlen = history_maxlen

        self._history: Dict[str, deque] = {
            a: deque(maxlen=history_maxlen) for a in self.assets
        }
        self._lock = threading.Lock()

        self._files_written = 0
        self._rows_written = 0
        self._last_flush_ts: Optional[float] = None
        self._thread: Optional[threading.Thread] = None
        self._running = False
        # Set by stop(); lets the worker wake up immediately instead of
        # sleeping out the rest of the flush interval.
        self._stop_event = threading.Event()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def start(self):
        """Start the background flush thread (no-op if it is already running)."""
        if self._thread is not None and self._thread.is_alive():
            logger.warning("OBFPersistenceService already running; start() ignored")
            return
        self._stop_event.clear()
        self._running = True
        self._thread = threading.Thread(
            target=self._run, daemon=True, name="obf-persist"
        )
        self._thread.start()
        logger.info(
            "OBFPersistenceService started (assets=%s, flush=%ss, maxlen=%d)",
            self.assets, self.flush_interval, self.history_maxlen,
        )

    def stop(self):
        """Signal the worker to stop and wait up to 10 s for its final flush."""
        self._running = False
        self._stop_event.set()
        worker = self._thread
        if worker:
            worker.join(timeout=10)
            if worker.is_alive():
                logger.warning("OBFPersistenceService worker did not exit within 10 s")
        logger.info("OBFPersistenceService stopped")

    def update_snapshot(self, asset: str, row: dict):
        """Append one feature row. O(1), lock-protected, non-blocking.

        Rows for assets this service was not configured with are ignored.
        """
        if asset not in self._history:
            return
        with self._lock:
            self._history[asset].append(row)

    def force_flush(self):
        """Synchronous flush (testing)."""
        self._flush()

    def get_stats(self) -> dict:
        """Return write counters, last flush time and current buffer sizes."""
        with self._lock:
            sizes = {a: len(q) for a, q in self._history.items()}
        return {
            "files_written": self._files_written,
            "rows_written": self._rows_written,
            "last_flush_ts": self._last_flush_ts,
            "buffer_sizes": sizes,
        }

    # ------------------------------------------------------------------
    # Background thread
    # ------------------------------------------------------------------

    def _flush_logged(self):
        """Run one flush; log (with traceback) instead of killing the worker."""
        try:
            self._flush()
        except Exception as exc:
            logger.error("OBF flush error: %s", exc, exc_info=True)

    def _run(self):
        """Worker loop: wait, flush, repeat; flush once more when stopped."""
        # P1-9: first flush after FIRST_FLUSH_S, not flush_interval_s
        delay = FIRST_FLUSH_S
        # Event.wait() returns True as soon as stop() sets the event, so
        # shutdown is not delayed by the remaining sleep time.
        while not self._stop_event.wait(delay) and self._running:
            self._flush_logged()
            delay = self.flush_interval
        # Persist whatever was buffered since the last periodic flush.
        self._flush_logged()

    # ------------------------------------------------------------------
    # Flush
    # ------------------------------------------------------------------

    def _drain_buffers(self) -> Dict[str, List[dict]]:
        """Atomically take all buffered rows, leaving the buffers empty."""
        drained: Dict[str, List[dict]] = {}
        with self._lock:
            for asset, buf in self._history.items():
                if buf:
                    drained[asset] = list(buf)
                    buf.clear()
        return drained

    def _requeue(self, asset: str, rows: List[dict]):
        """Put *rows* back in front of anything buffered since they were drained.

        The deque bound still applies: if the buffer overflows, the oldest
        rows are the ones dropped.
        """
        with self._lock:
            buf = self._history[asset]
            newer = list(buf)
            buf.clear()
            buf.extend(rows)
            buf.extend(newer)

    def _flush(self):
        """Write every asset's buffered rows to one Parquet file each."""
        now = datetime.now(timezone.utc)
        date_str = now.strftime("%Y-%m-%d")
        # P1-10: microsecond-precision filename — no same-second collision
        ts_str = now.strftime("%H-%M-%S-%f")

        # Rows are removed from the buffers here so the next flush cannot
        # write them a second time.
        pending = self._drain_buffers()

        total_rows = 0
        for asset, rows in pending.items():
            try:
                self._write_parquet(asset, rows, date_str, ts_str)
            except Exception as exc:
                logger.error("OBF write failed for %s: %s", asset, exc, exc_info=True)
                # Keep the rows for the next flush instead of losing them.
                self._requeue(asset, rows)
            else:
                total_rows += len(rows)

        self._last_flush_ts = time.time()
        if total_rows > 0:
            logger.info("OBF flush: %d rows across %d assets", total_rows, len(self.assets))

        # P3-2: prune old partitions after each flush
        try:
            self._cleanup_old_partitions()
        except Exception as exc:
            logger.warning("OBF cleanup error: %s", exc)

    # ------------------------------------------------------------------
    # Parquet write (P2-2: column transpose)
    # ------------------------------------------------------------------

    def _write_parquet(self, asset: str, rows: List[dict], date_str: str, ts_str: str):
        """Write one Parquet file (Hive partitioned) + MD5 checksum.

        Each schema column is built with a single pass over *rows*; keys
        missing from a row are filled from _SENTINELS.  The file is written to
        a ``.tmp`` path and then moved into place, so readers never see a
        partial file.  Pattern inspired by prod/realtime_exf_service.py and
        backfill_ng3_to_arrow.py.

        Raises:
            Exception: whatever building or writing the table raises.  The
                temporary file is removed first and nothing is committed.
                A failure to write the ``.md5`` side-car is only logged,
                because the data file is already in place at that point.
        """
        out_dir = (
            self.base_dir
            / f"exchange={EXCHANGE}"
            / f"symbol={asset}"
            / f"date={date_str}"
        )
        out_dir.mkdir(parents=True, exist_ok=True)

        filename = f"part-{ts_str}.parquet"
        out_path = out_dir / filename
        tmp_path = out_path.with_suffix(".tmp")

        # P2-2: transpose list-of-dicts → dict-of-columns
        columns = {
            field.name: pa.array(
                [row.get(field.name, _SENTINELS[field.name]) for row in rows],
                type=field.type,
            )
            for field in OB_SCHEMA
        }
        table = pa.table(columns, schema=OB_SCHEMA)

        # Atomic write: tmp → replace
        try:
            pq.write_table(table, tmp_path, compression="snappy")
            tmp_path.replace(out_path)
        except Exception:
            try:
                if tmp_path.exists():
                    tmp_path.unlink()
            except OSError as cleanup_exc:
                logger.debug("OBF could not remove %s: %s", tmp_path, cleanup_exc)
            raise

        # MD5 checksum alongside.  Must not raise: the data file is committed,
        # and raising here would make the caller re-queue rows already on disk.
        try:
            digest = hashlib.md5(out_path.read_bytes()).hexdigest()
            (out_dir / f"{filename}.md5").write_text(digest)
        except OSError as exc:
            logger.warning("OBF checksum write failed for %s: %s", out_path, exc)

        self._files_written += 1
        self._rows_written += len(rows)
        logger.debug("OBF wrote %d rows → %s", len(rows), out_path)

    # ------------------------------------------------------------------
    # P3-2: old partition cleanup
    # ------------------------------------------------------------------

    def _cleanup_old_partitions(self):
        """Remove date partition directories older than MAX_FILE_AGE_DAYS. 0 = disabled."""
        if not MAX_FILE_AGE_DAYS or not self.base_dir.exists():
            return
        cutoff = datetime.now(timezone.utc).date() - timedelta(days=MAX_FILE_AGE_DAYS)
        for date_dir in self.base_dir.glob("**/date=*/"):
            # Before Python 3.11 a trailing separator in the pattern does not
            # restrict matches to directories.
            if not date_dir.is_dir():
                continue
            try:
                date_str = date_dir.name.split("=", 1)[1]
                dir_date = datetime.strptime(date_str, "%Y-%m-%d").date()
            except (IndexError, ValueError):
                continue  # malformed dir name — skip
            if dir_date >= cutoff:
                continue
            try:
                shutil.rmtree(date_dir)
            except OSError as exc:
                # One undeletable partition must not stop the others being pruned.
                logger.warning("OBF could not prune %s: %s", date_dir, exc)
            else:
                logger.info("OBF pruned old partition: %s", date_dir)
|