# Source: DOLPHIN/prod/nautilus_prefect_flow.py (599 lines, 24 KiB, Python)
# NOTE: the lines above this module's docstring were web-viewer paste
# scaffolding ("Files" / "Raw Normal View History"); converted to comments
# so the module parses as valid Python.
"""nautilus_prefect_flow.py — Prefect-supervised Nautilus BacktestEngine daily runner.
This flow wraps the Nautilus BacktestEngine + DolphinActor inside a Prefect 3.x
flow, providing:
- Resilience: Prefect retries, structured error handling, HZ circuit-break checks
- Observability: Prefect task logs, HZ heartbeat writes, per-run metrics
- State continuity: capital/engine state persisted to HZ; restored on next run
- Champion integrity: param hash verified at startup (prevents silent drift)
Designed to run in siloqy-env on DOLPHIN (Linux):
source /home/dolphin/siloqy_env/bin/activate
PREFECT_API_URL=http://localhost:4200/api python prod/nautilus_prefect_flow.py
Scheduling (daily at 00:10 UTC, 5 min after paper_trade_flow):
python prod/nautilus_prefect_flow.py --register
Architecture:
dolphin_nautilus_flow
hz_probe_task # Verify HZ reachable; raise on hard failure
validate_champion_params # Hash-check engine params vs FROZEN spec
load_bar_data_task # Load vbt_cache_klines/ parquet for run_date
run_nautilus_backtest_task # BacktestEngine.run() in Nautilus kernel
DolphinActor.on_start() MC-Forewarner wired (DolphinForewarner)
DolphinActor.on_bar() × N
write_hz_result_task # Persist result + state to HZ IMaps
heartbeat_task # Write liveness/lag marker to DOLPHIN_HEARTBEAT
Champion params (FROZEN see blue.yml and champion spec):
vel_div_threshold=-0.02 vel_div_extreme=-0.05 fixed_tp_pct=0.0095
max_hold_bars=120 fraction=0.20 min_leverage=0.5
max_leverage=5.0 abs_max_leverage=6.0 leverage_convexity=3.0
dc_lookback_bars=7 dc_min_magnitude_bps=0.75 min_irp_alignment=0.45
sp_maker_entry_rate=0.62 sp_maker_exit_rate=0.50 seed=42
"""
from __future__ import annotations
import json
import hashlib
import sys
import time
import logging
import argparse
import yaml
from datetime import datetime, timedelta, date, timezone
from pathlib import Path
HCM_DIR = Path(__file__).parent.parent
sys.path.insert(0, str(HCM_DIR / "nautilus_dolphin"))
from prefect import flow, task, get_run_logger
from prefect.schedules import Cron
import pandas as pd
# ── Constants ────────────────────────────────────────────────────────────────────
HZ_HOST = "localhost:5701"
HZ_CLUSTER = "dolphin"
HZ_HEARTBEAT_MAP = "DOLPHIN_HEARTBEAT"
HZ_STATE_MAP = "DOLPHIN_STATE_BLUE"
HZ_PNL_MAP = "DOLPHIN_PNL_BLUE"
KLINES_DIR = HCM_DIR / "vbt_cache_klines"
MC_MODELS_DIR = str(HCM_DIR / "nautilus_dolphin" / "mc_results" / "models")
# Champion parameter fingerprint — SHA256 of sorted canonical param string.
# Regenerate with: python -c "import prod.nautilus_prefect_flow as m; print(m._compute_champion_hash())"
# Frozen champion parameter spec. validate_champion_params() compares each key
# against config["engine"]; _compute_champion_hash() fingerprints this dict.
# Do not edit values without re-freezing the champion spec (see blue.yml).
_CHAMPION_PARAMS = {
    # Engine mode — must be d_liq (GOLD: LiquidationGuardEngine 8x/9x + liq guard)
    "boost_mode": "d_liq",
    # Signal: velocity-divergence entry thresholds (normal / extreme)
    "vel_div_threshold": -0.02,
    "vel_div_extreme": -0.05,
    # Exit: fixed take-profit fraction and maximum bars to hold a position
    "fixed_tp_pct": 0.0095,
    "max_hold_bars": 120,
    # Sizing (note: max_leverage/abs_max_leverage are overridden to 8.0/9.0
    # internally by d_liq mode — these values appear in blue.yml but are not
    # used by the running engine; D_LIQ_SOFT_CAP=8.0 / D_LIQ_ABS_CAP=9.0 apply)
    "fraction": 0.20,
    "min_leverage": 0.5,
    "max_leverage": 5.0,
    "abs_max_leverage": 6.0,
    "leverage_convexity": 3.0,
    # Direction confirm: lookback window and minimum move (basis points)
    "dc_lookback_bars": 7,
    "dc_min_magnitude_bps": 0.75,
    # Asset selection: minimum IRP alignment score
    "min_irp_alignment": 0.45,
    # Fees: assumed maker-fill rates on entry/exit legs
    "sp_maker_entry_rate": 0.62,
    "sp_maker_exit_rate": 0.50,
    # RNG seed for reproducible runs
    "seed": 42,
}
def _compute_champion_hash() -> str:
    """Return a 16-hex-char SHA-256 fingerprint of the champion params.

    The dict is serialized as sorted-key JSON so the fingerprint is stable
    regardless of key insertion order.
    """
    blob = json.dumps(_CHAMPION_PARAMS, sort_keys=True).encode()
    return hashlib.sha256(blob).hexdigest()[:16]


_CHAMPION_HASH = _compute_champion_hash()  # frozen at import time
# ── Tasks ────────────────────────────────────────────────────────────────────────
@task(name="hz_probe", retries=3, retry_delay_seconds=10, timeout_seconds=30)
def hz_probe_task(hz_host: str = HZ_HOST, hz_cluster: str = HZ_CLUSTER) -> bool:
    """Verify Hazelcast is reachable and responsive.

    Performs a round-trip write to the heartbeat map. Raises on hard failure
    after 3 retries so the flow aborts cleanly rather than running with no
    state persistence.
    """
    logger = get_run_logger()
    import hazelcast
    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster,
        cluster_members=[hz_host],
        connection_timeout=5.0,
    )
    try:
        heartbeat_map = hz_client.get_map(HZ_HEARTBEAT_MAP).blocking()
        heartbeat_map.put("probe_ts", str(time.time()))
        logger.info(f"[HZ_PROBE] HZ reachable at {hz_host} (cluster={hz_cluster})")
        return True
    finally:
        # Always release the client, even if the probe write raised.
        hz_client.shutdown()
@task(name="validate_champion_params", retries=0, timeout_seconds=10)
def validate_champion_params(config: dict) -> str:
    """Assert engine config matches frozen champion params.

    Compares every key in _CHAMPION_PARAMS against config["engine"]. Missing
    keys are tolerated (DolphinActor supplies defaults that match). String
    params must compare equal exactly; numeric params must agree within 1e-9.
    A value that cannot be coerced to float (e.g. a string or list where a
    number is expected) is reported as drift instead of crashing the task
    with an unlogged TypeError/ValueError.

    Args:
        config: Full strategy config dict; the "engine" sub-dict is checked.

    Returns:
        The frozen champion param hash on success.

    Raises:
        ValueError: on any mismatch, with a per-key drift report.
    """
    log = get_run_logger()
    eng = config.get("engine", {})
    mismatches = []
    for k, expected in _CHAMPION_PARAMS.items():
        actual = eng.get(k)
        if actual is None:
            # tolerate missing — DolphinActor supplies defaults that match
            continue
        if isinstance(expected, str):
            if str(actual) != expected:
                mismatches.append(f" {k}: expected={expected!r} got={actual!r}")
            continue
        try:
            drifted = abs(float(actual) - float(expected)) > 1e-9
        except (TypeError, ValueError):
            # Non-numeric value for a numeric param: count it as drift so the
            # operator gets a readable report rather than a conversion traceback.
            drifted = True
        if drifted:
            mismatches.append(f" {k}: expected={expected!r} got={actual!r}")
    if mismatches:
        msg = "CHAMPION PARAM DRIFT DETECTED:\n" + "\n".join(mismatches)
        log.error(msg)
        raise ValueError(msg)
    log.info(f"[PARAMS] Champion params verified. hash={_CHAMPION_HASH}")
    return _CHAMPION_HASH
@task(name="load_bar_data", retries=2, retry_delay_seconds=15, timeout_seconds=120)
def load_bar_data_task(run_date: str, klines_dir: Path = KLINES_DIR) -> pd.DataFrame:
    """Load the day's kline parquet for replay.

    Reads klines_dir/<run_date>.parquet, then drops rows whose vel_div is
    NaN. Returns an empty DataFrame when the file is absent or the vel_div
    column is missing, so the caller can skip the day gracefully.
    """
    log = get_run_logger()
    parquet_path = klines_dir / f"{run_date}.parquet"
    if not parquet_path.exists():
        log.warning(f"[DATA] Parquet not found for {run_date}: {parquet_path}")
        return pd.DataFrame()
    frame = pd.read_parquet(parquet_path)
    log.info(f"[DATA] Loaded {len(frame)} rows × {len(frame.columns)} cols for {run_date}")
    if "vel_div" not in frame.columns:
        log.error(f"[DATA] vel_div column missing from {parquet_path} — aborting")
        return pd.DataFrame()
    # Keep only rows with a computed vel_div signal; renumber the index.
    frame = frame[frame["vel_div"].notna()].reset_index(drop=True)
    log.info(f"[DATA] {len(frame)} valid rows after NaN vel_div drop")
    return frame
@task(name="run_nautilus_backtest", retries=0, timeout_seconds=600)
def run_nautilus_backtest_task(
    run_date: str,
    config: dict,
    initial_capital: float,
    posture: str = "APEX",
) -> dict:
    """Run one day through the Nautilus BacktestEngine + DolphinActor.

    This task boots a full Nautilus kernel (BacktestEngine), registers
    DolphinActor as the strategy, injects a single synthetic bar for the
    day, then calls engine.run().

    DolphinActor.on_bar() will:
      1. Load vbt_cache_klines/ parquet (replay mode, live_mode=False)
      2. Process each bar through NDAlphaEngine.step_bar()
      3. Write results to HZ DOLPHIN_PNL_BLUE

    Args:
        run_date: ISO date (YYYY-MM-DD) whose parquet slice the actor replays.
        config: Full strategy config dict; shallow-copied into the actor config.
        initial_capital: Starting capital — injected into the actor config and
            into the simulated margin account balance.
        posture: Safety posture forwarded to the actor via posture_override.

    Returns:
        Summary dict with date, posture, capital, pnl, trades,
        stale_state_events, processed_dates, elapsed_s, engine.
    """
    log = get_run_logger()
    t0 = time.time()
    # ── Nautilus imports (deferred to avoid top-level JIT cost) ─────────────
    from nautilus_trader.backtest.engine import BacktestEngine, BacktestEngineConfig
    from nautilus_trader.model.identifiers import Venue
    from nautilus_trader.model.data import Bar, BarType
    from nautilus_trader.model.objects import Price, Quantity, Money, Currency
    from nautilus_trader.model.enums import OmsType, AccountType
    from nautilus_trader.core.datetime import dt_to_unix_nanos
    from nautilus_trader.test_kit.providers import TestInstrumentProvider
    from nautilus_dolphin.nautilus.dolphin_actor import DolphinActor
    # ── Build actor config with posture + capital ────────────────────────────
    # NOTE(review): dict(config) is a shallow copy — nested dicts (e.g.
    # "paper_trade") remain shared with the caller's config; the mutation
    # below therefore also updates the original config dict. Confirm intended.
    actor_cfg = dict(config)
    actor_cfg.setdefault("paper_trade", {})["initial_capital"] = initial_capital
    actor_cfg["posture_override"] = posture
    actor_cfg["live_mode"] = False  # replay mode: reads parquet internally
    # MC-Forewarner: explicit path so DolphinActor.on_start() wires the MC gate.
    # Gold-performance stack requires MC gate active — do NOT remove this line.
    actor_cfg["mc_models_dir"] = MC_MODELS_DIR
    # ── Boot BacktestEngine ──────────────────────────────────────────────────
    be_cfg = BacktestEngineConfig(trader_id="DOLPHIN-NAUTILUS-001")
    engine = BacktestEngine(config=be_cfg)
    actor = DolphinActor(config=actor_cfg)
    # NOTE(review): DolphinActor is registered via add_strategy(); confirm the
    # pinned Nautilus version accepts an actor here (vs engine.add_actor()).
    engine.add_strategy(actor)
    venue = Venue("BINANCE")
    usdt = Currency.from_str("USDT")
    engine.add_venue(
        venue=venue,
        oms_type=OmsType.HEDGING,
        account_type=AccountType.MARGIN,
        base_currency=usdt,
        starting_balances=[Money(initial_capital, usdt)],
    )
    # Stand-in instrument: prices are synthetic; DolphinActor loads the real
    # per-symbol data from parquet itself.
    instrument = TestInstrumentProvider.default_fx_ccy("BTCUSD", venue)
    engine.add_instrument(instrument)
    # ── Generate synthetic bars for the day (one per 5s = 17,280/day).
    #    DolphinActor.on_bar() uses ts_event to extract the date, then loads
    #    the full day's parquet slice in _load_parquet_data().
    #    We inject exactly one bar at midnight to trigger a single begin_day()
    #    call; DolphinActor then iterates over the full parquet rows internally.
    bar_type = BarType.from_str("BTCUSD.BINANCE-5-SECOND-LAST-EXTERNAL")
    dt_event = datetime.strptime(run_date, "%Y-%m-%d").replace(
        hour=0, minute=0, second=5, tzinfo=timezone.utc
    )
    bars = [
        Bar(
            bar_type=bar_type,
            open=Price.from_str("10000.00000"),
            high=Price.from_str("10000.00000"),
            low=Price.from_str("10000.00000"),
            close=Price.from_str("10000.00000"),
            volume=Quantity.from_str("1"),
            ts_event=dt_to_unix_nanos(dt_event),
            ts_init=dt_to_unix_nanos(dt_event),
        )
    ]
    engine.add_data(bars)
    log.info(f"[NAUTILUS] Starting engine for {run_date} | posture={posture} | capital={initial_capital:,.2f}")
    try:
        engine.run()
    except Exception as e:
        # Log before re-raising so the Prefect task run records the cause.
        log.error(f"[NAUTILUS] BacktestEngine.run() raised: {e}")
        raise
    elapsed = time.time() - t0
    log.info(f"[NAUTILUS] Engine finished in {elapsed:.2f}s")
    # Collect summary from actor state
    result = {
        "date": run_date,
        "posture": posture,
        "capital": initial_capital,  # updated below if actor exposes it
        "pnl": 0.0,
        "trades": 0,
        # Private actor attributes read directly — presumably populated by
        # DolphinActor during the run; verify against DolphinActor source.
        "stale_state_events": actor._stale_state_events,
        "processed_dates": list(actor._processed_dates),
        "elapsed_s": round(elapsed, 3),
        "engine": "nautilus_backtest",
    }
    # Pull capital from actor's engine if available
    if actor.engine is not None:
        result["capital"] = getattr(actor.engine, "capital", initial_capital)
        result["pnl"] = result["capital"] - initial_capital
        result["trades"] = len(getattr(actor.engine, "trade_history", []))
    log.info(
        f"[NAUTILUS] {run_date}: PnL={result['pnl']:+.2f} | "
        f"T={result['trades']} | StaleEvents={result['stale_state_events']}"
    )
    return result
@task(name="write_hz_result", retries=3, retry_delay_seconds=5, timeout_seconds=30)
def write_hz_result_task(
    result: dict,
    initial_capital: float,
    hz_host: str = HZ_HOST,
    hz_cluster: str = HZ_CLUSTER,
    state_map: str = HZ_STATE_MAP,
    pnl_map: str = HZ_PNL_MAP,
) -> None:
    """Persist the run result and continuity state to Hazelcast.

    A fresh client is created per call (serialization-safe across tasks).
    Writes the raw result under pnl_map[run_date] and a compact state record
    under state_map["latest_nautilus"] for next-run capital continuity.
    """
    log = get_run_logger()
    import hazelcast
    run_date = result["date"]
    capital = float(result.get("capital", initial_capital))
    state_record = {
        "strategy": "nautilus-blue",
        "capital": capital,
        "date": run_date,
        "pnl": result.get("pnl", 0.0),
        "trades": result.get("trades", 0),
        "posture": result.get("posture", "APEX"),
        "stale_state_events": result.get("stale_state_events", 0),
        "updated_at": datetime.now(timezone.utc).isoformat(),
        "engine": "nautilus_backtest",
        "param_hash": _CHAMPION_HASH,
    }
    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster, cluster_members=[hz_host]
    )
    try:
        pnl = hz_client.get_map(pnl_map).blocking()
        pnl.put(run_date, json.dumps(result))
        latest = hz_client.get_map(state_map).blocking()
        latest.put("latest_nautilus", json.dumps(state_record))
        log.info(
            f"[HZ] Wrote result to {pnl_map}[{run_date}] and {state_map}[latest_nautilus]"
        )
    finally:
        hz_client.shutdown()
@task(name="heartbeat", retries=0, timeout_seconds=15)
def heartbeat_task(
    run_date: str,
    phase: str,
    extras: dict | None = None,
    hz_host: str = HZ_HOST,
    hz_cluster: str = HZ_CLUSTER,
) -> None:
    """Write a liveness heartbeat to DOLPHIN_HEARTBEAT.

    Called at flow start, after the engine run, and at flow end so external
    monitoring dashboards can detect stalled runs (no heartbeat in >10 min).
    Heartbeat write failures are swallowed — monitoring must never kill a run.
    """
    import hazelcast
    payload = {
        "ts": time.time(),
        "iso": datetime.now(timezone.utc).isoformat(),
        "run_date": run_date,
        "phase": phase,
        "flow": "nautilus_prefect",
        **(extras or {}),
    }
    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster, cluster_members=[hz_host],
        connection_timeout=5.0,
    )
    try:
        beat_map = hz_client.get_map(HZ_HEARTBEAT_MAP).blocking()
        beat_map.put("nautilus_flow_heartbeat", json.dumps(payload))
    except Exception:
        pass  # heartbeat failure is never fatal
    finally:
        hz_client.shutdown()
@task(name="read_posture_from_hz", retries=2, retry_delay_seconds=5, timeout_seconds=20)
def read_posture_task(
    hz_host: str = HZ_HOST, hz_cluster: str = HZ_CLUSTER
) -> tuple[str, float]:
    """Read current posture + Rm from DOLPHIN_SAFETY.

    Prefers the CP-subsystem atomic reference; on any failure there, falls
    back to the DOLPHIN_SAFETY IMap key "latest". Returns ("APEX", 1.0) when
    HZ is unreachable or no state is stored.
    """
    log = get_run_logger()
    import hazelcast
    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster, cluster_members=[hz_host]
    )
    try:
        try:
            safety_ref = hz_client.get_cp_subsystem().get_atomic_reference("DOLPHIN_SAFETY").blocking()
            raw = safety_ref.get()
        except Exception:
            # CP subsystem unavailable/disabled — fall back to the plain IMap.
            raw = hz_client.get_map("DOLPHIN_SAFETY").blocking().get("latest")
        if raw:
            safety = json.loads(raw)
            posture = safety.get("posture", "APEX")
            rm = float(safety.get("Rm", 1.0))
            log.info(f"[SAFETY] Posture={posture} Rm={rm:.3f}")
            return posture, rm
    except Exception as e:
        log.warning(f"[SAFETY] HZ read failed: {e} — defaulting APEX/1.0")
    finally:
        hz_client.shutdown()
    return "APEX", 1.0
@task(name="restore_capital", retries=2, retry_delay_seconds=5, timeout_seconds=20)
def restore_capital_task(
    initial_capital: float,
    hz_host: str = HZ_HOST,
    hz_cluster: str = HZ_CLUSTER,
    state_map: str = HZ_STATE_MAP,
) -> float:
    """Restore capital from the latest HZ state record.

    Reads state_map["latest_nautilus"]; falls back to the configured
    initial_capital when no prior state exists or HZ is unreachable.
    """
    log = get_run_logger()
    import hazelcast
    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster, cluster_members=[hz_host]
    )
    try:
        raw = hz_client.get_map(state_map).blocking().get("latest_nautilus")
        if raw:
            restored = float(json.loads(raw).get("capital", initial_capital))
            log.info(f"[STATE] Restored capital={restored:,.2f} from HZ {state_map}")
            return restored
    except Exception as e:
        log.warning(f"[STATE] Capital restore failed: {e} — using config capital")
    finally:
        hz_client.shutdown()
    return initial_capital
# ── Flow ─────────────────────────────────────────────────────────────────────────
@flow(
    name="dolphin-nautilus-backtest",
    log_prints=True,
    description=(
        "Daily Nautilus BacktestEngine run under Prefect supervision. "
        "Wraps DolphinActor with champion params, HZ heartbeats, and capital continuity."
    ),
)
def dolphin_nautilus_flow(
    config_path: str = "prod/configs/blue.yml",
    run_date: str | None = None,
    dry_run: bool = False,
) -> dict:
    """Main Prefect flow: Nautilus BacktestEngine daily runner.

    Scheduled at 00:10 UTC (after paper_trade_flow at 00:05 UTC).
    Run manually for a specific date:
        dolphin_nautilus_flow(run_date="2026-03-21")

    Args:
        config_path: Strategy YAML path; resolved relative to HCM_DIR when
            not absolute.
        run_date: ISO date to process. Defaults to yesterday via
            date.today() — NOTE(review): local "today", not UTC; confirm the
            host clock runs in UTC so the default matches the intended day.
        dry_run: When True, stops after data validation (no engine run).

    Returns:
        Summary dict; may carry skipped=True or dry_run=True markers.
    """
    log = get_run_logger()
    # Resolve target date (yesterday by default)
    target_date = run_date or (date.today() - timedelta(days=1)).isoformat()
    log.info(f"=== NAUTILUS BACKTEST FLOW: {target_date} ===")
    # Load strategy config
    cfg_path = Path(config_path)
    if not cfg_path.is_absolute():
        cfg_path = HCM_DIR / cfg_path
    with open(cfg_path) as f:
        config = yaml.safe_load(f)
    initial_capital = float(config.get("paper_trade", {}).get("initial_capital", 25000.0))
    hz_cfg = config.get("hazelcast", {})
    state_map = hz_cfg.get("imap_state", HZ_STATE_MAP)
    pnl_map = hz_cfg.get("imap_pnl", HZ_PNL_MAP)
    # ── 1. HZ probe ──────────────────────────────────────────────────────────
    # HZ being down is degraded-but-tolerated: the flow continues without
    # persistence, heartbeats, or posture reads.
    hz_ok = False
    try:
        hz_ok = hz_probe_task(HZ_HOST, HZ_CLUSTER)
    except Exception as e:
        log.warning(f"[HZ_PROBE] HZ unreachable after retries: {e}. Continuing without HZ.")
    # ── 2. Heartbeat: flow_start ─────────────────────────────────────────────
    if hz_ok:
        heartbeat_task(target_date, "flow_start")
    # ── 3. Champion param validation ─────────────────────────────────────────
    # Raises on any drift from the frozen champion spec, aborting the flow.
    param_hash = validate_champion_params(config)
    # ── 4. Read posture ───────────────────────────────────────────────────────
    # rm is read but not used further in this flow.
    posture, rm = "APEX", 1.0
    if hz_ok:
        posture, rm = read_posture_task(HZ_HOST, HZ_CLUSTER)
    if posture == "HIBERNATE":
        log.warning(f"[POSTURE] HIBERNATE — skipping engine run for {target_date}")
        result = {
            "date": target_date, "posture": posture, "pnl": 0.0,
            "capital": initial_capital, "trades": 0, "skipped": True,
        }
        if hz_ok:
            write_hz_result_task(result, initial_capital, HZ_HOST, HZ_CLUSTER, state_map, pnl_map)
            heartbeat_task(target_date, "flow_end_hibernate")
        return result
    # ── 5. Restore capital ────────────────────────────────────────────────────
    capital = initial_capital
    if hz_ok:
        capital = restore_capital_task(initial_capital, HZ_HOST, HZ_CLUSTER, state_map)
    # ── 6. Validate bar data exists ───────────────────────────────────────────
    df = load_bar_data_task(target_date, KLINES_DIR)
    if df.empty:
        log.warning(f"[DATA] No scan data for {target_date} — skipping engine run")
        result = {
            "date": target_date, "posture": posture, "pnl": 0.0,
            "capital": capital, "trades": 0, "skipped": True, "reason": "no_data",
        }
        # NOTE(review): unlike the HIBERNATE branch, this skip result is NOT
        # persisted via write_hz_result_task — only a heartbeat is written.
        # Confirm the asymmetry is intentional.
        if hz_ok:
            heartbeat_task(target_date, "flow_end_no_data")
        return result
    if dry_run:
        log.info(f"[DRY_RUN] Data OK ({len(df)} rows). Engine not started.")
        return {"date": target_date, "dry_run": True, "rows": len(df)}
    # ── 7. Run Nautilus engine ────────────────────────────────────────────────
    if hz_ok:
        heartbeat_task(target_date, "engine_start", {"capital": capital, "posture": posture})
    result = run_nautilus_backtest_task(
        run_date=target_date,
        config=config,
        initial_capital=capital,
        posture=posture,
    )
    result["param_hash"] = param_hash
    # ── 8. Persist result ─────────────────────────────────────────────────────
    if hz_ok:
        write_hz_result_task(result, capital, HZ_HOST, HZ_CLUSTER, state_map, pnl_map)
        heartbeat_task(
            target_date, "flow_end",
            {"pnl": result.get("pnl", 0.0), "trades": result.get("trades", 0)},
        )
    log.info(
        f"=== DONE: nautilus-blue {target_date} | "
        f"PnL={result.get('pnl', 0):+.2f} | "
        f"Capital={result.get('capital', capital):,.2f} | "
        f"Trades={result.get('trades', 0)} ==="
    )
    return result
# ── CLI entry point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # CLI entry point: either register the Prefect deployment (--register)
    # or run the flow once for the given/default date.
    cli = argparse.ArgumentParser(description="Dolphin Nautilus Prefect Flow")
    cli.add_argument("--config", default="prod/configs/blue.yml", help="Strategy YAML config")
    cli.add_argument("--date", default=None, help="YYYY-MM-DD (default: yesterday)")
    cli.add_argument("--register", action="store_true", help="Register Prefect deployment")
    cli.add_argument("--dry-run", action="store_true", help="Probe + validate only, no engine")
    opts = cli.parse_args()
    import os
    os.environ.setdefault("PREFECT_API_URL", "http://localhost:4200/api")
    if not opts.register:
        dolphin_nautilus_flow(
            config_path=opts.config,
            run_date=opts.date,
            dry_run=opts.dry_run,
        )
    else:
        dep = dolphin_nautilus_flow.to_deployment(
            name="dolphin-nautilus-blue",
            parameters={"config_path": str(Path(opts.config).resolve())},
            schedule=Cron("10 0 * * *", timezone="UTC"),
            work_pool_name="dolphin",
            tags=["blue", "nautilus", "dolphin", "backtest"],
        )
        dep.apply()
        print("Registered: dolphin-nautilus-blue (daily 00:10 UTC)")