"""nautilus_prefect_flow.py — Prefect-supervised Nautilus BacktestEngine daily runner. This flow wraps the Nautilus BacktestEngine + DolphinActor inside a Prefect 3.x flow, providing: - Resilience: Prefect retries, structured error handling, HZ circuit-break checks - Observability: Prefect task logs, HZ heartbeat writes, per-run metrics - State continuity: capital/engine state persisted to HZ; restored on next run - Champion integrity: param hash verified at startup (prevents silent drift) Designed to run in siloqy-env on DOLPHIN (Linux): source /home/dolphin/siloqy_env/bin/activate PREFECT_API_URL=http://localhost:4200/api python prod/nautilus_prefect_flow.py Scheduling (daily at 00:10 UTC, 5 min after paper_trade_flow): python prod/nautilus_prefect_flow.py --register Architecture: dolphin_nautilus_flow ├── hz_probe_task # Verify HZ reachable; raise on hard failure ├── validate_champion_params # Hash-check engine params vs FROZEN spec ├── load_bar_data_task # Load vbt_cache_klines/ parquet for run_date ├── run_nautilus_backtest_task # BacktestEngine.run() in Nautilus kernel │ └─ DolphinActor.on_start() → MC-Forewarner wired (DolphinForewarner) │ └─ DolphinActor.on_bar() × N ├── write_hz_result_task # Persist result + state to HZ IMaps └── heartbeat_task # Write liveness/lag marker to DOLPHIN_HEARTBEAT Champion params (FROZEN — see blue.yml and champion spec): vel_div_threshold=-0.02 vel_div_extreme=-0.05 fixed_tp_pct=0.0095 max_hold_bars=120 fraction=0.20 min_leverage=0.5 max_leverage=5.0 abs_max_leverage=6.0 leverage_convexity=3.0 dc_lookback_bars=7 dc_min_magnitude_bps=0.75 min_irp_alignment=0.45 sp_maker_entry_rate=0.62 sp_maker_exit_rate=0.50 seed=42 """ from __future__ import annotations import json import hashlib import sys import time import logging import argparse import yaml from datetime import datetime, timedelta, date, timezone from pathlib import Path HCM_DIR = Path(__file__).parent.parent sys.path.insert(0, str(HCM_DIR / "nautilus_dolphin")) from prefect import flow, task, get_run_logger from prefect.schedules import Cron import pandas as pd # ── Constants ──────────────────────────────────────────────────────────────────── HZ_HOST = "localhost:5701" HZ_CLUSTER = "dolphin" HZ_HEARTBEAT_MAP = "DOLPHIN_HEARTBEAT" HZ_STATE_MAP = "DOLPHIN_STATE_BLUE" HZ_PNL_MAP = "DOLPHIN_PNL_BLUE" KLINES_DIR = HCM_DIR / "vbt_cache_klines" MC_MODELS_DIR = str(HCM_DIR / "nautilus_dolphin" / "mc_results" / "models") # Champion parameter fingerprint — SHA256 of sorted canonical param string. # Regenerate with: python -c "import prod.nautilus_prefect_flow as m; print(m._compute_champion_hash())" _CHAMPION_PARAMS = { # Engine mode — must be d_liq (GOLD: LiquidationGuardEngine 8x/9x + liq guard) "boost_mode": "d_liq", # Signal "vel_div_threshold": -0.02, "vel_div_extreme": -0.05, # Exit "fixed_tp_pct": 0.0095, "max_hold_bars": 120, # Sizing (note: max_leverage/abs_max_leverage are overridden to 8.0/9.0 # internally by d_liq mode — these values appear in blue.yml but are not # used by the running engine; D_LIQ_SOFT_CAP=8.0 / D_LIQ_ABS_CAP=9.0 apply) "fraction": 0.20, "min_leverage": 0.5, "max_leverage": 5.0, "abs_max_leverage": 6.0, "leverage_convexity": 3.0, # Direction confirm "dc_lookback_bars": 7, "dc_min_magnitude_bps": 0.75, # Asset selection "min_irp_alignment": 0.45, # Fees "sp_maker_entry_rate": 0.62, "sp_maker_exit_rate": 0.50, # RNG "seed": 42, } def _compute_champion_hash() -> str: canonical = json.dumps(_CHAMPION_PARAMS, sort_keys=True) return hashlib.sha256(canonical.encode()).hexdigest()[:16] _CHAMPION_HASH = _compute_champion_hash() # frozen at import time # ── Tasks ──────────────────────────────────────────────────────────────────────── @task(name="hz_probe", retries=3, retry_delay_seconds=10, timeout_seconds=30) def hz_probe_task(hz_host: str = HZ_HOST, hz_cluster: str = HZ_CLUSTER) -> bool: """Verify Hazelcast is reachable and responsive. Raises on hard failure after 3 retries so the flow aborts cleanly rather than running with no state persistence. """ log = get_run_logger() import hazelcast client = hazelcast.HazelcastClient( cluster_name=hz_cluster, cluster_members=[hz_host], connection_timeout=5.0, ) try: m = client.get_map(HZ_HEARTBEAT_MAP).blocking() m.put("probe_ts", str(time.time())) log.info(f"[HZ_PROBE] HZ reachable at {hz_host} (cluster={hz_cluster})") return True finally: client.shutdown() @task(name="validate_champion_params", retries=0, timeout_seconds=10) def validate_champion_params(config: dict) -> str: """Assert engine config matches frozen champion params. Compares every key in _CHAMPION_PARAMS against the live config dict. Returns the param hash on success; raises ValueError on any mismatch. """ log = get_run_logger() eng = config.get("engine", {}) mismatches = [] for k, expected in _CHAMPION_PARAMS.items(): actual = eng.get(k) if actual is None: # tolerate missing — DolphinActor supplies defaults that match continue if isinstance(expected, str): if str(actual) != expected: mismatches.append(f" {k}: expected={expected!r} got={actual!r}") elif abs(float(actual) - float(expected)) > 1e-9: mismatches.append(f" {k}: expected={expected!r} got={actual!r}") if mismatches: msg = "CHAMPION PARAM DRIFT DETECTED:\n" + "\n".join(mismatches) log.error(msg) raise ValueError(msg) log.info(f"[PARAMS] Champion params verified. hash={_CHAMPION_HASH}") return _CHAMPION_HASH @task(name="load_bar_data", retries=2, retry_delay_seconds=15, timeout_seconds=120) def load_bar_data_task(run_date: str, klines_dir: Path = KLINES_DIR) -> pd.DataFrame: """Load vbt_cache_klines/.parquet → DataFrame for replay. Returns empty DataFrame if file is absent (engine will skip the day). """ log = get_run_logger() parq = klines_dir / f"{run_date}.parquet" if not parq.exists(): log.warning(f"[DATA] Parquet not found for {run_date}: {parq}") return pd.DataFrame() df = pd.read_parquet(parq) log.info(f"[DATA] Loaded {len(df)} rows × {len(df.columns)} cols for {run_date}") if "vel_div" not in df.columns: log.error(f"[DATA] vel_div column missing from {parq} — aborting") return pd.DataFrame() valid = df["vel_div"].notna() df = df[valid].reset_index(drop=True) log.info(f"[DATA] {len(df)} valid rows after NaN vel_div drop") return df @task(name="run_nautilus_backtest", retries=0, timeout_seconds=600) def run_nautilus_backtest_task( run_date: str, config: dict, initial_capital: float, posture: str = "APEX", ) -> dict: """Run one day through the Nautilus BacktestEngine + DolphinActor. This task boots a full Nautilus kernel (BacktestEngine), registers DolphinActor as the strategy, injects synthetic bars representing the day's scan timestamps, then calls engine.run(). DolphinActor.on_bar() will: 1. Load vbt_cache_klines/ parquet (replay mode, live_mode=False) 2. Process each bar through NDAlphaEngine.step_bar() 3. Write results to HZ DOLPHIN_PNL_BLUE Returns a summary dict with pnl, trades, capital, posture. """ log = get_run_logger() t0 = time.time() # ── Nautilus imports (deferred to avoid top-level JIT cost) ───────────── from nautilus_trader.backtest.engine import BacktestEngine, BacktestEngineConfig from nautilus_trader.model.identifiers import Venue from nautilus_trader.model.data import Bar, BarType from nautilus_trader.model.objects import Price, Quantity, Money, Currency from nautilus_trader.model.enums import OmsType, AccountType from nautilus_trader.core.datetime import dt_to_unix_nanos from nautilus_trader.test_kit.providers import TestInstrumentProvider from nautilus_dolphin.nautilus.dolphin_actor import DolphinActor # ── Build actor config with posture + capital ──────────────────────────── actor_cfg = dict(config) actor_cfg.setdefault("paper_trade", {})["initial_capital"] = initial_capital actor_cfg["posture_override"] = posture actor_cfg["live_mode"] = False # replay mode: reads parquet internally # MC-Forewarner: explicit path so DolphinActor.on_start() wires the MC gate. # Gold-performance stack requires MC gate active — do NOT remove this line. actor_cfg["mc_models_dir"] = MC_MODELS_DIR # ── Boot BacktestEngine ────────────────────────────────────────────────── be_cfg = BacktestEngineConfig(trader_id="DOLPHIN-NAUTILUS-001") engine = BacktestEngine(config=be_cfg) actor = DolphinActor(config=actor_cfg) engine.add_strategy(actor) venue = Venue("BINANCE") usdt = Currency.from_str("USDT") engine.add_venue( venue=venue, oms_type=OmsType.HEDGING, account_type=AccountType.MARGIN, base_currency=usdt, starting_balances=[Money(initial_capital, usdt)], ) instrument = TestInstrumentProvider.default_fx_ccy("BTCUSD", venue) engine.add_instrument(instrument) # ── Generate synthetic bars for the day (one per 5s = 17,280/day). # DolphinActor.on_bar() uses ts_event to extract the date, then loads # the full day's parquet slice in _load_parquet_data(). # We inject exactly one bar at midnight to trigger a single begin_day() # call; DolphinActor then iterates over the full parquet rows internally. bar_type = BarType.from_str("BTCUSD.BINANCE-5-SECOND-LAST-EXTERNAL") dt_event = datetime.strptime(run_date, "%Y-%m-%d").replace( hour=0, minute=0, second=5, tzinfo=timezone.utc ) bars = [ Bar( bar_type=bar_type, open=Price.from_str("10000.00000"), high=Price.from_str("10000.00000"), low=Price.from_str("10000.00000"), close=Price.from_str("10000.00000"), volume=Quantity.from_str("1"), ts_event=dt_to_unix_nanos(dt_event), ts_init=dt_to_unix_nanos(dt_event), ) ] engine.add_data(bars) log.info(f"[NAUTILUS] Starting engine for {run_date} | posture={posture} | capital={initial_capital:,.2f}") try: engine.run() except Exception as e: log.error(f"[NAUTILUS] BacktestEngine.run() raised: {e}") raise elapsed = time.time() - t0 log.info(f"[NAUTILUS] Engine finished in {elapsed:.2f}s") # Collect summary from actor state result = { "date": run_date, "posture": posture, "capital": initial_capital, # updated below if actor exposes it "pnl": 0.0, "trades": 0, "stale_state_events": actor._stale_state_events, "processed_dates": list(actor._processed_dates), "elapsed_s": round(elapsed, 3), "engine": "nautilus_backtest", } # Pull capital from actor's engine if available if actor.engine is not None: result["capital"] = getattr(actor.engine, "capital", initial_capital) result["pnl"] = result["capital"] - initial_capital result["trades"] = len(getattr(actor.engine, "trade_history", [])) log.info( f"[NAUTILUS] {run_date}: PnL={result['pnl']:+.2f} | " f"T={result['trades']} | StaleEvents={result['stale_state_events']}" ) return result @task(name="write_hz_result", retries=3, retry_delay_seconds=5, timeout_seconds=30) def write_hz_result_task( result: dict, initial_capital: float, hz_host: str = HZ_HOST, hz_cluster: str = HZ_CLUSTER, state_map: str = HZ_STATE_MAP, pnl_map: str = HZ_PNL_MAP, ) -> None: """Write run result + updated state to Hazelcast. Creates its own client per call (serialization-safe across tasks). Persists both PnL entry and latest state for capital continuity. """ log = get_run_logger() import hazelcast run_date = result["date"] capital = float(result.get("capital", initial_capital)) state = { "strategy": "nautilus-blue", "capital": capital, "date": run_date, "pnl": result.get("pnl", 0.0), "trades": result.get("trades", 0), "posture": result.get("posture", "APEX"), "stale_state_events": result.get("stale_state_events", 0), "updated_at": datetime.now(timezone.utc).isoformat(), "engine": "nautilus_backtest", "param_hash": _CHAMPION_HASH, } client = hazelcast.HazelcastClient( cluster_name=hz_cluster, cluster_members=[hz_host] ) try: client.get_map(pnl_map).blocking().put(run_date, json.dumps(result)) client.get_map(state_map).blocking().put("latest_nautilus", json.dumps(state)) log.info( f"[HZ] Wrote result to {pnl_map}[{run_date}] and {state_map}[latest_nautilus]" ) finally: client.shutdown() @task(name="heartbeat", retries=0, timeout_seconds=15) def heartbeat_task( run_date: str, phase: str, extras: dict | None = None, hz_host: str = HZ_HOST, hz_cluster: str = HZ_CLUSTER, ) -> None: """Write a liveness heartbeat to DOLPHIN_HEARTBEAT. Called at flow start, after engine run, and at flow end so external monitoring dashboards can detect stalled runs (no heartbeat in >10 min). """ import hazelcast payload = { "ts": time.time(), "iso": datetime.now(timezone.utc).isoformat(), "run_date": run_date, "phase": phase, "flow": "nautilus_prefect", } if extras: payload.update(extras) client = hazelcast.HazelcastClient( cluster_name=hz_cluster, cluster_members=[hz_host], connection_timeout=5.0, ) try: client.get_map(HZ_HEARTBEAT_MAP).blocking().put( "nautilus_flow_heartbeat", json.dumps(payload) ) except Exception: pass # heartbeat failure is never fatal finally: client.shutdown() @task(name="read_posture_from_hz", retries=2, retry_delay_seconds=5, timeout_seconds=20) def read_posture_task( hz_host: str = HZ_HOST, hz_cluster: str = HZ_CLUSTER ) -> tuple[str, float]: """Read current posture + Rm from DOLPHIN_SAFETY. Returns (posture, Rm) — defaults to ('APEX', 1.0) if HZ unavailable. """ log = get_run_logger() import hazelcast client = hazelcast.HazelcastClient( cluster_name=hz_cluster, cluster_members=[hz_host] ) try: try: ref = client.get_cp_subsystem().get_atomic_reference("DOLPHIN_SAFETY").blocking() raw = ref.get() except Exception: m = client.get_map("DOLPHIN_SAFETY").blocking() raw = m.get("latest") if raw: state = json.loads(raw) posture = state.get("posture", "APEX") rm = float(state.get("Rm", 1.0)) log.info(f"[SAFETY] Posture={posture} Rm={rm:.3f}") return posture, rm except Exception as e: log.warning(f"[SAFETY] HZ read failed: {e} — defaulting APEX/1.0") finally: client.shutdown() return "APEX", 1.0 @task(name="restore_capital", retries=2, retry_delay_seconds=5, timeout_seconds=20) def restore_capital_task( initial_capital: float, hz_host: str = HZ_HOST, hz_cluster: str = HZ_CLUSTER, state_map: str = HZ_STATE_MAP, ) -> float: """Restore capital from latest HZ state. Falls back to config initial_capital if no prior state exists. """ log = get_run_logger() import hazelcast client = hazelcast.HazelcastClient( cluster_name=hz_cluster, cluster_members=[hz_host] ) try: raw = client.get_map(state_map).blocking().get("latest_nautilus") if raw: state = json.loads(raw) cap = float(state.get("capital", initial_capital)) log.info(f"[STATE] Restored capital={cap:,.2f} from HZ {state_map}") return cap except Exception as e: log.warning(f"[STATE] Capital restore failed: {e} — using config capital") finally: client.shutdown() return initial_capital # ── Flow ───────────────────────────────────────────────────────────────────────── @flow( name="dolphin-nautilus-backtest", log_prints=True, description=( "Daily Nautilus BacktestEngine run under Prefect supervision. " "Wraps DolphinActor with champion params, HZ heartbeats, and capital continuity." ), ) def dolphin_nautilus_flow( config_path: str = "prod/configs/blue.yml", run_date: str | None = None, dry_run: bool = False, ) -> dict: """Main Prefect flow: Nautilus BacktestEngine daily runner. Scheduled at 00:10 UTC (after paper_trade_flow at 00:05 UTC). Run manually for a specific date: dolphin_nautilus_flow(run_date="2026-03-21") """ log = get_run_logger() # Resolve target date (yesterday by default) target_date = run_date or (date.today() - timedelta(days=1)).isoformat() log.info(f"=== NAUTILUS BACKTEST FLOW: {target_date} ===") # Load strategy config cfg_path = Path(config_path) if not cfg_path.is_absolute(): cfg_path = HCM_DIR / cfg_path with open(cfg_path) as f: config = yaml.safe_load(f) initial_capital = float(config.get("paper_trade", {}).get("initial_capital", 25000.0)) hz_cfg = config.get("hazelcast", {}) state_map = hz_cfg.get("imap_state", HZ_STATE_MAP) pnl_map = hz_cfg.get("imap_pnl", HZ_PNL_MAP) # ── 1. HZ probe ────────────────────────────────────────────────────────── hz_ok = False try: hz_ok = hz_probe_task(HZ_HOST, HZ_CLUSTER) except Exception as e: log.warning(f"[HZ_PROBE] HZ unreachable after retries: {e}. Continuing without HZ.") # ── 2. Heartbeat: flow_start ───────────────────────────────────────────── if hz_ok: heartbeat_task(target_date, "flow_start") # ── 3. Champion param validation ───────────────────────────────────────── param_hash = validate_champion_params(config) # ── 4. Read posture ─────────────────────────────────────────────────────── posture, rm = "APEX", 1.0 if hz_ok: posture, rm = read_posture_task(HZ_HOST, HZ_CLUSTER) if posture == "HIBERNATE": log.warning(f"[POSTURE] HIBERNATE — skipping engine run for {target_date}") result = { "date": target_date, "posture": posture, "pnl": 0.0, "capital": initial_capital, "trades": 0, "skipped": True, } if hz_ok: write_hz_result_task(result, initial_capital, HZ_HOST, HZ_CLUSTER, state_map, pnl_map) heartbeat_task(target_date, "flow_end_hibernate") return result # ── 5. Restore capital ──────────────────────────────────────────────────── capital = initial_capital if hz_ok: capital = restore_capital_task(initial_capital, HZ_HOST, HZ_CLUSTER, state_map) # ── 6. Validate bar data exists ─────────────────────────────────────────── df = load_bar_data_task(target_date, KLINES_DIR) if df.empty: log.warning(f"[DATA] No scan data for {target_date} — skipping engine run") result = { "date": target_date, "posture": posture, "pnl": 0.0, "capital": capital, "trades": 0, "skipped": True, "reason": "no_data", } if hz_ok: heartbeat_task(target_date, "flow_end_no_data") return result if dry_run: log.info(f"[DRY_RUN] Data OK ({len(df)} rows). Engine not started.") return {"date": target_date, "dry_run": True, "rows": len(df)} # ── 7. Run Nautilus engine ──────────────────────────────────────────────── if hz_ok: heartbeat_task(target_date, "engine_start", {"capital": capital, "posture": posture}) result = run_nautilus_backtest_task( run_date=target_date, config=config, initial_capital=capital, posture=posture, ) result["param_hash"] = param_hash # ── 8. Persist result ───────────────────────────────────────────────────── if hz_ok: write_hz_result_task(result, capital, HZ_HOST, HZ_CLUSTER, state_map, pnl_map) heartbeat_task( target_date, "flow_end", {"pnl": result.get("pnl", 0.0), "trades": result.get("trades", 0)}, ) log.info( f"=== DONE: nautilus-blue {target_date} | " f"PnL={result.get('pnl', 0):+.2f} | " f"Capital={result.get('capital', capital):,.2f} | " f"Trades={result.get('trades', 0)} ===" ) return result # ── CLI entry point ─────────────────────────────────────────────────────────────── if __name__ == "__main__": parser = argparse.ArgumentParser(description="Dolphin Nautilus Prefect Flow") parser.add_argument("--config", default="prod/configs/blue.yml", help="Strategy YAML config") parser.add_argument("--date", default=None, help="YYYY-MM-DD (default: yesterday)") parser.add_argument("--register", action="store_true", help="Register Prefect deployment") parser.add_argument("--dry-run", action="store_true", help="Probe + validate only, no engine") args = parser.parse_args() import os os.environ.setdefault("PREFECT_API_URL", "http://localhost:4200/api") if args.register: abs_cfg = str(Path(args.config).resolve()) deployment = dolphin_nautilus_flow.to_deployment( name="dolphin-nautilus-blue", parameters={"config_path": abs_cfg}, schedule=Cron("10 0 * * *", timezone="UTC"), work_pool_name="dolphin", tags=["blue", "nautilus", "dolphin", "backtest"], ) deployment.apply() print("Registered: dolphin-nautilus-blue (daily 00:10 UTC)") else: dolphin_nautilus_flow( config_path=args.config, run_date=args.date, dry_run=args.dry_run, )