Files
DOLPHIN/prod/nautilus_prefect_flow.py
hjnormey 01c19662cb initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
2026-04-21 16:58:38 +02:00

599 lines
24 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""nautilus_prefect_flow.py — Prefect-supervised Nautilus BacktestEngine daily runner.
This flow wraps the Nautilus BacktestEngine + DolphinActor inside a Prefect 3.x
flow, providing:
- Resilience: Prefect retries, structured error handling, HZ circuit-break checks
- Observability: Prefect task logs, HZ heartbeat writes, per-run metrics
- State continuity: capital/engine state persisted to HZ; restored on next run
- Champion integrity: param hash verified at startup (prevents silent drift)
Designed to run in siloqy-env on DOLPHIN (Linux):
source /home/dolphin/siloqy_env/bin/activate
PREFECT_API_URL=http://localhost:4200/api python prod/nautilus_prefect_flow.py
Scheduling (daily at 00:10 UTC, 5 min after paper_trade_flow):
python prod/nautilus_prefect_flow.py --register
Architecture:
dolphin_nautilus_flow
├── hz_probe_task # Verify HZ reachable; raise on hard failure
├── validate_champion_params # Hash-check engine params vs FROZEN spec
├── load_bar_data_task # Load vbt_cache_klines/ parquet for run_date
├── run_nautilus_backtest_task # BacktestEngine.run() in Nautilus kernel
│ └─ DolphinActor.on_start() → MC-Forewarner wired (DolphinForewarner)
│ └─ DolphinActor.on_bar() × N
├── write_hz_result_task # Persist result + state to HZ IMaps
└── heartbeat_task # Write liveness/lag marker to DOLPHIN_HEARTBEAT
Champion params (FROZEN — see blue.yml and champion spec):
vel_div_threshold=-0.02 vel_div_extreme=-0.05 fixed_tp_pct=0.0095
max_hold_bars=120 fraction=0.20 min_leverage=0.5
max_leverage=5.0 abs_max_leverage=6.0 leverage_convexity=3.0
dc_lookback_bars=7 dc_min_magnitude_bps=0.75 min_irp_alignment=0.45
sp_maker_entry_rate=0.62 sp_maker_exit_rate=0.50 seed=42
"""
from __future__ import annotations
import json
import hashlib
import sys
import time
import logging
import argparse
import yaml
from datetime import datetime, timedelta, date, timezone
from pathlib import Path
HCM_DIR = Path(__file__).parent.parent
sys.path.insert(0, str(HCM_DIR / "nautilus_dolphin"))
from prefect import flow, task, get_run_logger
from prefect.schedules import Cron
import pandas as pd
# ── Constants ────────────────────────────────────────────────────────────────────
# Hazelcast endpoint + IMap names shared by every task below.
HZ_HOST = "localhost:5701"  # host:port of the local HZ member
HZ_CLUSTER = "dolphin"  # HZ cluster name
HZ_HEARTBEAT_MAP = "DOLPHIN_HEARTBEAT"  # liveness markers (hz_probe_task / heartbeat_task)
HZ_STATE_MAP = "DOLPHIN_STATE_BLUE"  # capital-continuity state (restore_capital_task / write_hz_result_task)
HZ_PNL_MAP = "DOLPHIN_PNL_BLUE"  # per-day run results (write_hz_result_task)
KLINES_DIR = HCM_DIR / "vbt_cache_klines"  # parquet bar data, one <YYYY-MM-DD>.parquet per day
MC_MODELS_DIR = str(HCM_DIR / "nautilus_dolphin" / "mc_results" / "models")  # MC-Forewarner model dir (str for config dict)
# Champion parameter fingerprint — SHA256 of sorted canonical param string.
# Regenerate with: python -c "import prod.nautilus_prefect_flow as m; print(m._compute_champion_hash())"
_CHAMPION_PARAMS = {
# Engine mode — must be d_liq (GOLD: LiquidationGuardEngine 8x/9x + liq guard)
"boost_mode": "d_liq",
# Signal
"vel_div_threshold": -0.02,
"vel_div_extreme": -0.05,
# Exit
"fixed_tp_pct": 0.0095,
"max_hold_bars": 120,
# Sizing (note: max_leverage/abs_max_leverage are overridden to 8.0/9.0
# internally by d_liq mode — these values appear in blue.yml but are not
# used by the running engine; D_LIQ_SOFT_CAP=8.0 / D_LIQ_ABS_CAP=9.0 apply)
"fraction": 0.20,
"min_leverage": 0.5,
"max_leverage": 5.0,
"abs_max_leverage": 6.0,
"leverage_convexity": 3.0,
# Direction confirm
"dc_lookback_bars": 7,
"dc_min_magnitude_bps": 0.75,
# Asset selection
"min_irp_alignment": 0.45,
# Fees
"sp_maker_entry_rate": 0.62,
"sp_maker_exit_rate": 0.50,
# RNG
"seed": 42,
}
def _compute_champion_hash() -> str:
canonical = json.dumps(_CHAMPION_PARAMS, sort_keys=True)
return hashlib.sha256(canonical.encode()).hexdigest()[:16]
_CHAMPION_HASH = _compute_champion_hash() # frozen at import time
# ── Tasks ────────────────────────────────────────────────────────────────────────
@task(name="hz_probe", retries=3, retry_delay_seconds=10, timeout_seconds=30)
def hz_probe_task(hz_host: str = HZ_HOST, hz_cluster: str = HZ_CLUSTER) -> bool:
    """Round-trip a probe write to Hazelcast to prove it is reachable.

    Writes the current timestamp under "probe_ts" in the heartbeat map.
    Raises on hard failure (after Prefect's 3 retries) so the flow aborts
    cleanly rather than running with no state persistence.
    """
    logger = get_run_logger()
    import hazelcast

    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster,
        cluster_members=[hz_host],
        connection_timeout=5.0,
    )
    try:
        heartbeat_map = hz_client.get_map(HZ_HEARTBEAT_MAP).blocking()
        heartbeat_map.put("probe_ts", str(time.time()))
        logger.info(f"[HZ_PROBE] HZ reachable at {hz_host} (cluster={hz_cluster})")
        return True
    finally:
        hz_client.shutdown()
@task(name="validate_champion_params", retries=0, timeout_seconds=10)
def validate_champion_params(config: dict) -> str:
    """Verify the live engine config against the frozen champion parameters.

    Every key of _CHAMPION_PARAMS that is present in config["engine"] must
    agree: exact match for strings, 1e-9 tolerance for numeric values.  Keys
    absent from the config are tolerated (DolphinActor supplies matching
    defaults).

    Returns the frozen param hash on success; raises ValueError on any drift.
    """
    logger = get_run_logger()
    engine_cfg = config.get("engine", {})
    drift: list[str] = []
    for key, want in _CHAMPION_PARAMS.items():
        have = engine_cfg.get(key)
        if have is None:
            continue  # tolerate missing — DolphinActor supplies defaults that match
        same = (
            str(have) == want
            if isinstance(want, str)
            else abs(float(have) - float(want)) <= 1e-9
        )
        if not same:
            drift.append(f" {key}: expected={want!r} got={have!r}")
    if drift:
        msg = "CHAMPION PARAM DRIFT DETECTED:\n" + "\n".join(drift)
        logger.error(msg)
        raise ValueError(msg)
    logger.info(f"[PARAMS] Champion params verified. hash={_CHAMPION_HASH}")
    return _CHAMPION_HASH
@task(name="load_bar_data", retries=2, retry_delay_seconds=15, timeout_seconds=120)
def load_bar_data_task(run_date: str, klines_dir: Path = KLINES_DIR) -> pd.DataFrame:
    """Load vbt_cache_klines/<run_date>.parquet into a DataFrame for replay.

    Returns an empty DataFrame when the file is absent or the required
    ``vel_div`` column is missing, so the flow skips the day instead of
    failing.  Rows with NaN ``vel_div`` are dropped before return.
    """
    logger = get_run_logger()
    parquet_path = klines_dir / f"{run_date}.parquet"
    if not parquet_path.exists():
        logger.warning(f"[DATA] Parquet not found for {run_date}: {parquet_path}")
        return pd.DataFrame()
    frame = pd.read_parquet(parquet_path)
    logger.info(f"[DATA] Loaded {len(frame)} rows × {len(frame.columns)} cols for {run_date}")
    if "vel_div" not in frame.columns:
        logger.error(f"[DATA] vel_div column missing from {parquet_path} — aborting")
        return pd.DataFrame()
    # Equivalent to masking on vel_div.notna() then resetting the index.
    frame = frame.dropna(subset=["vel_div"]).reset_index(drop=True)
    logger.info(f"[DATA] {len(frame)} valid rows after NaN vel_div drop")
    return frame
@task(name="run_nautilus_backtest", retries=0, timeout_seconds=600)
def run_nautilus_backtest_task(
    run_date: str,
    config: dict,
    initial_capital: float,
    posture: str = "APEX",
) -> dict:
    """Run one day through the Nautilus BacktestEngine + DolphinActor.

    Boots a full Nautilus kernel (BacktestEngine), registers DolphinActor,
    injects a single synthetic bar at 00:00:05 UTC of ``run_date`` to trigger
    the day, then calls engine.run().  DolphinActor.on_bar() will:
      1. Load vbt_cache_klines/ parquet (replay mode, live_mode=False)
      2. Process each bar through NDAlphaEngine.step_bar()
      3. Write results to HZ DOLPHIN_PNL_BLUE

    Args:
        run_date: Target day, "YYYY-MM-DD".
        config: Strategy config dict (blue.yml contents); shallow-copied.
        initial_capital: Starting capital injected into the actor config.
        posture: Safety posture override passed to the actor.

    Returns:
        Summary dict with date, posture, capital, pnl, trades,
        stale_state_events, processed_dates, elapsed_s, and engine tag.

    Raises:
        Exception: re-raised from BacktestEngine.run() after logging.
    """
    log = get_run_logger()
    t0 = time.time()
    # ── Nautilus imports (deferred to avoid top-level JIT cost) ─────────────
    from nautilus_trader.backtest.engine import BacktestEngine, BacktestEngineConfig
    from nautilus_trader.model.identifiers import Venue
    from nautilus_trader.model.data import Bar, BarType
    from nautilus_trader.model.objects import Price, Quantity, Money, Currency
    from nautilus_trader.model.enums import OmsType, AccountType
    from nautilus_trader.core.datetime import dt_to_unix_nanos
    from nautilus_trader.test_kit.providers import TestInstrumentProvider
    from nautilus_dolphin.nautilus.dolphin_actor import DolphinActor
    # ── Build actor config with posture + capital ────────────────────────────
    # Shallow copy: nested dicts (e.g. "paper_trade") remain shared with `config`.
    actor_cfg = dict(config)
    actor_cfg.setdefault("paper_trade", {})["initial_capital"] = initial_capital
    actor_cfg["posture_override"] = posture
    actor_cfg["live_mode"] = False  # replay mode: reads parquet internally
    # MC-Forewarner: explicit path so DolphinActor.on_start() wires the MC gate.
    # Gold-performance stack requires MC gate active — do NOT remove this line.
    actor_cfg["mc_models_dir"] = MC_MODELS_DIR
    # ── Boot BacktestEngine ──────────────────────────────────────────────────
    be_cfg = BacktestEngineConfig(trader_id="DOLPHIN-NAUTILUS-001")
    engine = BacktestEngine(config=be_cfg)
    actor = DolphinActor(config=actor_cfg)
    # NOTE(review): DolphinActor is registered via add_strategy(); if it is an
    # Actor (not Strategy) subclass, add_actor() may be the intended call —
    # confirm against the pinned nautilus_trader version.
    engine.add_strategy(actor)
    venue = Venue("BINANCE")
    usdt = Currency.from_str("USDT")
    engine.add_venue(
        venue=venue,
        oms_type=OmsType.HEDGING,
        account_type=AccountType.MARGIN,
        base_currency=usdt,
        starting_balances=[Money(initial_capital, usdt)],
    )
    # Placeholder instrument: the bar prices below are synthetic; real market
    # data is loaded from parquet inside the actor, not from these bars.
    instrument = TestInstrumentProvider.default_fx_ccy("BTCUSD", venue)
    engine.add_instrument(instrument)
    # ── Generate synthetic bars for the day (one per 5s = 17,280/day).
    # DolphinActor.on_bar() uses ts_event to extract the date, then loads
    # the full day's parquet slice in _load_parquet_data().
    # We inject exactly one bar at midnight to trigger a single begin_day()
    # call; DolphinActor then iterates over the full parquet rows internally.
    bar_type = BarType.from_str("BTCUSD.BINANCE-5-SECOND-LAST-EXTERNAL")
    dt_event = datetime.strptime(run_date, "%Y-%m-%d").replace(
        hour=0, minute=0, second=5, tzinfo=timezone.utc
    )
    bars = [
        Bar(
            bar_type=bar_type,
            open=Price.from_str("10000.00000"),
            high=Price.from_str("10000.00000"),
            low=Price.from_str("10000.00000"),
            close=Price.from_str("10000.00000"),
            volume=Quantity.from_str("1"),
            ts_event=dt_to_unix_nanos(dt_event),
            ts_init=dt_to_unix_nanos(dt_event),
        )
    ]
    engine.add_data(bars)
    log.info(f"[NAUTILUS] Starting engine for {run_date} | posture={posture} | capital={initial_capital:,.2f}")
    try:
        engine.run()
    except Exception as e:
        log.error(f"[NAUTILUS] BacktestEngine.run() raised: {e}")
        raise
    elapsed = time.time() - t0
    log.info(f"[NAUTILUS] Engine finished in {elapsed:.2f}s")
    # Collect summary from actor state.
    # NOTE(review): reads private actor attributes (_stale_state_events,
    # _processed_dates) — assumes DolphinActor defines both; confirm.
    result = {
        "date": run_date,
        "posture": posture,
        "capital": initial_capital,  # updated below if actor exposes it
        "pnl": 0.0,
        "trades": 0,
        "stale_state_events": actor._stale_state_events,
        "processed_dates": list(actor._processed_dates),
        "elapsed_s": round(elapsed, 3),
        "engine": "nautilus_backtest",
    }
    # Pull capital from actor's engine if available
    if actor.engine is not None:
        result["capital"] = getattr(actor.engine, "capital", initial_capital)
        result["pnl"] = result["capital"] - initial_capital
        result["trades"] = len(getattr(actor.engine, "trade_history", []))
    log.info(
        f"[NAUTILUS] {run_date}: PnL={result['pnl']:+.2f} | "
        f"T={result['trades']} | StaleEvents={result['stale_state_events']}"
    )
    return result
@task(name="write_hz_result", retries=3, retry_delay_seconds=5, timeout_seconds=30)
def write_hz_result_task(
    result: dict,
    initial_capital: float,
    hz_host: str = HZ_HOST,
    hz_cluster: str = HZ_CLUSTER,
    state_map: str = HZ_STATE_MAP,
    pnl_map: str = HZ_PNL_MAP,
) -> None:
    """Persist the run result and continuity state to Hazelcast.

    Opens a dedicated client per call (serialization-safe across Prefect
    tasks), writes the raw result under pnl_map[run_date] and a compact
    state snapshot under state_map["latest_nautilus"] for capital continuity.
    """
    logger = get_run_logger()
    import hazelcast

    run_date = result["date"]
    capital = float(result.get("capital", initial_capital))
    state = dict(
        strategy="nautilus-blue",
        capital=capital,
        date=run_date,
        pnl=result.get("pnl", 0.0),
        trades=result.get("trades", 0),
        posture=result.get("posture", "APEX"),
        stale_state_events=result.get("stale_state_events", 0),
        updated_at=datetime.now(timezone.utc).isoformat(),
        engine="nautilus_backtest",
        param_hash=_CHAMPION_HASH,
    )
    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster, cluster_members=[hz_host]
    )
    try:
        hz_client.get_map(pnl_map).blocking().put(run_date, json.dumps(result))
        hz_client.get_map(state_map).blocking().put("latest_nautilus", json.dumps(state))
        logger.info(
            f"[HZ] Wrote result to {pnl_map}[{run_date}] and {state_map}[latest_nautilus]"
        )
    finally:
        hz_client.shutdown()
@task(name="heartbeat", retries=0, timeout_seconds=15)
def heartbeat_task(
    run_date: str,
    phase: str,
    extras: dict | None = None,
    hz_host: str = HZ_HOST,
    hz_cluster: str = HZ_CLUSTER,
) -> None:
    """Write a liveness marker to DOLPHIN_HEARTBEAT.

    Called at flow start, after the engine run, and at flow end so external
    monitoring dashboards can detect stalled runs (no heartbeat in >10 min).
    A failed write is swallowed deliberately: a heartbeat must never kill
    the flow.
    """
    import hazelcast

    # Extras are merged last so they may override the base fields, matching
    # dict.update() semantics.
    payload = {
        "ts": time.time(),
        "iso": datetime.now(timezone.utc).isoformat(),
        "run_date": run_date,
        "phase": phase,
        "flow": "nautilus_prefect",
        **(extras or {}),
    }
    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster, cluster_members=[hz_host],
        connection_timeout=5.0,
    )
    try:
        hz_client.get_map(HZ_HEARTBEAT_MAP).blocking().put(
            "nautilus_flow_heartbeat", json.dumps(payload)
        )
    except Exception:
        pass  # heartbeat failure is never fatal
    finally:
        hz_client.shutdown()
@task(name="read_posture_from_hz", retries=2, retry_delay_seconds=5, timeout_seconds=20)
def read_posture_task(
    hz_host: str = HZ_HOST, hz_cluster: str = HZ_CLUSTER
) -> tuple[str, float]:
    """Read the current safety posture + risk multiplier from DOLPHIN_SAFETY.

    Tries the CP-subsystem atomic reference first, falling back to the plain
    IMap key "latest".  Returns ("APEX", 1.0) whenever HZ is unavailable or
    holds no state.
    """
    logger = get_run_logger()
    import hazelcast

    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster, cluster_members=[hz_host]
    )
    try:
        try:
            raw = (
                hz_client.get_cp_subsystem()
                .get_atomic_reference("DOLPHIN_SAFETY")
                .blocking()
                .get()
            )
        except Exception:
            raw = hz_client.get_map("DOLPHIN_SAFETY").blocking().get("latest")
        if raw:
            safety = json.loads(raw)
            posture = safety.get("posture", "APEX")
            rm = float(safety.get("Rm", 1.0))
            logger.info(f"[SAFETY] Posture={posture} Rm={rm:.3f}")
            return posture, rm
    except Exception as e:
        logger.warning(f"[SAFETY] HZ read failed: {e} — defaulting APEX/1.0")
    finally:
        hz_client.shutdown()
    return "APEX", 1.0
@task(name="restore_capital", retries=2, retry_delay_seconds=5, timeout_seconds=20)
def restore_capital_task(
    initial_capital: float,
    hz_host: str = HZ_HOST,
    hz_cluster: str = HZ_CLUSTER,
    state_map: str = HZ_STATE_MAP,
) -> float:
    """Restore running capital from the latest HZ state snapshot.

    Falls back to the config's initial_capital when no prior state exists or
    the read fails.
    """
    logger = get_run_logger()
    import hazelcast

    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster, cluster_members=[hz_host]
    )
    try:
        raw = hz_client.get_map(state_map).blocking().get("latest_nautilus")
        if raw:
            snapshot = json.loads(raw)
            restored = float(snapshot.get("capital", initial_capital))
            logger.info(f"[STATE] Restored capital={restored:,.2f} from HZ {state_map}")
            return restored
    except Exception as e:
        logger.warning(f"[STATE] Capital restore failed: {e} — using config capital")
    finally:
        hz_client.shutdown()
    return initial_capital
# ── Flow ─────────────────────────────────────────────────────────────────────────
@flow(
    name="dolphin-nautilus-backtest",
    log_prints=True,
    description=(
        "Daily Nautilus BacktestEngine run under Prefect supervision. "
        "Wraps DolphinActor with champion params, HZ heartbeats, and capital continuity."
    ),
)
def dolphin_nautilus_flow(
    config_path: str = "prod/configs/blue.yml",
    run_date: str | None = None,
    dry_run: bool = False,
) -> dict:
    """Main Prefect flow: Nautilus BacktestEngine daily runner.

    Scheduled at 00:10 UTC (after paper_trade_flow at 00:05 UTC).  Run
    manually for a specific date:
        dolphin_nautilus_flow(run_date="2026-03-21")

    Args:
        config_path: Strategy YAML; relative paths resolve against HCM_DIR.
        run_date: "YYYY-MM-DD" target day; defaults to yesterday.
        dry_run: Validate config + data only; the engine is never booted.

    Returns:
        The run summary dict (or a skip / dry-run marker dict).
    """
    log = get_run_logger()
    # Resolve target date (yesterday by default).
    # NOTE(review): date.today() uses the host's local date, while scheduling
    # is UTC — on a non-UTC host the default "yesterday" can differ from the
    # UTC day; confirm the host runs UTC.
    target_date = run_date or (date.today() - timedelta(days=1)).isoformat()
    log.info(f"=== NAUTILUS BACKTEST FLOW: {target_date} ===")
    # Load strategy config
    cfg_path = Path(config_path)
    if not cfg_path.is_absolute():
        cfg_path = HCM_DIR / cfg_path
    with open(cfg_path) as f:
        config = yaml.safe_load(f)
    initial_capital = float(config.get("paper_trade", {}).get("initial_capital", 25000.0))
    hz_cfg = config.get("hazelcast", {})
    state_map = hz_cfg.get("imap_state", HZ_STATE_MAP)
    pnl_map = hz_cfg.get("imap_pnl", HZ_PNL_MAP)
    # ── 1. HZ probe ──────────────────────────────────────────────────────────
    # HZ being down is degraded-but-allowed: the run proceeds without
    # persistence or heartbeats rather than aborting.
    hz_ok = False
    try:
        hz_ok = hz_probe_task(HZ_HOST, HZ_CLUSTER)
    except Exception as e:
        log.warning(f"[HZ_PROBE] HZ unreachable after retries: {e}. Continuing without HZ.")
    # ── 2. Heartbeat: flow_start ─────────────────────────────────────────────
    if hz_ok:
        heartbeat_task(target_date, "flow_start")
    # ── 3. Champion param validation ─────────────────────────────────────────
    # Raises ValueError on drift — hard stop, no retries.
    param_hash = validate_champion_params(config)
    # ── 4. Read posture ───────────────────────────────────────────────────────
    # `rm` (risk multiplier) is read but not otherwise used by this flow.
    posture, rm = "APEX", 1.0
    if hz_ok:
        posture, rm = read_posture_task(HZ_HOST, HZ_CLUSTER)
    if posture == "HIBERNATE":
        # Safety system says stand down: persist a zero-PnL skip record.
        log.warning(f"[POSTURE] HIBERNATE — skipping engine run for {target_date}")
        result = {
            "date": target_date, "posture": posture, "pnl": 0.0,
            "capital": initial_capital, "trades": 0, "skipped": True,
        }
        if hz_ok:
            write_hz_result_task(result, initial_capital, HZ_HOST, HZ_CLUSTER, state_map, pnl_map)
            heartbeat_task(target_date, "flow_end_hibernate")
        return result
    # ── 5. Restore capital ────────────────────────────────────────────────────
    capital = initial_capital
    if hz_ok:
        capital = restore_capital_task(initial_capital, HZ_HOST, HZ_CLUSTER, state_map)
    # ── 6. Validate bar data exists ───────────────────────────────────────────
    df = load_bar_data_task(target_date, KLINES_DIR)
    if df.empty:
        log.warning(f"[DATA] No scan data for {target_date} — skipping engine run")
        result = {
            "date": target_date, "posture": posture, "pnl": 0.0,
            "capital": capital, "trades": 0, "skipped": True, "reason": "no_data",
        }
        # NOTE(review): unlike the HIBERNATE branch, this skip result is NOT
        # written to HZ (only a heartbeat) — confirm whether that is intended.
        if hz_ok:
            heartbeat_task(target_date, "flow_end_no_data")
        return result
    if dry_run:
        log.info(f"[DRY_RUN] Data OK ({len(df)} rows). Engine not started.")
        return {"date": target_date, "dry_run": True, "rows": len(df)}
    # ── 7. Run Nautilus engine ────────────────────────────────────────────────
    if hz_ok:
        heartbeat_task(target_date, "engine_start", {"capital": capital, "posture": posture})
    result = run_nautilus_backtest_task(
        run_date=target_date,
        config=config,
        initial_capital=capital,
        posture=posture,
    )
    result["param_hash"] = param_hash
    # ── 8. Persist result ─────────────────────────────────────────────────────
    if hz_ok:
        write_hz_result_task(result, capital, HZ_HOST, HZ_CLUSTER, state_map, pnl_map)
        heartbeat_task(
            target_date, "flow_end",
            {"pnl": result.get("pnl", 0.0), "trades": result.get("trades", 0)},
        )
    log.info(
        f"=== DONE: nautilus-blue {target_date} | "
        f"PnL={result.get('pnl', 0):+.2f} | "
        f"Capital={result.get('capital', capital):,.2f} | "
        f"Trades={result.get('trades', 0)} ==="
    )
    return result
# ── CLI entry point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    import os

    cli = argparse.ArgumentParser(description="Dolphin Nautilus Prefect Flow")
    cli.add_argument("--config", default="prod/configs/blue.yml", help="Strategy YAML config")
    cli.add_argument("--date", default=None, help="YYYY-MM-DD (default: yesterday)")
    cli.add_argument("--register", action="store_true", help="Register Prefect deployment")
    cli.add_argument("--dry-run", action="store_true", help="Probe + validate only, no engine")
    opts = cli.parse_args()

    # Point the Prefect client at the local server unless already configured.
    os.environ.setdefault("PREFECT_API_URL", "http://localhost:4200/api")

    if opts.register:
        # Register the daily 00:10 UTC deployment instead of running now.
        resolved_cfg = str(Path(opts.config).resolve())
        deployment = dolphin_nautilus_flow.to_deployment(
            name="dolphin-nautilus-blue",
            parameters={"config_path": resolved_cfg},
            schedule=Cron("10 0 * * *", timezone="UTC"),
            work_pool_name="dolphin",
            tags=["blue", "nautilus", "dolphin", "backtest"],
        )
        deployment.apply()
        print("Registered: dolphin-nautilus-blue (daily 00:10 UTC)")
    else:
        dolphin_nautilus_flow(
            config_path=opts.config,
            run_date=opts.date,
            dry_run=opts.dry_run,
        )