# Source: DOLPHIN/prod/nautilus_prefect_flow.py (599 lines, 24 KiB, Python)
# NOTE: the lines above this module's docstring were web-viewer paste
# scaffolding ("Files" / "Raw Normal View History"); converted to comments
# so the module parses as valid Python.
"""nautilus_prefect_flow.py — Prefect-supervised Nautilus BacktestEngine daily runner.
This flow wraps the Nautilus BacktestEngine + DolphinActor inside a Prefect 3.x
flow, providing:
- Resilience: Prefect retries, structured error handling, HZ circuit-break checks
- Observability: Prefect task logs, HZ heartbeat writes, per-run metrics
- State continuity: capital/engine state persisted to HZ; restored on next run
- Champion integrity: param hash verified at startup (prevents silent drift)
Designed to run in siloqy-env on DOLPHIN (Linux):
source /home/dolphin/siloqy_env/bin/activate
PREFECT_API_URL=http://localhost:4200/api python prod/nautilus_prefect_flow.py
Scheduling (daily at 00:10 UTC, 5 min after paper_trade_flow):
python prod/nautilus_prefect_flow.py --register
Architecture:
dolphin_nautilus_flow
hz_probe_task # Verify HZ reachable; raise on hard failure
validate_champion_params # Hash-check engine params vs FROZEN spec
load_bar_data_task # Load vbt_cache_klines/ parquet for run_date
run_nautilus_backtest_task # BacktestEngine.run() in Nautilus kernel
DolphinActor.on_start() MC-Forewarner wired (DolphinForewarner)
DolphinActor.on_bar() × N
write_hz_result_task # Persist result + state to HZ IMaps
heartbeat_task # Write liveness/lag marker to DOLPHIN_HEARTBEAT
Champion params (FROZEN see blue.yml and champion spec):
vel_div_threshold=-0.02 vel_div_extreme=-0.05 fixed_tp_pct=0.0095
max_hold_bars=120 fraction=0.20 min_leverage=0.5
max_leverage=5.0 abs_max_leverage=6.0 leverage_convexity=3.0
dc_lookback_bars=7 dc_min_magnitude_bps=0.75 min_irp_alignment=0.45
sp_maker_entry_rate=0.62 sp_maker_exit_rate=0.50 seed=42
"""
from __future__ import annotations
import json
import hashlib
import sys
import time
import logging
import argparse
import yaml
from datetime import datetime, timedelta, date, timezone
from pathlib import Path
HCM_DIR = Path(__file__).parent.parent
sys.path.insert(0, str(HCM_DIR / "nautilus_dolphin"))
from prefect import flow, task, get_run_logger
from prefect.schedules import Cron
import pandas as pd
# ── Constants ────────────────────────────────────────────────────────────────────
HZ_HOST = "localhost:5701"
HZ_CLUSTER = "dolphin"
HZ_HEARTBEAT_MAP = "DOLPHIN_HEARTBEAT"
HZ_STATE_MAP = "DOLPHIN_STATE_BLUE"
HZ_PNL_MAP = "DOLPHIN_PNL_BLUE"
KLINES_DIR = HCM_DIR / "vbt_cache_klines"
MC_MODELS_DIR = str(HCM_DIR / "nautilus_dolphin" / "mc_results" / "models")
# Champion parameter fingerprint — SHA256 of sorted canonical param string.
# Regenerate with: python -c "import prod.nautilus_prefect_flow as m; print(m._compute_champion_hash())"
# Frozen champion parameter spec. validate_champion_params() compares each key
# against config["engine"]; _compute_champion_hash() fingerprints this dict.
# Do not edit values without re-freezing the champion spec (see blue.yml).
_CHAMPION_PARAMS = {
    # Engine mode — must be d_liq (GOLD: LiquidationGuardEngine 8x/9x + liq guard)
    "boost_mode": "d_liq",
    # Signal: velocity-divergence entry thresholds (normal / extreme)
    "vel_div_threshold": -0.02,
    "vel_div_extreme": -0.05,
    # Exit: fixed take-profit fraction and maximum bars to hold a position
    "fixed_tp_pct": 0.0095,
    "max_hold_bars": 120,
    # Sizing (note: max_leverage/abs_max_leverage are overridden to 8.0/9.0
    # internally by d_liq mode — these values appear in blue.yml but are not
    # used by the running engine; D_LIQ_SOFT_CAP=8.0 / D_LIQ_ABS_CAP=9.0 apply)
    "fraction": 0.20,
    "min_leverage": 0.5,
    "max_leverage": 5.0,
    "abs_max_leverage": 6.0,
    "leverage_convexity": 3.0,
    # Direction confirm: lookback window and minimum move (basis points)
    "dc_lookback_bars": 7,
    "dc_min_magnitude_bps": 0.75,
    # Asset selection: minimum IRP alignment score
    "min_irp_alignment": 0.45,
    # Fees: assumed maker-fill rates on entry/exit legs
    "sp_maker_entry_rate": 0.62,
    "sp_maker_exit_rate": 0.50,
    # RNG seed for reproducible runs
    "seed": 42,
}
def _compute_champion_hash() -> str:
    """Return a 16-hex-char SHA-256 fingerprint of the champion params.

    The dict is serialized as sorted-key JSON so the fingerprint is stable
    regardless of key insertion order.
    """
    blob = json.dumps(_CHAMPION_PARAMS, sort_keys=True).encode()
    return hashlib.sha256(blob).hexdigest()[:16]


_CHAMPION_HASH = _compute_champion_hash()  # frozen at import time
# ── Tasks ────────────────────────────────────────────────────────────────────────
@task(name="hz_probe", retries=3, retry_delay_seconds=10, timeout_seconds=30)
def hz_probe_task(hz_host: str = HZ_HOST, hz_cluster: str = HZ_CLUSTER) -> bool:
    """Verify Hazelcast is reachable and responsive.

    Performs a round-trip write to the heartbeat map. Raises on hard failure
    after 3 retries so the flow aborts cleanly rather than running with no
    state persistence.
    """
    logger = get_run_logger()
    import hazelcast
    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster,
        cluster_members=[hz_host],
        connection_timeout=5.0,
    )
    try:
        heartbeat_map = hz_client.get_map(HZ_HEARTBEAT_MAP).blocking()
        heartbeat_map.put("probe_ts", str(time.time()))
        logger.info(f"[HZ_PROBE] HZ reachable at {hz_host} (cluster={hz_cluster})")
        return True
    finally:
        # Always release the client, even if the probe write raised.
        hz_client.shutdown()
@task(name="validate_champion_params", retries=0, timeout_seconds=10)
def validate_champion_params(config: dict) -> str:
    """Assert engine config matches frozen champion params.

    Compares every key in _CHAMPION_PARAMS against config["engine"]. Missing
    keys are tolerated (DolphinActor supplies defaults that match). String
    params must compare equal exactly; numeric params must agree within 1e-9.
    A value that cannot be coerced to float (e.g. a string or list where a
    number is expected) is reported as drift instead of crashing the task
    with an unlogged TypeError/ValueError.

    Args:
        config: Full strategy config dict; the "engine" sub-dict is checked.

    Returns:
        The frozen champion param hash on success.

    Raises:
        ValueError: on any mismatch, with a per-key drift report.
    """
    log = get_run_logger()
    eng = config.get("engine", {})
    mismatches = []
    for k, expected in _CHAMPION_PARAMS.items():
        actual = eng.get(k)
        if actual is None:
            # tolerate missing — DolphinActor supplies defaults that match
            continue
        if isinstance(expected, str):
            if str(actual) != expected:
                mismatches.append(f" {k}: expected={expected!r} got={actual!r}")
            continue
        try:
            drifted = abs(float(actual) - float(expected)) > 1e-9
        except (TypeError, ValueError):
            # Non-numeric value for a numeric param: count it as drift so the
            # operator gets a readable report rather than a conversion traceback.
            drifted = True
        if drifted:
            mismatches.append(f" {k}: expected={expected!r} got={actual!r}")
    if mismatches:
        msg = "CHAMPION PARAM DRIFT DETECTED:\n" + "\n".join(mismatches)
        log.error(msg)
        raise ValueError(msg)
    log.info(f"[PARAMS] Champion params verified. hash={_CHAMPION_HASH}")
    return _CHAMPION_HASH
@task(name="load_bar_data", retries=2, retry_delay_seconds=15, timeout_seconds=120)
def load_bar_data_task(run_date: str, klines_dir: Path = KLINES_DIR) -> pd.DataFrame:
    """Load the day's kline parquet for replay.

    Reads klines_dir/<run_date>.parquet, then drops rows whose vel_div is
    NaN. Returns an empty DataFrame when the file is absent or the vel_div
    column is missing, so the caller can skip the day gracefully.
    """
    log = get_run_logger()
    parquet_path = klines_dir / f"{run_date}.parquet"
    if not parquet_path.exists():
        log.warning(f"[DATA] Parquet not found for {run_date}: {parquet_path}")
        return pd.DataFrame()
    frame = pd.read_parquet(parquet_path)
    log.info(f"[DATA] Loaded {len(frame)} rows × {len(frame.columns)} cols for {run_date}")
    if "vel_div" not in frame.columns:
        log.error(f"[DATA] vel_div column missing from {parquet_path} — aborting")
        return pd.DataFrame()
    # Keep only rows with a computed vel_div signal; renumber the index.
    frame = frame[frame["vel_div"].notna()].reset_index(drop=True)
    log.info(f"[DATA] {len(frame)} valid rows after NaN vel_div drop")
    return frame
@task(name="run_nautilus_backtest", retries=0, timeout_seconds=600)
def run_nautilus_backtest_task(
    run_date: str,
    config: dict,
    initial_capital: float,
    posture: str = "APEX",
) -> dict:
    """Run one day through the Nautilus BacktestEngine + DolphinActor.

    This task boots a full Nautilus kernel (BacktestEngine), registers
    DolphinActor as the strategy, injects a single synthetic bar for the
    day, then calls engine.run().

    DolphinActor.on_bar() will:
      1. Load vbt_cache_klines/ parquet (replay mode, live_mode=False)
      2. Process each bar through NDAlphaEngine.step_bar()
      3. Write results to HZ DOLPHIN_PNL_BLUE

    Args:
        run_date: ISO date (YYYY-MM-DD) whose parquet slice the actor replays.
        config: Full strategy config dict; shallow-copied into the actor config.
        initial_capital: Starting capital — injected into the actor config and
            into the simulated margin account balance.
        posture: Safety posture forwarded to the actor via posture_override.

    Returns:
        Summary dict with date, posture, capital, pnl, trades,
        stale_state_events, processed_dates, elapsed_s, engine.
    """
    log = get_run_logger()
    t0 = time.time()
    # ── Nautilus imports (deferred to avoid top-level JIT cost) ─────────────
    from nautilus_trader.backtest.engine import BacktestEngine, BacktestEngineConfig
    from nautilus_trader.model.identifiers import Venue
    from nautilus_trader.model.data import Bar, BarType
    from nautilus_trader.model.objects import Price, Quantity, Money, Currency
    from nautilus_trader.model.enums import OmsType, AccountType
    from nautilus_trader.core.datetime import dt_to_unix_nanos
    from nautilus_trader.test_kit.providers import TestInstrumentProvider
    from nautilus_dolphin.nautilus.dolphin_actor import DolphinActor
    # ── Build actor config with posture + capital ────────────────────────────
    # NOTE(review): dict(config) is a shallow copy — nested dicts (e.g.
    # "paper_trade") remain shared with the caller's config; the mutation
    # below therefore also updates the original config dict. Confirm intended.
    actor_cfg = dict(config)
    actor_cfg.setdefault("paper_trade", {})["initial_capital"] = initial_capital
    actor_cfg["posture_override"] = posture
    actor_cfg["live_mode"] = False  # replay mode: reads parquet internally
    # MC-Forewarner: explicit path so DolphinActor.on_start() wires the MC gate.
    # Gold-performance stack requires MC gate active — do NOT remove this line.
    actor_cfg["mc_models_dir"] = MC_MODELS_DIR
    # ── Boot BacktestEngine ──────────────────────────────────────────────────
    be_cfg = BacktestEngineConfig(trader_id="DOLPHIN-NAUTILUS-001")
    engine = BacktestEngine(config=be_cfg)
    actor = DolphinActor(config=actor_cfg)
    # NOTE(review): DolphinActor is registered via add_strategy(); confirm the
    # pinned Nautilus version accepts an actor here (vs engine.add_actor()).
    engine.add_strategy(actor)
    venue = Venue("BINANCE")
    usdt = Currency.from_str("USDT")
    engine.add_venue(
        venue=venue,
        oms_type=OmsType.HEDGING,
        account_type=AccountType.MARGIN,
        base_currency=usdt,
        starting_balances=[Money(initial_capital, usdt)],
    )
    # Stand-in instrument: prices are synthetic; DolphinActor loads the real
    # per-symbol data from parquet itself.
    instrument = TestInstrumentProvider.default_fx_ccy("BTCUSD", venue)
    engine.add_instrument(instrument)
    # ── Generate synthetic bars for the day (one per 5s = 17,280/day).
    #    DolphinActor.on_bar() uses ts_event to extract the date, then loads
    #    the full day's parquet slice in _load_parquet_data().
    #    We inject exactly one bar at midnight to trigger a single begin_day()
    #    call; DolphinActor then iterates over the full parquet rows internally.
    bar_type = BarType.from_str("BTCUSD.BINANCE-5-SECOND-LAST-EXTERNAL")
    dt_event = datetime.strptime(run_date, "%Y-%m-%d").replace(
        hour=0, minute=0, second=5, tzinfo=timezone.utc
    )
    bars = [
        Bar(
            bar_type=bar_type,
            open=Price.from_str("10000.00000"),
            high=Price.from_str("10000.00000"),
            low=Price.from_str("10000.00000"),
            close=Price.from_str("10000.00000"),
            volume=Quantity.from_str("1"),
            ts_event=dt_to_unix_nanos(dt_event),
            ts_init=dt_to_unix_nanos(dt_event),
        )
    ]
    engine.add_data(bars)
    log.info(f"[NAUTILUS] Starting engine for {run_date} | posture={posture} | capital={initial_capital:,.2f}")
    try:
        engine.run()
    except Exception as e:
        # Log before re-raising so the Prefect task run records the cause.
        log.error(f"[NAUTILUS] BacktestEngine.run() raised: {e}")
        raise
    elapsed = time.time() - t0
    log.info(f"[NAUTILUS] Engine finished in {elapsed:.2f}s")
    # Collect summary from actor state
    result = {
        "date": run_date,
        "posture": posture,
        "capital": initial_capital,  # updated below if actor exposes it
        "pnl": 0.0,
        "trades": 0,
        # Private actor attributes read directly — presumably populated by
        # DolphinActor during the run; verify against DolphinActor source.
        "stale_state_events": actor._stale_state_events,
        "processed_dates": list(actor._processed_dates),
        "elapsed_s": round(elapsed, 3),
        "engine": "nautilus_backtest",
    }
    # Pull capital from actor's engine if available
    if actor.engine is not None:
        result["capital"] = getattr(actor.engine, "capital", initial_capital)
        result["pnl"] = result["capital"] - initial_capital
        result["trades"] = len(getattr(actor.engine, "trade_history", []))
    log.info(
        f"[NAUTILUS] {run_date}: PnL={result['pnl']:+.2f} | "
        f"T={result['trades']} | StaleEvents={result['stale_state_events']}"
    )
    return result
@task(name="write_hz_result", retries=3, retry_delay_seconds=5, timeout_seconds=30)
def write_hz_result_task(
    result: dict,
    initial_capital: float,
    hz_host: str = HZ_HOST,
    hz_cluster: str = HZ_CLUSTER,
    state_map: str = HZ_STATE_MAP,
    pnl_map: str = HZ_PNL_MAP,
) -> None:
    """Persist the run result and continuity state to Hazelcast.

    A fresh client is created per call (serialization-safe across tasks).
    Writes the raw result under pnl_map[run_date] and a compact state record
    under state_map["latest_nautilus"] for next-run capital continuity.
    """
    log = get_run_logger()
    import hazelcast
    run_date = result["date"]
    capital = float(result.get("capital", initial_capital))
    state_record = {
        "strategy": "nautilus-blue",
        "capital": capital,
        "date": run_date,
        "pnl": result.get("pnl", 0.0),
        "trades": result.get("trades", 0),
        "posture": result.get("posture", "APEX"),
        "stale_state_events": result.get("stale_state_events", 0),
        "updated_at": datetime.now(timezone.utc).isoformat(),
        "engine": "nautilus_backtest",
        "param_hash": _CHAMPION_HASH,
    }
    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster, cluster_members=[hz_host]
    )
    try:
        pnl = hz_client.get_map(pnl_map).blocking()
        pnl.put(run_date, json.dumps(result))
        latest = hz_client.get_map(state_map).blocking()
        latest.put("latest_nautilus", json.dumps(state_record))
        log.info(
            f"[HZ] Wrote result to {pnl_map}[{run_date}] and {state_map}[latest_nautilus]"
        )
    finally:
        hz_client.shutdown()
@task(name="heartbeat", retries=0, timeout_seconds=15)
def heartbeat_task(
    run_date: str,
    phase: str,
    extras: dict | None = None,
    hz_host: str = HZ_HOST,
    hz_cluster: str = HZ_CLUSTER,
) -> None:
    """Write a liveness heartbeat to DOLPHIN_HEARTBEAT.

    Called at flow start, after the engine run, and at flow end so external
    monitoring dashboards can detect stalled runs (no heartbeat in >10 min).
    Heartbeat write failures are swallowed — monitoring must never kill a run.
    """
    import hazelcast
    payload = {
        "ts": time.time(),
        "iso": datetime.now(timezone.utc).isoformat(),
        "run_date": run_date,
        "phase": phase,
        "flow": "nautilus_prefect",
        **(extras or {}),
    }
    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster, cluster_members=[hz_host],
        connection_timeout=5.0,
    )
    try:
        beat_map = hz_client.get_map(HZ_HEARTBEAT_MAP).blocking()
        beat_map.put("nautilus_flow_heartbeat", json.dumps(payload))
    except Exception:
        pass  # heartbeat failure is never fatal
    finally:
        hz_client.shutdown()
@task(name="read_posture_from_hz", retries=2, retry_delay_seconds=5, timeout_seconds=20)
def read_posture_task(
    hz_host: str = HZ_HOST, hz_cluster: str = HZ_CLUSTER
) -> tuple[str, float]:
    """Read current posture + Rm from DOLPHIN_SAFETY.

    Prefers the CP-subsystem atomic reference; on any failure there, falls
    back to the DOLPHIN_SAFETY IMap key "latest". Returns ("APEX", 1.0) when
    HZ is unreachable or no state is stored.
    """
    log = get_run_logger()
    import hazelcast
    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster, cluster_members=[hz_host]
    )
    try:
        try:
            safety_ref = hz_client.get_cp_subsystem().get_atomic_reference("DOLPHIN_SAFETY").blocking()
            raw = safety_ref.get()
        except Exception:
            # CP subsystem unavailable/disabled — fall back to the plain IMap.
            raw = hz_client.get_map("DOLPHIN_SAFETY").blocking().get("latest")
        if raw:
            safety = json.loads(raw)
            posture = safety.get("posture", "APEX")
            rm = float(safety.get("Rm", 1.0))
            log.info(f"[SAFETY] Posture={posture} Rm={rm:.3f}")
            return posture, rm
    except Exception as e:
        log.warning(f"[SAFETY] HZ read failed: {e} — defaulting APEX/1.0")
    finally:
        hz_client.shutdown()
    return "APEX", 1.0
@task(name="restore_capital", retries=2, retry_delay_seconds=5, timeout_seconds=20)
def restore_capital_task(
    initial_capital: float,
    hz_host: str = HZ_HOST,
    hz_cluster: str = HZ_CLUSTER,
    state_map: str = HZ_STATE_MAP,
) -> float:
    """Restore capital from the latest HZ state record.

    Reads state_map["latest_nautilus"]; falls back to the configured
    initial_capital when no prior state exists or HZ is unreachable.
    """
    log = get_run_logger()
    import hazelcast
    hz_client = hazelcast.HazelcastClient(
        cluster_name=hz_cluster, cluster_members=[hz_host]
    )
    try:
        raw = hz_client.get_map(state_map).blocking().get("latest_nautilus")
        if raw:
            restored = float(json.loads(raw).get("capital", initial_capital))
            log.info(f"[STATE] Restored capital={restored:,.2f} from HZ {state_map}")
            return restored
    except Exception as e:
        log.warning(f"[STATE] Capital restore failed: {e} — using config capital")
    finally:
        hz_client.shutdown()
    return initial_capital
# ── Flow ─────────────────────────────────────────────────────────────────────────
@flow(
    name="dolphin-nautilus-backtest",
    log_prints=True,
    description=(
        "Daily Nautilus BacktestEngine run under Prefect supervision. "
        "Wraps DolphinActor with champion params, HZ heartbeats, and capital continuity."
    ),
)
def dolphin_nautilus_flow(
    config_path: str = "prod/configs/blue.yml",
    run_date: str | None = None,
    dry_run: bool = False,
) -> dict:
    """Main Prefect flow: Nautilus BacktestEngine daily runner.

    Scheduled at 00:10 UTC (after paper_trade_flow at 00:05 UTC).
    Run manually for a specific date:
        dolphin_nautilus_flow(run_date="2026-03-21")

    Args:
        config_path: Strategy YAML path; resolved relative to HCM_DIR when
            not absolute.
        run_date: ISO date to process. Defaults to yesterday via
            date.today() — NOTE(review): local "today", not UTC; confirm the
            host clock runs in UTC so the default matches the intended day.
        dry_run: When True, stops after data validation (no engine run).

    Returns:
        Summary dict; may carry skipped=True or dry_run=True markers.
    """
    log = get_run_logger()
    # Resolve target date (yesterday by default)
    target_date = run_date or (date.today() - timedelta(days=1)).isoformat()
    log.info(f"=== NAUTILUS BACKTEST FLOW: {target_date} ===")
    # Load strategy config
    cfg_path = Path(config_path)
    if not cfg_path.is_absolute():
        cfg_path = HCM_DIR / cfg_path
    with open(cfg_path) as f:
        config = yaml.safe_load(f)
    initial_capital = float(config.get("paper_trade", {}).get("initial_capital", 25000.0))
    hz_cfg = config.get("hazelcast", {})
    state_map = hz_cfg.get("imap_state", HZ_STATE_MAP)
    pnl_map = hz_cfg.get("imap_pnl", HZ_PNL_MAP)
    # ── 1. HZ probe ──────────────────────────────────────────────────────────
    # HZ being down is degraded-but-tolerated: the flow continues without
    # persistence, heartbeats, or posture reads.
    hz_ok = False
    try:
        hz_ok = hz_probe_task(HZ_HOST, HZ_CLUSTER)
    except Exception as e:
        log.warning(f"[HZ_PROBE] HZ unreachable after retries: {e}. Continuing without HZ.")
    # ── 2. Heartbeat: flow_start ─────────────────────────────────────────────
    if hz_ok:
        heartbeat_task(target_date, "flow_start")
    # ── 3. Champion param validation ─────────────────────────────────────────
    # Raises on any drift from the frozen champion spec, aborting the flow.
    param_hash = validate_champion_params(config)
    # ── 4. Read posture ───────────────────────────────────────────────────────
    # rm is read but not used further in this flow.
    posture, rm = "APEX", 1.0
    if hz_ok:
        posture, rm = read_posture_task(HZ_HOST, HZ_CLUSTER)
    if posture == "HIBERNATE":
        log.warning(f"[POSTURE] HIBERNATE — skipping engine run for {target_date}")
        result = {
            "date": target_date, "posture": posture, "pnl": 0.0,
            "capital": initial_capital, "trades": 0, "skipped": True,
        }
        if hz_ok:
            write_hz_result_task(result, initial_capital, HZ_HOST, HZ_CLUSTER, state_map, pnl_map)
            heartbeat_task(target_date, "flow_end_hibernate")
        return result
    # ── 5. Restore capital ────────────────────────────────────────────────────
    capital = initial_capital
    if hz_ok:
        capital = restore_capital_task(initial_capital, HZ_HOST, HZ_CLUSTER, state_map)
    # ── 6. Validate bar data exists ───────────────────────────────────────────
    df = load_bar_data_task(target_date, KLINES_DIR)
    if df.empty:
        log.warning(f"[DATA] No scan data for {target_date} — skipping engine run")
        result = {
            "date": target_date, "posture": posture, "pnl": 0.0,
            "capital": capital, "trades": 0, "skipped": True, "reason": "no_data",
        }
        # NOTE(review): unlike the HIBERNATE branch, this skip result is NOT
        # persisted via write_hz_result_task — only a heartbeat is written.
        # Confirm the asymmetry is intentional.
        if hz_ok:
            heartbeat_task(target_date, "flow_end_no_data")
        return result
    if dry_run:
        log.info(f"[DRY_RUN] Data OK ({len(df)} rows). Engine not started.")
        return {"date": target_date, "dry_run": True, "rows": len(df)}
    # ── 7. Run Nautilus engine ────────────────────────────────────────────────
    if hz_ok:
        heartbeat_task(target_date, "engine_start", {"capital": capital, "posture": posture})
    result = run_nautilus_backtest_task(
        run_date=target_date,
        config=config,
        initial_capital=capital,
        posture=posture,
    )
    result["param_hash"] = param_hash
    # ── 8. Persist result ─────────────────────────────────────────────────────
    if hz_ok:
        write_hz_result_task(result, capital, HZ_HOST, HZ_CLUSTER, state_map, pnl_map)
        heartbeat_task(
            target_date, "flow_end",
            {"pnl": result.get("pnl", 0.0), "trades": result.get("trades", 0)},
        )
    log.info(
        f"=== DONE: nautilus-blue {target_date} | "
        f"PnL={result.get('pnl', 0):+.2f} | "
        f"Capital={result.get('capital', capital):,.2f} | "
        f"Trades={result.get('trades', 0)} ==="
    )
    return result
# ── CLI entry point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # CLI entry point: either register the Prefect deployment (--register)
    # or run the flow once for the given/default date.
    cli = argparse.ArgumentParser(description="Dolphin Nautilus Prefect Flow")
    cli.add_argument("--config", default="prod/configs/blue.yml", help="Strategy YAML config")
    cli.add_argument("--date", default=None, help="YYYY-MM-DD (default: yesterday)")
    cli.add_argument("--register", action="store_true", help="Register Prefect deployment")
    cli.add_argument("--dry-run", action="store_true", help="Probe + validate only, no engine")
    opts = cli.parse_args()
    import os
    os.environ.setdefault("PREFECT_API_URL", "http://localhost:4200/api")
    if not opts.register:
        dolphin_nautilus_flow(
            config_path=opts.config,
            run_date=opts.date,
            dry_run=opts.dry_run,
        )
    else:
        dep = dolphin_nautilus_flow.to_deployment(
            name="dolphin-nautilus-blue",
            parameters={"config_path": str(Path(opts.config).resolve())},
            schedule=Cron("10 0 * * *", timezone="UTC"),
            work_pool_name="dolphin",
            tags=["blue", "nautilus", "dolphin", "backtest"],
        )
        dep.apply()
        print("Registered: dolphin-nautilus-blue (daily 00:10 UTC)")