Files
siloqy/prod/nautilus_event_trader.py

5252 lines
257 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
DOLPHIN Nautilus Event-Driven Trader
"""
import sys
import json
import hashlib
import math
import os
import time
import signal
import threading
BLUE hardening: spool-poison guards, dead-session clock fix, HZ black-box, RETRACT race-safety Seven uncommitted production fixes to BLUE's main runner that the LIVE process has already been running since the 2026-06-15 17:23 restart (file mtime 17:17, pid started 17:23). Each fix answers a documented incident; committing now so they survive in history and a stray checkout can't silently revert running-config code on the next restart. 1. bars_held = max(0, int(...)) at BOTH journal sites (terminal + sub-day). CH column is UInt16 — a negative value poisons the spool with a head-of-line jam (incident 2026-06-12: bars_held=-106). 2. entry_bar = int(restored_entry_bar) at BOTH reconstruction sites; NEVER from chain_meta. trade_reconstruction payloads carry the DEAD session's bar counter, so the old override reinstated the stale clock frame the re-anchor exists to fix → negative bars_held → same UInt16 spool poison (zombie-trade resurrections, incident 2026-06-12). restored_entry_bar already encodes hold continuity via stored_bars in THIS session's frame. 3. capital parse handles list/ledger-style payloads: when the restore blob is a list of update rows, take the latest dict row instead of falling through to {} and losing the capital anchor. 4. _connect_hz routes the `hazelcast` logger to stderr at INFO. The silent-HZ-death investigation found ZERO client log lines because nothing routed them; without this the reactor's health is invisible. 5. _dump_blackbox(reason): forensic thread dump before a watchdog restart — lifecycle.is_running, active_connections, every thread's stack, and a flag when any hazelcast/reactor-named thread is MISSING (= reactor died, the prime suspect for the silent 40min–8h client deaths). print()-only, CIFS-safe. _watchdog_restart calls it first. 6. _drain_runtime_commands / _process_runtime_commands gain `*, allow_retract=True`; the heartbeat path drains with allow_retract=False and re-queues any RETRACT commands. 
A RETRACT can force a terminal close that must run through the scan-thread close finalizer, so the heartbeat must not race it. 7. +import traceback (for the black-box stack dumps). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 12:03:20 +02:00
import traceback
import urllib.request
import uuid
from dataclasses import replace
from typing import Any, Mapping, Optional
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from pathlib import Path
from collections import deque
# Stablecoins / pegged assets that must never be traded.
# Stored as a frozenset so the exclusion list cannot be mutated at runtime.
_STABLECOIN_SYMBOLS = frozenset({
'USDCUSDT', 'BUSDUSDT', 'FDUSDUSDT', 'USDTUSDT', 'TUSDUSDT',
'DAIUSDT', 'FRAXUSDT', 'USDDUSDT', 'USTCUSDT', 'EURUSDT',
})
sys.path.insert(0, '/mnt/dolphinng5_predict')
sys.path.insert(0, '/mnt/dolphinng5_predict/nautilus_dolphin')
from nautilus_dolphin.nautilus.proxy_boost_engine import create_d_liq_engine
from nautilus_dolphin.nautilus.esf_alpha_orchestrator import NDPosition
from nautilus_dolphin.nautilus.adaptive_circuit_breaker import AdaptiveCircuitBreaker
from nautilus_dolphin.nautilus.ob_features import OBFeatureEngine
from nautilus_dolphin.nautilus.ob_provider import MockOBProvider
from nautilus_dolphin.nautilus.esof_size_gate import (
parse_esof_payload, esof_gate_from_payload, esof_score_from_payload,
esof_size_mult_from_score, ESOF_STALE_FALLBACK_MULT, ESOF_FRESHNESS_S,
)
from prod.clean_arch.adapters.eigen_scan_normalizer import normalize_ng7_scan
from prod.clean_arch.obf_tp_observation import inject_obf_midprice
from prod.clean_arch.tp_curve import compute_our_leverage, compute_soft_tp_pct
try:
sys.path.insert(0, '/mnt/dolphinng5_predict/Observability')
from esof_advisor import compute_esof as _compute_esof_inline
except Exception:
_compute_esof_inline = None
try:
from adaptive_exit.market_state_runtime import MarketStateRuntime
except Exception:
MarketStateRuntime = None
try:
from adaptive_exit.advanced_sl import AdvancedSLRuntime
except Exception:
AdvancedSLRuntime = None
try:
from adaptive_exit.sc_threshold_advisor import SCThresholdAdvisor
except Exception:
SCThresholdAdvisor = None
try:
from adaptive_exit.sc_gauge_advisor import SCGaugeAdvisor, build_obf_snapshot_from_engine
except Exception:
SCGaugeAdvisor = None
build_obf_snapshot_from_engine = None
try:
from adaptive_exit.bounce_advisor import BounceAdvisor
except Exception:
BounceAdvisor = None
try:
from adaptive_exit.post_win_long_overlay import PostWinExecutionFSM
except Exception:
PostWinExecutionFSM = None
try:
from nautilus_dolphin.nautilus.alpha_exit_v7_engine import AlphaExitEngineV7, TradeContextV7
except Exception:
AlphaExitEngineV7 = None
TradeContextV7 = None
BLUE_CH_DB = "dolphin"
try:
from prod.ch_writer import ch_put, ts_us as _ch_ts_us
except ImportError:
def ch_put(*a, **kw): pass
def _ch_ts_us(): return 0
try:
from prod.execution_quality import build_execution_quality_record
from prod.execution_quality import build_trade_execution_quality_summary
except Exception:
build_execution_quality_record = None
build_trade_execution_quality_summary = None
try:
from announcement_router import build_announcement_center
except ImportError:
from prod.announcement_router import build_announcement_center
sys.path.insert(0, '/mnt/dolphinng5_predict/prod')
from dolphin_exit_handler import install_exit_handler
install_exit_handler("nautilus_trader")
from prod.clean_arch.runtime.runner_heartbeat import (
build_runner_heartbeat_payload,
write_runner_heartbeat,
)
# Hazelcast connection settings (cluster name and member address).
# NOTE(review): the connect routine that consumes these is outside this view.
HZ_CLUSTER = "dolphin"
HZ_HOST = "127.0.0.1:5701"
# Directory of per-date eigenvalue folders; _build_engine lists its
# sub-directory names and passes them to the ACB preload.
EIGEN_DIR = Path('/mnt/dolphinng6_data/eigenvalues')
# Capital persistence / corrective-replay locations.
# NOTE(review): readers and writers of these paths are outside this view.
CAPITAL_DISK_CHECKPOINT = Path("/tmp/dolphin_capital_checkpoint.json")
CAPITAL_CORRECTIVE_REPLAY = Path("/tmp/dolphin_latest_nautilus_replay.json")
CAPITAL_UPDATE_LEDGER = Path("/tmp/dolphin_capital_update_ledger.json")
CAPITAL_CORRECTIVE_REPLAY_HZ_KEY = "capital_correction_replay"
# Announcement-center config, plus a JSON file whose flattened keys seed
# os.environ (see DolphinLiveTrader.__init__ and _seed_runtime_env).
ANNOUNCEMENT_CONFIG = Path("/mnt/dolphinng5_predict/prod/configs/position_notifications_blue.json")
ANNOUNCEMENT_RUNTIME_ENV = Path("/mnt/dolphin_training/observability_notifications_blue.runtime.json")
# Economic dust floor for OPEN position_state rows and retract remainders.
# A remainder at/below this is a FULL CLOSE, never an OPEN snapshot. The
# lifecycle invariant "OPEN ⇒ size > dust" is enforced at the single write
# gate (_ps_write_open); zero/dust-size OPEN rows are the malformed class
# behind the 2026-06-11 restore restart-loop (MALFORMED_OPEN_RESTORE_BUG.md).
# $0.01 sits far above the round(notional,4)=0 boundary (5e-5), so a row
# that passes the gate can never round to a zero notional on disk.
POSITION_DUST_NOTIONAL_USD = 0.01
# Keyword arguments for create_d_liq_engine. _build_engine copies this dict
# and may override allow_subday_acb_exit from DOLPHIN_ALLOW_ACB_SUBDAY_EXIT.
# fixed_tp_pct is also read by DolphinLiveTrader.__init__ as the base TP.
ENGINE_KWARGS = dict(
initial_capital=25000.0, vel_div_threshold=-0.02, vel_div_extreme=-0.05,
min_leverage=0.5, max_leverage=8.0, # note: create_d_liq_engine overrides to D_LIQ_SOFT_CAP=8.0
leverage_convexity=3.0,
fraction=0.20, fixed_tp_pct=0.0020, stop_pct=1.0, max_hold_bars=250, # TP research 2026-05-11: 0.95→0.20%
use_direction_confirm=True, dc_lookback_bars=7, dc_min_magnitude_bps=0.75,
dc_skip_contradicts=True, dc_leverage_boost=1.0, dc_leverage_reduce=0.5,
use_asset_selection=True, min_irp_alignment=0.0, # gold spec: no IRP filter
use_sp_fees=True, use_sp_slippage=True,
sp_maker_entry_rate=0.62, sp_maker_exit_rate=0.50,
use_ob_edge=True, ob_edge_bps=5.0, ob_confirm_rate=0.40,
lookback=100, use_alpha_layers=True, use_dynamic_leverage=True, seed=42,
allow_subday_acb_exit=False,
)
def _env_bool(name: str, default: bool) -> bool:
raw = os.environ.get(name)
if raw is None:
return default
return str(raw).strip().lower() in {"1", "true", "yes", "on"}
def _env_float(name: str, default: float) -> float:
raw = os.environ.get(name)
if raw is None:
return default
try:
value = float(raw)
except (TypeError, ValueError):
return default
return value if math.isfinite(value) else default
def _env_int(name: str, default: int) -> int:
raw = os.environ.get(name)
if raw is None:
return default
try:
value = int(float(raw))
except (TypeError, ValueError):
return default
return value
def _direction_from_env(value: Optional[str] = None) -> int:
raw = os.environ.get("DOLPHIN_DIRECTION", "short_only") if value is None else value
text = str(raw or "short_only").strip().lower()
if text in {"short", "short_only", "sell", "-1"}:
return -1
if text in {"long", "long_only", "buy", "+1", "1"}:
return 1
raise ValueError(
f"Unsupported DOLPHIN_DIRECTION={raw!r}; use short_only or long_only"
)
def _direction_label(direction: int) -> str:
return "LONG" if int(direction) == 1 else "SHORT"
def _normalize_v7_exit_reason(reason: str) -> str:
text = str(reason or "").strip()
if text == "V7_MAE_SL_VOL_NORM":
return "V7.1_MAE_SL_VOL_NORM"
return text
def _safe_float(value, default: float = 0.0) -> float:
try:
out = float(value)
except (TypeError, ValueError):
return default
return out if math.isfinite(out) else default
def _flatten_env_payload(payload, prefix: str = "") -> dict:
flat = {}
if not isinstance(payload, dict):
return flat
for key, value in payload.items():
if not isinstance(key, str) or not key.strip():
continue
full_key = f"{prefix}_{key}" if prefix else key
if isinstance(value, dict):
flat.update(_flatten_env_payload(value, full_key))
else:
flat[full_key.upper()] = value
return flat
def _seed_runtime_env(path: Path) -> None:
if not path.exists():
return
try:
payload = json.loads(path.read_text())
except Exception:
return
for key, value in _flatten_env_payload(payload).items():
if key not in os.environ and value not in (None, "", "__CHANGE_ME__", "__REPLACE_ME__"):
os.environ[key] = str(value)
# Sizes the rolling btc_prices deque (maxlen = BTC_VOL_WINDOW + 2) created in
# DolphinLiveTrader.__init__.
BTC_VOL_WINDOW = 50
# Per-bucket SL % used when HIBERNATE fires while a position is open.
# Instead of immediate HIBERNATE_HALT, we arm TP (existing fixed_tp_pct) +
# a per-bucket stop-loss so the position exits cleanly rather than being
# force-closed at whatever price the halt fires at.
# Values derived from AE shadow data + bucket trade analysis (2026-04-19).
# B3 wide: shadow shows mae_norm 5-5.1 before FIXED_TP; 3.5×ATR fires on noise.
# B4 tight: 34.8% WR, 0.80 R:R — cut fast, no recovery value.
# B6 widest: extreme vol (vol_daily_pct 760-864); normal ATR excursions are large.
# Keys are integer bucket ids plus the string 'default'; values are fractions
# (0.015 is logged as 1.5% by _load_bucket_assignments).
_BUCKET_SL_PCT: dict = {
0: 0.015, # Low-vol high-corr nano-cap
1: 0.012, # Med-vol low-corr mid-price (XRP/XLM class)
2: 0.015, # Mega-cap BTC/ETH — default (not traded)
3: 0.025, # High-vol mid-corr STAR bucket (ENJ/ADA/DOGE) — needs room
4: 0.008, # Worst bucket (BNB/LTC/LINK) — cut fast
5: 0.018, # High-vol low-corr micro-price (ATOM/TRX class)
6: 0.030, # Extreme-vol mid-corr (FET/ZRX) — widest
'default': 0.015,
}
# Gold-calibrated from full 5-year BTC history: 0.00026414 (stricter, ~2.7x tighter).
# 2026-04-07: switched to 56-day gold window value (0.00009868) — the exact threshold
# used in the T=2155 ROI=+189% backtest. More permissive; paper trading to gather data.
# 2026-05-09 weekend mode: runtime-configurable lower gate for low-vol tape.
#
# Legacy references preserved:
# VOL_P60_THRESHOLD_LEGACY_MAIN = 0.00026414
# VOL_P60_THRESHOLD_GOLD_56D = 0.00009868
#
# NOTE(review): _vol_p60_threshold_from_env defaults to
# VOL_P60_THRESHOLD_LEGACY_MAIN, not the 56-day value described above, unless
# DOLPHIN_VOL_P60_THRESHOLD is set — confirm this is the intended live default.
VOL_P60_THRESHOLD_LEGACY_MAIN = 0.00026414
VOL_P60_THRESHOLD_GOLD_56D = 0.00009868
VOL_P60_THRESHOLD_WEEKEND_DEFAULT = 0.00003
VOL_P60_THRESHOLD_RELAXED_TEMP = 0.00015838
# Backward-compatible alias retained for older tests and tooling.
VOL_P60_THRESHOLD = VOL_P60_THRESHOLD_LEGACY_MAIN
def _vol_p60_threshold_from_env(default: float = VOL_P60_THRESHOLD_LEGACY_MAIN) -> float:
    """Resolve the volatility gate threshold from ``DOLPHIN_VOL_P60_THRESHOLD``.

    The override is accepted only when it parses to a finite, strictly
    positive float; in every other case (unset, unparsable, NaN, infinite,
    zero or negative) ``float(default)`` is returned.
    """
    raw = os.environ.get("DOLPHIN_VOL_P60_THRESHOLD")
    if raw is not None:
        try:
            candidate = float(str(raw).strip())
        except Exception:
            return float(default)
        if math.isfinite(candidate) and candidate > 0.0:
            return float(candidate)
    return float(default)
# Algorithm Versioning
# v1_shakedown: v50-v150 (noise bug), loose vol gate
# v2_gold_fix: CORRECTED v50-v750 macro divergence (matches parquet backtest)
# ALGO_VERSION is embedded in the daily log file name by _trade_log_paths.
ALGO_VERSION = "v2_gold_fix_v50-v750"
# Persistent, version-tagged trade log (survives reboots; sorts by date).
# Keep a local fallback path so mount hiccups never break runtime callbacks.
_LOG_DIR_PRIMARY = "/mnt/dolphinng5_predict/prod/logs"
_LOG_DIR_FALLBACK = "/tmp/dolphin_logs/trader"
# Timestamp of the last LOG_PATH_FALLBACK warning; log() emits that warning at
# most once per 60 seconds.
_LOG_IO_LAST_WARN_TS = 0.0
# NOTE(review): presumably the main-loop run flag cleared by a signal handler;
# its readers/writers are outside this view.
running = True
_PROCESS_BOOT_TS = time.time()
_SIGTERM_STARTUP_GRACE_S = 20.0
# ── Scan-flow watchdog (2026-06-10) ──────────────────────────────────────────
# BLUE went deaf 3× on 2026-06-09 (scan listener/worker stalled silently while
# supervisord showed RUNNING) and lost most of a trading session. The watchdog
# detects a stalled scan path and self-exits with WATCHDOG_EXIT_CODE so
# supervisord (autorestart=true) brings the process back clean. Restore of
# capital + position state on boot is the proven recovery path.
SCAN_STALL_S = 120.0 # scan path considered stalled after this
WATCHDOG_RESTART_MIN_UPTIME_S = 600.0 # never self-restart during warm-up
WATCHDOG_PROBE_INTERVAL_S = 30.0 # spacing between HZ deafness probes
UPSTREAM_DARK_LOG_EVERY_S = 300.0 # CRITICAL reminder cadence when dark
WATCHDOG_EXIT_CODE = 86
# Scanner restarts reset scan_number to 0. A backwards jump larger than this
# is a restart (accept + re-anchor ratchet), not a stale duplicate (drop).
SCAN_NUMBER_RESET_GAP = 1000
def _trade_log_paths(ts_dt: datetime) -> tuple[str, str]:
    """Return ``(primary_path, fallback_path)`` of the daily trade log.

    Both paths share one file name built from the ``YYYYMMDD`` date of
    *ts_dt* and ``ALGO_VERSION``; they differ only in directory
    (``_LOG_DIR_PRIMARY`` and ``_LOG_DIR_FALLBACK``).
    """
    log_date = ts_dt.strftime("%Y%m%d")
    fname = f"nautilus_trader_{log_date}_{ALGO_VERSION}.log"
    primary = os.path.join(_LOG_DIR_PRIMARY, fname)
    fallback = os.path.join(_LOG_DIR_FALLBACK, fname)
    return primary, fallback
def log(msg):
    """Print a UTC-timestamped line and append it to the daily trade log.

    The line always goes to stdout first. It is then appended to the primary
    log file; if that fails with ``OSError`` a ``LOG_PATH_FALLBACK`` warning
    is printed (at most once per 60 seconds) and the line is appended to the
    fallback log instead. Any failure of the fallback write is swallowed,
    because stdout already carries the line.
    """
    global _LOG_IO_LAST_WARN_TS
    ts_dt = datetime.now(timezone.utc)
    ts = ts_dt.isoformat()
    line = f"[{ts}] {msg}"
    print(line, flush=True)
    primary_path, fallback_path = _trade_log_paths(ts_dt)
    record = line + '\n'
    try:
        os.makedirs(_LOG_DIR_PRIMARY, exist_ok=True)
        with open(primary_path, 'a') as sink:
            sink.write(record)
    except OSError as e:
        # Throttle the warning so a dead mount does not flood stdout.
        now = time.time()
        if now - _LOG_IO_LAST_WARN_TS >= 60.0:
            _LOG_IO_LAST_WARN_TS = now
            print(f"[{ts}] LOG_PATH_FALLBACK: primary log write failed: {e}", flush=True)
    else:
        return
    try:
        os.makedirs(_LOG_DIR_FALLBACK, exist_ok=True)
        with open(fallback_path, 'a') as sink:
            sink.write(record)
    except Exception:
        # Last-resort: stdout still has the log line.
        pass
def _chain_digest(payload: dict) -> str:
"""Stable digest for BLUE exit-chain state."""
body = json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str).encode()
return hashlib.sha256(body).hexdigest()
def _build_chain_state(
    *,
    trade_id: str,
    asset: str,
    side: str,
    entry_price: float,
    quantity: float,
    notional: float,
    entry_bar: int,
    entry_ts: int,
    retraction_legs: int = 0,
    realized_pnl_legs_total: float = 0.0,
    chain_root_trade_id: str | None = None,
    chain_head_leg_id: str | None = None,
    chain_prev_leg_id: str = "",
    chain_mode: str = "LIVE",
) -> dict:
    """Build a deterministic chain snapshot for the current open trade head.

    The sequence number is ``retraction_legs`` clamped at zero. When no
    explicit head leg id is given it defaults to ``"<trade_id>:open"`` for
    sequence 0 and ``"<trade_id>:xNNN"`` otherwise. Numeric fields are
    rounded to 12 decimals before hashing. ``chain_token`` is the digest of
    the snapshot *before* ``chain_token``, ``chain_version`` and
    ``chain_kind`` are added.
    """
    seq = max(0, int(retraction_legs))
    is_root = seq <= 0
    default_head = f"{trade_id}:open" if is_root else f"{trade_id}:x{seq:03d}"
    anchor = {
        "trade_id": str(trade_id or ""),
        "chain_root_trade_id": str(chain_root_trade_id or trade_id or ""),
        "chain_head_leg_id": str(chain_head_leg_id or default_head),
        "chain_prev_leg_id": str(chain_prev_leg_id or ""),
        "chain_seq": seq,
        "chain_mode": str(chain_mode or "LIVE"),
        "asset": str(asset or ""),
        "side": str(side or "").upper(),
        "entry_price": round(float(entry_price or 0.0), 12),
        "quantity": round(float(quantity or 0.0), 12),
        "notional": round(float(notional or 0.0), 12),
        "entry_bar": int(entry_bar or 0),
        "entry_ts": int(entry_ts or 0),
        "retraction_legs": seq,
        "realized_pnl_legs_total": round(float(realized_pnl_legs_total or 0.0), 12),
    }
    # The token is computed first (argument evaluation), then the three
    # bookkeeping fields are appended in their original order.
    anchor.update(
        chain_token=_chain_digest(anchor),
        chain_version=1,
        chain_kind="ROOT" if is_root else "LEG",
    )
    return anchor
class DolphinLiveTrader:
def __init__(self):
    """Set up the trader's in-memory state.

    Handles to the engine and Hazelcast maps start as ``None``. Locks,
    single-worker executors, watchdog timestamps and shadow-advisor slots
    are created. Tunables are read from ``DOLPHIN_*`` environment variables
    and optional components are instantiated only when their import
    succeeded. Finally the announcement runtime env is seeded and the
    announcement center is built if its config file exists.
    """

    def nonneg_float(var: str, fallback: float) -> float:
        # Environment float clamped to be >= 0.
        return max(0.0, _env_float(var, fallback))

    # Engine and Hazelcast handles start unset.
    self.eng = None
    self.hz_client = None
    self.features_map = self.safety_map = self.pnl_map = None
    self.state_map = self.heartbeat_map = self.control_map = None

    # Synchronisation primitives and the single-worker scan executor.
    self.eng_lock = threading.Lock()
    self._heartbeat_stop = threading.Event()
    self._runtime_command_lock = threading.Lock()
    self._dedup_lock = threading.Lock()  # guards atomic check-and-set on last_scan_number
    self._scan_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="scan")
    self.last_scan_number = -1

    # Scan-flow watchdog state. Event ts proves the HZ listener is alive;
    # accept ts proves the worker thread is draining; the dupe counter
    # separates "worker stuck" from "upstream flooding duplicates".
    self._last_scan_event_ts = time.time()
    self._last_scan_accept_ts = time.time()
    self._dupe_drops_total = 0
    self._watchdog_stop = threading.Event()
    self._probe_executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="wdprobe")

    # Scan / bar bookkeeping.
    self.last_file_mtime = 0
    self.bar_idx = 0
    self.current_day = None
    self.trades_executed = 0
    self.scans_processed = 0
    self.btc_prices = deque(maxlen=BTC_VOL_WINDOW + 2)
    self.cached_posture = "APEX"
    self.posture_cache_time = 0
    self.ob_assets = []
    self.ob_eng = None
    self.acb = None
    self.last_w750_vel = None
    self._pending_entries: dict = {}  # trade_id → entry snapshot (for CH trade_events)
    self._last_exf: dict = {}
    self._last_engine_snapshot_payload = None
    self._exf_log_time = 0.0  # throttle for on_exf_update logging

    # Exit engines and their per-trade state.
    self._ae = None  # AdaptiveExitEngine shadow (parallel, never real exits)
    self._v7_exit_engine = None  # AlphaExitEngineV7 live BLUE exit control + journal
    self._v7_contexts: dict = {}  # trade_id → TradeContextV7
    self._v7_decisions: dict = {}  # trade_id → latest v7 decision
    self._v7_decision_seq: dict = {}  # trade_id → monotonic eval sequence
    self._v7_journal_enabled: bool = _env_bool("DOLPHIN_ENABLE_V7_JOURNAL", True)
    self._v7_journal_db: str = BLUE_CH_DB
    self._v7_journal_table: str = "v7_decision_events"
    self._v7_live_exit_enabled: bool = False

    # Shadow-only advisors and their log throttles.
    self._sc_advisor = None  # SC threshold advisor (shadow-only)
    self._sc_advisor_last_log = 0.0
    self._sc_gauge = None  # SC bucket gauge advisor (shadow-only)
    self._sc_gauge_last_log = 0.0
    self._bounce_advisor = None  # inverse-ARS bounce advisor (shadow-only)
    self._bounce_advisor_last_log = 0.0
    self._bounce_price_history: dict[str, deque] = {}
    self._last_prices_dict: dict[str, float] = {}

    # Optional runtimes: built only when the import at module top succeeded.
    self._market_state_runtime = None
    if MarketStateRuntime is not None:
        self._market_state_runtime = MarketStateRuntime()
    self._tp_base_pct = float(ENGINE_KWARGS.get("fixed_tp_pct", 0.0020))
    self._advanced_sl = None
    if AdvancedSLRuntime is not None:
        self._advanced_sl = AdvancedSLRuntime.load()
    self._advanced_sl_live_exit_enabled: bool = _env_bool("DOLPHIN_ENABLE_ADVANCED_SL_LIVE", False)
    if self._advanced_sl is not None and self._advanced_sl_live_exit_enabled:
        self._advanced_sl.config = replace(self._advanced_sl.config, enabled=True)

    # Loss floors and overlay AdvancedSL tunables (all clamped to >= 0).
    self._catastrophic_floor_pct: float = nonneg_float("DOLPHIN_CATASTROPHIC_FLOOR_PCT", 0.0120)
    self._overlay_catastrophic_floor_pct: float = nonneg_float(
        "DOLPHIN_OVERLAY_CATASTROPHIC_FLOOR_PCT", 0.0050
    )
    self._overlay_catastrophic_max_loss_usd: float = nonneg_float(
        "DOLPHIN_OVERLAY_CATASTROPHIC_MAX_LOSS_USD", 500.0
    )
    self._overlay_advsl_live_exit_enabled: bool = _env_bool("DOLPHIN_OVERLAY_ADVSL_LIVE", True)
    self._overlay_advsl_min_bars: int = max(0, _env_int("DOLPHIN_OVERLAY_ADVSL_MIN_BARS", 6))
    self._overlay_advsl_mfe_max_pct: float = nonneg_float("DOLPHIN_OVERLAY_ADVSL_MFE_MAX_PCT", 0.0020)
    self._overlay_advsl_pressure_min: float = nonneg_float("DOLPHIN_OVERLAY_ADVSL_PRESSURE_MIN", 1.85)
    self._overlay_advsl_mae_risk_min: float = nonneg_float("DOLPHIN_OVERLAY_ADVSL_MAE_RISK_MIN", 0.50)

    self._hibernate_protect_active: str | None = None  # trade_id being protected
    self._bucket_assignments: dict = {}  # asset → KMeans bucket_id (loaded from pkl)
    self._last_esof_size_mult: float = 1.0
    self._restore_failed: bool = False
    self._restore_failure_reason: str = ""
    self._restore_source: str = ""

    # Direction and volatility gate; _direction_from_env raises ValueError
    # on an unsupported DOLPHIN_DIRECTION value.
    self.trade_direction: int = _direction_from_env()
    self.vol_p60_threshold: float = _vol_p60_threshold_from_env()
    self._runtime_direction: int = self.trade_direction
    self._efsm = None
    if PostWinExecutionFSM is not None:
        self._efsm = PostWinExecutionFSM()

    self._trade_announcement_center = None
    self._processed_retract_commands: deque = deque(maxlen=5000)
    self._processed_retract_set: set[str] = set()

    # Announcement center: seed env first, then build if the config exists.
    _seed_runtime_env(ANNOUNCEMENT_RUNTIME_ENV)
    if ANNOUNCEMENT_CONFIG.exists():
        try:
            self._trade_announcement_center = build_announcement_center(
                ANNOUNCEMENT_CONFIG,
                hz_getter=self._get_hz,
                logger=None,
            )
            log(" Position announcements: loaded")
        except Exception as e:
            log(f" Position announcements: {e}")
            self._trade_announcement_center = None
    if self._efsm is not None:
        log(" EFSM: loaded (post-win LONG overlay)")
    if self._advanced_sl is not None:
        log(" AdvancedSL: loaded (shadow prototype)")
def _get_hz(self):
"""Return a live Hazelcast client for announcement channels."""
hz = self.hz_client
if hz is None:
return None
try:
if not hz.lifecycle_service.is_running():
return None
except Exception:
return None
return hz
def _latest_maras_context(self) -> dict:
"""Best-effort MARAS context for meta exit gates."""
try:
if self.features_map is None:
return {}
raw = self.features_map.blocking().get("maras_latest")
if not raw:
return {}
payload = json.loads(raw) if isinstance(raw, str) else raw
if not isinstance(payload, dict):
return {}
return {
"composite_hash": payload.get("composite_hash", payload.get("hash", 0)),
"scalar_hash": payload.get("scalar_hash", 0),
"regime": payload.get("regime", ""),
"final_score": payload.get("final_score", 0.0),
"confidence": payload.get("confidence", 0.0),
}
except Exception:
return {}
def _resolve_runtime_direction(self) -> int:
"""Resolve active trade direction for the next eligible entry."""
base = int(self.trade_direction)
if base != -1 or self._efsm is None:
return base
with self.eng_lock:
has_open_position = getattr(self.eng, "position", None) is not None
if has_open_position:
return base
return 1 if int(self._efsm.pending_slots) > 0 else base
def _apply_runtime_direction(self) -> None:
"""Apply current runtime direction to the engine regime."""
resolved = self._resolve_runtime_direction()
with self.eng_lock:
if getattr(self.eng, "regime_direction", self.trade_direction) != resolved:
self.eng.regime_direction = resolved
self._runtime_direction = resolved
def _build_engine(self):
    """Create the trading engine and wire its optional collaborators.

    Steps, in order: build the engine from ``ENGINE_KWARGS`` (with the
    sub-day ACB exit flag overridable via env), set the TP profit-floor
    flag, attach the adaptive circuit breaker, zero the esoteric hazard
    multiplier, wire the MC forewarner if its model directory exists, load
    the shadow AdaptiveExitEngine, load AlphaExitEngineV7 and register its
    decision provider, load bucket assignments, and load the three
    shadow advisors. Every optional step logs its failure and continues.
    """
    log("Building NDAlphaEngine...")
    engine_kwargs = dict(ENGINE_KWARGS)
    subday_default = bool(engine_kwargs.get("allow_subday_acb_exit", False))
    engine_kwargs["allow_subday_acb_exit"] = _env_bool(
        "DOLPHIN_ALLOW_ACB_SUBDAY_EXIT",
        subday_default,
    )
    self.eng = create_d_liq_engine(**engine_kwargs)

    # TP profit-floor ratchet (LINK 5e05eeeb, 2026-06-11): once the BASE
    # 0.20% TP has been crossed, regression back to base exits (TP_FLOOR)
    # instead of riding the OB-widened threshold back to a loss. Class
    # default is OFF (backtest/champion parity); live default is ON.
    # Kill switch: DOLPHIN_TP_FLOOR=0.
    self.eng.exit_manager.tp_floor_enabled = _env_bool("DOLPHIN_TP_FLOOR", True)

    log(f" Engine: {type(self.eng).__name__}")
    log(f" TP profit-floor: {'ON' if self.eng.exit_manager.tp_floor_enabled else 'OFF'}")
    log(f" Direction: {_direction_label(self.trade_direction)} ({self.trade_direction:+d})")
    log(
        " VOL gate threshold: "
        f"{self.vol_p60_threshold:.8f} "
        f"(legacy_main={VOL_P60_THRESHOLD_LEGACY_MAIN:.8f}, gold_56d={VOL_P60_THRESHOLD_GOLD_56D:.8f}, "
        f"relaxed_temp={VOL_P60_THRESHOLD_RELAXED_TEMP:.7f})"
    )
    log(f" ACB subday exits: {'ON' if engine_kwargs['allow_subday_acb_exit'] else 'OFF'}")
    log(f" Leverage: soft={self.eng.base_max_leverage}x abs={self.eng.abs_max_leverage}x")

    # Adaptive circuit breaker: preload w750 dates when the eigen dir exists.
    if not EIGEN_DIR.exists():
        self.acb = AdaptiveCircuitBreaker()
        self.eng.set_acb(self.acb)
        log(" ACBv6: loaded (no preload dates)")
    else:
        try:
            date_strings = sorted(d.name for d in EIGEN_DIR.iterdir() if d.is_dir())
            self.acb = AdaptiveCircuitBreaker()
            self.acb.preload_w750(date_strings)
            self.eng.set_acb(self.acb)
            log(" ACBv6: loaded")
        except Exception as e:
            log(f" ACBv6: {e}")

    self.eng.set_esoteric_hazard_multiplier(0.0)  # gold spec: init guard, MUST precede set_mc_forewarner
    log(f" Hazard: set_esoteric_hazard_multiplier(0.0) — soft={self.eng.base_max_leverage}x")

    # MC forewarner: wired only when the model directory is present.
    mc_models_dir = '/mnt/dolphinng5_predict/nautilus_dolphin/mc_results/models'
    mc_base_cfg = {
        'trial_id': 0, 'vel_div_threshold': -0.020, 'vel_div_extreme': -0.050,
        'use_direction_confirm': True, 'dc_lookback_bars': 7,
        'dc_min_magnitude_bps': 0.75, 'dc_skip_contradicts': True,
        'dc_leverage_boost': 1.00, 'dc_leverage_reduce': 0.50,
        'vd_trend_lookback': 10, 'min_leverage': 0.50, 'max_leverage': 8.00,  # gold spec
        'leverage_convexity': 3.00, 'fraction': 0.20, 'use_alpha_layers': True,
        'use_dynamic_leverage': True, 'fixed_tp_pct': 0.0020, 'stop_pct': 1.00,
        'max_hold_bars': 250, 'use_sp_fees': True, 'use_sp_slippage': True,  # gold spec
        'sp_maker_entry_rate': 0.62, 'sp_maker_exit_rate': 0.50,
        'use_ob_edge': True, 'ob_edge_bps': 5.00, 'ob_confirm_rate': 0.40,
        'ob_imbalance_bias': -0.09, 'ob_depth_scale': 1.00,
        'use_asset_selection': True, 'min_irp_alignment': 0.0,
        'asset_selector_lookback': 10, 'lookback': 100,  # gold spec
        'acb_beta_high': 0.80, 'acb_beta_low': 0.20, 'acb_w750_threshold_pct': 60,
    }
    if Path(mc_models_dir).exists():
        try:
            from mc.mc_ml import DolphinForewarner
            forewarner = DolphinForewarner(models_dir=mc_models_dir)
            self.eng.set_mc_forewarner(forewarner, mc_base_cfg)
            log(" MC-Forewarner: wired")
        except Exception as e:
            log(f" MC-Forewarner: {e}")

    # Shadow adaptive exit engine (imported lazily; failure only disables it).
    try:
        from adaptive_exit.adaptive_exit_engine import AdaptiveExitEngine
        self._ae = AdaptiveExitEngine.load()
        log(" AdaptiveExitEngine: loaded (shadow mode — no real exits)")
    except Exception as e:
        log(f" AdaptiveExitEngine: {e} — shadow disabled")

    # V7 exit engine: needs both the import and the journal flag.
    if AlphaExitEngineV7 is not None and self._v7_journal_enabled:
        try:
            self._v7_exit_engine = AlphaExitEngineV7(bar_duration_sec=11.0)
            self._ensure_v7_journal_table()
            log(" AlphaExitEngineV7: loaded (live BLUE exit control + journal)")
        except Exception as e:
            log(f" AlphaExitEngineV7: {e} — shadow disabled")
            self._v7_exit_engine = None
    self._v7_live_exit_enabled = self._v7_exit_engine is not None
    if self.eng is not None:
        provider = self._v7_live_exit_decision if self._v7_live_exit_enabled else None
        self.eng.exit_decision_provider = provider

    self._load_bucket_assignments()

    # Shadow advisors: each loads independently and is reset to None on failure.
    if SCThresholdAdvisor is not None:
        try:
            self._sc_advisor = SCThresholdAdvisor.load(
                strategy="blue",
                shadow_db=BLUE_CH_DB,
            )
            log(" SCThresholdAdvisor: loaded (shadow mode — no sizing changes)")
        except Exception as e:
            log(f" SCThresholdAdvisor: {e} — shadow disabled")
            self._sc_advisor = None
    if SCGaugeAdvisor is not None:
        try:
            self._sc_gauge = SCGaugeAdvisor.load(
                strategy="blue",
                shadow_db=BLUE_CH_DB,
            )
            log(" SCGaugeAdvisor: loaded (shadow mode — no sizing changes)")
        except Exception as e:
            log(f" SCGaugeAdvisor: {e} — shadow disabled")
            self._sc_gauge = None
    if BounceAdvisor is not None:
        try:
            self._bounce_advisor = BounceAdvisor.load(
                strategy="blue",
                shadow_db=BLUE_CH_DB,
            )
            log(" BounceAdvisor: loaded (shadow mode — no execution changes)")
        except Exception as e:
            log(f" BounceAdvisor: {e} — shadow disabled")
            self._bounce_advisor = None
def _load_bucket_assignments(self):
    """Load KMeans asset→bucket_id mapping for hibernate protection SL levels.

    Reads the ``assignments`` entry of the pickled model file into
    ``_bucket_assignments``. On any failure the current mapping is left
    untouched and a message naming the default SL percentage is logged.
    """
    try:
        import pickle
        pkl_path = Path('/mnt/dolphinng5_predict/adaptive_exit/models/bucket_assignments.pkl')
        # NOTE(review): pickle executes arbitrary code on load; this assumes
        # the model file is locally produced and trusted — confirm.
        with pkl_path.open('rb') as handle:
            data = pickle.load(handle)
        self._bucket_assignments = data.get('assignments', {})
        log(f" BucketAssignments: {len(self._bucket_assignments)} assets loaded for hibernate protection")
    except Exception as e:
        log(f" BucketAssignments: {e} — hibernate protect will use default SL={_BUCKET_SL_PCT['default']*100:.1f}%")
def _announce_position_event(
self,
*,
kind: str,
severity: str,
title: str,
message: str,
metadata: dict | None = None,
) -> None:
center = getattr(self, "_trade_announcement_center", None)
if center is None:
return
try:
center.note_event(
kind=kind,
severity=severity,
title=title,
message=message,
metadata=metadata or {},
)
except Exception as e:
log(f" Position announcement failed: {e}")
def _read_esof_payload(self) -> dict | None:
"""Read the freshest EsoF advisory payload from HZ, if available."""
if not self.features_map:
return None
for key in ("esof_latest", "esof_advisor_latest"):
try:
raw = self.features_map.blocking().get(key)
except Exception:
continue
payload = parse_esof_payload(raw)
if payload:
return payload
return None
def _sync_esof_size_gate(self) -> None:
    """Update the shared engine with the current continuous EsoF size multiplier.

    The score is taken from the HZ payload when it is fresh enough
    (``ESOF_FRESHNESS_S``). When that yields no score and the inline
    ``compute_esof`` import is available, the score is computed inline
    instead — the same implementation from esof_advisor.py, no parallel
    code. The score (possibly ``None``) is pushed to the engine under the
    engine lock if the engine exposes ``set_esof_advisory_score``. A log
    line is written only when the derived multiplier changes.
    """
    hz_payload = self._read_esof_payload()
    score = esof_score_from_payload(hz_payload, max_age_s=ESOF_FRESHNESS_S)
    source = "hz"
    if score is None and _compute_esof_inline is not None:
        try:
            inline = _compute_esof_inline()
            score = esof_score_from_payload(inline, max_age_s=None)
            if score is not None:
                source = "inline"
        except Exception:
            # Best-effort fallback: score stays None and the stale
            # multiplier path below applies.
            pass
    mult = esof_size_mult_from_score(score)
    with self.eng_lock:
        if hasattr(self.eng, "set_esof_advisory_score"):
            self.eng.set_esof_advisory_score(score)
    if mult == self._last_esof_size_mult:
        return
    self._last_esof_size_mult = mult
    if score is None:
        log(f"EsoF size gate: STALE-FALLBACK mult={mult:.2f} (no HZ + no inline)")
    elif source == "inline":
        log(f"EsoF size gate: INLINE sc={score:+.3f} mult={mult:.2f} (HZ stale)")
    else:
        log(f"EsoF size gate: sc={score:+.3f} mult={mult:.2f}")
def _tp_curve_context(self, *, notional: float | None = None) -> dict[str, Any]:
    """Compute the leverage-adjusted take-profit context.

    When *notional* is not supplied it is derived from the open position:
    its ``notional`` attribute, or ``size * entry_price`` if that is not
    positive, or 0.0 when there is no position. Leverage comes from
    ``compute_our_leverage`` and the effective TP from
    ``compute_soft_tp_pct``.

    Returns a dict with ``tp_base_pct``, ``tp_effective_pct``,
    ``our_leverage`` and ``market_state_bundle_json`` (the market-state
    runtime's latest bundle as sorted JSON, or ``"{}"`` when absent).
    """
    pos = getattr(self.eng, "position", None)
    capital = float(getattr(self.eng, "capital", 0.0) or 0.0)
    if notional is None:
        notional = 0.0
        if pos is not None:
            notional = _safe_float(getattr(pos, "notional", 0.0), 0.0)
            if notional <= 0.0:
                # Fall back to size × entry price when no notional is stored.
                notional = _safe_float(
                    getattr(pos, "size", 0.0) * getattr(pos, "entry_price", 0.0),
                    0.0,
                )
    our_leverage = compute_our_leverage(notional=notional, capital=capital)
    tp_effective_pct = compute_soft_tp_pct(self._tp_base_pct, our_leverage)
    runtime = self._market_state_runtime
    bundle = {}
    if runtime is not None and getattr(runtime, "latest_bundle_dict", None):
        bundle = dict(runtime.latest_bundle_dict)
    if bundle:
        bundle_json = json.dumps(bundle, default=str, sort_keys=True)
    else:
        bundle_json = "{}"
    return {
        "tp_base_pct": float(self._tp_base_pct),
        "tp_effective_pct": float(tp_effective_pct),
        "our_leverage": float(our_leverage),
        "market_state_bundle_json": bundle_json,
    }
def _sync_tp_threshold(self) -> None:
"""Read live TP threshold from HZ control plane and propagate to engine.
HZ key: DOLPHIN_FEATURES["live_tp_threshold"] JSON {"tp_pct": 0.0020, "ts": ...}
If absent or stale, keeps the current default (0.0020 from ENGINE_KWARGS).
A tighter TP cuts open positions immediately; a wider TP extends the hold.
"""
try:
ctx = self._tp_curve_context()
tp_pct = float(ctx.get("tp_effective_pct", 0.0) or 0.0)
if tp_pct <= 0:
return
with self.eng_lock:
old = self.eng.set_live_tp_pct(tp_pct)
if abs(old - tp_pct) > 1e-6:
log(
f"TP threshold: {old*100:.2f}% → {tp_pct*100:.2f}% "
f"(soft curve, lev={ctx.get('our_leverage', 0.0):.2f}x)"
)
except Exception:
pass
def _inject_obf_midprice(self, prices_dict: dict) -> dict:
"""Override scan price for the open position's asset with live OB mid-price.
Scan prices are quantized to ~4 decimal places (e.g. 0.1255 vs 0.1256),
which is too coarse for a 0.20% TP on low-priced assets. The OBF universe
service has live WebSocket bid/ask at ~0.5s resolution with full precision.
This method substitutes the scan price with (best_bid + best_ask) / 2 for
the position's asset only, so TP evaluation sees the freshest available
observation without changing the TP threshold itself.
"""
try:
pos = self.eng.position
if pos is None or not pos.asset:
return prices_dict
raw = self.features_map.blocking().get("obf_universe_latest")
return inject_obf_midprice(
prices_dict,
position_asset=str(pos.asset or ""),
obf_payload=raw,
max_age_s=3.0,
now_s=time.time(),
)
except Exception:
return prices_dict
def _sync_sc_threshold_advisor(self, scan_number: int, vel_div: float) -> None:
    """Run the shadow SC-threshold advisor for the current scan.

    Advisory only: the recommendation is attached to the first pending
    entry (when one exists), handed to ``_record_sc_haircut``, and
    summarized in the log at most once every 300 seconds.  Does nothing
    when no advisor is configured; any error is logged and swallowed.
    """
    advisor = self._sc_advisor
    if advisor is None:
        return
    try:
        esof = self._read_esof_payload()
        history = getattr(self.eng, "trade_history", [])
        # The first pending entry (if any) is treated as the open trade.
        tid = next(iter(self._pending_entries.keys()), "")
        entry = self._pending_entries.get(tid, {}) if tid else {}
        advice = advisor.evaluate(
            trade_id=str(tid or ""),
            asset=str(entry.get("asset", "")),
            sc=_safe_float(esof.get("advisory_score", esof.get("score", 0.0)) if esof else None),
            vel_div=float(vel_div or 0.0),
            exf_snapshot=getattr(self, "_last_exf", {}) or {},
            trade_history=history,
            current_mult=float(self._last_esof_size_mult or 1.0),
            esof_payload=esof,
            scan_number=int(scan_number or 0),
            bar_idx=int(self.bar_idx),
            strategy="blue",
            log_shadow=True,
        )
        if tid:
            entry["sc_threshold_advisor"] = advice
            entry["sc_exec_mult"] = float(self._last_esof_size_mult or 1.0)
            self._pending_entries[tid] = entry
            try:
                self._record_sc_haircut(trade_id=tid, pending=entry, source="sc_threshold")
            except Exception as e:
                log(f"SC haircut record failed for {tid}: {e}")
        stamp = time.time()
        if stamp - self._sc_advisor_last_log >= 300:
            self._sc_advisor_last_log = stamp
            log(
                f"SC_ADVISOR: sc={advice['sc']:+.3f} cur={advice['current_mult']:.2f} "
                f"rec={advice['recommended_mult']:.2f} cut={advice['recommended_sc_cut']:+.2f} "
                f"conf={advice['confidence']:.2f} src={advice['decision_source']}"
            )
    except Exception as e:
        log(f"SC_ADVISOR error: {e}")
def _current_obf_snapshot(self, asset: str, bar_idx: int) -> dict[str, dict]:
    """Return the OBF snapshot for ``asset`` at ``bar_idx``, or ``{}``.

    An empty dict is returned when the snapshot builder is not available,
    no order-book engine is attached, ``asset`` is empty, or the builder
    raises.
    """
    builder = build_obf_snapshot_from_engine
    if builder is not None and self.ob_eng is not None and asset:
        try:
            return builder(self.ob_eng, asset, bar_idx)
        except Exception:
            # Snapshot is optional input; treat any failure as "no data".
            pass
    return {}
def _record_bounce_prices(self, prices_dict: dict[str, float]) -> None:
    """Append the latest prices to the per-asset rolling histories.

    Values that cannot be converted to ``float``, are not finite, or are
    not strictly positive are skipped.  Each asset keeps at most its 512
    most recent prices.
    """
    if not prices_dict:
        return
    histories = self._bounce_price_history
    for symbol, raw_px in prices_dict.items():
        try:
            value = float(raw_px)
        except Exception:
            continue
        if math.isfinite(value) and value > 0.0:
            window = histories.get(symbol)
            if window is None:
                # First valid price for this asset: start a bounded window.
                window = histories[symbol] = deque(maxlen=512)
            window.append(value)
def _bounce_price_path(self, asset: str) -> list[float]:
    """Return the recorded price history for ``asset`` as a list of floats.

    Non-finite entries are dropped; an unknown asset or an empty history
    yields an empty list.
    """
    window = self._bounce_price_history.get(asset)
    if not window:
        return []
    return [value for value in map(float, window) if math.isfinite(value)]
def _bounce_eval(
self,
*,
trade_id: str,
asset: str,
side: str,
source: str,
scan_number: int,
entry_ts: datetime | None,
current_price: float,
entry_price: float,
quantity: float,
notional: float,
leverage: float,
vel_div: float,
current_mult: float,
bars_held: int,
log_shadow: bool = True,
) -> dict | None:
"""Evaluate the bounce advisor on a rolling price path and persist the row."""
if self._bounce_advisor is None or not trade_id or not asset:
return None
price_path = self._bounce_price_path(asset)
if len(price_path) < 3:
return None
rec = self._bounce_advisor.evaluate(
trade_id=str(trade_id),
asset=str(asset),
side=str(side or "SHORT"),
price_path=price_path,
entry_ts=entry_ts or datetime.now(timezone.utc),
entry_price=float(entry_price or 0.0),
current_price=float(current_price or 0.0),
quantity=float(quantity or 0.0),
notional=float(notional or 0.0),
leverage=float(leverage or 0.0),
current_mult=float(current_mult or 1.0),
vel_div=float(vel_div or 0.0),
scan_number=int(scan_number or 0),
bar_idx=int(self.bar_idx),
bars_held=int(max(0, bars_held)),
source=str(source or "entry"),
obf_snapshot=self._current_obf_snapshot(asset, self.bar_idx),
log_shadow=log_shadow,
use_ta=True,
use_obf=True,
)
if rec:
rec["price_path"] = price_path[-128:]
return rec
def _ensure_v7_journal_table(self) -> None:
    """Send the ``CREATE TABLE IF NOT EXISTS`` DDL for the V7 decision journal.

    The statement is POSTed to the ClickHouse HTTP endpoint on localhost.
    A failure is logged and otherwise ignored.
    """
    # NOTE(review): the ClickHouse user/key below are embedded in source.
    # The DDL text is reproduced exactly as found.
    ddl = f"""
CREATE TABLE IF NOT EXISTS {self._v7_journal_db}.{self._v7_journal_table}
(
ts DateTime64(6, 'UTC'),
ts_day Date MATERIALIZED toDate(ts),
strategy LowCardinality(String),
source LowCardinality(String),
trade_id String,
asset LowCardinality(String),
side LowCardinality(String),
entry_price Float64,
current_price Float64,
quantity Float64,
notional Float64,
leverage Float32,
bar_idx UInt32,
decision_seq UInt32,
bars_held UInt16,
action LowCardinality(String),
reason LowCardinality(String),
pnl_pct Float32,
mfe Float32,
mae Float32,
mfe_risk Float32,
mae_risk Float32,
exit_pressure Float32,
rv_comp Float32,
mae_thresh1 Float32,
bounce_score Float32,
bounce_risk Float32,
ob_imbalance Float32,
vel_div_entry Float32,
vel_div_now Float32,
v50_vel Float32,
v750_vel Float32,
exf_funding Float32,
exf_dvol Float32,
exf_fear_greed Float32,
exf_taker Float32,
posture LowCardinality(String),
tp_base_pct Float32 DEFAULT 0,
dynamic_tp_pct Float32 DEFAULT 0,
tp_mod_factor Float32 DEFAULT 0,
cascade_count UInt16 DEFAULT 0,
ob_regime_signal Int8 DEFAULT 0,
tp_floor_armed UInt8 DEFAULT 0
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(ts)
ORDER BY (ts_day, trade_id, decision_seq, ts)
TTL ts_day + toIntervalDay(180)
"""
    try:
        request = urllib.request.Request(
            "http://localhost:8123/",
            data=ddl.encode(),
            headers={
                "X-ClickHouse-User": "dolphin",
                "X-ClickHouse-Key": "dolphin_ch_2026",
            },
            method="POST",
        )
        urllib.request.urlopen(request, timeout=5).close()
    except Exception as exc:
        log(f"[V7_JOURNAL] table ensure failed: {exc}")
def _record_v7_decision(
self,
*,
trade_id: str,
asset: str,
side: str,
decision: dict,
current_price: float,
ob_imbalance: float,
vel_div_now: float,
v50_vel: float,
v750_vel: float,
source: str = "scan_eval",
bar_idx: int | None = None,
) -> None:
"""Persist a V7 evaluation for observability and offline comparison."""
if not self._v7_journal_enabled or self._v7_exit_engine is None:
return
pending = self._pending_entries.get(trade_id, {})
seq = int(self._v7_decision_seq.get(trade_id, 0)) + 1
self._v7_decision_seq[trade_id] = seq
entry_price = float(pending.get("entry_price", 0.0) or 0.0)
quantity = float(pending.get("quantity", 0.0) or 0.0)
row = {
"ts": _ch_ts_us(),
"strategy": "blue",
"source": source,
"trade_id": str(trade_id or ""),
"asset": str(asset or pending.get("asset", "")),
"side": str(side or pending.get("side", "")),
"entry_price": entry_price,
"current_price": float(current_price or 0.0),
"quantity": quantity,
"notional": float(quantity * entry_price),
"leverage": float(pending.get("leverage", 0.0) or 0.0),
"bar_idx": int(max(0, self.bar_idx - 1 if bar_idx is None else bar_idx)),
"decision_seq": seq,
"bars_held": int(decision.get("bars_held", 0) or 0),
"action": str(decision.get("action", "UNKNOWN") or "UNKNOWN"),
"reason": _normalize_v7_exit_reason(decision.get("reason") or ""),
"pnl_pct": float(decision.get("pnl_pct", 0.0) or 0.0),
"mfe": float(decision.get("mfe", 0.0) or 0.0),
"mae": float(decision.get("mae", 0.0) or 0.0),
"mfe_risk": float(decision.get("mfe_risk", 0.0) or 0.0),
"mae_risk": float(decision.get("mae_risk", 0.0) or 0.0),
"exit_pressure": float(decision.get("exit_pressure", 0.0) or 0.0),
"rv_comp": float(decision.get("rv_comp", 0.0) or 0.0),
"mae_thresh1": float(decision.get("mae_thresh1", 0.0) or 0.0),
"bounce_score": float(decision.get("bounce_score", 0.0) or 0.0),
"bounce_risk": float(decision.get("bounce_risk", 0.0) or 0.0),
"ob_imbalance": float(ob_imbalance or 0.0),
"vel_div_entry": float(pending.get("vel_div_entry", 0.0) or 0.0),
"vel_div_now": float(vel_div_now or 0.0),
"v50_vel": float(v50_vel or 0.0),
"v750_vel": float(v750_vel or 0.0),
"exf_funding": float(self._last_exf.get("funding", 0.0) or 0.0),
"exf_dvol": float(self._last_exf.get("dvol", 0.0) or 0.0),
"exf_fear_greed": float(self._last_exf.get("fear_greed", 0.0) or 0.0),
"exf_taker": float(self._last_exf.get("taker", 0.0) or 0.0),
"posture": str(pending.get("posture", self.cached_posture) or ""),
}
# TP-threshold observability (LINK 5e05eeeb incident, 2026-06-11):
# the EFFECTIVE TP gate is OB-modulated (cascade ×1.40 etc.) and was
# never logged — making the miss undiagnosable from the tape. Pull
# the exit manager's last evaluation for this trade; fall back to
# any diag fields carried on the decision dict itself.
try:
_tp_diag = dict(getattr(self.eng.exit_manager, "last_eval", {}) or {})
if str(_tp_diag.get("trade_id") or "") != str(trade_id or ""):
_tp_diag = {}
except Exception:
_tp_diag = {}
def _dg(key, default=0.0):
v = decision.get(key, _tp_diag.get(key, default))
return v if v is not None else default
row.update({
"tp_base_pct": float(_dg("tp_base_pct")),
"dynamic_tp_pct": float(_dg("dynamic_tp_pct")),
"tp_mod_factor": float(_dg("tp_mod_factor")),
"cascade_count": int(_dg("cascade_count", 0)),
"ob_regime_signal": int(_dg("ob_regime_signal", 0)),
"tp_floor_armed": 1 if _dg("tp_floor_armed", False) else 0,
})
try:
ch_put(self._v7_journal_table, row)
except Exception as exc:
log(f"[V7_JOURNAL] write failed: {exc}")
def _v7_live_exit_decision(
self,
*,
pos,
bar_idx: int,
prices: dict,
vel_div: float,
v50_vel: float,
v750_vel: float,
) -> dict | None:
"""Live BLUE exit hook backed by AlphaExitEngineV7.
The orchestrator calls this before falling back to the base exit manager.
Returns a V7 decision dict or None if the trade cannot yet be evaluated.
"""
if self._v7_exit_engine is None or pos is None:
return None
trade_id = str(getattr(pos, "trade_id", "") or "")
asset = str(getattr(pos, "asset", "") or "")
if not trade_id or not asset:
return None
pending = self._pending_entries.get(trade_id, {})
ctx_v7 = self._v7_contexts.get(trade_id)
eval_bar = max(0, int(bar_idx) - 1)
if ctx_v7 is None:
try:
ctx_v7 = self._v7_exit_engine.make_context(
entry_price=float(
pending.get("entry_price", getattr(pos, "entry_price", 0.0))
or getattr(pos, "entry_price", 0.0)
or 0.0
),
entry_bar=int(pending.get("entry_bar", eval_bar) or eval_bar),
side=1 if str(pending.get("side", "SHORT") or "SHORT") == "SHORT" else 0,
)
if self._last_exf:
ctx_v7.set_exf(
funding=float(self._last_exf.get("funding", 0.0) or 0.0),
dvol=float(self._last_exf.get("dvol", 0.0) or 0.0),
fear_greed=float(self._last_exf.get("fear_greed", 0.0) or 0.0),
taker=float(self._last_exf.get("taker", 0.0) or 0.0),
)
self._v7_contexts[trade_id] = ctx_v7
self._v7_decision_seq.setdefault(trade_id, 0)
except Exception as exc:
log(f" V7 live context init failed for {trade_id}: {exc}")
return None
elif self._last_exf:
try:
ctx_v7.set_exf(
funding=float(self._last_exf.get("funding", 0.0) or 0.0),
dvol=float(self._last_exf.get("dvol", 0.0) or 0.0),
fear_greed=float(self._last_exf.get("fear_greed", 0.0) or 0.0),
taker=float(self._last_exf.get("taker", 0.0) or 0.0),
)
except Exception:
pass
ob_imb = 0.0
if self.ob_eng is not None:
try:
ob_sig = self.ob_eng.get_signal(asset, float(eval_bar))
ob_imb = float(getattr(ob_sig, "imbalance_ma5", 0.0) or 0.0)
except Exception as exc:
log(f" V7 live OB signal failed for {trade_id}: {exc}")
cur_px = float(
prices.get(asset, getattr(pos, "current_price", 0.0))
or getattr(pos, "current_price", 0.0)
or 0.0
)
if cur_px <= 0.0:
return None
decision = self._v7_exit_engine.evaluate(
ctx_v7,
cur_px,
eval_bar,
ob_imb,
asset=asset,
)
self._v7_decisions[trade_id] = decision
self._record_v7_decision(
trade_id=trade_id,
asset=asset,
side=str(pending.get("side", "SHORT") or "SHORT"),
decision=decision,
current_price=cur_px,
ob_imbalance=ob_imb,
vel_div_now=vel_div,
v50_vel=v50_vel,
v750_vel=v750_vel,
source="live_exit",
bar_idx=eval_bar,
)
action = str(decision.get("action", "HOLD") or "HOLD")
if action != "HOLD":
log(
" V7 live decision: "
f"{trade_id} {asset} action={action} reason={decision.get('reason', '')} "
f"pressure={float(decision.get('exit_pressure', 0.0) or 0.0):+.3f} "
f"pnl_pct={float(decision.get('pnl_pct', 0.0) or 0.0):+.3f}"
)
return decision
def _sync_sc_gauge_advisor(self, scan_number: int, vel_div: float) -> None:
    """Run the shadow SC bucket-gauge advisor for the current scan.

    Advisory only: the recommendation is attached to the first pending
    entry (when one exists) and summarized in the log at most once every
    300 seconds.  Does nothing when no gauge is configured; any error is
    logged and swallowed.
    """
    gauge = self._sc_gauge
    if gauge is None:
        return
    try:
        esof = self._read_esof_payload()
        history = getattr(self.eng, "trade_history", [])
        # The first pending entry (if any) is treated as the open trade.
        tid = next(iter(self._pending_entries.keys()), "")
        entry = self._pending_entries.get(tid, {}) if tid else {}
        asset = str(entry.get("asset", ""))
        advice = gauge.evaluate(
            trade_id=str(tid or ""),
            asset=asset,
            sc=_safe_float(esof.get("advisory_score", esof.get("score", 0.0)) if esof else None),
            vel_div=float(vel_div or 0.0),
            exf_snapshot=getattr(self, "_last_exf", {}) or {},
            obf_snapshot=self._current_obf_snapshot(asset, self.bar_idx),
            trade_history=history,
            current_mult=float(self._last_esof_size_mult or 1.0),
            esof_payload=esof,
            scan_number=int(scan_number or 0),
            bar_idx=int(self.bar_idx),
            strategy="blue",
            log_shadow=True,
        )
        if tid:
            entry["sc_bucket_gauge"] = advice
            entry["sc_bucket_gauge_exec_mult"] = float(self._last_esof_size_mult or 1.0)
            self._pending_entries[tid] = entry
        stamp = time.time()
        if stamp - self._sc_gauge_last_log >= 300:
            self._sc_gauge_last_log = stamp
            log(
                f"SC_GAUGE: sc={advice['sc']:+.3f} bucket={advice['bucket_id']} "
                f"cur={advice['current_mult']:.2f} rec={advice['recommended_size_mult']:.2f} "
                f"tp={advice['recommended_tp_mult']:.2f} hold={advice['recommended_hold_mult']:.2f} "
                f"cut={advice['recommended_sc_cut']:+.2f} conf={advice['confidence']:.2f}"
            )
    except Exception as e:
        log(f"SC_GAUGE error: {e}")
def _resolve_trade_id(self, explicit: str | None = None, *, create_if_missing: bool = False) -> str:
"""Resolve a trade_id from the event, live position, or pending entry."""
tid = str(explicit or "").strip()
if tid:
return tid
pos = getattr(self.eng, "position", None)
if pos is not None:
pos_tid = str(getattr(pos, "trade_id", "") or "").strip()
if pos_tid:
return pos_tid
if len(self._pending_entries) == 1:
pending_tid = next(iter(self._pending_entries.keys()))
if pending_tid:
return pending_tid
if create_if_missing:
return uuid.uuid4().hex[:16]
return ""
def _query_clickhouse_tsv(
    self,
    sql: str,
    *,
    db_candidates: tuple[str, ...] = ("dolphin", "dolphin_prodgreen"),
    timeout: float = 5.0,
) -> tuple[str, str]:
    """Run a small query over the ClickHouse HTTP interface.

    Each database in ``db_candidates`` is tried in order; the first one
    that answers wins.

    Returns:
        ``(raw_text, db_used)`` where ``raw_text`` is the stripped body.

    Raises:
        The exception from the last failed attempt, or ``RuntimeError``
        when no candidate was attempted at all.
    """
    import base64 as _b64

    # NOTE(review): credentials are embedded in source.
    token = _b64.b64encode(b"dolphin:dolphin_ch_2026").decode()
    headers = {"Authorization": "Basic " + token}
    failure: Exception | None = None
    for db in db_candidates:
        try:
            request = urllib.request.Request(
                f"http://localhost:8123/?database={db}",
                data=sql.encode(),
                headers=headers,
            )
            with urllib.request.urlopen(request, timeout=timeout) as response:
                body = response.read().decode().strip()
            return body, db
        except Exception as exc:
            # Remember the failure and move on to the next database.
            failure = exc
    raise failure or RuntimeError("ClickHouse query failed")
def _parse_capital_blob(self, raw, source: str) -> tuple[float, dict] | None:
"""Parse a HZ/JSON state blob and validate the capital payload."""
if not raw:
return None
try:
BLUE hardening: spool-poison guards, dead-session clock fix, HZ black-box, RETRACT race-safety Seven uncommitted production fixes to BLUE's main runner that the LIVE process has already been running since the 2026-06-15 17:23 restart (file mtime 17:17, pid started 17:23). Each fix answers a documented incident; committing now so they survive in history and a stray checkout can't silently revert running-config code on the next restart. 1. bars_held = max(0, int(...)) at BOTH journal sites (terminal + sub-day). CH column is UInt16 — a negative value poisons the spool with a head-of-line jam (incident 2026-06-12: bars_held=-106). 2. entry_bar = int(restored_entry_bar) at BOTH reconstruction sites; NEVER from chain_meta. trade_reconstruction payloads carry the DEAD session's bar counter, so the old override reinstated the stale clock frame the re-anchor exists to fix → negative bars_held → same UInt16 spool poison (zombie-trade resurrections, incident 2026-06-12). restored_entry_bar already encodes hold continuity via stored_bars in THIS session's frame. 3. capital parse handles list/ledger-style payloads: when the restore blob is a list of update rows, take the latest dict row instead of falling through to {} and losing the capital anchor. 4. _connect_hz routes the `hazelcast` logger to stderr at INFO. The silent-HZ-death investigation found ZERO client log lines because nothing routed them; without this the reactor's health is invisible. 5. _dump_blackbox(reason): forensic thread dump before a watchdog restart — lifecycle.is_running, active_connections, every thread's stack, and a flag when any hazelcast/reactor-named thread is MISSING (= reactor died, the prime suspect for the silent 40min–8h client deaths). print()-only, CIFS-safe. _watchdog_restart calls it first. 6. _drain_runtime_commands / _process_runtime_commands gain `*, allow_retract=True`; the heartbeat path drains with allow_retract=False and re-queues any RETRACT commands. 
A RETRACT can force a terminal close that must run through the scan-thread close finalizer, so the heartbeat must not race it. 7. +import traceback (for the black-box stack dumps). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 12:03:20 +02:00
data = json.loads(raw) if isinstance(raw, str) else raw
if isinstance(data, list):
# ledger-style payload (list of update rows): use the latest row
data = next((e for e in reversed(data) if isinstance(e, dict)), {})
if not isinstance(data, dict):
data = {}
capital = float(data.get("capital", 0) or 0)
if capital >= 1.0 and math.isfinite(capital):
return capital, data
log(f" restore candidate rejected from {source}: capital={capital!r}")
except Exception as exc:
log(f" restore candidate parse failed from {source}: {exc}")
return None
def _parse_timestamp_seconds(self, value) -> float | None:
"""Parse epoch/ISO timestamps into UTC epoch seconds."""
if value is None:
return None
try:
if isinstance(value, (int, float)):
ts = float(value)
elif isinstance(value, str):
text = value.strip()
if not text:
return None
try:
ts = float(text)
except ValueError:
dt = datetime.fromisoformat(text.replace("Z", "+00:00"))
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
ts = dt.timestamp()
else:
return None
if not math.isfinite(ts):
return None
# Normalize millisecond / microsecond / nanosecond epochs down to seconds.
# CH event clocks are often stored as ts_us, while HZ blobs tend to be seconds.
scale_hops = 0
while ts > 1.0e11 and scale_hops < 4:
ts /= 1000.0
scale_hops += 1
return ts if ts > 0 else None
except Exception:
return None
def _extract_state_timestamp(self, blob: dict) -> float | None:
"""Extract the best timestamp from a state blob."""
if not isinstance(blob, dict):
return None
for key in ("updated_at", "timestamp", "ts", "iso"):
if key not in blob:
continue
parsed = self._parse_timestamp_seconds(blob.get(key))
if parsed is not None:
return parsed
return None
def _mark_restore_failure(self, reason: str) -> None:
    """Record a failed restore and push the engine into its halt posture.

    Sets the restore-failure flag and reason on ``self``; then, under
    ``eng_lock`` and only if an engine exists, sets ``regime_dd_halt`` and
    switches ``_day_posture`` to ``"HIBERNATE"``.  Errors while touching
    the engine are ignored; the halt is always logged.
    """
    self._restore_failed = True
    self._restore_failure_reason = reason
    try:
        with self.eng_lock:
            engine = self.eng
            if engine is not None:
                engine.regime_dd_halt = True
                engine._day_posture = "HIBERNATE"
    except Exception:
        # Best-effort: the failure flags above are already recorded.
        pass
    log(f"RESTORE HALT: {reason}")
def _restore_capital_from_legacy_checkpoint(self) -> bool:
    """Legacy escape hatch: restore capital from the old scalar checkpoint.

    Only active when ``DOLPHIN_ALLOW_LEGACY_CAPITAL_CHECKPOINT`` is
    enabled.  Tries the HZ ``capital_checkpoint`` key first, then the disk
    checkpoint file.  Returns ``True`` as soon as one source yields a valid
    capital, ``False`` otherwise.
    """
    if not _env_bool("DOLPHIN_ALLOW_LEGACY_CAPITAL_CHECKPOINT", False):
        return False

    def _adopt(raw, source):
        """Apply ``raw`` as the engine capital if it validates."""
        result = self._parse_capital_blob(raw, source)
        if result is None:
            return False
        amount = result[0]
        self.eng.capital = amount
        self._restore_source = source
        log(f" Capital restored from legacy {source}: ${amount:,.2f}")
        return True

    # Source 1: the Hazelcast state map.
    try:
        hz_raw = self.state_map.blocking().get("capital_checkpoint")
        if _adopt(hz_raw, "HZ capital_checkpoint"):
            return True
    except Exception as e:
        log(f" capital HZ legacy restore failed: {e}")

    # Source 2: the on-disk checkpoint file.
    try:
        if CAPITAL_DISK_CHECKPOINT.exists():
            disk_raw = CAPITAL_DISK_CHECKPOINT.read_text()
            if _adopt(disk_raw, "disk capital_checkpoint"):
                return True
    except Exception as e:
        log(f" capital disk legacy restore failed: {e}")

    return False
def _restore_capital_from_state(self) -> bool:
"""Restore capital from live HZ state or ledger-backed snapshots."""
# Overview: collect capital candidates into parsed_state as
# (label, capital, blob, ts_seconds) tuples from local files, the HZ state
# map, the HZ pnl map and ClickHouse; choose one via
# _select_restore_candidate(); otherwise try the legacy checkpoint; otherwise
# mark the restore as failed. Returns True when a capital was applied.
parsed_state = {}
# Cache of the latest_nautilus / engine_snapshot blobs read below.
self._restore_state_snapshots = {}
# Tie-break rank: the selector sorts by (timestamp, rank), so rank only
# decides between candidates that carry the same timestamp.
# NOTE(review): "capital_update_ledger_local" has no entry here and ranks 0
# via .get(key, 0); the selector returns it early whenever it is present.
source_rank = {
"capital_update_ledger": 65,
"status_snapshots": 50,
"latest_nautilus": 40,
"engine_snapshot": 30,
"pnl_day": 25,
"correction_replay_local": 20,
"correction_replay_hz": 10,
"trade_events": 5,
}
# --- Candidate: local corrective-replay file ---
if CAPITAL_CORRECTIVE_REPLAY.exists():
try:
replay_blob = json.loads(CAPITAL_CORRECTIVE_REPLAY.read_text())
replay_capital = _safe_float(replay_blob.get("capital", 0.0), 0.0)
replay_ts = replay_blob.get("updated_at") or replay_blob.get("ts")
# NOTE(review): this timestamp is parsed by hand rather than through
# self._parse_timestamp_seconds: numeric values are taken as-is (no
# ms/us scaling) and a naive ISO string goes through .timestamp()
# without an explicit UTC tzinfo. Confirm this is intended, since the
# timestamp drives candidate ordering.
replay_ts_f = None
if isinstance(replay_ts, (int, float)):
replay_ts_f = float(replay_ts)
elif isinstance(replay_ts, str):
try:
replay_ts_f = datetime.fromisoformat(replay_ts.replace("Z", "+00:00")).timestamp()
except Exception:
replay_ts_f = None
if replay_capital >= 1.0:
parsed_state["correction_replay_local"] = (
"local corrective replay",
replay_capital,
replay_blob,
replay_ts_f,
)
except Exception as e:
log(f" capital corrective replay read failed: {e}")
# --- Candidate: local capital-update ledger file (last row only) ---
try:
if CAPITAL_UPDATE_LEDGER.exists():
raw = CAPITAL_UPDATE_LEDGER.read_text()
ledger_rows = json.loads(raw) if raw else []
if isinstance(ledger_rows, list) and ledger_rows:
last = ledger_rows[-1] if isinstance(ledger_rows[-1], dict) else None
if isinstance(last, dict):
# Ledger rows are read via "capital_after", falling back to "capital".
capital_after = _safe_float(last.get("capital_after", last.get("capital", 0.0)), 0.0)
if capital_after >= 1.0 and math.isfinite(capital_after):
parsed_state["capital_update_ledger_local"] = (
"local capital_update_ledger",
capital_after,
dict(last),
self._extract_state_timestamp(last),
)
except Exception as e:
log(f" capital ledger disk read failed: {e}")
# --- Candidate: HZ capital-update ledger (last row only) ---
try:
raw_ledger = None
if self.state_map is not None:
raw_ledger = self.state_map.blocking().get("capital_update_ledger")
ledger_rows = json.loads(raw_ledger) if isinstance(raw_ledger, str) and raw_ledger else list(raw_ledger or [])
if isinstance(ledger_rows, list) and ledger_rows:
last = ledger_rows[-1] if isinstance(ledger_rows[-1], dict) else None
if isinstance(last, dict):
capital_after = _safe_float(last.get("capital_after", last.get("capital", 0.0)), 0.0)
if capital_after >= 1.0:
parsed_state["capital_update_ledger"] = (
"capital_update_ledger",
capital_after,
dict(last),
self._extract_state_timestamp(last),
)
except Exception as e:
log(f" capital ledger restore failed: {e}")
# --- Candidates: keys read from the HZ state map ---
# Each key is fetched from self.state_map and validated by
# _parse_capital_blob, which looks for a "capital" field; a successful parse
# overwrites any entry stored above under the same parsed_state key.
# NOTE(review): "capital_update_ledger_local" and "correction_replay_local"
# name local-file sources but are also looked up in HZ here; confirm these
# keys are expected to exist in the state map.
for key, label in (
("capital_update_ledger_local", "local capital_update_ledger"),
("capital_update_ledger", "capital_update_ledger"),
("correction_replay_local", "local corrective replay"),
(CAPITAL_CORRECTIVE_REPLAY_HZ_KEY, "HZ corrective replay"),
("latest_nautilus", "HZ latest_nautilus"),
("engine_snapshot", "HZ engine_snapshot"),
):
try:
raw = self.state_map.blocking().get(key)
except Exception as e:
log(f" capital {key} read failed: {e}")
raw = None
parsed = self._parse_capital_blob(raw, label)
if parsed is not None:
capital, blob = parsed
# The HZ corrective-replay key is stored under "correction_replay_hz".
parsed_key = (
"correction_replay_local"
if key == "correction_replay_local"
else "correction_replay_hz" if key == CAPITAL_CORRECTIVE_REPLAY_HZ_KEY else key
)
parsed_state[parsed_key] = (
label,
capital,
blob,
self._extract_state_timestamp(blob),
)
if key in ("latest_nautilus", "engine_snapshot") and isinstance(blob, dict):
self._restore_state_snapshots[key] = dict(blob)
# --- Candidate: today's (UTC) entry in the HZ pnl map ---
day_key = datetime.now(timezone.utc).strftime('%Y-%m-%d')
if self.pnl_map is not None:
try:
raw = self.pnl_map.blocking().get(day_key)
except Exception as e:
log(f" capital pnl_map[{day_key}] read failed: {e}")
raw = None
parsed = self._parse_capital_blob(raw, f"HZ pnl[{day_key}]")
if parsed is not None:
capital, blob = parsed
parsed_state["pnl_day"] = (
f"HZ pnl[{day_key}]",
capital,
blob,
self._extract_state_timestamp(blob),
)
# Selection policy. Defined here but called only after the ClickHouse
# queries below, so it also sees the status_snapshots / trade_events entries
# (it closes over parsed_state).
# Order: (1) latest_nautilus when DOLPHIN_FORCE_LATEST_NAUTILUS_RESTORE is
# set, (2) the local ledger file, (3) newest timestamp, rank as tie-break.
# Candidates without a timestamp sort last (-inf).
def _select_restore_candidate() -> tuple[str, str, float, dict, float | None] | None:
candidates: list[tuple[float, int, str, str, float, dict, float | None]] = []
for key, (label, capital, blob, ts) in parsed_state.items():
if not (math.isfinite(capital) and capital >= 1.0):
continue
candidates.append(
(
ts if ts is not None else float("-inf"),
source_rank.get(key, 0),
key,
label,
capital,
blob,
ts,
)
)
if not candidates:
return None
force_latest_seed = _env_bool("DOLPHIN_FORCE_LATEST_NAUTILUS_RESTORE", False)
if force_latest_seed and "latest_nautilus" in parsed_state:
label, capital, blob, ts = parsed_state["latest_nautilus"]
if math.isfinite(capital) and capital >= 1.0:
return "latest_nautilus", label, capital, blob, ts
if "capital_update_ledger_local" in parsed_state:
label, capital, blob, ts = parsed_state["capital_update_ledger_local"]
if math.isfinite(capital) and capital >= 1.0:
return "capital_update_ledger_local", label, capital, blob, ts
candidates.sort(key=lambda item: (item[0], item[1]), reverse=True)
_, _, key, label, capital, blob, ts = candidates[0]
return key, label, capital, blob, ts
# --- Candidates: ClickHouse (latest status snapshot, latest blue trade) ---
for sql, label in (
(
"SELECT ts, capital, trades_executed, posture, phase "
"FROM status_snapshots ORDER BY ts DESC LIMIT 1 FORMAT TabSeparated",
"status_snapshots",
),
(
"SELECT ts, capital_after, capital_before, pnl, exit_reason, trade_id "
"FROM trade_events "
"WHERE strategy='blue' AND capital_after > 0 "
"ORDER BY ts DESC LIMIT 1 FORMAT TabSeparated",
"trade_events",
),
):
try:
raw, db = self._query_clickhouse_tsv(sql)
if not raw:
continue
cols = raw.split("\t")
capital = None
if label == "status_snapshots" and len(cols) >= 2:
capital = float(cols[1])
parsed_state["status_snapshots"] = (
f"status_snapshots[{db}]",
capital,
{"capital": capital, "ts": cols[0]},
self._parse_timestamp_seconds(cols[0]),
)
elif label == "trade_events" and len(cols) >= 4:
cap_after = float(cols[1])
cap_before = float(cols[2])
pnl = float(cols[3])
# Sanity check: capital_after must match capital_before + pnl within
# max(1.0, 0.2% of the expected value); otherwise the row is rejected.
expected = cap_before + pnl
if math.isfinite(cap_after) and math.isfinite(expected):
if abs(cap_after - expected) <= max(1.0, abs(expected) * 0.002):
capital = cap_after
else:
log(
f" restore candidate rejected from {db}.{label}: "
f"capital_after={cap_after:.2f} expected={expected:.2f} "
f"exit_reason={cols[4] if len(cols) > 4 else ''}"
)
if capital is not None and math.isfinite(capital) and capital >= 1.0:
parsed_state["trade_events"] = (
f"{db}.{label}",
capital,
{"capital": capital, "ts": cols[0], "trade_id": cols[5] if len(cols) > 5 else ""},
self._parse_timestamp_seconds(cols[0]),
)
except Exception as e:
log(f" capital {label} replay failed: {e}")
# --- Apply the chosen candidate ---
chosen = _select_restore_candidate()
if chosen is not None:
key, label, capital, replay_blob, _ = chosen
self.eng.capital = capital
self._restore_source = label
# A corrective-replay winner is handed to _publish_corrective_replay.
if key in ("correction_replay_local", "correction_replay_hz"):
self._publish_corrective_replay(replay_blob)
log(f" Capital restored from {label}: ${capital:,.2f}")
return True
# Nothing usable: try the legacy checkpoint, else halt via _mark_restore_failure.
if self._restore_capital_from_legacy_checkpoint():
return True
self._mark_restore_failure("no sane capital source found (HZ state and ledger replay unavailable)")
return False
# ── CH position-state persistence ─────────────────────────────────────────
def _ps_write_open(
    self,
    tid: str,
    entry: dict,
    *,
    ts: int | None = None,
    entry_bar: int | None = None,
    bars_held: int = 0,
    pnl: float = 0.0,
) -> bool:
    """Persist an OPEN row to position_state. SINGLE write gate for OPEN rows.

    Lifecycle invariant (MALFORMED_OPEN_RESTORE_BUG.md, distal fix):
    an OPEN row MUST represent a position with economic size. Writes with
    quantity <= 0, notional <= POSITION_DUST_NOTIONAL_USD, or a non-finite
    size are REFUSED: a dust/zero remainder is a lifecycle CLOSE and must go
    through _ps_write_closed.

    The keyword overrides let the partial-retract path persist the
    remaining leg through this same gate (ts=now, continued entry_bar,
    accumulated bars_held / realized pnl) instead of bypassing it with a
    raw ch_put; the bypass is how zero-size OPEN snapshots were born.

    Args:
        tid: Trade id of the position.
        entry: Pending-entry dict (asset, side, entry_price, quantity,
            leverage, entry_ts, optional TP / market-state fields).
        ts: Row timestamp; defaults to ``entry['entry_ts']``.
        entry_bar: Entry bar to record; defaults to ``self.bar_idx``.
        bars_held: Bars held so far; negative values are stored as 0.
        pnl: Realized pnl to record on the row.

    Returns:
        True if the row was emitted, False if it was refused or the write
        failed (both cases are logged).
    """
    try:
        quantity = float(entry.get('quantity', 0.0) or 0.0)
        entry_price = float(entry.get('entry_price', 0.0) or 0.0)
        notional = round(quantity * entry_price, 4)
        has_size = (
            math.isfinite(quantity)
            and math.isfinite(notional)
            and quantity > 0.0
            and notional > POSITION_DUST_NOTIONAL_USD
        )
        if not has_size:
            log(
                " position_state OPEN write REFUSED (lifecycle invariant): "
                f"trade={tid} qty={quantity} notional={notional} — "
                "dust/zero remainders must close, not snapshot as OPEN"
            )
            return False
        market_state_bundle_json = str(entry.get("market_state_bundle_json", "") or "")
        ch_put("position_state", {
            "ts": int(ts if ts is not None else entry['entry_ts']),
            "trade_id": tid,
            "asset": entry['asset'],
            "direction": -1 if entry['side'] == 'SHORT' else 1,
            "entry_price": entry_price,
            "quantity": quantity,
            "notional": notional,
            "leverage": entry['leverage'],
            "bucket_id": int(getattr(self, "_bucket_assignments", {}).get(entry['asset'], -1)),
            "entry_bar": int(entry_bar if entry_bar is not None else self.bar_idx),
            "status": "OPEN",
            "exit_reason": "",
            "pnl": float(pnl),
            # A hold counter can never be negative; a stale clock frame must
            # not leak a negative value into the table.
            "bars_held": max(0, int(bars_held)),
            "market_state_bundle_json": market_state_bundle_json,
            "tp_base_pct": float(entry.get("tp_base_pct", 0.0) or 0.0),
            "tp_effective_pct": float(entry.get("tp_effective_pct", 0.0) or 0.0),
            "our_leverage": float(entry.get("our_leverage", 0.0) or 0.0),
        })
        return True
    except Exception as e:
        log(f" position_state OPEN write failed: {e}")
        return False
def _ps_write_closed(self, tid: str, pending: dict, x: dict):
    """Persist a CLOSED row to position_state on exit.

    The CLOSED row supersedes the OPEN row for the same trade (per the
    original note: via ReplacingMergeTree).  Startup restore picks the
    latest row per trade_id and resurrects it if it is still OPEN, so a
    dropped CLOSED write leaves a ghost position behind.  Numeric fields are
    therefore coerced defensively (``None`` becomes 0.0) instead of letting
    a missing size abort the whole write.

    Args:
        tid: Trade id being closed.
        pending: Pending-entry metadata for the trade (may be partial).
        x: Exit payload; ``reason``, ``net_pnl`` and ``bars_held`` are read.

    A write failure is logged, never raised.
    """
    try:
        asset = pending.get('asset', '')
        quantity = float(pending.get('quantity', 0.0) or 0.0)
        entry_price = float(pending.get('entry_price', 0.0) or 0.0)
        market_state_bundle_json = str(pending.get("market_state_bundle_json", "") or "")
        ch_put("position_state", {
            "ts": _ch_ts_us(),
            "trade_id": tid,
            "asset": asset,
            "direction": -1 if pending.get('side') == 'SHORT' else 1,
            "entry_price": entry_price,
            "quantity": quantity,
            "notional": round(quantity * entry_price, 4),
            "leverage": float(pending.get('leverage', 0.0) or 0.0),
            "bucket_id": int(getattr(self, "_bucket_assignments", {}).get(asset, -1)),
            "entry_bar": 0,
            "status": "CLOSED",
            "exit_reason": str(x.get('reason', 'UNKNOWN')),
            "pnl": float(x.get('net_pnl', 0) or 0),
            # A hold counter can never be negative.
            "bars_held": max(0, int(x.get('bars_held', 0) or 0)),
            "market_state_bundle_json": market_state_bundle_json,
            "tp_base_pct": float(pending.get("tp_base_pct", 0.0) or 0.0),
            "tp_effective_pct": float(pending.get("tp_effective_pct", 0.0) or 0.0),
            "our_leverage": float(pending.get("our_leverage", 0.0) or 0.0),
        })
    except Exception as e:
        log(f" position_state CLOSED write failed: {e}")
def _fallback_pending_for_close(self, trade_id: str, exit_payload: Mapping[str, Any]) -> dict:
    """Build a best-effort pending snapshot from an exit payload.

    Used when the in-memory pending metadata for a trade is unavailable.
    The side comes from ``exit_payload["side"]`` when it is SHORT or LONG,
    otherwise from ``direction`` (-1 means SHORT, anything else LONG;
    a missing direction defaults to -1).  Non-positive price, quantity and
    leverage are stored as 0.0.  The posture is tagged ``"FALLBACK_CLOSE"``.
    """
    side = str(exit_payload.get("side", "") or "").upper()
    if side not in {"SHORT", "LONG"}:
        direction = int(_safe_float(exit_payload.get("direction", -1), -1))
        side = "SHORT" if direction == -1 else "LONG"

    px = _safe_float(exit_payload.get("entry_price", 0.0), 0.0)
    qty = _safe_float(exit_payload.get("quantity", 0.0), 0.0)
    lev = _safe_float(exit_payload.get("leverage", 0.0), 0.0)
    symbol = str(exit_payload.get("asset", "") or "")

    snapshot = {
        "trade_id": str(trade_id or ""),
        "asset": symbol,
        "side": side,
        "entry_price": px if px > 0 else 0.0,
        "quantity": qty if qty > 0 else 0.0,
        # Notional only makes sense when both legs are positive.
        "notional": (px * qty) if px > 0 and qty > 0 else 0.0,
        "leverage": lev if lev > 0 else 0.0,
        "entry_date": str(self.current_day or ""),
        "posture": "FALLBACK_CLOSE",
        "vel_div_entry": 0.0,
        "boost_at_entry": 1.0,
        "beta_at_entry": 1.0,
    }
    return snapshot
def _restore_open_max_age_seconds(self) -> float:
    """Return the maximum age (seconds) an OPEN row may have during restore.

    Older rows are treated as stale ghost state.  The value comes from
    ``DOLPHIN_RESTORE_OPEN_MAX_AGE_SEC`` when it is a finite positive
    number; otherwise the default of 12 hours applies.
    """
    configured = _safe_float(os.environ.get("DOLPHIN_RESTORE_OPEN_MAX_AGE_SEC"), float("nan"))
    usable = math.isfinite(configured) and configured > 0
    return float(configured) if usable else 12.0 * 3600.0
def _restore_position_state(self):
    """On startup: check CH for an OPEN position and restore engine state.

    Flow, as implemented below:

    1. Query ClickHouse ``dolphin.position_state`` for the most recent
       trade whose *latest* row is OPEN with positive size.
    2. Validate the row.  Rejected candidates fall back to the Hazelcast
       snapshots (``latest_nautilus`` then ``engine_snapshot``), then to a
       "HZ says flat" proof, and finally to ``_mark_restore_failure``
       (except for the zero-size tombstone class, which continues flat).
    3. On success, rebuild the engine position, the exit-manager state,
       ``_pending_entries`` and (if configured) the V7 exit context, with
       ``entry_bar`` re-anchored on THIS session's ``bar_idx``.

    Any exception raised by the ClickHouse path is logged and routed to
    the same Hazelcast fallbacks.
    """
    try:
        import urllib.request
        import base64 as _b64

        # IMPORTANT:
        # Never filter status='OPEN' first, otherwise stale historical OPEN rows
        # can be resurrected forever even after a newer CLOSED row exists.
        # Resolve latest row per trade_id first, then keep only currently-OPEN.
        sql = (
            "SELECT trade_id, asset, direction, entry_price, quantity, "
            "notional, leverage, bucket_id, bars_held, last_ts "
            "FROM ("
            " SELECT "
            " trade_id, "
            " argMax(asset, ts) AS asset, "
            " argMax(direction, ts) AS direction, "
            " argMax(entry_price, ts) AS entry_price, "
            " argMax(quantity, ts) AS quantity, "
            " argMax(notional, ts) AS notional, "
            " argMax(leverage, ts) AS leverage, "
            " argMax(bucket_id, ts) AS bucket_id, "
            " argMax(bars_held, ts) AS bars_held, "
            " argMax(status, ts) AS status, "
            " argMax(ts, ts) AS last_ts "
            " FROM dolphin.position_state "
            " GROUP BY trade_id"
            ") "
            "WHERE status = 'OPEN' AND quantity > 0 AND notional > 0 "
            "ORDER BY last_ts DESC LIMIT 1 FORMAT TabSeparated"
        )

        def _restore_from_hz_snapshot(reason: str) -> bool:
            """Fallback restore path when ClickHouse is unavailable or empty.

            We prefer latest_nautilus/engine_snapshot because these are the live
            BLUE state surfaces and can still be coherent even if CH restore
            is temporarily unavailable. The restored open leg is re-seeded back
            into position_state so future restarts can recover without replaying
            the entire incident.

            Returns True when a position was restored from one of the
            snapshots, False when no source yielded a usable single open
            position (or chain reconstruction failed).
            """
            snapshot_sources = (
                ("latest_nautilus", "HZ latest_nautilus"),
                ("engine_snapshot", "HZ engine_snapshot"),
            )
            cached_snapshots = getattr(self, "_restore_state_snapshots", {}) or {}
            for key, label in snapshot_sources:
                blob = cached_snapshots.get(key)
                if not isinstance(blob, dict):
                    try:
                        raw = self.state_map.blocking().get(key)
                    except Exception as e:
                        log(f" {label} read failed during restore fallback: {e}")
                        raw = None
                    parsed = self._parse_capital_blob(raw, label)
                    if parsed is None:
                        continue
                    _, blob = parsed
                if not isinstance(blob, dict):
                    continue
                open_positions = blob.get("open_positions")
                # Only an unambiguous single open leg is restorable.
                if not isinstance(open_positions, list) or len(open_positions) != 1:
                    continue
                pos_blob = open_positions[0]
                if not isinstance(pos_blob, dict):
                    continue
                trade_id = str(pos_blob.get("trade_id", "") or "").strip()
                asset = str(pos_blob.get("asset", "") or "").strip()
                side = str(pos_blob.get("side", "") or "").upper()
                direction = -1 if side == "SHORT" else 1 if side == "LONG" else 0
                try:
                    entry_price = float(pos_blob.get("entry_price", 0.0) or 0.0)
                    quantity = float(pos_blob.get("quantity", 0.0) or 0.0)
                    notional = float(pos_blob.get("notional", quantity * entry_price) or 0.0)
                    leverage = float(pos_blob.get("leverage", 0.0) or 0.0)
                    stored_bars = int(pos_blob.get("bars_held", 0) or blob.get("bars_held", 0) or 0)
                except (TypeError, ValueError, OverflowError) as parse_err:
                    # A malformed snapshot must not abort the whole restore:
                    # this function is also called from the outer exception
                    # handler, where a raise would escape without marking a
                    # restore failure.  Skip to the next source instead.
                    log(f" {label} open position malformed during restore fallback: {parse_err}")
                    continue
                # Continuity formula identical to the CH path: anchor on
                # THIS session's bar counter (negative entry_bar is fine,
                # Int32 in CH) so bars_held resumes at stored_bars. The
                # old snapshot_bar-based form anchored on the PREVIOUS
                # session's counter, producing entry_bar >> bar_idx and
                # therefore NEGATIVE bars_held after a restart.
                restored_entry_bar = self.bar_idx - max(0, stored_bars)
                snapshot_ts = self._extract_state_timestamp(blob)
                entry_ts_us = int((snapshot_ts if snapshot_ts is not None else time.time()) * 1_000_000)
                if not trade_id:
                    continue
                if not asset:
                    continue
                if direction not in (-1, 1):
                    continue
                if not (math.isfinite(entry_price) and entry_price > 0):
                    continue
                if not (math.isfinite(quantity) and quantity > 0):
                    continue
                if not (math.isfinite(notional) and notional > 0):
                    notional = quantity * entry_price
                if not (math.isfinite(leverage) and leverage > 0):
                    continue
                chain_recon = self._load_chain_ledger_state(trade_id)
                chain_meta = {}
                if chain_recon:
                    chain_meta.update(chain_recon)
                    nested_chain = chain_recon.get("chain")
                    if isinstance(nested_chain, dict):
                        chain_meta.update(nested_chain)
                chain_seed_pending = {
                    "asset": asset,
                    "side": side or ("SHORT" if direction == -1 else "LONG"),
                    "entry_price": entry_price,
                    "quantity": quantity,
                    "notional": notional,
                    "notional_entry": notional,
                    "leverage": leverage,
                    # NEVER take entry_bar from chain_meta: trade_reconstruction
                    # payloads carry the DEAD session's bar counter, so the
                    # override reinstated the stale clock frame the re-anchor
                    # exists to fix (negative bars_held → UInt16 spool poison,
                    # incident 2026-06-12). restored_entry_bar already encodes
                    # hold continuity via stored_bars in THIS session's frame.
                    "entry_bar": int(restored_entry_bar),
                    "entry_ts": int(chain_meta.get("entry_ts", entry_ts_us) or entry_ts_us) if chain_recon else entry_ts_us,
                    "retraction_legs": int(chain_meta.get("retraction_legs", chain_meta.get("chain_seq", 0)) or 0) if chain_recon else 0,
                    "realized_pnl_legs_total": float(chain_meta.get("realized_pnl_legs_total", 0.0) or 0.0) if chain_recon else 0.0,
                }
                try:
                    chain_state = self._chain_state_from_reconstruction(trade_id, chain_seed_pending, chain_recon)
                except Exception as chain_err:
                    log(f" position_state HZ fallback chain restore failed: {chain_err}")
                    self._mark_restore_failure(str(chain_err))
                    return False
                pos = NDPosition(
                    trade_id=trade_id,
                    asset=asset,
                    direction=direction,
                    entry_price=entry_price,
                    entry_bar=restored_entry_bar,
                    notional=notional,
                    leverage=leverage,
                    fraction=notional / max(self.eng.capital * leverage, 1.0),
                    entry_vel_div=0.0,
                    bucket_idx=0,
                    current_price=entry_price,
                )
                with self.eng_lock:
                    self.eng.position = pos
                    self.eng.exit_manager.setup_position(
                        trade_id,
                        entry_price,
                        direction,
                        restored_entry_bar,
                        stop_pct_override=float(getattr(self, "_catastrophic_floor_pct", 0.0120) or 0.0120),
                    )
                self._pending_entries[trade_id] = {
                    "trade_id": trade_id,
                    "asset": asset,
                    "side": side or ("SHORT" if direction == -1 else "LONG"),
                    "entry_price": entry_price,
                    "quantity": quantity,
                    "notional": notional,
                    "notional_entry": notional,
                    "leverage": leverage,
                    "vel_div_entry": 0.0,
                    "boost_at_entry": 1.0,
                    "beta_at_entry": 1.0,
                    "posture": "RESTORED",
                    "entry_ts": entry_ts_us,
                    "entry_date": (self.current_day or ""),
                    "retraction_legs": int(chain_state.get("chain_seq", 0) or 0),
                    "realized_pnl_legs_total": float(chain_state.get("realized_pnl_legs_total", 0.0) or 0.0),
                    "chain_root_trade_id": chain_state.get("chain_root_trade_id", trade_id),
                    "chain_head_leg_id": chain_state.get("chain_head_leg_id", f"{trade_id}:open"),
                    "chain_prev_leg_id": chain_state.get("chain_prev_leg_id", ""),
                    "chain_seq": int(chain_state.get("chain_seq", 0) or 0),
                    "chain_token": chain_state.get("chain_token", ""),
                    "chain_mode": chain_state.get("chain_mode", "LIVE"),
                    "chain_version": int(chain_state.get("chain_version", 1) or 1),
                    "chain_kind": chain_state.get("chain_kind", "ROOT"),
                }
                v7_exit_engine = getattr(self, "_v7_exit_engine", None)
                if v7_exit_engine is not None:
                    try:
                        ctx = v7_exit_engine.make_context(
                            entry_price=entry_price,
                            entry_bar=restored_entry_bar,
                            side=1 if direction == -1 else 0,
                        )
                        self._v7_contexts[trade_id] = ctx
                        self._v7_decision_seq[trade_id] = 0
                    except Exception as e:
                        log(f" V7 live restore context failed (HZ fallback): {e}")
                self._seed_posture_for_restored_position()
                with self.eng_lock:
                    self._apply_catastrophic_floor_to_open_position()
                # Re-seed position_state so the next restart can use the CH path.
                try:
                    self._ps_write_open(trade_id, self._pending_entries[trade_id])
                except Exception as e:
                    log(f" position_state HZ fallback OPEN write failed: {e}")
                self._restore_source = label
                self._restore_failed = False
                self._restore_failure_reason = ""
                log(
                    f" position_state RESTORED from {label} ({reason}): "
                    f"{asset} {side or ('SHORT' if direction == -1 else 'LONG')} "
                    f"entry={entry_price} notional={notional:.0f} bars_held≈{stored_bars} trade={trade_id}"
                )
                return True
            return False

        def _hz_snapshot_is_flat(reason: str) -> bool:
            """Accept flat HZ state when CH restore is temporarily unavailable."""
            snapshot_sources = (
                ("latest_nautilus", "HZ latest_nautilus"),
                ("engine_snapshot", "HZ engine_snapshot"),
            )
            cached_snapshots = getattr(self, "_restore_state_snapshots", {}) or {}
            for key, label in snapshot_sources:
                blob = cached_snapshots.get(key)
                if not isinstance(blob, dict):
                    try:
                        raw = self.state_map.blocking().get(key)
                    except Exception as e:
                        log(f" {label} flat-check read failed during restore fallback: {e}")
                        raw = None
                    parsed = self._parse_capital_blob(raw, label)
                    if parsed is None:
                        continue
                    _, blob = parsed
                if not isinstance(blob, dict):
                    continue
                open_positions = blob.get("open_positions")
                # Only an explicit empty list counts as proof of flatness.
                if isinstance(open_positions, list) and len(open_positions) == 0:
                    log(f" position_state: CH restore unavailable ({reason}); {label} is flat")
                    return True
            return False

        # NOTE(review): endpoint and credentials are hard-coded here; consider
        # sourcing them from configuration/environment.
        req = urllib.request.Request(
            "http://localhost:8123/?database=dolphin",
            data=sql.encode(),
            headers={"Authorization": "Basic " +
                     _b64.b64encode(b"dolphin:dolphin_ch_2026").decode()})
        with urllib.request.urlopen(req, timeout=5) as r:
            row = r.read().decode().strip()
        if not row:
            log(" position_state: no open position to restore in CH; trying HZ fallback")
            # Either outcome ends the restore: restored from HZ, or flat start.
            _restore_from_hz_snapshot("CH empty")
            return

        def _reject_restore_candidate(message: str, *, halt_on_exhaustion: bool = True) -> bool:
            """Reject the CH candidate and try the HZ fallbacks.

            Returns True when HZ either restored a position or proved the
            book flat; False when both fallbacks were exhausted.
            """
            log(f" position_state open candidate rejected: {message}")
            if _restore_from_hz_snapshot(message):
                return True
            if _hz_snapshot_is_flat(message):
                return True
            # Fallbacks exhausted: no HZ position AND no HZ flat-proof.
            # Two garbage classes diverge here:
            # - zero-size OPEN rows are the DOCUMENTED malformed/tombstone
            #   class (MALFORMED_OPEN_RESTORE_BUG.md): definitionally not
            #   live positions → flat continuation is correct
            #   (halt_on_exhaustion=False at those call sites);
            # - corrupt direction/entry_price/leverage is UNKNOWN state —
            #   trading from flat over it risks a single-slot violation
            #   (XTZ 863c21da class) → halt via restore-failure.
            if halt_on_exhaustion:
                self._mark_restore_failure(message)
            return False

        cols = row.split('\t')
        if len(cols) < 10:
            log(f" position_state: unexpected row format: {row}")
            if _restore_from_hz_snapshot("CH malformed"):
                return
            self._mark_restore_failure("position_state row malformed")
            return
        trade_id = cols[0]
        asset = cols[1]
        direction = int(cols[2])
        entry_price = float(cols[3])
        quantity = float(cols[4])
        notional = float(cols[5])
        leverage = float(cols[6])
        bucket_id = int(cols[7])
        stored_bars = int(cols[8])
        last_ts = self._parse_timestamp_seconds(cols[9])
        if not trade_id.strip():
            self._mark_restore_failure("position_state row missing trade_id")
            return
        if not asset.strip():
            self._mark_restore_failure(f"position_state row missing asset for trade {trade_id}")
            return
        if direction not in (-1, 1):
            _reject_restore_candidate(f"position_state row invalid direction for trade {trade_id}: {direction}")
            return
        if not (math.isfinite(entry_price) and entry_price > 0):
            _reject_restore_candidate(f"position_state row invalid entry_price for trade {trade_id}: {entry_price}")
            return
        if not (math.isfinite(quantity) and quantity > 0):
            # zero/dust size = documented tombstone class → no halt
            _reject_restore_candidate(
                f"position_state row invalid quantity for trade {trade_id}: {quantity}",
                halt_on_exhaustion=False)
            return
        if not (math.isfinite(notional) and notional > 0):
            # zero/dust size = documented tombstone class → no halt
            _reject_restore_candidate(
                f"position_state row invalid notional for trade {trade_id}: {notional}",
                halt_on_exhaustion=False)
            return
        if not (math.isfinite(leverage) and leverage > 0):
            _reject_restore_candidate(f"position_state row invalid leverage for trade {trade_id}: {leverage}")
            return
        if stored_bars < 0:
            self._mark_restore_failure(f"position_state row invalid bars_held for trade {trade_id}: {stored_bars}")
            return
        if last_ts is not None:
            age_sec = max(0.0, time.time() - last_ts)
            max_age_sec = self._restore_open_max_age_seconds()
            if age_sec > max_age_sec:
                log(
                    " position_state stale OPEN candidate rejected: "
                    f"trade={trade_id} age={age_sec:.0f}s limit={max_age_sec:.0f}s "
                    f"asset={asset} side={'SHORT' if direction == -1 else 'LONG'}"
                )
                stale_pending = {
                    "asset": asset,
                    "side": "SHORT" if direction == -1 else "LONG",
                    "entry_price": entry_price,
                    "quantity": quantity,
                    "leverage": leverage,
                }
                # Write a CLOSED row so the stale OPEN is not picked up again.
                self._ps_write_closed(
                    trade_id,
                    stale_pending,
                    {
                        "reason": "RESTORE_STALE_OPEN_REJECT",
                        "net_pnl": 0.0,
                        "bars_held": stored_bars,
                    },
                )
                return
        derived_notional = quantity * entry_price
        if math.isfinite(derived_notional) and derived_notional > 0:
            if abs(notional - derived_notional) > max(1.0, abs(derived_notional) * 0.01):
                log(
                    " position_state notional mismatch: "
                    f"stored={notional:.6f} derived={derived_notional:.6f} trade={trade_id} "
                    "— using derived value"
                )
                notional = derived_notional
        # Re-anchor entry_bar so the MAX_HOLD countdown CONTINUES from where
        # it left off. At boot self.bar_idx is 0, so this is typically
        # negative — that is intentional: bars_held = bar_idx - entry_bar
        # then equals stored_bars immediately. The old max(0, …) clamp
        # zeroed the clock on every restore (XTZ 863c21da "bars_held≈0";
        # MAX_HOLD never fired and the phantom rode 1h to STOP_LOSS).
        # position_state.entry_bar is Int32 — negative is storable.
        restored_entry_bar = self.bar_idx - max(0, stored_bars)
        chain_recon = self._load_chain_ledger_state(trade_id)
        chain_meta = {}
        if chain_recon:
            chain_meta.update(chain_recon)
            nested_chain = chain_recon.get("chain")
            if isinstance(nested_chain, dict):
                chain_meta.update(nested_chain)
        chain_seed_pending = {
            "asset": asset,
            "side": 'SHORT' if direction == -1 else 'LONG',
            "entry_price": entry_price,
            "quantity": quantity,
            "notional": notional,
            "notional_entry": notional,
            "leverage": leverage,
            # NEVER take entry_bar from chain_meta: trade_reconstruction
            # payloads carry the DEAD session's bar counter — the override
            # reinstated the stale clock frame the re-anchor exists to fix
            # (negative bars_held → UInt16 spool poison, incident 2026-06-12).
            "entry_bar": int(restored_entry_bar),
            "entry_ts": int(chain_meta.get("entry_ts", 0) or 0) if chain_recon else 0,
            "retraction_legs": int(chain_meta.get("retraction_legs", chain_meta.get("chain_seq", 0)) or 0) if chain_recon else 0,
            "realized_pnl_legs_total": float(chain_meta.get("realized_pnl_legs_total", 0.0) or 0.0) if chain_recon else 0.0,
        }
        try:
            chain_state = self._chain_state_from_reconstruction(trade_id, chain_seed_pending, chain_recon)
        except Exception as chain_err:
            self._mark_restore_failure(str(chain_err))
            return
        pos = NDPosition(
            trade_id=trade_id,
            asset=asset,
            direction=direction,
            entry_price=entry_price,
            entry_bar=restored_entry_bar,
            notional=notional,
            leverage=leverage,
            fraction=notional / max(self.eng.capital * leverage, 1.0),
            entry_vel_div=0.0,
            bucket_idx=0,  # signal-strength bucket (not KMeans); 0=safe default
            current_price=entry_price,
        )
        with self.eng_lock:
            self.eng.position = pos
            self.eng.exit_manager.setup_position(
                trade_id,
                entry_price,
                direction,
                restored_entry_bar,
                stop_pct_override=float(getattr(self, "_catastrophic_floor_pct", 0.0120) or 0.0120),
            )
        # NOTE: do NOT arm hibernate protect here.
        # _day_posture starts as 'APEX' — the posture sync block on the
        # first incoming scan will detect the APEX→HIBERNATE transition
        # and call _hibernate_protect_position() at the right moment.
        # Rebuild _pending_entries so the exit CH write fires correctly
        side = 'SHORT' if direction == -1 else 'LONG'
        self._pending_entries[trade_id] = {
            'trade_id': trade_id,
            'asset': asset,
            'side': side,
            'entry_price': entry_price,
            'quantity': quantity,
            'notional': float(quantity * entry_price),
            'notional_entry': float(quantity * entry_price),
            'leverage': leverage,
            'vel_div_entry': 0.0,
            'boost_at_entry': 1.0,
            'beta_at_entry': 1.0,
            'posture': 'RESTORED',
            'entry_ts': int(chain_meta.get("entry_ts", _ch_ts_us()) or _ch_ts_us()) if chain_recon else _ch_ts_us(),
            'entry_date': (self.current_day or ''),
            'retraction_legs': int(chain_state.get("chain_seq", 0) or 0),
            'realized_pnl_legs_total': float(chain_state.get("realized_pnl_legs_total", 0.0) or 0.0),
            'chain_root_trade_id': chain_state.get("chain_root_trade_id", trade_id),
            'chain_head_leg_id': chain_state.get("chain_head_leg_id", f"{trade_id}:open"),
            'chain_prev_leg_id': chain_state.get("chain_prev_leg_id", ""),
            'chain_seq': int(chain_state.get("chain_seq", 0) or 0),
            'chain_token': chain_state.get("chain_token", ""),
            'chain_mode': chain_state.get("chain_mode", "LIVE"),
            'chain_version': int(chain_state.get("chain_version", 1) or 1),
            'chain_kind': chain_state.get("chain_kind", "ROOT"),
        }
        if self._v7_exit_engine is not None:
            try:
                ctx = self._v7_exit_engine.make_context(
                    entry_price=entry_price,
                    entry_bar=restored_entry_bar,
                    side=1 if direction == -1 else 0,
                )
                self._v7_contexts[trade_id] = ctx
                self._v7_decision_seq[trade_id] = 0
            except Exception as e:
                log(f" V7 live restore context failed: {e}")
        self._seed_posture_for_restored_position()
        with self.eng_lock:
            self._apply_catastrophic_floor_to_open_position()
        log(f" position_state RESTORED: {asset} {side} entry={entry_price} "
            f"notional={notional:.0f} bars_held≈{stored_bars} trade={trade_id}")
    except Exception as e:
        log(f" position_state restore error: {e}")
        if _restore_from_hz_snapshot(str(e)):
            return
        if _hz_snapshot_is_flat(str(e)):
            return
        self._mark_restore_failure(f"position_state restore error: {e}")
def _seed_posture_for_restored_position(self) -> None:
    """Reset a HIBERNATE day posture to APEX while a restored position is open.

    With the engine posture back at APEX, the next scan sees a posture
    transition for the restored leg.  Does nothing when there is no engine
    or no open position; failures are logged, never raised.
    """
    try:
        if self.eng is None:
            return
        if getattr(self.eng, "position", None) is None:
            return
        if getattr(self.eng, "_day_posture", "APEX") != "HIBERNATE":
            return
        self.eng._day_posture = "APEX"
        log(" position_state restore: re-seeded day posture to APEX for hibernate sync")
    except Exception as e:
        log(f" position_state posture reseed failed: {e}")
def _rehydrate_engine_position_from_bingx(self, *, source: str = "startup") -> None:
    """Keep the local engine slot aligned with the exchange slot when live on BingX.

    This is intentionally conservative in BLUE: when exchange is flat, clear any
    stale local slot artifacts. Projection of non-flat exchange state is handled
    by the execution/runtime layer.

    Args:
        source: Free-form label of the caller, included in the warning that
            is logged when a stale slot is cleared.

    The method never raises; an unexpected failure is reported at debug
    level on ``self.log`` / ``self._log`` when such a logger exists.
    """
    def _emit(level: str, message: str) -> None:
        # Resolve the logger lazily: not every host object provides one.
        logger = getattr(self, "log", None) or getattr(self, "_log", None)
        if logger is not None and hasattr(logger, level):
            getattr(logger, level)(message)

    try:
        exec_venue_name = getattr(self, "_exec_venue_name", None)
        exec_venue = exec_venue_name() if callable(exec_venue_name) else ""
        if str(exec_venue).upper() != "BINGX" or not bool(getattr(self, "live_mode", False)):
            return
        # NOTE(review): sibling restore methods in this file address the
        # engine as ``self.eng``; confirm ``self.engine`` exists on this
        # runner, otherwise this cleanup is a no-op.
        engine = getattr(self, "engine", None)
        if engine is None:
            return
        get_live_positions = getattr(self, "_get_bingx_live_positions", None)
        live_positions = get_live_positions() if callable(get_live_positions) else {}
        if not isinstance(live_positions, dict):
            live_positions = {}
        current_pos = getattr(engine, "position", None)
        # Only act when the exchange is flat AND a local slot is still set.
        if live_positions or current_pos is None:
            return
        stale_tid = str(getattr(current_pos, "trade_id", "") or "")
        state = engine.get_state() if hasattr(engine, "get_state") else {}
        if not isinstance(state, dict):
            state = {}
        state["position"] = None
        try:
            if hasattr(engine, "restore_state"):
                engine.restore_state(state)
            else:
                engine.position = None
        except Exception:
            # restore_state failed: fall back to clearing the slot directly.
            engine.position = None
        open_positions = getattr(self, "_exec_open_positions", None)
        if isinstance(open_positions, dict):
            open_positions.pop(stale_tid, None)
        pending_entries = getattr(self, "_pending_entries", None)
        if isinstance(pending_entries, dict):
            pending_entries.pop(stale_tid, None)
        rt_exit_mgr = getattr(self, "_rt_exit_mgr", None)
        if rt_exit_mgr is not None and stale_tid:
            unregister = getattr(rt_exit_mgr, "unregister", None)
            if callable(unregister):
                try:
                    unregister(stale_tid)
                except Exception as unregister_err:
                    # Best-effort, but leave a trace instead of swallowing.
                    _emit(
                        "debug",
                        f"[BINGX_REHYDRATE] exit-manager unregister failed for "
                        f"trade_id={stale_tid}: {unregister_err}",
                    )
        _emit(
            "warning",
            f"[BINGX_REHYDRATE] cleared stale engine slot from {source}: "
            f"exchange flat, trade_id={stale_tid or '<missing>'}",
        )
    except Exception as exc:
        _emit("debug", f"[BINGX_REHYDRATE] stale-slot cleanup failed: {exc}")
def _hibernate_protect_position(self):
    """Arm the per-bucket stop-loss on the open position instead of halting it.

    Must be called under eng_lock with an open position.

    The bucket assigned to the position's asset selects a stop percentage
    from ``_BUCKET_SL_PCT`` (falling back to its ``'default'`` entry), which
    is written as ``stop_pct_override`` on the exit manager's state for the
    trade.  The trade id is stored in ``_hibernate_protect_active`` so the
    exit path can recognise the protected position later.
    """
    pos = self.eng.position
    if pos is None:
        return

    assignments = getattr(self, "_bucket_assignments", {})
    bucket = assignments.get(pos.asset, 'default')
    sl_pct = _BUCKET_SL_PCT.get(bucket, _BUCKET_SL_PCT['default'])
    tp_pct = self.eng.exit_manager.fixed_tp_pct

    # Patch the live exit-manager record when there is one; otherwise
    # register the position from scratch with the override in place.
    em_state = self.eng.exit_manager._positions.get(pos.trade_id)
    if em_state is None:
        log(f" HIBERNATE_PROTECT: trade {pos.trade_id} not in exit_manager — arming anyway via re-setup")
        self.eng.exit_manager.setup_position(
            pos.trade_id, pos.entry_price, pos.direction, pos.entry_bar,
            stop_pct_override=sl_pct,
        )
    else:
        em_state['stop_pct_override'] = sl_pct

    self._hibernate_protect_active = pos.trade_id
    log(f"HIBERNATE_PROTECT armed: {pos.asset} B{bucket} "
        f"SL={sl_pct*100:.2f}% TP={tp_pct*100:.2f}% trade={pos.trade_id}")
def _apply_catastrophic_floor_to_open_position(self):
    """Ensure the open position's stop override is no looser than the floor.

    The floor comes from ``_catastrophic_floor_for_open_position``.  If the
    exit manager has no record for the trade, the position is registered
    with the floor as its stop override.  If a record exists, its override
    is replaced only when it is unset (<= 0) or wider than the floor.
    Errors while touching the exit manager are logged, not raised.
    """
    floor_pct, floor_label = self._catastrophic_floor_for_open_position()
    if floor_pct <= 0.0 or self.eng is None:
        return
    pos = getattr(self.eng, "position", None)
    if pos is None:
        return
    trade_id = str(getattr(pos, "trade_id", "") or "")
    if not trade_id:
        return
    try:
        em_state = self.eng.exit_manager._positions.get(trade_id)
        if em_state is None:
            # No exit-manager record yet: register with the floor armed.
            self.eng.exit_manager.setup_position(
                trade_id,
                pos.entry_price,
                pos.direction,
                pos.entry_bar,
                stop_pct_override=floor_pct,
            )
        else:
            current = _safe_float(em_state.get("stop_pct_override"), 0.0)
            needs_tightening = current <= 0.0 or current > floor_pct
            if not needs_tightening:
                return
            em_state["stop_pct_override"] = floor_pct
        log(
            f"CATASTROPHIC_FLOOR armed: {pos.asset} "
            f"SL={floor_pct*100:.2f}% mode={floor_label} trade={trade_id}"
        )
    except Exception as e:
        log(f" CATASTROPHIC_FLOOR failed: {e}")
def _catastrophic_floor_for_open_position(self) -> tuple[float, str]:
    """Return ``(floor_pct, label)`` for the currently open position.

    Without an engine, an open position, or an ``overlay_flip`` pending
    entry, the configured ``_catastrophic_floor_pct`` is returned with the
    label ``"base"``.  For overlay-flip positions the floor is the smallest
    positive value among the base floor, the overlay floor and (when both
    are positive) ``_overlay_catastrophic_max_loss_usd / notional``; the
    label is ``"overlay:<reason>"``.
    """
    base_floor = float(getattr(self, "_catastrophic_floor_pct", 0.0) or 0.0)
    base_result = (base_floor, "base")

    if self.eng is None:
        return base_result
    pos = getattr(self.eng, "position", None)
    if pos is None:
        return base_result

    trade_id = str(getattr(pos, "trade_id", "") or "")
    if trade_id:
        pending = self._pending_entries.get(trade_id, {})
    else:
        pending = {}
    if not bool(pending.get("overlay_flip", False)):
        return base_result

    overlay_floor = float(getattr(self, "_overlay_catastrophic_floor_pct", 0.0) or 0.0)
    positive_floors = [pct for pct in (base_floor, overlay_floor) if pct > 0.0]
    floor_pct = min(positive_floors) if positive_floors else 0.0

    # Entry notional preferred, then current notional, then the position's own.
    notional = _safe_float(
        pending.get("notional_entry", pending.get("notional", getattr(pos, "notional", 0.0))),
        0.0,
    )
    max_loss_usd = float(getattr(self, "_overlay_catastrophic_max_loss_usd", 0.0) or 0.0)
    if notional > 0.0 and max_loss_usd > 0.0:
        if floor_pct > 0.0:
            floor_pct = min(floor_pct, max_loss_usd / notional)
        else:
            floor_pct = max_loss_usd / notional

    reason = str(pending.get("overlay_reason", "") or "overlay")
    return max(0.0, floor_pct), f"overlay:{reason}"
def _overlay_advsl_should_exit(
    self,
    trade_id: str,
    pending: Mapping[str, Any],
    v7_decision: Mapping[str, Any],
    bars_held: int,
    current_price: float,
) -> tuple[bool, str]:
    """Decide whether an overlay-flip position should exit on adverse move.

    Returns ``(should_exit, detail)``.  ``detail`` is one of the short
    skip codes ``"disabled"``, ``"not_overlay"``, ``"min_hold"``,
    ``"bad_price"`` or ``"hold"`` when no exit is signalled, and a
    diagnostic string (floor label, adverse move, MFE, pressure, MAE risk)
    when it is.

    An exit is signalled when the adverse excursion has reached a positive
    catastrophic floor AND either the trade never showed a meaningful
    favourable excursion or both the exit-pressure and MAE-risk thresholds
    are met.
    """
    if not bool(getattr(self, "_overlay_advsl_live_exit_enabled", False)):
        return False, "disabled"
    if not bool(pending.get("overlay_flip", False)):
        return False, "not_overlay"
    min_bars = int(getattr(self, "_overlay_advsl_min_bars", 0) or 0)
    if int(bars_held or 0) < min_bars:
        return False, "min_hold"
    entry = _safe_float(pending.get("entry_price"), 0.0)
    if entry <= 0.0 or current_price <= 0.0:
        return False, "bad_price"

    side = str(pending.get("side", "SHORT") or "SHORT").upper()
    if side == "LONG":
        favorable = (current_price - entry) / entry
    else:
        favorable = (entry - current_price) / entry
    adverse = max(0.0, -favorable)

    lifetime_mfe = max(0.0, _safe_float(v7_decision.get("mfe"), 0.0))
    pressure = _safe_float(v7_decision.get("exit_pressure"), 0.0)
    mae_risk = _safe_float(v7_decision.get("mae_risk"), 0.0)
    floor_pct, floor_label = self._catastrophic_floor_for_open_position()

    meaningful_mfe = float(getattr(self, "_overlay_advsl_mfe_max_pct", 0.0) or 0.0)
    pressure_min = float(getattr(self, "_overlay_advsl_pressure_min", 0.0) or 0.0)
    mae_risk_min = float(getattr(self, "_overlay_advsl_mae_risk_min", 0.0) or 0.0)

    no_meaningful_mfe = lifetime_mfe <= meaningful_mfe
    pressure_gate = pressure >= pressure_min and mae_risk >= mae_risk_min
    floor_breached = adverse >= floor_pct and floor_pct > 0.0

    if not (floor_breached and (no_meaningful_mfe or pressure_gate)):
        return False, "hold"
    detail = (
        f"{floor_label}:adverse={adverse:.5f}:mfe={lifetime_mfe:.5f}:"
        f"pressure={pressure:.2f}:mae_risk={mae_risk:.2f}"
    )
    return True, detail
def _connect_hz(self):
    """Create the Hazelcast client, bind the runner's maps and emit a first heartbeat.

    Side effects on ``self``: ``hz_client`` plus the ``features_map``,
    ``safety_map``, ``pnl_map``, ``state_map``, ``heartbeat_map`` and
    ``control_map`` proxies.  When an advanced-SL component is configured
    it is bound to the features/state maps.  Binding and the initial
    heartbeat are best-effort: failures are logged and do not abort the
    connection.
    """
    log("Connecting to Hazelcast...")
    import hazelcast
    import logging as _logging

    # Client lifecycle events (connection added/removed, heartbeat,
    # reconnect attempts) at INFO to stderr — the 2026-06-12 silent-death
    # investigation found ZERO client log lines because nothing routed
    # them; without this the reactor's health is invisible.
    _hz_logger = _logging.getLogger("hazelcast")
    if not _hz_logger.handlers:
        # Guard on existing handlers so repeated connects do not stack them.
        _h = _logging.StreamHandler()
        _h.setFormatter(_logging.Formatter("%(asctime)s HZCLIENT %(levelname)s %(name)s: %(message)s"))
        _hz_logger.addHandler(_h)
    _hz_logger.setLevel(_logging.INFO)

    self.hz_client = hazelcast.HazelcastClient(
        cluster_name=HZ_CLUSTER,
        cluster_members=[HZ_HOST],
        invocation_timeout=3.0,  # prevent indefinite scan-loop stall when HZ is unresponsive
    )
    self.features_map = self.hz_client.get_map("DOLPHIN_FEATURES")
    self.safety_map = self.hz_client.get_map("DOLPHIN_SAFETY")
    self.pnl_map = self.hz_client.get_map("DOLPHIN_PNL_BLUE")
    self.state_map = self.hz_client.get_map("DOLPHIN_STATE_BLUE")
    self.heartbeat_map = self.hz_client.get_map("DOLPHIN_HEARTBEAT")
    self.control_map = self.hz_client.get_map("DOLPHIN_CONTROL_PLANE")
    if self._advanced_sl is not None:
        try:
            self._advanced_sl.bind_hz(features_map=self.features_map, state_map=self.state_map)
            self._advanced_sl.publish_control_plane()
        except Exception as e:
            # Best-effort, but no longer silent.
            log(f" advanced SL HZ bind failed: {e}")
    # Immediate heartbeat — prevents Cat1=0 during startup gap
    try:
        write_runner_heartbeat(
            self.heartbeat_map,
            build_runner_heartbeat_payload(
                flow="nautilus_event_trader",
                phase="starting",
                run_date=self.current_day,
                runner="blue",
            ),
        )
    except Exception as e:
        # Best-effort, but no longer silent.
        log(f" startup heartbeat write failed: {e}")
    log(" Hz connected")
def _heartbeat_loop(self):
"""Out-of-band heartbeat writer (independent of scan loop)."""
while not self._heartbeat_stop.is_set():
try:
if self.heartbeat_map is not None:
now = time.time()
write_runner_heartbeat(
self.heartbeat_map,
build_runner_heartbeat_payload(
flow="nautilus_event_trader",
phase="trading",
run_date=self.current_day,
runner="blue",
extra={
"last_scan_age_s": round(now - self._last_scan_accept_ts, 1),
"last_event_age_s": round(now - self._last_scan_event_ts, 1),
"scans_processed": self.scans_processed,
},
),
)
if self.control_map is not None:
BLUE hardening: spool-poison guards, dead-session clock fix, HZ black-box, RETRACT race-safety Seven uncommitted production fixes to BLUE's main runner that the LIVE process has already been running since the 2026-06-15 17:23 restart (file mtime 17:17, pid started 17:23). Each fix answers a documented incident; committing now so they survive in history and a stray checkout can't silently revert running-config code on the next restart. 1. bars_held = max(0, int(...)) at BOTH journal sites (terminal + sub-day). CH column is UInt16 — a negative value poisons the spool with a head-of-line jam (incident 2026-06-12: bars_held=-106). 2. entry_bar = int(restored_entry_bar) at BOTH reconstruction sites; NEVER from chain_meta. trade_reconstruction payloads carry the DEAD session's bar counter, so the old override reinstated the stale clock frame the re-anchor exists to fix → negative bars_held → same UInt16 spool poison (zombie-trade resurrections, incident 2026-06-12). restored_entry_bar already encodes hold continuity via stored_bars in THIS session's frame. 3. capital parse handles list/ledger-style payloads: when the restore blob is a list of update rows, take the latest dict row instead of falling through to {} and losing the capital anchor. 4. _connect_hz routes the `hazelcast` logger to stderr at INFO. The silent-HZ-death investigation found ZERO client log lines because nothing routed them; without this the reactor's health is invisible. 5. _dump_blackbox(reason): forensic thread dump before a watchdog restart — lifecycle.is_running, active_connections, every thread's stack, and a flag when any hazelcast/reactor-named thread is MISSING (= reactor died, the prime suspect for the silent 40min–8h client deaths). print()-only, CIFS-safe. _watchdog_restart calls it first. 6. _drain_runtime_commands / _process_runtime_commands gain `*, allow_retract=True`; the heartbeat path drains with allow_retract=False and re-queues any RETRACT commands. 
A RETRACT can force a terminal close that must run through the scan-thread close finalizer, so the heartbeat must not race it. 7. +import traceback (for the black-box stack dumps). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 12:03:20 +02:00
# RETRACT can produce a forced terminal close which must
# run through the scan-thread close finalizer. The
# heartbeat may still apply non-exit commands while scans
# are quiet, but it must leave RETRACT queued.
self._drain_runtime_commands(allow_retract=False)
except Exception as e:
# Never route heartbeat failures through the mounted trade log:
# if that filesystem is sick, the exception handler must still
# survive so the loop can retry on the next tick.
try:
print(
f"[{datetime.now(timezone.utc).isoformat()}] "
f"Heartbeat loop put failed: {e}",
flush=True,
)
except Exception:
pass
finally:
self._heartbeat_stop.wait(10.0)
# ── Scan-flow watchdog ────────────────────────────────────────────────────
# Detection only — no alpha/engine involvement. Distinguishes three stall
# modes and recovers the two that a process restart fixes:
# 1. worker stuck (events fresh, accepts stale, no dupe churn) → restart
# 2. listener deaf (events stale but HZ key still advancing) → restart
# 3. upstream dark (HZ key frozen too) → log only
# Uses print() not log(): log() appends to the CIFS share and the watchdog
# must stay alive precisely when that mount is sick.
def _probe_latest_scan_number(self, timeout_s: float = 10.0):
"""Read latest_eigen_scan from HZ off-thread; None on timeout/error."""
try:
fut = self._probe_executor.submit(
lambda: self.features_map.blocking().get('latest_eigen_scan')
)
try:
raw = fut.result(timeout=timeout_s)
except Exception:
fut.cancel()
raise
if not raw:
return None
scan = json.loads(raw) if isinstance(raw, str) else raw
if scan.get('version') == 'NG7':
inner = scan.get('scan_number')
if inner is None and isinstance(scan.get('scan'), dict):
inner = scan['scan'].get('scan_number')
return int(inner or 0)
return int(scan.get('scan_number') or 0)
except Exception:
return None
BLUE hardening: spool-poison guards, dead-session clock fix, HZ black-box, RETRACT race-safety Seven uncommitted production fixes to BLUE's main runner that the LIVE process has already been running since the 2026-06-15 17:23 restart (file mtime 17:17, pid started 17:23). Each fix answers a documented incident; committing now so they survive in history and a stray checkout can't silently revert running-config code on the next restart. 1. bars_held = max(0, int(...)) at BOTH journal sites (terminal + sub-day). CH column is UInt16 — a negative value poisons the spool with a head-of-line jam (incident 2026-06-12: bars_held=-106). 2. entry_bar = int(restored_entry_bar) at BOTH reconstruction sites; NEVER from chain_meta. trade_reconstruction payloads carry the DEAD session's bar counter, so the old override reinstated the stale clock frame the re-anchor exists to fix → negative bars_held → same UInt16 spool poison (zombie-trade resurrections, incident 2026-06-12). restored_entry_bar already encodes hold continuity via stored_bars in THIS session's frame. 3. capital parse handles list/ledger-style payloads: when the restore blob is a list of update rows, take the latest dict row instead of falling through to {} and losing the capital anchor. 4. _connect_hz routes the `hazelcast` logger to stderr at INFO. The silent-HZ-death investigation found ZERO client log lines because nothing routed them; without this the reactor's health is invisible. 5. _dump_blackbox(reason): forensic thread dump before a watchdog restart — lifecycle.is_running, active_connections, every thread's stack, and a flag when any hazelcast/reactor-named thread is MISSING (= reactor died, the prime suspect for the silent 40min–8h client deaths). print()-only, CIFS-safe. _watchdog_restart calls it first. 6. _drain_runtime_commands / _process_runtime_commands gain `*, allow_retract=True`; the heartbeat path drains with allow_retract=False and re-queues any RETRACT commands. 
A RETRACT can force a terminal close that must run through the scan-thread close finalizer, so the heartbeat must not race it. 7. +import traceback (for the black-box stack dumps). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 12:03:20 +02:00
def _dump_blackbox(self, reason: str):
"""Forensic dump before a watchdog restart — answers WHY the HZ client
died (incidents: silent client death every 40min8h, no exception ever
reaches stderr; prime suspect is the hazelcast reactor thread, which
runs I/O + future completion + event dispatch + heartbeat manager, so
its death is silent by construction). print() only CIFS-safe."""
try:
import sys as _sys
now_iso = datetime.now(timezone.utc).isoformat()
print(f"[{now_iso}] BLACKBOX dump ({reason}):", flush=True)
try:
running_flag = self.hz_client.lifecycle_service.is_running()
except Exception as exc:
running_flag = f"err:{exc}"
print(f" hz_client.lifecycle.is_running={running_flag}", flush=True)
try:
cm = getattr(self.hz_client, "_connection_manager", None)
conns = getattr(cm, "active_connections", None)
print(f" active_connections={conns!r}", flush=True)
except Exception as exc:
print(f" connection introspect failed: {exc}", flush=True)
frames = _sys._current_frames()
for th in threading.enumerate():
frame = frames.get(th.ident)
hz_mark = " <HZ?>" if "hazelcast" in th.name.lower() or "reactor" in th.name.lower() else ""
print(f" THREAD {th.name} daemon={th.daemon} alive={th.is_alive()}{hz_mark}", flush=True)
if frame is not None:
for fl in traceback.format_stack(frame):
for ln in fl.rstrip().splitlines():
print(f" {ln}", flush=True)
# any hazelcast-named thread MISSING from the enumeration = reactor died
hz_threads = [t.name for t in threading.enumerate()
if "hazelcast" in t.name.lower() or "reactor" in t.name.lower()]
print(f" hazelcast-ish threads present: {hz_threads or 'NONE — reactor thread is DEAD'}",
flush=True)
except Exception as exc:
print(f" BLACKBOX dump failed: {exc}", flush=True)
def _watchdog_restart(self, reason: str):
    """Dump forensics, announce the restart, then hard-exit the process.

    Exits via ``os._exit(WATCHDOG_EXIT_CODE)``; the printed message states
    that supervisord is expected to respawn the runner.  This call does not
    return.
    """
    try:
        self._dump_blackbox(reason)
    except Exception:
        # The dump is best-effort; the restart must go ahead regardless.
        pass
    stamp = datetime.now(timezone.utc).isoformat()
    print(
        f"[{stamp}] "
        f"WATCHDOG_RESTART: {reason} — exiting {WATCHDOG_EXIT_CODE} for "
        f"supervisord respawn (capital/position restore on boot)",
        flush=True,
    )
    os._exit(WATCHDOG_EXIT_CODE)
def _scan_watchdog_loop(self):
last_probe_num = None
last_probe_ts = 0.0
last_dark_log_ts = 0.0
dupes_at_stall = None
while not self._watchdog_stop.is_set():
self._watchdog_stop.wait(15.0)
if self._watchdog_stop.is_set() or not running:
return
now = time.time()
acc_age = now - self._last_scan_accept_ts
ev_age = now - self._last_scan_event_ts
if acc_age < SCAN_STALL_S:
last_probe_num = None
dupes_at_stall = None
continue
uptime_ok = (now - _PROCESS_BOOT_TS) > WATCHDOG_RESTART_MIN_UPTIME_S
if ev_age < SCAN_STALL_S:
# Listener delivering but worker not accepting.
if dupes_at_stall is None:
dupes_at_stall = self._dupe_drops_total
continue
if self._dupe_drops_total > dupes_at_stall:
if now - last_dark_log_ts > UPSTREAM_DARK_LOG_EVERY_S:
last_dark_log_ts = now
print(f"[{datetime.now(timezone.utc).isoformat()}] "
f"WATCHDOG: upstream repeating duplicate scan_number "
f"{self.last_scan_number} for {acc_age:.0f}s — scanner stuck, "
f"no restart (restart will not help)", flush=True)
elif uptime_ok:
self._watchdog_restart(
f"scan worker stalled {acc_age:.0f}s with events still arriving "
f"(likely blocked I/O while holding eng_lock)")
continue
# Both event stream and accepts stale → probe HZ for deafness.
probe = self._probe_latest_scan_number()
if probe is None:
# Persistent probe failure = our HZ client is dead (listener
# and probe share it). 2026-06-10 15:18 incident: scanner kept
# writing, PINK kept receiving, but BLUE's client died — probe
# returned None forever and the old logic mislabeled it
# "upstream dark" and never restarted. Three consecutive
# failures (~45 s) with uptime past warm-up → self-restart.
self._probe_fail_streak = getattr(self, "_probe_fail_streak", 0) + 1
if self._probe_fail_streak >= 3 and uptime_ok:
self._watchdog_restart(
f"HZ probe failed {self._probe_fail_streak}x while no "
f"events for {ev_age:.0f}s — HZ client presumed dead")
else:
self._probe_fail_streak = 0
if probe is not None:
if last_probe_num is None:
last_probe_num = probe
last_probe_ts = now
elif (now - last_probe_ts) >= WATCHDOG_PROBE_INTERVAL_S:
if probe != last_probe_num and uptime_ok:
self._watchdog_restart(
f"listener deaf: HZ latest_eigen_scan advanced "
f"{last_probe_num}{probe} but no events for {ev_age:.0f}s")
last_probe_num = probe
last_probe_ts = now
if now - last_dark_log_ts > UPSTREAM_DARK_LOG_EVERY_S:
last_dark_log_ts = now
print(f"[{datetime.now(timezone.utc).isoformat()}] "
f"WATCHDOG: NO SCANS for {acc_age:.0f}s (HZ scan_number probe="
f"{probe}) — upstream scanner appears DARK; open positions are "
f"UNMANAGED until scans resume", flush=True)
def _read_posture(self):
now = time.time()
if now - self.posture_cache_time < 10:
return self.cached_posture
try:
posture_raw = self.safety_map.blocking().get("latest") or self.safety_map.blocking().get("posture")
if posture_raw:
if isinstance(posture_raw, str):
try:
parsed = json.loads(posture_raw)
self.cached_posture = parsed.get("posture", posture_raw)
except (json.JSONDecodeError, AttributeError):
self.cached_posture = posture_raw
else:
self.cached_posture = posture_raw.get("posture", "APEX")
self.posture_cache_time = now
except:
pass
return self.cached_posture
def _rollover_day(self):
today = datetime.now(timezone.utc).strftime('%Y-%m-%d')
if today == self.current_day:
return
posture = self._read_posture()
with self.eng_lock:
if today != self.current_day: # double-checked: only one thread calls begin_day
if getattr(self, 'acb', None):
try:
exf_raw = self.features_map.blocking().get('exf_latest') if self.features_map else None
es_raw = self.features_map.blocking().get('latest_eigen_scan') if self.features_map else None
exf_snapshot = json.loads(exf_raw) if isinstance(exf_raw, str) else (exf_raw or {})
eigen_scan = json.loads(es_raw) if isinstance(es_raw, str) else (es_raw or {})
w750_vel = eigen_scan.get('w750_velocity', 0.0)
if exf_snapshot:
self.acb.get_dynamic_boost_from_hz(
date_str=today,
exf_snapshot=exf_snapshot,
w750_velocity=float(w750_vel) if w750_vel else None,
direction=self.trade_direction,
)
log(f"ACB: Pre-warmed cache for {today} from HZ")
except Exception as e:
log(f"ACB Rollover Error: {e}")
self.eng.begin_day(today, posture=posture, direction=self.trade_direction)
self.bar_idx = 0
self.current_day = today
log(
f"begin_day({today}) called with posture={posture} "
f"direction={_direction_label(self.trade_direction)}"
)
def _mark_retract_command_seen(self, command_id: str) -> None:
if not command_id or command_id in self._processed_retract_set:
return
self._processed_retract_commands.append(command_id)
self._processed_retract_set.add(command_id)
def _mark_runtime_command_seen(self, command_id: str) -> None:
"""Mark a runtime command id as processed for idempotency."""
self._mark_retract_command_seen(command_id)
def _enqueue_blue_runtime_command(self, cmd: dict) -> bool:
"""Append a command to the BLUE runtime command queue."""
if self.control_map is None:
return False
try:
raw_q = self.control_map.blocking().get("blue_runtime_commands")
q = json.loads(raw_q) if isinstance(raw_q, str) and raw_q else []
if not isinstance(q, list):
q = []
q.append(cmd)
q = q[-200:]
self.control_map.blocking().put("blue_runtime_commands", json.dumps(q))
return True
except Exception as e:
log(f" BLUE runtime command enqueue failed: {e}")
return False
def _capital_state_payload(
self,
capital: float,
*,
reason: str = "",
source: str = "",
trade_id: str = "",
asset: str = "",
replay_blob: Mapping[str, Any] | None = None,
) -> dict:
"""Build a canonical capital payload for HZ and disk persistence."""
payload = dict(replay_blob or {})
payload["capital"] = float(capital)
payload["ts"] = float(time.time())
payload["updated_at"] = datetime.now(timezone.utc).isoformat()
payload["reason"] = str(reason or payload.get("reason", "") or "")
payload["source"] = str(source or payload.get("source", "") or "")
if trade_id:
payload["trade_id"] = str(trade_id)
if asset:
payload["asset"] = str(asset)
payload.setdefault("strategy", "nautilus-blue")
payload.setdefault("engine", "nautilus_event_trader")
return payload
def _capital_ledger_event_payload(
self,
*,
capital_before: float,
capital_after: float,
reason: str = "",
source: str = "",
trade_id: str = "",
asset: str = "",
event_ts: float | None = None,
applies_before_ts: float | None = None,
mode: str = "terminal_update",
replay_blob: Mapping[str, Any] | None = None,
) -> dict:
payload = dict(replay_blob or {})
payload["capital_before"] = float(capital_before)
payload["capital_after"] = float(capital_after)
payload["capital"] = float(capital_after)
payload["capital_delta"] = float(capital_after - capital_before)
payload["ts"] = float(event_ts if event_ts is not None else time.time())
payload["updated_at"] = datetime.now(timezone.utc).isoformat()
payload["reason"] = str(reason or payload.get("reason", "") or "")
payload["source"] = str(source or payload.get("source", "") or "")
payload["mode"] = str(mode)
if applies_before_ts is not None:
payload["applies_before_ts"] = float(applies_before_ts)
if trade_id:
payload["trade_id"] = str(trade_id)
if asset:
payload["asset"] = str(asset)
payload.setdefault("strategy", "nautilus-blue")
payload.setdefault("engine", "nautilus_event_trader")
return payload
def _record_capital_ledger_event(self, entry: Mapping[str, Any]) -> None:
    """Append ``entry`` to the BLUE capital ledger and persist it.

    The ledger is read from the state map key ``capital_update_ledger``,
    extended with a copy of ``entry``, trimmed to its newest 1000 rows and
    written back to the state map and to ``CAPITAL_UPDATE_LEDGER`` on disk.
    The single entry is also mirrored to the control map key
    ``blue_capital_update_ledger_latest``.  Failures are logged, not raised.
    """
    try:
        rows = []
        if self.state_map is not None:
            stored = self.state_map.blocking().get("capital_update_ledger")
            if stored:
                rows = json.loads(stored) if isinstance(stored, str) else list(stored)
        if not isinstance(rows, list):
            # Stored JSON was something other than a list: start over.
            rows = []
        rows.append(dict(entry))
        ledger_payload = json.dumps(rows[-1000:])
        if self.state_map is not None:
            self.state_map.blocking().put("capital_update_ledger", ledger_payload)
        if self.control_map is not None:
            latest_payload = json.dumps(dict(entry))
            self.control_map.blocking().put("blue_capital_update_ledger_latest", latest_payload)
        CAPITAL_UPDATE_LEDGER.write_text(ledger_payload)
    except Exception as e:
        log(f" capital ledger write failed: {e}")
def _current_capital_state_timestamp(self) -> float | None:
    """Return the newest timestamp found across the BLUE capital state surfaces.

    Surfaces consulted: four state-map keys, today's pnl-map entry, the
    disk checkpoint and the last row of the on-disk ledger.  Every surface
    is read best-effort; one that cannot be read or parsed is skipped.
    Returns None when no surface yields a timestamp.
    """
    stamps: list[float] = []

    def _collect(raw, source: str) -> None:
        parsed = self._parse_capital_blob(raw, source)
        if parsed is None:
            return
        ts = self._extract_state_timestamp(parsed[1])
        if ts is not None:
            stamps.append(ts)

    # HZ state map.
    try:
        if self.state_map is not None:
            hz_keys = (
                "latest_nautilus",
                "engine_snapshot",
                CAPITAL_CORRECTIVE_REPLAY_HZ_KEY,
                "capital_checkpoint",
            )
            for key in hz_keys:
                try:
                    _collect(self.state_map.blocking().get(key), f"HZ {key}")
                except Exception:
                    continue
    except Exception:
        pass

    # HZ pnl map, today's entry.
    try:
        if self.pnl_map is not None:
            day_key = datetime.now(timezone.utc).strftime("%Y-%m-%d")
            _collect(self.pnl_map.blocking().get(day_key), f"HZ pnl[{day_key}]")
    except Exception:
        pass

    # Disk checkpoint.
    try:
        if CAPITAL_DISK_CHECKPOINT.exists():
            text = CAPITAL_DISK_CHECKPOINT.read_text()
            data = json.loads(text) if text else {}
            ts = self._extract_state_timestamp(data if isinstance(data, dict) else {})
            if ts is not None:
                stamps.append(ts)
    except Exception:
        pass

    # Disk ledger: only the newest row matters.
    try:
        if CAPITAL_UPDATE_LEDGER.exists():
            text = CAPITAL_UPDATE_LEDGER.read_text()
            rows = json.loads(text) if text else []
            if isinstance(rows, list) and rows and isinstance(rows[-1], dict):
                ts = self._extract_state_timestamp(rows[-1])
                if ts is not None:
                    stamps.append(ts)
    except Exception:
        pass

    return max(stamps) if stamps else None
def _resolved_capital_state_value(self, fallback: float | None = None) -> tuple[float | None, str, float | None]:
"""Return the freshest authoritative BLUE capital value available locally."""
candidates: list[tuple[float, int, float, str, float | None]] = []
source_rank = {
"capital_update_ledger": 65,
"latest_nautilus": 40,
"engine_snapshot": 30,
"pnl_day": 25,
"correction_replay_local": 20,
"correction_replay_hz": 10,
"capital_checkpoint": 5,
}
def _maybe_add_blob(raw, source: str, rank_key: str) -> None:
parsed = self._parse_capital_blob(raw, source)
if parsed is None:
return
capital, blob = parsed
ts = self._extract_state_timestamp(blob)
candidates.append(
(
ts if ts is not None else float("-inf"),
source_rank.get(rank_key, 0),
capital,
source,
ts,
)
)
try:
if CAPITAL_CORRECTIVE_REPLAY.exists():
try:
replay_blob = json.loads(CAPITAL_CORRECTIVE_REPLAY.read_text())
except Exception:
replay_blob = None
if isinstance(replay_blob, dict):
replay_capital = _safe_float(replay_blob.get("capital", 0.0), 0.0)
if replay_capital >= 1.0 and math.isfinite(replay_capital):
replay_ts = self._extract_state_timestamp(replay_blob)
candidates.append(
(
replay_ts if replay_ts is not None else float("-inf"),
source_rank["correction_replay_local"],
replay_capital,
"local corrective replay",
replay_ts,
)
)
except Exception:
pass
try:
if self.state_map is not None:
raw_ledger = self.state_map.blocking().get("capital_update_ledger")
ledger_rows = json.loads(raw_ledger) if isinstance(raw_ledger, str) and raw_ledger else list(raw_ledger or [])
if isinstance(ledger_rows, list) and ledger_rows:
last = ledger_rows[-1] if isinstance(ledger_rows[-1], dict) else None
if isinstance(last, dict):
capital_after = _safe_float(last.get("capital_after", last.get("capital", 0.0)), 0.0)
if capital_after >= 1.0 and math.isfinite(capital_after):
ledger_ts = self._extract_state_timestamp(last)
candidates.append(
(
ledger_ts if ledger_ts is not None else float("-inf"),
source_rank["capital_update_ledger"],
capital_after,
"capital_update_ledger",
ledger_ts,
)
)
for key, label, rank_key in (
("latest_nautilus", "HZ latest_nautilus", "latest_nautilus"),
("engine_snapshot", "HZ engine_snapshot", "engine_snapshot"),
(CAPITAL_CORRECTIVE_REPLAY_HZ_KEY, "HZ corrective replay", "correction_replay_hz"),
("capital_checkpoint", "HZ capital_checkpoint", "capital_checkpoint"),
):
try:
raw = self.state_map.blocking().get(key)
except Exception:
raw = None
_maybe_add_blob(raw, label, rank_key)
except Exception:
pass
try:
if self.pnl_map is not None:
day_key = datetime.now(timezone.utc).strftime("%Y-%m-%d")
raw = self.pnl_map.blocking().get(day_key)
_maybe_add_blob(raw, f"HZ pnl[{day_key}]", "pnl_day")
except Exception:
pass
try:
if CAPITAL_DISK_CHECKPOINT.exists():
raw = CAPITAL_DISK_CHECKPOINT.read_text()
data = json.loads(raw) if raw else {}
if isinstance(data, dict):
capital = _safe_float(data.get("capital", 0.0), 0.0)
ts = self._extract_state_timestamp(data)
if capital >= 1.0 and math.isfinite(capital):
candidates.append(
(
ts if ts is not None else float("-inf"),
source_rank["capital_checkpoint"],
capital,
"disk capital_checkpoint",
ts,
)
)
except Exception:
pass
if candidates:
candidates.sort(key=lambda item: (item[0], item[1]), reverse=True)
_, _, capital, source, ts = candidates[0]
return capital, source, ts
if fallback is not None:
try:
fallback_f = float(fallback)
except Exception:
fallback_f = None
if fallback_f is not None and math.isfinite(fallback_f) and fallback_f >= 1.0:
return fallback_f, "engine_fallback", None
return None, "unresolved", None
def _resolved_realized_trade_pnl(
    self,
    pending: Mapping[str, Any],
    outcome: Mapping[str, Any],
    *,
    exit_price: float | None = None,
) -> tuple[float, str]:
    """Resolve realized PnL from the most trustworthy close-payload fields.

    Tried in order, returning ``(pnl, method)`` from the first that applies:

    1. ``"net_pnl"``          — reported net PnL, finite and not ~zero
    2. ``"pnl_pct_notional"`` — ``pnl_pct`` times the entry notional
    3. ``"entry_exit_qty"``   — recomputed from entry/exit price and
       quantity (side defaults to SHORT)
    4. ``"raw_net"``          — the reported net PnL when it is finite
    5. ``"unresolved"``       — 0.0
    """
    nan = float("nan")

    raw_net = _safe_float(outcome.get("net_pnl", outcome.get("pnl", 0.0)), nan)
    if math.isfinite(raw_net) and abs(raw_net) >= 1e-9:
        return raw_net, "net_pnl"

    pnl_pct = _safe_float(outcome.get("pnl_pct", 0.0), nan)
    notional = _safe_float(pending.get("notional_entry", pending.get("notional", 0.0)), nan)
    if math.isfinite(pnl_pct) and math.isfinite(notional) and abs(pnl_pct) > 0.0 and notional > 0.0:
        return pnl_pct * notional, "pnl_pct_notional"

    entry_price = _safe_float(pending.get("entry_price", 0.0), nan)
    qty = _safe_float(pending.get("quantity", 0.0), nan)
    if exit_price is not None:
        resolved_exit = exit_price
    else:
        resolved_exit = _safe_float(outcome.get("exit_price", 0.0), nan)
    legs = (entry_price, qty, resolved_exit)
    if all(map(math.isfinite, legs)) and all(leg > 0.0 for leg in legs):
        side = str(pending.get("side", "SHORT") or "SHORT").upper()
        direction = -1.0 if side == "SHORT" else 1.0
        return direction * ((resolved_exit - entry_price) * qty), "entry_exit_qty"

    if math.isfinite(raw_net):
        return raw_net, "raw_net"
    return 0.0, "unresolved"
@staticmethod
def _truthy_flag(value: Any) -> bool:
"""Interpret loose flag values from runtime payloads."""
if isinstance(value, bool):
return value
if isinstance(value, (int, float)):
return bool(value)
if isinstance(value, str):
return value.strip().lower() in {"1", "true", "yes", "on", "y", "t"}
return False
def _resolved_capital_apply_pnl(self, outcome: Mapping[str, Any], realized_pnl: float) -> tuple[float, str]:
"""Resolve capital delta for close handling, suppressing known already-realized exits."""
if self._truthy_flag(outcome.get("capital_already_realized", False)):
return 0.0, "already_realized"
return float(realized_pnl or 0.0), "direct"
def _commit_capital_state(
self,
capital: float,
*,
reason: str = "",
source: str = "",
trade_id: str = "",
asset: str = "",
replay_blob: Mapping[str, Any] | None = None,
update_replay_key: bool = False,
mirror_control_plane: bool = True,
) -> dict | None:
"""Write capital to all canonical BLUE bookkeeping surfaces."""
capital = float(capital)
if capital < 1.0 or not math.isfinite(capital):
return None
payload = self._capital_state_payload(
capital,
reason=reason,
source=source,
trade_id=trade_id,
asset=asset,
replay_blob=replay_blob,
)
runtime_snapshot = getattr(self, "_last_engine_snapshot_payload", None)
if isinstance(runtime_snapshot, Mapping):
merged_payload = dict(runtime_snapshot)
merged_payload.update(payload)
payload = merged_payload
checkpoint_payload = json.dumps({"capital": capital, "ts": payload["ts"]})
state_payload = json.dumps(payload)
try:
if self.state_map is not None:
self.state_map.blocking().put("capital_checkpoint", checkpoint_payload)
self.state_map.blocking().put("latest_nautilus", state_payload)
self.state_map.blocking().put("engine_snapshot", state_payload)
if update_replay_key:
self.state_map.blocking().put(CAPITAL_CORRECTIVE_REPLAY_HZ_KEY, state_payload)
except Exception as e:
log(f" capital state HZ save failed: {e}")
if update_replay_key:
try:
CAPITAL_CORRECTIVE_REPLAY.write_text(state_payload)
except Exception as e:
log(f" capital corrective replay save failed: {e}")
try:
if self.pnl_map is not None:
day_key = datetime.now(timezone.utc).strftime('%Y-%m-%d')
self.pnl_map.blocking().put(day_key, state_payload)
except Exception as e:
log(f" capital pnl state save failed: {e}")
try:
CAPITAL_DISK_CHECKPOINT.write_text(checkpoint_payload)
except Exception as e:
log(f" capital disk save failed: {e}")
if mirror_control_plane and self.control_map is not None:
try:
self.control_map.blocking().put("blue_capital_update_latest", state_payload)
except Exception as e:
log(f" capital control plane mirror failed: {e}")
self.eng.capital = capital
return payload
def _apply_trade_capital_update(
self,
realized_pnl: float,
*,
reason: str,
source: str,
trade_id: str,
asset: str,
mirror_control_plane: bool = True,
) -> tuple[float, float]:
"""Apply a realized trade PnL to live capital and persist it immediately."""
capital_before, capital_source, capital_ts = self._resolved_capital_state_value(
fallback=float(getattr(self.eng, "capital", 0.0) or 0.0)
)
capital_before = float(capital_before or 0.0)
if capital_before < 1.0 or not math.isfinite(capital_before):
capital_before = 0.0
else:
self.eng.capital = capital_before
capital_after = capital_before + float(realized_pnl or 0.0)
payload = self._commit_capital_state(
capital_after,
reason=reason,
source=source,
trade_id=trade_id,
asset=asset,
mirror_control_plane=mirror_control_plane,
)
if payload is not None:
ledger_entry = self._capital_ledger_event_payload(
capital_before=capital_before,
capital_after=capital_after,
reason=reason,
source=source,
trade_id=trade_id,
asset=asset,
event_ts=self._parse_timestamp_seconds(payload.get("ts")),
applies_before_ts=self._parse_timestamp_seconds(payload.get("ts")),
mode="terminal_update",
)
self._record_capital_ledger_event(ledger_entry)
if capital_source != "engine_fallback":
log(
" capital update base resolved from "
f"{capital_source}"
+ (f" ts={capital_ts:.3f}" if capital_ts is not None else "")
+ f": before={capital_before:.2f} after={capital_after:.2f}"
)
return capital_before, capital_after
def _apply_internal_capital_update(self, cmd: dict) -> tuple[dict | None, str]:
    """Apply an in-band capital update command to the live BLUE engine.

    Args:
        cmd: Command mapping. Reads ``capital`` (required) plus the optional
            ``replay_blob``, ``event_ts``/``ts``, ``applies_before_ts``,
            ``reason``, ``source``, ``trade_id`` and ``asset`` keys.

    Returns:
        ``(effect, status)`` where ``status`` is one of:

        * ``"BAD_CAPITAL"`` - capital is missing, non-finite or below 1.0, or
          the commit returned ``None``; ``effect`` is ``None``.
        * ``"RECORDED_HISTORICAL"`` - the replay blob is more than one second
          older than the current capital state, so only a ledger entry is
          recorded (capital unchanged); ``effect`` is that ledger entry.
        * ``"APPLIED"`` - capital was committed; ``effect`` is the commit
          payload.
    """
    capital = _safe_float(cmd.get("capital", None), float("nan"))
    if capital < 1.0 or not math.isfinite(capital):
        return None, "BAD_CAPITAL"

    raw_blob = cmd.get("replay_blob")
    replay_blob = raw_blob if isinstance(raw_blob, Mapping) else None
    capital_before = float(getattr(self.eng, "capital", capital) or capital)
    event_ts = self._parse_timestamp_seconds(
        cmd.get("event_ts")
        or cmd.get("ts")
        or (replay_blob.get("updated_at") if replay_blob else None)
        or (replay_blob.get("ts") if replay_blob else None)
    )
    applies_before_ts = self._parse_timestamp_seconds(cmd.get("applies_before_ts"))

    # Coerce the descriptive fields once; every ledger/commit call below
    # uses the same values.
    reason = str(cmd.get("reason", "CAPITAL_UPDATE") or "CAPITAL_UPDATE")
    source = str(cmd.get("source", "control_plane") or "control_plane")
    trade_id = str(cmd.get("trade_id", "") or "")
    asset = str(cmd.get("asset", "") or "")

    if replay_blob is not None:
        replay_ts = self._extract_state_timestamp(replay_blob)
        current_ts = self._current_capital_state_timestamp()
        if replay_ts is not None and current_ts is not None and replay_ts + 1.0 < current_ts:
            # Stale replay: journal it for the record but leave live
            # capital untouched (capital_after == capital_before).
            ledger_entry = self._capital_ledger_event_payload(
                capital_before=capital_before,
                capital_after=capital_before,
                reason=reason,
                source=source,
                trade_id=trade_id,
                asset=asset,
                event_ts=event_ts,
                applies_before_ts=current_ts,
                mode="historical_replay_only",
                replay_blob=replay_blob,
            )
            self._record_capital_ledger_event(ledger_entry)
            log(
                " capital update recorded as historical replay "
                f"ts={replay_ts:.3f} current_ts={current_ts:.3f}"
            )
            return ledger_entry, "RECORDED_HISTORICAL"

    payload = self._commit_capital_state(
        capital,
        reason=reason,
        source=source,
        trade_id=trade_id,
        asset=asset,
        replay_blob=replay_blob,
        update_replay_key=bool(replay_blob),
        mirror_control_plane=True,
    )
    if payload is None:
        return None, "BAD_CAPITAL"

    # Reaching this point means the historical branch did not return, so the
    # ledger mode is always "terminal_update".
    ledger_entry = self._capital_ledger_event_payload(
        capital_before=capital_before,
        capital_after=capital,
        reason=reason,
        source=source,
        trade_id=trade_id,
        asset=asset,
        event_ts=event_ts,
        applies_before_ts=applies_before_ts,
        mode="terminal_update",
        replay_blob=replay_blob,
    )
    self._record_capital_ledger_event(ledger_entry)
    return payload, "APPLIED"
@staticmethod
def _sc_trim_fraction(current_mult: float, target_mult: float) -> float:
"""Translate a desired remaining multiplier into a retract fraction."""
cur = float(current_mult or 0.0)
tgt = float(target_mult or 0.0)
if not math.isfinite(cur) or not math.isfinite(tgt):
return 0.0
cur = max(0.0, cur)
tgt = max(0.0, tgt)
if cur <= 0.0 or tgt >= cur:
return 0.0
return max(0.0, min(1.0, 1.0 - (tgt / cur)))
def _record_sc_haircut(self, *, trade_id: str, pending: dict, source: str) -> dict | None:
    """Record SC haircut guidance as sizing metadata only.

    SC is not an actuation surface. It records a haircut target that later
    sizing logic can use, but it does not enqueue a live retract command.

    The target is the smallest multiplier recommended by the
    ``sc_threshold_advisor`` / ``sc_bucket_gauge`` records found in
    ``pending``. A record is written only when that target is below both the
    position's current multiplier (current notional / entry notional) and the
    previously recorded target.

    Args:
        trade_id: Trade the haircut applies to; must match the open position
            when the position carries a trade id.
        pending: Pending-entry dict; updated in place with ``sc_haircut_*``
            keys and stored back into ``self._pending_entries``.
        source: Label stored with the record (falls back to ``"sc"``).

    Returns:
        A summary dict of the recorded haircut, or ``None`` when nothing was
        recorded.
    """
    if not trade_id:
        return None
    pos = getattr(self.eng, "position", None)
    if pos is None:
        return None
    pos_tid = str(getattr(pos, "trade_id", "") or "")
    if pos_tid and pos_tid != str(trade_id):
        return None

    def _mult(raw, default: float = 1.0) -> float:
        # Only a missing/blank or non-finite value falls back to the default.
        # A literal 0.0 is a legitimate "trim everything" recommendation and
        # must not be coalesced to 1.0 (which `raw or 1.0` would do).
        if raw is None or raw == "":
            return default
        value = float(raw)
        return value if math.isfinite(value) else default

    recs: list[float] = []
    sc_rec = pending.get("sc_threshold_advisor")
    if isinstance(sc_rec, dict):
        recs.append(_mult(sc_rec.get("recommended_mult")))
    gauge_rec = pending.get("sc_bucket_gauge")
    if isinstance(gauge_rec, dict):
        recs.append(_mult(gauge_rec.get("recommended_size_mult")))
    if not recs:
        return None

    target_mult = max(0.0, min(recs))
    current_notional = float(getattr(pos, "notional", pending.get("notional", 0.0)) or 0.0)
    entry_notional = float(
        pending.get("notional_entry", pending.get("notional", current_notional)) or current_notional
    )
    if current_notional <= 0.0 or entry_notional <= 0.0:
        return None
    current_mult = current_notional / entry_notional

    # A stored last target of 0.0 must be honoured, otherwise the same full
    # haircut would be re-recorded on every call.
    last_target = _mult(pending.get("sc_haircut_last_target_mult"))
    if target_mult >= current_mult - 1e-6 or target_mult >= last_target - 1e-6:
        return None
    frac = self._sc_trim_fraction(current_mult=current_mult, target_mult=target_mult)
    if frac <= 0.0:
        return None

    pending["sc_haircut_target_mult"] = target_mult
    pending["sc_haircut_fraction"] = frac
    pending["sc_haircut_source"] = str(source or "sc")
    pending["sc_haircut_last_updated_ts"] = float(time.time())
    pending["sc_haircut_last_target_mult"] = target_mult
    self._pending_entries[trade_id] = pending
    log(
        f" SC haircut record: {trade_id} target={target_mult:.2f} "
        f"cur={current_mult:.2f} frac={frac:.3f} source={source}"
    )
    return {
        "trade_id": trade_id,
        "target_mult": target_mult,
        "current_mult": current_mult,
        "fraction": frac,
        "source": str(source or "sc"),
    }
def _apply_sc_entry_size_multiplier(self, trade_id: str, entry: dict, pending: dict) -> float:
    """Apply the live EsoF/SC size gate to an entry before persistence.

    This is the actual sizing actuation surface for the deterministic SC gate.
    It keeps the haircut size-only: no retract/close commands are enqueued.

    The multiplier is read from ``self._last_esof_size_mult`` (falsy or
    non-finite values become 1.0) and clamped to ``[0, 1]``. When it is below
    0.999 and a positive base notional can be determined, ``entry``,
    ``pending`` and the matching open position are rescaled in place.

    Returns:
        The clamped multiplier (also stored as ``pending["sc_exec_mult"]``).
    """

    def _read(key: str) -> float:
        # entry takes precedence over pending; missing/falsy collapses to 0.0
        return float(entry.get(key, pending.get(key, 0.0)) or 0.0)

    mult = float(self._last_esof_size_mult or 1.0)
    if not math.isfinite(mult):
        mult = 1.0
    mult = max(0.0, min(1.0, mult))
    pending["sc_exec_mult"] = mult
    if mult >= 0.999:
        return mult

    entry_price = _read("entry_price")
    base_notional = _read("notional")
    if base_notional <= 0.0 and entry_price > 0.0:
        # No notional recorded: derive it from quantity * price.
        base_notional = _read("quantity") * entry_price
    if base_notional <= 0.0:
        return mult
    effective_notional = round(base_notional * mult, 12)
    if effective_notional <= 0.0:
        return mult

    base_quantity = _read("quantity")
    if base_quantity <= 0.0 and entry_price > 0.0:
        base_quantity = base_notional / entry_price
    if entry_price > 0.0:
        effective_quantity = round(effective_notional / max(entry_price, 1e-12), 6)
    else:
        effective_quantity = base_quantity * mult

    base_leverage = _read("leverage")
    effective_leverage = base_leverage
    if base_leverage > 0.0:
        effective_leverage = round(base_leverage * mult, 6)
    leverage_scaled = effective_leverage > 0.0

    # --- entry dict -------------------------------------------------------
    entry.setdefault("notional_entry", base_notional)
    entry["notional"] = effective_notional
    entry["quantity"] = effective_quantity
    if leverage_scaled:
        entry["leverage"] = effective_leverage
    entry["sc_exec_mult"] = mult
    entry["sc_exec_notional"] = effective_notional
    entry["sc_exec_quantity"] = effective_quantity

    # --- pending dict -----------------------------------------------------
    pending.setdefault("notional_entry", base_notional)
    pending["notional"] = effective_notional
    pending["quantity"] = effective_quantity
    if leverage_scaled:
        pending["leverage"] = effective_leverage
    pending["sc_exec_notional"] = effective_notional
    pending["sc_exec_quantity"] = effective_quantity
    pending["sc_exec_leverage"] = effective_leverage if leverage_scaled else base_leverage

    # --- live position (best effort, attribute by attribute) --------------
    pos = getattr(self.eng, "position", None)
    if pos is not None and str(getattr(pos, "trade_id", "") or "") in ("", str(trade_id)):
        updates = [("notional", effective_notional), ("quantity", effective_quantity)]
        if leverage_scaled:
            updates.append(("leverage", effective_leverage))
        for attr, value in updates:
            try:
                setattr(pos, attr, value)
            except Exception:
                pass

    log(
        f" SC haircut execute: {trade_id} mult={mult:.3f} "
        f"notional={base_notional:.6f}->{effective_notional:.6f} "
        f"qty={base_quantity:.6f}->{effective_quantity:.6f}"
    )
    return mult
def _build_retract_exit(self, *, trade_id: str, reason: str, bars_held: int, pnl_pct: float, net_pnl: float) -> dict:
    """Build the forced-exit payload emitted when a retract fully closes a trade.

    ``bars_held`` is floored at 0 and the PnL figures are coerced to float.
    """
    held = int(max(0, bars_held))
    pct = float(pnl_pct)
    pnl = float(net_pnl)
    exit_payload = {
        "trade_id": trade_id,
        "reason": reason,
        "bars_held": held,
        "pnl_pct": pct,
        "net_pnl": pnl,
    }
    # Full retract legs already realize pnl incrementally; close-path capital
    # application must be a no-op to avoid double-booking.
    exit_payload["capital_already_realized"] = True
    # Preserve explicit economic fields for observability/reporting.
    exit_payload["economic_pnl"] = pnl
    exit_payload["economic_pnl_pct"] = pct
    return exit_payload
def _build_trade_execution_quality_summary(
    self,
    *,
    trade_id: str,
    pending: dict,
    exit_payload: dict,
    capital_before: float,
    capital_after: float,
    realized_pnl: float,
    exit_price: float,
    source: str,
) -> dict:
    """Delegate to the module-level execution-quality summary helper.

    All keyword arguments are forwarded unchanged and the current
    ``_ch_ts_us()`` value is added as ``ts``.

    Raises:
        RuntimeError: when the helper is ``None``.
    """
    helper = build_trade_execution_quality_summary
    if helper is None:
        raise RuntimeError("execution quality summary helper unavailable")
    summary = helper(
        trade_id=trade_id,
        pending=pending,
        exit_payload=exit_payload,
        capital_before=capital_before,
        capital_after=capital_after,
        realized_pnl=realized_pnl,
        exit_price=exit_price,
        source=source,
        ts=_ch_ts_us(),
    )
    return summary
def _persist_trade_execution_quality(self, record: dict) -> None:
    """Fan an execution-quality record out to three sinks, best effort.

    Each sink is written independently; a failure is logged and does not
    prevent the remaining writes.
    """
    # 1) ClickHouse journal row.
    try:
        ch_put("trade_execution_quality", record)
    except Exception as e:
        log(f" trade_execution_quality CH write failed: {e}")

    # 2) State map (skipped when the map is not available).
    try:
        state_map = self.state_map
        if state_map is not None:
            state_map.blocking().put("last_trade_execution_quality", record)
    except Exception as e:
        log(f" trade execution quality HZ state write failed: {e}")

    # 3) Control-plane mirror (skipped when the map is not available).
    try:
        control_map = self.control_map
        if control_map is not None:
            control_map.blocking().put("blue_last_trade_execution_quality", record)
    except Exception as e:
        log(f" trade execution quality control plane write failed: {e}")
def _chain_state_for_pending(
    self,
    trade_id: str,
    pending: dict,
    *,
    chain_mode: str = "LIVE",
    chain_head_leg_id: str | None = None,
    chain_prev_leg_id: str | None = None,
    chain_seq: int | None = None,
) -> dict:
    """Return the canonical linked-list state for the current open trade head.

    Numeric fields are read from ``pending`` with missing/falsy values
    collapsing to zero. Explicit ``chain_*`` arguments take precedence over
    the values stored in ``pending``. The result is whatever
    ``_build_chain_state`` returns.
    """
    if chain_seq is not None:
        seq = int(chain_seq)
    else:
        seq = int(pending.get("retraction_legs", 0) or 0)

    quantity = float(pending.get("quantity", 0.0) or 0.0)
    entry_price = float(pending.get("entry_price", 0.0) or 0.0)
    # Fall back to the entry notional when no current notional is stored.
    notional = float(pending.get("notional", pending.get("notional_entry", 0.0)) or 0.0)
    entry_bar = int(pending.get("entry_bar", 0) or 0)
    entry_ts = int(pending.get("entry_ts", 0) or 0)
    realized = float(pending.get("realized_pnl_legs_total", 0.0) or 0.0)

    if chain_prev_leg_id is not None:
        prev_leg = chain_prev_leg_id
    else:
        prev_leg = str(pending.get("chain_prev_leg_id", "") or "")

    return _build_chain_state(
        trade_id=str(trade_id or ""),
        asset=str(pending.get("asset", "") or ""),
        side=str(pending.get("side", "") or "SHORT"),
        entry_price=entry_price,
        quantity=quantity,
        notional=notional,
        entry_bar=entry_bar,
        entry_ts=entry_ts,
        retraction_legs=seq,
        realized_pnl_legs_total=realized,
        chain_root_trade_id=str(pending.get("chain_root_trade_id", trade_id) or trade_id),
        chain_head_leg_id=chain_head_leg_id or pending.get("chain_head_leg_id"),
        chain_prev_leg_id=prev_leg,
        chain_mode=chain_mode,
    )
def _load_chain_ledger_state(self, trade_id: str) -> dict | None:
"""Load the latest reconstruction payload for a trade, if ClickHouse is reachable."""
try:
import base64 as _b64
escaped_tid = str(trade_id or "").replace("'", "''")
sql = (
"SELECT event_type, event_id, payload_json "
"FROM dolphin.trade_reconstruction "
f"WHERE trade_id = '{escaped_tid}' "
"ORDER BY ts DESC LIMIT 1 FORMAT JSONEachRow"
)
req = urllib.request.Request(
"http://localhost:8123/?database=dolphin",
data=sql.encode(),
headers={"Authorization": "Basic " +
_b64.b64encode(b"dolphin:dolphin_ch_2026").decode()},
)
with urllib.request.urlopen(req, timeout=5) as r:
raw = r.read().decode().strip()
if not raw:
return None
row = json.loads(raw.splitlines()[0])
payload = json.loads(row.get("payload_json", "{}") or "{}")
payload["event_type"] = row.get("event_type", "")
payload["event_id"] = row.get("event_id", "")
return payload
except Exception:
return None
def _chain_state_from_reconstruction(self, trade_id: str, pending: dict, recon: dict | None) -> dict:
"""Merge reconstruction payload chain hints with the current live state."""
chain_data = {}
seq = 0
prev_leg_id = ""
head_leg_id = f"{trade_id}:open"
chain_mode = "LEGACY"
if recon:
chain_data.update(recon)
nested = recon.get("chain")
if isinstance(nested, dict):
chain_data.update(nested)
seq = int(chain_data.get("chain_seq", chain_data.get("retraction_legs", 0)) or 0)
prev_leg_id = str(chain_data.get("chain_prev_leg_id", "") or "")
head_leg_id = str(chain_data.get("chain_head_leg_id", "") or head_leg_id)
chain_mode = str(chain_data.get("chain_mode", "LIVE") or "LIVE")
if "chain_token" not in chain_data:
chain_mode = "LEGACY_REBUILT"
chain = self._chain_state_for_pending(
trade_id,
pending,
chain_mode=chain_mode,
chain_head_leg_id=head_leg_id,
chain_prev_leg_id=prev_leg_id,
chain_seq=seq,
)
if chain_data.get("chain_token"):
expected = str(chain_data.get("chain_token", "") or "")
if expected != chain.get("chain_token"):
# Do not hard-halt restore on legacy/stale token drift.
# Keep trading continuity with a rebuilt chain and surface the
# mismatch loudly for follow-up reconciliation.
derived = str(chain.get("chain_token", "") or "")
log(
" chain token mismatch on restore: "
f"trade={trade_id} stored={expected[:12]} derived={derived[:12]} "
"— continuing with derived token"
)
chain["chain_mode"] = "LEGACY_REBUILT_MISMATCH"
# A log line is not forensics — emit a first-class journal
# event so the mismatch is queryable (the XTZ 863c21da
# incident took a day to reconstruct from grep).
try:
ch_put("trade_reconstruction", {
"ts": _ch_ts_us(),
"trade_id": trade_id,
"event_type": "CHAIN_TOKEN_MISMATCH",
"event_id": f"{trade_id}:chain_mismatch",
"payload_json": json.dumps({
"stored_token": expected,
"derived_token": derived,
"chain_mode": "LEGACY_REBUILT_MISMATCH",
"pending": {k: pending.get(k) for k in
("asset", "side", "entry_price", "quantity",
"notional", "entry_bar") if k in pending},
}, default=str),
"market_state_bundle_json": "",
"tp_base_pct": 0.0,
"tp_effective_pct": 0.0,
"our_leverage": 0.0,
})
except Exception:
pass
return chain
def _apply_internal_retract(self, cmd: dict, prices_dict: dict) -> tuple[dict | None, str]:
"""Apply partial retraction on in-memory BLUE position; returns (forced_exit, status)."""
with self.eng_lock:
pos = getattr(self.eng, "position", None)
if pos is None:
return None, "NO_POSITION"
tid = str(getattr(pos, "trade_id", "") or "")
if not tid:
return None, "NO_TRADE_ID"
req_tid = str(cmd.get("trade_id", "") or "").strip()
if req_tid and req_tid != tid:
return None, f"TRADE_MISMATCH open={tid} cmd={req_tid}"
pending = self._pending_entries.get(tid) or {}
side = str(pending.get("side", "SHORT") or "SHORT").upper()
entry_price = float(pending.get("entry_price", getattr(pos, "entry_price", 0.0)) or 0.0)
if entry_price <= 0:
return None, "BAD_ENTRY_PRICE"
open_notional = float(getattr(pos, "notional", 0.0) or 0.0)
if open_notional <= 0:
return None, "ZERO_NOTIONAL"
frac = float(cmd.get("fraction", 0.0) or 0.0)
if not (0.0 < frac <= 1.0):
return None, "BAD_FRACTION"
expected_chain = self._chain_state_for_pending(tid, pending)
cmd_chain_token = str(cmd.get("chain_token", "") or "").strip()
cmd_chain_head = str(cmd.get("chain_head_leg_id", "") or "").strip()
cmd_chain_root = str(cmd.get("chain_root_trade_id", "") or "").strip()
cmd_chain_seq = int(cmd.get("chain_seq", expected_chain["chain_seq"]) or expected_chain["chain_seq"])
if not cmd_chain_token or not cmd_chain_head or not cmd_chain_root:
return None, "NO_CHAIN_LINK"
if cmd_chain_root != expected_chain["chain_root_trade_id"]:
return None, f"CHAIN_ROOT_MISMATCH expected={expected_chain['chain_root_trade_id']} cmd={cmd_chain_root}"
if cmd_chain_head != expected_chain["chain_head_leg_id"] or cmd_chain_token != expected_chain["chain_token"]:
return None, (
f"CHAIN_MISMATCH head={expected_chain['chain_head_leg_id']} "
f"seq={expected_chain['chain_seq']} token={expected_chain['chain_token'][:12]}"
)
if cmd_chain_seq != expected_chain["chain_seq"]:
return None, (
f"CHAIN_SEQ_MISMATCH expected={expected_chain['chain_seq']} cmd={cmd_chain_seq}"
)
reduce_notional = min(open_notional, open_notional * frac)
if reduce_notional <= 0.0:
return None, "ZERO_REDUCE_NOTIONAL"
current_price = float(prices_dict.get(pos.asset, getattr(pos, "current_price", entry_price)) or entry_price)
if current_price <= 0:
current_price = entry_price
direction = -1.0 if side == "SHORT" else 1.0
pnl_pct_now = direction * ((current_price - entry_price) / entry_price)
net_pnl_leg = pnl_pct_now * reduce_notional
bars_held = max(0, int(self.bar_idx - int(pending.get("entry_bar", max(0, self.bar_idx - 1)) or max(0, self.bar_idx - 1))))
capital_before, capital_after = self._apply_trade_capital_update(
net_pnl_leg,
reason=str(cmd.get("reason", "RETRACT")),
source=str(cmd.get("source", "internal")),
trade_id=tid,
asset=str(getattr(pos, "asset", pending.get("asset", ""))),
mirror_control_plane=True,
)
remaining_notional = max(0.0, open_notional - reduce_notional)
remaining_qty = round((remaining_notional / entry_price), 6) if entry_price > 0 else 0.0
pos.notional = remaining_notional
pos.current_price = current_price
try:
pos.pnl_pct = pnl_pct_now
except Exception:
pass
pending.setdefault("notional_entry", float(pending.get("notional", open_notional) or open_notional))
pending["notional"] = remaining_notional
pending["quantity"] = remaining_qty
pending["retraction_legs"] = int(pending.get("retraction_legs", 0) or 0) + 1
pending["realized_pnl_legs_total"] = float(pending.get("realized_pnl_legs_total", 0.0) or 0.0) + net_pnl_leg
leg_seq = int(pending["retraction_legs"])
leg_id = f"{tid}:x{leg_seq:03d}"
chain_state = self._chain_state_for_pending(
tid,
{
**pending,
"chain_root_trade_id": expected_chain["chain_root_trade_id"],
"chain_prev_leg_id": expected_chain["chain_head_leg_id"],
"chain_head_leg_id": leg_id,
"chain_mode": "LIVE",
},
chain_mode="LIVE",
chain_head_leg_id=leg_id,
chain_prev_leg_id=expected_chain["chain_head_leg_id"],
chain_seq=leg_seq,
)
self._pending_entries[tid] = pending
pending.update(chain_state)
current_bars_held = bars_held
entry_bar = int(pending.get("entry_bar", max(0, self.bar_idx - current_bars_held)) or max(0, self.bar_idx - current_bars_held))
# Full close when the remainder is economic dust — threshold is
# POSITION_DUST_NOTIONAL_USD, deliberately ALIGNED with the
# _ps_write_open lifecycle gate so no remainder can exist that is
# "open" in memory but rounds to a zero-size row on disk
# (the malformed-OPEN class, MALFORMED_OPEN_RESTORE_BUG.md).
fully_closed = remaining_notional <= POSITION_DUST_NOTIONAL_USD or remaining_qty <= 0.0
# The leg ledger rows (trade_exit_legs + trade_reconstruction) are
# written for EVERY leg including the terminal one. The previous
# full-close early-return skipped them, losing the final leg from
# the §38.9 replay surface.
ch_put("trade_exit_legs", {
"ts": _ch_ts_us(),
"date": str(pending.get("entry_date", self.current_day or "")),
"strategy": "blue",
"trade_id": tid,
"chain_root_trade_id": str(chain_state.get("chain_root_trade_id", tid) or tid),
"chain_head_leg_id": str(chain_state.get("chain_head_leg_id", leg_id) or leg_id),
"chain_prev_leg_id": str(chain_state.get("chain_prev_leg_id", "") or ""),
"chain_seq": int(chain_state.get("chain_seq", leg_seq) or leg_seq),
"chain_token": str(chain_state.get("chain_token", "") or ""),
"chain_mode": str(chain_state.get("chain_mode", "LIVE") or "LIVE"),
"exit_leg_id": leg_id,
"exit_seq": leg_seq,
"command_id": str(cmd.get("command_id", "")),
"source": str(cmd.get("source", "internal")),
"reason": str(cmd.get("reason", "RETRACT")),
"asset": str(getattr(pos, "asset", pending.get("asset", ""))),
"side": side,
"entry_price": entry_price,
"exit_price": current_price,
"fraction": frac,
"capital_before": capital_before,
"capital_after": capital_after,
"exit_notional": reduce_notional,
"remaining_notional": remaining_notional,
"remaining_qty": remaining_qty,
"pnl_pct_leg": pnl_pct_now,
"pnl_leg": net_pnl_leg,
"pnl_realized_total": float(pending.get("realized_pnl_legs_total", 0.0) or 0.0),
"bars_held": bars_held,
})
ch_put("trade_reconstruction", {
"ts": _ch_ts_us(),
"trade_id": tid,
"event_type": "FULL_RETRACT_EXIT" if fully_closed else "PARTIAL_EXIT",
"event_id": leg_id,
"payload_json": json.dumps({
"command": cmd,
"entry_price": entry_price,
"exit_price": current_price,
"exit_notional": reduce_notional,
"remaining_notional": remaining_notional,
"pnl_pct_leg": pnl_pct_now,
"pnl_leg": net_pnl_leg,
"pnl_realized_total": float(pending.get("realized_pnl_legs_total", 0.0) or 0.0),
"bar_idx": int(self.bar_idx),
"chain": chain_state,
}),
"market_state_bundle_json": str(pending.get("market_state_bundle_json", "") or ""),
"tp_base_pct": float(pending.get("tp_base_pct", 0.0) or 0.0),
"tp_effective_pct": float(pending.get("tp_effective_pct", 0.0) or 0.0),
"our_leverage": float(pending.get("our_leverage", 0.0) or 0.0),
})
if fully_closed:
self.eng.position = None
try:
self.eng.exit_manager._positions.pop(tid, None)
except Exception:
pass
total_realized = float(pending.get("realized_pnl_legs_total", 0.0) or 0.0)
denom = max(float(pending.get("notional_entry", open_notional) or open_notional), 1e-12)
forced = self._build_retract_exit(
trade_id=tid,
reason=str(cmd.get("reason", "RETRACT_FULL")),
bars_held=bars_held,
pnl_pct=total_realized / denom,
net_pnl=total_realized,
)
return forced, "FULL_CLOSE"
# Partial remainder: persist through the canonical OPEN write gate
# (lifecycle invariant enforced there) instead of a raw ch_put —
# the bypass was the causal origin of zero-size OPEN snapshots.
wrote = self._ps_write_open(
tid,
{
**pending,
"asset": str(getattr(pos, "asset", pending.get("asset", ""))),
"side": side,
"entry_price": entry_price,
"quantity": pending["quantity"],
"leverage": pending.get("leverage", getattr(pos, "leverage", 0.0)),
},
ts=_ch_ts_us(),
entry_bar=entry_bar,
bars_held=current_bars_held,
pnl=float(pending.get("realized_pnl_legs_total", 0.0) or 0.0),
)
if not wrote:
# Gate refused (dust slipped past fully_closed somehow) —
# surface loudly; the invariant says this must not happen.
log(
f"RETRACT WARNING: remainder for {tid} refused by OPEN gate "
f"(qty={pending['quantity']} notional={remaining_notional:.6f}) — "
"treat as accounting anomaly"
)
return None, "PARTIAL_OK"
BLUE hardening: spool-poison guards, dead-session clock fix, HZ black-box, RETRACT race-safety Seven uncommitted production fixes to BLUE's main runner that the LIVE process has already been running since the 2026-06-15 17:23 restart (file mtime 17:17, pid started 17:23). Each fix answers a documented incident; committing now so they survive in history and a stray checkout can't silently revert running-config code on the next restart. 1. bars_held = max(0, int(...)) at BOTH journal sites (terminal + sub-day). CH column is UInt16 — a negative value poisons the spool with a head-of-line jam (incident 2026-06-12: bars_held=-106). 2. entry_bar = int(restored_entry_bar) at BOTH reconstruction sites; NEVER from chain_meta. trade_reconstruction payloads carry the DEAD session's bar counter, so the old override reinstated the stale clock frame the re-anchor exists to fix → negative bars_held → same UInt16 spool poison (zombie-trade resurrections, incident 2026-06-12). restored_entry_bar already encodes hold continuity via stored_bars in THIS session's frame. 3. capital parse handles list/ledger-style payloads: when the restore blob is a list of update rows, take the latest dict row instead of falling through to {} and losing the capital anchor. 4. _connect_hz routes the `hazelcast` logger to stderr at INFO. The silent-HZ-death investigation found ZERO client log lines because nothing routed them; without this the reactor's health is invisible. 5. _dump_blackbox(reason): forensic thread dump before a watchdog restart — lifecycle.is_running, active_connections, every thread's stack, and a flag when any hazelcast/reactor-named thread is MISSING (= reactor died, the prime suspect for the silent 40min–8h client deaths). print()-only, CIFS-safe. _watchdog_restart calls it first. 6. _drain_runtime_commands / _process_runtime_commands gain `*, allow_retract=True`; the heartbeat path drains with allow_retract=False and re-queues any RETRACT commands. 
A RETRACT can force a terminal close that must run through the scan-thread close finalizer, so the heartbeat must not race it. 7. +import traceback (for the black-box stack dumps). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 12:03:20 +02:00
def _process_runtime_commands(
self,
prices_dict: dict,
*,
allow_retract: bool = True,
) -> dict | None:
"""Drain BLUE runtime commands from control plane and apply retractions."""
if self.control_map is None:
return None
key = "blue_runtime_commands"
try:
raw = self.control_map.blocking().get(key)
if not raw:
return None
queue = json.loads(raw) if isinstance(raw, str) else list(raw)
if not isinstance(queue, list) or not queue:
return None
BLUE hardening: spool-poison guards, dead-session clock fix, HZ black-box, RETRACT race-safety Seven uncommitted production fixes to BLUE's main runner that the LIVE process has already been running since the 2026-06-15 17:23 restart (file mtime 17:17, pid started 17:23). Each fix answers a documented incident; committing now so they survive in history and a stray checkout can't silently revert running-config code on the next restart. 1. bars_held = max(0, int(...)) at BOTH journal sites (terminal + sub-day). CH column is UInt16 — a negative value poisons the spool with a head-of-line jam (incident 2026-06-12: bars_held=-106). 2. entry_bar = int(restored_entry_bar) at BOTH reconstruction sites; NEVER from chain_meta. trade_reconstruction payloads carry the DEAD session's bar counter, so the old override reinstated the stale clock frame the re-anchor exists to fix → negative bars_held → same UInt16 spool poison (zombie-trade resurrections, incident 2026-06-12). restored_entry_bar already encodes hold continuity via stored_bars in THIS session's frame. 3. capital parse handles list/ledger-style payloads: when the restore blob is a list of update rows, take the latest dict row instead of falling through to {} and losing the capital anchor. 4. _connect_hz routes the `hazelcast` logger to stderr at INFO. The silent-HZ-death investigation found ZERO client log lines because nothing routed them; without this the reactor's health is invisible. 5. _dump_blackbox(reason): forensic thread dump before a watchdog restart — lifecycle.is_running, active_connections, every thread's stack, and a flag when any hazelcast/reactor-named thread is MISSING (= reactor died, the prime suspect for the silent 40min–8h client deaths). print()-only, CIFS-safe. _watchdog_restart calls it first. 6. _drain_runtime_commands / _process_runtime_commands gain `*, allow_retract=True`; the heartbeat path drains with allow_retract=False and re-queues any RETRACT commands. 
A RETRACT can force a terminal close that must run through the scan-thread close finalizer, so the heartbeat must not race it. 7. +import traceback (for the black-box stack dumps). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 12:03:20 +02:00
if allow_retract:
self.control_map.blocking().put(key, json.dumps([]))
else:
deferred = [
cmd for cmd in queue
if isinstance(cmd, dict)
and str(cmd.get("action", "") or "").upper() == "RETRACT"
]
queue = [
cmd for cmd in queue
if not (
isinstance(cmd, dict)
and str(cmd.get("action", "") or "").upper() == "RETRACT"
)
]
self.control_map.blocking().put(key, json.dumps(deferred))
except Exception as e:
log(f"RUNTIME_CMD read failed: {e}")
return None
forced_exit = None
for cmd in queue:
if not isinstance(cmd, dict):
continue
cid = str(cmd.get("command_id", "") or "")
if cid and cid in self._processed_retract_set:
hotkey = str(cmd.get("action", "") or "").upper() or "RUNTIME"
ch_put("hotkey_audit", {
"ts": int(time.time() * 1000),
"hotkey": f"{hotkey}_REPLAY",
"request_json": json.dumps(cmd, default=str),
"result": "IDEMPOTENT_REPLAY",
"effect_json": json.dumps({}, default=str),
})
continue
action = str(cmd.get("action", "") or "").upper()
if action == "RETRACT":
fx, status = self._apply_internal_retract(cmd, prices_dict)
self._mark_runtime_command_seen(cid)
ch_put("hotkey_audit", {
"ts": int(time.time() * 1000),
"hotkey": "RETRACT",
"request_json": json.dumps(cmd, default=str),
"result": status,
"effect_json": json.dumps({"forced_exit": bool(fx)}, default=str),
})
if fx is not None:
forced_exit = fx
continue
if action in ("SET_CAPITAL", "CAPITAL_UPDATE"):
effect, status = self._apply_internal_capital_update(cmd)
self._mark_runtime_command_seen(cid)
ch_put("hotkey_audit", {
"ts": int(time.time() * 1000),
"hotkey": "CAPITAL_UPDATE",
"request_json": json.dumps(cmd, default=str),
"result": status,
"effect_json": json.dumps(effect or {}, default=str),
})
continue
return forced_exit
BLUE hardening: spool-poison guards, dead-session clock fix, HZ black-box, RETRACT race-safety Seven uncommitted production fixes to BLUE's main runner that the LIVE process has already been running since the 2026-06-15 17:23 restart (file mtime 17:17, pid started 17:23). Each fix answers a documented incident; committing now so they survive in history and a stray checkout can't silently revert running-config code on the next restart. 1. bars_held = max(0, int(...)) at BOTH journal sites (terminal + sub-day). CH column is UInt16 — a negative value poisons the spool with a head-of-line jam (incident 2026-06-12: bars_held=-106). 2. entry_bar = int(restored_entry_bar) at BOTH reconstruction sites; NEVER from chain_meta. trade_reconstruction payloads carry the DEAD session's bar counter, so the old override reinstated the stale clock frame the re-anchor exists to fix → negative bars_held → same UInt16 spool poison (zombie-trade resurrections, incident 2026-06-12). restored_entry_bar already encodes hold continuity via stored_bars in THIS session's frame. 3. capital parse handles list/ledger-style payloads: when the restore blob is a list of update rows, take the latest dict row instead of falling through to {} and losing the capital anchor. 4. _connect_hz routes the `hazelcast` logger to stderr at INFO. The silent-HZ-death investigation found ZERO client log lines because nothing routed them; without this the reactor's health is invisible. 5. _dump_blackbox(reason): forensic thread dump before a watchdog restart — lifecycle.is_running, active_connections, every thread's stack, and a flag when any hazelcast/reactor-named thread is MISSING (= reactor died, the prime suspect for the silent 40min–8h client deaths). print()-only, CIFS-safe. _watchdog_restart calls it first. 6. _drain_runtime_commands / _process_runtime_commands gain `*, allow_retract=True`; the heartbeat path drains with allow_retract=False and re-queues any RETRACT commands. 
A RETRACT can force a terminal close that must run through the scan-thread close finalizer, so the heartbeat must not race it. 7. +import traceback (for the black-box stack dumps). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 12:03:20 +02:00
def _drain_runtime_commands(
self,
prices_dict: dict | None = None,
*,
allow_retract: bool = True,
) -> dict | None:
"""Serialize queue draining so the scan and heartbeat paths do not race."""
lock = getattr(self, "_runtime_command_lock", None)
if lock is None:
lock = threading.Lock()
self._runtime_command_lock = lock
with lock:
BLUE hardening: spool-poison guards, dead-session clock fix, HZ black-box, RETRACT race-safety Seven uncommitted production fixes to BLUE's main runner that the LIVE process has already been running since the 2026-06-15 17:23 restart (file mtime 17:17, pid started 17:23). Each fix answers a documented incident; committing now so they survive in history and a stray checkout can't silently revert running-config code on the next restart. 1. bars_held = max(0, int(...)) at BOTH journal sites (terminal + sub-day). CH column is UInt16 — a negative value poisons the spool with a head-of-line jam (incident 2026-06-12: bars_held=-106). 2. entry_bar = int(restored_entry_bar) at BOTH reconstruction sites; NEVER from chain_meta. trade_reconstruction payloads carry the DEAD session's bar counter, so the old override reinstated the stale clock frame the re-anchor exists to fix → negative bars_held → same UInt16 spool poison (zombie-trade resurrections, incident 2026-06-12). restored_entry_bar already encodes hold continuity via stored_bars in THIS session's frame. 3. capital parse handles list/ledger-style payloads: when the restore blob is a list of update rows, take the latest dict row instead of falling through to {} and losing the capital anchor. 4. _connect_hz routes the `hazelcast` logger to stderr at INFO. The silent-HZ-death investigation found ZERO client log lines because nothing routed them; without this the reactor's health is invisible. 5. _dump_blackbox(reason): forensic thread dump before a watchdog restart — lifecycle.is_running, active_connections, every thread's stack, and a flag when any hazelcast/reactor-named thread is MISSING (= reactor died, the prime suspect for the silent 40min–8h client deaths). print()-only, CIFS-safe. _watchdog_restart calls it first. 6. _drain_runtime_commands / _process_runtime_commands gain `*, allow_retract=True`; the heartbeat path drains with allow_retract=False and re-queues any RETRACT commands. 
A RETRACT can force a terminal close that must run through the scan-thread close finalizer, so the heartbeat must not race it. 7. +import traceback (for the black-box stack dumps). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 12:03:20 +02:00
return self._process_runtime_commands(
dict(prices_dict or self._last_prices_dict or {}),
allow_retract=allow_retract,
)
def _compute_vol_ok(self, scan):
assets = scan.get('assets', [])
prices = scan.get('asset_prices', [])
if not assets or not prices:
return True
prices_dict = dict(zip(assets, prices))
btc_price = prices_dict.get('BTCUSDT')
if btc_price is None:
return True
self.btc_prices.append(float(btc_price))
if len(self.btc_prices) < BTC_VOL_WINDOW:
return True
import numpy as np
arr = np.array(self.btc_prices)
dvol = float(np.std(np.diff(arr) / arr[:-1]))
return dvol > float(self.vol_p60_threshold)
@staticmethod
def _normalize_ng7(scan: dict) -> dict:
    """Convert an NG7-format scan into the flat dict layout this runner reads.

    Pure delegation to ``normalize_ng7_scan``; kept as a static method so
    callers can reach it through the instance.
    """
    canonical = normalize_ng7_scan(scan)
    return canonical
def on_scan(self, event):
    """Scan listener callback: stamp the event and hand it to the worker.

    Runs on the reactor thread, so it does no processing itself. The event
    is ignored when restore has failed or the event carries no value;
    otherwise the receive time is recorded and ``_process_scan`` is queued
    on ``self._scan_executor`` with the event and that timestamp.
    """
    if not self._restore_failed and event.value:
        received_at = time.time()
        # Recorded before submit so the stored timestamp and the one passed
        # to the worker are the same value.
        self._last_scan_event_ts = received_at
        self._scan_executor.submit(self._process_scan, event, received_at)
def _process_scan(self, event, listener_time):
try:
if self._restore_failed or not event.value:
return
scan = json.loads(event.value) if isinstance(event.value, str) else event.value
# Normalise NG7 format → NG5-compatible flat dict before any field access
if scan.get('version') == 'NG7':
scan = self._normalize_ng7(scan)
scan_number = int(scan.get('scan_number') or 0)
# Dedup: scan_number is authoritative (monotonically increasing).
# file_mtime / timestamp are unreliable across NG7 restart probes.
# Exception: the scanner resets numbering to 0 on restart — a large
# backwards jump must re-anchor the ratchet, or BLUE drops every
# scan until manually restarted (near-miss on 2026-06-09/10).
with self._dedup_lock:
if scan_number > 0 and scan_number <= self.last_scan_number:
if scan_number < self.last_scan_number - SCAN_NUMBER_RESET_GAP:
log(f"WARN scanner restart detected: scan_number {self.last_scan_number}"
f"{scan_number} — re-anchoring dedup ratchet")
else:
self._dupe_drops_total += 1
return
self.last_scan_number = scan_number
self._last_scan_accept_ts = time.time()
self.scans_processed += 1
self._rollover_day()
assets = scan.get('assets') or []
if assets and not self.ob_assets:
self._wire_obf(assets)
prices = scan.get('asset_prices') or []
if assets and prices and len(assets) != len(prices):
log(f"WARN scan #{scan_number}: assets/prices mismatch "
f"({len(assets)}{len(prices)}) — dropped")
return
prices_dict = dict(zip(assets, prices)) if assets and prices else {}
self._last_prices_dict = dict(prices_dict)
# Remove stablecoins — they should never be selected as a trade asset
for sym in _STABLECOIN_SYMBOLS:
prices_dict.pop(sym, None)
self._record_bounce_prices(prices_dict)
vol_ok = self._compute_vol_ok(scan)
vel_div = float(scan.get('vel_div') or 0.0)
if not math.isfinite(vel_div):
log(f"WARN scan #{scan_number}: non-finite vel_div={vel_div} — clamped to 0.0")
vel_div = 0.0
v50_vel = float(scan.get('w50_velocity') or 0.0)
v750_vel = float(scan.get('w750_velocity') or 0.0)
if not math.isfinite(v50_vel): v50_vel = 0.0
if not math.isfinite(v750_vel): v750_vel = 0.0
self.last_w750_vel = v750_vel
# Feed live OB data into OBF engine for this bar (AGENT_SPEC_OBF_LIVE_SWITCHOVER)
if self.ob_eng is not None and self.ob_assets:
self.ob_eng.step_live(self.ob_assets, self.bar_idx)
# Live posture sync — update engine posture + regime_dd_halt together
posture_now = self._read_posture()
with self.eng_lock:
prev_posture = getattr(self.eng, '_day_posture', 'APEX')
if posture_now != prev_posture:
if posture_now in ('TURTLE', 'HIBERNATE'):
self.eng.regime_dd_halt = True # always block new entries
if posture_now == 'HIBERNATE' and self.eng.position is not None:
open_tid = str(getattr(self.eng.position, "trade_id", "") or "")
if not open_tid:
self._mark_restore_failure("HIBERNATE posture with open position missing trade_id")
return
if open_tid not in self._pending_entries:
self._mark_restore_failure(
f"HIBERNATE posture with open position missing pending entry: {open_tid}"
)
return
if (posture_now == 'HIBERNATE'
and self.eng.position is not None
and not self._hibernate_protect_active):
# Position in flight: arm TP+SL instead of letting
# _manage_position() fire HIBERNATE_HALT next bar.
# _day_posture stays at prev value — no HALT fires.
self._hibernate_protect_position()
else:
self.eng._day_posture = posture_now
log(f"POSTURE_SYNC: {posture_now} — halt set")
else:
self.eng._day_posture = posture_now
self.eng.regime_dd_halt = False
if self._hibernate_protect_active:
log(f"POSTURE_SYNC: {posture_now} — posture recovered, clearing protect mode")
self._hibernate_protect_active = None
else:
log(f"POSTURE_SYNC: {posture_now} — halt lifted")
# EsoF value gate — exposure only, no alpha or selection changes.
self._sync_esof_size_gate()
self._sync_tp_threshold()
self._sync_sc_threshold_advisor(scan_number=scan_number, vel_div=vel_div)
self._sync_sc_gauge_advisor(scan_number=scan_number, vel_div=vel_div)
self._apply_runtime_direction()
if self._market_state_runtime is not None:
try:
self._market_state_runtime.update_scan_state(
scan_payload=scan,
prices_dict=prices_dict,
scan_number=scan_number,
vel_div=vel_div,
v50_vel=v50_vel,
v750_vel=v750_vel,
vol_ok=vol_ok,
posture=posture_now,
exf_snapshot=getattr(self, "_last_exf", {}) or {},
esof_payload=self._read_esof_payload(),
top_k_assets=5,
)
except Exception as e:
log(f" MarketStateRuntime scan update failed: {e}")
if self.eng.position is not None and prices_dict:
prices_dict = self._inject_obf_midprice(prices_dict)
step_start = time.time()
with self.eng_lock:
self._apply_catastrophic_floor_to_open_position()
result = self.eng.step_bar(
bar_idx=self.bar_idx, vel_div=vel_div, prices=prices_dict,
vol_regime_ok=vol_ok, v50_vel=v50_vel, v750_vel=v750_vel
)
self.bar_idx += 1
scan_to_fill_ms = (time.time() - listener_time) * 1000
step_bar_ms = (time.time() - step_start) * 1000
log(f"LATENCY scan #{scan_number}: scan→fill={scan_to_fill_ms:.1f}ms step_bar={step_bar_ms:.1f}ms vel_div={vel_div:.5f}")
ch_put("eigen_scans", {
"ts": _ch_ts_us(),
"scan_number": scan_number,
"scan_uuid": str(scan.get("scan_uuid") or ""),
"vel_div": vel_div,
"w50_velocity": v50_vel,
"w750_velocity": v750_vel,
"instability_50": float(scan.get("instability_50") or 0.0),
"scan_to_fill_ms": scan_to_fill_ms,
"step_bar_ms": step_bar_ms,
})
if result.get('entry'):
self.trades_executed += 1
e = result['entry']
log(f"ENTRY: {e} [{ALGO_VERSION}]")
# Cache entry fields for CH trade_events on exit
tid = self._resolve_trade_id(e.get('trade_id'), create_if_missing=True)
e['trade_id'] = tid
if tid:
efsm_decision = None
overlay_flip = False
if self._efsm is not None and int(e.get('direction', -1)) == 1 and int(self.trade_direction) == -1:
efsm_decision = self._efsm.tag_next_entry(
asset=str(e.get('asset', '') or ''),
entry_ts=datetime.now(timezone.utc),
metadata={"trade_id": tid},
)
overlay_flip = bool(efsm_decision and efsm_decision.action == "TAG" and efsm_decision.side == "LONG")
self._pending_entries[tid] = {
'trade_id': tid,
'asset': e.get('asset', ''),
'side': 'SHORT' if e.get('direction', -1) == -1 else 'LONG',
'entry_price': float(e.get('entry_price', 0) or 0),
'quantity': round(float(e.get('notional', 0) or 0) / float(e.get('entry_price', 1) or 1), 6),
'notional': float(e.get('notional', 0) or 0),
'notional_entry': float(e.get('notional', 0) or 0),
'leverage': float(e.get('leverage', 0) or 0),
'vel_div_entry': float(e.get('vel_div', 0) or 0),
'boost_at_entry': float(getattr(getattr(self, 'eng', None), 'acb_boost', 1.0) or 1.0),
'beta_at_entry': float(getattr(getattr(self, 'eng', None), 'acb_beta', 1.0) or 1.0),
'posture': posture_now,
'entry_ts': _ch_ts_us(),
'entry_date': (self.current_day or ''),
'entry_bar': self.bar_idx,
'overlay_flip': overlay_flip,
'overlay_reason': getattr(efsm_decision, "reason", "") if efsm_decision else "",
'overlay_slot': int(getattr(efsm_decision, "consumed_slot", 0) or 0) if efsm_decision else 0,
'retraction_legs': 0,
'realized_pnl_legs_total': 0.0,
}
_tp_ctx = self._tp_curve_context(notional=float(self._pending_entries[tid]["notional"] or 0.0))
self._pending_entries[tid].update(_tp_ctx)
self._apply_sc_entry_size_multiplier(tid, e, self._pending_entries[tid])
self._pending_entries[tid].update(self._chain_state_for_pending(
tid,
self._pending_entries[tid],
chain_mode="LIVE",
chain_head_leg_id=f"{tid}:open",
chain_prev_leg_id="",
chain_seq=0,
))
if overlay_flip:
log(
f"EFSM TAG: trade_id={tid} asset={e.get('asset','')} "
f"slot={self._pending_entries[tid]['overlay_slot']} "
f"reason={self._pending_entries[tid]['overlay_reason']}"
)
with self.eng_lock:
self._apply_catastrophic_floor_to_open_position()
# Persist position to CH so restarts can recover it
self._ps_write_open(tid, self._pending_entries[tid])
ch_put("trade_reconstruction", {
"ts": _ch_ts_us(),
"trade_id": tid,
"event_type": "OPEN",
"event_id": f"{tid}:open",
"payload_json": json.dumps(self._pending_entries[tid], default=str),
"market_state_bundle_json": str(self._pending_entries[tid].get("market_state_bundle_json", "") or ""),
"tp_base_pct": float(self._pending_entries[tid].get("tp_base_pct", 0.0) or 0.0),
"tp_effective_pct": float(self._pending_entries[tid].get("tp_effective_pct", 0.0) or 0.0),
"our_leverage": float(self._pending_entries[tid].get("our_leverage", 0.0) or 0.0),
})
self._announce_position_event(
kind="trade_entry",
severity="info",
title=f"[BLUE] ENTRY {e.get('asset', '')} {self._pending_entries[tid]['side']}",
message=(
f"entry={float(e.get('entry_price', 0) or 0):.6f} "
f"qty={self._pending_entries[tid]['quantity']:.6f} "
f"lev={self._pending_entries[tid]['leverage']:.2f}x"
),
metadata={
"trade_id": tid,
"asset": self._pending_entries[tid]["asset"],
"side": self._pending_entries[tid]["side"],
"entry_price": self._pending_entries[tid]["entry_price"],
"quantity": self._pending_entries[tid]["quantity"],
"leverage": self._pending_entries[tid]["leverage"],
"vel_div_entry": self._pending_entries[tid]["vel_div_entry"],
"boost_at_entry": self._pending_entries[tid]["boost_at_entry"],
"beta_at_entry": self._pending_entries[tid]["beta_at_entry"],
"posture": self._pending_entries[tid]["posture"],
"entry_ts": self._pending_entries[tid]["entry_ts"],
},
)
if self._v7_exit_engine is not None:
try:
side = 1 if e.get('direction', -1) == -1 else 0
ctx = self._v7_exit_engine.make_context(
entry_price=float(e.get('entry_price', 0) or 0),
entry_bar=max(0, self.bar_idx - 1),
side=side,
)
if self._last_exf:
ctx.set_exf(
funding=float(self._last_exf.get('funding', 0.0) or 0.0),
dvol=float(self._last_exf.get('dvol', 0.0) or 0.0),
fear_greed=float(self._last_exf.get('fear_greed', 0.0) or 0.0),
taker=float(self._last_exf.get('taker', 0.0) or 0.0),
)
self._v7_contexts[tid] = ctx
self._v7_decisions.pop(tid, None)
self._v7_decision_seq[tid] = 0
except Exception as e:
log(f" V7 live context init failed for {tid}: {e}")
# Shadow AE: notify of entry (vel_div at entry bar is in scope)
if self._ae is not None:
try:
self._ae.on_entry(
trade_id=tid,
asset=e.get('asset', ''),
direction=int(e.get('direction', -1)),
entry_price=float(e.get('entry_price', 0) or 0),
vel_div_entry=vel_div,
)
except Exception:
pass
if self._sc_advisor is not None:
try:
payload = self._read_esof_payload()
rec = self._sc_advisor.evaluate(
trade_id=tid,
asset=e.get('asset', ''),
sc=_safe_float(payload.get('advisory_score', payload.get('score', 0.0)) if payload else None),
vel_div=vel_div,
exf_snapshot=getattr(self, "_last_exf", {}) or {},
trade_history=getattr(self.eng, 'trade_history', []),
current_mult=float(self._last_esof_size_mult or 1.0),
esof_payload=payload,
scan_number=scan_number,
bar_idx=self.bar_idx,
strategy="blue",
log_shadow=True,
)
self._pending_entries[tid]['sc_threshold_advisor'] = rec
self._pending_entries[tid]['sc_exec_mult'] = float(self._last_esof_size_mult or 1.0)
try:
self._record_sc_haircut(
trade_id=tid,
pending=self._pending_entries[tid],
source="sc_threshold_entry",
)
except Exception as e:
log(f"SC haircut record failed for {tid}: {e}")
except Exception:
pass
if self._sc_gauge is not None:
try:
payload = self._read_esof_payload()
rec = self._sc_gauge.evaluate(
trade_id=tid,
asset=e.get('asset', ''),
sc=_safe_float(payload.get('advisory_score', payload.get('score', 0.0)) if payload else None),
vel_div=vel_div,
exf_snapshot=getattr(self, "_last_exf", {}) or {},
obf_snapshot=self._current_obf_snapshot(e.get('asset', ''), self.bar_idx),
trade_history=getattr(self.eng, 'trade_history', []),
current_mult=float(self._last_esof_size_mult or 1.0),
esof_payload=payload,
scan_number=scan_number,
bar_idx=self.bar_idx,
strategy="blue",
log_shadow=True,
)
self._pending_entries[tid]['sc_bucket_gauge'] = rec
self._pending_entries[tid]['sc_bucket_gauge_exec_mult'] = float(self._last_esof_size_mult or 1.0)
try:
self._record_sc_haircut(
trade_id=tid,
pending=self._pending_entries[tid],
source="sc_bucket_gauge",
)
except Exception as e:
log(f"SC haircut record failed for {tid}: {e}")
except Exception:
pass
if self._bounce_advisor is not None:
try:
entry_ts_val = float(self._pending_entries[tid].get('entry_ts', 0) or 0)
entry_ts_dt = datetime.fromtimestamp(entry_ts_val / 1_000_000, tz=timezone.utc) if entry_ts_val else None
bounce_rec = self._bounce_eval(
trade_id=tid,
asset=str(e.get('asset', '')),
side=self._pending_entries[tid]['side'],
source="entry",
scan_number=scan_number,
entry_ts=entry_ts_dt,
current_price=float(prices_dict.get(e.get('asset', ''), e.get('entry_price', 0)) or e.get('entry_price', 0) or 0),
entry_price=float(e.get('entry_price', 0) or 0),
quantity=float(self._pending_entries[tid].get('quantity', 0) or 0),
notional=float(e.get('notional', 0) or 0),
leverage=float(e.get('leverage', 0) or 0),
vel_div=vel_div,
current_mult=float(self._last_esof_size_mult or 1.0),
bars_held=0,
log_shadow=True,
)
if bounce_rec:
self._pending_entries[tid]['bounce_advisor_entry'] = bounce_rec
self._pending_entries[tid]['bounce_advisor_latest'] = bounce_rec
except Exception as e:
log(f" BounceAdvisor entry eval failed for {tid}: {e}")
# V7 remains the authoritative live exit brain, but the explicit
# retract bridge must stay active even when the engine callback is
# wired. Otherwise RETRACT decisions stay observational only.
if (self._v7_exit_engine is not None
and self.eng is not None
and getattr(self.eng, 'position', None) is not None):
pos = self.eng.position
tid_v7 = getattr(pos, 'trade_id', '')
pending_v7 = self._pending_entries.get(tid_v7, {})
ctx_v7 = self._v7_contexts.get(tid_v7)
if ctx_v7 is None and pending_v7:
try:
ctx_v7 = self._v7_exit_engine.make_context(
entry_price=float(pending_v7.get('entry_price', pos.entry_price) or pos.entry_price or 0.0),
entry_bar=int(pending_v7.get('entry_bar', max(0, self.bar_idx - 1)) or max(0, self.bar_idx - 1)),
side=1 if pending_v7.get('side', 'SHORT') == 'SHORT' else 0,
)
if self._last_exf:
ctx_v7.set_exf(
funding=float(self._last_exf.get('funding', 0.0) or 0.0),
dvol=float(self._last_exf.get('dvol', 0.0) or 0.0),
fear_greed=float(self._last_exf.get('fear_greed', 0.0) or 0.0),
taker=float(self._last_exf.get('taker', 0.0) or 0.0),
)
self._v7_contexts[tid_v7] = ctx_v7
self._v7_decision_seq.setdefault(tid_v7, 0)
except Exception as e:
log(f" V7 live context restore failed for {tid_v7}: {e}")
ctx_v7 = None
if ctx_v7 is not None and pending_v7:
try:
if self.ob_eng is not None:
ob_sig = self.ob_eng.get_signal(pos.asset, float(max(0, self.bar_idx - 1)))
ob_imb = float(getattr(ob_sig, 'imbalance_ma5', 0.0) or 0.0)
else:
ob_imb = 0.0
cur_px = float(prices_dict.get(pos.asset, pos.current_price) or pos.current_price or 0.0)
if cur_px > 0.0:
v7dec = self._v7_exit_engine.evaluate(
ctx_v7,
cur_px,
max(0, self.bar_idx - 1),
ob_imb,
asset=pos.asset,
)
prev_v7dec = self._v7_decisions.get(tid_v7)
prev_v7_action = str(
prev_v7dec.get("action", "")
if isinstance(prev_v7dec, dict)
else getattr(prev_v7dec, "action", "")
).upper()
self._v7_decisions[tid_v7] = v7dec
self._record_v7_decision(
trade_id=tid_v7,
asset=pos.asset,
side=pending_v7.get('side', 'SHORT'),
decision=v7dec,
current_price=cur_px,
ob_imbalance=ob_imb,
vel_div_now=vel_div,
v50_vel=v50_vel,
v750_vel=v750_vel,
bar_idx=max(0, self.bar_idx - 1),
)
v7_action = str(v7dec.get("action", "") if isinstance(v7dec, dict) else getattr(v7dec, "action", "")).upper()
if v7_action == "RETRACT" and prev_v7_action != "RETRACT":
try:
cmd = {
"command_id": f"v7-retract-{uuid.uuid4().hex[:16]}",
"trade_id": tid_v7,
"action": "RETRACT",
"fraction": 0.50,
"reason": "V7_RETRACT",
"source": "v7",
"ts": float(time.time()),
"asset": pos.asset,
"chain_root_trade_id": str(pending_v7.get("chain_root_trade_id", tid_v7) or tid_v7),
"chain_head_leg_id": str(pending_v7.get("chain_head_leg_id", f"{tid_v7}:open") or f"{tid_v7}:open"),
"chain_prev_leg_id": str(pending_v7.get("chain_prev_leg_id", "") or ""),
"chain_seq": int(pending_v7.get("chain_seq", pending_v7.get("retraction_legs", 0)) or 0),
"chain_token": str(pending_v7.get("chain_token", "") or ""),
}
raw_q = self.control_map.blocking().get("blue_runtime_commands") if self.control_map else None
q = json.loads(raw_q) if isinstance(raw_q, str) and raw_q else []
if not isinstance(q, list):
q = []
q.append(cmd)
q = q[-200:]
if self.control_map is not None:
self.control_map.blocking().put("blue_runtime_commands", json.dumps(q))
except Exception as e:
log(f" V7 retract enqueue failed for {tid_v7}: {e}")
if self._bounce_advisor is not None:
try:
entry_ts_val = float(pending_v7.get('entry_ts', 0) or 0)
entry_ts_dt = datetime.fromtimestamp(entry_ts_val / 1_000_000, tz=timezone.utc) if entry_ts_val else None
bounce_rec = self._bounce_eval(
trade_id=tid_v7,
asset=pos.asset,
side=pending_v7.get('side', 'SHORT'),
source="open_scan",
scan_number=scan_number,
entry_ts=entry_ts_dt,
current_price=cur_px,
entry_price=float(pending_v7.get('entry_price', pos.entry_price) or pos.entry_price or 0.0),
quantity=float(pending_v7.get('quantity', getattr(pos, 'quantity', 0.0)) or getattr(pos, 'quantity', 0.0) or 0.0),
notional=float(pending_v7.get('notional', getattr(pos, 'notional', 0.0)) or getattr(pos, 'notional', 0.0) or 0.0),
leverage=float(pending_v7.get('leverage', getattr(pos, 'leverage', 0.0)) or getattr(pos, 'leverage', 0.0) or 0.0),
vel_div=vel_div,
current_mult=float(self._last_esof_size_mult or 1.0),
bars_held=max(0, int(self.bar_idx - int(pending_v7.get('entry_bar', max(0, self.bar_idx - 1)) or max(0, self.bar_idx - 1)))),
log_shadow=True,
)
if bounce_rec:
pending_v7['bounce_advisor_latest'] = bounce_rec
self._pending_entries[tid_v7] = pending_v7
except Exception as e:
log(f" BounceAdvisor open-scan eval failed for {tid_v7}: {e}")
except Exception as e:
log(f" V7 live evaluate failed for {tid_v7}: {e}")
_forced_exit = self._drain_runtime_commands(prices_dict)
if _forced_exit is not None and not result.get('exit'):
result['exit'] = _forced_exit
if result.get('exit'):
x = result['exit']
tid = x.get('trade_id')
# Hibernate-protected exits: re-label reason, finalize posture
if tid and self._hibernate_protect_active == tid:
_orig = x.get('reason', '')
_map = {'FIXED_TP': 'HIBERNATE_TP', 'STOP_LOSS': 'HIBERNATE_SL',
'MAX_HOLD': 'HIBERNATE_MAXHOLD'}
x['reason'] = _map.get(_orig, f'HIBERNATE_{_orig}')
self._hibernate_protect_active = None
# Position closed — now safe to commit posture to HIBERNATE
_cur_posture = self._read_posture()
if _cur_posture == 'HIBERNATE':
self.eng._day_posture = 'HIBERNATE'
log(f"HIBERNATE_PROTECT: closed via {x['reason']} — posture finalized HIBERNATE")
else:
log(f"HIBERNATE_PROTECT: closed via {x['reason']} — posture recovered to {_cur_posture}")
x['reason'] = _normalize_v7_exit_reason(x.get('reason', ''))
log(f"EXIT: {x} [{ALGO_VERSION}]")
_exit_reason_raw = str(x.get('reason', ''))
if _exit_reason_raw in ('FIXED_TP', 'HIBERNATE_TP', 'TP_FLOOR', 'HIBERNATE_TP_FLOOR'):
_tp_used = self.eng.exit_manager.fixed_tp_pct
_pos = self.eng.position
_bars = int(x.get('bars_held', 0) or 0)
# Effective (OB-modulated) gate: _execute_exit() rebuilds
# the exit dict and drops evaluate()'s diag keys, so read
# the manager's last_eval (same source the v7 journal uses).
_le = dict(getattr(self.eng.exit_manager, 'last_eval', {}) or {})
_dyn = float(x.get('dynamic_tp_pct', _le.get('dynamic_tp_pct', 0.0)) or 0.0)
_mod = float(x.get('tp_mod_factor', _le.get('tp_mod_factor', 0.0)) or 0.0)
_casc = int(x.get('cascade_count', _le.get('cascade_count', 0)) or 0)
log(f" TP_EXIT: tp_pct={_tp_used*100:.2f}% dyn_tp={_dyn*100:.2f}% "
f"mod={_mod:.2f}x cascade={_casc} "
f"bars_held={_bars} pnl_pct={float(x.get('pnl_pct',0) or 0):+.4f}")
tid = self._resolve_trade_id(x.get('trade_id'), create_if_missing=True)
x['trade_id'] = tid
pending = self._pending_entries.pop(tid, {}) if tid else {}
if tid:
self._v7_contexts.pop(tid, None)
self._v7_decisions.pop(tid, None)
self._v7_decision_seq.pop(tid, None)
if tid and not pending:
fallback_pending = self._fallback_pending_for_close(tid, x)
self._ps_write_closed(tid, fallback_pending, x)
log(
" EXIT pending metadata missing; wrote fallback CLOSED tombstone "
f"for trade={tid} asset={fallback_pending.get('asset', '')}"
)
if pending:
# exact bar price the engine exited against — prices_dict is still in scope
exit_price = float(prices_dict.get(pending['asset'], 0) or 0)
if self._sc_advisor is not None:
try:
_rec = pending.get('sc_threshold_advisor')
if _rec:
self._sc_advisor.observe_outcome(
_rec,
executed_mult=float(pending.get('sc_exec_mult', self._last_esof_size_mult) or 1.0),
pnl_pct=float(x.get('pnl_pct', 0) or 0),
exit_reason=str(x.get('reason', 'UNKNOWN')),
)
except Exception:
pass
if self._sc_gauge is not None:
try:
_rec = pending.get('sc_bucket_gauge')
if _rec:
self._sc_gauge.observe_outcome(
_rec,
executed_mult=float(pending.get('sc_bucket_gauge_exec_mult', self._last_esof_size_mult) or 1.0),
pnl_pct=float(x.get('pnl_pct', 0) or 0),
exit_reason=str(x.get('reason', 'UNKNOWN')),
)
except Exception:
pass
if self._bounce_advisor is not None:
try:
_bounce_rec = pending.get('bounce_advisor_entry')
if _bounce_rec:
self._bounce_advisor.observe_outcome(
_bounce_rec,
pnl_pct=float(x.get('pnl_pct', 0) or 0),
exit_reason=str(x.get('reason', 'UNKNOWN')),
)
except Exception as e:
log(f" BounceAdvisor outcome update failed for {tid}: {e}")
if self._market_state_runtime is not None:
try:
self._market_state_runtime.online_update_from_trade(
asset=str(pending.get("asset", "")),
entry_price=float(pending.get("entry_price", 0) or 0),
exit_price=float(exit_price),
direction=-1 if str(pending.get("side", "SHORT")).upper() == "SHORT" else 1,
pnl_pct=float(x.get("pnl_pct", 0) or 0),
bars_held=int(x.get("bars_held", 0) or 0),
exit_reason=str(x.get("reason", "UNKNOWN")),
trade_id=str(tid or ""),
leverage=float(pending.get("leverage", 1.0) or 1.0),
)
except Exception as e:
log(f" MarketStateRuntime outcome update failed for {tid}: {e}")
if self._efsm is not None:
try:
_efsm_out = self._efsm.observe_closed_trade(
trade_id=str(tid or ""),
asset=str(pending.get("asset", "") or ""),
side=str(pending.get("side", "SHORT") or "SHORT"),
pnl=float(x.get("net_pnl", 0) or 0),
pnl_pct=float(x.get("pnl_pct", 0) or 0),
leverage=float(pending.get("leverage", 0) or 0),
closed_ts=datetime.now(timezone.utc),
was_overlay_flip=bool(pending.get("overlay_flip", False)),
metadata={"exit_reason": str(x.get("reason", "UNKNOWN"))},
)
if _efsm_out.action in {"ARMED", "TAG", "RESET"}:
log(f"EFSM { _efsm_out.action }: { _efsm_out.to_dict() }")
except Exception as e:
log(f" EFSM observe_closed_trade failed for {tid}: {e}")
realized_pnl, realized_pnl_source = self._resolved_realized_trade_pnl(
pending,
x,
exit_price=exit_price,
)
if realized_pnl_source != "net_pnl":
log(
" realized pnl resolved from "
f"{realized_pnl_source}: raw_net={float(x.get('net_pnl', 0) or 0):+.6f} "
f"resolved={realized_pnl:+.6f}"
)
capital_apply_pnl, capital_apply_source = self._resolved_capital_apply_pnl(x, realized_pnl)
if capital_apply_source != "direct":
log(
" close capital delta suppressed: "
f"source={capital_apply_source} trade={tid} "
f"economic_pnl={realized_pnl:+.6f}"
)
capital_before, capital_after = self._apply_trade_capital_update(
capital_apply_pnl,
reason=str(x.get("reason", "UNKNOWN")),
source="trade_close",
trade_id=str(tid or ""),
asset=str(pending.get("asset", "")),
mirror_control_plane=True,
)
execution_quality = self._build_trade_execution_quality_summary(
trade_id=str(tid or ""),
pending=pending,
exit_payload=x,
capital_before=capital_before,
capital_after=capital_after,
realized_pnl=realized_pnl,
exit_price=exit_price,
source="trade_close",
)
self._persist_trade_execution_quality(execution_quality)
pending.update(self._tp_curve_context(notional=float(pending.get("notional", 0) or 0)))
ch_put("trade_events", {
"ts": _ch_ts_us(),
"date": pending['entry_date'],
"strategy": "blue",
"trade_id": tid,
"asset": pending['asset'],
"side": pending['side'],
"entry_price": pending['entry_price'],
"exit_price": exit_price,
"quantity": pending['quantity'],
"capital_before": capital_before,
"capital_after": capital_after,
"pnl": realized_pnl,
"pnl_pct": float(x.get('pnl_pct', 0) or 0),
"exit_reason": str(x.get('reason', 'UNKNOWN')),
"vel_div_entry": pending['vel_div_entry'],
"boost_at_entry": pending['boost_at_entry'],
"beta_at_entry": pending['beta_at_entry'],
"posture": pending['posture'],
"leverage": pending['leverage'],
BLUE hardening: spool-poison guards, dead-session clock fix, HZ black-box, RETRACT race-safety Seven uncommitted production fixes to BLUE's main runner that the LIVE process has already been running since the 2026-06-15 17:23 restart (file mtime 17:17, pid started 17:23). Each fix answers a documented incident; committing now so they survive in history and a stray checkout can't silently revert running-config code on the next restart. 1. bars_held = max(0, int(...)) at BOTH journal sites (terminal + sub-day). CH column is UInt16 — a negative value poisons the spool with a head-of-line jam (incident 2026-06-12: bars_held=-106). 2. entry_bar = int(restored_entry_bar) at BOTH reconstruction sites; NEVER from chain_meta. trade_reconstruction payloads carry the DEAD session's bar counter, so the old override reinstated the stale clock frame the re-anchor exists to fix → negative bars_held → same UInt16 spool poison (zombie-trade resurrections, incident 2026-06-12). restored_entry_bar already encodes hold continuity via stored_bars in THIS session's frame. 3. capital parse handles list/ledger-style payloads: when the restore blob is a list of update rows, take the latest dict row instead of falling through to {} and losing the capital anchor. 4. _connect_hz routes the `hazelcast` logger to stderr at INFO. The silent-HZ-death investigation found ZERO client log lines because nothing routed them; without this the reactor's health is invisible. 5. _dump_blackbox(reason): forensic thread dump before a watchdog restart — lifecycle.is_running, active_connections, every thread's stack, and a flag when any hazelcast/reactor-named thread is MISSING (= reactor died, the prime suspect for the silent 40min–8h client deaths). print()-only, CIFS-safe. _watchdog_restart calls it first. 6. _drain_runtime_commands / _process_runtime_commands gain `*, allow_retract=True`; the heartbeat path drains with allow_retract=False and re-queues any RETRACT commands. 
A RETRACT can force a terminal close that must run through the scan-thread close finalizer, so the heartbeat must not race it. 7. +import traceback (for the black-box stack dumps). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 12:03:20 +02:00
# CH column is UInt16 — a negative value poisons the spool
# (head-of-line jam, incident 2026-06-12: bars_held=-106)
"bars_held": max(0, int(x.get('bars_held', 0) or 0)),
"regime_signal": 0,
"tp_threshold": float(self.eng.exit_manager.fixed_tp_pct),
"execution_quality_json": json.dumps(execution_quality, default=str),
"market_state_bundle_json": str(pending.get("market_state_bundle_json", "") or ""),
"tp_base_pct": float(pending.get("tp_base_pct", 0.0) or 0.0),
"tp_effective_pct": float(pending.get("tp_effective_pct", 0.0) or 0.0),
"our_leverage": float(pending.get("our_leverage", 0.0) or 0.0),
})
ch_put("trade_reconstruction", {
"ts": _ch_ts_us(),
"trade_id": str(tid or ""),
"event_type": "CLOSE",
"event_id": f"{tid}:close",
"payload_json": json.dumps({
"exit": x,
"pending": pending,
"exit_price": exit_price,
"retraction_legs": int(pending.get("retraction_legs", 0) or 0),
"retraction_realized_total": float(pending.get("realized_pnl_legs_total", 0.0) or 0.0),
"chain": {
"trade_id": tid,
"chain_root_trade_id": pending.get("chain_root_trade_id", tid),
"chain_head_leg_id": pending.get("chain_head_leg_id", f"{tid}:open"),
"chain_prev_leg_id": pending.get("chain_prev_leg_id", ""),
"chain_seq": int(pending.get("retraction_legs", 0) or 0),
"chain_token": pending.get("chain_token", ""),
"chain_mode": pending.get("chain_mode", "LIVE"),
},
"execution_quality": execution_quality,
}, default=str),
"market_state_bundle_json": str(pending.get("market_state_bundle_json", "") or ""),
"tp_base_pct": float(pending.get("tp_base_pct", 0.0) or 0.0),
"tp_effective_pct": float(pending.get("tp_effective_pct", 0.0) or 0.0),
"our_leverage": float(pending.get("our_leverage", 0.0) or 0.0),
})
# Mark position closed in CH (supersedes OPEN row via ReplacingMergeTree)
self._ps_write_closed(tid, pending, x)
self._announce_position_event(
kind="trade_exit",
severity="info" if float(x.get("pnl_pct", 0) or 0) >= 0 else "warning",
title=f"[BLUE] EXIT {pending.get('asset', '')} {pending.get('side', '')}",
message=(
f"reason={x.get('reason', 'UNKNOWN')} "
f"pnl={float(x.get('net_pnl', 0) or 0):+.2f} "
f"pnl_pct={float(x.get('pnl_pct', 0) or 0):+.3%}"
),
metadata={
"trade_id": tid,
"asset": pending.get("asset", ""),
"side": pending.get("side", ""),
"entry_price": pending.get("entry_price", 0),
"exit_price": exit_price,
"quantity": pending.get("quantity", 0),
"pnl": realized_pnl,
"pnl_pct": float(x.get("pnl_pct", 0) or 0),
"exit_reason": str(x.get("reason", "UNKNOWN")),
"bars_held": int(x.get("bars_held", 0) or 0),
"posture": pending.get("posture", ""),
"overlay_flip": bool(pending.get("overlay_flip", False)),
"overlay_reason": str(pending.get("overlay_reason", "")),
"overlay_slot": int(pending.get("overlay_slot", 0) or 0),
},
)
# Shadow AE: record outcome for online update
if self._ae is not None and tid:
try:
self._ae.on_exit(
trade_id=tid,
actual_exit_reason=str(x.get('reason', 'UNKNOWN')),
pnl_pct=float(x.get('pnl_pct', 0) or 0),
)
except Exception:
pass
# Shadow AE: per-bar evaluate for all open trades — daemon thread, zero hot-path impact
if self._ae is not None and self._pending_entries:
_ae_ref = self._ae
_pending_snap = dict(self._pending_entries) # shallow copy under GIL
_prices_snap = dict(prices_dict)
_vel_now = vel_div
_bar = self.bar_idx
def _ae_eval():
for _tid, _p in _pending_snap.items():
try:
_cur = _prices_snap.get(_p['asset'], 0) or 0
if not _cur:
continue
_entry_px = float(_p.get('entry_price', 0) or 0)
_bars_held = max(0, int(_bar - int(_p.get('entry_bar', _bar))))
_shadow_pnl_pct = ((_entry_px - _cur) / _entry_px) if _entry_px > 0 else 0.0
_recent_prices = self._bounce_price_path(_p['asset'])
_shadow = _ae_ref.evaluate(
trade_id=_tid,
asset=_p['asset'],
direction=-1,
entry_price=_entry_px,
current_price=_cur,
bars_held=_bars_held,
vel_div_now=_vel_now,
)
_ae_ref.log_shadow(_shadow, pnl_pct=_shadow_pnl_pct)
if self._advanced_sl is not None:
try:
_ms_state = dict(self._market_state_runtime.latest_state) if self._market_state_runtime and getattr(self._market_state_runtime, "latest_state", None) else {}
_ms_bundle = dict(self._market_state_runtime.latest_bundle_dict) if self._market_state_runtime and getattr(self._market_state_runtime, "latest_bundle_dict", None) else {}
_v7 = dict(self._v7_decisions.get(_tid, {}) or {})
_maras_ctx = self._latest_maras_context()
_adv_meta = {}
if self._efsm is not None and hasattr(self._efsm, "exit_policy_meta"):
try:
_adv_meta = self._efsm.exit_policy_meta(_maras_ctx)
except Exception:
_adv_meta = {}
_adv = self._advanced_sl.evaluate(
trade_id=_tid,
asset=_p['asset'],
side=str(_p.get("side", "SHORT") or "SHORT"),
entry_price=_entry_px,
current_price=_cur,
bars_held=_bars_held,
recent_prices=_recent_prices,
ae_shadow=_shadow,
v7_decision=_v7,
market_state=_ms_state,
market_bundle=_ms_bundle,
exf_snapshot=dict(self._last_exf or {}),
meta_performance=_adv_meta,
)
self._advanced_sl.log_shadow(_adv, pnl_pct=_shadow_pnl_pct)
_overlay_exit = False
_overlay_exit_detail = ""
try:
_overlay_exit, _overlay_exit_detail = self._overlay_advsl_should_exit(
_tid,
_p,
_v7,
_bars_held,
_cur,
)
except Exception:
_overlay_exit = False
_overlay_exit_detail = ""
if (
(self._advanced_sl_live_exit_enabled and bool(getattr(_adv, "would_exit", False)))
or _overlay_exit
):
try:
raw_q = self.control_map.blocking().get("blue_runtime_commands") if self.control_map else None
q = json.loads(raw_q) if isinstance(raw_q, str) and raw_q else []
if not isinstance(q, list):
q = []
_reason = (
f"OVERLAY_ADVSL_{_overlay_exit_detail}"
if _overlay_exit
else f"ADVSL_{_adv.reason}"
)
q.append({
"command_id": f"advsl-exit-{uuid.uuid4().hex[:16]}",
"trade_id": _tid,
"action": "RETRACT",
"fraction": 1.0,
"reason": _reason,
"source": "advanced_sl",
"ts": float(time.time()),
"asset": _p["asset"],
"chain_root_trade_id": str(_p.get("chain_root_trade_id", _tid) or _tid),
"chain_head_leg_id": str(_p.get("chain_head_leg_id", f"{_tid}:open") or f"{_tid}:open"),
"chain_prev_leg_id": str(_p.get("chain_prev_leg_id", "") or ""),
"chain_seq": int(_p.get("chain_seq", _p.get("retraction_legs", 0)) or 0),
"chain_token": str(_p.get("chain_token", "") or ""),
})
q = q[-200:]
if self.control_map is not None:
self.control_map.blocking().put("blue_runtime_commands", json.dumps(q))
log(
" AdvancedSL live exit enqueue: "
f"{_tid} {_p['asset']} reason={_reason} "
f"score={float(getattr(_adv, 'score', 0.0) or 0.0):+.3f} "
f"pnl_pct={_shadow_pnl_pct:+.3f}"
)
except Exception as e:
log(f" AdvancedSL live exit enqueue failed for {_tid}: {e}")
except Exception:
pass
except Exception:
pass
threading.Thread(target=_ae_eval, daemon=True).start()
self._push_state(scan_number, vel_div, vol_ok, self._read_posture())
except Exception as e:
log(f"ERROR in _process_scan: {e}")
def on_exf_update(self, event):
    """Handle an ``exf_latest`` entry event: refresh the ACB boost and settle a sub-day exit.

    Steps, in order:
      1. Ignore empty events; decode the payload (JSON string or already-decoded object).
      2. Bail out unless a trading day is active and an ACB instance exists.
      3. Cache four EXF fields in ``self._last_exf`` and ask the ACB for boost/beta.
      4. Under ``self.eng_lock``, push boost/beta into the engine. If the engine
         returns an exit payload, settle it: feed the advisors, the market-state
         runtime and the EFSM, apply the capital delta, journal a ``trade_events``
         row, announce the exit, and write the CLOSED position row.
      5. Emit an ACB status log line at most once every 300 seconds.

    Args:
        event: listener event; only ``event.value`` is read.

    Returns:
        None. Failures are logged, never raised to the listener caller.
    """
    if not event.value:
        return
    try:
        snapshot = json.loads(event.value) if isinstance(event.value, str) else event.value
    except (TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError. A malformed payload previously
        # escaped the listener callback because the decode sat outside any try.
        log(f"on_exf_update Error: undecodable exf payload: {e}")
        return
    if not self.current_day or not self.acb:
        return
    try:
        self._last_exf = {
            'funding': float(snapshot.get('funding_btc', 0.0)),
            'dvol': float(snapshot.get('dvol_btc', 50.0)),
            'fear_greed': float(snapshot.get('fng', 50.0)),
            'taker': float(snapshot.get('taker', 0.5)),
        }
        w750_vel = getattr(self, 'last_w750_vel', None)
        acb_info = self.acb.get_dynamic_boost_from_hz(
            date_str=self.current_day,
            exf_snapshot=snapshot,
            # NOTE(review): a velocity of exactly 0.0 is falsy and is forwarded
            # as None here — confirm that is intended.
            w750_velocity=float(w750_vel) if w750_vel else None,
            direction=self.trade_direction,
        )
        # NOTE(review): nesting below was reconstructed from a flattened listing.
        # The settlement code must live inside the hasattr branch because
        # `subday_exit` is only bound there — confirm the lock extent against
        # the canonical file.
        with self.eng_lock:
            if hasattr(self.eng, 'update_acb_boost'):
                subday_exit = self.eng.update_acb_boost(
                    boost=acb_info['boost'],
                    beta=acb_info['beta']
                )
                if subday_exit is not None:
                    log(f"SUBDAY_EXIT: {subday_exit} [{ALGO_VERSION}]")
                    tid = self._resolve_trade_id(subday_exit.get('trade_id'), create_if_missing=True)
                    subday_exit['trade_id'] = tid
                    # Entry-time metadata for this trade; stays {} when none was recorded.
                    pending = {}
                    if tid:
                        pending = self._pending_entries.pop(tid, {})
                    if pending and self._sc_advisor is not None:
                        try:
                            _rec = pending.get('sc_threshold_advisor')
                            if _rec:
                                self._sc_advisor.observe_outcome(
                                    _rec,
                                    executed_mult=float(pending.get('sc_exec_mult', self._last_esof_size_mult) or 1.0),
                                    pnl_pct=float(subday_exit.get('pnl_pct', 0) or 0),
                                    exit_reason=str(subday_exit.get('reason', 'SUBDAY_ACB_NORMALIZATION')),
                                )
                        except Exception as e:
                            # Best-effort, but no longer silent.
                            log(f" SC advisor outcome update failed for {tid}: {e}")
                    if pending and self._sc_gauge is not None:
                        try:
                            _rec_g = pending.get('sc_bucket_gauge')
                            if _rec_g:
                                self._sc_gauge.observe_outcome(
                                    _rec_g,
                                    executed_mult=float(pending.get('sc_bucket_gauge_exec_mult', self._last_esof_size_mult) or 1.0),
                                    pnl_pct=float(subday_exit.get('pnl_pct', 0) or 0),
                                    exit_reason=str(subday_exit.get('reason', 'SUBDAY_ACB_NORMALIZATION')),
                                )
                        except Exception as e:
                            # Best-effort, but no longer silent.
                            log(f" SC gauge outcome update failed for {tid}: {e}")
                    if pending and self._bounce_advisor is not None:
                        try:
                            _bounce_rec = pending.get('bounce_advisor_entry')
                            if _bounce_rec:
                                self._bounce_advisor.observe_outcome(
                                    _bounce_rec,
                                    pnl_pct=float(subday_exit.get('pnl_pct', 0) or 0),
                                    exit_reason=str(subday_exit.get('reason', 'SUBDAY_ACB_NORMALIZATION')),
                                )
                        except Exception as e:
                            log(f" BounceAdvisor outcome update failed for {tid}: {e}")
                    if self._market_state_runtime is not None:
                        try:
                            self._market_state_runtime.online_update_from_trade(
                                asset=str(pending.get("asset", "")),
                                entry_price=float(pending.get("entry_price", 0) or 0),
                                exit_price=float(subday_exit.get("exit_price", 0) or 0),
                                direction=-1 if str(pending.get("side", "SHORT")).upper() == "SHORT" else 1,
                                pnl_pct=float(subday_exit.get("pnl_pct", 0) or 0),
                                bars_held=int(subday_exit.get("bars_held", 0) or 0),
                                exit_reason=str(subday_exit.get("reason", "SUBDAY_ACB_NORMALIZATION")),
                                trade_id=str(tid or ""),
                                leverage=float(pending.get("leverage", 1.0) or 1.0),
                            )
                        except Exception as e:
                            log(f" MarketStateRuntime outcome update failed for {tid}: {e}")
                    if self._efsm is not None:
                        try:
                            _efsm_sub = self._efsm.observe_closed_trade(
                                trade_id=str(tid or ""),
                                asset=str(pending.get("asset", "") or ""),
                                side=str(pending.get("side", "SHORT") or "SHORT"),
                                pnl=float(subday_exit.get("net_pnl", 0) or 0),
                                pnl_pct=float(subday_exit.get("pnl_pct", 0) or 0),
                                leverage=float(pending.get("leverage", 0) or 0),
                                closed_ts=datetime.now(timezone.utc),
                                was_overlay_flip=bool(pending.get("overlay_flip", False)),
                                metadata={"exit_reason": str(subday_exit.get("reason", "SUBDAY_ACB_NORMALIZATION"))},
                            )
                            if _efsm_sub.action in {"ARMED", "TAG", "RESET"}:
                                log(f"EFSM { _efsm_sub.action }: { _efsm_sub.to_dict() }")
                        except Exception as e:
                            log(f" EFSM observe_closed_trade failed for {tid}: {e}")
                    realized_pnl, realized_pnl_source = self._resolved_realized_trade_pnl(
                        pending,
                        subday_exit,
                        exit_price=float(subday_exit.get("exit_price", 0) or 0),
                    )
                    if realized_pnl_source != "net_pnl":
                        log(
                            " realized pnl resolved from "
                            f"{realized_pnl_source}: raw_net={float(subday_exit.get('net_pnl', 0) or 0):+.6f} "
                            f"resolved={realized_pnl:+.6f}"
                        )
                    capital_apply_pnl, capital_apply_source = self._resolved_capital_apply_pnl(subday_exit, realized_pnl)
                    if capital_apply_source != "direct":
                        log(
                            " close capital delta suppressed: "
                            f"source={capital_apply_source} trade={tid} "
                            f"economic_pnl={realized_pnl:+.6f}"
                        )
                    capital_before, capital_after = self._apply_trade_capital_update(
                        capital_apply_pnl,
                        reason=str(subday_exit.get("reason", "SUBDAY_ACB_NORMALIZATION")),
                        source="trade_close",
                        trade_id=str(tid or ""),
                        asset=str(pending.get("asset", subday_exit.get("asset", ""))),
                        mirror_control_plane=True,
                    )
                    execution_quality = self._build_trade_execution_quality_summary(
                        trade_id=str(tid or ""),
                        pending=pending,
                        exit_payload=subday_exit,
                        capital_before=capital_before,
                        capital_after=capital_after,
                        realized_pnl=realized_pnl,
                        exit_price=float(subday_exit.get("exit_price", 0) or 0),
                        source="trade_close",
                    )
                    self._persist_trade_execution_quality(execution_quality)
                    pending.update(self._tp_curve_context(notional=float(pending.get("notional", 0) or 0)))
                    ch_put("trade_events", {
                        "ts": _ch_ts_us(),
                        "date": self.current_day or '',
                        "strategy": "blue",
                        "trade_id": tid,
                        "asset": pending.get('asset', subday_exit.get('asset', '')),
                        "side": pending.get('side', 'SHORT'),
                        "entry_price": pending.get('entry_price', 0),
                        "exit_price": float(subday_exit.get('exit_price', 0) or 0),
                        "quantity": round(float(pending.get('notional', 0) or 0) / max(float(pending.get('entry_price', 1) or 1), 1e-12), 6),
                        "capital_before": capital_before,
                        "capital_after": capital_after,
                        "pnl": realized_pnl,
                        "pnl_pct": float(subday_exit.get('pnl_pct', 0) or 0),
                        "exit_reason": str(subday_exit.get('reason', 'SUBDAY_ACB_NORMALIZATION')),
                        "vel_div_entry": float(pending.get('vel_div_entry', 0) or 0),
                        "boost_at_entry": float(pending.get('boost_at_entry', 0) or 0),
                        "beta_at_entry": float(pending.get('beta_at_entry', 0) or 0),
                        "posture": pending.get('posture', ''),
                        "leverage": float(pending.get('leverage', 0) or 0),
                        # CH column is UInt16 — negative poisons the spool
                        "bars_held": max(0, int(subday_exit.get('bars_held', 0) or 0)),
                        "regime_signal": 0,
                        "execution_quality_json": json.dumps(execution_quality, default=str),
                        "market_state_bundle_json": str(pending.get("market_state_bundle_json", "") or ""),
                        "tp_base_pct": float(pending.get("tp_base_pct", 0.0) or 0.0),
                        "tp_effective_pct": float(pending.get("tp_effective_pct", 0.0) or 0.0),
                        "our_leverage": float(pending.get("our_leverage", 0.0) or 0.0),
                    })
                    self._announce_position_event(
                        kind="trade_exit",
                        severity="info" if float(subday_exit.get("pnl_pct", 0) or 0) >= 0 else "warning",
                        title=f"[BLUE] EXIT {pending.get('asset', '')} {pending.get('side', '')}",
                        message=(
                            f"reason={subday_exit.get('reason', 'SUBDAY_ACB_NORMALIZATION')} "
                            f"pnl={float(subday_exit.get('net_pnl', 0) or 0):+.2f} "
                            f"pnl_pct={float(subday_exit.get('pnl_pct', 0) or 0):+.3%}"
                        ),
                        metadata={
                            "trade_id": tid,
                            "asset": pending.get("asset", subday_exit.get("asset", "")),
                            "side": pending.get("side", "SHORT"),
                            "entry_price": pending.get("entry_price", 0),
                            "exit_price": float(subday_exit.get("exit_price", 0) or 0),
                            "quantity": round(float(pending.get("notional", 0) or 0) / max(float(pending.get("entry_price", 1) or 1), 1e-12), 6),
                            "pnl": realized_pnl,
                            "pnl_pct": float(subday_exit.get("pnl_pct", 0) or 0),
                            "exit_reason": str(subday_exit.get("reason", "SUBDAY_ACB_NORMALIZATION")),
                            "bars_held": int(subday_exit.get("bars_held", 0) or 0),
                            "posture": pending.get("posture", ""),
                            "overlay_flip": bool(pending.get("overlay_flip", False)),
                            "overlay_reason": str(pending.get("overlay_reason", "")),
                            "overlay_slot": int(pending.get("overlay_slot", 0) or 0),
                        },
                    )
                    # Without entry metadata, synthesize a minimal record for the CLOSED row.
                    close_pending = pending if pending else self._fallback_pending_for_close(str(tid or ""), subday_exit)
                    self._ps_write_closed(str(tid or ""), close_pending, subday_exit)
                    if tid and not pending:
                        log(
                            " SUBDAY_EXIT pending metadata missing; wrote fallback CLOSED tombstone "
                            f"for trade={tid} asset={close_pending.get('asset', '')}"
                        )
        # Rate-limit the ACB status line to one per 300 seconds.
        now = time.time()
        if now - self._exf_log_time >= 300:
            self._exf_log_time = now
            log(f"ACB subday: boost={acb_info['boost']:.4f} beta={acb_info['beta']:.4f} "
                f"signals={acb_info['signals']:.1f} src={acb_info.get('source','?')}")
        # ACB_EXIT disabled: update_acb_boost() called to keep boost/beta current
        # (ACBv6 intact), but SUBDAY_ACB_NORMALIZATION exits are suppressed.
    except ValueError as e:
        log(f"ACB Stale Data Fallback: {e}")
    except Exception as e:
        log(f"on_exf_update Error: {e}")
def _wire_obf(self, assets):
if not assets or self.ob_assets:
return
self.ob_assets = assets
from nautilus_dolphin.nautilus.hz_ob_provider import HZOBProvider
live_ob = HZOBProvider(
hz_cluster=HZ_CLUSTER,
hz_host=HZ_HOST,
assets=assets,
)
self.ob_eng = OBFeatureEngine(live_ob)
# No preload_date() call — live mode uses step_live() per scan
self.eng.set_ob_engine(self.ob_eng)
log(f" OBF wired: HZOBProvider, {len(assets)} assets (LIVE mode)")
def _save_capital(self):
"""Persist capital to HZ (primary) and disk (fallback) so restarts survive HZ loss."""
capital = getattr(self.eng, 'capital', None)
if capital is None or not math.isfinite(capital) or capital < 1.0:
return
self._commit_capital_state(
float(capital),
reason="ENGINE_SAVE",
source="engine_snapshot",
mirror_control_plane=False,
)
def _publish_corrective_replay(self, replay_blob: Mapping[str, Any]) -> None:
    """Publish a corrective replay seed back into HZ and disk.

    The blob's ``capital`` is committed through ``_commit_capital_state`` with
    the replay key updated and the control plane mirrored. A capital below 1.0
    or non-finite is not published. Any exception is logged and swallowed.
    """
    try:
        seed_capital = _safe_float(replay_blob.get("capital", 0.0), 0.0)
        if seed_capital >= 1.0 and math.isfinite(seed_capital):
            commit_reason = str(replay_blob.get("reason", "") or "CORRECTIVE_REPLAY")
            commit_trade_id = str(replay_blob.get("trade_id", "") or "")
            commit_asset = str(replay_blob.get("asset", "") or "")
            self._commit_capital_state(
                seed_capital,
                reason=commit_reason,
                source="corrective_replay",
                trade_id=commit_trade_id,
                asset=commit_asset,
                replay_blob=replay_blob,
                update_replay_key=True,
                mirror_control_plane=True,
            )
    except Exception as e:
        log(f" corrective replay publish failed: {e}")
def request_capital_update(
self,
capital: float,
*,
reason: str = "CAPITAL_UPDATE",
source: str = "control_plane",
trade_id: str = "",
asset: str = "",
event_ts: float | None = None,
applies_before_ts: float | None = None,
replay_blob: Mapping[str, Any] | None = None,
) -> dict:
"""Queue a capital update onto the BLUE runtime command channel."""
cmd = {
"command_id": f"cap-update-{uuid.uuid4().hex[:16]}",
"action": "SET_CAPITAL",
"capital": float(capital),
"reason": str(reason or "CAPITAL_UPDATE"),
"source": str(source or "control_plane"),
"ts": float(time.time()),
}
if event_ts is not None:
cmd["event_ts"] = float(event_ts)
if applies_before_ts is not None:
cmd["applies_before_ts"] = float(applies_before_ts)
if trade_id:
cmd["trade_id"] = str(trade_id)
if asset:
cmd["asset"] = str(asset)
if replay_blob is not None:
cmd["replay_blob"] = dict(replay_blob)
if self._enqueue_blue_runtime_command(cmd):
return cmd
raise RuntimeError("BLUE runtime command queue unavailable")
def _restore_capital(self):
"""Restore capital from live HZ state or ledger-backed snapshots.
The raw scalar checkpoint is legacy-only and requires the explicit
DOLPHIN_ALLOW_LEGACY_CAPITAL_CHECKPOINT=1 escape hatch.
"""
self._restore_failed = False
self._restore_failure_reason = ""
self._restore_source = ""
if self._restore_capital_from_state():
return
log(" Capital: no sane state source found — restore halted")
def _push_state(self, scan_number, vel_div, vol_ok, posture):
    """Publish the engine snapshot, emit the runner heartbeat, and persist capital.

    Builds a snapshot dict from the engine's capital and (single) open position,
    stores a copy on ``self._last_engine_snapshot_payload``, writes it as JSON to
    ``state_map['engine_snapshot']``, writes a heartbeat when a heartbeat map is
    configured, and finally calls ``_save_capital`` when capital is sane.

    Args:
        scan_number: stored as ``last_scan_number`` in the snapshot.
        vel_div: stored as ``last_vel_div``.
        vol_ok: stored as ``vol_ok``.
        posture: stored as ``posture``.

    Any exception is logged and swallowed.
    """
    try:
        # NOTE(review): lock extent reconstructed from a flattened listing —
        # confirm how much of this method runs under eng_lock.
        with self.eng_lock:
            capital = getattr(self.eng, 'capital', 25000.0)
            # Engine uses a single NDPosition object, not a list
            pos = getattr(self.eng, 'position', None)
            if pos is not None:
                # Entry-time metadata for the open trade; {} when none is recorded.
                pending = self._pending_entries.get(getattr(pos, "trade_id", ""), {})
                open_notional = float(getattr(pos, 'notional', 0) or 0)
                open_positions_list = [{
                    'trade_id': getattr(pos, 'trade_id', ''),
                    'asset': pos.asset,
                    'side': 'SHORT' if pos.direction == -1 else 'LONG',
                    'entry_price': pos.entry_price,
                    'quantity': round(open_notional / pos.entry_price, 6) if pos.entry_price else 0,
                    'notional': open_notional,
                    'retraction_legs': int(pending.get('retraction_legs', 0) or 0),
                    'realized_pnl_legs_total': float(pending.get('realized_pnl_legs_total', 0.0) or 0.0),
                    'chain_root_trade_id': str(pending.get('chain_root_trade_id', getattr(pos, 'trade_id', '')) or getattr(pos, 'trade_id', '')),
                    'chain_head_leg_id': str(pending.get('chain_head_leg_id', f"{getattr(pos, 'trade_id', '')}:open") or f"{getattr(pos, 'trade_id', '')}:open"),
                    'chain_prev_leg_id': str(pending.get('chain_prev_leg_id', '') or ''),
                    'chain_seq': int(pending.get('chain_seq', pending.get('retraction_legs', 0)) or 0),
                    'chain_token': str(pending.get('chain_token', '') or ''),
                    'leverage': float(getattr(pos, 'leverage', 0) or 0),
                    'unrealized_pnl': round(pos.pnl_pct * open_notional, 2),
                }]
            else:
                open_notional = 0.0
                open_positions_list = []
            # Guard the division: falsy, non-positive or non-finite capital yields 0.0.
            cur_leverage = (open_notional / capital) if capital and capital > 0 and math.isfinite(capital) else 0.0
            snapshot = {
                # Non-finite capital is published as None.
                'capital': capital if math.isfinite(capital) else None,
                'open_positions': open_positions_list,
                'algo_version': ALGO_VERSION,
                'last_scan_number': scan_number, 'last_vel_div': vel_div,
                'vol_ok': vol_ok, 'posture': posture,
                'vol_gate_threshold': float(self.vol_p60_threshold),
                'scans_processed': self.scans_processed,
                'trades_executed': self.trades_executed,
                'bar_idx': self.bar_idx,
                'timestamp': datetime.now(timezone.utc).isoformat(),
                # Leverage envelope — for TUI slider
                'leverage_soft_cap': getattr(self.eng, 'base_max_leverage', 8.0),
                'leverage_abs_cap': getattr(self.eng, 'abs_max_leverage', 9.0),
                'open_notional': round(open_notional, 2),
                'current_leverage': round(cur_leverage, 4),
                'trade_direction_base': int(self.trade_direction),
                'trade_direction_runtime': int(self._runtime_direction),
                # Launch metadata for observability only; no trading behavior.
                'bingx_environment': str(os.environ.get("DOLPHIN_BINGX_ENV", "ENGINE") or "ENGINE").strip().upper(),
                'bingx_sizing_mode': str(os.environ.get("DOLPHIN_BINGX_SIZING_MODE", "engine") or "engine").strip().lower(),
                'bingx_allow_mainnet': bool(_env_bool("DOLPHIN_BINGX_ALLOW_MAINNET", False)),
                'bingx_default_leverage': _safe_float(os.environ.get("DOLPHIN_BINGX_DEFAULT_LEVERAGE"), 1.0),
                # Env override first, else the engine's abs_max_leverage, else 3.0.
                'bingx_exchange_leverage_cap': int(
                    _safe_float(
                        os.environ.get(
                            "DOLPHIN_BINGX_EXCHANGE_LEVERAGE_CAP",
                            getattr(self.eng, 'abs_max_leverage', 3.0),
                        ),
                        3.0,
                    )
                ),
                'efsm': self._efsm.snapshot() if self._efsm is not None else None,
                'advanced_sl': self._advanced_sl.snapshot_dict() if self._advanced_sl is not None else None,
            }
        # Keep a shallow copy of the last published payload on the instance.
        self._last_engine_snapshot_payload = dict(snapshot)
        future = self.state_map.put('engine_snapshot', json.dumps(snapshot))
        # The put's completion is observed with a no-op callback; its result is not inspected here.
        future.add_done_callback(lambda f: None)
        # Heartbeat — MHS checks age < 30s; force blocking put to avoid
        # silent async drop/stall under client backpressure.
        if self.heartbeat_map is not None:
            hb = build_runner_heartbeat_payload(
                flow="nautilus_event_trader",
                phase="trading",
                run_date=self.current_day,
                runner="blue",
            )
            try:
                write_runner_heartbeat(self.heartbeat_map, hb)
            except Exception as hb_err:
                log(f" Heartbeat put failed: {hb_err}")
        # Persist capital so next restart resumes from here
        if capital is not None and math.isfinite(capital) and capital >= 1.0:
            self._save_capital()
    except Exception as e:
        log(f" Failed to push state: {e}")
def run(self):
    """Start the trader and block until the module-level ``running`` flag clears.

    Sequence: build the engine, connect to Hazelcast, start the heartbeat and
    scan-watchdog daemon threads, restore capital, roll the day, restore
    position state, seed the published snapshot, register the scan and EXF
    entry listeners, then sleep-poll ``running``. If either restore step sets
    ``self._restore_failed`` the trader shuts down and returns before
    subscribing. ``shutdown()`` always runs when the poll loop exits.
    """
    global running
    log("=" * 70)
    log("🐬 DOLPHIN Nautilus Event-Driven Trader Starting")
    log("=" * 70)
    self._build_engine()
    self._connect_hz()
    # Both background threads are named so they can be identified in thread dumps.
    threading.Thread(target=self._heartbeat_loop, daemon=True,
                     name="heartbeat_loop").start()
    threading.Thread(target=self._scan_watchdog_loop, daemon=True,
                     name="scan_watchdog").start()
    self._restore_capital()
    if self._restore_failed:
        log(f"RESTORE HALT: {self._restore_failure_reason}")
        self.shutdown()
        return
    self._rollover_day()
    self._restore_position_state()
    if self._restore_failed:
        log(f"RESTORE HALT: {self._restore_failure_reason}")
        self.shutdown()
        return
    # Seed the live snapshot immediately so engine_snapshot and
    # capital_checkpoint reflect the restored capital before scan traffic.
    try:
        posture = self._read_posture()
        self._push_state(self.bar_idx, 0.0, True, posture)
    except Exception as e:
        log(f" Startup seed push failed: {e}")

    def listener(event):
        self.on_scan(event)

    self.features_map.add_entry_listener(
        key='latest_eigen_scan', include_value=True,
        updated_func=listener, added_func=listener
    )

    def exf_listener(event):
        self.on_exf_update(event)

    self.features_map.add_entry_listener(
        key='exf_latest', include_value=True,
        updated_func=exf_listener, added_func=exf_listener
    )
    log("✅ Hz listener registered")
    log(f"🏷️ ALGO_VERSION: {ALGO_VERSION}")
    log("⏳ Waiting for scans...")
    # `running` is already declared global at the top of this function; the
    # redundant second declaration that used to sit here has been removed.
    if not running:
        # The flag was cleared while startup was still in progress; re-arm it
        # so the poll loop below is entered.
        log(" Startup SIGTERM latch cleared before main scan loop")
        running = True
    try:
        while running:
            time.sleep(1)
    except KeyboardInterrupt:
        log("Interrupted")
    finally:
        self.shutdown()
def shutdown(self):
    """Stop background work and release external resources (best effort).

    Signals the watchdog to stop, shuts the scan executor down without
    waiting, closes the engine's trading day when one is active, saves the
    market-state runtime, and disconnects the Hazelcast client. Each step's
    failure is logged and does not prevent the following steps.
    """
    log("Shutting down...")
    self._watchdog_stop.set()
    self._scan_executor.shutdown(wait=False)
    if self.eng and self.current_day:
        try:
            with self.eng_lock:
                summary = self.eng.end_day()
            log(f"end_day: {summary}")
        except Exception as e:
            log(f"end_day failed: {e}")
    if self._market_state_runtime is not None:
        try:
            self._market_state_runtime.save()
        except Exception as e:
            # Was silently ignored; still best-effort, but now visible.
            log(f"market state save failed: {e}")
    if self.hz_client:
        try:
            self.hz_client.shutdown()
            log("Hz disconnected")
        except Exception as e:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt and
            # SystemExit; only ordinary errors are suppressed now.
            log(f"Hz shutdown failed: {e}")
    log(f"🛑 Stopped. Scans: {self.scans_processed}, Trades: {self.trades_executed}")
def signal_handler(signum, frame):
    """Request a stop by clearing ``running``, unless SIGTERM arrives during startup grace.

    A SIGTERM received less than ``_SIGTERM_STARTUP_GRACE_S`` seconds after
    ``_PROCESS_BOOT_TS`` is logged and ignored; every other handled signal
    clears the module-level ``running`` flag.
    """
    global running
    uptime_s = time.time() - _PROCESS_BOOT_TS
    in_startup_grace = signum == signal.SIGTERM and uptime_s < _SIGTERM_STARTUP_GRACE_S
    if not in_startup_grace:
        log(f"Signal {signum} received")
        running = False
        return
    log(f"Signal {signum} received during startup grace ({uptime_s:.1f}s) — ignored")
def main():
    """Install the SIGTERM/SIGINT handlers, then build and run the live trader."""
    for signo in (signal.SIGTERM, signal.SIGINT):
        signal.signal(signo, signal_handler)
    DolphinLiveTrader().run()
# Script entry point: start the trader only when executed directly.
if __name__ == "__main__":
    main()