initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: prod/ (BLUE harness, configs, scripts, docs); nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved); adaptive_exit/ (AEM engine + models/bucket_assignments.pkl); Observability/ (EsoF advisor, TUI, dashboards); external_factors/ (EsoF producer); mc_forewarning_qlabs_fork/ (MC regime/envelope). Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
1
adaptive_exit/.write_test
Executable file
1
adaptive_exit/.write_test
Executable file
@@ -0,0 +1 @@
|
||||
test
|
||||
3
adaptive_exit/__init__.py
Executable file
3
adaptive_exit/__init__.py
Executable file
@@ -0,0 +1,3 @@
|
||||
# Smart Adaptive Exit Engine — per-bucket MAE/MFE continuation model
|
||||
# Phase 1: offline training on NG7/VBT price data
|
||||
# Phase 2: parallel shadow mode wired into BLUE per-trade
|
||||
376
adaptive_exit/adaptive_exit_engine.py
Executable file
376
adaptive_exit/adaptive_exit_engine.py
Executable file
@@ -0,0 +1,376 @@
|
||||
"""
|
||||
Adaptive Exit Engine — parallel shadow mode for BLUE.
|
||||
|
||||
Runs alongside V7 per active trade. Does NOT interfere with real exits.
|
||||
Logs shadow decisions to ClickHouse table `adaptive_exit_shadow` and
|
||||
accumulates outcomes for online model updates.
|
||||
|
||||
Integration pattern (dolphin_actor.py):
|
||||
from adaptive_exit.adaptive_exit_engine import AdaptiveExitEngine
|
||||
_adaptive_exit = AdaptiveExitEngine.load()
|
||||
|
||||
# In _on_rt_exit_timer or _on_scan_timer, per active trade:
|
||||
shadow = _adaptive_exit.evaluate(
|
||||
trade_id=_tid,
|
||||
asset=_asset,
|
||||
direction=_dir, # -1 = SHORT
|
||||
entry_price=_entry,
|
||||
current_price=_cur_px,
|
||||
bars_held=_bars,
|
||||
max_hold=120,
|
||||
recent_prices=_price_buf, # list[float], last 20+ bars
|
||||
exf=self._last_exf,
|
||||
)
|
||||
# shadow is dict with: action, p_continuation, exit_reason_shadow, bucket_id
|
||||
# Log it; never use it to exit.
|
||||
|
||||
Decision logic mirrors the spec:
|
||||
EXIT if:
|
||||
- mae > mae_threshold(vol) [hard stop]
|
||||
- giveback: mfe < k * peak_mfe AND p_continuation < p_threshold
|
||||
- tau > 1.0 [time cap]
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
import urllib.request
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from adaptive_exit.bucket_engine import build_buckets, get_bucket
|
||||
from adaptive_exit.continuation_model import ContinuationModelBank, FEATURE_COLS
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────────
P_THRESHOLD = 0.40    # P(continuation) below this → consider exit
GIVEBACK_K = 0.50     # MFE giveback fraction
MAE_MULT_TIER1 = 3.5  # vol multiplier for tier-1 stop
MAE_MULT_TIER2 = 7.0  # tier-2 stop multiplier — not referenced in the visible code (reserved?)
ATR_WINDOW = 20       # bars of log-returns used for the volatility proxy
MIN_ATR = 1e-6        # volatility floor to avoid division by zero when normalising

# ClickHouse HTTP endpoint + credentials used for shadow logging.
_CH_URL = "http://localhost:8123/"
_CH_HEADERS = {"X-ClickHouse-User": "dolphin", "X-ClickHouse-Key": "dolphin_ch_2026"}

# Shadow outcome logging
_SHADOW_TABLE = "adaptive_exit_shadow"
_SHADOW_DB = "dolphin"
|
||||
|
||||
|
||||
def _ch_insert(row: dict, db: str = _SHADOW_DB) -> None:
    """Best-effort insert of one JSON row into the ClickHouse shadow table.

    Intended to run on a daemon thread; any failure (network, auth, schema)
    is swallowed because shadow logging must never affect trading.

    Args:
        row: column → value mapping matching the shadow-table schema.
        db: target ClickHouse database (defaults to the shadow DB).
    """
    try:
        body = (json.dumps(row) + "\n").encode()
        url = f"{_CH_URL}?database={db}&query=INSERT+INTO+{_SHADOW_TABLE}+FORMAT+JSONEachRow"
        req = urllib.request.Request(url, data=body, method="POST")
        for k, v in _CH_HEADERS.items():
            req.add_header(k, v)
        # FIX: close the response explicitly — the original discarded the
        # return value of urlopen(), leaking the socket until GC.
        with urllib.request.urlopen(req, timeout=3):
            pass
    except Exception:
        pass  # shadow logging is best-effort
|
||||
|
||||
|
||||
def _ensure_shadow_table() -> None:
    """Create the shadow-decision table in ClickHouse when it is missing.

    Uses CREATE TABLE IF NOT EXISTS, so calling it on every start is safe.
    Failure is reported but non-fatal: the engine still runs, only shadow
    logging is lost.
    """
    column_defs = ",".join((
        "ts DateTime64(6, 'UTC')",
        "ts_day Date MATERIALIZED toDate(ts)",
        "trade_id String",
        "asset LowCardinality(String)",
        "bucket_id UInt8",
        "bars_held UInt16",
        "mae_norm Float32",
        "mfe_norm Float32",
        "tau_norm Float32",
        "p_cont Float32",
        "vel_div_entry Float32",
        "vel_div_now Float32",
        "action LowCardinality(String)",
        "exit_reason LowCardinality(String)",
        "actual_exit LowCardinality(String)",
        "pnl_pct Float32",
    ))
    ddl = (
        f"CREATE TABLE IF NOT EXISTS {_SHADOW_DB}.{_SHADOW_TABLE} ({column_defs}"
        ") ENGINE = MergeTree()"
        " ORDER BY (ts_day, asset, ts)"
        " TTL ts_day + INTERVAL 90 DAY"
    )
    try:
        request = urllib.request.Request(_CH_URL, data=ddl.encode(), method="POST")
        for header, value in _CH_HEADERS.items():
            request.add_header(header, value)
        urllib.request.urlopen(request, timeout=10)
    except Exception as e:
        print(f"[AdaptiveExitEngine] Warning: could not create shadow table: {e}")
|
||||
|
||||
|
||||
# ── Per-trade state ───────────────────────────────────────────────────────────
|
||||
|
||||
class _TradeState:
|
||||
def __init__(self, trade_id: str, asset: str, direction: int,
|
||||
entry_price: float, bucket_id: int, vel_div_entry: float = 0.0):
|
||||
self.trade_id = trade_id
|
||||
self.asset = asset
|
||||
self.direction = direction # -1 = SHORT, 1 = LONG
|
||||
self.entry_price = entry_price
|
||||
self.bucket_id = bucket_id
|
||||
self.vel_div_entry = vel_div_entry
|
||||
self.mae = 0.0
|
||||
self.mfe = 0.0
|
||||
self.peak_mfe = 0.0
|
||||
self.price_buf: list[float] = [] # rolling price history
|
||||
|
||||
|
||||
# ── Engine ────────────────────────────────────────────────────────────────────
|
||||
|
||||
class AdaptiveExitEngine:
|
||||
|
||||
    def __init__(self, model_bank: ContinuationModelBank, bucket_data: dict):
        # Per-bucket continuation models (may be an untrained fallback bank).
        self._model = model_bank
        # Output of build_buckets(): at least 'assignments' and 'n_buckets';
        # may also carry a 'features' DataFrame used for OBF lookups.
        self._bucket_data = bucket_data
        # trade_id → _TradeState for every trade currently being shadowed.
        self._states: dict[str, _TradeState] = {}
        # Guards _states across scan-timer and exit callbacks.
        self._lock = threading.Lock()
        # Accumulator reserved for outcomes — not consumed in the visible code.
        self._pending_outcomes: list[dict] = []
|
||||
|
||||
@classmethod
|
||||
def load(cls) -> "AdaptiveExitEngine":
|
||||
"""Load pre-trained models. Falls back gracefully if not trained yet."""
|
||||
try:
|
||||
bank = ContinuationModelBank.load()
|
||||
print("[AdaptiveExitEngine] Continuation models loaded")
|
||||
except FileNotFoundError:
|
||||
print("[AdaptiveExitEngine] WARNING: no trained model found — using untrained fallback")
|
||||
bank = ContinuationModelBank()
|
||||
|
||||
try:
|
||||
bucket_data = build_buckets(force_rebuild=False)
|
||||
print(f"[AdaptiveExitEngine] Bucket assignments loaded: "
|
||||
f"{bucket_data['n_buckets']} buckets")
|
||||
except Exception as e:
|
||||
print(f"[AdaptiveExitEngine] WARNING: bucket data unavailable ({e})")
|
||||
bucket_data = {"assignments": {}, "n_buckets": 0}
|
||||
|
||||
_ensure_shadow_table()
|
||||
return cls(bank, bucket_data)
|
||||
|
||||
# ── Trade lifecycle ───────────────────────────────────────────────────────
|
||||
|
||||
def on_entry(self, trade_id: str, asset: str, direction: int,
|
||||
entry_price: float, vel_div_entry: float = 0.0) -> None:
|
||||
bid = get_bucket(asset, self._bucket_data, fallback=0)
|
||||
with self._lock:
|
||||
self._states[trade_id] = _TradeState(trade_id, asset, direction,
|
||||
entry_price, bid, vel_div_entry)
|
||||
|
||||
def on_exit(self, trade_id: str, actual_exit_reason: str,
|
||||
pnl_pct: float) -> None:
|
||||
"""Called when the real system closes a trade — records outcome for online update.
|
||||
|
||||
Only natural exits feed the model (FIXED_TP, MAX_HOLD, V7/AE stops).
|
||||
Forced exits (HIBERNATE_HALT, SUBDAY_ACB_NORMALIZATION) are filtered by
|
||||
the model bank's natural-exits-only guard, preventing regime artifacts
|
||||
from biasing the continuation distribution.
|
||||
"""
|
||||
with self._lock:
|
||||
st = self._states.pop(trade_id, None)
|
||||
if st is None:
|
||||
return
|
||||
cont = 1 if pnl_pct > 0 else 0
|
||||
if st.price_buf:
|
||||
prices = np.array(st.price_buf[-ATR_WINDOW:])
|
||||
atr = max(np.std(np.diff(np.log(np.maximum(prices, 1e-12)))), MIN_ATR)
|
||||
mae_norm = st.mae / atr
|
||||
mfe_norm = st.mfe / atr
|
||||
tau_norm = min(len(st.price_buf) / 120.0, 1.0)
|
||||
ret_1 = float(np.log(prices[-1] / prices[-2])) if len(prices) >= 2 else 0.0
|
||||
ret_3 = float(np.log(prices[-1] / prices[-4])) if len(prices) >= 4 else ret_1
|
||||
|
||||
obf = self._bucket_data.get("features", {})
|
||||
obf_row = {}
|
||||
if hasattr(obf, "loc") and st.asset in obf.index:
|
||||
obf_row = obf.loc[st.asset].to_dict()
|
||||
|
||||
vel_div_now = float(prices[-1]) if len(prices) >= 1 else st.vel_div_entry # placeholder; overridden if caller passes it
|
||||
p_pred = self._model.predict(
|
||||
mae_norm=mae_norm, mfe_norm=mfe_norm, tau_norm=tau_norm,
|
||||
ret_1=ret_1, ret_3=ret_3,
|
||||
vel_div_entry=st.vel_div_entry, vel_div_now=st.vel_div_entry,
|
||||
spread_bps=float(obf_row.get("spread_bps", 0.0)),
|
||||
depth_usd=float(obf_row.get("depth_usd", 0.0)),
|
||||
fill_prob=float(obf_row.get("fill_prob", 0.9)),
|
||||
bucket_id=st.bucket_id,
|
||||
)
|
||||
|
||||
self._model.online_update(
|
||||
bucket_id=st.bucket_id,
|
||||
mae_norm=mae_norm,
|
||||
mfe_norm=mfe_norm,
|
||||
tau_norm=tau_norm,
|
||||
ret_1=ret_1,
|
||||
ret_3=ret_3,
|
||||
vel_div_entry=st.vel_div_entry,
|
||||
vel_div_now=st.vel_div_entry,
|
||||
spread_bps=float(obf_row.get("spread_bps", 0.0)),
|
||||
depth_usd=float(obf_row.get("depth_usd", 0.0)),
|
||||
fill_prob=float(obf_row.get("fill_prob", 0.9)),
|
||||
continuation=cont,
|
||||
exit_reason=actual_exit_reason,
|
||||
p_pred=p_pred,
|
||||
)
|
||||
|
||||
# Log one final shadow row at close so actual_exit is queryable for comparison
|
||||
threading.Thread(target=_ch_insert, args=({
|
||||
"ts": int(time.time() * 1e6),
|
||||
"trade_id": trade_id,
|
||||
"asset": st.asset,
|
||||
"bucket_id": int(st.bucket_id),
|
||||
"bars_held": int(tau_norm * 120),
|
||||
"mae_norm": float(mae_norm),
|
||||
"mfe_norm": float(mfe_norm),
|
||||
"tau_norm": float(tau_norm),
|
||||
"p_cont": float(p_pred),
|
||||
"vel_div_entry": float(st.vel_div_entry),
|
||||
"vel_div_now": float(st.vel_div_entry),
|
||||
"action": "CLOSED",
|
||||
"exit_reason": "",
|
||||
"actual_exit": actual_exit_reason,
|
||||
"pnl_pct": float(pnl_pct),
|
||||
},), daemon=True).start()
|
||||
|
||||
# ── Per-bar evaluation ────────────────────────────────────────────────────
|
||||
|
||||
def evaluate(
|
||||
self,
|
||||
trade_id: str,
|
||||
asset: str,
|
||||
direction: int,
|
||||
entry_price: float,
|
||||
current_price: float,
|
||||
bars_held: int,
|
||||
max_hold: int = 120,
|
||||
recent_prices: Optional[list] = None,
|
||||
exf: Optional[dict] = None,
|
||||
vel_div_now: float = 0.0,
|
||||
) -> dict:
|
||||
"""
|
||||
Evaluate whether the adaptive engine would exit this trade.
|
||||
|
||||
Returns shadow decision dict (never executed — caller logs only).
|
||||
"""
|
||||
with self._lock:
|
||||
if trade_id not in self._states:
|
||||
bid = get_bucket(asset, self._bucket_data, fallback=0)
|
||||
self._states[trade_id] = _TradeState(trade_id, asset, direction,
|
||||
entry_price, bid, vel_div_now)
|
||||
st = self._states[trade_id]
|
||||
|
||||
# Update price buffer
|
||||
if recent_prices:
|
||||
st.price_buf = list(recent_prices[-ATR_WINDOW - 5:])
|
||||
elif current_price:
|
||||
st.price_buf.append(current_price)
|
||||
|
||||
# Compute delta (positive = favorable for direction)
|
||||
delta = direction * (entry_price - current_price) / entry_price
|
||||
# For SHORT (dir=-1): delta = -(entry - cur)/entry = (cur - entry)/entry
|
||||
# Wait — direction=-1 means SHORT, favorable = price drops = cur < entry
|
||||
# delta = (entry - cur)/entry * abs(direction) ... let's be explicit:
|
||||
if direction == -1: # SHORT
|
||||
delta = (entry_price - current_price) / entry_price # +ve if price dropped
|
||||
else: # LONG
|
||||
delta = (current_price - entry_price) / entry_price # +ve if price rose
|
||||
|
||||
adverse = max(0.0, -delta)
|
||||
favorable = max(0.0, delta)
|
||||
st.mae = max(st.mae, adverse)
|
||||
st.mfe = max(st.mfe, favorable)
|
||||
st.peak_mfe = max(st.peak_mfe, st.mfe)
|
||||
|
||||
# ATR from price buffer
|
||||
prices_arr = np.array(st.price_buf, dtype=float) if st.price_buf else np.array([current_price])
|
||||
if len(prices_arr) >= 2:
|
||||
log_rets = np.diff(np.log(np.maximum(prices_arr, 1e-12)))
|
||||
atr = max(float(np.std(log_rets[-ATR_WINDOW:])), MIN_ATR)
|
||||
else:
|
||||
atr = MIN_ATR
|
||||
|
||||
mae_norm = st.mae / atr
|
||||
mfe_norm = st.mfe / atr
|
||||
tau_norm = bars_held / max_hold
|
||||
|
||||
prices_f = prices_arr[-ATR_WINDOW:]
|
||||
ret_1 = float(np.log(prices_f[-1] / prices_f[-2])) if len(prices_f) >= 2 else 0.0
|
||||
ret_3 = float(np.log(prices_f[-1] / prices_f[-4])) if len(prices_f) >= 4 else ret_1
|
||||
|
||||
# OBF static features for this asset
|
||||
obf_feats = self._bucket_data.get("features", {})
|
||||
obf_row = {}
|
||||
if hasattr(obf_feats, "loc") and asset in obf_feats.index:
|
||||
obf_row = obf_feats.loc[asset].to_dict()
|
||||
|
||||
# P(continuation)
|
||||
p_cont = self._model.predict(
|
||||
mae_norm=mae_norm,
|
||||
mfe_norm=mfe_norm,
|
||||
tau_norm=tau_norm,
|
||||
ret_1=ret_1,
|
||||
ret_3=ret_3,
|
||||
vel_div_entry=st.vel_div_entry,
|
||||
vel_div_now=vel_div_now,
|
||||
spread_bps=float(obf_row.get("spread_bps", 0.0)),
|
||||
depth_usd=float(obf_row.get("depth_usd", 0.0)),
|
||||
fill_prob=float(obf_row.get("fill_prob", 0.9)),
|
||||
bucket_id=st.bucket_id,
|
||||
)
|
||||
|
||||
# Decision logic
|
||||
mae_threshold = max(0.005, MAE_MULT_TIER1 * atr)
|
||||
action = "HOLD"
|
||||
exit_reason = ""
|
||||
|
||||
if st.mae > mae_threshold:
|
||||
action = "EXIT"
|
||||
exit_reason = "AE_MAE_STOP"
|
||||
elif (st.peak_mfe > 0 and st.mfe < GIVEBACK_K * st.peak_mfe
|
||||
and p_cont < P_THRESHOLD):
|
||||
action = "EXIT"
|
||||
exit_reason = "AE_GIVEBACK_LOW_CONT"
|
||||
elif tau_norm > 1.0:
|
||||
action = "EXIT"
|
||||
exit_reason = "AE_TIME"
|
||||
|
||||
return {
|
||||
"trade_id": trade_id,
|
||||
"asset": st.asset,
|
||||
"action": action,
|
||||
"exit_reason_shadow": exit_reason,
|
||||
"p_continuation": p_cont,
|
||||
"mae_norm": mae_norm,
|
||||
"mfe_norm": mfe_norm,
|
||||
"tau_norm": tau_norm,
|
||||
"bucket_id": st.bucket_id,
|
||||
"vel_div_entry": st.vel_div_entry,
|
||||
"vel_div_now": vel_div_now,
|
||||
}
|
||||
|
||||
def log_shadow(self, shadow: dict, actual_exit: str = "", pnl_pct: float = 0.0) -> None:
|
||||
"""Async log a shadow decision to ClickHouse."""
|
||||
row = {
|
||||
"ts": int(time.time() * 1e6),
|
||||
"trade_id": shadow.get("trade_id", ""),
|
||||
"asset": shadow.get("asset", ""),
|
||||
"bucket_id": int(shadow.get("bucket_id", 0)),
|
||||
"bars_held": int(shadow.get("tau_norm", 0) * 120),
|
||||
"mae_norm": float(shadow.get("mae_norm", 0)),
|
||||
"mfe_norm": float(shadow.get("mfe_norm", 0)),
|
||||
"tau_norm": float(shadow.get("tau_norm", 0)),
|
||||
"p_cont": float(shadow.get("p_continuation", 0.5)),
|
||||
"vel_div_entry": float(shadow.get("vel_div_entry", 0.0)),
|
||||
"vel_div_now": float(shadow.get("vel_div_now", 0.0)),
|
||||
"action": shadow.get("action", "HOLD"),
|
||||
"exit_reason": shadow.get("exit_reason_shadow", ""),
|
||||
"actual_exit": actual_exit,
|
||||
"pnl_pct": float(pnl_pct),
|
||||
}
|
||||
threading.Thread(target=_ch_insert, args=(row,), daemon=True).start()
|
||||
182
adaptive_exit/bucket_engine.py
Executable file
182
adaptive_exit/bucket_engine.py
Executable file
@@ -0,0 +1,182 @@
|
||||
"""
|
||||
Asset bucketing engine.
|
||||
|
||||
Clusters assets into N buckets using price-based characteristics computed
|
||||
from 1m klines historical data (5-year window):
|
||||
- vol_daily_pct : annualised daily return volatility
|
||||
- corr_btc : correlation of returns with BTC
|
||||
- log_price : log of median close price (price tier proxy)
|
||||
- vov : vol-of-vol (instability of vol regime)
|
||||
|
||||
OBF (spread, depth, imbalance) is NOT used here — it covers only ~21 days
|
||||
and would overfit to a tiny recent window. OBF is overlay-phase only.
|
||||
"""
|
||||
import os
|
||||
import pickle
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.metrics import silhouette_score
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
# On-disk cache of pickled bucket assignments (created on first build).
_BUCKET_PATH = os.path.join(os.path.dirname(__file__), "models", "bucket_assignments.pkl")
# Default source directory of 1m kline parquet files.
_DEFAULT_KLINES_DIR = "/mnt/dolphin_training/data/vbt_cache_klines"

# Sample every Nth file to keep memory manageable (1711 files × 1440 rows = 2.5M rows/asset)
_SAMPLE_STRIDE = 30  # ~57 monthly samples from 5yr history
|
||||
|
||||
|
||||
def _load_klines_features(klines_dir: str) -> pd.DataFrame:
    """
    Load sampled 1m klines parquets and compute per-asset characteristics.
    Returns DataFrame indexed by symbol with columns:
        vol_daily_pct, corr_btc, log_price, vov
    plus btc_relevance (corr × log-price market-significance proxy).

    Raises:
        RuntimeError: when the directory has no parquets, none load, or no
            BTC column is found (corr_btc would be undefined).
    """
    files = sorted(f for f in os.listdir(klines_dir) if f.endswith(".parquet"))
    if not files:
        raise RuntimeError(f"No parquet files in {klines_dir}")

    # Stride-subsample the file list to bound memory (see _SAMPLE_STRIDE).
    sampled = files[::_SAMPLE_STRIDE]
    print(f" Klines: {len(files)} files, sampling every {_SAMPLE_STRIDE}th → {len(sampled)} files")

    dfs = []
    for fn in sampled:
        try:
            df = pd.read_parquet(os.path.join(klines_dir, fn))
            dfs.append(df)
        except Exception:
            continue  # best-effort: skip unreadable/corrupt parquets

    if not dfs:
        raise RuntimeError("Failed to load any klines parquets")

    # NOTE(review): rows from non-adjacent sampled files are concatenated
    # back-to-back, so pct_change() below yields one spurious "return" per
    # file boundary — presumably negligible at this sample size, but worth
    # confirming.
    combined = pd.concat(dfs, ignore_index=True)

    # Price columns are bare symbol names; exclude known metadata columns
    meta_cols = {"timestamp", "open_time", "close_time", "date", "scan_number",
                 "v50_lambda_max_velocity", "v150_lambda_max_velocity",
                 "v300_lambda_max_velocity", "v750_lambda_max_velocity",
                 "vel_div", "instability_50", "instability_150"}
    price_cols = [c for c in combined.columns if c not in meta_cols]

    # If OHLCV multi-level columns, extract close
    if any("_close" in c.lower() for c in price_cols):
        price_cols = [c for c in price_cols if "_close" in c.lower()]
        sym_map = {c: c.lower().replace("_close", "").upper() for c in price_cols}
    else:
        sym_map = {c: c for c in price_cols}  # already bare symbol names
    prices = combined[price_cols].rename(columns=sym_map).astype(float)

    # Ensure BTC present for correlation
    btc_col = next((c for c in prices.columns if "BTC" in c.upper()), None)
    if btc_col is None:
        raise RuntimeError("BTCUSDT not found in klines — cannot compute corr_btc")

    # fill_method=None: do not forward-fill gaps — a gap simply drops rows.
    rets = prices.pct_change(fill_method=None).dropna()
    btc_rets = rets[btc_col]

    records = []
    for sym in prices.columns:
        r = rets[sym].dropna()
        if len(r) < 100:
            continue  # too few observations for stable statistics
        # Daily vol proxy: std of 1m returns × sqrt(1440) (1440 bars/day) × sqrt(252)
        vol_daily = r.std() * np.sqrt(1440 * 252)
        corr_btc = r.corr(btc_rets)
        # log1p keeps sub-$1 alt prices well-behaved.
        log_price = np.log1p(prices[sym].median())
        # Vol-of-vol: rolling 60-bar std, then std of that series
        rolling_vol = r.rolling(60).std().dropna()
        vov = rolling_vol.std() / (rolling_vol.mean() + 1e-9)
        # NaN correlation (e.g. constant series) → neutral 0.5 prior.
        corr_val = float(corr_btc) if not np.isnan(corr_btc) else 0.5
        records.append({
            "symbol": sym,
            "vol_daily_pct": vol_daily * 100,
            "corr_btc": corr_val,
            "log_price": log_price,
            "btc_relevance": corr_val * log_price,  # market-significance proxy
            "vov": vov,
        })

    df = pd.DataFrame(records).set_index("symbol")
    # Drop assets with non-finite characteristics (e.g. flat price series).
    df = df.replace([np.inf, -np.inf], np.nan).dropna()
    print(f" Computed characteristics for {len(df)} assets")
    return df
|
||||
|
||||
|
||||
def find_optimal_k(X_scaled: np.ndarray, k_min: int = 4, k_max: int = 12) -> int:
    """Pick the cluster count in [k_min, k_max] maximising the silhouette score.

    Args:
        X_scaled: standardised feature matrix (n_samples, n_features).
        k_min: smallest k to try; also returned when the search range is
            empty (fewer than k_min + 1 samples).
        k_max: largest k to try; implicitly capped below n_samples by range().

    Returns:
        The best-scoring k.
    """
    best_k, best_sil = k_min, -1.0
    for k in range(k_min, min(k_max + 1, len(X_scaled))):
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(X_scaled)
        # FIX: seed the silhouette subsample — it was previously unseeded,
        # so the chosen k could change run-to-run on datasets > 500 samples
        # even though KMeans itself was seeded.
        sil = silhouette_score(X_scaled, labels,
                               sample_size=min(500, len(X_scaled)),
                               random_state=42)
        if sil > best_sil:
            best_sil, best_k = sil, k
    return best_k
|
||||
|
||||
|
||||
def build_buckets(
    klines_dir: str = _DEFAULT_KLINES_DIR,
    k_override: Optional[int] = None,
    force_rebuild: bool = False,
) -> dict:
    """
    Build or load bucket assignments from 1m klines price characteristics.

    A cached result at _BUCKET_PATH is returned unless force_rebuild=True.

    Args:
        klines_dir: directory of 1m kline parquet files.
        k_override: fixed bucket count; when None, k is chosen by
            silhouette search.
        force_rebuild: ignore the on-disk cache and recompute.

    Returns dict:
        - 'assignments': {symbol: bucket_id}
        - 'n_buckets': int
        - 'model': fitted KMeans
        - 'scaler': fitted StandardScaler
        - 'features': DataFrame of per-asset characteristics

    Raises:
        RuntimeError: when fewer than 4 assets have usable characteristics.
    """
    # NOTE(review): pickle.load of a local cache — safe only while the
    # models/ directory is trusted; never point _BUCKET_PATH at external data.
    if not force_rebuild and os.path.exists(_BUCKET_PATH):
        with open(_BUCKET_PATH, "rb") as f:
            return pickle.load(f)

    print(f"[BucketEngine] Computing price characteristics from {klines_dir} ...")
    feat = _load_klines_features(klines_dir)

    if len(feat) < 4:
        raise RuntimeError(f"Only {len(feat)} assets — need at least 4 for bucketing")

    feature_cols = ["vol_daily_pct", "corr_btc", "log_price", "btc_relevance", "vov"]
    X = feat[feature_cols].values

    # Standardise so no single characteristic dominates the distance metric.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    if k_override is not None:
        k = k_override
    else:
        # Cap k so buckets average at least ~4 assets each.
        k_max = min(12, len(feat) // 4)
        print(f" Searching optimal k in [4, {k_max}] for {len(feat)} assets...")
        k = find_optimal_k(X_scaled, k_min=4, k_max=k_max)

    print(f" Fitting KMeans k={k}...")
    km = KMeans(n_clusters=k, random_state=42, n_init=20)
    labels = km.fit_predict(X_scaled)

    assignments = {sym: int(lbl) for sym, lbl in zip(feat.index, labels)}

    result = {
        "assignments": assignments,
        "n_buckets": k,
        "model": km,
        "scaler": scaler,
        "features": feat,
    }

    # Persist so subsequent loads skip the expensive recompute.
    os.makedirs(os.path.dirname(_BUCKET_PATH), exist_ok=True)
    with open(_BUCKET_PATH, "wb") as f:
        pickle.dump(result, f)

    print(f" Saved bucket assignments: {k} buckets, {len(assignments)} assets → {_BUCKET_PATH}")
    return result
|
||||
|
||||
|
||||
def get_bucket(symbol: str, bucket_data: dict, fallback: int = 0) -> int:
    """Look up the bucket ID for *symbol*; unknown symbols get *fallback*."""
    assignments = bucket_data["assignments"]
    return assignments.get(symbol, fallback)
|
||||
305
adaptive_exit/continuation_model.py
Executable file
305
adaptive_exit/continuation_model.py
Executable file
@@ -0,0 +1,305 @@
|
||||
"""
|
||||
Per-bucket continuation probability model.
|
||||
|
||||
Architecture:
|
||||
- One LogisticRegression per bucket (warm_start=True for online updates)
|
||||
- Global fallback model trained on all buckets
|
||||
- Online update: accumulate buffer → partial_fit periodically
|
||||
|
||||
Anti-degradation (basin guard):
|
||||
Shadow-only exits create a feedback loop: model says EXIT → only early-exit
|
||||
outcomes are observed → model learns from biased short-horizon data → drifts
|
||||
to "always EXIT". Three safeguards prevent this:
|
||||
|
||||
1. NATURAL_EXITS_ONLY — online updates only from FIXED_TP / MAX_HOLD exits.
|
||||
Forced exits (HIBERNATE_HALT, SUBDAY_ACB_NORMALIZATION) are excluded because
|
||||
they are regime artifacts, not continuation-relevant outcomes.
|
||||
|
||||
2. Rolling accuracy monitor — tracks whether the model's continuation predictions
|
||||
match actual outcomes over a sliding window. If accuracy drops below
|
||||
DEGRADATION_THRESHOLD, online updates are paused until it recovers.
|
||||
|
||||
3. Label balance guard — if the online update buffer is >80% one class,
|
||||
the flush is skipped (insufficient signal diversity).
|
||||
|
||||
Features: [mae_norm, mfe_norm, tau_norm, ret_1, ret_3, spread_bps, depth_usd, fill_prob]
|
||||
Target: continuation (1 = still favorable, 0 = adverse)
|
||||
|
||||
Usage:
|
||||
model = ContinuationModelBank.load() # or .train(df)
|
||||
p = model.predict(asset="BTCUSDT", mae_norm=0.5, mfe_norm=0.2, tau_norm=0.3,
|
||||
ret_1=-0.001, ret_3=-0.003, bucket_id=4)
|
||||
"""
|
||||
import os
|
||||
import pickle
|
||||
import threading
|
||||
from collections import defaultdict, deque
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
# On-disk location of the pickled ContinuationModelBank.
_MODEL_PATH = os.path.join(os.path.dirname(__file__), "models", "continuation_models.pkl")

# Model feature order — predict()/online_update() must build vectors in
# exactly this order.
FEATURE_COLS = [
    # trade state
    "mae_norm", "mfe_norm", "tau_norm", "ret_1", "ret_3",
    # eigenvalue signal — entry quality and current divergence
    "vel_div_entry",  # vel_div at entry bar; always <-0.02 at BLUE inference
    "vel_div_now",    # vel_div at current bar k; live signal during excursion
    # OBF (static median; zero when unavailable)
    "spread_bps", "depth_usd", "fill_prob",
    # ExF — macro/sentiment (daily NPZ backfill; zero-filled when unavailable)
    "exf_fng",          # Fear & Greed / 100 (0–1)
    "exf_fng_delta",    # (fng - fng_prev) / 100
    "exf_funding_btc",  # BTC perpetual funding rate
    "exf_dvol_btc",     # BTC implied vol / 100
    "exf_chg24_btc",    # BTC 24h return / 100
]
|
||||
|
||||
# Online update config
ONLINE_BUFFER_SIZE = 200  # samples before triggering partial retrain
ONLINE_MIN_SAMPLES = 50   # min samples per bucket to attempt fit

# Anti-degradation config
NATURAL_EXIT_REASONS = frozenset({"FIXED_TP", "MAX_HOLD", "V7_MAE_SL_VOL_NORM",
                                  "V7_COMPOSITE_PRESSURE", "AE_MAE_STOP",
                                  "AE_GIVEBACK_LOW_CONT", "AE_TIME"})
ACCURACY_WINDOW = 50          # rolling window for accuracy monitoring
DEGRADATION_THRESHOLD = 0.40  # pause updates if accuracy drops below this
LABEL_BALANCE_MIN = 0.15      # skip flush if minority class < 15% of buffer


class DegradationGuard:
    """Rolling accuracy monitor that pauses online updates when the model degrades.

    Each prediction/outcome pair is scored as hit-or-miss at the 0.5
    decision boundary. Once at least half a window of outcomes has been
    seen, updates are paused whenever the rolling hit rate falls below
    DEGRADATION_THRESHOLD — and resume automatically when it recovers.
    """

    def __init__(self):
        # Ring of 1 (correct call) / 0 (wrong call) over the last
        # ACCURACY_WINDOW outcomes.
        self._preds: deque = deque(maxlen=ACCURACY_WINDOW)
        self._paused = False

    def record(self, p_cont: float, actual_continuation: int) -> None:
        """Score one prediction against its realised outcome."""
        hit = int((p_cont >= 0.5) == bool(actual_continuation))
        self._preds.append(hit)
        # Only judge once enough history has accumulated (half a window).
        if len(self._preds) >= ACCURACY_WINDOW // 2:
            rate = sum(self._preds) / len(self._preds)
            self._paused = rate < DEGRADATION_THRESHOLD

    @property
    def updates_allowed(self) -> bool:
        """False while the model is considered degraded."""
        return not self._paused

    @property
    def accuracy(self) -> float:
        """Rolling hit rate; 0.5 (uninformative prior) before any data."""
        return sum(self._preds) / len(self._preds) if self._preds else 0.5
|
||||
|
||||
|
||||
class _BucketModel:
    """Single-bucket logistic regression with buffered online updates.

    Predictions return the 0.5 prior until the first successful fit.
    Online samples accumulate in a buffer and are flushed as a
    warm-started refit once ONLINE_BUFFER_SIZE samples are collected,
    subject to the degradation and label-balance guards.
    """

    def __init__(self, bucket_id: int):
        self.bucket_id = bucket_id
        self.scaler = StandardScaler()
        # Strong L2 (C=0.01): per-bucket data is small and noisy; warm_start
        # lets online flushes refine rather than restart the coefficients.
        self.lr = LogisticRegression(
            C=0.01,
            max_iter=500,
            warm_start=True,
            solver="lbfgs",
            class_weight="balanced",
        )
        self._fitted = False
        self._n_train = 0
        self._online_buf_X: list = []
        self._online_buf_y: list = []
        self._guard = DegradationGuard()

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Full (re)fit; silently skipped when the data cannot support one."""
        if len(X) < ONLINE_MIN_SAMPLES:
            return
        # FIX: LogisticRegression.fit raises ValueError on single-class y.
        # The batch path (ContinuationModelBank.train) has no label-balance
        # guard, so a one-sided bucket previously crashed training.
        if len(np.unique(y)) < 2:
            return
        Xs = self.scaler.fit_transform(X)
        self.lr.fit(Xs, y)
        self._fitted = True
        self._n_train = len(X)

    def predict_proba(self, x: np.ndarray) -> float:
        """Return P(continuation=1) for a single sample (0.5 if unfitted)."""
        if not self._fitted:
            return 0.5
        xs = self.scaler.transform(x.reshape(1, -1))
        return float(self.lr.predict_proba(xs)[0, 1])

    def online_update(self, x: np.ndarray, y: int, p_pred: float = 0.5) -> None:
        """Buffer one labelled sample; flush when the buffer fills."""
        # Anti-degradation: score this prediction; pause learning while the
        # rolling accuracy is collapsed (feedback-loop protection).
        self._guard.record(p_pred, y)
        if not self._guard.updates_allowed:
            return  # model degraded — pause updates until accuracy recovers
        self._online_buf_X.append(x.copy())
        self._online_buf_y.append(y)
        if len(self._online_buf_X) >= ONLINE_BUFFER_SIZE:
            self._flush_online_buffer()

    def _flush_online_buffer(self) -> None:
        """Refit from the buffered samples, then clear the buffer."""
        if not self._online_buf_X:
            return
        X_new = np.array(self._online_buf_X)
        y_new = np.array(self._online_buf_y)
        # Label balance guard: a near-single-class buffer carries no signal
        # diversity and would drag the model toward one answer; discard it.
        pos_rate = y_new.mean()
        if pos_rate < LABEL_BALANCE_MIN or pos_rate > (1.0 - LABEL_BALANCE_MIN):
            self._online_buf_X.clear()
            self._online_buf_y.clear()
            return
        if not self._fitted:
            self.fit(X_new, y_new)
        elif len(np.unique(y_new)) > 1:
            # Warm-started refit on the new batch, scaled with the original
            # scaler so coefficients stay comparable across flushes.
            self.lr.fit(self.scaler.transform(X_new), y_new)
        self._online_buf_X.clear()
        self._online_buf_y.clear()
|
||||
|
||||
|
||||
class ContinuationModelBank:
|
||||
"""Registry of per-bucket models with a global fallback."""
|
||||
|
||||
    def __init__(self):
        # bucket_id → per-bucket model; entries are created lazily for
        # buckets first seen via online_update().
        self._models: dict[int, _BucketModel] = {}
        # Global fallback trained across all buckets; None until train()/load().
        self._global: Optional[_BucketModel] = None
        # Guards _models/_global across trading and update threads.
        self._lock = threading.Lock()
|
||||
|
||||
# ── Training ──────────────────────────────────────────────────────────────
|
||||
|
||||
    def train(self, df: pd.DataFrame) -> None:
        """Fit the global fallback and all per-bucket models from a batch DataFrame.

        Expects columns FEATURE_COLS plus 'bucket_id' and 'continuation'
        (binary label). Rows with NaN in any of those columns are dropped.
        """
        print(f"[ContinuationModelBank] Training on {len(df)} samples, "
              f"{df['bucket_id'].nunique()} buckets")
        df = df.dropna(subset=FEATURE_COLS + ["bucket_id", "continuation"])

        # Global fallback
        X_all = df[FEATURE_COLS].values
        y_all = df["continuation"].values.astype(int)
        self._global = _BucketModel(bucket_id=-1)  # -1 marks the global model
        self._global.fit(X_all, y_all)
        print(f" Global model: n={len(X_all)}, "
              f"pos_rate={y_all.mean():.2f}")

        # Per-bucket
        for bid, grp in df.groupby("bucket_id"):
            X = grp[FEATURE_COLS].values
            y = grp["continuation"].values.astype(int)
            m = _BucketModel(bucket_id=int(bid))
            m.fit(X, y)  # no-op when the bucket has too few samples
            self._models[int(bid)] = m
            print(f" Bucket {bid:2d}: n={len(X):6d}, pos_rate={y.mean():.2f}, "
                  f"fitted={m._fitted}")

        print(f"[ContinuationModelBank] Training complete: "
              f"{sum(m._fitted for m in self._models.values())}/{len(self._models)} buckets fitted")
|
||||
|
||||
# ── Inference ─────────────────────────────────────────────────────────────
|
||||
|
||||
def predict(
|
||||
self,
|
||||
mae_norm: float,
|
||||
mfe_norm: float,
|
||||
tau_norm: float,
|
||||
ret_1: float = 0.0,
|
||||
ret_3: float = 0.0,
|
||||
vel_div_entry: float = 0.0,
|
||||
vel_div_now: float = 0.0,
|
||||
spread_bps: float = 0.0,
|
||||
depth_usd: float = 0.0,
|
||||
fill_prob: float = 0.9,
|
||||
exf_fng: float = 0.0,
|
||||
exf_fng_delta: float = 0.0,
|
||||
exf_funding_btc: float = 0.0,
|
||||
exf_dvol_btc: float = 0.0,
|
||||
exf_chg24_btc: float = 0.0,
|
||||
bucket_id: int = 0,
|
||||
) -> float:
|
||||
"""Return P(continuation | state). Fallback to global if bucket missing."""
|
||||
x = np.array([mae_norm, mfe_norm, tau_norm, ret_1, ret_3,
|
||||
vel_div_entry, vel_div_now,
|
||||
spread_bps, depth_usd, fill_prob,
|
||||
exf_fng, exf_fng_delta, exf_funding_btc, exf_dvol_btc, exf_chg24_btc],
|
||||
dtype=float)
|
||||
with self._lock:
|
||||
m = self._models.get(bucket_id)
|
||||
if m is not None and m._fitted:
|
||||
return m.predict_proba(x)
|
||||
if self._global is not None and self._global._fitted:
|
||||
return self._global.predict_proba(x)
|
||||
return 0.5
|
||||
|
||||
# ── Online update ─────────────────────────────────────────────────────────
|
||||
|
||||
def online_update(
    self,
    bucket_id: int,
    mae_norm: float,
    mfe_norm: float,
    tau_norm: float,
    ret_1: float,
    ret_3: float,
    vel_div_entry: float = 0.0,
    vel_div_now: float = 0.0,
    spread_bps: float = 0.0,
    depth_usd: float = 0.0,
    fill_prob: float = 0.9,
    exf_fng: float = 0.0,
    exf_fng_delta: float = 0.0,
    exf_funding_btc: float = 0.0,
    exf_dvol_btc: float = 0.0,
    exf_chg24_btc: float = 0.0,
    continuation: int = 0,
    exit_reason: str = "",
    p_pred: float = 0.5,
) -> None:
    """Feed one realised trade outcome back into the bucket and global models.

    Forced/regime exits (any exit_reason not in NATURAL_EXIT_REASONS,
    e.g. HIBERNATE_HALT) are skipped: they don't reflect continuation
    dynamics and would bias the model.
    """
    if exit_reason and exit_reason not in NATURAL_EXIT_REASONS:
        return
    feature_vec = np.asarray(
        [mae_norm, mfe_norm, tau_norm, ret_1, ret_3,
         vel_div_entry, vel_div_now,
         spread_bps, depth_usd, fill_prob,
         exf_fng, exf_fng_delta, exf_funding_btc, exf_dvol_btc, exf_chg24_btc],
        dtype=float,
    )
    with self._lock:
        # Lazily create a model for a bucket we haven't seen before.
        bucket_model = self._models.get(bucket_id)
        if bucket_model is None:
            bucket_model = _BucketModel(bucket_id)
            self._models[bucket_id] = bucket_model
        bucket_model.online_update(feature_vec, continuation, p_pred)
        if self._global is not None:
            self._global.online_update(feature_vec, continuation, p_pred)
|
||||
|
||||
# ── Persistence ───────────────────────────────────────────────────────────
|
||||
|
||||
def __getstate__(self):
|
||||
state = self.__dict__.copy()
|
||||
del state["_lock"]
|
||||
return state
|
||||
|
||||
def __setstate__(self, state):
|
||||
self.__dict__.update(state)
|
||||
self._lock = threading.Lock()
|
||||
|
||||
def save(self, path: str = _MODEL_PATH) -> None:
    """Pickle the whole bank to *path*, creating parent directories as needed."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as fh:
        pickle.dump(self, fh)
    print(f"[ContinuationModelBank] Saved → {path}")
|
||||
|
||||
@classmethod
def load(cls, path: str = _MODEL_PATH) -> "ContinuationModelBank":
    """Unpickle a previously saved bank.

    Raises FileNotFoundError when no trained artifact exists at *path*.
    """
    if os.path.exists(path):
        with open(path, "rb") as fh:
            return pickle.load(fh)
    raise FileNotFoundError(f"No trained model at {path} — run train.py first")
|
||||
|
||||
def summary(self) -> dict:
    """Return a small diagnostic dict describing training coverage."""
    fitted_count = sum(1 for model in self._models.values() if model._fitted)
    global_model = self._global
    return {
        "n_buckets": len(self._models),
        "fitted_buckets": fitted_count,
        "global_fitted": global_model._fitted if global_model else False,
        "n_train_global": global_model._n_train if global_model else 0,
    }
|
||||
356
adaptive_exit/data_pipeline.py
Executable file
356
adaptive_exit/data_pipeline.py
Executable file
@@ -0,0 +1,356 @@
|
||||
"""
|
||||
MAE/MFE training data generator.
|
||||
|
||||
Simulates SHORT entries from raw price data (vel_div < VEL_DIV_THRESHOLD),
|
||||
extracts state vectors, and labels continuation probability.
|
||||
|
||||
Data sources:
|
||||
1. VBT parquet cache -- 48 assets, 512s cadence, 56-day gold window
|
||||
2. ClickHouse obf_universe -- 542 symbols, static OBF features
|
||||
|
||||
ExF features joined by date from NPZ backfill:
|
||||
exf_fng, exf_fng_delta, exf_funding_btc, exf_dvol_btc, exf_chg24_btc
|
||||
|
||||
Output columns:
|
||||
mae_norm, mfe_norm, tau_norm, bucket_id,
|
||||
spread_bps, depth_usd, fill_prob,
|
||||
ret_1, ret_3,
|
||||
exf_fng, exf_fng_delta, exf_funding_btc, exf_dvol_btc, exf_chg24_btc,
|
||||
continuation
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import urllib.request
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Root of the daily ExF NPZ backfill (one sub-directory per date).
_EXF_NPZ_BASE = "/mnt/dolphin_training/data/eigenvalues"
# NPZ field names and their normalisation divisors
_EXF_FIELDS = {
    "fng": 100.0,
    "fng_prev": 100.0,  # only for delta computation
    "funding_btc": 1.0,
    "dvol_btc": 100.0,
    "chg24_btc": 100.0,
}

# Trade-simulation parameters.
VEL_DIV_THRESHOLD = -0.02  # SHORT-entry vel_div threshold (see module docstring)
MAX_HOLD = 120             # maximum bars a simulated trade is held
HORIZON = 8                # look-ahead bars used to label continuation
ATR_WINDOW = 20            # rolling window (bars) for the volatility estimate
MIN_ATR = 1e-6             # floor to avoid division by zero when normalising MAE/MFE

# ClickHouse HTTP endpoint + credentials for static OBF features.
_CH_URL = "http://localhost:8123/"
_CH_HEADERS = {"X-ClickHouse-User": "dolphin", "X-ClickHouse-Key": "dolphin_ch_2026"}
# VBT parquet cache directory (per module docstring: 48 assets, 512s cadence).
_VBT_DIR = "/mnt/dolphinng5_predict/vbt_cache"
|
||||
|
||||
|
||||
def _ch_query(sql: str, timeout: int = 60) -> list[dict]:
    """Run *sql* against ClickHouse over HTTP and return rows as dicts.

    ``FORMAT JSONEachRow`` is appended to the query, so each non-empty
    line of the response body is one JSON object.

    Args:
        sql: the ClickHouse SQL statement (without a FORMAT clause).
        timeout: socket timeout in seconds.

    Returns:
        One dict per result row.

    Raises:
        urllib.error.URLError / HTTPError on connection or query failure.
    """
    body = (sql + "\nFORMAT JSONEachRow").encode()
    req = urllib.request.Request(_CH_URL, data=body, method="POST")
    for k, v in _CH_HEADERS.items():
        req.add_header(k, v)
    # Context manager ensures the HTTP response/connection is closed even
    # when reading fails (the original left the response unclosed).
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        payload = resp.read().decode()
    return [json.loads(line) for line in payload.strip().split("\n") if line]
|
||||
|
||||
|
||||
# ── ExF NPZ index ─────────────────────────────────────────────────────────────
|
||||
|
||||
def _load_exf_index(npz_base: str = _EXF_NPZ_BASE) -> dict:
    """Load daily ExF NPZ files into {date_str: {field: normalised_float_or_None}}.

    Each date directory is expected to contain scan_000001__Indicators.npz
    with parallel arrays api_names / api_indicators / api_success. Fields
    whose success flag is False map to None (filled later by
    _fill_exf_medians). Corrupt/partial days are skipped silently.
    """
    index: dict = {}
    if not os.path.isdir(npz_base):
        return index
    for date_dir in sorted(os.listdir(npz_base)):
        npz_path = os.path.join(npz_base, date_dir, "scan_000001__Indicators.npz")
        if not os.path.exists(npz_path):
            continue
        try:
            archive = np.load(npz_path, allow_pickle=True)
            by_name = {
                name: (float(value), bool(ok))
                for name, value, ok in zip(
                    archive["api_names"],
                    archive["api_indicators"],
                    archive["api_success"],
                )
            }
            row = {}
            for field, divisor in _EXF_FIELDS.items():
                raw, ok = by_name.get(field, (0.0, False))
                row[field] = (raw / divisor) if ok else None
            index[date_dir] = row
        except Exception:
            # Unreadable NPZ — drop the whole day.
            continue
    return index
|
||||
|
||||
|
||||
def _fill_exf_medians(index: dict) -> dict:
    """Replace None values with the cross-day median for each field (in place)."""
    from statistics import median

    for field in _EXF_FIELDS:
        observed = [day[field] for day in index.values() if day.get(field) is not None]
        fallback = median(observed) if observed else 0.0
        for day in index.values():
            if day[field] is None:
                day[field] = fallback
    return index
|
||||
|
||||
|
||||
def _exf_features_for_date(date_str: str, index: dict) -> dict:
|
||||
"""Return 5 ExF model features for a date; falls back to nearest prior day."""
|
||||
if date_str in index:
|
||||
row = index[date_str]
|
||||
else:
|
||||
prior = [d for d in sorted(index.keys()) if d <= date_str]
|
||||
row = index[prior[-1]] if prior else {}
|
||||
fng = row.get("fng", 0.0) or 0.0
|
||||
fng_prev = row.get("fng_prev", fng) or fng
|
||||
return {
|
||||
"exf_fng": fng,
|
||||
"exf_fng_delta": fng - fng_prev,
|
||||
"exf_funding_btc": row.get("funding_btc", 0.0) or 0.0,
|
||||
"exf_dvol_btc": row.get("dvol_btc", 0.0) or 0.0,
|
||||
"exf_chg24_btc": row.get("chg24_btc", 0.0) or 0.0,
|
||||
}
|
||||
|
||||
|
||||
# ── VBT parquet source ────────────────────────────────────────────────────────
|
||||
|
||||
def _load_vbt(vbt_dir: str = _VBT_DIR) -> tuple[pd.DataFrame, list[str]]:
    """Load all VBT parquets, return (df, price_cols).

    Any column not in the known metadata/indicator set is treated as an
    asset price series. Rows are concatenated across files and sorted by
    timestamp (parsed as UTC).
    """
    meta_cols = {
        "timestamp", "scan_number", "v50_lambda_max_velocity", "v150_lambda_max_velocity",
        "v300_lambda_max_velocity", "v750_lambda_max_velocity", "vel_div",
        "instability_50", "instability_150",
    }
    parquet_names = sorted(name for name in os.listdir(vbt_dir) if name.endswith(".parquet"))
    frames = [pd.read_parquet(os.path.join(vbt_dir, name)) for name in parquet_names]
    df = pd.concat(frames, ignore_index=True)
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
    df = df.sort_values("timestamp").reset_index(drop=True)
    price_cols = [col for col in df.columns if col not in meta_cols]
    return df, price_cols
|
||||
|
||||
|
||||
# ── Core trade simulation ─────────────────────────────────────────────────────
|
||||
|
||||
def _simulate_trades_on_series(
    prices: np.ndarray,
    vel_div: Optional[np.ndarray],
    asset: str,
    bucket_id: int,
    obf_row: Optional[dict] = None,
    timestamps: Optional[np.ndarray] = None,
    max_samples: int = 50_000,
) -> list[dict]:
    """
    Simulate SHORT entries on a price series and return training samples.

    Entry bars are pre-selected (ceil(max_samples / MAX_HOLD) bars drawn uniformly
    at random from all valid candidates) so that peak memory is bounded to
    ~max_samples dicts regardless of series length. Full k-trajectories are kept
    for each selected entry, preserving excursion-path structure.

    timestamps: parallel array for ExF date lookup (None = no date recorded).
    """
    n = len(prices)
    samples = []
    # Need room for ATR warm-up, the full hold window, and the label horizon.
    if n < MAX_HOLD + HORIZON + ATR_WINDOW:
        return samples

    # Log returns; prices floored at 1e-12 so log() never sees zero/negative.
    log_ret = np.diff(np.log(np.maximum(prices, 1e-12)))
    # ATR-like volatility scale: rolling std of log returns over ATR_WINDOW,
    # expanding window for the first ATR_WINDOW bars.
    atr_arr = np.array([
        np.std(log_ret[max(0, i - ATR_WINDOW):i]) if i >= ATR_WINDOW else np.std(log_ret[:i + 1])
        for i in range(len(log_ret))
    ])

    # Static order-book features for this asset; defaults when OBF is absent.
    obf_spread = float(obf_row["spread_bps"]) if obf_row else 0.0
    obf_depth = float(obf_row["depth_usd"]) if obf_row else 0.0
    obf_fill = float(obf_row["fill_prob"]) if obf_row else 0.9

    # Pre-select entry bars to bound peak memory.
    # Each entry t generates ≤MAX_HOLD samples; select enough to reach max_samples.
    # Sorted to preserve temporal order (not required by model, but aids debugging).
    # NOTE(review): np.random.choice is unseeded — runs are not reproducible.
    candidate_ts = np.arange(ATR_WINDOW, n - MAX_HOLD - HORIZON)
    target_entries = max(1, int(np.ceil(max_samples / MAX_HOLD)))
    if len(candidate_ts) > target_entries:
        selected_ts = np.sort(
            np.random.choice(candidate_ts, target_entries, replace=False)
        )
    else:
        selected_ts = candidate_ts

    for t in selected_ts:
        # Universal sampling: vel_div_entry is a feature, not a filter.
        # BLUE inference always queries with vel_div < -0.02, naturally selecting
        # the well-conditioned region of the learned surface.
        vde = float(vel_div[t]) if vel_div is not None else 0.0
        entry = prices[t]
        atr = max(atr_arr[t], MIN_ATR)  # floor avoids div-by-zero in normalisation
        date_str = str(timestamps[t])[:10] if timestamps is not None else None

        mae = 0.0  # max adverse excursion so far (price moved against the SHORT)
        mfe = 0.0  # max favourable excursion so far (price moved with the SHORT)

        for k in range(1, MAX_HOLD + 1):
            if t + k >= n:
                break
            cur = prices[t + k]
            # SHORT PnL fraction of entry: positive when price fell below entry.
            delta = (entry - cur) / entry

            mae = max(mae, max(0.0, -delta))
            mfe = max(mfe, max(0.0, delta))

            # Label: is the SHORT still profitable HORIZON bars past bar k?
            future_t = t + k + HORIZON
            if future_t >= n:
                break
            future_delta = (entry - prices[future_t]) / entry
            continuation = 1 if future_delta > 0.0 else 0

            # Short-horizon momentum features at the current bar.
            ret_1 = log_ret[t + k - 1] if t + k - 1 < len(log_ret) else 0.0
            ret_3 = np.mean(log_ret[max(0, t + k - 3):t + k]) if k >= 3 else ret_1
            vdn = float(vel_div[t + k]) if vel_div is not None and t + k < len(vel_div) else vde

            # One training sample per (entry, hold-bar k): MAE/MFE are
            # normalised by the entry-time ATR, tau by the max hold.
            samples.append({
                "asset": asset,
                "bucket_id": bucket_id,
                "mae_norm": mae / atr,
                "mfe_norm": mfe / atr,
                "tau_norm": k / MAX_HOLD,
                "atr": atr,
                "vel_div_entry": vde,
                "vel_div_now": vdn,
                "spread_bps": obf_spread,
                "depth_usd": obf_depth,
                "fill_prob": obf_fill,
                "ret_1": ret_1,
                "ret_3": ret_3,
                "continuation": continuation,
                "_date": date_str,  # dropped after the ExF join
            })

    return samples
|
||||
|
||||
|
||||
# ── Public API ────────────────────────────────────────────────────────────────
|
||||
|
||||
def build_training_data(
    bucket_assignments: dict,
    vbt_dir: str = _VBT_DIR,
    use_obf_ch: bool = True,
    max_samples_per_asset: int = 50_000,
) -> pd.DataFrame:
    """Build full training DataFrame from all available price data.

    Sources: (1) VBT parquet cache, (2) NG7 eigenvalue JSON files. Static
    OBF features are optionally fetched from ClickHouse; ExF features are
    joined per sample by date from the NPZ backfill.

    Args:
        bucket_assignments: {asset_symbol: bucket_id}; unknown assets get 0.
        vbt_dir: VBT parquet cache directory.
        use_obf_ch: fetch per-asset OBF medians from ClickHouse (best-effort).
        max_samples_per_asset: cap passed to the trade simulator.
    """
    all_samples = []

    # Static OBF features per asset
    obf_static: dict[str, dict] = {}
    if use_obf_ch:
        # Best-effort: a dead ClickHouse must not block training.
        try:
            rows = _ch_query("""
                SELECT symbol,
                       median(spread_bps) AS spread_bps,
                       median(depth_1pct_usd) AS depth_usd,
                       median(fill_probability) AS fill_prob
                FROM dolphin.obf_universe
                GROUP BY symbol
            """, timeout=60)
            obf_static = {r["symbol"]: r for r in rows}
            print(f"[DataPipeline] OBF static features: {len(obf_static)} assets")
        except Exception as e:
            print(f"[DataPipeline] OBF unavailable ({e})")

    # ExF NPZ index
    print("[DataPipeline] Loading ExF NPZ index...")
    exf_index = _load_exf_index()
    if exf_index:
        # Fill per-field gaps with cross-day medians before joining.
        exf_index = _fill_exf_medians(exf_index)
        print(f" ExF: {len(exf_index)} days ({min(exf_index)} -> {max(exf_index)})")
    else:
        print(" ExF: unavailable, columns will be zero")

    # SOURCE 1: VBT parquet cache
    print("[DataPipeline] Loading VBT parquet cache...")
    df_vbt, price_cols = _load_vbt(vbt_dir)
    vel_div_arr = df_vbt["vel_div"].values if "vel_div" in df_vbt.columns else None
    ts_arr = df_vbt["timestamp"].values if "timestamp" in df_vbt.columns else None

    for asset in price_cols:
        prices = df_vbt[asset].values.astype(float)
        bid = bucket_assignments.get(asset, 0)
        obf = obf_static.get(asset)
        samps = _simulate_trades_on_series(
            prices, vel_div_arr, asset, bid, obf, ts_arr,
            max_samples=max_samples_per_asset,
        )
        all_samples.extend(samps)
        print(f" VBT {asset}: {len(samps)} samples -> bucket {bid}")

    # SOURCE 2: NG7 eigenvalue JSON price data (no timestamps -> no ExF join)
    eigen_dir = "/mnt/ng6_data/eigenvalues"
    if os.path.isdir(eigen_dir):
        print("[DataPipeline] Scanning NG7 eigenvalue JSON files...")
        samps = _load_from_eigenvalue_json(eigen_dir, bucket_assignments, obf_static, max_samples_per_asset)
        all_samples.extend(samps)
        print(f" NG7 eigen: {len(samps)} samples total")

    print(f"[DataPipeline] Total samples: {len(all_samples)}")
    df = pd.DataFrame(all_samples)

    # Join ExF features by date
    if exf_index and "_date" in df.columns:
        print("[DataPipeline] Joining ExF features by date...")
        # Resolve each unique date once, then broadcast via map().
        unique_dates = df["_date"].dropna().unique()
        exf_map = {d: _exf_features_for_date(d, exf_index) for d in unique_dates}
        exf_df = df["_date"].map(exf_map).apply(pd.Series)
        df = pd.concat([df, exf_df], axis=1)
        print(f" ExF join: {exf_df.notna().all(axis=1).mean():.1%} rows covered")
    else:
        # No ExF data at all — zero-fill so downstream feature columns exist.
        for col in ["exf_fng", "exf_fng_delta", "exf_funding_btc", "exf_dvol_btc", "exf_chg24_btc"]:
            df[col] = 0.0

    # _date was only needed for the join; drop it from the training set.
    df = df.drop(columns=["_date"], errors="ignore")
    return df
|
||||
|
||||
|
||||
def _load_from_eigenvalue_json(
    eigen_dir: str,
    bucket_assignments: dict,
    obf_static: dict,
    max_per_asset: int,
) -> list[dict]:
    """Extract price series from NG7 eigenvalue JSON files.

    Reads every third scan_*.json per day directory, accumulates a price
    series per symbol, then simulates SHORT trades on each series long
    enough for the simulator. No timestamps are recorded for these
    samples (so ExF features are not joined downstream).
    """
    import glob

    series_by_symbol: dict[str, list[float]] = {}
    for day in sorted(os.listdir(eigen_dir)):
        day_dir = os.path.join(eigen_dir, day)
        if not os.path.isdir(day_dir):
            continue
        scan_paths = sorted(glob.glob(os.path.join(day_dir, "scan_*.json")))
        for scan_path in scan_paths[::3]:
            try:
                with open(scan_path) as fh:
                    payload = json.load(fh)
            except Exception:
                continue
            # Prices may sit at top level or under "result", possibly JSON-encoded.
            prices_blob = payload.get("asset_prices_json") or payload.get("result", {}).get("asset_prices_json")
            if not prices_blob:
                continue
            if isinstance(prices_blob, str):
                try:
                    prices_blob = json.loads(prices_blob)
                except Exception:
                    continue
            for symbol, price in prices_blob.items():
                series_by_symbol.setdefault(symbol, []).append(float(price))

    samples: list[dict] = []
    min_length = MAX_HOLD + HORIZON + ATR_WINDOW
    for symbol, series in series_by_symbol.items():
        if len(series) < min_length:
            continue
        samples.extend(
            _simulate_trades_on_series(
                np.array(series, dtype=float),
                None,
                symbol,
                bucket_assignments.get(symbol, 0),
                obf_static.get(symbol),
                max_samples=max_per_asset,
            )
        )
    return samples
|
||||
BIN
adaptive_exit/models/bucket_assignments.pkl
Executable file
BIN
adaptive_exit/models/bucket_assignments.pkl
Executable file
Binary file not shown.
BIN
adaptive_exit/models/continuation_models.pkl
Executable file
BIN
adaptive_exit/models/continuation_models.pkl
Executable file
Binary file not shown.
BIN
adaptive_exit/models/training_data.parquet
Executable file
BIN
adaptive_exit/models/training_data.parquet
Executable file
Binary file not shown.
75
adaptive_exit/train.py
Executable file
75
adaptive_exit/train.py
Executable file
@@ -0,0 +1,75 @@
|
||||
"""
|
||||
Offline training script.
|
||||
|
||||
Run once to build bucket assignments and train continuation models:
|
||||
|
||||
cd /mnt/dolphinng5_predict
|
||||
siloqy-env python adaptive_exit/train.py
|
||||
|
||||
Artifacts written:
|
||||
adaptive_exit/models/bucket_assignments.pkl
|
||||
adaptive_exit/models/continuation_models.pkl
|
||||
adaptive_exit/models/training_data.parquet (optional, for audit)
|
||||
"""
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, "/mnt/dolphinng5_predict")
|
||||
|
||||
from adaptive_exit.bucket_engine import build_buckets
|
||||
from adaptive_exit.continuation_model import ContinuationModelBank
|
||||
from adaptive_exit.data_pipeline import build_training_data
|
||||
|
||||
# Artifact locations: everything lands under adaptive_exit/models/ next to this script.
_MODELS_DIR = os.path.join(os.path.dirname(__file__), "models")
_TRAIN_DATA_PATH = os.path.join(_MODELS_DIR, "training_data.parquet")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: bucket assets, generate training data, train models."""
    ap = argparse.ArgumentParser(description="Train adaptive exit models")
    ap.add_argument("--k", type=int, default=None,
                    help="Force bucket count (default: auto)")
    ap.add_argument("--save-data", action="store_true",
                    help="Save training parquet for audit")
    ap.add_argument("--force-rebuild", action="store_true",
                    help="Rebuild buckets even if cached")
    ap.add_argument("--vbt-dir", default="/mnt/dolphinng5_predict/vbt_cache",
                    help="VBT parquet dir for training data generation")
    ap.add_argument("--klines-dir", default="/mnt/dolphin_training/data/vbt_cache_klines",
                    help="1m klines dir for asset bucketing")
    args = ap.parse_args()

    os.makedirs(_MODELS_DIR, exist_ok=True)

    # Step 1 — cluster assets into behaviour buckets.
    print("\n=== STEP 1: Asset Bucketing ===")
    bucket_data = build_buckets(
        klines_dir=args.klines_dir,
        k_override=args.k,
        force_rebuild=args.force_rebuild,
    )
    print(f"Buckets: {bucket_data['n_buckets']} | Assets: {len(bucket_data['assignments'])}")

    # Step 2 — simulate trades on price series and label continuation outcomes.
    print("\n=== STEP 2: Generate MAE/MFE Training Data ===")
    df = build_training_data(
        bucket_assignments=bucket_data["assignments"],
        vbt_dir=args.vbt_dir,
        use_obf_ch=False,  # OBF is live-only (13 days); zero-fill training, bolt on at Phase 2
    )
    print(f"Training data shape: {df.shape}")
    print(f"Bucket distribution:\n{df.groupby('bucket_id').size().describe()}")
    print(f"Continuation rate: {df['continuation'].mean():.3f}")

    if args.save_data:
        df.to_parquet(_TRAIN_DATA_PATH)
        print(f"Training data saved → {_TRAIN_DATA_PATH}")

    # Step 3 — fit per-bucket + global continuation models and persist.
    print("\n=== STEP 3: Train Continuation Models ===")
    bank = ContinuationModelBank()
    bank.train(df)
    bank.save()
    print(f"\nModel summary: {bank.summary()}")
    print("\nDone.")
|
||||
|
||||
|
||||
# Script entry point (per module docstring: `siloqy-env python adaptive_exit/train.py`).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user