initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
182
adaptive_exit/bucket_engine.py
Executable file
182
adaptive_exit/bucket_engine.py
Executable file
@@ -0,0 +1,182 @@
|
||||
"""
|
||||
Asset bucketing engine.
|
||||
|
||||
Clusters assets into N buckets using price-based characteristics computed
|
||||
from 1m klines historical data (5-year window):
|
||||
- vol_daily_pct : annualised daily return volatility
|
||||
- corr_btc : correlation of returns with BTC
|
||||
- log_price : log of median close price (price tier proxy)
|
||||
- vov : vol-of-vol (instability of vol regime)
|
||||
|
||||
OBF (spread, depth, imbalance) is NOT used here — it covers only ~21 days
|
||||
and would overfit to a tiny recent window. OBF is overlay-phase only.
|
||||
"""
|
||||
import os
|
||||
import pickle
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.metrics import silhouette_score
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
_BUCKET_PATH = os.path.join(os.path.dirname(__file__), "models", "bucket_assignments.pkl")
|
||||
_DEFAULT_KLINES_DIR = "/mnt/dolphin_training/data/vbt_cache_klines"
|
||||
|
||||
# Sample every Nth file to keep memory manageable (1711 files × 1440 rows = 2.5M rows/asset)
|
||||
_SAMPLE_STRIDE = 30 # ~57 monthly samples from 5yr history
|
||||
|
||||
|
||||
def _load_klines_features(klines_dir: str) -> pd.DataFrame:
    """
    Load sampled 1m klines parquets and compute per-asset characteristics.

    Parameters
    ----------
    klines_dir : str
        Directory containing 1m klines parquet files.

    Returns
    -------
    pd.DataFrame
        Indexed by symbol with columns:
        vol_daily_pct, corr_btc, log_price, btc_relevance, vov

    Raises
    ------
    RuntimeError
        If the directory holds no parquets, none can be read, no BTC column
        exists (needed for corr_btc), or no asset has enough history.
    """
    files = sorted(f for f in os.listdir(klines_dir) if f.endswith(".parquet"))
    if not files:
        raise RuntimeError(f"No parquet files in {klines_dir}")

    sampled = files[::_SAMPLE_STRIDE]
    print(f" Klines: {len(files)} files, sampling every {_SAMPLE_STRIDE}th → {len(sampled)} files")

    dfs = []
    for fn in sampled:
        try:
            df = pd.read_parquet(os.path.join(klines_dir, fn))
            dfs.append(df)
        except Exception:
            # Best-effort: skip unreadable/corrupt parquets rather than abort.
            continue

    if not dfs:
        raise RuntimeError("Failed to load any klines parquets")

    combined = pd.concat(dfs, ignore_index=True)

    # Price columns are bare symbol names; exclude known metadata columns
    meta_cols = {"timestamp", "open_time", "close_time", "date", "scan_number",
                 "v50_lambda_max_velocity", "v150_lambda_max_velocity",
                 "v300_lambda_max_velocity", "v750_lambda_max_velocity",
                 "vel_div", "instability_50", "instability_150"}
    price_cols = [c for c in combined.columns if c not in meta_cols]

    # If OHLCV multi-level columns, extract close
    if any("_close" in c.lower() for c in price_cols):
        price_cols = [c for c in price_cols if "_close" in c.lower()]
        sym_map = {c: c.lower().replace("_close", "").upper() for c in price_cols}
    else:
        sym_map = {c: c for c in price_cols}  # already bare symbol names
    prices = combined[price_cols].rename(columns=sym_map).astype(float)

    # Ensure BTC present for correlation
    btc_col = next((c for c in prices.columns if "BTC" in c.upper()), None)
    if btc_col is None:
        raise RuntimeError("BTCUSDT not found in klines — cannot compute corr_btc")

    # FIX: do NOT dropna() row-wise across the whole frame here.  Assets list
    # at different dates, so a frame-wide dropna truncates the sample to the
    # youngest asset's listing date — defeating the 5-year window.  Keep NaNs;
    # Series.corr() ignores pairwise-missing observations, and each symbol is
    # dropna'd individually below.
    rets = prices.pct_change(fill_method=None)
    btc_rets = rets[btc_col]

    records = []
    for sym in prices.columns:
        r = rets[sym].dropna()
        if len(r) < 100:
            continue  # too little history for stable statistics
        # Daily vol proxy: std of 1m returns × sqrt(1440 bars/day) × sqrt(252)
        vol_daily = r.std() * np.sqrt(1440 * 252)
        corr_btc = r.corr(btc_rets)
        log_price = np.log1p(prices[sym].median())
        # Vol-of-vol: rolling 60-bar std, then coefficient of variation of
        # that series (instability of the vol regime).
        rolling_vol = r.rolling(60).std().dropna()
        vov = rolling_vol.std() / (rolling_vol.mean() + 1e-9)
        # Neutral 0.5 fallback when correlation is undefined (e.g. zero-var).
        corr_val = float(corr_btc) if not np.isnan(corr_btc) else 0.5
        records.append({
            "symbol": sym,
            "vol_daily_pct": vol_daily * 100,
            "corr_btc": corr_val,
            "log_price": log_price,
            "btc_relevance": corr_val * log_price,  # market-significance proxy
            "vov": vov,
        })

    # Explicit error instead of the opaque KeyError set_index would raise
    # on an empty frame.
    if not records:
        raise RuntimeError("No asset had >=100 return observations")

    df = pd.DataFrame(records).set_index("symbol")
    df = df.replace([np.inf, -np.inf], np.nan).dropna()
    print(f" Computed characteristics for {len(df)} assets")
    return df
|
||||
|
||||
|
||||
def find_optimal_k(X_scaled: np.ndarray, k_min: int = 4, k_max: int = 12) -> int:
    """
    Silhouette search for the best KMeans cluster count.

    Parameters
    ----------
    X_scaled : np.ndarray
        Standardised feature matrix, shape (n_samples, n_features).
    k_min, k_max : int
        Inclusive search bounds.  k is additionally capped below
        n_samples, since silhouette needs at least one point per cluster
        plus a second cluster.

    Returns
    -------
    int
        The k with the highest silhouette score; k_min when the search
        range is empty (too few samples).
    """
    best_k, best_sil = k_min, -1.0
    for k in range(k_min, min(k_max + 1, len(X_scaled))):
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(X_scaled)
        # FIX: pin random_state — sample_size subsampling is otherwise
        # non-deterministic, so the chosen k could vary run-to-run even
        # though KMeans itself is seeded.
        sil = silhouette_score(
            X_scaled, labels,
            sample_size=min(500, len(X_scaled)),
            random_state=42,
        )
        if sil > best_sil:
            best_sil, best_k = sil, k
    return best_k
|
||||
|
||||
|
||||
def build_buckets(
    klines_dir: str = _DEFAULT_KLINES_DIR,
    k_override: Optional[int] = None,
    force_rebuild: bool = False,
) -> dict:
    """
    Build — or load from the on-disk cache — bucket assignments derived
    from 1m klines price characteristics.

    Parameters
    ----------
    klines_dir : str
        Directory of klines parquets to characterise.
    k_override : Optional[int]
        Force this cluster count instead of running the silhouette search.
    force_rebuild : bool
        When True, ignore any cached pickle and recompute from scratch.

    Returns
    -------
    dict with keys:
        'assignments' : {symbol: bucket_id}
        'n_buckets'   : int
        'model'       : fitted KMeans
        'scaler'      : fitted StandardScaler
        'features'    : DataFrame of per-asset characteristics
    """
    # A cached pickle short-circuits everything unless a rebuild is forced.
    if os.path.exists(_BUCKET_PATH) and not force_rebuild:
        with open(_BUCKET_PATH, "rb") as fh:
            return pickle.load(fh)

    print(f"[BucketEngine] Computing price characteristics from {klines_dir} ...")
    feat = _load_klines_features(klines_dir)

    if len(feat) < 4:
        raise RuntimeError(f"Only {len(feat)} assets — need at least 4 for bucketing")

    # Standardise the clustering features so no single scale dominates.
    feature_matrix = feat[["vol_daily_pct", "corr_btc", "log_price",
                           "btc_relevance", "vov"]].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(feature_matrix)

    # Either honour the caller's k, or search for the best one.
    if k_override is None:
        k_max = min(12, len(feat) // 4)
        print(f" Searching optimal k in [4, {k_max}] for {len(feat)} assets...")
        k = find_optimal_k(X_scaled, k_min=4, k_max=k_max)
    else:
        k = k_override

    print(f" Fitting KMeans k={k}...")
    km = KMeans(n_clusters=k, random_state=42, n_init=20)
    labels = km.fit_predict(X_scaled)

    assignments = {sym: int(lbl) for sym, lbl in zip(feat.index, labels)}
    result = {
        "assignments": assignments,
        "n_buckets": k,
        "model": km,
        "scaler": scaler,
        "features": feat,
    }

    # Persist for fast reload on the next call.
    os.makedirs(os.path.dirname(_BUCKET_PATH), exist_ok=True)
    with open(_BUCKET_PATH, "wb") as fh:
        pickle.dump(result, fh)

    print(f" Saved bucket assignments: {k} buckets, {len(assignments)} assets → {_BUCKET_PATH}")
    return result
|
||||
|
||||
|
||||
def get_bucket(symbol: str, bucket_data: dict, fallback: int = 0) -> int:
    """Look up *symbol*'s bucket ID in *bucket_data*; unknown symbols map to *fallback*."""
    assignments = bucket_data["assignments"]
    if symbol in assignments:
        return assignments[symbol]
    return fallback
|
||||
Reference in New Issue
Block a user