""" Asset bucketing engine. Clusters assets into N buckets using price-based characteristics computed from 1m klines historical data (5-year window): - vol_daily_pct : annualised daily return volatility - corr_btc : correlation of returns with BTC - log_price : log of median close price (price tier proxy) - vov : vol-of-vol (instability of vol regime) OBF (spread, depth, imbalance) is NOT used here — it covers only ~21 days and would overfit to a tiny recent window. OBF is overlay-phase only. """ import os import pickle from typing import Optional import numpy as np import pandas as pd from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score from sklearn.preprocessing import StandardScaler _BUCKET_PATH = os.path.join(os.path.dirname(__file__), "models", "bucket_assignments.pkl") _DEFAULT_KLINES_DIR = "/mnt/dolphin_training/data/vbt_cache_klines" # Sample every Nth file to keep memory manageable (1711 files × 1440 rows = 2.5M rows/asset) _SAMPLE_STRIDE = 30 # ~57 monthly samples from 5yr history def _load_klines_features(klines_dir: str) -> pd.DataFrame: """ Load sampled 1m klines parquets and compute per-asset characteristics. Returns DataFrame indexed by symbol with columns: vol_daily_pct, corr_btc, log_price, vov """ files = sorted(f for f in os.listdir(klines_dir) if f.endswith(".parquet")) if not files: raise RuntimeError(f"No parquet files in {klines_dir}") sampled = files[::_SAMPLE_STRIDE] print(f" Klines: {len(files)} files, sampling every {_SAMPLE_STRIDE}th → {len(sampled)} files") dfs = [] for fn in sampled: try: df = pd.read_parquet(os.path.join(klines_dir, fn)) dfs.append(df) except Exception: continue if not dfs: raise RuntimeError("Failed to load any klines parquets") combined = pd.concat(dfs, ignore_index=True) # Price columns are bare symbol names; exclude known metadata columns meta_cols = {"timestamp", "open_time", "close_time", "date", "scan_number", "v50_lambda_max_velocity", "v150_lambda_max_velocity", "v300_lambda_max_velocity", "v750_lambda_max_velocity", "vel_div", "instability_50", "instability_150"} price_cols = [c for c in combined.columns if c not in meta_cols] # If OHLCV multi-level columns, extract close if any("_close" in c.lower() for c in price_cols): price_cols = [c for c in price_cols if "_close" in c.lower()] sym_map = {c: c.lower().replace("_close", "").upper() for c in price_cols} else: sym_map = {c: c for c in price_cols} # already bare symbol names prices = combined[price_cols].rename(columns=sym_map).astype(float) # Ensure BTC present for correlation btc_col = next((c for c in prices.columns if "BTC" in c.upper()), None) if btc_col is None: raise RuntimeError("BTCUSDT not found in klines — cannot compute corr_btc") rets = prices.pct_change(fill_method=None).dropna() btc_rets = rets[btc_col] records = [] for sym in prices.columns: r = rets[sym].dropna() if len(r) < 100: continue # Daily vol proxy: std of 1m returns × sqrt(1440) (1440 bars/day) × sqrt(252) vol_daily = r.std() * np.sqrt(1440 * 252) corr_btc = r.corr(btc_rets) log_price = np.log1p(prices[sym].median()) # Vol-of-vol: rolling 60-bar std, then std of that series rolling_vol = r.rolling(60).std().dropna() vov = rolling_vol.std() / (rolling_vol.mean() + 1e-9) corr_val = float(corr_btc) if not np.isnan(corr_btc) else 0.5 records.append({ "symbol": sym, "vol_daily_pct": vol_daily * 100, "corr_btc": corr_val, "log_price": log_price, "btc_relevance": corr_val * log_price, # market-significance proxy "vov": vov, }) df = pd.DataFrame(records).set_index("symbol") df = 
df.replace([np.inf, -np.inf], np.nan).dropna() print(f" Computed characteristics for {len(df)} assets") return df def find_optimal_k(X_scaled: np.ndarray, k_min: int = 4, k_max: int = 12) -> int: """Silhouette search for best k.""" best_k, best_sil = k_min, -1.0 for k in range(k_min, min(k_max + 1, len(X_scaled))): km = KMeans(n_clusters=k, random_state=42, n_init=10) labels = km.fit_predict(X_scaled) sil = silhouette_score(X_scaled, labels, sample_size=min(500, len(X_scaled))) if sil > best_sil: best_sil, best_k = sil, k return best_k def build_buckets( klines_dir: str = _DEFAULT_KLINES_DIR, k_override: Optional[int] = None, force_rebuild: bool = False, ) -> dict: """ Build or load bucket assignments from 1m klines price characteristics. Returns dict: - 'assignments': {symbol: bucket_id} - 'n_buckets': int - 'model': fitted KMeans - 'scaler': fitted StandardScaler - 'features': DataFrame of per-asset characteristics """ if not force_rebuild and os.path.exists(_BUCKET_PATH): with open(_BUCKET_PATH, "rb") as f: return pickle.load(f) print(f"[BucketEngine] Computing price characteristics from {klines_dir} ...") feat = _load_klines_features(klines_dir) if len(feat) < 4: raise RuntimeError(f"Only {len(feat)} assets — need at least 4 for bucketing") feature_cols = ["vol_daily_pct", "corr_btc", "log_price", "btc_relevance", "vov"] X = feat[feature_cols].values scaler = StandardScaler() X_scaled = scaler.fit_transform(X) if k_override is not None: k = k_override else: k_max = min(12, len(feat) // 4) print(f" Searching optimal k in [4, {k_max}] for {len(feat)} assets...") k = find_optimal_k(X_scaled, k_min=4, k_max=k_max) print(f" Fitting KMeans k={k}...") km = KMeans(n_clusters=k, random_state=42, n_init=20) labels = km.fit_predict(X_scaled) assignments = {sym: int(lbl) for sym, lbl in zip(feat.index, labels)} result = { "assignments": assignments, "n_buckets": k, "model": km, "scaler": scaler, "features": feat, } os.makedirs(os.path.dirname(_BUCKET_PATH), exist_ok=True) with open(_BUCKET_PATH, "wb") as f: pickle.dump(result, f) print(f" Saved bucket assignments: {k} buckets, {len(assignments)} assets → {_BUCKET_PATH}") return result def get_bucket(symbol: str, bucket_data: dict, fallback: int = 0) -> int: """Return bucket ID for a symbol, with fallback for unknowns.""" return bucket_data["assignments"].get(symbol, fallback)
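

# --- Usage sketch ----------------------------------------------------------
# Minimal illustration of how the engine is expected to be driven, assuming
# the default klines directory exists and is readable. The __main__ guard and
# the example symbol "ETHUSDT" are illustrative assumptions, not part of the
# original engine.
if __name__ == "__main__":
    bucket_data = build_buckets(force_rebuild=False)
    print(f"n_buckets = {bucket_data['n_buckets']}")
    # Unknown symbols fall back to bucket 0 via get_bucket's default.
    print("ETHUSDT ->", get_bucket("ETHUSDT", bucket_data))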