initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree

Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
hjnormey
2026-04-21 16:58:38 +02:00
commit 01c19662cb
643 changed files with 260241 additions and 0 deletions

182
adaptive_exit/bucket_engine.py Executable file
View File

@@ -0,0 +1,182 @@
"""
Asset bucketing engine.
Clusters assets into N buckets using price-based characteristics computed
from 1m klines historical data (5-year window):
- vol_daily_pct : annualised daily return volatility
- corr_btc : correlation of returns with BTC
- log_price : log of median close price (price tier proxy)
- vov : vol-of-vol (instability of vol regime)
OBF (spread, depth, imbalance) is NOT used here — it covers only ~21 days
and would overfit to a tiny recent window. OBF is overlay-phase only.
"""
import os
import pickle
from typing import Optional
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
# Pickle cache of the clustering result, colocated with this module.
_BUCKET_PATH = os.path.join(os.path.dirname(__file__), "models", "bucket_assignments.pkl")
# Default location of the 1m klines parquet cache on the training volume.
_DEFAULT_KLINES_DIR = "/mnt/dolphin_training/data/vbt_cache_klines"
# Sample every Nth file to keep memory manageable (1711 files × 1440 rows = 2.5M rows/asset)
_SAMPLE_STRIDE = 30 # ~57 monthly samples from 5yr history
def _load_klines_features(klines_dir: str) -> pd.DataFrame:
    """
    Load sampled 1m klines parquets and compute per-asset characteristics.

    Every _SAMPLE_STRIDE-th parquet file in *klines_dir* is read (files
    that fail to parse are skipped best-effort) and concatenated before
    per-symbol statistics are computed.

    Returns DataFrame indexed by symbol with columns:
        vol_daily_pct, corr_btc, log_price, btc_relevance, vov

    Raises RuntimeError when the directory holds no parquets, none of the
    sampled files load, or no BTC price column is present (corr_btc
    would be undefined).
    """
    files = sorted(f for f in os.listdir(klines_dir) if f.endswith(".parquet"))
    if not files:
        raise RuntimeError(f"No parquet files in {klines_dir}")
    sampled = files[::_SAMPLE_STRIDE]
    print(f" Klines: {len(files)} files, sampling every {_SAMPLE_STRIDE}th → {len(sampled)} files")
    dfs = []
    for fn in sampled:
        try:
            df = pd.read_parquet(os.path.join(klines_dir, fn))
            dfs.append(df)
        except Exception:
            # Best-effort: one corrupt/partial parquet must not abort the
            # whole feature build; total failure is caught below.
            continue
    if not dfs:
        raise RuntimeError("Failed to load any klines parquets")
    combined = pd.concat(dfs, ignore_index=True)
    # Price columns are bare symbol names; exclude known metadata columns
    meta_cols = {"timestamp", "open_time", "close_time", "date", "scan_number",
                 "v50_lambda_max_velocity", "v150_lambda_max_velocity",
                 "v300_lambda_max_velocity", "v750_lambda_max_velocity",
                 "vel_div", "instability_50", "instability_150"}
    price_cols = [c for c in combined.columns if c not in meta_cols]
    # If OHLCV multi-level columns, extract close
    if any("_close" in c.lower() for c in price_cols):
        price_cols = [c for c in price_cols if "_close" in c.lower()]
        # e.g. "btcusdt_close" -> "BTCUSDT"
        sym_map = {c: c.lower().replace("_close", "").upper() for c in price_cols}
    else:
        sym_map = {c: c for c in price_cols} # already bare symbol names
    prices = combined[price_cols].rename(columns=sym_map).astype(float)
    # Ensure BTC present for correlation
    # NOTE(review): substring match — any column containing "BTC" (not
    # necessarily BTCUSDT) satisfies this; confirm column naming upstream.
    btc_col = next((c for c in prices.columns if "BTC" in c.upper()), None)
    if btc_col is None:
        raise RuntimeError("BTCUSDT not found in klines — cannot compute corr_btc")
    # fill_method=None: gaps are not forward-filled; the subsequent dropna()
    # removes every row where at least one symbol lacks a return.
    # NOTE(review): symbols listed later than others shrink the common
    # window for ALL symbols — verify this is intended.
    rets = prices.pct_change(fill_method=None).dropna()
    btc_rets = rets[btc_col]
    records = []
    for sym in prices.columns:
        r = rets[sym].dropna()
        if len(r) < 100:
            # Too little history for stable vol/corr estimates — skip.
            continue
        # Daily vol proxy: std of 1m returns × sqrt(1440) (1440 bars/day) × sqrt(252)
        vol_daily = r.std() * np.sqrt(1440 * 252)
        corr_btc = r.corr(btc_rets)
        # log1p of the median close — price-tier proxy, robust to outliers.
        log_price = np.log1p(prices[sym].median())
        # Vol-of-vol: rolling 60-bar std, then std of that series
        rolling_vol = r.rolling(60).std().dropna()
        # Normalised by mean vol; 1e-9 guards against division by zero.
        vov = rolling_vol.std() / (rolling_vol.mean() + 1e-9)
        # A NaN correlation (degenerate series) defaults to a neutral 0.5
        # instead of dropping the asset.
        corr_val = float(corr_btc) if not np.isnan(corr_btc) else 0.5
        records.append({
            "symbol": sym,
            "vol_daily_pct": vol_daily * 100,
            "corr_btc": corr_val,
            "log_price": log_price,
            "btc_relevance": corr_val * log_price, # market-significance proxy
            "vov": vov,
        })
    df = pd.DataFrame(records).set_index("symbol")
    # Drop assets whose features went inf/NaN (e.g. zero-variance series).
    df = df.replace([np.inf, -np.inf], np.nan).dropna()
    print(f" Computed characteristics for {len(df)} assets")
    return df
def find_optimal_k(X_scaled: np.ndarray, k_min: int = 4, k_max: int = 12) -> int:
    """
    Silhouette search for the best cluster count k in [k_min, k_max].

    Parameters:
        X_scaled: standardised feature matrix (n_assets × n_features).
        k_min, k_max: inclusive search bounds; k is additionally capped
            below len(X_scaled) because KMeans and silhouette_score
            require k < n_samples.

    Returns:
        The k with the highest silhouette score; k_min when the search
        range is empty (too few samples, or k_max < k_min).
    """
    best_k, best_sil = k_min, -1.0
    for k in range(k_min, min(k_max + 1, len(X_scaled))):
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(X_scaled)
        # Pin random_state: sample_size subsampling without it is
        # non-deterministic, so repeated runs could pick different k —
        # inconsistent with the fixed seeds used everywhere else here.
        sil = silhouette_score(
            X_scaled, labels,
            sample_size=min(500, len(X_scaled)),
            random_state=42,
        )
        if sil > best_sil:
            best_sil, best_k = sil, k
    return best_k
def build_buckets(
    klines_dir: str = _DEFAULT_KLINES_DIR,
    k_override: Optional[int] = None,
    force_rebuild: bool = False,
) -> dict:
    """
    Build or load bucket assignments from 1m klines price characteristics.

    A cached result at _BUCKET_PATH is returned as-is unless
    *force_rebuild* is set. Otherwise per-asset characteristics are
    computed, standardised, clustered with KMeans (k from *k_override*
    or silhouette search), and the result is pickled back to _BUCKET_PATH.

    Returns dict:
        - 'assignments': {symbol: bucket_id}
        - 'n_buckets': int
        - 'model': fitted KMeans
        - 'scaler': fitted StandardScaler
        - 'features': DataFrame of per-asset characteristics

    Raises RuntimeError when fewer than 4 assets have usable features.
    """
    if not force_rebuild and os.path.exists(_BUCKET_PATH):
        # NOTE(review): pickle.load on the cache — acceptable only because
        # this file is written locally by this module below.
        with open(_BUCKET_PATH, "rb") as f:
            return pickle.load(f)
    print(f"[BucketEngine] Computing price characteristics from {klines_dir} ...")
    feat = _load_klines_features(klines_dir)
    if len(feat) < 4:
        raise RuntimeError(f"Only {len(feat)} assets — need at least 4 for bucketing")
    feature_cols = ["vol_daily_pct", "corr_btc", "log_price", "btc_relevance", "vov"]
    X = feat[feature_cols].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    if k_override is not None:
        k = k_override
    else:
        # Cap k so buckets average >= ~4 assets each.
        # NOTE(review): with 4–15 assets k_max < 4, the silhouette search
        # range is empty and find_optimal_k falls back to k_min=4.
        k_max = min(12, len(feat) // 4)
        print(f" Searching optimal k in [4, {k_max}] for {len(feat)} assets...")
        k = find_optimal_k(X_scaled, k_min=4, k_max=k_max)
    print(f" Fitting KMeans k={k}...")
    # n_init=20 (vs 10 during the search) for a more stable final fit.
    km = KMeans(n_clusters=k, random_state=42, n_init=20)
    labels = km.fit_predict(X_scaled)
    assignments = {sym: int(lbl) for sym, lbl in zip(feat.index, labels)}
    result = {
        "assignments": assignments,
        "n_buckets": k,
        "model": km,
        "scaler": scaler,
        "features": feat,
    }
    os.makedirs(os.path.dirname(_BUCKET_PATH), exist_ok=True)
    with open(_BUCKET_PATH, "wb") as f:
        pickle.dump(result, f)
    print(f" Saved bucket assignments: {k} buckets, {len(assignments)} assets → {_BUCKET_PATH}")
    return result
def get_bucket(symbol: str, bucket_data: dict, fallback: int = 0) -> int:
    """
    Look up the bucket ID assigned to *symbol*.

    *bucket_data* is the dict produced by build_buckets(); *fallback* is
    returned for symbols with no assignment (e.g. newly listed assets).
    """
    assignments = bucket_data["assignments"]
    if symbol in assignments:
        return assignments[symbol]
    return fallback