"""
Monte Carlo ML Envelope Learning
================================

Train ML models on MC results for envelope boundary estimation and forewarning.

Models:
- Regression models for ROI, DD, PF, WR prediction
- Classification models for champion_region, catastrophic
- One-Class SVM for envelope boundary estimation
- SHAP for feature importance

Reference: MONTE_CARLO_SYSTEM_ENVELOPE_SPEC.md Section 9, 12
"""

# Standard library
import json
import pickle
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Any, Tuple

# Third-party (hard requirement)
import numpy as np

# Optional ML libraries — degrade gracefully when absent so the module can
# still be imported (training/prediction paths check the flags below).
try:
    from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
    from sklearn.svm import OneClassSVM
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False
    print("[WARN] scikit-learn not available - ML training disabled")

try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False

try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False

# Project-local
from .mc_sampler import MCTrialConfig, MCSampler
from .mc_store import MCStore

@dataclass
class ForewarningReport:
    """Forewarning report for a configuration.

    Bundles the model predictions, envelope placement and human-readable
    warnings produced for one candidate configuration.
    """

    config: Dict[str, Any]                      # configuration as a plain dict
    predicted_roi: float                        # point estimate of ROI
    predicted_roi_p10: float                    # pessimistic ROI estimate
    predicted_roi_p90: float                    # optimistic ROI estimate
    predicted_max_dd: float                     # predicted max drawdown
    champion_probability: float                 # P(config is in champion region)
    catastrophic_probability: float             # P(catastrophic outcome)
    envelope_score: float                       # One-Class SVM decision value
    warnings: List[str]                         # human-readable risk warnings
    nearest_champion: Optional[Dict[str, Any]]  # closest known-good config, if any
    parameter_risks: Dict[str, float]           # per-parameter risk scores

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this report to a plain dictionary (field name -> value)."""
        field_order = (
            'config',
            'predicted_roi',
            'predicted_roi_p10',
            'predicted_roi_p90',
            'predicted_max_dd',
            'champion_probability',
            'catastrophic_probability',
            'envelope_score',
            'warnings',
            'nearest_champion',
            'parameter_risks',
        )
        return {name: getattr(self, name) for name in field_order}

class MCML:
    """
    Monte Carlo ML Envelope Learning.

    Trains models on MC results and provides forewarning capabilities.
    """

    def __init__(
        self,
        output_dir: str = "mc_results",
        models_dir: Optional[str] = None
    ):
        """
        Initialize ML trainer.

        Parameters
        ----------
        output_dir : str
            MC results directory
        models_dir : str, optional
            Directory to save trained models (defaults to <output_dir>/models)
        """
        self.output_dir = Path(output_dir)
        self.models_dir = Path(models_dir) if models_dir else self.output_dir / "models"
        self.models_dir.mkdir(parents=True, exist_ok=True)

        self.store = MCStore(output_dir=output_dir)

        # Trained estimators keyed by name ('model_roi', ..., 'envelope').
        self.models: Dict[str, Any] = {}
        # FIX: annotation kept as a string on purpose. Annotations on complex
        # targets (self.x) are evaluated at runtime, and StandardScaler is
        # undefined when scikit-learn is missing (SKLEARN_AVAILABLE = False),
        # which would make __init__ raise NameError.
        self.scalers: "Dict[str, StandardScaler]" = {}
        self.feature_names: List[str] = []

        self._init_feature_names()

    def _init_feature_names(self):
        """Initialize feature names from the sampler's champion parameter space."""
        sampler = MCSampler()
        self.feature_names = list(sampler.CHAMPION.keys())

    def load_corpus(self) -> Optional[Any]:
        """Load full corpus (trial table) from store."""
        return self.store.load_corpus()

    def train_all_models(self, test_size: float = 0.2) -> Dict[str, Any]:
        """
        Train all ML models on the corpus.

        Parameters
        ----------
        test_size : float
            Fraction of data for testing. FIX: this value is now forwarded to
            every train/test split — previously the argument was ignored and
            0.2 was hard-coded in the per-model trainers.

        Returns
        -------
        Dict[str, Any]
            Training results and metrics

        Raises
        ------
        RuntimeError
            If scikit-learn is not installed.
        ValueError
            If no corpus data is available.
        """
        if not SKLEARN_AVAILABLE:
            raise RuntimeError("scikit-learn required for training")

        print("="*70)
        print("TRAINING ML MODELS")
        print("="*70)

        # Load corpus
        print("\n[1/6] Loading corpus...")
        df = self.load_corpus()
        if df is None or len(df) == 0:
            raise ValueError("No corpus data available")

        print(f" Loaded {len(df)} trials")

        # Prepare features
        print("\n[2/6] Preparing features...")
        X = self._extract_features(df)

        # Train regression models
        print("\n[3/6] Training regression models...")
        self._train_regression_model(X, df, 'M_roi_pct', 'model_roi', test_size)
        self._train_regression_model(X, df, 'M_max_drawdown_pct', 'model_dd', test_size)
        self._train_regression_model(X, df, 'M_profit_factor', 'model_pf', test_size)
        self._train_regression_model(X, df, 'M_win_rate', 'model_wr', test_size)

        # Train classification models
        print("\n[4/6] Training classification models...")
        self._train_classification_model(X, df, 'L_champion_region', 'model_champ', test_size)
        self._train_classification_model(X, df, 'L_catastrophic', 'model_catas', test_size)
        self._train_classification_model(X, df, 'L_inert', 'model_inert', test_size)
        self._train_classification_model(X, df, 'L_h2_degradation', 'model_h2deg', test_size)

        # Train envelope model (One-Class SVM on champions)
        print("\n[5/6] Training envelope boundary model...")
        self._train_envelope_model(X, df)

        # Save models
        print("\n[6/6] Saving models...")
        self._save_models()

        print("\n[OK] All models trained and saved")

        return {'status': 'success', 'n_samples': len(df)}

    def _extract_features(self, df: Any) -> np.ndarray:
        """Extract the standardized feature matrix from the corpus DataFrame.

        Fits a fresh StandardScaler and stores it under 'default' so that
        prediction-time feature vectors use the same scaling.
        """
        # Parameter columns are prefixed 'P_'; keep only those present in df.
        param_cols = [f'P_{name}' for name in self.feature_names if f'P_{name}' in df.columns]

        X = df[param_cols].values

        # Standardize to zero mean / unit variance per feature.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        self.scalers['default'] = scaler

        return X_scaled

    def _train_regression_model(
        self,
        X: np.ndarray,
        df: Any,
        target_col: str,
        model_name: str,
        test_size: float = 0.2
    ):
        """Train a gradient-boosting regressor for one target column.

        Parameters
        ----------
        X : np.ndarray
            Standardized feature matrix (rows aligned with df).
        df : Any
            Corpus DataFrame providing the target column.
        target_col : str
            Name of the regression target column.
        model_name : str
            Key under which the fitted model is stored in self.models.
        test_size : float
            Held-out fraction for the evaluation split.
        """
        if target_col not in df.columns:
            print(f" [SKIP] {model_name}: target column not found")
            return

        y = df[target_col].values

        # Split (fixed seed for reproducible evaluation)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        # Train
        model = GradientBoostingRegressor(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=42
        )
        model.fit(X_train, y_train)

        # Evaluate: R^2 on train and held-out test
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)

        print(f" {model_name}: R² train={train_score:.3f}, test={test_score:.3f}")

        self.models[model_name] = model

    def _train_classification_model(
        self,
        X: np.ndarray,
        df: Any,
        target_col: str,
        model_name: str,
        test_size: float = 0.2
    ):
        """Train a binary classifier (XGBoost if available, else RandomForest).

        Skips silently (with a message) when the target column is missing or
        contains only one class.
        """
        if target_col not in df.columns:
            print(f" [SKIP] {model_name}: target column not found")
            return

        y = df[target_col].astype(int).values

        # A classifier needs at least two classes to fit.
        if len(set(y)) < 2:
            print(f" [SKIP] {model_name}: only one class present")
            return

        # Stratified split preserves class balance in both partitions.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        if XGBOOST_AVAILABLE:
            # NOTE: use_label_encoder is deprecated/ignored by xgboost >= 1.7;
            # kept for backward compatibility with 1.x installs.
            model = xgb.XGBClassifier(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                random_state=42,
                use_label_encoder=False,
                eval_metric='logloss'
            )
        else:
            model = RandomForestClassifier(
                n_estimators=100,
                max_depth=5,
                random_state=42
            )

        model.fit(X_train, y_train)

        # Evaluate on the held-out split
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

        print(f" {model_name}: accuracy={acc:.3f}")

        self.models[model_name] = model

    def _train_envelope_model(self, X: np.ndarray, df: Any):
        """Train a One-Class SVM on champion-region configurations.

        The fitted model (stored as 'envelope') scores how close a config
        lies to the region of known-good configurations.
        """
        if 'L_champion_region' not in df.columns:
            print(" [SKIP] envelope: champion_region column not found")
            return

        # Filter to champion rows only; X rows are aligned with df rows.
        champion_mask = df['L_champion_region'].astype(bool)
        X_champions = X[champion_mask]

        # Require a minimum sample so the boundary estimate is meaningful.
        if len(X_champions) < 100:
            print(f" [SKIP] envelope: only {len(X_champions)} champions (need 100+)")
            return

        print(f" Training on {len(X_champions)} champion configurations")

        # nu=0.05: allow ~5% of champions to fall outside the learned boundary.
        model = OneClassSVM(kernel='rbf', nu=0.05, gamma='scale')
        model.fit(X_champions)

        self.models['envelope'] = model
        print(" Envelope model trained")

    def _save_models(self):
        """Persist all trained models, scalers and feature names to models_dir."""
        # Models: one pickle per estimator.
        for name, model in self.models.items():
            path = self.models_dir / f"{name}.pkl"
            with open(path, 'wb') as f:
                pickle.dump(model, f)

        # Scalers: pickled with a 'scaler_' prefix so load_models can tell
        # them apart from estimators.
        for name, scaler in self.scalers.items():
            path = self.models_dir / f"scaler_{name}.pkl"
            with open(path, 'wb') as f:
                pickle.dump(scaler, f)

        # Feature names as JSON (human-readable, stdlib-loadable).
        with open(self.models_dir / "feature_names.json", 'w') as f:
            json.dump(self.feature_names, f)

        print(f" Saved {len(self.models)} models to {self.models_dir}")

    def load_models(self):
        """Load trained models and scalers from disk.

        SECURITY NOTE: pickle.load can execute arbitrary code from the file;
        only load model files from a trusted models_dir.
        """
        # Feature names first — predictions depend on their order.
        with open(self.models_dir / "feature_names.json", 'r') as f:
            self.feature_names = json.load(f)

        # Load models — skip any that fail (e.g. XGBoost pickle when xgboost not installed)
        model_files = list(self.models_dir.glob("*.pkl"))
        for path in model_files:
            if 'scaler_' in path.name:
                continue
            try:
                with open(path, 'rb') as f:
                    self.models[path.stem] = pickle.load(f)
            except Exception as e:
                print(f" [WARN] Skipping {path.name}: {e}")

        # Load scalers (key is the name without the 'scaler_' prefix)
        for path in self.models_dir.glob("scaler_*.pkl"):
            name = path.stem.replace('scaler_', '')
            try:
                with open(path, 'rb') as f:
                    self.scalers[name] = pickle.load(f)
            except Exception as e:
                print(f" [WARN] Skipping scaler {path.name}: {e}")

        loaded = list(self.models.keys())
        print(f"[OK] Loaded {len(loaded)} models: {loaded}")

    def predict(self, config: "MCTrialConfig") -> Dict[str, float]:
        """
        Make predictions for a configuration.

        Parameters
        ----------
        config : MCTrialConfig
            Configuration to predict

        Returns
        -------
        Dict[str, float]
            Predictions keyed by target ('roi', 'max_dd', 'profit_factor',
            'win_rate', 'champion_prob', 'catastrophic_prob',
            'envelope_score'); only keys for loaded models are present.
        """
        # Lazy-load models on first use.
        if not self.models:
            self.load_models()

        # Extract (scaled) feature vector
        X = self._config_to_features(config)

        predictions = {}

        # Regression predictions
        if 'model_roi' in self.models:
            predictions['roi'] = self.models['model_roi'].predict(X)[0]
        if 'model_dd' in self.models:
            predictions['max_dd'] = self.models['model_dd'].predict(X)[0]
        if 'model_pf' in self.models:
            predictions['profit_factor'] = self.models['model_pf'].predict(X)[0]
        if 'model_wr' in self.models:
            predictions['win_rate'] = self.models['model_wr'].predict(X)[0]

        # Classification predictions (probability of positive class when the
        # loaded model supports predict_proba; hard 0/1 label otherwise)
        if 'model_champ' in self.models:
            if hasattr(self.models['model_champ'], 'predict_proba'):
                predictions['champion_prob'] = self.models['model_champ'].predict_proba(X)[0, 1]
            else:
                predictions['champion_prob'] = float(self.models['model_champ'].predict(X)[0])

        if 'model_catas' in self.models:
            if hasattr(self.models['model_catas'], 'predict_proba'):
                predictions['catastrophic_prob'] = self.models['model_catas'].predict_proba(X)[0, 1]
            else:
                predictions['catastrophic_prob'] = float(self.models['model_catas'].predict(X)[0])

        # Envelope score (One-Class SVM decision value; negative = outside)
        if 'envelope' in self.models:
            predictions['envelope_score'] = self.models['envelope'].decision_function(X)[0]

        return predictions

    def _config_to_features(self, config: "MCTrialConfig") -> np.ndarray:
        """Convert a config to a scaled 2D feature row (shape (1, n_features))."""
        features = []
        for name in self.feature_names:
            # Fall back to the champion default for any attribute the config lacks.
            value = getattr(config, name, MCSampler.CHAMPION[name])
            features.append(value)

        X = np.array([features])

        # Apply the training-time scaler so features match the model's space.
        if 'default' in self.scalers:
            X = self.scalers['default'].transform(X)

        return X

class DolphinForewarner:
    """
    Live forewarning system for Dolphin configurations.

    Provides risk assessment based on trained MC envelope model.
    """

    def __init__(self, models_dir: str = "mc_results/models"):
        """
        Initialize forewarner.

        Parameters
        ----------
        models_dir : str
            Directory with trained models
        """
        self.ml = MCML(models_dir=models_dir)
        self.ml.load_models()

    def assess(self, config: MCTrialConfig) -> ForewarningReport:
        """
        Assess a configuration and return forewarning report.

        Parameters
        ----------
        config : MCTrialConfig
            Configuration to assess

        Returns
        -------
        ForewarningReport
            Complete risk assessment
        """
        preds = self.ml.predict(config)

        # Pull out the values used more than once below.
        roi_estimate = preds.get('roi', 0)
        catastrophic = preds.get('catastrophic_prob', 0)
        envelope = preds.get('envelope_score', 0)
        exposure = config.fraction * config.max_leverage

        # Collect human-readable warnings from model outputs and raw limits.
        warnings = []
        if catastrophic > 0.10:
            warnings.append(f"Catastrophic risk: {preds['catastrophic_prob']:.1%}")
        if envelope < 0:
            warnings.append("Configuration outside safe operating envelope")
        if config.max_leverage > 6.0:
            warnings.append(f"High leverage: {config.max_leverage:.1f}x")
        if exposure > 1.5:
            warnings.append(f"High notional exposure: {config.fraction * config.max_leverage:.2f}x")

        # Assemble the report from predictions and warnings.
        return ForewarningReport(
            config=config.to_dict(),
            predicted_roi=roi_estimate,
            predicted_roi_p10=roi_estimate * 0.5,  # Simplified
            predicted_roi_p90=roi_estimate * 1.5,
            predicted_max_dd=preds.get('max_dd', 0),
            champion_probability=preds.get('champion_prob', 0),
            catastrophic_probability=catastrophic,
            envelope_score=envelope,
            warnings=warnings,
            nearest_champion=None,  # Would require search
            parameter_risks={},
        )

    def assess_config_dict(self, config_dict: Dict[str, Any]) -> ForewarningReport:
        """Assess from a configuration dictionary."""
        return self.assess(MCTrialConfig.from_dict(config_dict))

if __name__ == "__main__":
    # Smoke check: confirm the module loads and point at the training entrypoint.
    for message in (
        "MC ML module loaded",
        "Run training with: MCML().train_all_models()",
    ):
        print(message)