""" Monte Carlo ML Envelope Learning ================================ Train ML models on MC results for envelope boundary estimation and forewarning. Models: - Regression models for ROI, DD, PF, WR prediction - Classification models for champion_region, catastrophic - One-Class SVM for envelope boundary estimation - SHAP for feature importance Reference: MONTE_CARLO_SYSTEM_ENVELOPE_SPEC.md Section 9, 12 """ import json import pickle from typing import Dict, List, Optional, Any, Tuple from pathlib import Path from dataclasses import dataclass import numpy as np # Try to import ML libraries try: from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier from sklearn.svm import OneClassSVM from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score SKLEARN_AVAILABLE = True except ImportError: SKLEARN_AVAILABLE = False print("[WARN] scikit-learn not available - ML training disabled") try: import xgboost as xgb XGBOOST_AVAILABLE = True except ImportError: XGBOOST_AVAILABLE = False try: import shap SHAP_AVAILABLE = True except ImportError: SHAP_AVAILABLE = False from .mc_sampler import MCTrialConfig, MCSampler from .mc_store import MCStore @dataclass class ForewarningReport: """Forewarning report for a configuration.""" config: Dict[str, Any] predicted_roi: float predicted_roi_p10: float predicted_roi_p90: float predicted_max_dd: float champion_probability: float catastrophic_probability: float envelope_score: float warnings: List[str] nearest_champion: Optional[Dict[str, Any]] parameter_risks: Dict[str, float] def to_dict(self) -> Dict[str, Any]: """Convert to dictionary.""" return { 'config': self.config, 'predicted_roi': self.predicted_roi, 'predicted_roi_p10': self.predicted_roi_p10, 'predicted_roi_p90': self.predicted_roi_p90, 'predicted_max_dd': self.predicted_max_dd, 'champion_probability': self.champion_probability, 'catastrophic_probability': self.catastrophic_probability, 'envelope_score': self.envelope_score, 'warnings': self.warnings, 'nearest_champion': self.nearest_champion, 'parameter_risks': self.parameter_risks, } class MCML: """ Monte Carlo ML Envelope Learning. Trains models on MC results and provides forewarning capabilities. """ def __init__( self, output_dir: str = "mc_results", models_dir: Optional[str] = None ): """ Initialize ML trainer. Parameters ---------- output_dir : str MC results directory models_dir : str, optional Directory to save trained models """ self.output_dir = Path(output_dir) self.models_dir = Path(models_dir) if models_dir else self.output_dir / "models" self.models_dir.mkdir(parents=True, exist_ok=True) self.store = MCStore(output_dir=output_dir) # Models self.models: Dict[str, Any] = {} self.scalers: Dict[str, StandardScaler] = {} self.feature_names: List[str] = [] self._init_feature_names() def _init_feature_names(self): """Initialize feature names from parameter space.""" sampler = MCSampler() self.feature_names = list(sampler.CHAMPION.keys()) def load_corpus(self) -> Optional[Any]: """Load full corpus from store.""" return self.store.load_corpus() def train_all_models(self, test_size: float = 0.2) -> Dict[str, Any]: """ Train all ML models on the corpus. Parameters ---------- test_size : float Fraction of data for testing Returns ------- Dict[str, Any] Training results and metrics """ if not SKLEARN_AVAILABLE: raise RuntimeError("scikit-learn required for training") print("="*70) print("TRAINING ML MODELS") print("="*70) # Load corpus print("\n[1/6] Loading corpus...") df = self.load_corpus() if df is None or len(df) == 0: raise ValueError("No corpus data available") print(f" Loaded {len(df)} trials") # Prepare features print("\n[2/6] Preparing features...") X = self._extract_features(df) # Train regression models print("\n[3/6] Training regression models...") self._train_regression_model(X, df, 'M_roi_pct', 'model_roi') self._train_regression_model(X, df, 'M_max_drawdown_pct', 'model_dd') self._train_regression_model(X, df, 'M_profit_factor', 'model_pf') self._train_regression_model(X, df, 'M_win_rate', 'model_wr') # Train classification models print("\n[4/6] Training classification models...") self._train_classification_model(X, df, 'L_champion_region', 'model_champ') self._train_classification_model(X, df, 'L_catastrophic', 'model_catas') self._train_classification_model(X, df, 'L_inert', 'model_inert') self._train_classification_model(X, df, 'L_h2_degradation', 'model_h2deg') # Train envelope model (One-Class SVM on champions) print("\n[5/6] Training envelope boundary model...") self._train_envelope_model(X, df) # Save models print("\n[6/6] Saving models...") self._save_models() print("\n[OK] All models trained and saved") return {'status': 'success', 'n_samples': len(df)} def _extract_features(self, df: Any) -> np.ndarray: """Extract feature matrix from DataFrame.""" # Get parameter columns param_cols = [f'P_{name}' for name in self.feature_names if f'P_{name}' in df.columns] # Extract and normalize X = df[param_cols].values # Standardize scaler = StandardScaler() X_scaled = scaler.fit_transform(X) self.scalers['default'] = scaler return X_scaled def _train_regression_model( self, X: np.ndarray, df: Any, target_col: str, model_name: str ): """Train a regression model.""" if target_col not in df.columns: print(f" [SKIP] {model_name}: target column not found") return y = df[target_col].values # Split X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 ) # Train model = GradientBoostingRegressor( n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42 ) model.fit(X_train, y_train) # Evaluate train_score = model.score(X_train, y_train) test_score = model.score(X_test, y_test) print(f" {model_name}: R² train={train_score:.3f}, test={test_score:.3f}") self.models[model_name] = model def _train_classification_model( self, X: np.ndarray, df: Any, target_col: str, model_name: str ): """Train a classification model.""" if target_col not in df.columns: print(f" [SKIP] {model_name}: target column not found") return y = df[target_col].astype(int).values # Check if we have both classes if len(set(y)) < 2: print(f" [SKIP] {model_name}: only one class present") return # Split X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y ) # Train with XGBoost if available, else RandomForest if XGBOOST_AVAILABLE: model = xgb.XGBClassifier( n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42, use_label_encoder=False, eval_metric='logloss' ) else: model = RandomForestClassifier( n_estimators=100, max_depth=5, random_state=42 ) model.fit(X_train, y_train) # Evaluate y_pred = model.predict(X_test) acc = accuracy_score(y_test, y_pred) print(f" {model_name}: accuracy={acc:.3f}") self.models[model_name] = model def _train_envelope_model(self, X: np.ndarray, df: Any): """Train One-Class SVM on champion region configurations.""" if 'L_champion_region' not in df.columns: print(" [SKIP] envelope: champion_region column not found") return # Filter to champions champion_mask = df['L_champion_region'].astype(bool) X_champions = X[champion_mask] if len(X_champions) < 100: print(f" [SKIP] envelope: only {len(X_champions)} champions (need 100+)") return print(f" Training on {len(X_champions)} champion configurations") # Train One-Class SVM model = OneClassSVM(kernel='rbf', nu=0.05, gamma='scale') model.fit(X_champions) self.models['envelope'] = model print(f" Envelope model trained") def _save_models(self): """Save all trained models.""" # Save models for name, model in self.models.items(): path = self.models_dir / f"{name}.pkl" with open(path, 'wb') as f: pickle.dump(model, f) # Save scalers for name, scaler in self.scalers.items(): path = self.models_dir / f"scaler_{name}.pkl" with open(path, 'wb') as f: pickle.dump(scaler, f) # Save feature names with open(self.models_dir / "feature_names.json", 'w') as f: json.dump(self.feature_names, f) print(f" Saved {len(self.models)} models to {self.models_dir}") def load_models(self): """Load trained models from disk.""" # Load feature names with open(self.models_dir / "feature_names.json", 'r') as f: self.feature_names = json.load(f) # Load models — skip any that fail (e.g. XGBoost pickle when xgboost not installed) model_files = list(self.models_dir.glob("*.pkl")) for path in model_files: if 'scaler_' in path.name: continue try: with open(path, 'rb') as f: self.models[path.stem] = pickle.load(f) except Exception as e: print(f" [WARN] Skipping {path.name}: {e}") # Load scalers for path in self.models_dir.glob("scaler_*.pkl"): name = path.stem.replace('scaler_', '') try: with open(path, 'rb') as f: self.scalers[name] = pickle.load(f) except Exception as e: print(f" [WARN] Skipping scaler {path.name}: {e}") loaded = list(self.models.keys()) print(f"[OK] Loaded {len(loaded)} models: {loaded}") def predict(self, config: MCTrialConfig) -> Dict[str, float]: """ Make predictions for a configuration. Parameters ---------- config : MCTrialConfig Configuration to predict Returns ------- Dict[str, float] Predictions for all targets """ if not self.models: self.load_models() # Extract features X = self._config_to_features(config) predictions = {} # Regression predictions if 'model_roi' in self.models: predictions['roi'] = self.models['model_roi'].predict(X)[0] if 'model_dd' in self.models: predictions['max_dd'] = self.models['model_dd'].predict(X)[0] if 'model_pf' in self.models: predictions['profit_factor'] = self.models['model_pf'].predict(X)[0] if 'model_wr' in self.models: predictions['win_rate'] = self.models['model_wr'].predict(X)[0] # Classification predictions (probability of positive class) if 'model_champ' in self.models: if hasattr(self.models['model_champ'], 'predict_proba'): predictions['champion_prob'] = self.models['model_champ'].predict_proba(X)[0, 1] else: predictions['champion_prob'] = float(self.models['model_champ'].predict(X)[0]) if 'model_catas' in self.models: if hasattr(self.models['model_catas'], 'predict_proba'): predictions['catastrophic_prob'] = self.models['model_catas'].predict_proba(X)[0, 1] else: predictions['catastrophic_prob'] = float(self.models['model_catas'].predict(X)[0]) # Envelope score if 'envelope' in self.models: predictions['envelope_score'] = self.models['envelope'].decision_function(X)[0] return predictions def _config_to_features(self, config: MCTrialConfig) -> np.ndarray: """Convert config to feature vector.""" features = [] for name in self.feature_names: value = getattr(config, name, MCSampler.CHAMPION[name]) features.append(value) X = np.array([features]) # Scale if 'default' in self.scalers: X = self.scalers['default'].transform(X) return X class DolphinForewarner: """ Live forewarning system for Dolphin configurations. Provides risk assessment based on trained MC envelope model. """ def __init__(self, models_dir: str = "mc_results/models"): """ Initialize forewarner. Parameters ---------- models_dir : str Directory with trained models """ self.ml = MCML(models_dir=models_dir) self.ml.load_models() def assess(self, config: MCTrialConfig) -> ForewarningReport: """ Assess a configuration and return forewarning report. Parameters ---------- config : MCTrialConfig Configuration to assess Returns ------- ForewarningReport Complete risk assessment """ # Get predictions preds = self.ml.predict(config) # Build warnings warnings = [] if preds.get('catastrophic_prob', 0) > 0.10: warnings.append(f"Catastrophic risk: {preds['catastrophic_prob']:.1%}") if preds.get('envelope_score', 0) < 0: warnings.append("Configuration outside safe operating envelope") # Check parameter boundaries if config.max_leverage > 6.0: warnings.append(f"High leverage: {config.max_leverage:.1f}x") if config.fraction * config.max_leverage > 1.5: warnings.append(f"High notional exposure: {config.fraction * config.max_leverage:.2f}x") # Create report report = ForewarningReport( config=config.to_dict(), predicted_roi=preds.get('roi', 0), predicted_roi_p10=preds.get('roi', 0) * 0.5, # Simplified predicted_roi_p90=preds.get('roi', 0) * 1.5, predicted_max_dd=preds.get('max_dd', 0), champion_probability=preds.get('champion_prob', 0), catastrophic_probability=preds.get('catastrophic_prob', 0), envelope_score=preds.get('envelope_score', 0), warnings=warnings, nearest_champion=None, # Would require search parameter_risks={} ) return report def assess_config_dict(self, config_dict: Dict[str, Any]) -> ForewarningReport: """Assess from a configuration dictionary.""" config = MCTrialConfig.from_dict(config_dict) return self.assess(config) if __name__ == "__main__": # Test print("MC ML module loaded") print("Run training with: MCML().train_all_models()")