initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
505
nautilus_dolphin/mc/mc_ml.py
Executable file
505
nautilus_dolphin/mc/mc_ml.py
Executable file
@@ -0,0 +1,505 @@
|
||||
"""
|
||||
Monte Carlo ML Envelope Learning
|
||||
================================
|
||||
|
||||
Train ML models on MC results for envelope boundary estimation and forewarning.
|
||||
|
||||
Models:
|
||||
- Regression models for ROI, DD, PF, WR prediction
|
||||
- Classification models for champion_region, catastrophic
|
||||
- One-Class SVM for envelope boundary estimation
|
||||
- SHAP for feature importance
|
||||
|
||||
Reference: MONTE_CARLO_SYSTEM_ENVELOPE_SPEC.md Section 9, 12
|
||||
"""
|
||||
|
||||
import json
import pickle
from typing import Dict, List, Optional, Any, Tuple
from pathlib import Path
from dataclasses import dataclass
import numpy as np

# Try to import ML libraries
# scikit-learn is required for training/prediction; the flag lets the module
# still import on hosts that lack it (training paths raise RuntimeError).
try:
    from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
    from sklearn.svm import OneClassSVM
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False
    print("[WARN] scikit-learn not available - ML training disabled")

# xgboost is optional: classification falls back to RandomForest when absent.
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False

# shap is optional; presumably used for SHAP feature importance per the module
# docstring, though no call site is visible in this section — TODO confirm.
try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False

# Project-local: parameter sampler (defines the feature space via CHAMPION)
# and the MC results store (supplies the training corpus).
from .mc_sampler import MCTrialConfig, MCSampler
from .mc_store import MCStore
|
||||
|
||||
@dataclass
class ForewarningReport:
    """Forewarning report for a configuration.

    Bundles the model predictions, envelope score, and human-readable
    warnings produced when assessing a single configuration.
    """
    config: Dict[str, Any]
    predicted_roi: float
    predicted_roi_p10: float
    predicted_roi_p90: float
    predicted_max_dd: float
    champion_probability: float
    catastrophic_probability: float
    envelope_score: float
    warnings: List[str]
    nearest_champion: Optional[Dict[str, Any]]
    parameter_risks: Dict[str, float]

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        # Field order here matches the declaration order above, so the
        # resulting dict serializes in a stable, predictable order.
        field_order = (
            'config',
            'predicted_roi',
            'predicted_roi_p10',
            'predicted_roi_p90',
            'predicted_max_dd',
            'champion_probability',
            'catastrophic_probability',
            'envelope_score',
            'warnings',
            'nearest_champion',
            'parameter_risks',
        )
        return {attr: getattr(self, attr) for attr in field_order}
||||
|
||||
|
||||
class MCML:
    """
    Monte Carlo ML Envelope Learning.

    Trains models on MC results and provides forewarning capabilities.

    Trained artifacts live in ``self.models`` (estimators keyed by name:
    ``model_roi`` / ``model_dd`` / ``model_pf`` / ``model_wr`` regressors,
    ``model_champ`` / ``model_catas`` / ``model_inert`` / ``model_h2deg``
    classifiers, and an ``envelope`` One-Class SVM) and ``self.scalers``
    (fitted ``StandardScaler``s; ``'default'`` covers the parameter features).
    """

    def __init__(
        self,
        output_dir: str = "mc_results",
        models_dir: Optional[str] = None
    ):
        """
        Initialize ML trainer.

        Parameters
        ----------
        output_dir : str
            MC results directory
        models_dir : str, optional
            Directory to save trained models; defaults to
            ``<output_dir>/models``.
        """
        self.output_dir = Path(output_dir)
        self.models_dir = Path(models_dir) if models_dir else self.output_dir / "models"
        self.models_dir.mkdir(parents=True, exist_ok=True)

        self.store = MCStore(output_dir=output_dir)

        # Populated by train_all_models() or load_models().
        self.models: Dict[str, Any] = {}
        self.scalers: Dict[str, StandardScaler] = {}
        self.feature_names: List[str] = []

        self._init_feature_names()

    def _init_feature_names(self):
        """Initialize feature names from the sampler's parameter space."""
        sampler = MCSampler()
        self.feature_names = list(sampler.CHAMPION.keys())

    def load_corpus(self) -> Optional[Any]:
        """Load full corpus from store (DataFrame, or None if unavailable)."""
        return self.store.load_corpus()

    def train_all_models(self, test_size: float = 0.2) -> Dict[str, Any]:
        """
        Train all ML models on the corpus.

        Parameters
        ----------
        test_size : float
            Fraction of data for testing. Fix: this value is now actually
            forwarded to the per-model trainers; previously it was accepted
            but ignored, and 0.2 was always used.

        Returns
        -------
        Dict[str, Any]
            Training results and metrics

        Raises
        ------
        RuntimeError
            If scikit-learn is not installed.
        ValueError
            If the corpus is missing or empty.
        """
        if not SKLEARN_AVAILABLE:
            raise RuntimeError("scikit-learn required for training")

        print("="*70)
        print("TRAINING ML MODELS")
        print("="*70)

        # Load corpus
        print("\n[1/6] Loading corpus...")
        df = self.load_corpus()
        if df is None or len(df) == 0:
            raise ValueError("No corpus data available")

        print(f" Loaded {len(df)} trials")

        # Prepare features (fits the 'default' scaler as a side effect)
        print("\n[2/6] Preparing features...")
        X = self._extract_features(df)

        # Train regression models (ROI / drawdown / profit factor / win rate)
        print("\n[3/6] Training regression models...")
        self._train_regression_model(X, df, 'M_roi_pct', 'model_roi', test_size)
        self._train_regression_model(X, df, 'M_max_drawdown_pct', 'model_dd', test_size)
        self._train_regression_model(X, df, 'M_profit_factor', 'model_pf', test_size)
        self._train_regression_model(X, df, 'M_win_rate', 'model_wr', test_size)

        # Train classification models
        print("\n[4/6] Training classification models...")
        self._train_classification_model(X, df, 'L_champion_region', 'model_champ', test_size)
        self._train_classification_model(X, df, 'L_catastrophic', 'model_catas', test_size)
        self._train_classification_model(X, df, 'L_inert', 'model_inert', test_size)
        self._train_classification_model(X, df, 'L_h2_degradation', 'model_h2deg', test_size)

        # Train envelope model (One-Class SVM on champions)
        print("\n[5/6] Training envelope boundary model...")
        self._train_envelope_model(X, df)

        # Save models
        print("\n[6/6] Saving models...")
        self._save_models()

        print("\n[OK] All models trained and saved")

        return {'status': 'success', 'n_samples': len(df)}

    def _extract_features(self, df: Any) -> np.ndarray:
        """Extract the standardized feature matrix from the corpus DataFrame.

        Fix: ``self.feature_names`` is narrowed to the parameters that are
        actually present as ``P_<name>`` columns. Previously a missing column
        meant the scaler/models were fitted on fewer features than
        ``_config_to_features`` later produced, causing a silent
        train/predict dimension mismatch. The narrowed list is what
        ``_save_models`` persists, keeping prediction consistent.
        """
        # Keep only parameters with a matching P_ column, preserving order.
        used_names = [name for name in self.feature_names if f'P_{name}' in df.columns]
        param_cols = [f'P_{name}' for name in used_names]
        self.feature_names = used_names

        # Extract raw values
        X = df[param_cols].values

        # Standardize and remember the scaler for prediction time
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        self.scalers['default'] = scaler

        return X_scaled

    def _train_regression_model(
        self,
        X: np.ndarray,
        df: Any,
        target_col: str,
        model_name: str,
        test_size: float = 0.2
    ):
        """Train a gradient-boosting regressor for one target column.

        Parameters
        ----------
        X : np.ndarray
            Standardized feature matrix.
        df : Any
            Corpus DataFrame containing ``target_col``.
        target_col : str
            Regression target column.
        model_name : str
            Key under which the model is stored in ``self.models``.
        test_size : float
            Held-out fraction for the train/test split (new parameter,
            defaulting to the previous hard-coded 0.2).
        """
        if target_col not in df.columns:
            print(f" [SKIP] {model_name}: target column not found")
            return

        y = df[target_col].values

        # Split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        # Train
        model = GradientBoostingRegressor(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=42
        )
        model.fit(X_train, y_train)

        # Evaluate: R^2 on both splits, so overfitting is visible in the log
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)

        print(f" {model_name}: R² train={train_score:.3f}, test={test_score:.3f}")

        self.models[model_name] = model

    def _train_classification_model(
        self,
        X: np.ndarray,
        df: Any,
        target_col: str,
        model_name: str,
        test_size: float = 0.2
    ):
        """Train a binary classifier for one label column.

        Uses XGBoost when available, otherwise RandomForest. Skips (with a
        log line) when the column is missing or only one class is present.

        Parameters
        ----------
        X : np.ndarray
            Standardized feature matrix.
        df : Any
            Corpus DataFrame containing ``target_col``.
        target_col : str
            Binary label column (cast to int).
        model_name : str
            Key under which the model is stored in ``self.models``.
        test_size : float
            Held-out fraction for the train/test split (new parameter,
            defaulting to the previous hard-coded 0.2).
        """
        if target_col not in df.columns:
            print(f" [SKIP] {model_name}: target column not found")
            return

        y = df[target_col].astype(int).values

        # Check if we have both classes
        if len(set(y)) < 2:
            print(f" [SKIP] {model_name}: only one class present")
            return

        # Split (stratified so both classes appear in each split)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        # Train with XGBoost if available, else RandomForest
        if XGBOOST_AVAILABLE:
            model = xgb.XGBClassifier(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                random_state=42,
                # NOTE(review): deprecated and removed in newer xgboost;
                # recent versions only warn about the unused parameter.
                use_label_encoder=False,
                eval_metric='logloss'
            )
        else:
            model = RandomForestClassifier(
                n_estimators=100,
                max_depth=5,
                random_state=42
            )

        model.fit(X_train, y_train)

        # Evaluate
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

        print(f" {model_name}: accuracy={acc:.3f}")

        self.models[model_name] = model

    def _train_envelope_model(self, X: np.ndarray, df: Any):
        """Train One-Class SVM on champion region configurations.

        Skips when the champion label is missing or when there are fewer
        than 100 champion rows (too few to outline an envelope).
        """
        if 'L_champion_region' not in df.columns:
            print(" [SKIP] envelope: champion_region column not found")
            return

        # Filter to champions
        champion_mask = df['L_champion_region'].astype(bool)
        X_champions = X[champion_mask]

        if len(X_champions) < 100:
            print(f" [SKIP] envelope: only {len(X_champions)} champions (need 100+)")
            return

        print(f" Training on {len(X_champions)} champion configurations")

        # Train One-Class SVM; nu=0.05 treats ~5% of champions as outliers
        model = OneClassSVM(kernel='rbf', nu=0.05, gamma='scale')
        model.fit(X_champions)

        self.models['envelope'] = model
        print(f" Envelope model trained")

    def _save_models(self):
        """Persist all trained models, scalers, and feature names to disk."""
        # Save models (pickled estimators, one file per model)
        for name, model in self.models.items():
            path = self.models_dir / f"{name}.pkl"
            with open(path, 'wb') as f:
                pickle.dump(model, f)

        # Save scalers
        for name, scaler in self.scalers.items():
            path = self.models_dir / f"scaler_{name}.pkl"
            with open(path, 'wb') as f:
                pickle.dump(scaler, f)

        # Save feature names (must match what the scalers/models were fit on)
        with open(self.models_dir / "feature_names.json", 'w') as f:
            json.dump(self.feature_names, f)

        print(f" Saved {len(self.models)} models to {self.models_dir}")

    def load_models(self):
        """Load trained models from disk.

        Raises FileNotFoundError if feature_names.json is absent; individual
        model/scaler pickles that fail to load are skipped with a warning.
        """
        # Load feature names
        with open(self.models_dir / "feature_names.json", 'r') as f:
            self.feature_names = json.load(f)

        # Load models — skip any that fail (e.g. XGBoost pickle when xgboost not installed)
        model_files = list(self.models_dir.glob("*.pkl"))
        for path in model_files:
            if 'scaler_' in path.name:
                continue
            try:
                with open(path, 'rb') as f:
                    self.models[path.stem] = pickle.load(f)
            except Exception as e:
                print(f" [WARN] Skipping {path.name}: {e}")

        # Load scalers
        for path in self.models_dir.glob("scaler_*.pkl"):
            name = path.stem.replace('scaler_', '')
            try:
                with open(path, 'rb') as f:
                    self.scalers[name] = pickle.load(f)
            except Exception as e:
                print(f" [WARN] Skipping scaler {path.name}: {e}")

        loaded = list(self.models.keys())
        print(f"[OK] Loaded {len(loaded)} models: {loaded}")

    def predict(self, config: MCTrialConfig) -> Dict[str, float]:
        """
        Make predictions for a configuration.

        Parameters
        ----------
        config : MCTrialConfig
            Configuration to predict

        Returns
        -------
        Dict[str, float]
            Predictions for all targets. Fix: values are wrapped in
            ``float()`` so the dict honors the declared ``Dict[str, float]``
            contract (estimators return NumPy scalar types) and is
            JSON-serializable as-is.
        """
        if not self.models:
            self.load_models()

        # Extract (scaled) feature row for this configuration
        X = self._config_to_features(config)

        predictions = {}

        # Regression predictions
        if 'model_roi' in self.models:
            predictions['roi'] = float(self.models['model_roi'].predict(X)[0])
        if 'model_dd' in self.models:
            predictions['max_dd'] = float(self.models['model_dd'].predict(X)[0])
        if 'model_pf' in self.models:
            predictions['profit_factor'] = float(self.models['model_pf'].predict(X)[0])
        if 'model_wr' in self.models:
            predictions['win_rate'] = float(self.models['model_wr'].predict(X)[0])

        # Classification predictions (probability of positive class when the
        # estimator supports predict_proba, else the hard 0/1 prediction)
        if 'model_champ' in self.models:
            if hasattr(self.models['model_champ'], 'predict_proba'):
                predictions['champion_prob'] = float(self.models['model_champ'].predict_proba(X)[0, 1])
            else:
                predictions['champion_prob'] = float(self.models['model_champ'].predict(X)[0])

        if 'model_catas' in self.models:
            if hasattr(self.models['model_catas'], 'predict_proba'):
                predictions['catastrophic_prob'] = float(self.models['model_catas'].predict_proba(X)[0, 1])
            else:
                predictions['catastrophic_prob'] = float(self.models['model_catas'].predict(X)[0])

        # Envelope score: negative means outside the learned champion envelope
        if 'envelope' in self.models:
            predictions['envelope_score'] = float(self.models['envelope'].decision_function(X)[0])

        return predictions

    def _config_to_features(self, config: MCTrialConfig) -> np.ndarray:
        """Convert a config to a scaled 1-row feature matrix.

        Missing attributes fall back to the sampler's CHAMPION defaults so a
        partially-specified config is assessed against the champion baseline.
        """
        features = []
        for name in self.feature_names:
            value = getattr(config, name, MCSampler.CHAMPION[name])
            features.append(value)

        X = np.array([features])

        # Scale with the training-time scaler, when available
        if 'default' in self.scalers:
            X = self.scalers['default'].transform(X)

        return X
|
||||
|
||||
class DolphinForewarner:
    """
    Live forewarning system for Dolphin configurations.

    Provides risk assessment based on trained MC envelope model.
    """

    def __init__(self, models_dir: str = "mc_results/models"):
        """
        Initialize forewarner.

        Parameters
        ----------
        models_dir : str
            Directory with trained models
        """
        self.ml = MCML(models_dir=models_dir)
        self.ml.load_models()

    def assess(self, config: MCTrialConfig) -> ForewarningReport:
        """
        Assess a configuration and return forewarning report.

        Parameters
        ----------
        config : MCTrialConfig
            Configuration to assess

        Returns
        -------
        ForewarningReport
            Complete risk assessment
        """
        # Get predictions
        preds = self.ml.predict(config)

        # Build warnings
        warnings = []

        if preds.get('catastrophic_prob', 0) > 0.10:
            warnings.append(f"Catastrophic risk: {preds['catastrophic_prob']:.1%}")

        # Negative decision-function value = outside the learned envelope
        if preds.get('envelope_score', 0) < 0:
            warnings.append("Configuration outside safe operating envelope")

        # Check parameter boundaries
        if config.max_leverage > 6.0:
            warnings.append(f"High leverage: {config.max_leverage:.1f}x")

        if config.fraction * config.max_leverage > 1.5:
            warnings.append(f"High notional exposure: {config.fraction * config.max_leverage:.2f}x")

        # Simplified ±50% ROI band. Fix: for a negative predicted ROI,
        # roi*0.5 > roi*1.5, which previously produced an inverted band
        # (p10 above p90) — order the endpoints explicitly.
        roi = preds.get('roi', 0)
        roi_p10 = min(roi * 0.5, roi * 1.5)
        roi_p90 = max(roi * 0.5, roi * 1.5)

        # Create report
        report = ForewarningReport(
            config=config.to_dict(),
            predicted_roi=roi,
            predicted_roi_p10=roi_p10,
            predicted_roi_p90=roi_p90,
            predicted_max_dd=preds.get('max_dd', 0),
            champion_probability=preds.get('champion_prob', 0),
            catastrophic_probability=preds.get('catastrophic_prob', 0),
            envelope_score=preds.get('envelope_score', 0),
            warnings=warnings,
            nearest_champion=None,  # Would require a nearest-neighbor search
            parameter_risks={}      # Placeholder: per-parameter risk not yet computed
        )

        return report

    def assess_config_dict(self, config_dict: Dict[str, Any]) -> ForewarningReport:
        """Assess from a configuration dictionary."""
        config = MCTrialConfig.from_dict(config_dict)
        return self.assess(config)
|
||||
|
||||
if __name__ == "__main__":
    # Test
    # Smoke check only: confirms the module imports cleanly. Actual training
    # requires a populated MC corpus (see MCML.train_all_models).
    print("MC ML module loaded")
    print("Run training with: MCML().train_all_models()")
|
||||
Reference in New Issue
Block a user