initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
327
nautilus_dolphin/mc/mc_store.py
Executable file
327
nautilus_dolphin/mc/mc_store.py
Executable file
@@ -0,0 +1,327 @@
|
||||
"""
|
||||
Monte Carlo Result Store
|
||||
========================
|
||||
|
||||
Persistence layer for MC trial results.
|
||||
|
||||
Supports:
|
||||
- Parquet files for bulk data storage
|
||||
- SQLite index for fast querying
|
||||
- Incremental/resumable runs
|
||||
- Batch organization
|
||||
|
||||
Reference: MONTE_CARLO_SYSTEM_ENVELOPE_SPEC.md Section 8
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any, Union
|
||||
from datetime import datetime
|
||||
import numpy as np
|
||||
|
||||
# Try to import pandas/pyarrow
|
||||
try:
|
||||
import pandas as pd
|
||||
PANDAS_AVAILABLE = True
|
||||
except ImportError:
|
||||
PANDAS_AVAILABLE = False
|
||||
print("[WARN] pandas not available - Parquet storage disabled")
|
||||
|
||||
from .mc_metrics import MCTrialResult
|
||||
from .mc_validator import ValidationResult
|
||||
|
||||
|
||||
class MCStore:
|
||||
"""
|
||||
Monte Carlo Result Store.
|
||||
|
||||
Manages persistence of trial configurations, results, and indices.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
output_dir: Union[str, Path] = "mc_results",
|
||||
batch_size: int = 1000
|
||||
):
|
||||
"""
|
||||
Initialize the store.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
output_dir : str or Path
|
||||
Directory for all MC results
|
||||
batch_size : int
|
||||
Number of trials per batch file
|
||||
"""
|
||||
self.output_dir = Path(output_dir)
|
||||
self.batch_size = batch_size
|
||||
|
||||
# Create directory structure
|
||||
self.manifests_dir = self.output_dir / "manifests"
|
||||
self.results_dir = self.output_dir / "results"
|
||||
self.models_dir = self.output_dir / "models"
|
||||
|
||||
self.manifests_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.results_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.models_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# SQLite index
|
||||
self.index_path = self.output_dir / "mc_index.sqlite"
|
||||
self._init_index()
|
||||
|
||||
self.current_batch = self._get_latest_batch() + 1
|
||||
|
||||
def _init_index(self):
|
||||
"""Initialize SQLite index."""
|
||||
conn = sqlite3.connect(self.index_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute('''
|
||||
CREATE TABLE IF NOT EXISTS mc_index (
|
||||
trial_id INTEGER PRIMARY KEY,
|
||||
batch_id INTEGER,
|
||||
status TEXT,
|
||||
roi_pct REAL,
|
||||
profit_factor REAL,
|
||||
win_rate REAL,
|
||||
max_dd_pct REAL,
|
||||
sharpe REAL,
|
||||
n_trades INTEGER,
|
||||
champion_region INTEGER,
|
||||
catastrophic INTEGER,
|
||||
created_at INTEGER
|
||||
)
|
||||
''')
|
||||
|
||||
# Create indices
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_roi ON mc_index (roi_pct)')
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_champion ON mc_index (champion_region)')
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_catastrophic ON mc_index (catastrophic)')
|
||||
cursor.execute('CREATE INDEX IF NOT EXISTS idx_batch ON mc_index (batch_id)')
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def _get_latest_batch(self) -> int:
|
||||
"""Get the highest batch ID in the index."""
|
||||
conn = sqlite3.connect(self.index_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute('SELECT MAX(batch_id) FROM mc_index')
|
||||
result = cursor.fetchone()
|
||||
conn.close()
|
||||
|
||||
return result[0] if result and result[0] else 0
|
||||
|
||||
def save_validation_results(self, results: List[ValidationResult], batch_id: int):
    """Write one batch's validation results to a JSON manifest file.

    The manifest is named ``batch_NNNN_validation.json`` under the
    manifests directory; each result is serialized via its ``to_dict()``.
    """
    manifest_path = self.manifests_dir / f"batch_{batch_id:04d}_validation.json"

    serialized = [result.to_dict() for result in results]
    with manifest_path.open('w') as handle:
        json.dump(serialized, handle, indent=2)

    print(f"[OK] Saved validation manifest: {manifest_path}")
|
||||
|
||||
def save_trial_results(
    self,
    results: List[MCTrialResult],
    batch_id: Optional[int] = None
):
    """
    Save trial results to Parquet and update the SQLite index.

    Parameters
    ----------
    results : List[MCTrialResult]
        Trial results to save.  An empty list is a no-op and — unlike
        before — no longer consumes an auto-incremented batch id.
    batch_id : int, optional
        Batch ID (auto-incremented from ``current_batch`` if not provided).
    """
    # Bail out before allocating a batch id: the previous order
    # incremented current_batch even when there was nothing to persist,
    # leaving gaps in the batch numbering.
    if not results:
        return

    if batch_id is None:
        batch_id = self.current_batch
        self.current_batch += 1

    # Bulk storage is optional: skipped when pandas/pyarrow are missing.
    if PANDAS_AVAILABLE:
        self._save_parquet(results, batch_id)

    # The SQLite summary index is always maintained.
    self._update_index(results, batch_id)

    print(f"[OK] Saved batch {batch_id}: {len(results)} trials")
|
||||
|
||||
def _save_parquet(self, results: List[MCTrialResult], batch_id: int):
    """Persist one batch of trial results as a zstd-compressed Parquet file.

    File name follows the ``batch_NNNN_results.parquet`` convention used
    by load_batch/load_corpus.
    """
    target = self.results_dir / f"batch_{batch_id:04d}_results.parquet"

    # One row per trial, columns taken from each result's to_dict().
    frame = pd.DataFrame([result.to_dict() for result in results])
    frame.to_parquet(target, index=False, compression='zstd')
|
||||
|
||||
def _update_index(self, results: List[MCTrialResult], batch_id: int):
|
||||
"""Update SQLite index with result summaries."""
|
||||
conn = sqlite3.connect(self.index_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
timestamp = int(datetime.now().timestamp())
|
||||
|
||||
for r in results:
|
||||
cursor.execute('''
|
||||
INSERT OR REPLACE INTO mc_index
|
||||
(trial_id, batch_id, status, roi_pct, profit_factor, win_rate,
|
||||
max_dd_pct, sharpe, n_trades, champion_region, catastrophic, created_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
''', (
|
||||
r.trial_id,
|
||||
batch_id,
|
||||
r.status,
|
||||
r.roi_pct,
|
||||
r.profit_factor,
|
||||
r.win_rate,
|
||||
r.max_drawdown_pct,
|
||||
r.sharpe_ratio,
|
||||
r.n_trades,
|
||||
int(r.champion_region),
|
||||
int(r.catastrophic),
|
||||
timestamp
|
||||
))
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
def query_index(
|
||||
self,
|
||||
status: Optional[str] = None,
|
||||
min_roi: Optional[float] = None,
|
||||
champion_only: bool = False,
|
||||
catastrophic_only: bool = False,
|
||||
limit: int = 1000
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Query the SQLite index.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
status : str, optional
|
||||
Filter by status
|
||||
min_roi : float, optional
|
||||
Minimum ROI percentage
|
||||
champion_only : bool
|
||||
Only champion region configs
|
||||
catastrophic_only : bool
|
||||
Only catastrophic configs
|
||||
limit : int
|
||||
Maximum results
|
||||
|
||||
Returns
|
||||
-------
|
||||
List[Dict]
|
||||
Matching index entries
|
||||
"""
|
||||
conn = sqlite3.connect(self.index_path)
|
||||
conn.row_factory = sqlite3.Row
|
||||
cursor = conn.cursor()
|
||||
|
||||
query = 'SELECT * FROM mc_index WHERE 1=1'
|
||||
params = []
|
||||
|
||||
if status:
|
||||
query += ' AND status = ?'
|
||||
params.append(status)
|
||||
|
||||
if min_roi is not None:
|
||||
query += ' AND roi_pct >= ?'
|
||||
params.append(min_roi)
|
||||
|
||||
if champion_only:
|
||||
query += ' AND champion_region = 1'
|
||||
|
||||
if catastrophic_only:
|
||||
query += ' AND catastrophic = 1'
|
||||
|
||||
query += ' ORDER BY roi_pct DESC LIMIT ?'
|
||||
params.append(limit)
|
||||
|
||||
cursor.execute(query, params)
|
||||
rows = cursor.fetchall()
|
||||
conn.close()
|
||||
|
||||
return [dict(row) for row in rows]
|
||||
|
||||
def get_corpus_stats(self) -> Dict[str, Any]:
|
||||
"""Get statistics about the stored corpus."""
|
||||
conn = sqlite3.connect(self.index_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Total trials
|
||||
cursor.execute('SELECT COUNT(*) FROM mc_index')
|
||||
total = cursor.fetchone()[0]
|
||||
|
||||
# By status
|
||||
cursor.execute('SELECT status, COUNT(*) FROM mc_index GROUP BY status')
|
||||
by_status = {row[0]: row[1] for row in cursor.fetchall()}
|
||||
|
||||
# Champion region
|
||||
cursor.execute('SELECT COUNT(*) FROM mc_index WHERE champion_region = 1')
|
||||
champion_count = cursor.fetchone()[0]
|
||||
|
||||
# Catastrophic
|
||||
cursor.execute('SELECT COUNT(*) FROM mc_index WHERE catastrophic = 1')
|
||||
catastrophic_count = cursor.fetchone()[0]
|
||||
|
||||
# ROI stats
|
||||
cursor.execute('''
|
||||
SELECT AVG(roi_pct), MIN(roi_pct), MAX(roi_pct),
|
||||
AVG(sharpe), AVG(max_dd_pct)
|
||||
FROM mc_index WHERE status = 'completed'
|
||||
''')
|
||||
roi_stats = cursor.fetchone()
|
||||
|
||||
conn.close()
|
||||
|
||||
return {
|
||||
'total_trials': total,
|
||||
'by_status': by_status,
|
||||
'champion_count': champion_count,
|
||||
'catastrophic_count': catastrophic_count,
|
||||
'avg_roi_pct': roi_stats[0] if roi_stats else 0,
|
||||
'min_roi_pct': roi_stats[1] if roi_stats else 0,
|
||||
'max_roi_pct': roi_stats[2] if roi_stats else 0,
|
||||
'avg_sharpe': roi_stats[3] if roi_stats else 0,
|
||||
'avg_max_dd_pct': roi_stats[4] if roi_stats else 0,
|
||||
}
|
||||
|
||||
def load_batch(self, batch_id: int) -> Optional["pd.DataFrame"]:
    """Load one batch of results from its Parquet file.

    Returns None when pandas is unavailable or the batch file is missing.

    The return annotation is a lazy string: the original bare
    ``pd.DataFrame`` was evaluated at class-creation time and raised
    NameError in exactly the environment (pandas missing) that the
    PANDAS_AVAILABLE guard is meant to support.
    """
    if not PANDAS_AVAILABLE:
        return None

    parquet_path = self.results_dir / f"batch_{batch_id:04d}_results.parquet"

    if not parquet_path.exists():
        return None

    return pd.read_parquet(parquet_path)
|
||||
|
||||
def load_corpus(self) -> Optional["pd.DataFrame"]:
    """Load the entire corpus by concatenating every batch Parquet file.

    Returns None when pandas is unavailable or no batch files exist.

    The return annotation is a lazy string: a bare ``pd.DataFrame`` would
    raise NameError at class-creation time when pandas is missing, which
    is the case the PANDAS_AVAILABLE guard exists for.
    """
    if not PANDAS_AVAILABLE:
        return None

    # Sorted glob keeps batches in batch-id order (zero-padded names).
    frames = [
        pd.read_parquet(parquet_file)
        for parquet_file in sorted(self.results_dir.glob("batch_*_results.parquet"))
    ]

    if not frames:
        return None

    return pd.concat(frames, ignore_index=True)
|
||||
Reference in New Issue
Block a user