Files
DOLPHIN/nautilus_dolphin/mc/mc_store.py

328 lines
10 KiB
Python
Raw Normal View History

"""
Monte Carlo Result Store
========================
Persistence layer for MC trial results.
Supports:
- Parquet files for bulk data storage
- SQLite index for fast querying
- Incremental/resumable runs
- Batch organization
Reference: MONTE_CARLO_SYSTEM_ENVELOPE_SPEC.md Section 8
"""
import json
import sqlite3
from pathlib import Path
from typing import Dict, List, Optional, Any, Union
from datetime import datetime
import numpy as np
# Optional dependency: pandas (plus a Parquet engine) enables bulk storage.
# When it is missing the store still works, but only the SQLite index is kept.
try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False
    print("[WARN] pandas not available - Parquet storage disabled")

from .mc_metrics import MCTrialResult
from .mc_validator import ValidationResult
class MCStore:
"""
Monte Carlo Result Store.
Manages persistence of trial configurations, results, and indices.
"""
def __init__(
    self,
    output_dir: Union[str, Path] = "mc_results",
    batch_size: int = 1000
):
    """
    Initialize the store.

    Parameters
    ----------
    output_dir : str or Path
        Directory for all MC results
    batch_size : int
        Number of trials per batch file
    """
    self.output_dir = Path(output_dir)
    self.batch_size = batch_size

    # Directory layout: JSON manifests, Parquet results, model artifacts.
    self.manifests_dir = self.output_dir / "manifests"
    self.results_dir = self.output_dir / "results"
    self.models_dir = self.output_dir / "models"
    for directory in (self.manifests_dir, self.results_dir, self.models_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # SQLite index file (created/validated on every start).
    self.index_path = self.output_dir / "mc_index.sqlite"
    self._init_index()

    # Resume numbering after whatever batches are already on disk.
    self.current_batch = self._get_latest_batch() + 1
def _init_index(self):
"""Initialize SQLite index."""
conn = sqlite3.connect(self.index_path)
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS mc_index (
trial_id INTEGER PRIMARY KEY,
batch_id INTEGER,
status TEXT,
roi_pct REAL,
profit_factor REAL,
win_rate REAL,
max_dd_pct REAL,
sharpe REAL,
n_trades INTEGER,
champion_region INTEGER,
catastrophic INTEGER,
created_at INTEGER
)
''')
# Create indices
cursor.execute('CREATE INDEX IF NOT EXISTS idx_roi ON mc_index (roi_pct)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_champion ON mc_index (champion_region)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_catastrophic ON mc_index (catastrophic)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_batch ON mc_index (batch_id)')
conn.commit()
conn.close()
def _get_latest_batch(self) -> int:
"""Get the highest batch ID in the index."""
conn = sqlite3.connect(self.index_path)
cursor = conn.cursor()
cursor.execute('SELECT MAX(batch_id) FROM mc_index')
result = cursor.fetchone()
conn.close()
return result[0] if result and result[0] else 0
def save_validation_results(self, results: List[ValidationResult], batch_id: int):
"""Save validation results to manifest."""
manifest_path = self.manifests_dir / f"batch_{batch_id:04d}_validation.json"
data = [r.to_dict() for r in results]
with open(manifest_path, 'w') as f:
json.dump(data, f, indent=2)
print(f"[OK] Saved validation manifest: {manifest_path}")
def save_trial_results(
self,
results: List[MCTrialResult],
batch_id: Optional[int] = None
):
"""
Save trial results to Parquet and update index.
Parameters
----------
results : List[MCTrialResult]
Trial results to save
batch_id : int, optional
Batch ID (auto-incremented if not provided)
"""
if batch_id is None:
batch_id = self.current_batch
self.current_batch += 1
if not results:
return
# Save to Parquet
if PANDAS_AVAILABLE:
self._save_parquet(results, batch_id)
# Update SQLite index
self._update_index(results, batch_id)
print(f"[OK] Saved batch {batch_id}: {len(results)} trials")
def _save_parquet(self, results: List[MCTrialResult], batch_id: int):
    """Serialize one batch of trial results to a zstd-compressed Parquet file."""
    target = self.results_dir / f"batch_{batch_id:04d}_results.parquet"
    # One row per trial; columns come from MCTrialResult.to_dict().
    frame = pd.DataFrame([trial.to_dict() for trial in results])
    frame.to_parquet(target, index=False, compression='zstd')
def _update_index(self, results: List[MCTrialResult], batch_id: int):
"""Update SQLite index with result summaries."""
conn = sqlite3.connect(self.index_path)
cursor = conn.cursor()
timestamp = int(datetime.now().timestamp())
for r in results:
cursor.execute('''
INSERT OR REPLACE INTO mc_index
(trial_id, batch_id, status, roi_pct, profit_factor, win_rate,
max_dd_pct, sharpe, n_trades, champion_region, catastrophic, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
''', (
r.trial_id,
batch_id,
r.status,
r.roi_pct,
r.profit_factor,
r.win_rate,
r.max_drawdown_pct,
r.sharpe_ratio,
r.n_trades,
int(r.champion_region),
int(r.catastrophic),
timestamp
))
conn.commit()
conn.close()
def query_index(
self,
status: Optional[str] = None,
min_roi: Optional[float] = None,
champion_only: bool = False,
catastrophic_only: bool = False,
limit: int = 1000
) -> List[Dict[str, Any]]:
"""
Query the SQLite index.
Parameters
----------
status : str, optional
Filter by status
min_roi : float, optional
Minimum ROI percentage
champion_only : bool
Only champion region configs
catastrophic_only : bool
Only catastrophic configs
limit : int
Maximum results
Returns
-------
List[Dict]
Matching index entries
"""
conn = sqlite3.connect(self.index_path)
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
query = 'SELECT * FROM mc_index WHERE 1=1'
params = []
if status:
query += ' AND status = ?'
params.append(status)
if min_roi is not None:
query += ' AND roi_pct >= ?'
params.append(min_roi)
if champion_only:
query += ' AND champion_region = 1'
if catastrophic_only:
query += ' AND catastrophic = 1'
query += ' ORDER BY roi_pct DESC LIMIT ?'
params.append(limit)
cursor.execute(query, params)
rows = cursor.fetchall()
conn.close()
return [dict(row) for row in rows]
def get_corpus_stats(self) -> Dict[str, Any]:
"""Get statistics about the stored corpus."""
conn = sqlite3.connect(self.index_path)
cursor = conn.cursor()
# Total trials
cursor.execute('SELECT COUNT(*) FROM mc_index')
total = cursor.fetchone()[0]
# By status
cursor.execute('SELECT status, COUNT(*) FROM mc_index GROUP BY status')
by_status = {row[0]: row[1] for row in cursor.fetchall()}
# Champion region
cursor.execute('SELECT COUNT(*) FROM mc_index WHERE champion_region = 1')
champion_count = cursor.fetchone()[0]
# Catastrophic
cursor.execute('SELECT COUNT(*) FROM mc_index WHERE catastrophic = 1')
catastrophic_count = cursor.fetchone()[0]
# ROI stats
cursor.execute('''
SELECT AVG(roi_pct), MIN(roi_pct), MAX(roi_pct),
AVG(sharpe), AVG(max_dd_pct)
FROM mc_index WHERE status = 'completed'
''')
roi_stats = cursor.fetchone()
conn.close()
return {
'total_trials': total,
'by_status': by_status,
'champion_count': champion_count,
'catastrophic_count': catastrophic_count,
'avg_roi_pct': roi_stats[0] if roi_stats else 0,
'min_roi_pct': roi_stats[1] if roi_stats else 0,
'max_roi_pct': roi_stats[2] if roi_stats else 0,
'avg_sharpe': roi_stats[3] if roi_stats else 0,
'avg_max_dd_pct': roi_stats[4] if roi_stats else 0,
}
def load_batch(self, batch_id: int) -> Optional["pd.DataFrame"]:
    """
    Load one batch of results from its Parquet file.

    Parameters
    ----------
    batch_id : int
        Batch number whose Parquet file should be read.

    Returns
    -------
    pd.DataFrame or None
        None when pandas is unavailable or the batch file does not exist.

    Note: the return annotation is quoted — an unquoted pd.DataFrame is
    evaluated at class-creation time and raises NameError whenever the
    optional pandas import failed (PANDAS_AVAILABLE is False).
    """
    if not PANDAS_AVAILABLE:
        return None
    parquet_path = self.results_dir / f"batch_{batch_id:04d}_results.parquet"
    if not parquet_path.exists():
        return None
    return pd.read_parquet(parquet_path)
def load_corpus(self) -> Optional["pd.DataFrame"]:
    """
    Load the entire corpus by concatenating every saved batch.

    Returns
    -------
    pd.DataFrame or None
        All batches in file-name order with a fresh index, or None when
        pandas is unavailable or no batch files exist.

    Note: the return annotation is quoted — an unquoted pd.DataFrame is
    evaluated at class-creation time and raises NameError whenever the
    optional pandas import failed (PANDAS_AVAILABLE is False).
    """
    if not PANDAS_AVAILABLE:
        return None
    frames = [
        pd.read_parquet(parquet_file)
        for parquet_file in sorted(self.results_dir.glob("batch_*_results.parquet"))
    ]
    if not frames:
        return None
    return pd.concat(frames, ignore_index=True)