""" Monte Carlo Result Store ======================== Persistence layer for MC trial results. Supports: - Parquet files for bulk data storage - SQLite index for fast querying - Incremental/resumable runs - Batch organization Reference: MONTE_CARLO_SYSTEM_ENVELOPE_SPEC.md Section 8 """ import json import sqlite3 from pathlib import Path from typing import Dict, List, Optional, Any, Union from datetime import datetime import numpy as np # Try to import pandas/pyarrow try: import pandas as pd PANDAS_AVAILABLE = True except ImportError: PANDAS_AVAILABLE = False print("[WARN] pandas not available - Parquet storage disabled") from .mc_metrics import MCTrialResult from .mc_validator import ValidationResult class MCStore: """ Monte Carlo Result Store. Manages persistence of trial configurations, results, and indices. """ def __init__( self, output_dir: Union[str, Path] = "mc_results", batch_size: int = 1000 ): """ Initialize the store. Parameters ---------- output_dir : str or Path Directory for all MC results batch_size : int Number of trials per batch file """ self.output_dir = Path(output_dir) self.batch_size = batch_size # Create directory structure self.manifests_dir = self.output_dir / "manifests" self.results_dir = self.output_dir / "results" self.models_dir = self.output_dir / "models" self.manifests_dir.mkdir(parents=True, exist_ok=True) self.results_dir.mkdir(parents=True, exist_ok=True) self.models_dir.mkdir(parents=True, exist_ok=True) # SQLite index self.index_path = self.output_dir / "mc_index.sqlite" self._init_index() self.current_batch = self._get_latest_batch() + 1 def _init_index(self): """Initialize SQLite index.""" conn = sqlite3.connect(self.index_path) cursor = conn.cursor() cursor.execute(''' CREATE TABLE IF NOT EXISTS mc_index ( trial_id INTEGER PRIMARY KEY, batch_id INTEGER, status TEXT, roi_pct REAL, profit_factor REAL, win_rate REAL, max_dd_pct REAL, sharpe REAL, n_trades INTEGER, champion_region INTEGER, catastrophic INTEGER, created_at INTEGER ) ''') # Create indices cursor.execute('CREATE INDEX IF NOT EXISTS idx_roi ON mc_index (roi_pct)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_champion ON mc_index (champion_region)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_catastrophic ON mc_index (catastrophic)') cursor.execute('CREATE INDEX IF NOT EXISTS idx_batch ON mc_index (batch_id)') conn.commit() conn.close() def _get_latest_batch(self) -> int: """Get the highest batch ID in the index.""" conn = sqlite3.connect(self.index_path) cursor = conn.cursor() cursor.execute('SELECT MAX(batch_id) FROM mc_index') result = cursor.fetchone() conn.close() return result[0] if result and result[0] else 0 def save_validation_results(self, results: List[ValidationResult], batch_id: int): """Save validation results to manifest.""" manifest_path = self.manifests_dir / f"batch_{batch_id:04d}_validation.json" data = [r.to_dict() for r in results] with open(manifest_path, 'w') as f: json.dump(data, f, indent=2) print(f"[OK] Saved validation manifest: {manifest_path}") def save_trial_results( self, results: List[MCTrialResult], batch_id: Optional[int] = None ): """ Save trial results to Parquet and update index. Parameters ---------- results : List[MCTrialResult] Trial results to save batch_id : int, optional Batch ID (auto-incremented if not provided) """ if batch_id is None: batch_id = self.current_batch self.current_batch += 1 if not results: return # Save to Parquet if PANDAS_AVAILABLE: self._save_parquet(results, batch_id) # Update SQLite index self._update_index(results, batch_id) print(f"[OK] Saved batch {batch_id}: {len(results)} trials") def _save_parquet(self, results: List[MCTrialResult], batch_id: int): """Save results to Parquet file.""" parquet_path = self.results_dir / f"batch_{batch_id:04d}_results.parquet" # Convert to DataFrame data = [r.to_dict() for r in results] df = pd.DataFrame(data) # Save df.to_parquet(parquet_path, index=False, compression='zstd') def _update_index(self, results: List[MCTrialResult], batch_id: int): """Update SQLite index with result summaries.""" conn = sqlite3.connect(self.index_path) cursor = conn.cursor() timestamp = int(datetime.now().timestamp()) for r in results: cursor.execute(''' INSERT OR REPLACE INTO mc_index (trial_id, batch_id, status, roi_pct, profit_factor, win_rate, max_dd_pct, sharpe, n_trades, champion_region, catastrophic, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ''', ( r.trial_id, batch_id, r.status, r.roi_pct, r.profit_factor, r.win_rate, r.max_drawdown_pct, r.sharpe_ratio, r.n_trades, int(r.champion_region), int(r.catastrophic), timestamp )) conn.commit() conn.close() def query_index( self, status: Optional[str] = None, min_roi: Optional[float] = None, champion_only: bool = False, catastrophic_only: bool = False, limit: int = 1000 ) -> List[Dict[str, Any]]: """ Query the SQLite index. Parameters ---------- status : str, optional Filter by status min_roi : float, optional Minimum ROI percentage champion_only : bool Only champion region configs catastrophic_only : bool Only catastrophic configs limit : int Maximum results Returns ------- List[Dict] Matching index entries """ conn = sqlite3.connect(self.index_path) conn.row_factory = sqlite3.Row cursor = conn.cursor() query = 'SELECT * FROM mc_index WHERE 1=1' params = [] if status: query += ' AND status = ?' params.append(status) if min_roi is not None: query += ' AND roi_pct >= ?' params.append(min_roi) if champion_only: query += ' AND champion_region = 1' if catastrophic_only: query += ' AND catastrophic = 1' query += ' ORDER BY roi_pct DESC LIMIT ?' params.append(limit) cursor.execute(query, params) rows = cursor.fetchall() conn.close() return [dict(row) for row in rows] def get_corpus_stats(self) -> Dict[str, Any]: """Get statistics about the stored corpus.""" conn = sqlite3.connect(self.index_path) cursor = conn.cursor() # Total trials cursor.execute('SELECT COUNT(*) FROM mc_index') total = cursor.fetchone()[0] # By status cursor.execute('SELECT status, COUNT(*) FROM mc_index GROUP BY status') by_status = {row[0]: row[1] for row in cursor.fetchall()} # Champion region cursor.execute('SELECT COUNT(*) FROM mc_index WHERE champion_region = 1') champion_count = cursor.fetchone()[0] # Catastrophic cursor.execute('SELECT COUNT(*) FROM mc_index WHERE catastrophic = 1') catastrophic_count = cursor.fetchone()[0] # ROI stats cursor.execute(''' SELECT AVG(roi_pct), MIN(roi_pct), MAX(roi_pct), AVG(sharpe), AVG(max_dd_pct) FROM mc_index WHERE status = 'completed' ''') roi_stats = cursor.fetchone() conn.close() return { 'total_trials': total, 'by_status': by_status, 'champion_count': champion_count, 'catastrophic_count': catastrophic_count, 'avg_roi_pct': roi_stats[0] if roi_stats else 0, 'min_roi_pct': roi_stats[1] if roi_stats else 0, 'max_roi_pct': roi_stats[2] if roi_stats else 0, 'avg_sharpe': roi_stats[3] if roi_stats else 0, 'avg_max_dd_pct': roi_stats[4] if roi_stats else 0, } def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]: """Load a batch of results from Parquet.""" if not PANDAS_AVAILABLE: return None parquet_path = self.results_dir / f"batch_{batch_id:04d}_results.parquet" if not parquet_path.exists(): return None return pd.read_parquet(parquet_path) def load_corpus(self) -> Optional[pd.DataFrame]: """Load entire corpus from all batches.""" if not PANDAS_AVAILABLE: return None batches = [] for parquet_file in sorted(self.results_dir.glob("batch_*_results.parquet")): df = pd.read_parquet(parquet_file) batches.append(df) if not batches: return None return pd.concat(batches, ignore_index=True)