Files
DOLPHIN/nautilus_dolphin/mc/mc_store.py

328 lines
10 KiB
Python
Raw Normal View History

"""
Monte Carlo Result Store
========================
Persistence layer for MC trial results.
Supports:
- Parquet files for bulk data storage
- SQLite index for fast querying
- Incremental/resumable runs
- Batch organization
Reference: MONTE_CARLO_SYSTEM_ENVELOPE_SPEC.md Section 8
"""
import json
import sqlite3
from pathlib import Path
from typing import Dict, List, Optional, Any, Union
from datetime import datetime
import numpy as np
# Optional dependency: pandas (plus a Parquet engine) enables bulk storage.
# When it is missing the store still works, but only the SQLite index is kept.
try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False
    print("[WARN] pandas not available - Parquet storage disabled")

from .mc_metrics import MCTrialResult
from .mc_validator import ValidationResult
class MCStore:
"""
Monte Carlo Result Store.
Manages persistence of trial configurations, results, and indices.
"""
def __init__(
    self,
    output_dir: Union[str, Path] = "mc_results",
    batch_size: int = 1000
):
    """
    Initialize the store.

    Parameters
    ----------
    output_dir : str or Path
        Directory for all MC results
    batch_size : int
        Number of trials per batch file
    """
    self.output_dir = Path(output_dir)
    self.batch_size = batch_size

    # Directory layout: JSON manifests, Parquet results, model artifacts.
    self.manifests_dir = self.output_dir / "manifests"
    self.results_dir = self.output_dir / "results"
    self.models_dir = self.output_dir / "models"
    for directory in (self.manifests_dir, self.results_dir, self.models_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # SQLite index file (created/validated on every start).
    self.index_path = self.output_dir / "mc_index.sqlite"
    self._init_index()

    # Resume numbering after whatever batches are already on disk.
    self.current_batch = self._get_latest_batch() + 1
def _init_index(self):
"""Initialize SQLite index."""
conn = sqlite3.connect(self.index_path)
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS mc_index (
trial_id INTEGER PRIMARY KEY,
batch_id INTEGER,
status TEXT,
roi_pct REAL,
profit_factor REAL,
win_rate REAL,
max_dd_pct REAL,
sharpe REAL,
n_trades INTEGER,
champion_region INTEGER,
catastrophic INTEGER,
created_at INTEGER
)
''')
# Create indices
cursor.execute('CREATE INDEX IF NOT EXISTS idx_roi ON mc_index (roi_pct)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_champion ON mc_index (champion_region)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_catastrophic ON mc_index (catastrophic)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_batch ON mc_index (batch_id)')
conn.commit()
conn.close()
def _get_latest_batch(self) -> int:
"""Get the highest batch ID in the index."""
conn = sqlite3.connect(self.index_path)
cursor = conn.cursor()
cursor.execute('SELECT MAX(batch_id) FROM mc_index')
result = cursor.fetchone()
conn.close()
return result[0] if result and result[0] else 0
def save_validation_results(self, results: List[ValidationResult], batch_id: int):
"""Save validation results to manifest."""
manifest_path = self.manifests_dir / f"batch_{batch_id:04d}_validation.json"
data = [r.to_dict() for r in results]
with open(manifest_path, 'w') as f:
json.dump(data, f, indent=2)
print(f"[OK] Saved validation manifest: {manifest_path}")
def save_trial_results(
self,
results: List[MCTrialResult],
batch_id: Optional[int] = None
):
"""
Save trial results to Parquet and update index.
Parameters
----------
results : List[MCTrialResult]
Trial results to save
batch_id : int, optional
Batch ID (auto-incremented if not provided)
"""
if batch_id is None:
batch_id = self.current_batch
self.current_batch += 1
if not results:
return
# Save to Parquet
if PANDAS_AVAILABLE:
self._save_parquet(results, batch_id)
# Update SQLite index
self._update_index(results, batch_id)
print(f"[OK] Saved batch {batch_id}: {len(results)} trials")
def _save_parquet(self, results: List[MCTrialResult], batch_id: int):
    """Serialize one batch of trial results to a zstd-compressed Parquet file."""
    target = self.results_dir / f"batch_{batch_id:04d}_results.parquet"
    # One row per trial; columns come from MCTrialResult.to_dict().
    frame = pd.DataFrame([trial.to_dict() for trial in results])
    frame.to_parquet(target, index=False, compression='zstd')
def _update_index(self, results: List[MCTrialResult], batch_id: int):
"""Update SQLite index with result summaries."""
conn = sqlite3.connect(self.index_path)
cursor = conn.cursor()
timestamp = int(datetime.now().timestamp())
for r in results:
cursor.execute('''
INSERT OR REPLACE INTO mc_index
(trial_id, batch_id, status, roi_pct, profit_factor, win_rate,
max_dd_pct, sharpe, n_trades, champion_region, catastrophic, created_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
''', (
r.trial_id,
batch_id,
r.status,
r.roi_pct,
r.profit_factor,
r.win_rate,
r.max_drawdown_pct,
r.sharpe_ratio,
r.n_trades,
int(r.champion_region),
int(r.catastrophic),
timestamp
))
conn.commit()
conn.close()
def query_index(
self,
status: Optional[str] = None,
min_roi: Optional[float] = None,
champion_only: bool = False,
catastrophic_only: bool = False,
limit: int = 1000
) -> List[Dict[str, Any]]:
"""
Query the SQLite index.
Parameters
----------
status : str, optional
Filter by status
min_roi : float, optional
Minimum ROI percentage
champion_only : bool
Only champion region configs
catastrophic_only : bool
Only catastrophic configs
limit : int
Maximum results
Returns
-------
List[Dict]
Matching index entries
"""
conn = sqlite3.connect(self.index_path)
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
query = 'SELECT * FROM mc_index WHERE 1=1'
params = []
if status:
query += ' AND status = ?'
params.append(status)
if min_roi is not None:
query += ' AND roi_pct >= ?'
params.append(min_roi)
if champion_only:
query += ' AND champion_region = 1'
if catastrophic_only:
query += ' AND catastrophic = 1'
query += ' ORDER BY roi_pct DESC LIMIT ?'
params.append(limit)
cursor.execute(query, params)
rows = cursor.fetchall()
conn.close()
return [dict(row) for row in rows]
def get_corpus_stats(self) -> Dict[str, Any]:
"""Get statistics about the stored corpus."""
conn = sqlite3.connect(self.index_path)
cursor = conn.cursor()
# Total trials
cursor.execute('SELECT COUNT(*) FROM mc_index')
total = cursor.fetchone()[0]
# By status
cursor.execute('SELECT status, COUNT(*) FROM mc_index GROUP BY status')
by_status = {row[0]: row[1] for row in cursor.fetchall()}
# Champion region
cursor.execute('SELECT COUNT(*) FROM mc_index WHERE champion_region = 1')
champion_count = cursor.fetchone()[0]
# Catastrophic
cursor.execute('SELECT COUNT(*) FROM mc_index WHERE catastrophic = 1')
catastrophic_count = cursor.fetchone()[0]
# ROI stats
cursor.execute('''
SELECT AVG(roi_pct), MIN(roi_pct), MAX(roi_pct),
AVG(sharpe), AVG(max_dd_pct)
FROM mc_index WHERE status = 'completed'
''')
roi_stats = cursor.fetchone()
conn.close()
return {
'total_trials': total,
'by_status': by_status,
'champion_count': champion_count,
'catastrophic_count': catastrophic_count,
'avg_roi_pct': roi_stats[0] if roi_stats else 0,
'min_roi_pct': roi_stats[1] if roi_stats else 0,
'max_roi_pct': roi_stats[2] if roi_stats else 0,
'avg_sharpe': roi_stats[3] if roi_stats else 0,
'avg_max_dd_pct': roi_stats[4] if roi_stats else 0,
}
def load_batch(self, batch_id: int) -> Optional["pd.DataFrame"]:
    """
    Load one batch of results from its Parquet file.

    Parameters
    ----------
    batch_id : int
        Batch number whose Parquet file should be read.

    Returns
    -------
    pd.DataFrame or None
        None when pandas is unavailable or the batch file does not exist.

    Note: the return annotation is quoted — an unquoted pd.DataFrame is
    evaluated at class-creation time and raises NameError whenever the
    optional pandas import failed (PANDAS_AVAILABLE is False).
    """
    if not PANDAS_AVAILABLE:
        return None
    parquet_path = self.results_dir / f"batch_{batch_id:04d}_results.parquet"
    if not parquet_path.exists():
        return None
    return pd.read_parquet(parquet_path)
def load_corpus(self) -> Optional["pd.DataFrame"]:
    """
    Load the entire corpus by concatenating every saved batch.

    Returns
    -------
    pd.DataFrame or None
        All batches in file-name order with a fresh index, or None when
        pandas is unavailable or no batch files exist.

    Note: the return annotation is quoted — an unquoted pd.DataFrame is
    evaluated at class-creation time and raises NameError whenever the
    optional pandas import failed (PANDAS_AVAILABLE is False).
    """
    if not PANDAS_AVAILABLE:
        return None
    frames = [
        pd.read_parquet(parquet_file)
        for parquet_file in sorted(self.results_dir.glob("batch_*_results.parquet"))
    ]
    if not frames:
        return None
    return pd.concat(frames, ignore_index=True)