"""
Monte Carlo Result Store
========================

Persistence layer for MC trial results.

Supports:
- Parquet files for bulk data storage
- SQLite index for fast querying
- Incremental/resumable runs
- Batch organization

Reference: MONTE_CARLO_SYSTEM_ENVELOPE_SPEC.md Section 8
"""
from __future__ import annotations

import json
import sqlite3
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import numpy as np

# Try to import pandas/pyarrow
try:
    import pandas as pd

    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False
    print("[WARN] pandas not available - Parquet storage disabled")

from .mc_metrics import MCTrialResult
from .mc_validator import ValidationResult
class MCStore:
    """
    Monte Carlo Result Store.

    Manages persistence of trial configurations, results, and indices.
    """

    def __init__(
        self,
        output_dir: Union[str, Path] = "mc_results",
        batch_size: int = 1000
    ):
        """
        Initialize the store.

        Parameters
        ----------
        output_dir : str or Path
            Directory for all MC results
        batch_size : int
            Number of trials per batch file
        """
        self.output_dir = Path(output_dir)
        self.batch_size = batch_size

        # On-disk layout: manifests (JSON), results (Parquet), models.
        self.manifests_dir = self.output_dir / "manifests"
        self.results_dir = self.output_dir / "results"
        self.models_dir = self.output_dir / "models"
        for directory in (self.manifests_dir, self.results_dir, self.models_dir):
            directory.mkdir(parents=True, exist_ok=True)

        # SQLite index for fast querying of per-trial summaries.
        self.index_path = self.output_dir / "mc_index.sqlite"
        self._init_index()

        # Resume numbering after the highest batch already recorded.
        self.current_batch = self._get_latest_batch() + 1
def _init_index(self):
|
||
|
|
"""Initialize SQLite index."""
|
||
|
|
conn = sqlite3.connect(self.index_path)
|
||
|
|
cursor = conn.cursor()
|
||
|
|
|
||
|
|
cursor.execute('''
|
||
|
|
CREATE TABLE IF NOT EXISTS mc_index (
|
||
|
|
trial_id INTEGER PRIMARY KEY,
|
||
|
|
batch_id INTEGER,
|
||
|
|
status TEXT,
|
||
|
|
roi_pct REAL,
|
||
|
|
profit_factor REAL,
|
||
|
|
win_rate REAL,
|
||
|
|
max_dd_pct REAL,
|
||
|
|
sharpe REAL,
|
||
|
|
n_trades INTEGER,
|
||
|
|
champion_region INTEGER,
|
||
|
|
catastrophic INTEGER,
|
||
|
|
created_at INTEGER
|
||
|
|
)
|
||
|
|
''')
|
||
|
|
|
||
|
|
# Create indices
|
||
|
|
cursor.execute('CREATE INDEX IF NOT EXISTS idx_roi ON mc_index (roi_pct)')
|
||
|
|
cursor.execute('CREATE INDEX IF NOT EXISTS idx_champion ON mc_index (champion_region)')
|
||
|
|
cursor.execute('CREATE INDEX IF NOT EXISTS idx_catastrophic ON mc_index (catastrophic)')
|
||
|
|
cursor.execute('CREATE INDEX IF NOT EXISTS idx_batch ON mc_index (batch_id)')
|
||
|
|
|
||
|
|
conn.commit()
|
||
|
|
conn.close()
|
||
|
|
|
||
|
|
def _get_latest_batch(self) -> int:
|
||
|
|
"""Get the highest batch ID in the index."""
|
||
|
|
conn = sqlite3.connect(self.index_path)
|
||
|
|
cursor = conn.cursor()
|
||
|
|
|
||
|
|
cursor.execute('SELECT MAX(batch_id) FROM mc_index')
|
||
|
|
result = cursor.fetchone()
|
||
|
|
conn.close()
|
||
|
|
|
||
|
|
return result[0] if result and result[0] else 0
|
||
|
|
|
||
|
|
def save_validation_results(self, results: List[ValidationResult], batch_id: int):
    """Write a batch's validation results to a JSON manifest file."""
    manifest_path = self.manifests_dir / f"batch_{batch_id:04d}_validation.json"

    payload = [result.to_dict() for result in results]
    with open(manifest_path, 'w') as handle:
        json.dump(payload, handle, indent=2)

    print(f"[OK] Saved validation manifest: {manifest_path}")
def save_trial_results(
    self,
    results: List[MCTrialResult],
    batch_id: Optional[int] = None
):
    """
    Save trial results to Parquet and update index.

    Parameters
    ----------
    results : List[MCTrialResult]
        Trial results to save
    batch_id : int, optional
        Batch ID (auto-incremented if not provided)
    """
    # FIX: bail out on empty input *before* allocating a batch id.
    # Previously an empty call still assigned batch_id and incremented
    # self.current_batch, silently burning a batch number.
    if not results:
        return

    if batch_id is None:
        batch_id = self.current_batch
        self.current_batch += 1

    # Bulk row data goes to Parquet when pandas is available.
    if PANDAS_AVAILABLE:
        self._save_parquet(results, batch_id)

    # Summary rows always go to the SQLite index.
    self._update_index(results, batch_id)

    print(f"[OK] Saved batch {batch_id}: {len(results)} trials")
def _save_parquet(self, results: List[MCTrialResult], batch_id: int):
|
||
|
|
"""Save results to Parquet file."""
|
||
|
|
parquet_path = self.results_dir / f"batch_{batch_id:04d}_results.parquet"
|
||
|
|
|
||
|
|
# Convert to DataFrame
|
||
|
|
data = [r.to_dict() for r in results]
|
||
|
|
df = pd.DataFrame(data)
|
||
|
|
|
||
|
|
# Save
|
||
|
|
df.to_parquet(parquet_path, index=False, compression='zstd')
|
||
|
|
|
||
|
|
def _update_index(self, results: List[MCTrialResult], batch_id: int):
|
||
|
|
"""Update SQLite index with result summaries."""
|
||
|
|
conn = sqlite3.connect(self.index_path)
|
||
|
|
cursor = conn.cursor()
|
||
|
|
|
||
|
|
timestamp = int(datetime.now().timestamp())
|
||
|
|
|
||
|
|
for r in results:
|
||
|
|
cursor.execute('''
|
||
|
|
INSERT OR REPLACE INTO mc_index
|
||
|
|
(trial_id, batch_id, status, roi_pct, profit_factor, win_rate,
|
||
|
|
max_dd_pct, sharpe, n_trades, champion_region, catastrophic, created_at)
|
||
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||
|
|
''', (
|
||
|
|
r.trial_id,
|
||
|
|
batch_id,
|
||
|
|
r.status,
|
||
|
|
r.roi_pct,
|
||
|
|
r.profit_factor,
|
||
|
|
r.win_rate,
|
||
|
|
r.max_drawdown_pct,
|
||
|
|
r.sharpe_ratio,
|
||
|
|
r.n_trades,
|
||
|
|
int(r.champion_region),
|
||
|
|
int(r.catastrophic),
|
||
|
|
timestamp
|
||
|
|
))
|
||
|
|
|
||
|
|
conn.commit()
|
||
|
|
conn.close()
|
||
|
|
|
||
|
|
def query_index(
    self,
    status: Optional[str] = None,
    min_roi: Optional[float] = None,
    champion_only: bool = False,
    catastrophic_only: bool = False,
    limit: int = 1000
) -> List[Dict[str, Any]]:
    """
    Query the SQLite index.

    Parameters
    ----------
    status : str, optional
        Filter by status
    min_roi : float, optional
        Minimum ROI percentage
    champion_only : bool
        Only champion region configs
    catastrophic_only : bool
        Only catastrophic configs
    limit : int
        Maximum results

    Returns
    -------
    List[Dict]
        Matching index entries, best ROI first
    """
    # Assemble WHERE clauses and their bound parameters side by side.
    clauses = ['1=1']
    params = []

    if status:
        clauses.append('status = ?')
        params.append(status)
    if min_roi is not None:
        clauses.append('roi_pct >= ?')
        params.append(min_roi)
    if champion_only:
        clauses.append('champion_region = 1')
    if catastrophic_only:
        clauses.append('catastrophic = 1')

    params.append(limit)
    sql = (
        'SELECT * FROM mc_index WHERE '
        + ' AND '.join(clauses)
        + ' ORDER BY roi_pct DESC LIMIT ?'
    )

    conn = sqlite3.connect(self.index_path)
    conn.row_factory = sqlite3.Row  # rows convertible to dicts by column name
    cursor = conn.cursor()
    cursor.execute(sql, params)
    matches = [dict(row) for row in cursor.fetchall()]
    conn.close()

    return matches
def get_corpus_stats(self) -> Dict[str, Any]:
    """
    Get statistics about the stored corpus.

    Returns
    -------
    Dict[str, Any]
        Trial counts (total, by status, champion, catastrophic) plus
        ROI/sharpe/drawdown aggregates over completed trials; aggregate
        values are 0 when there are no completed trials.
    """
    conn = sqlite3.connect(self.index_path)
    cursor = conn.cursor()

    # Total trials
    cursor.execute('SELECT COUNT(*) FROM mc_index')
    total = cursor.fetchone()[0]

    # By status
    cursor.execute('SELECT status, COUNT(*) FROM mc_index GROUP BY status')
    by_status = {row[0]: row[1] for row in cursor.fetchall()}

    # Champion region
    cursor.execute('SELECT COUNT(*) FROM mc_index WHERE champion_region = 1')
    champion_count = cursor.fetchone()[0]

    # Catastrophic
    cursor.execute('SELECT COUNT(*) FROM mc_index WHERE catastrophic = 1')
    catastrophic_count = cursor.fetchone()[0]

    # ROI stats over completed trials only
    cursor.execute('''
        SELECT AVG(roi_pct), MIN(roi_pct), MAX(roi_pct),
               AVG(sharpe), AVG(max_dd_pct)
        FROM mc_index WHERE status = 'completed'
    ''')
    roi_stats = cursor.fetchone()

    conn.close()

    # FIX: with zero completed trials the aggregate query returns a row
    # of (None, None, None, None, None), which is truthy — the old
    # `roi_stats[i] if roi_stats else 0` fallback therefore leaked None
    # into the result. Coerce each aggregate to 0 explicitly.
    if roi_stats is None:
        roi_stats = (None,) * 5
    avg_roi, min_roi, max_roi, avg_sharpe, avg_dd = (
        value if value is not None else 0 for value in roi_stats
    )

    return {
        'total_trials': total,
        'by_status': by_status,
        'champion_count': champion_count,
        'catastrophic_count': catastrophic_count,
        'avg_roi_pct': avg_roi,
        'min_roi_pct': min_roi,
        'max_roi_pct': max_roi,
        'avg_sharpe': avg_sharpe,
        'avg_max_dd_pct': avg_dd,
    }
def load_batch(self, batch_id: int) -> Optional[pd.DataFrame]:
    """Load one batch of results from its Parquet file, or None if absent."""
    if not PANDAS_AVAILABLE:
        return None

    parquet_path = self.results_dir / f"batch_{batch_id:04d}_results.parquet"

    # Missing batch file is not an error — the caller gets None.
    return pd.read_parquet(parquet_path) if parquet_path.exists() else None
def load_corpus(self) -> Optional[pd.DataFrame]:
    """Concatenate every stored batch into one DataFrame, or None if empty."""
    if not PANDAS_AVAILABLE:
        return None

    # Batches are read in sorted (i.e. batch-id) order.
    frames = [
        pd.read_parquet(parquet_file)
        for parquet_file in sorted(self.results_dir.glob("batch_*_results.parquet"))
    ]

    if not frames:
        return None

    return pd.concat(frames, ignore_index=True)