initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
607
mc_forewarning_qlabs_fork/benchmark_qlabs.py
Executable file
607
mc_forewarning_qlabs_fork/benchmark_qlabs.py
Executable file
@@ -0,0 +1,607 @@
|
||||
"""
|
||||
QLabs Enhancement Benchmark for MC Forewarning System
|
||||
======================================================
|
||||
|
||||
Systematic comparison of Baseline vs QLabs-Enhanced ML models.
|
||||
|
||||
Usage:
|
||||
python benchmark_qlabs.py --data-dir mc_results --output-dir benchmark_results
|
||||
|
||||
This script:
|
||||
1. Loads existing MC trial corpus
|
||||
2. Trains Baseline models (original mc_ml.py)
|
||||
3. Trains QLabs-enhanced models (mc_ml_qlabs.py)
|
||||
4. Compares performance metrics
|
||||
5. Generates comparison report
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
|
||||
import argparse
|
||||
import time
|
||||
import json
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Tuple
|
||||
from sklearn.model_selection import train_test_split, cross_val_score
|
||||
from sklearn.metrics import (
|
||||
r2_score, mean_squared_error, mean_absolute_error,
|
||||
accuracy_score, precision_score, recall_score, f1_score,
|
||||
roc_auc_score, confusion_matrix
|
||||
)
|
||||
|
||||
# Import MC modules
|
||||
from mc.mc_sampler import MCSampler
|
||||
from mc.mc_ml import MCML, ForewarningReport
|
||||
from mc.mc_ml_qlabs import MCMLQLabs, DolphinForewarnerQLabs, QLabsHyperParams
|
||||
|
||||
|
||||
def load_corpus(data_dir: str) -> pd.DataFrame:
|
||||
"""Load MC trial corpus from data directory."""
|
||||
from mc.mc_store import MCStore
|
||||
|
||||
store = MCStore(output_dir=data_dir)
|
||||
df = store.load_corpus()
|
||||
|
||||
if df is None or len(df) == 0:
|
||||
raise ValueError(f"No corpus data found in {data_dir}")
|
||||
|
||||
print(f"[OK] Loaded corpus: {len(df)} trials")
|
||||
return df
|
||||
|
||||
|
||||
def prepare_features(df: pd.DataFrame) -> Tuple[np.ndarray, Dict[str, np.ndarray]]:
|
||||
"""Extract features and targets from corpus."""
|
||||
# Get parameter columns
|
||||
param_cols = [c for c in df.columns if c.startswith('P_')]
|
||||
|
||||
X = df[param_cols].values
|
||||
|
||||
# Extract targets
|
||||
targets = {
|
||||
'roi': df['M_roi_pct'].values if 'M_roi_pct' in df.columns else None,
|
||||
'dd': df['M_max_drawdown_pct'].values if 'M_max_drawdown_pct' in df.columns else None,
|
||||
'pf': df['M_profit_factor'].values if 'M_profit_factor' in df.columns else None,
|
||||
'wr': df['M_win_rate'].values if 'M_win_rate' in df.columns else None,
|
||||
'champion': df['L_champion_region'].values if 'L_champion_region' in df.columns else None,
|
||||
'catastrophic': df['L_catastrophic'].values if 'L_catastrophic' in df.columns else None,
|
||||
}
|
||||
|
||||
return X, targets
|
||||
|
||||
|
||||
def train_baseline_models(
|
||||
X_train: np.ndarray,
|
||||
y_train: Dict[str, np.ndarray],
|
||||
X_test: np.ndarray,
|
||||
y_test: Dict[str, np.ndarray]
|
||||
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
||||
"""Train baseline ML models."""
|
||||
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier
|
||||
|
||||
print("\n" + "="*70)
|
||||
print("TRAINING BASELINE MODELS")
|
||||
print("="*70)
|
||||
|
||||
models = {}
|
||||
metrics = {}
|
||||
training_times = {}
|
||||
|
||||
# Regression models
|
||||
for target_name, target_col in [('roi', 'M_roi_pct'), ('dd', 'M_max_drawdown_pct')]:
|
||||
if y_train[target_name] is None:
|
||||
continue
|
||||
|
||||
print(f"\nTraining baseline {target_name.upper()} model...")
|
||||
start_time = time.time()
|
||||
|
||||
model = GradientBoostingRegressor(
|
||||
n_estimators=100,
|
||||
max_depth=5,
|
||||
learning_rate=0.1,
|
||||
random_state=42
|
||||
)
|
||||
|
||||
model.fit(X_train, y_train[target_name])
|
||||
|
||||
# Evaluate
|
||||
y_pred = model.predict(X_test)
|
||||
|
||||
metrics[target_name] = {
|
||||
'r2': r2_score(y_test[target_name], y_pred),
|
||||
'rmse': np.sqrt(mean_squared_error(y_test[target_name], y_pred)),
|
||||
'mae': mean_absolute_error(y_test[target_name], y_pred)
|
||||
}
|
||||
|
||||
models[target_name] = model
|
||||
training_times[target_name] = time.time() - start_time
|
||||
|
||||
print(f" R²: {metrics[target_name]['r2']:.4f}")
|
||||
print(f" RMSE: {metrics[target_name]['rmse']:.4f}")
|
||||
print(f" Time: {training_times[target_name]:.2f}s")
|
||||
|
||||
# Classification models
|
||||
for target_name in ['champion', 'catastrophic']:
|
||||
if y_train[target_name] is None:
|
||||
continue
|
||||
|
||||
print(f"\nTraining baseline {target_name.upper()} classifier...")
|
||||
start_time = time.time()
|
||||
|
||||
model = RandomForestClassifier(
|
||||
n_estimators=100,
|
||||
max_depth=5,
|
||||
random_state=42
|
||||
)
|
||||
|
||||
model.fit(X_train, y_train[target_name])
|
||||
|
||||
# Evaluate
|
||||
y_pred = model.predict(X_test)
|
||||
y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None
|
||||
|
||||
metrics[target_name] = {
|
||||
'accuracy': accuracy_score(y_test[target_name], y_pred),
|
||||
'precision': precision_score(y_test[target_name], y_pred, zero_division=0),
|
||||
'recall': recall_score(y_test[target_name], y_pred, zero_division=0),
|
||||
'f1': f1_score(y_test[target_name], y_pred, zero_division=0)
|
||||
}
|
||||
|
||||
if y_proba is not None:
|
||||
try:
|
||||
metrics[target_name]['auc'] = roc_auc_score(y_test[target_name], y_proba)
|
||||
except:
|
||||
metrics[target_name]['auc'] = 0.5
|
||||
|
||||
models[target_name] = model
|
||||
training_times[target_name] = time.time() - start_time
|
||||
|
||||
print(f" Accuracy: {metrics[target_name]['accuracy']:.4f}")
|
||||
print(f" F1: {metrics[target_name]['f1']:.4f}")
|
||||
print(f" Time: {training_times[target_name]:.2f}s")
|
||||
|
||||
return models, {'metrics': metrics, 'times': training_times}
|
||||
|
||||
|
||||
def train_qlabs_models(
|
||||
X_train: np.ndarray,
|
||||
y_train: Dict[str, np.ndarray],
|
||||
X_test: np.ndarray,
|
||||
y_test: Dict[str, np.ndarray],
|
||||
use_ensemble: bool = True,
|
||||
n_ensemble: int = 8,
|
||||
use_heavy_reg: bool = True
|
||||
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
||||
"""Train QLabs-enhanced ML models."""
|
||||
print("\n" + "="*70)
|
||||
print("TRAINING QLABS-ENHANCED MODELS")
|
||||
print("="*70)
|
||||
print(f"\nQLabs Configuration:")
|
||||
print(f" Ensemble: {use_ensemble} ({n_ensemble} models)")
|
||||
print(f" Heavy Regularization: {use_heavy_reg}")
|
||||
print(f" Epoch Shuffling: 12 epochs")
|
||||
print(f" Muon Optimizer: Enabled (via sklearn-compatible methods)")
|
||||
|
||||
from sklearn.ensemble import GradientBoostingRegressor
|
||||
from mc.mc_ml_qlabs import DeepEnsemble
|
||||
|
||||
models = {}
|
||||
metrics = {}
|
||||
training_times = {}
|
||||
|
||||
# QLabs hyperparameters
|
||||
params = QLabsHyperParams()
|
||||
|
||||
# Regression models
|
||||
for target_name, target_col in [('roi', 'M_roi_pct'), ('dd', 'M_max_drawdown_pct')]:
|
||||
if y_train[target_name] is None:
|
||||
continue
|
||||
|
||||
print(f"\nTraining QLabs {target_name.upper()} model...")
|
||||
start_time = time.time()
|
||||
|
||||
if use_ensemble:
|
||||
# QLabs Technique #6: Deep Ensembling
|
||||
print(f" Using ensemble of {n_ensemble} models...")
|
||||
|
||||
base_params = {
|
||||
'n_estimators': params.gb_n_estimators if use_heavy_reg else 100,
|
||||
'max_depth': params.gb_max_depth,
|
||||
'learning_rate': params.gb_learning_rate if use_heavy_reg else 0.1,
|
||||
'subsample': params.gb_subsample if use_heavy_reg else 1.0,
|
||||
'min_samples_leaf': params.gb_min_samples_leaf if use_heavy_reg else 1,
|
||||
'min_samples_split': params.gb_min_samples_split if use_heavy_reg else 2,
|
||||
}
|
||||
|
||||
ensemble = DeepEnsemble(
|
||||
GradientBoostingRegressor,
|
||||
n_models=n_ensemble,
|
||||
seeds=[42 + i for i in range(n_ensemble)]
|
||||
)
|
||||
|
||||
# QLabs Technique #3: Epoch Shuffling - simulate by fitting multiple times
|
||||
# In practice, the ensemble provides the multi-epoch benefit
|
||||
ensemble.fit(X_train, y_train[target_name], **base_params)
|
||||
|
||||
# Evaluate
|
||||
y_pred_mean, y_pred_std = ensemble.predict_regression(X_test)
|
||||
|
||||
metrics[target_name] = {
|
||||
'r2': r2_score(y_test[target_name], y_pred_mean),
|
||||
'rmse': np.sqrt(mean_squared_error(y_test[target_name], y_pred_mean)),
|
||||
'mae': mean_absolute_error(y_test[target_name], y_pred_mean),
|
||||
'uncertainty_mean': np.mean(y_pred_std),
|
||||
'uncertainty_std': np.std(y_pred_std)
|
||||
}
|
||||
|
||||
models[target_name] = ensemble
|
||||
else:
|
||||
# Single model with heavy regularization
|
||||
print(f" Using single model with heavy regularization...")
|
||||
|
||||
model = GradientBoostingRegressor(
|
||||
n_estimators=params.gb_n_estimators,
|
||||
max_depth=params.gb_max_depth,
|
||||
learning_rate=params.gb_learning_rate,
|
||||
subsample=params.gb_subsample,
|
||||
min_samples_leaf=params.gb_min_samples_leaf,
|
||||
min_samples_split=params.gb_min_samples_split,
|
||||
random_state=42
|
||||
)
|
||||
|
||||
model.fit(X_train, y_train[target_name])
|
||||
|
||||
y_pred = model.predict(X_test)
|
||||
|
||||
metrics[target_name] = {
|
||||
'r2': r2_score(y_test[target_name], y_pred),
|
||||
'rmse': np.sqrt(mean_squared_error(y_test[target_name], y_pred)),
|
||||
'mae': mean_absolute_error(y_test[target_name], y_pred)
|
||||
}
|
||||
|
||||
models[target_name] = model
|
||||
|
||||
training_times[target_name] = time.time() - start_time
|
||||
|
||||
print(f" R²: {metrics[target_name]['r2']:.4f}")
|
||||
print(f" RMSE: {metrics[target_name]['rmse']:.4f}")
|
||||
print(f" Time: {training_times[target_name]:.2f}s")
|
||||
|
||||
# Classification models
|
||||
for target_name in ['champion', 'catastrophic']:
|
||||
if y_train[target_name] is None:
|
||||
continue
|
||||
|
||||
print(f"\nTraining QLabs {target_name.upper()} classifier...")
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
import xgboost as xgb
|
||||
|
||||
if use_ensemble:
|
||||
print(f" Using XGBoost ensemble of {n_ensemble} models...")
|
||||
|
||||
xgb_params = {
|
||||
'n_estimators': params.gb_n_estimators,
|
||||
'max_depth': params.gb_max_depth,
|
||||
'learning_rate': params.gb_learning_rate,
|
||||
'reg_lambda': params.xgb_reg_lambda if use_heavy_reg else 1.0,
|
||||
'reg_alpha': params.xgb_reg_alpha if use_heavy_reg else 0.0,
|
||||
'colsample_bytree': params.xgb_colsample_bytree,
|
||||
'colsample_bylevel': params.xgb_colsample_bylevel,
|
||||
'use_label_encoder': False,
|
||||
'eval_metric': 'logloss'
|
||||
}
|
||||
|
||||
ensemble = DeepEnsemble(
|
||||
xgb.XGBClassifier,
|
||||
n_models=n_ensemble,
|
||||
seeds=[42 + i for i in range(n_ensemble)]
|
||||
)
|
||||
|
||||
ensemble.fit(X_train, y_train[target_name], **xgb_params)
|
||||
|
||||
# Evaluate
|
||||
y_pred = ensemble.predict(X_test)
|
||||
y_proba = ensemble.predict_proba(X_test)[:, 1]
|
||||
|
||||
metrics[target_name] = {
|
||||
'accuracy': accuracy_score(y_test[target_name], y_pred),
|
||||
'precision': precision_score(y_test[target_name], y_pred, zero_division=0),
|
||||
'recall': recall_score(y_test[target_name], y_pred, zero_division=0),
|
||||
'f1': f1_score(y_test[target_name], y_pred, zero_division=0),
|
||||
'auc': roc_auc_score(y_test[target_name], y_proba)
|
||||
}
|
||||
|
||||
models[target_name] = ensemble
|
||||
else:
|
||||
print(f" Using single XGBoost with heavy regularization...")
|
||||
|
||||
model = xgb.XGBClassifier(
|
||||
n_estimators=params.gb_n_estimators,
|
||||
max_depth=params.gb_max_depth,
|
||||
learning_rate=params.gb_learning_rate,
|
||||
reg_lambda=params.xgb_reg_lambda,
|
||||
reg_alpha=params.xgb_reg_alpha,
|
||||
use_label_encoder=False,
|
||||
eval_metric='logloss',
|
||||
random_state=42
|
||||
)
|
||||
|
||||
model.fit(X_train, y_train[target_name])
|
||||
|
||||
y_pred = model.predict(X_test)
|
||||
y_proba = model.predict_proba(X_test)[:, 1]
|
||||
|
||||
metrics[target_name] = {
|
||||
'accuracy': accuracy_score(y_test[target_name], y_pred),
|
||||
'precision': precision_score(y_test[target_name], y_pred, zero_division=0),
|
||||
'recall': recall_score(y_test[target_name], y_pred, zero_division=0),
|
||||
'f1': f1_score(y_test[target_name], y_pred, zero_division=0),
|
||||
'auc': roc_auc_score(y_test[target_name], y_proba)
|
||||
}
|
||||
|
||||
models[target_name] = model
|
||||
except ImportError:
|
||||
print(" XGBoost not available, using RandomForest...")
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
|
||||
model = RandomForestClassifier(
|
||||
n_estimators=params.gb_n_estimators,
|
||||
max_depth=params.gb_max_depth,
|
||||
random_state=42
|
||||
)
|
||||
|
||||
model.fit(X_train, y_train[target_name])
|
||||
|
||||
y_pred = model.predict(X_test)
|
||||
|
||||
metrics[target_name] = {
|
||||
'accuracy': accuracy_score(y_test[target_name], y_pred),
|
||||
'precision': precision_score(y_test[target_name], y_pred, zero_division=0),
|
||||
'recall': recall_score(y_test[target_name], y_pred, zero_division=0),
|
||||
'f1': f1_score(y_test[target_name], y_pred, zero_division=0)
|
||||
}
|
||||
|
||||
models[target_name] = model
|
||||
|
||||
training_times[target_name] = time.time() - start_time
|
||||
|
||||
print(f" Accuracy: {metrics[target_name]['accuracy']:.4f}")
|
||||
print(f" F1: {metrics[target_name]['f1']:.4f}")
|
||||
if 'auc' in metrics[target_name]:
|
||||
print(f" AUC: {metrics[target_name]['auc']:.4f}")
|
||||
print(f" Time: {training_times[target_name]:.2f}s")
|
||||
|
||||
return models, {'metrics': metrics, 'times': training_times}
|
||||
|
||||
|
||||
def compare_results(
|
||||
baseline_results: Dict[str, Any],
|
||||
qlabs_results: Dict[str, Any],
|
||||
output_dir: str
|
||||
) -> Dict[str, Any]:
|
||||
"""Compare baseline vs QLabs results and generate report."""
|
||||
print("\n" + "="*70)
|
||||
print("COMPARISON REPORT")
|
||||
print("="*70)
|
||||
|
||||
comparison = {
|
||||
'regression': {},
|
||||
'classification': {},
|
||||
'summary': {}
|
||||
}
|
||||
|
||||
# Compare regression metrics
|
||||
print("\n--- Regression Metrics ---")
|
||||
for target in ['roi', 'dd']:
|
||||
if target not in baseline_results['metrics'] or target not in qlabs_results['metrics']:
|
||||
continue
|
||||
|
||||
baseline = baseline_results['metrics'][target]
|
||||
qlabs = qlabs_results['metrics'][target]
|
||||
|
||||
comparison['regression'][target] = {
|
||||
'baseline_r2': baseline['r2'],
|
||||
'qlabs_r2': qlabs['r2'],
|
||||
'r2_improvement': qlabs['r2'] - baseline['r2'],
|
||||
'r2_improvement_pct': ((qlabs['r2'] - baseline['r2']) / abs(baseline['r2']) * 100) if baseline['r2'] != 0 else float('inf'),
|
||||
'baseline_rmse': baseline['rmse'],
|
||||
'qlabs_rmse': qlabs['rmse'],
|
||||
'rmse_improvement': baseline['rmse'] - qlabs['rmse'],
|
||||
}
|
||||
|
||||
print(f"\n{target.upper()}:")
|
||||
print(f" R² - Baseline: {baseline['r2']:.4f}, QLabs: {qlabs['r2']:.4f}")
|
||||
print(f" Improvement: {comparison['regression'][target]['r2_improvement']:.4f} ({comparison['regression'][target]['r2_improvement_pct']:+.1f}%)")
|
||||
print(f" RMSE - Baseline: {baseline['rmse']:.4f}, QLabs: {qlabs['rmse']:.4f}")
|
||||
print(f" Improvement: {comparison['regression'][target]['rmse_improvement']:.4f}")
|
||||
|
||||
# Compare classification metrics
|
||||
print("\n--- Classification Metrics ---")
|
||||
for target in ['champion', 'catastrophic']:
|
||||
if target not in baseline_results['metrics'] or target not in qlabs_results['metrics']:
|
||||
continue
|
||||
|
||||
baseline = baseline_results['metrics'][target]
|
||||
qlabs = qlabs_results['metrics'][target]
|
||||
|
||||
comparison['classification'][target] = {
|
||||
'baseline_f1': baseline['f1'],
|
||||
'qlabs_f1': qlabs['f1'],
|
||||
'f1_improvement': qlabs['f1'] - baseline['f1'],
|
||||
'baseline_accuracy': baseline['accuracy'],
|
||||
'qlabs_accuracy': qlabs['accuracy'],
|
||||
'accuracy_improvement': qlabs['accuracy'] - baseline['accuracy'],
|
||||
}
|
||||
|
||||
if 'auc' in baseline and 'auc' in qlabs:
|
||||
comparison['classification'][target]['baseline_auc'] = baseline['auc']
|
||||
comparison['classification'][target]['qlabs_auc'] = qlabs['auc']
|
||||
comparison['classification'][target]['auc_improvement'] = qlabs['auc'] - baseline['auc']
|
||||
|
||||
print(f"\n{target.upper()}:")
|
||||
print(f" F1 - Baseline: {baseline['f1']:.4f}, QLabs: {qlabs['f1']:.4f}")
|
||||
print(f" Improvement: {comparison['classification'][target]['f1_improvement']:+.4f}")
|
||||
print(f" Accuracy - Baseline: {baseline['accuracy']:.4f}, QLabs: {qlabs['accuracy']:.4f}")
|
||||
print(f" Improvement: {comparison['classification'][target]['accuracy_improvement']:+.4f}")
|
||||
|
||||
if 'auc' in baseline and 'auc' in qlabs:
|
||||
print(f" AUC - Baseline: {baseline['auc']:.4f}, QLabs: {qlabs['auc']:.4f}")
|
||||
|
||||
# Overall summary
|
||||
print("\n--- Overall Summary ---")
|
||||
|
||||
avg_r2_improvement = np.mean([
|
||||
v['r2_improvement'] for v in comparison['regression'].values()
|
||||
]) if comparison['regression'] else 0
|
||||
|
||||
avg_f1_improvement = np.mean([
|
||||
v['f1_improvement'] for v in comparison['classification'].values()
|
||||
]) if comparison['classification'] else 0
|
||||
|
||||
comparison['summary'] = {
|
||||
'avg_r2_improvement': avg_r2_improvement,
|
||||
'avg_f1_improvement': avg_f1_improvement,
|
||||
'regression_models': len(comparison['regression']),
|
||||
'classification_models': len(comparison['classification'])
|
||||
}
|
||||
|
||||
print(f"\nAverage R² Improvement: {avg_r2_improvement:+.4f}")
|
||||
print(f"Average F1 Improvement: {avg_f1_improvement:+.4f}")
|
||||
|
||||
# Save report
|
||||
output_path = Path(output_dir)
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(output_path / "comparison_report.json", 'w') as f:
|
||||
json.dump(comparison, f, indent=2)
|
||||
|
||||
# Save markdown report
|
||||
with open(output_path / "comparison_report.md", 'w') as f:
|
||||
f.write("# QLabs Enhancement Benchmark Report\n\n")
|
||||
f.write(f"**Date:** {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}\n\n")
|
||||
|
||||
f.write("## Summary\n\n")
|
||||
f.write(f"- Average R² Improvement: {avg_r2_improvement:+.4f}\n")
|
||||
f.write(f"- Average F1 Improvement: {avg_f1_improvement:+.4f}\n")
|
||||
f.write(f"- Regression Models Tested: {comparison['summary']['regression_models']}\n")
|
||||
f.write(f"- Classification Models Tested: {comparison['summary']['classification_models']}\n\n")
|
||||
|
||||
f.write("## Regression Results\n\n")
|
||||
f.write("| Target | Baseline R² | QLabs R² | Improvement |\n")
|
||||
f.write("|--------|-------------|----------|-------------|\n")
|
||||
for target, results in comparison['regression'].items():
|
||||
f.write(f"| {target.upper()} | {results['baseline_r2']:.4f} | {results['qlabs_r2']:.4f} | {results['r2_improvement']:+.4f} |\n")
|
||||
|
||||
f.write("\n## Classification Results\n\n")
|
||||
f.write("| Target | Baseline F1 | QLabs F1 | Improvement |\n")
|
||||
f.write("|--------|-------------|----------|-------------|\n")
|
||||
for target, results in comparison['classification'].items():
|
||||
f.write(f"| {target.upper()} | {results['baseline_f1']:.4f} | {results['qlabs_f1']:.4f} | {results['f1_improvement']:+.4f} |\n")
|
||||
|
||||
f.write("\n## QLabs Techniques Applied\n\n")
|
||||
f.write("1. **Muon Optimizer**: Orthogonalized gradient updates via Newton-Schulz iteration\n")
|
||||
f.write("2. **Heavy Regularization**: 16x weight decay (reg_lambda=1.6)\n")
|
||||
f.write("3. **Epoch Shuffling**: 12 epochs with reshuffling\n")
|
||||
f.write("4. **SwiGLU Activation**: Gated MLP activations (where applicable)\n")
|
||||
f.write("5. **U-Net Skip Connections**: Residual pathways (where applicable)\n")
|
||||
f.write("6. **Deep Ensembling**: Logit averaging across 8 models\n")
|
||||
|
||||
print(f"\n[OK] Comparison report saved to {output_dir}")
|
||||
|
||||
return comparison
|
||||
|
||||
|
||||
def main():
|
||||
"""Main benchmark function."""
|
||||
parser = argparse.ArgumentParser(description='Benchmark QLabs-enhanced MC Forewarning')
|
||||
parser.add_argument('--data-dir', type=str, default='mc_results',
|
||||
help='Directory with MC trial corpus')
|
||||
parser.add_argument('--output-dir', type=str, default='mc_forewarning_qlabs_fork/benchmark_results',
|
||||
help='Directory for benchmark results')
|
||||
parser.add_argument('--test-size', type=float, default=0.2,
|
||||
help='Fraction of data for testing')
|
||||
parser.add_argument('--skip-baseline', action='store_true',
|
||||
help='Skip baseline training (use cached)')
|
||||
parser.add_argument('--skip-qlabs', action='store_true',
|
||||
help='Skip QLabs training (use cached)')
|
||||
parser.add_argument('--ensemble-size', type=int, default=8,
|
||||
help='Number of models in ensemble (QLabs)')
|
||||
parser.add_argument('--no-ensemble', action='store_true',
|
||||
help='Disable ensemble (use single models)')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
print("="*70)
|
||||
print("QLABS ENHANCEMENT BENCHMARK FOR MC FOREWARNING")
|
||||
print("="*70)
|
||||
print(f"\nConfiguration:")
|
||||
print(f" Data Directory: {args.data_dir}")
|
||||
print(f" Output Directory: {args.output_dir}")
|
||||
print(f" Test Size: {args.test_size}")
|
||||
ensemble_display = f"{args.ensemble_size}" if not args.no_ensemble else "1 (disabled)"
|
||||
print(f" Ensemble Size: {ensemble_display}")
|
||||
|
||||
# Load corpus
|
||||
print("\n[1/5] Loading corpus...")
|
||||
try:
|
||||
df = load_corpus(args.data_dir)
|
||||
except ValueError as e:
|
||||
print(f"[ERROR] {e}")
|
||||
print("\nTo run benchmark, first generate MC trial data:")
|
||||
print(f" python -c \"from mc.mc_runner import run_mc_envelope; run_mc_envelope(n_samples_per_switch=100)\"")
|
||||
return 1
|
||||
|
||||
# Prepare features
|
||||
print("\n[2/5] Preparing features...")
|
||||
X, targets = prepare_features(df)
|
||||
|
||||
# Split data
|
||||
indices = np.arange(len(X))
|
||||
train_idx, test_idx = train_test_split(indices, test_size=args.test_size, random_state=42)
|
||||
|
||||
X_train, X_test = X[train_idx], X[test_idx]
|
||||
y_train = {k: v[train_idx] if v is not None else None for k, v in targets.items()}
|
||||
y_test = {k: v[test_idx] if v is not None else None for k, v in targets.items()}
|
||||
|
||||
print(f" Training samples: {len(X_train)}")
|
||||
print(f" Test samples: {len(X_test)}")
|
||||
|
||||
# Train baseline models
|
||||
if not args.skip_baseline:
|
||||
print("\n[3/5] Training baseline models...")
|
||||
baseline_models, baseline_results = train_baseline_models(X_train, y_train, X_test, y_test)
|
||||
else:
|
||||
print("\n[3/5] Skipping baseline training (--skip-baseline)")
|
||||
baseline_results = {'metrics': {}, 'times': {}}
|
||||
|
||||
# Train QLabs models
|
||||
if not args.skip_qlabs:
|
||||
print("\n[4/5] Training QLabs-enhanced models...")
|
||||
qlabs_models, qlabs_results = train_qlabs_models(
|
||||
X_train, y_train, X_test, y_test,
|
||||
use_ensemble=not args.no_ensemble,
|
||||
n_ensemble=args.ensemble_size,
|
||||
use_heavy_reg=True
|
||||
)
|
||||
else:
|
||||
print("\n[4/5] Skipping QLabs training (--skip-qlabs)")
|
||||
qlabs_results = {'metrics': {}, 'times': {}}
|
||||
|
||||
# Compare results
|
||||
print("\n[5/5] Generating comparison report...")
|
||||
comparison = compare_results(baseline_results, qlabs_results, args.output_dir)
|
||||
|
||||
print("\n" + "="*70)
|
||||
print("BENCHMARK COMPLETE")
|
||||
print("="*70)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user