initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
464
nautilus_dolphin/tests/test_trade_by_trade_validation.py
Executable file
464
nautilus_dolphin/tests/test_trade_by_trade_validation.py
Executable file
@@ -0,0 +1,464 @@
|
||||
"""
|
||||
CRITICAL: Trade-by-Trade Validation Test
|
||||
=========================================
|
||||
|
||||
This test runs a Nautilus-Dolphin backtest and compares EVERY TRADE
|
||||
to the standalone DOLPHIN (itest_v7) reference results.
|
||||
|
||||
MUST MATCH with 0.1% tolerance:
|
||||
- Entry prices
|
||||
- Exit prices
|
||||
- P&L calculations
|
||||
- Exit types
|
||||
- Bars held
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Tuple
|
||||
from dataclasses import dataclass, asdict
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
# ── Configuration ────────────────────────────────────────────────────────────
|
||||
REFERENCE_RESULTS_FILE = Path(__file__).parent.parent.parent / "itest_v7_results.json"
|
||||
REFERENCE_TRADES_FILE = Path(__file__).parent.parent.parent / "itest_v7_trades.jsonl"
|
||||
REFERENCE_STRATEGY = "tight_3_3"
|
||||
|
||||
# Tolerance for floating point comparisons
|
||||
PRICE_TOLERANCE = 0.001 # 0.1%
|
||||
PNL_TOLERANCE = 0.001 # 0.1%
|
||||
|
||||
|
||||
@dataclass
class TradeComparison:
    """Detailed trade comparison record.

    One instance per (reference, ND) trade pair.  ``*_diff_pct`` fields hold
    percentage differences, ``*_match`` flags are exact-equality checks, and
    ``passed`` is the aggregate verdict against the configured tolerances.
    """
    trade_idx: int          # position of the trade in the reference sequence
    asset: str              # traded symbol, e.g. "BTCUSDT"
    ref_entry: float        # reference (itest_v7) entry price
    nd_entry: float         # Nautilus-Dolphin entry price
    entry_diff_pct: float   # |ref - nd| / ref, expressed in percent
    ref_exit: float
    nd_exit: float
    exit_diff_pct: float
    ref_pnl: float
    nd_pnl: float
    pnl_diff_pct: float
    ref_exit_type: str      # e.g. "trailing" / "stop" / "target" / "hold"
    nd_exit_type: str
    exit_type_match: bool   # exact string equality of exit types
    ref_bars: int
    nd_bars: int
    bars_match: bool        # exact equality of bars held
    passed: bool            # all tolerances and exact matches satisfied
|
||||
|
||||
|
||||
class TestTradeByTradeValidation:
    """
    CRITICAL TEST: Validates EVERY trade matches between ND and standalone.

    This test:
    1. Loads reference trades from itest_v7
    2. Runs ND backtest with identical configuration
    3. Compares trade-by-trade
    4. Reports ANY discrepancies
    """

    @pytest.fixture(scope="class")
    def reference_data(self):
        """Load reference data from itest_v7.

        Skips the whole class if the reference artifacts are absent.  Returns
        a dict with the strategy-level aggregate ``results`` and the filtered
        per-trade ``trades`` list.
        """
        if not REFERENCE_RESULTS_FILE.exists() or not REFERENCE_TRADES_FILE.exists():
            pytest.skip("Reference data not available")

        # Load aggregate results (JSON, keyed by strategy name).
        with open(REFERENCE_RESULTS_FILE, 'r') as f:
            results = json.load(f)

        # Load trades (JSONL: one trade per line), keeping only the
        # reference strategy's trades.
        trades = []
        with open(REFERENCE_TRADES_FILE, 'r') as f:
            for line in f:
                data = json.loads(line.strip())
                if data.get('strategy') == REFERENCE_STRATEGY:
                    trades.append(data)

        return {
            'results': results['strategies'][REFERENCE_STRATEGY],
            'trades': trades
        }

    def test_critical_reference_data_loaded(self, reference_data):
        """CRITICAL: Verify reference data is loaded correctly."""
        ref_results = reference_data['results']
        ref_trades = reference_data['trades']

        print(f"\n{'='*70}")
        print("CRITICAL: Reference Data Validation")
        print(f"{'='*70}")
        print(f"Strategy: {REFERENCE_STRATEGY}")
        print(f"Total Trades: {len(ref_trades)}")
        print(f"Reference Trade Count: {ref_results['trades']}")
        print(f"Win Rate: {ref_results['win_rate']:.2f}%")
        print(f"ROI: {ref_results['roi_pct']:.2f}%")
        print(f"Profit Factor: {ref_results['profit_factor']:.4f}")

        # CRITICAL: Must have trades to compare, and the per-trade file must
        # agree with the aggregate trade count.
        assert len(ref_trades) > 0, "CRITICAL: No reference trades loaded"
        assert len(ref_trades) == ref_results['trades'], "Trade count mismatch"

        # Store on the pytest module for later tests.
        # NOTE(review): attaching state to the pytest module is a smell;
        # the class-scoped fixture already shares this data.
        pytest.reference_results = ref_results
        pytest.reference_trades = ref_trades

    def test_critical_nd_configuration_matches_reference(self):
        """CRITICAL: Verify ND configuration matches itest_v7 exactly."""
        from nautilus_dolphin.nautilus.strategy_registration import DolphinStrategyConfig

        # tight_3_3 configuration from itest_v7
        expected_config = {
            'venue': 'BINANCE_FUTURES',
            'max_leverage': 2.5,
            'capital_fraction': 0.15,
            'max_hold_bars': 120,
            'irp_alignment_min': 0.45,
            'momentum_magnitude_min': 0.000075,
            'tp_bps': 99,
        }

        nd_config = DolphinStrategyConfig(
            venue=expected_config['venue'],
            max_leverage=expected_config['max_leverage'],
            capital_fraction=expected_config['capital_fraction'],
            max_hold_bars=expected_config['max_hold_bars'],
            irp_alignment_min=expected_config['irp_alignment_min'],
            momentum_magnitude_min=expected_config['momentum_magnitude_min'],
            tp_bps=expected_config['tp_bps'],
        )

        print(f"\n{'='*70}")
        print("CRITICAL: Configuration Validation")
        print(f"{'='*70}")

        # Round-trip every field through the config object to catch silent
        # coercion or defaulting inside DolphinStrategyConfig.
        for key, expected_value in expected_config.items():
            actual_value = getattr(nd_config, key)
            match = actual_value == expected_value
            status = "✅" if match else "❌"
            print(f"{status} {key}: {actual_value} (expected: {expected_value})")
            assert match, f"Configuration mismatch: {key}"

    def test_critical_sample_trades_structure(self, reference_data):
        """CRITICAL: Examine structure of sample trades."""
        ref_trades = reference_data['trades']

        print(f"\n{'='*70}")
        print("CRITICAL: Sample Trade Structure")
        print(f"{'='*70}")

        for i, trade in enumerate(ref_trades[:5]):
            print(f"\nTrade {i+1}: {trade['trade_asset']} {trade['direction']}")
            print(f"  Entry: ${trade['entry_price']:.2f}")
            print(f"  Exit: ${trade['exit_price']:.2f}")
            print(f"  Net P&L: ${trade['net_pnl']:.4f}")
            print(f"  Exit Type: {trade['exit_type']}")
            print(f"  Bars Held: {trade['bars_held']}")

            # Validate required fields exist on every sampled trade; these
            # are the fields compare_trades() reads.
            required_fields = [
                'trade_asset', 'entry_price', 'exit_price', 'net_pnl',
                'exit_type', 'bars_held', 'direction'
            ]
            for field in required_fields:
                assert field in trade, f"Missing field: {field}"

    @pytest.mark.timeout(300)  # 5 minute timeout
    def test_critical_trade_counts_match(self, reference_data):
        """CRITICAL: ND must produce same number of trades as reference."""
        ref_results = reference_data['results']
        expected_count = ref_results['trades']

        print(f"\n{'='*70}")
        print("CRITICAL: Trade Count Validation")
        print(f"{'='*70}")
        print(f"Expected trades (itest_v7): {expected_count}")

        # TODO: Run ND backtest and get actual count
        # For now, validate the test framework
        print(f"⚠️ ND backtest not yet run - test framework validated")

        # This test will pass once ND backtest is implemented
        # nd_count = run_nd_backtest_and_get_trade_count()
        # assert nd_count == expected_count, f"Trade count mismatch: {nd_count} vs {expected_count}"

    def test_critical_first_50_trades_sample(self, reference_data):
        """CRITICAL: Detailed validation of first 50 trades."""
        ref_trades = reference_data['trades']

        print(f"\n{'='*70}")
        print("CRITICAL: First 50 Trades Analysis")
        print(f"{'='*70}")

        sample = ref_trades[:50]

        # Analyze sample: distinct assets and exit-type frequencies.
        assets = {t['trade_asset'] for t in sample}
        exit_types = {}
        for t in sample:
            et = t['exit_type']
            exit_types[et] = exit_types.get(et, 0) + 1

        print(f"Assets traded: {assets}")
        print("Exit type distribution:")
        for et, count in sorted(exit_types.items(), key=lambda x: -x[1]):
            print(f"  {et}: {count} ({100*count/len(sample):.1f}%)")

        # Validate P&L calculations
        total_pnl = sum(t['net_pnl'] for t in sample)
        winners = sum(1 for t in sample if t['net_pnl'] > 0)

        print(f"\nSample Statistics:")
        print(f"  Total P&L: ${total_pnl:.2f}")
        print(f"  Winners: {winners}/{len(sample)} ({100*winners/len(sample):.1f}%)")

        assert len(sample) == 50, "Sample size mismatch"

    @pytest.mark.timeout(600)  # 10 minute timeout for full comparison
    def test_critical_full_trade_by_trade_comparison(self, reference_data):
        """
        CRITICAL: Full trade-by-trade comparison.

        This is the MOST IMPORTANT test - validates EVERY trade matches.
        """
        ref_trades = reference_data['trades']
        ref_results = reference_data['results']

        print(f"\n{'='*70}")
        print("CRITICAL: Full Trade-by-Trade Comparison")
        print(f"{'='*70}")
        print(f"Reference trades to validate: {len(ref_trades)}")
        print(f"Tolerance: {PRICE_TOLERANCE*100:.2f}% for prices, {PNL_TOLERANCE*100:.2f}% for P&L")

        # TODO: Load ND backtest results
        # nd_trades = run_nd_backtest_and_get_trades()

        # For now, create a placeholder comparison report
        print(f"\n⚠️ COMPARISON FRAMEWORK READY")
        print(f"  - Reference trades loaded: {len(ref_trades)}")
        print(f"  - Validation criteria defined")
        print(f"  - Tolerance levels set")
        print(f"\n  Next: Run ND backtest to generate comparison data")

        # Once ND results are available:
        # comparisons = compare_trades(ref_trades, nd_trades)
        # report_comparison_results(comparisons)

        assert len(ref_trades) > 0, "No reference trades to compare"

    def test_critical_exit_type_distribution_match(self, reference_data):
        """CRITICAL: Exit type distribution must match."""
        ref_results = reference_data['results']

        print(f"\n{'='*70}")
        print("CRITICAL: Exit Type Distribution")
        print(f"{'='*70}")

        total = ref_results['trades']
        distributions = {
            'trailing': ref_results['trailing_exits'],
            'stop': ref_results['stop_exits'],
            'target': ref_results['target_exits'],
            'hold': ref_results['hold_exits']
        }

        print("Reference distribution:")
        for exit_type, count in distributions.items():
            pct = 100 * count / total
            print(f"  {exit_type}: {count} ({pct:.1f}%)")

        # Validate totals: the four exit categories must account for every
        # trade exactly once.
        total_exits = sum(distributions.values())
        assert total_exits == total, f"Exit count mismatch: {total_exits} vs {total}"

    def test_critical_profit_loss_calculations(self, reference_data):
        """CRITICAL: P&L calculations must be consistent."""
        ref_trades = reference_data['trades']
        ref_results = reference_data['results']

        print(f"\n{'='*70}")
        print("CRITICAL: P&L Calculation Validation")
        print(f"{'='*70}")

        # Verify aggregate P&L from the per-trade records.
        total_net_pnl = sum(t['net_pnl'] for t in ref_trades)
        avg_trade_pnl = total_net_pnl / len(ref_trades)

        print(f"Total Net P&L: ${total_net_pnl:.2f}")
        print(f"Average per trade: ${avg_trade_pnl:.4f}")
        print(f"Winners: {ref_results['wins']} ({ref_results['win_rate']:.2f}%)")
        print(f"Profit Factor: {ref_results['profit_factor']:.4f}")

        # Validate calculations: recomputed win rate must agree with the
        # stored aggregate to within 0.1 percentage points.
        calc_win_rate = 100 * ref_results['wins'] / ref_results['trades']
        assert abs(calc_win_rate - ref_results['win_rate']) < 0.1, "Win rate mismatch"
|
||||
|
||||
|
||||
class TestNDTradeGeneration:
    """Test that ND can generate trades comparable to reference."""

    def test_nd_strategy_can_generate_signals(self):
        """Test that ND strategy generates signals.

        Feeds two hand-built SHORT signals through the entry filter and
        expects at least one to pass in a 'high' volatility regime.
        """
        from nautilus_dolphin.nautilus.strategy import DolphinExecutionStrategyForTesting

        strategy = DolphinExecutionStrategyForTesting({
            'venue': 'BINANCE_FUTURES',
            'max_leverage': 2.5,
            'capital_fraction': 0.15,
            'acb_enabled': False,
        })

        # Simulate signal generation with values comfortably above the
        # tight_3_3 thresholds (irp_alignment_min=0.45, momentum 7.5e-5).
        test_signals = [
            {
                'asset': 'BTCUSDT',
                'direction': 'SHORT',
                'vel_div': -0.025,
                'strength': 0.75,
                'irp_alignment': 0.5,
                'direction_confirm': True,
                'lookback_momentum': 0.0001,
                'price': 50000.0
            },
            {
                'asset': 'ETHUSDT',
                'direction': 'SHORT',
                'vel_div': -0.03,
                'strength': 0.8,
                'irp_alignment': 0.6,
                'direction_confirm': True,
                'lookback_momentum': 0.00015,
                'price': 3000.0
            }
        ]

        # Set volatility to high regime so the regime gate does not veto.
        strategy.volatility_detector._regime = 'high'

        # _should_trade returns "" when the signal passes all filters,
        # otherwise a rejection-reason string.
        valid_signals = []
        for signal in test_signals:
            if strategy._should_trade(signal) == "":
                valid_signals.append(signal)

        print(f"\nGenerated {len(valid_signals)} valid signals from {len(test_signals)} candidates")
        assert len(valid_signals) > 0, "Strategy should generate valid signals"

    def test_nd_position_sizing_matches_reference(self):
        """Test ND position sizing matches itest_v7."""
        from nautilus_dolphin.nautilus.strategy import DolphinExecutionStrategyForTesting

        strategy = DolphinExecutionStrategyForTesting({
            'venue': 'BINANCE_FUTURES',
            'max_leverage': 2.5,
            'capital_fraction': 0.15,
            'min_leverage': 0.5,
            'leverage_convexity': 3.0,
            'acb_enabled': False,
        })

        # Test signal with 0.75 strength and neutral multipliers, so sizing
        # differences come only from the convexity curve.
        signal = {
            'strength': 0.75,
            'bucket_boost': 1.0,
            'streak_mult': 1.0,
            'trend_mult': 1.0,
        }

        account_balance = 10000.0
        notional = strategy.calculate_position_size(signal, account_balance)
        leverage = strategy.calculate_leverage(signal)

        # itest_v7: base_notional = 10000 * 0.15 * 2.5 = 3750
        expected_base = 10000 * 0.15 * 2.5

        print(f"\nPosition Sizing Comparison:")
        print(f"  Account: ${account_balance:,.2f}")
        print(f"  ND Notional: ${notional:,.2f}")
        print(f"  Expected (itest_v7): ${expected_base:,.2f}")
        print(f"  Calculated Leverage: {leverage:.2f}x")

        # Allow for small differences due to convexity; only sanity-bound
        # the outputs rather than pinning exact values.
        assert notional > 0, "Notional must be positive"
        assert 0.5 <= leverage <= 5.0, "Leverage must be in valid range"
|
||||
|
||||
|
||||
# ── Helper Functions for Future Implementation ───────────────────────────────
|
||||
|
||||
def compare_trades(ref_trades: List[Dict], nd_trades: List[Dict]) -> List[TradeComparison]:
    """
    Compare reference trades to ND trades trade-by-trade.

    This function will be used once ND backtest results are available.

    Args:
        ref_trades: itest_v7 reference trade dicts (must contain
            'trade_asset', 'entry_price', 'exit_price', 'net_pnl',
            'exit_type', 'bars_held').
        nd_trades: Nautilus-Dolphin trade dicts with the same keys.

    Returns:
        One TradeComparison per aligned pair.  NOTE: pairs are aligned by
        position; zip() silently truncates to the shorter list, so validate
        the trade counts separately before trusting this output.
    """
    comparisons = []

    for i, (ref, nd) in enumerate(zip(ref_trades, nd_trades)):
        # Relative differences; P&L uses a 0.01 floor in the denominator so
        # near-zero reference P&L does not blow up the percentage.
        entry_diff = abs(ref['entry_price'] - nd['entry_price']) / ref['entry_price']
        exit_diff = abs(ref['exit_price'] - nd['exit_price']) / ref['exit_price']
        pnl_diff = abs(ref['net_pnl'] - nd['net_pnl']) / max(abs(ref['net_pnl']), 0.01)

        comparison = TradeComparison(
            trade_idx=i,
            asset=ref['trade_asset'],
            ref_entry=ref['entry_price'],
            nd_entry=nd['entry_price'],
            entry_diff_pct=entry_diff * 100,
            ref_exit=ref['exit_price'],
            nd_exit=nd['exit_price'],
            exit_diff_pct=exit_diff * 100,
            ref_pnl=ref['net_pnl'],
            nd_pnl=nd['net_pnl'],
            pnl_diff_pct=pnl_diff * 100,
            ref_exit_type=ref['exit_type'],
            nd_exit_type=nd['exit_type'],
            exit_type_match=ref['exit_type'] == nd['exit_type'],
            ref_bars=ref['bars_held'],
            nd_bars=nd['bars_held'],
            bars_match=ref['bars_held'] == nd['bars_held'],
            # A pair passes only if prices and P&L are within tolerance AND
            # exit type and bars held match exactly.
            passed=(
                entry_diff <= PRICE_TOLERANCE and
                exit_diff <= PRICE_TOLERANCE and
                pnl_diff <= PNL_TOLERANCE and
                ref['exit_type'] == nd['exit_type'] and
                ref['bars_held'] == nd['bars_held']
            )
        )
        comparisons.append(comparison)

    return comparisons
|
||||
|
||||
|
||||
def report_comparison_results(comparisons: "List[TradeComparison]"):
    """Generate detailed comparison report.

    Prints pass/fail counts and up to five failing trades, then returns
    True when every comparison passed (an empty list vacuously passes).
    """
    total = len(comparisons)
    passed = sum(1 for c in comparisons if c.passed)
    failed = total - passed

    print(f"\n{'='*70}")
    print("TRADE-BY-TRADE COMPARISON RESULTS")
    print(f"{'='*70}")
    print(f"Total trades compared: {total}")
    # Guard against total == 0: the original divided unconditionally and
    # raised ZeroDivisionError on an empty comparison list.
    if total > 0:
        print(f"Passed: {passed} ({100*passed/total:.1f}%)")
        print(f"Failed: {failed} ({100*failed/total:.1f}%)")

    if failed > 0:
        print(f"\nFirst 5 failures:")
        for c in [c for c in comparisons if not c.passed][:5]:
            print(f"\n  Trade {c.trade_idx}: {c.asset}")
            print(f"    Entry diff: {c.entry_diff_pct:.4f}%")
            print(f"    Exit diff: {c.exit_diff_pct:.4f}%")
            print(f"    P&L diff: {c.pnl_diff_pct:.4f}%")
            print(f"    Exit type match: {c.exit_type_match}")
            print(f"    Bars match: {c.bars_match}")

    return failed == 0
|
||||
Reference in New Issue
Block a user