Files
DOLPHIN/nautilus_dolphin/tests/test_nd_vs_standalone_comparison.py

514 lines
20 KiB
Python
Raw Normal View History

"""
CRITICAL TEST: Nautilus-Dolphin vs Standalone DOLPHIN Comparison
================================================================
This test verifies that Nautilus-Dolphin produces IDENTICAL results
to the standalone DOLPHIN implementation (itest_v7.py).
MUST MATCH:
- Trade count
- Win rate
- Profit factor
- ROI
- Entry/exit prices
- P&L per trade
- Exit types
"""
import json
import pytest
from pathlib import Path
from typing import Dict, List, Any
from dataclasses import dataclass
# ── Configuration ────────────────────────────────────────────────────────────
# Match the itest_v7 "tight_3_3" strategy configuration
REFERENCE_STRATEGY: str = "tight_3_3"  # key looked up under 'strategies' in the results JSON
# Reference artifacts emitted by the standalone itest_v7 run; located three
# directory levels above this test file (repo root, presumably — TODO confirm).
REFERENCE_RESULTS_FILE = Path(__file__).parent.parent.parent / "itest_v7_results.json"
REFERENCE_TRADES_FILE = Path(__file__).parent.parent.parent / "itest_v7_trades.jsonl"
TOLERANCE_PCT = 0.001 # 0.1% tolerance for floating point differences
@dataclass
class Trade:
    """Trade record for comparison.

    One record from the itest_v7 trades JSONL file; field names match the
    JSON keys exactly (see load_reference_data()).
    """
    strategy: str     # strategy name, e.g. "tight_3_3"
    date: str
    scan_idx: int     # index of the scan that produced this trade's signal
    direction: str
    entry_price: float
    exit_price: float
    exit_type: str    # one of: 'trailing_stop', 'stop_loss', 'target', 'max_hold'
    bars_held: int
    leverage: float   # e.g. 2.5 for the tight_3_3 configuration
    notional: float
    gross_pnl: float  # P&L before fees; net_pnl = gross_pnl - fees
    fees: float
    net_pnl: float
    is_winner: bool   # True iff net_pnl > 0 (strictly positive)
    trade_asset: str  # instrument symbol, e.g. "BTCUSDT"
@dataclass
class StrategyMetrics:
    """Strategy metrics for comparison.

    Aggregated per-strategy numbers from the itest_v7 results JSON; the four
    *_exits counters are expected to sum to `trades`.
    """
    name: str
    capital: float        # final capital in dollars (positive)
    roi_pct: float        # return on investment, in percent
    trades: int           # total number of closed trades
    wins: int
    win_rate: float       # in percent (0-100), not a fraction
    profit_factor: float
    avg_win: float        # average winning-trade P&L, dollars
    avg_loss: float       # average losing-trade P&L, dollars
    stop_exits: int       # trades closed by hard stop-loss
    trailing_exits: int   # trades closed by trailing stop
    target_exits: int     # trades closed at profit target
    hold_exits: int       # trades closed by max-hold-bars timeout
# Global storage for loaded data (lazy, process-wide cache shared by all tests)
_ref_results = None
_ref_trades = None


def load_reference_data():
    """Load and cache the standalone itest_v7 reference artifacts.

    Loads each file at most once per process; subsequent calls return the
    cached objects.  Only trades belonging to REFERENCE_STRATEGY are kept.

    Returns:
        tuple: (results dict or None, list[Trade] or None).  An element is
        None when its file does not exist, letting callers pytest.skip.
    """
    global _ref_results, _ref_trades
    if _ref_results is None and REFERENCE_RESULTS_FILE.exists():
        with open(REFERENCE_RESULTS_FILE, 'r', encoding='utf-8') as f:
            _ref_results = json.load(f)
    if _ref_trades is None and REFERENCE_TRADES_FILE.exists():
        _ref_trades = []
        with open(REFERENCE_TRADES_FILE, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # BUG FIX: a blank line (common as a trailing newline in
                    # JSONL files) used to crash json.loads with a ValueError.
                    continue
                data = json.loads(line)
                if data.get('strategy') != REFERENCE_STRATEGY:
                    continue
                _ref_trades.append(Trade(
                    strategy=data['strategy'],
                    date=data['date'],
                    scan_idx=data['scan_idx'],
                    direction=data['direction'],
                    entry_price=data['entry_price'],
                    exit_price=data['exit_price'],
                    exit_type=data['exit_type'],
                    bars_held=data['bars_held'],
                    leverage=data['leverage'],
                    notional=data['notional'],
                    gross_pnl=data['gross_pnl'],
                    fees=data['fees'],
                    net_pnl=data['net_pnl'],
                    is_winner=data['is_winner'],
                    trade_asset=data['trade_asset']
                ))
    return _ref_results, _ref_trades
class TestNDvsStandaloneComparison:
    """Test Nautilus-Dolphin matches standalone DOLPHIN results.

    These tests validate the REFERENCE data itself (structure, internal
    consistency) and the ND configuration; the full ND-vs-reference diff
    lives in TestFullNDvsStandaloneBacktest.
    """

    def test_reference_results_exist(self):
        """Verify reference results file exists and has expected structure."""
        reference_results, _ = load_reference_data()
        if reference_results is None:
            pytest.skip(f"Reference results not found: {REFERENCE_RESULTS_FILE}")
        assert 'strategies' in reference_results
        assert REFERENCE_STRATEGY in reference_results['strategies']
        assert 'total_scans' in reference_results
        print(f"\nReference data loaded: {reference_results['total_scans']} scans")

    def test_reference_trades_exist(self):
        """Verify reference trades exist for the strategy."""
        _, reference_trades = load_reference_data()
        if reference_trades is None:
            pytest.skip(f"Reference trades not found: {REFERENCE_TRADES_FILE}")
        assert len(reference_trades) > 0
        print(f"\nReference trades loaded: {len(reference_trades)} trades for {REFERENCE_STRATEGY}")

    def test_strategy_metrics_match(self):
        """Verify ND produces matching high-level metrics.

        This test compares:
        - Trade count
        - Win rate
        - Profit factor
        - ROI
        """
        reference_results, _ = load_reference_data()
        if reference_results is None:
            pytest.skip("Reference results not available")
        ref_strategy = reference_results['strategies'][REFERENCE_STRATEGY]
        # Store reference metrics for comparison
        ref_metrics = StrategyMetrics(
            name=REFERENCE_STRATEGY,
            capital=ref_strategy['capital'],
            roi_pct=ref_strategy['roi_pct'],
            trades=ref_strategy['trades'],
            wins=ref_strategy['wins'],
            win_rate=ref_strategy['win_rate'],
            profit_factor=ref_strategy['profit_factor'],
            avg_win=ref_strategy['avg_win'],
            avg_loss=ref_strategy['avg_loss'],
            stop_exits=ref_strategy['stop_exits'],
            trailing_exits=ref_strategy['trailing_exits'],
            target_exits=ref_strategy['target_exits'],
            hold_exits=ref_strategy['hold_exits']
        )
        # Log reference metrics
        print(f"\n{'='*60}")
        print(f"Reference Strategy: {REFERENCE_STRATEGY}")
        print(f"{'='*60}")
        print(f"Capital: ${ref_metrics.capital:,.2f}")
        print(f"ROI: {ref_metrics.roi_pct:.2f}%")
        print(f"Trades: {ref_metrics.trades}")
        print(f"Win Rate: {ref_metrics.win_rate:.2f}%")
        print(f"Profit Factor: {ref_metrics.profit_factor:.4f}")
        print(f"Avg Win: ${ref_metrics.avg_win:.2f}")
        print(f"Avg Loss: ${ref_metrics.avg_loss:.2f}")
        print(f"Exit Types: stop={ref_metrics.stop_exits}, trail={ref_metrics.trailing_exits}, target={ref_metrics.target_exits}, hold={ref_metrics.hold_exits}")
        # Basic sanity checks on reference data
        assert ref_metrics.trades > 100, "Expected significant number of trades"
        assert 0 < ref_metrics.win_rate < 100, "Win rate should be between 0-100%"
        assert ref_metrics.capital > 0, "Capital should be positive"

    def test_trade_details_structure(self):
        """Verify structure of reference trades."""
        _, reference_trades = load_reference_data()
        if not reference_trades:
            pytest.skip("No reference trades loaded")
        trade = reference_trades[0]
        # Check required fields exist
        assert trade.strategy == REFERENCE_STRATEGY
        assert trade.entry_price > 0
        assert trade.exit_price > 0
        assert trade.notional > 0
        assert trade.exit_type in ['trailing_stop', 'stop_loss', 'target', 'max_hold']
        print(f"\nSample trade: {trade.trade_asset} {trade.direction}")
        print(f" Date: {trade.date}, Scan: {trade.scan_idx}")
        print(f" Entry: ${trade.entry_price:.2f} -> Exit: ${trade.exit_price:.2f}")
        print(f" P&L: ${trade.net_pnl:.4f}, Exit Type: {trade.exit_type}")
        print(f" Bars: {trade.bars_held}, Leverage: {trade.leverage}x")

    def test_exit_type_distribution(self):
        """Verify exit type distribution matches expectations."""
        reference_results, _ = load_reference_data()
        if reference_results is None:
            pytest.skip("Reference results not available")
        ref_strategy = reference_results['strategies'][REFERENCE_STRATEGY]
        total_exits = (
            ref_strategy['stop_exits'] +
            ref_strategy['trailing_exits'] +
            ref_strategy['target_exits'] +
            ref_strategy['hold_exits']
        )
        assert total_exits == ref_strategy['trades'], "Exit count should match trade count"
        # Log distribution
        print(f"\nExit Type Distribution:")
        print(f" Trailing: {ref_strategy['trailing_exits']} ({100*ref_strategy['trailing_exits']/ref_strategy['trades']:.1f}%)")
        print(f" Stop: {ref_strategy['stop_exits']} ({100*ref_strategy['stop_exits']/ref_strategy['trades']:.1f}%)")
        print(f" Target: {ref_strategy['target_exits']} ({100*ref_strategy['target_exits']/ref_strategy['trades']:.1f}%)")
        print(f" Hold: {ref_strategy['hold_exits']} ({100*ref_strategy['hold_exits']/ref_strategy['trades']:.1f}%)")

    def test_pnl_calculation_consistency(self):
        """Verify P&L calculations in reference trades are consistent.

        Checks: gross_pnl - fees = net_pnl (within tolerance), and that
        is_winner agrees with the sign of net_pnl.
        """
        _, reference_trades = load_reference_data()
        if not reference_trades:
            pytest.skip("No reference trades loaded")
        calc_errors = []
        winner_errors = []
        # Number of trades actually sampled (may be < 100 for short files);
        # guaranteed >= 1 by the skip above, so divisions below are safe.
        checked = min(100, len(reference_trades))
        for i, trade in enumerate(reference_trades[:100]):  # Check first 100
            # Check 1: Verify gross_pnl - fees = net_pnl
            calc_net = trade.gross_pnl - trade.fees
            if abs(calc_net - trade.net_pnl) > 0.01:
                calc_errors.append(i)
            # Check 2: Verify is_winner matches net_pnl sign
            # A trade is a winner if net_pnl > 0 (strictly positive)
            expected_winner = trade.net_pnl > 0
            if expected_winner != trade.is_winner:
                winner_errors.append(i)
        # Report findings
        # BUG FIX: the original printed the raw error COUNT where the
        # percentage belonged ("{len(calc_errors)}%"); compute the real rate.
        print(f"\nP&L Calculation Check (first 100 trades):")
        print(f" Calculation errors: {len(calc_errors)} ({100 * len(calc_errors) / checked:.1f}%)")
        print(f" Winner flag errors: {len(winner_errors)} ({100 * len(winner_errors) / checked:.1f}%)")
        if calc_errors:
            print(f" Sample calc errors: {calc_errors[:5]}")
        # The key check: gross_pnl - fees should equal net_pnl
        # Some small discrepancies are acceptable due to rounding
        calc_error_rate = len(calc_errors) / checked
        assert calc_error_rate < 0.05, f"Too many P&L calculation errors: {calc_error_rate:.1%}"

    def test_nd_configuration_matches(self):
        """Verify ND configuration matches standalone.

        This test ensures the Nautilus-Dolphin configuration
        matches the itest_v7 tight_3_3 configuration.
        """
        from nautilus_dolphin.nautilus.strategy_registration import DolphinStrategyConfig
        # ND configuration
        nd_config = DolphinStrategyConfig(
            venue="BINANCE_FUTURES",
            max_leverage=2.5,  # From itest_v7
            capital_fraction=0.15,  # From itest_v7
            tp_bps=99,  # ~1% target (not heavily used in tight_3_3)
            max_hold_bars=120,  # From itest_v7
            acb_enabled=True,
        )
        # Key parameters that MUST match itest_v7
        assert nd_config.max_leverage == 2.5, "Leverage must match"
        assert nd_config.capital_fraction == 0.15, "Capital fraction must match"
        assert nd_config.max_hold_bars == 120, "Max hold must match"
        print(f"\nND Configuration validated:")
        print(f" Max Leverage: {nd_config.max_leverage}x")
        print(f" Capital Fraction: {nd_config.capital_fraction}")
        print(f" Max Hold Bars: {nd_config.max_hold_bars}")
        print(f" ACB Enabled: {nd_config.acb_enabled}")
class TestNDSignalGenerationStack:
    """Smoke-test the Nautilus-Dolphin signal generation stack."""

    def test_data_adapter_imports(self):
        """The data-adapter module must expose its public components."""
        from nautilus_dolphin.nautilus.data_adapter import (
            JSONEigenvalueDataAdapter,
            BacktestDataLoader
        )
        for component in (JSONEigenvalueDataAdapter, BacktestDataLoader):
            assert component is not None

    def test_data_catalog_imports(self):
        """The data-catalogue module must expose its public components."""
        from nautilus_dolphin.nautilus.data_catalogue import (
            DataCatalogueConfig,
            BacktestEngineConfig,
            DataImporter
        )
        for component in (DataCatalogueConfig, BacktestEngineConfig, DataImporter):
            assert component is not None

    def test_strategy_can_calculate_position_size(self):
        """Position sizing must line up with the itest_v7 formula."""
        from nautilus_dolphin.nautilus.strategy import DolphinExecutionStrategyForTesting
        strat = DolphinExecutionStrategyForTesting({
            'venue': 'BINANCE_FUTURES',
            'max_leverage': 2.5,
            'capital_fraction': 0.15,
            'acb_enabled': False,  # Disable ACB for this test
        })
        # Signal mirroring the itest_v7 parameters.
        signal = {
            'strength': 0.75,
            'bucket_boost': 1.0,
            'streak_mult': 1.0,
            'trend_mult': 1.0,
        }
        account_balance = 10000.0
        # itest_v7: notional = 10000 * 0.15 * 2.5 = 3750
        expected_base = account_balance * 0.15 * 2.5
        notional = strat.calculate_position_size(signal, account_balance)
        print(f"\nPosition Size Calculation:")
        print(f" Account: ${account_balance:,.2f}")
        print(f" Calculated Notional: ${notional:,.2f}")
        print(f" Expected (itest_v7): ${expected_base:,.2f}")
        # Allow for minor differences due to ACB or other factors
        assert notional > 0, "Notional must be positive"
        assert notional <= account_balance * 0.5, "Notional should respect sanity cap"

    def test_strategy_filters_match(self):
        """Signal gating must mirror the itest_v7 filter logic."""
        from nautilus_dolphin.nautilus.strategy import DolphinExecutionStrategyForTesting
        strat = DolphinExecutionStrategyForTesting({
            'venue': 'BINANCE_FUTURES',
            'irp_alignment_min': 0.45,
            'momentum_magnitude_min': 0.000075,
            'excluded_assets': ['TUSDUSDT', 'USDCUSDT'],
            'max_concurrent_positions': 10,
        })
        # Common signal fields shared by both scenarios below.
        base_signal = {
            'irp_alignment': 0.5,
            'direction_confirm': True,
            'lookback_momentum': 0.0001,
        }
        # Force the volatility detector into a high regime so the gate is open.
        strat.volatility_detector._regime = 'high'
        strat.volatility_detector._history = [0.0001] * 150
        # Scenario 1: a tradeable asset.
        result = strat._should_trade(dict(base_signal, asset='BTCUSDT'))
        print(f"\nValid signal check: '{result}'")
        # Scenario 2: an asset on the exclusion list.
        result_excluded = strat._should_trade(dict(base_signal, asset='USDCUSDT'))
        print(f"Excluded asset check: '{result_excluded}'")
        assert result_excluded == "asset_excluded", "Should reject excluded asset"
class TestTradeByTradeComparison:
    """Trade-by-trade comparison between ND and standalone.

    This is the MOST CRITICAL test - every trade must match.
    """

    def test_first_10_trades_structure(self):
        """Verify structure of first 10 reference trades."""
        _, reference_trades = load_reference_data()
        if not reference_trades:
            pytest.skip("No reference trades loaded")
        print(f"\n{'='*60}")
        print("First 10 Reference Trades:")
        print(f"{'='*60}")
        for i, trade in enumerate(reference_trades[:10]):
            print(f"\nTrade {i+1}: {trade.trade_asset} {trade.direction}")
            print(f" Date: {trade.date}, Scan: {trade.scan_idx}")
            print(f" Entry: ${trade.entry_price:.2f} -> Exit: ${trade.exit_price:.2f}")
            print(f" P&L: ${trade.net_pnl:.4f}, Exit: {trade.exit_type}")
            print(f" Bars: {trade.bars_held}, Leverage: {trade.leverage}x")

    def test_entry_exit_prices_are_reasonable(self):
        """Verify entry/exit prices are within reasonable ranges.

        Known assets are checked against hard-coded historical price bands;
        unknown assets only get the positivity check.
        """
        _, reference_trades = load_reference_data()
        if not reference_trades:
            pytest.skip("No reference trades loaded")
        # Plausible (min, max) entry-price bands per asset.
        crypto_assets = {
            'BTCUSDT': (20000, 100000),
            'ETHUSDT': (1000, 5000),
            'ADAUSDT': (0.2, 2.0),
            'SOLUSDT': (10, 200),
        }
        unreasonable = 0
        for trade in reference_trades[:100]:
            # Check if prices are positive
            if trade.entry_price <= 0 or trade.exit_price <= 0:
                unreasonable += 1
                continue
            # Check price range for known assets.
            # IMPROVED: direct dict lookup instead of a linear scan over the
            # dict per trade — same behavior, idiomatic and O(1).
            bounds = crypto_assets.get(trade.trade_asset)
            if bounds is not None:
                min_p, max_p = bounds
                if not (min_p <= trade.entry_price <= max_p):
                    unreasonable += 1
        error_rate = unreasonable / min(100, len(reference_trades))
        assert error_rate < 0.1, f"Too many unreasonable prices: {error_rate:.1%}"

    def test_leverage_is_consistent(self):
        """Verify all trades use expected leverage."""
        _, reference_trades = load_reference_data()
        if not reference_trades:
            pytest.skip("No reference trades loaded")
        leverages = set(t.leverage for t in reference_trades)
        print(f"\nLeverage values used: {leverages}")
        # itest_v7 uses 2.5x leverage for tight_3_3
        assert 2.5 in leverages, "Expected 2.5x leverage in trades"

    def test_fees_are_calculated(self):
        """Verify fees are calculated for all trades."""
        _, reference_trades = load_reference_data()
        if not reference_trades:
            pytest.skip("No reference trades loaded")
        trades_with_fees = sum(1 for t in reference_trades if t.fees > 0)
        fee_rate = trades_with_fees / len(reference_trades)
        print(f"\nFee coverage: {trades_with_fees}/{len(reference_trades)} ({fee_rate:.1%})")
        # All trades should have fees
        assert fee_rate > 0.99, "Expected fees on almost all trades"
# ── Main Comparison Test ─────────────────────────────────────────────────────
@pytest.mark.skip(reason="Full ND backtest comparison - run after ND backtest implementation")
class TestFullNDvsStandaloneBacktest:
    """Skipped scaffold: full backtest diff, pending ND backtest results."""

    def test_nd_backtest_produces_results(self):
        """ND backtest should run end-to-end and emit a results artifact."""
        # TODO: Run ND backtest and load results
        pass

    def test_trade_count_matches(self):
        """ND must open exactly as many trades as the standalone run."""
        results, _ = load_reference_data()
        if results is None:
            pytest.skip("Reference results not available")
        ref_trades = results['strategies'][REFERENCE_STRATEGY]['trades']
        # TODO: Compare with ND results
        pass

    def test_trade_by_trade_match(self):
        """CRITICAL: every individual trade must match the standalone output."""
        _, trades = load_reference_data()
        if not trades:
            pytest.skip("Reference trades not available")
        # TODO: Implement trade-by-trade comparison
        pass