# DOLPHIN/nautilus_dolphin/test_nautilus_arrow_longrun.py
"""
Nautilus Arrow Long-Running Alpha Engine Test
===============================================
Reads Arrow IPC files directly (no intermediate Parquet cache), converts
on-the-fly to VBT DataFrames, and runs run_full_backtest() day-by-day
using champion_5x_f20.
Optimized: reads all Arrow files per day in bulk (open/read, not mmap)
to avoid the extreme overhead of 8k+ individual memory-map calls.
Usage (activate Siloqy first):
python test_nautilus_arrow_longrun.py
python test_nautilus_arrow_longrun.py --start 2026-02-01 --end 2026-02-25
python test_nautilus_arrow_longrun.py --poll # continuous polling
"""
import sys
import json
import time
import argparse
import warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.ipc as ipc

warnings.filterwarnings('ignore')

PROJECT_ROOT = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict')
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PROJECT_ROOT / 'nautilus_dolphin'))

DEFAULT_ARROW_BASE = PROJECT_ROOT / 'arrow_backfill'
EXCLUDED_ASSETS = {'TUSDUSDT', 'USDCUSDT'}
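

# Note on the bulk-read strategy: memory-mapping each scan file, roughly
#
#     with pa.memory_map(str(path)) as mm:
#         table = ipc.open_file(mm).read_all()
#
# costs one OS mapping per file, which dominates runtime with 8k+ tiny files
# per day. load_arrow_day_fast() below instead reads each file's bytes once
# and parses them from an in-memory buffer. (Illustrative sketch only; the
# mmap variant is not used anywhere in this script.)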
def load_arrow_day_fast(date_dir: Path) -> pd.DataFrame:
    """
    Bulk-read all scan_*.arrow files for one day into a VBT-compatible DataFrame.

    Uses batched file reads (much faster than individual memory maps).
    """
    arrow_files = sorted(date_dir.glob('scan_*.arrow'))
    if not arrow_files:
        return pd.DataFrame()

    rows = []
    last_prices = {}  # last known good price per asset, carried forward
    errors = 0
    for af in arrow_files:
        try:
            # Read the whole file into memory, then parse it as Arrow IPC.
            raw = af.read_bytes()
            reader = ipc.open_file(pa.BufferReader(raw))
            table = reader.read_all()
            if len(table) == 0:
                continue
            # Each scan file carries a single row.
            row = {col: table.column(col)[0].as_py() for col in table.column_names}
            ts_ns = row.get('timestamp_ns') or 0
            if not ts_ns:
                continue
            ts = pd.Timestamp(ts_ns, unit='ns')
            v50 = float(row.get('w50_velocity', 0) or 0)
            v150 = float(row.get('w150_velocity', 0) or 0)
            if v50 == 0.0 and v150 == 0.0:
                continue  # warm-up scan with no velocity signal yet
            v300 = row.get('w300_velocity')
            v750 = row.get('w750_velocity')
            vd = float(row.get('vel_div', v50 - v150) or (v50 - v150))
            i50 = row.get('w50_instability')
            i150 = row.get('w150_instability')
            assets_raw = json.loads(row.get('assets_json', '[]') or '[]')
            prices_raw = json.loads(row.get('asset_prices_json', '[]') or '[]')
            price_map = {}
            for asset, price in zip(assets_raw, prices_raw):
                if asset in EXCLUDED_ASSETS:
                    continue
                if price is not None and float(price) > 0:
                    price_map[asset] = float(price)
                    last_prices[asset] = float(price)
                elif asset in last_prices:
                    # Missing or zero price: reuse the last known value.
                    price_map[asset] = last_prices[asset]
            if 'BTCUSDT' not in price_map:
                continue  # a scan without BTC is unusable downstream
            rec = {
                'timestamp': ts,
                'scan_number': int(row.get('scan_number', 0) or 0),
                'v50_lambda_max_velocity': v50,
                'v150_lambda_max_velocity': v150,
                'v300_lambda_max_velocity': float(v300) if v300 is not None else np.nan,
                'v750_lambda_max_velocity': float(v750) if v750 is not None else np.nan,
                'vel_div': vd,
                'instability_50': float(i50) if i50 is not None else np.nan,
                'instability_150': float(i150) if i150 is not None else np.nan,
            }
            rec.update(price_map)
            rows.append(rec)
        except Exception:
            errors += 1
            continue

    if not rows:
        return pd.DataFrame()
    df = pd.DataFrame(rows).sort_values('timestamp').reset_index(drop=True)
    core = ['timestamp', 'scan_number', 'v50_lambda_max_velocity',
            'v150_lambda_max_velocity', 'v300_lambda_max_velocity',
            'v750_lambda_max_velocity', 'vel_div', 'instability_50', 'instability_150']
    price_cols = [c for c in df.columns if c not in core]
    if price_cols:
        df[price_cols] = df[price_cols].ffill()
    return df
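

# Usage sketch (the date directory name is illustrative):
#     df = load_arrow_day_fast(DEFAULT_ARROW_BASE / '2026-01-15')
#     if not df.empty:
#         print(df[['timestamp', 'v50_lambda_max_velocity', 'BTCUSDT']].tail())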


def discover_arrow_dates(arrow_base: Path, start=None, end=None):
    """List the YYYY-MM-DD subdirectories of arrow_base that contain scan files."""
    dates = []
    if not arrow_base.exists():
        return dates
    for d in sorted(arrow_base.iterdir()):
        if d.is_dir() and len(d.name) == 10 and d.name[4] == '-':
            if '_SKIP' in d.name:
                continue
            if any(d.glob('scan_*.arrow')):
                # YYYY-MM-DD strings compare correctly as plain strings.
                if start and d.name < start:
                    continue
                if end and d.name > end:
                    continue
                dates.append(d.name)
    return dates
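

# Expected on-disk layout (names are illustrative, not from a real run):
#     arrow_backfill/
#         2026-01-15/
#             scan_000001.arrow
#             scan_000002.arrow
#             ...
#         2026-01-16/
#             ...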


def run_longrun_test(arrow_base, start, end, poll=False, poll_interval=30):
    from dolphin_vbt_real import run_full_backtest
    from dolphin_paper_trade_adaptive_cb_v2 import STRATEGIES, INIT_CAPITAL

    champion = STRATEGIES['champion_5x_f20']
    print('=' * 70)
    print(' NAUTILUS ARROW LONG-RUNNING ALPHA ENGINE TEST')
    print(f' Strategy: champion_5x_f20 | Capital: ${INIT_CAPITAL:,.0f}')
    print(f' Arrow source: {arrow_base}')
    print(f' Date range: {start} -> {end}')
    print(f' Mode: {"POLL (continuous)" if poll else "BATCH (one-shot)"}')
    print('=' * 70)
    sys.stdout.flush()

    capital = INIT_CAPITAL
    total_tr = 0
    total_wins = 0
    total_fees = 0.0
    peak = capital
    max_dd = 0.0
    processed = set()
    day_results = []

    def process_date(date_str):
        nonlocal capital, total_tr, total_wins, total_fees, peak, max_dd
        date_dir = arrow_base / date_str
        t0 = time.time()
        df = load_arrow_day_fast(date_dir)
        load_time = time.time() - t0
        if len(df) < 200:
            print(f' {date_str}: {len(df)} scans (< 200) -- SKIPPED')
            sys.stdout.flush()
            return None

        t0 = time.time()
        result = run_full_backtest(df, champion, init_cash=capital, seed=42, verbose=False)
        bt_time = time.time() - t0

        # Roll the day's outcome into the running totals.
        capital = result['capital']
        total_tr += result['trades']
        total_wins += result['wins']
        total_fees += result['total_fees']
        if capital > peak:
            peak = capital
        dd = (peak - capital) / peak * 100
        if dd > max_dd:
            max_dd = dd
        wr = total_wins / max(total_tr, 1) * 100
        roi = (capital - INIT_CAPITAL) / INIT_CAPITAL * 100
        day_rec = {
            'date': date_str, 'scans': len(df),
            'day_trades': result['trades'], 'day_wins': result['wins'],
            'capital': round(capital, 2),
            'cum_trades': total_tr, 'cum_wr': round(wr, 2),
            'cum_roi': round(roi, 4), 'max_dd': round(max_dd, 4),
            'load_ms': int(load_time * 1000), 'bt_ms': int(bt_time * 1000),
        }
        day_results.append(day_rec)
        print(f' {date_str}: {len(df):>5} scans | '
              f'{result["trades"]:>2} tr ({result["wins"]}W) | '
              f'cap=${capital:>10,.2f} | '
              f'WR={wr:.1f}% ROI={roi:+.2f}% DD={max_dd:.1f}% | '
              f'[{int(load_time*1000)}ms+{int(bt_time*1000)}ms]')
        sys.stdout.flush()
        return day_rec
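
    # run_full_backtest() is expected to return at least the keys used above:
    #     {'capital': float, 'trades': int, 'wins': int, 'total_fees': float}
    # Each day is seeded with init_cash=capital, so P&L compounds across days.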

    dates = discover_arrow_dates(arrow_base, start, end)
    print(f'\nFound {len(dates)} Arrow dates to process\n')
    sys.stdout.flush()
    for date_str in dates:
        process_date(date_str)
        processed.add(date_str)

    wr = total_wins / max(total_tr, 1) * 100
    roi = (capital - INIT_CAPITAL) / INIT_CAPITAL * 100
    print('\n' + '=' * 70)
    print(' FINAL RESULTS')
    print('=' * 70)
    print(f' Days processed: {len(day_results)}')
    print(f' Total trades: {total_tr}')
    print(f' Total wins: {total_wins}')
    print(f' Win rate: {wr:.2f}%')
    print(f' Final capital: ${capital:,.2f}')
    print(f' ROI: {roi:+.4f}%')
    print(f' Max drawdown: {max_dd:.4f}%')
    print(f' Total fees: ${total_fees:,.2f}')
    print('=' * 70)

    checks = [('WR >= 40%', wr >= 40.0), ('DD <= 20%', max_dd <= 20.0)]
    all_pass = True
    print('\n BENCHMARK CHECK:')
    for label, ok in checks:
        status = 'OK' if ok else 'FAIL'
        print(f' {label:20s} -> [{status}]')
        if not ok:
            all_pass = False
    if all_pass:
        print('\n VERDICT: PASS -- Arrow pipeline produces valid champion signals')
    else:
        print('\n VERDICT: WARN -- some benchmarks missed')

    # Persist the run summary and per-day records as JSON.
    ts_str = datetime.now().strftime('%Y%m%d_%H%M%S')
    results_dir = PROJECT_ROOT / 'vbt_results'
    results_dir.mkdir(exist_ok=True)
    out_path = results_dir / f'nautilus_arrow_longrun_{ts_str}.json'
    out = {
        'timestamp': datetime.now().isoformat(),
        'strategy': 'champion_5x_f20',
        'arrow_source': str(arrow_base),
        'date_range': [start, end],
        'summary': {
            'days': len(day_results), 'trades': total_tr, 'wins': total_wins,
            'win_rate': round(wr, 2), 'final_capital': round(capital, 2),
            'roi_pct': round(roi, 4), 'max_dd_pct': round(max_dd, 4),
            'total_fees': round(total_fees, 2),
        },
        'daily': day_results,
    }
    with open(out_path, 'w') as f:
        json.dump(out, f, indent=2)
    print(f'\n Results saved -> {out_path}')
    sys.stdout.flush()

    if poll:
        # Poll mode never returns on its own; stop it with Ctrl-C.
        print(f'\n POLL mode: checking every {poll_interval}s for new dates...')
        sys.stdout.flush()
        while True:
            time.sleep(poll_interval)
            new_dates = discover_arrow_dates(arrow_base)
            for d in new_dates:
                if d not in processed:
                    print(f'\n [POLL] New date: {d}')
                    process_date(d)
                    processed.add(d)
    return out
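

# The saved JSON can be inspected later, e.g. (filename is illustrative):
#     with open('vbt_results/nautilus_arrow_longrun_20260225_120000.json') as f:
#         print(json.load(f)['summary'])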


def main():
    parser = argparse.ArgumentParser(description='Nautilus Arrow long-running test')
    parser.add_argument('--arrow-base', default=str(DEFAULT_ARROW_BASE),
                        help='root directory of per-day Arrow scan dumps')
    parser.add_argument('--start', default='2026-01-01', help='first date (YYYY-MM-DD)')
    parser.add_argument('--end', default='2026-02-25', help='last date (YYYY-MM-DD)')
    parser.add_argument('--poll', action='store_true',
                        help='keep polling for new date directories after the batch run')
    parser.add_argument('--poll-interval', type=int, default=30,
                        help='seconds between polls')
    args = parser.parse_args()
    run_longrun_test(
        arrow_base=Path(args.arrow_base),
        start=args.start, end=args.end,
        poll=args.poll, poll_interval=args.poll_interval,
    )


if __name__ == '__main__':
    main()