@echo off
REM =============================================================================
REM DOLPHIN NG5 - 5 Year Klines Dataset Backfill Script
REM =============================================================================
REM This script backfills 1-minute klines data from Binance for 2021-2023
REM Expected runtime: 12-24 hours (depending on network and disk speed)
REM
REM IMPORTANT: This script is idempotent - you can safely re-run if interrupted
REM
REM Requirements (checked below before any long-running work starts):
REM   - python on PATH
REM   - tee on PATH (NOT a built-in Windows command; every phase pipes its
REM     output through it to mirror the console into the log file)
REM   - both working directories configured below must exist
REM =============================================================================

setlocal enabledelayedexpansion

REM Configuration
REM The quoted form  set "NAME=value"  keeps accidental trailing whitespace on
REM the line out of the value; these paths contain spaces, so stray whitespace
REM would silently produce a different (non-existent) path.
set "BACKFILL_DIR=C:\Users\Lenovo\Documents\- Dolphin NG Backfill"
set "HCM_DIR=C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict"
set "START_DATE=2021-07-01"
set "END_DATE=2023-12-31"
set "LOG_FILE=%BACKFILL_DIR%\backfill_2021_2023_run.log"

echo ============================================================================
echo DOLPHIN NG5 - 5 Year Klines Backfill
echo ============================================================================
echo Start date: %START_DATE%
echo End date: %END_DATE%
echo Log file: %LOG_FILE%
echo ============================================================================
echo.

REM Check Python
python --version >nul 2>&1
if errorlevel 1 (
    echo ERROR: Python not found in PATH
    exit /b 1
)

REM Check tee: fail fast with a clear message instead of reporting a
REM misleading "phase failed" after the first pipeline cannot start.
where tee >nul 2>&1
if errorlevel 1 (
    echo ERROR: tee not found in PATH
    exit /b 1
)

REM Check working directories. Delayed expansion (!VAR!) is used inside the
REM parenthesised blocks so a ")" in a configured path cannot close the block.
if not exist "%BACKFILL_DIR%\" (
    echo ERROR: Backfill directory not found: !BACKFILL_DIR!
    exit /b 1
)
if not exist "%HCM_DIR%\" (
    echo ERROR: HCM directory not found: !HCM_DIR!
    exit /b 1
)

REM Phase 1: Fetch klines
echo.
echo ============================================================================
echo PHASE 1: Fetching klines from Binance API
echo ============================================================================
echo This will download 1-minute OHLCV for 50 symbols x 914 days
echo Rate limited to ~1100 req/min to stay under Binance limits
echo Estimated time: 6-12 hours
echo.
REM ----------------------------------------------------------------------------
REM Phases 1-3 runner.
REM
REM Each phase pipes python's combined stdout/stderr through tee so output is
REM shown live and appended to the log. After a pipeline, ERRORLEVEL reflects
REM the LAST command (tee), not python, so python's failure is recorded via a
REM marker file: "python ... || type nul > flag" runs on the left side of the
REM pipe and creates the flag only when python exits non-zero. A non-zero
REM ERRORLEVEL from tee itself is also treated as a phase failure.
REM ----------------------------------------------------------------------------
set "PHASE_FAIL_FLAG=%TEMP%\dolphin_ng5_phase_failed_%RANDOM%.flag"

cd /d "%BACKFILL_DIR%"
if errorlevel 1 (
    echo ERROR: Cannot change directory to backfill directory: !BACKFILL_DIR!
    exit /b 1
)

echo [%date% %time%] Starting Phase 1: Fetch >> "%LOG_FILE%"
if exist "%PHASE_FAIL_FLAG%" del "%PHASE_FAIL_FLAG%"
(python historical_klines_backfiller.py --fetch --start %START_DATE% --end %END_DATE% 2>&1 || type nul > "%PHASE_FAIL_FLAG%") | tee -a "%LOG_FILE%"
if errorlevel 1 type nul > "%PHASE_FAIL_FLAG%"
if exist "%PHASE_FAIL_FLAG%" (
    del "%PHASE_FAIL_FLAG%"
    echo [%date% %time%] ERROR: Phase 1 failed >> "%LOG_FILE%"
    REM Parentheses in echoed text must be escaped inside a block, otherwise
    REM the ")" terminates the IF block early.
    echo ERROR: Phase 1 ^(fetch^) failed. Check log: !LOG_FILE!
    exit /b 1
)
echo [%date% %time%] Phase 1 complete >> "%LOG_FILE%"

REM Phase 2: Compute eigenvalues and write Arrow files
echo.
echo ============================================================================
echo PHASE 2: Computing eigenvalues and writing Arrow files
echo ============================================================================
echo This processes the cached klines into Arrow format with eigenvalues
echo Estimated time: 2-4 hours
echo.

echo [%date% %time%] Starting Phase 2: Compute >> "%LOG_FILE%"
if exist "%PHASE_FAIL_FLAG%" del "%PHASE_FAIL_FLAG%"
(python historical_klines_backfiller.py --compute --start %START_DATE% --end %END_DATE% 2>&1 || type nul > "%PHASE_FAIL_FLAG%") | tee -a "%LOG_FILE%"
if errorlevel 1 type nul > "%PHASE_FAIL_FLAG%"
if exist "%PHASE_FAIL_FLAG%" (
    del "%PHASE_FAIL_FLAG%"
    echo [%date% %time%] ERROR: Phase 2 failed >> "%LOG_FILE%"
    echo ERROR: Phase 2 ^(compute^) failed. Check log: !LOG_FILE!
    exit /b 1
)
echo [%date% %time%] Phase 2 complete >> "%LOG_FILE%"

REM Phase 3: Convert Arrow to Parquet
echo.
echo ============================================================================
echo PHASE 3: Converting Arrow files to VBT Parquet cache
echo ============================================================================
echo This converts the Arrow files to the final Parquet format
echo Estimated time: 30-60 minutes
echo.

cd /d "%HCM_DIR%"
if errorlevel 1 (
    echo [%date% %time%] ERROR: Phase 3 failed >> "%LOG_FILE%"
    echo ERROR: Cannot change directory to HCM directory: !HCM_DIR!
    exit /b 1
)

echo [%date% %time%] Starting Phase 3: Convert >> "%LOG_FILE%"
if exist "%PHASE_FAIL_FLAG%" del "%PHASE_FAIL_FLAG%"
(python ng5_arrow_to_vbt_cache.py --all 2>&1 || type nul > "%PHASE_FAIL_FLAG%") | tee -a "%LOG_FILE%"
if errorlevel 1 type nul > "%PHASE_FAIL_FLAG%"
if exist "%PHASE_FAIL_FLAG%" (
    del "%PHASE_FAIL_FLAG%"
    echo [%date% %time%] ERROR: Phase 3 failed >> "%LOG_FILE%"
    echo ERROR: Phase 3 ^(convert^) failed. Check log: !LOG_FILE!
    exit /b 1
)
echo [%date% %time%] Phase 3 complete >> "%LOG_FILE%"

REM Phase 4: Validation
echo.
echo ============================================================================
echo PHASE 4: Validating output
echo ============================================================================
echo.

echo [%date% %time%] Starting Phase 4: Validate >> "%LOG_FILE%"

REM cmd.exe cannot pass a multi-line quoted string to "python -c", so the
REM validation script is written line by line to a temporary .py file and run
REM from there. Notes on the echo lines below:
REM   - the redirection comes first so no trailing space ends up in the file
REM   - "%%" yields a literal "%" (needed for the strftime format)
REM   - "^<" escapes the "<" that cmd would otherwise treat as a redirection
REM   - HCM_DIR / START_DATE / END_DATE are read from the environment, so the
REM     checked range always matches the range that was actually backfilled
set "VALIDATE_PY=%TEMP%\dolphin_ng5_validate_%RANDOM%.py"
> "%VALIDATE_PY%" echo import os
>>"%VALIDATE_PY%" echo from collections import Counter
>>"%VALIDATE_PY%" echo from pathlib import Path
>>"%VALIDATE_PY%" echo import pandas as pd
>>"%VALIDATE_PY%" echo.
>>"%VALIDATE_PY%" echo start = os.environ['START_DATE']
>>"%VALIDATE_PY%" echo end = os.environ['END_DATE']
>>"%VALIDATE_PY%" echo p = Path(os.environ['HCM_DIR']) / 'vbt_cache_klines'
>>"%VALIDATE_PY%" echo parquets = sorted(p.glob('*.parquet'))
>>"%VALIDATE_PY%" echo stems = [f.stem for f in parquets]
>>"%VALIDATE_PY%" echo years = Counter(s[:4] for s in stems)
>>"%VALIDATE_PY%" echo.
>>"%VALIDATE_PY%" echo print('=' * 70)
>>"%VALIDATE_PY%" echo print('VALIDATION RESULTS')
>>"%VALIDATE_PY%" echo print('=' * 70)
>>"%VALIDATE_PY%" echo print(f'Total parquets: {len(parquets)}')
>>"%VALIDATE_PY%" echo # Guard the empty case: stems[0] would raise IndexError with no files.
>>"%VALIDATE_PY%" echo if stems:
>>"%VALIDATE_PY%" echo     print(f'Date range: {stems[0]} to {stems[-1]}')
>>"%VALIDATE_PY%" echo else:
>>"%VALIDATE_PY%" echo     print('Date range: (no parquet files found)')
>>"%VALIDATE_PY%" echo print('By year:')
>>"%VALIDATE_PY%" echo for y in sorted(years):
>>"%VALIDATE_PY%" echo     print(f'  {y}: {years[y]} days')
>>"%VALIDATE_PY%" echo.
>>"%VALIDATE_PY%" echo # Check for gaps in the configured START_DATE..END_DATE range
>>"%VALIDATE_PY%" echo target_dates = set(pd.date_range(start, end, freq='D').strftime('%%Y-%%m-%%d'))
>>"%VALIDATE_PY%" echo missing_in_target = target_dates - set(stems)
>>"%VALIDATE_PY%" echo print(f'\nMissing in target range ({start} to {end}): {len(missing_in_target)} days')
>>"%VALIDATE_PY%" echo if len(missing_in_target) ^<= 20:
>>"%VALIDATE_PY%" echo     for d in sorted(missing_in_target):
>>"%VALIDATE_PY%" echo         print(f'  {d}')
>>"%VALIDATE_PY%" echo print('=' * 70)

python "%VALIDATE_PY%" 2>&1 | tee -a "%LOG_FILE%"
del "%VALIDATE_PY%" 2>nul

echo [%date% %time%] All phases complete >> "%LOG_FILE%"

echo.
echo ============================================================================
echo BACKFILL COMPLETE
echo ============================================================================
echo Log file: %LOG_FILE%
echo ============================================================================

pause