Files
DOLPHIN/prod/run_5y_klines_backfill.bat

154 lines
5.3 KiB
Batchfile
Raw Normal View History

@echo off
REM =============================================================================
REM DOLPHIN NG5 - 5 Year Klines Dataset Backfill Script
REM =============================================================================
REM Backfills 1-minute klines data from Binance for the START_DATE..END_DATE
REM range configured below (currently 2021-07-01 through 2023-12-31).
REM Expected runtime: 12-24 hours (depending on network and disk speed)
REM
REM IMPORTANT: This script is idempotent - you can safely re-run if interrupted
REM =============================================================================
setlocal enabledelayedexpansion

REM Configuration
REM The quoted set "NAME=value" form keeps accidental trailing whitespace out of
REM the values, which matters because they are used to build paths.
set "BACKFILL_DIR=C:\Users\Lenovo\Documents\- Dolphin NG Backfill"
set "HCM_DIR=C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict"
set "START_DATE=2021-07-01"
set "END_DATE=2023-12-31"
set "LOG_FILE=%BACKFILL_DIR%\backfill_2021_2023_run.log"

echo ============================================================================
echo DOLPHIN NG5 - 5 Year Klines Backfill
echo ============================================================================
echo Start date: %START_DATE%
echo End date: %END_DATE%
echo Log file: %LOG_FILE%
echo ============================================================================
echo.

REM Pre-flight: both working directories are checked now so that a wrong path
REM is reported immediately rather than hours into the run.
if not exist "%BACKFILL_DIR%\" (
    echo ERROR: Backfill directory not found: %BACKFILL_DIR%
    exit /b 1
)
if not exist "%HCM_DIR%\" (
    echo ERROR: HCM directory not found: %HCM_DIR%
    exit /b 1
)

REM Check Python
python --version >nul 2>&1
if errorlevel 1 (
    echo ERROR: Python not found in PATH
    exit /b 1
)
REM Phase 1: Fetch klines
REM Runs historical_klines_backfiller.py --fetch from BACKFILL_DIR.
REM The Python output is appended straight to the log instead of being piped
REM through tee: cmd sets errorlevel from the LAST command of a pipeline, so the
REM failure check would test tee rather than Python, and tee is not a stock
REM Windows command. python -u keeps the log current while the job runs.
echo.
echo ============================================================================
echo PHASE 1: Fetching klines from Binance API
echo ============================================================================
echo This will download 1-minute OHLCV for 50 symbols x 914 days
echo Rate limited to ~1100 req/min to stay under Binance limits
echo Estimated time: 6-12 hours
echo Progress is written to: %LOG_FILE%
echo.
cd /d "%BACKFILL_DIR%" || (
    echo ERROR: Cannot change directory to %BACKFILL_DIR%
    exit /b 1
)
echo [%date% %time%] Starting Phase 1: Fetch >> "%LOG_FILE%"
python -u historical_klines_backfiller.py --fetch --start %START_DATE% --end %END_DATE% >> "%LOG_FILE%" 2>&1
REM Parentheses in the message are caret-escaped: a bare closing parenthesis
REM inside this block would end the block early and break parsing.
if errorlevel 1 (
    echo [%date% %time%] ERROR: Phase 1 failed >> "%LOG_FILE%"
    echo ERROR: Phase 1 ^(fetch^) failed. Check log: %LOG_FILE%
    exit /b 1
)
echo [%date% %time%] Phase 1 complete >> "%LOG_FILE%"
REM Phase 2: Compute eigenvalues and write Arrow files
REM Runs historical_klines_backfiller.py --compute in the directory entered
REM during Phase 1.
REM The Python output is appended straight to the log instead of being piped
REM through tee: cmd sets errorlevel from the LAST command of a pipeline, so the
REM failure check would test tee rather than Python, and tee is not a stock
REM Windows command. python -u keeps the log current while the job runs.
echo.
echo ============================================================================
echo PHASE 2: Computing eigenvalues and writing Arrow files
echo ============================================================================
echo This processes the cached klines into Arrow format with eigenvalues
echo Estimated time: 2-4 hours
echo Progress is written to: %LOG_FILE%
echo.
echo [%date% %time%] Starting Phase 2: Compute >> "%LOG_FILE%"
python -u historical_klines_backfiller.py --compute --start %START_DATE% --end %END_DATE% >> "%LOG_FILE%" 2>&1
REM Parentheses in the message are caret-escaped: a bare closing parenthesis
REM inside this block would end the block early and break parsing.
if errorlevel 1 (
    echo [%date% %time%] ERROR: Phase 2 failed >> "%LOG_FILE%"
    echo ERROR: Phase 2 ^(compute^) failed. Check log: %LOG_FILE%
    exit /b 1
)
echo [%date% %time%] Phase 2 complete >> "%LOG_FILE%"
REM Phase 3: Convert Arrow to Parquet
REM Runs ng5_arrow_to_vbt_cache.py --all from HCM_DIR.
REM The Python output is appended straight to the log instead of being piped
REM through tee: cmd sets errorlevel from the LAST command of a pipeline, so the
REM failure check would test tee rather than Python, and tee is not a stock
REM Windows command. python -u keeps the log current while the job runs.
echo.
echo ============================================================================
echo PHASE 3: Converting Arrow files to VBT Parquet cache
echo ============================================================================
echo This converts the Arrow files to the final Parquet format
echo Estimated time: 30-60 minutes
echo Progress is written to: %LOG_FILE%
echo.
cd /d "%HCM_DIR%" || (
    echo [%date% %time%] ERROR: Phase 3 failed, cannot enter HCM directory >> "%LOG_FILE%"
    echo ERROR: Cannot change directory to %HCM_DIR%
    exit /b 1
)
echo [%date% %time%] Starting Phase 3: Convert >> "%LOG_FILE%"
python -u ng5_arrow_to_vbt_cache.py --all >> "%LOG_FILE%" 2>&1
REM Parentheses in the message are caret-escaped: a bare closing parenthesis
REM inside this block would end the block early and break parsing.
if errorlevel 1 (
    echo [%date% %time%] ERROR: Phase 3 failed >> "%LOG_FILE%"
    echo ERROR: Phase 3 ^(convert^) failed. Check log: %LOG_FILE%
    exit /b 1
)
echo [%date% %time%] Phase 3 complete >> "%LOG_FILE%"
REM Phase 4: Validation
REM Reports how many daily parquet files exist under HCM_DIR\vbt_cache_klines,
REM the per-year counts, and which days of START_DATE..END_DATE are missing.
REM
REM A batch file cannot pass a multi-line program to python -c, because every
REM physical line is executed as its own cmd command. The validator is therefore
REM written to a temporary .py file one echo line at a time and run from there.
REM Notes on the generated lines:
REM   - Paths and dates are read from the environment by Python, so no batch
REM     value is ever spliced into Python source.
REM   - Percent signs are doubled so that cmd emits a single literal one.
REM   - The less-than sign is caret-escaped so cmd does not treat it as a
REM     redirection.
REM   - Compound statements are kept on one line so the generated file does not
REM     depend on how echo treats leading spaces.
REM   - The echo lines stay outside any parenthesized block, so the Python
REM     parentheses need no escaping.
echo.
echo ============================================================================
echo PHASE 4: Validating output
echo ============================================================================
echo.
echo [%date% %time%] Starting Phase 4: Validate >> "%LOG_FILE%"
set "VALIDATE_PY=%TEMP%\dolphin_validate_%RANDOM%.py"
set "VALIDATE_OUT=%TEMP%\dolphin_validate_%RANDOM%.txt"
> "%VALIDATE_PY%" echo import os, sys
>> "%VALIDATE_PY%" echo from collections import Counter
>> "%VALIDATE_PY%" echo from pathlib import Path
>> "%VALIDATE_PY%" echo import pandas as pd
>> "%VALIDATE_PY%" echo start, end = os.environ['START_DATE'], os.environ['END_DATE']
>> "%VALIDATE_PY%" echo p = Path(os.environ['HCM_DIR']) / 'vbt_cache_klines'
>> "%VALIDATE_PY%" echo stems = sorted(f.stem for f in p.glob('*.parquet'))
>> "%VALIDATE_PY%" echo if not stems: sys.exit(f'ERROR: no parquet files found in {p}')
>> "%VALIDATE_PY%" echo years = Counter(s[:4] for s in stems)
>> "%VALIDATE_PY%" echo print('='*70)
>> "%VALIDATE_PY%" echo print('VALIDATION RESULTS')
>> "%VALIDATE_PY%" echo print('='*70)
>> "%VALIDATE_PY%" echo print(f'Total parquets: {len(stems)}')
>> "%VALIDATE_PY%" echo print(f'Date range: {stems[0]} to {stems[-1]}')
>> "%VALIDATE_PY%" echo print('By year:')
>> "%VALIDATE_PY%" echo for y in sorted(years): print(f' {y}: {years[y]} days')
>> "%VALIDATE_PY%" echo target_dates = set(pd.date_range(start, end, freq='D').strftime('%%Y-%%m-%%d'))
>> "%VALIDATE_PY%" echo missing_in_target = target_dates - set(stems)
>> "%VALIDATE_PY%" echo print(f'\nMissing in target range ({start} to {end}): {len(missing_in_target)} days')
>> "%VALIDATE_PY%" echo for d in (sorted(missing_in_target) if len(missing_in_target) ^<= 20 else []): print(f' {d}')
>> "%VALIDATE_PY%" echo print('='*70)

REM Capture the report once, then show it on the console and append it to the
REM log. This avoids a pipe, which would hide the exit status of Python.
python "%VALIDATE_PY%" > "%VALIDATE_OUT%" 2>&1
set "VALIDATE_RC=%errorlevel%"
type "%VALIDATE_OUT%"
type "%VALIDATE_OUT%" >> "%LOG_FILE%"
del "%VALIDATE_PY%" "%VALIDATE_OUT%" >nul 2>&1

REM Parentheses in the message are caret-escaped: a bare closing parenthesis
REM inside this block would end the block early and break parsing.
if not "%VALIDATE_RC%"=="0" (
    echo [%date% %time%] ERROR: Phase 4 failed >> "%LOG_FILE%"
    echo ERROR: Phase 4 ^(validate^) failed. Check log: %LOG_FILE%
    exit /b 1
)
echo [%date% %time%] All phases complete >> "%LOG_FILE%"
echo.
echo ============================================================================
echo BACKFILL COMPLETE
echo ============================================================================
echo Log file: %LOG_FILE%
echo ============================================================================
pause