initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
153
prod/run_5y_klines_backfill.bat
Executable file
153
prod/run_5y_klines_backfill.bat
Executable file
@@ -0,0 +1,153 @@
|
||||
@echo off
REM =============================================================================
REM DOLPHIN NG5 - 5 Year Klines Dataset Backfill Script
REM =============================================================================
REM This script backfills 1-minute klines data from Binance for the range
REM START_DATE through END_DATE configured below (2021-07-01 to 2023-12-31).
REM Expected runtime: 12-24 hours (depending on network and disk speed)
REM
REM IMPORTANT: This script is idempotent - you can safely re-run if interrupted
REM =============================================================================

setlocal enabledelayedexpansion

REM Configuration
REM The quoted form set "NAME=value" keeps accidental trailing whitespace out of
REM the values. Both directories contain spaces, so every later use of these
REM variables as a path must stay wrapped in double quotes.
set "BACKFILL_DIR=C:\Users\Lenovo\Documents\- Dolphin NG Backfill"
set "HCM_DIR=C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict"
set "START_DATE=2021-07-01"
set "END_DATE=2023-12-31"
set "LOG_FILE=%BACKFILL_DIR%\backfill_2021_2023_run.log"

echo ============================================================================
echo DOLPHIN NG5 - 5 Year Klines Backfill
echo ============================================================================
echo Start date: %START_DATE%
echo End date: %END_DATE%
echo Log file: %LOG_FILE%
echo ============================================================================
echo.

REM Check Python
python --version >nul 2>&1
if errorlevel 1 (
    echo ERROR: Python not found in PATH
    exit /b 1
)

REM Fail fast when a working directory is missing: the log file lives in
REM BACKFILL_DIR and the later phases cd into both directories, so a typo here
REM should stop the run now rather than hours into it.
if not exist "%BACKFILL_DIR%\" (
    echo ERROR: Backfill directory not found: %BACKFILL_DIR%
    exit /b 1
)
if not exist "%HCM_DIR%\" (
    echo ERROR: HCM directory not found: %HCM_DIR%
    exit /b 1
)

REM Phase 1: Fetch klines
REM Runs historical_klines_backfiller.py --fetch from BACKFILL_DIR for the
REM configured date range and aborts the script if it exits non-zero.
echo.
echo ============================================================================
echo PHASE 1: Fetching klines from Binance API
echo ============================================================================
echo This will download 1-minute OHLCV for 50 symbols x 914 days
echo Rate limited to ~1100 req/min to stay under Binance limits
echo Estimated time: 6-12 hours
echo Output is appended to the log file: %LOG_FILE%
echo.

cd /d "%BACKFILL_DIR%"
if errorlevel 1 (
    echo ERROR: Cannot change to backfill directory: %BACKFILL_DIR%
    exit /b 1
)
echo [%date% %time%] Starting Phase 1: Fetch >> "%LOG_FILE%"

REM Redirect to the log instead of piping through tee. tee is not a cmd.exe
REM command, and after a pipeline ERRORLEVEL belongs to the last command in the
REM pipe, so a Python failure would never be detected by the check below.
python historical_klines_backfiller.py --fetch --start %START_DATE% --end %END_DATE% >> "%LOG_FILE%" 2>&1

if errorlevel 1 (
    echo [%date% %time%] ERROR: Phase 1 failed >> "%LOG_FILE%"
    REM The closing parenthesis is escaped so it does not end this if-block.
    echo ERROR: Phase 1 ^(fetch^) failed. Check log: %LOG_FILE%
    exit /b 1
)

echo [%date% %time%] Phase 1 complete >> "%LOG_FILE%"

REM Phase 2: Compute eigenvalues and write Arrow files
REM Runs historical_klines_backfiller.py --compute for the configured date
REM range. No cd happens here: the working directory is still the one Phase 1
REM switched to (BACKFILL_DIR).
echo.
echo ============================================================================
echo PHASE 2: Computing eigenvalues and writing Arrow files
echo ============================================================================
echo This processes the cached klines into Arrow format with eigenvalues
echo Estimated time: 2-4 hours
echo Output is appended to the log file: %LOG_FILE%
echo.

echo [%date% %time%] Starting Phase 2: Compute >> "%LOG_FILE%"

REM Redirect to the log instead of piping through tee. tee is not a cmd.exe
REM command, and after a pipeline ERRORLEVEL belongs to the last command in the
REM pipe, so a Python failure would never be detected by the check below.
python historical_klines_backfiller.py --compute --start %START_DATE% --end %END_DATE% >> "%LOG_FILE%" 2>&1

if errorlevel 1 (
    echo [%date% %time%] ERROR: Phase 2 failed >> "%LOG_FILE%"
    REM The closing parenthesis is escaped so it does not end this if-block.
    echo ERROR: Phase 2 ^(compute^) failed. Check log: %LOG_FILE%
    exit /b 1
)

echo [%date% %time%] Phase 2 complete >> "%LOG_FILE%"

REM Phase 3: Convert Arrow to Parquet
REM Runs ng5_arrow_to_vbt_cache.py --all from HCM_DIR and aborts the script if
REM it exits non-zero.
echo.
echo ============================================================================
echo PHASE 3: Converting Arrow files to VBT Parquet cache
echo ============================================================================
echo This converts the Arrow files to the final Parquet format
echo Estimated time: 30-60 minutes
echo Output is appended to the log file: %LOG_FILE%
echo.

cd /d "%HCM_DIR%"
if errorlevel 1 (
    echo ERROR: Cannot change to HCM directory: %HCM_DIR%
    exit /b 1
)
echo [%date% %time%] Starting Phase 3: Convert >> "%LOG_FILE%"

REM Redirect to the log instead of piping through tee. tee is not a cmd.exe
REM command, and after a pipeline ERRORLEVEL belongs to the last command in the
REM pipe, so a Python failure would never be detected by the check below.
python ng5_arrow_to_vbt_cache.py --all >> "%LOG_FILE%" 2>&1

if errorlevel 1 (
    echo [%date% %time%] ERROR: Phase 3 failed >> "%LOG_FILE%"
    REM The closing parenthesis is escaped so it does not end this if-block.
    echo ERROR: Phase 3 ^(convert^) failed. Check log: %LOG_FILE%
    exit /b 1
)

echo [%date% %time%] Phase 3 complete >> "%LOG_FILE%"

REM Phase 4: Validation
REM Counts the *.parquet files under HCM_DIR\vbt_cache_klines, summarises them
REM per year (first four characters of the file stem) and reports which days of
REM START_DATE..END_DATE have no matching file stem.
echo.
echo ============================================================================
echo PHASE 4: Validating output
echo ============================================================================
echo.

echo [%date% %time%] Starting Phase 4: Validate >> "%LOG_FILE%"

REM cmd.exe ends a command at the first newline, so a multi-line
REM python -c "..." never reaches Python intact. The validator is written to a
REM temporary file instead. In the echo lines below, %% produces a literal
REM percent sign and ^< a literal less-than sign.
set "VALIDATE_PY=%TEMP%\dolphin_backfill_validate_%RANDOM%.py"
set "VALIDATE_OUT=%TEMP%\dolphin_backfill_validate_%RANDOM%.txt"

>  "%VALIDATE_PY%" echo from collections import Counter
>> "%VALIDATE_PY%" echo from pathlib import Path
>> "%VALIDATE_PY%" echo import pandas as pd
>> "%VALIDATE_PY%" echo.
>> "%VALIDATE_PY%" echo p = Path(r'%HCM_DIR%\vbt_cache_klines')
>> "%VALIDATE_PY%" echo parquets = sorted(p.glob('*.parquet'))
>> "%VALIDATE_PY%" echo stems = [f.stem for f in parquets]
>> "%VALIDATE_PY%" echo years = Counter(s[:4] for s in stems)
>> "%VALIDATE_PY%" echo.
>> "%VALIDATE_PY%" echo print('='*70)
>> "%VALIDATE_PY%" echo print('VALIDATION RESULTS')
>> "%VALIDATE_PY%" echo print('='*70)
>> "%VALIDATE_PY%" echo print(f'Total parquets: {len(parquets)}')
>> "%VALIDATE_PY%" echo # Guard: an empty cache directory would make stems[0] raise IndexError
>> "%VALIDATE_PY%" echo if stems:
>> "%VALIDATE_PY%" echo     print(f'Date range: {stems[0]} to {stems[-1]}')
>> "%VALIDATE_PY%" echo print('By year:')
>> "%VALIDATE_PY%" echo for y in sorted(years):
>> "%VALIDATE_PY%" echo     print(f' {y}: {years[y]} days')
>> "%VALIDATE_PY%" echo.
>> "%VALIDATE_PY%" echo # Check for gaps in the configured target range
>> "%VALIDATE_PY%" echo dr = pd.date_range('%START_DATE%', '%END_DATE%', freq='D')
>> "%VALIDATE_PY%" echo target_dates = set(dr.strftime('%%Y-%%m-%%d'))
>> "%VALIDATE_PY%" echo missing_in_target = target_dates - set(stems)
>> "%VALIDATE_PY%" echo print(f'\nMissing in target range (%START_DATE% to %END_DATE%): {len(missing_in_target)} days')
>> "%VALIDATE_PY%" echo if len(missing_in_target) ^<= 20:
>> "%VALIDATE_PY%" echo     for d in sorted(missing_in_target):
>> "%VALIDATE_PY%" echo         print(f' {d}')
>> "%VALIDATE_PY%" echo print('='*70)

REM Capture the report in a file so Python's own exit code can be read, then
REM show it on the console and append a copy to the run log.
python "%VALIDATE_PY%" > "%VALIDATE_OUT%" 2>&1
set "VALIDATE_RC=%errorlevel%"

type "%VALIDATE_OUT%"
type "%VALIDATE_OUT%" >> "%LOG_FILE%"
del "%VALIDATE_PY%" "%VALIDATE_OUT%" >nul 2>&1

if not "%VALIDATE_RC%"=="0" (
    echo [%date% %time%] ERROR: Phase 4 failed >> "%LOG_FILE%"
    REM The closing parenthesis is escaped so it does not end this if-block.
    echo ERROR: Phase 4 ^(validate^) failed. Check log: %LOG_FILE%
    exit /b 1
)

echo [%date% %time%] All phases complete >> "%LOG_FILE%"

echo.
echo ============================================================================
echo BACKFILL COMPLETE
echo ============================================================================
echo Log file: %LOG_FILE%
echo ============================================================================

REM Keep the console window open when the script was started by double-click.
pause
Reference in New Issue
Block a user