#!/usr/bin/env python3
"""
DOLPHIN NG5 - 5 Year / 10 Year Klines Dataset Builder
======================================================

This script orchestrates the backfill of 1-minute klines data from Binance
to extend the DOLPHIN dataset from ~2 years to 5+ years of history.

Usage:
    python klines_backfill_5y_10y.py --plan          # Show execution plan
    python klines_backfill_5y_10y.py --preflight     # Run pre-flight checks only
    python klines_backfill_5y_10y.py --backfill-5y   # Run 5-year backfill (2021-2023)
    python klines_backfill_5y_10y.py --backfill-10y  # Run 10-year backfill (2017-2023)
    python klines_backfill_5y_10y.py --convert       # Convert Arrow to Parquet only
    python klines_backfill_5y_10y.py --validate      # Validate output only
    python klines_backfill_5y_10y.py --full-5y       # Run all: backfill + convert + validate

Expected Runtime (5-year):
    Phase 1 (Fetch):    6-12 hours (depends on network, rate limits)
    Phase 2 (Compute):  2-4 hours (depends on CPU)
    Phase 3 (Convert):  30-60 minutes
    Phase 4 (Validate): 5-10 minutes
    Total:              10-18 hours

Disk Space Requirements:
    5-year:  ~150 GB free (including klines_cache, can delete after)
    10-year: ~400 GB free (likely too large for most systems)

Resume Capability:
    The backfiller is fully idempotent. If interrupted, simply re-run the
    same command - already-fetched dates are skipped automatically.
"""
import argparse
import subprocess
import sys
import shutil
import time
from pathlib import Path
from datetime import datetime
from typing import List, Tuple, Optional

# Configuration -------------------------------------------------------------
BACKFILL_DIR = Path(r'C:\Users\Lenovo\Documents\- Dolphin NG Backfill')
HCM_DIR = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict')
BACKFILL_SCRIPT = BACKFILL_DIR / 'historical_klines_backfiller.py'
CONVERT_SCRIPT = HCM_DIR / 'ng5_arrow_to_vbt_cache.py'
ARROW_DIR = BACKFILL_DIR / 'backfilled_data' / 'arrow_klines'
PARQUET_DIR = HCM_DIR / 'vbt_cache_klines'
LOG_DIR = HCM_DIR / 'run_logs'

# Date ranges keyed by extension name.
DATE_RANGES = {
    '5y': ('2021-07-01', '2023-12-31'),   # Reliable symbol coverage
    '7y': ('2019-01-01', '2023-12-31'),   # Most symbols available
    'max': ('2017-07-01', '2023-12-31'),  # Binance launch date
}


def log(msg: str) -> None:
    """Print *msg* prefixed with a wall-clock timestamp."""
    ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f'[{ts}] {msg}')


def check_disk_space(drive: str = 'C:\\') -> Tuple[float, str]:
    """Return (free_gb, status) for *drive*.

    *drive* defaults to ``C:\\`` for backward compatibility; pass another
    mount point to check a different volume.
    """
    u = shutil.disk_usage(drive)
    free_gb = u.free / (1024 ** 3)
    # Thresholds mirror the disk-space requirements in the module docstring.
    if free_gb >= 400:
        status = 'SUFFICIENT for 10-year extension'
    elif free_gb >= 150:
        status = 'SUFFICIENT for 5-year extension'
    elif free_gb >= 100:
        status = 'MARGINAL - may need cache cleanup during run'
    else:
        status = 'INSUFFICIENT - free up disk space first'
    return free_gb, status


def get_current_coverage() -> dict:
    """Analyze current data coverage.

    Returns a dict with parquet/arrow file counts, date ranges (as
    (first, last) stem/dirname tuples, or None when empty), and the total
    klines cache size in GB.
    """
    result = {
        'parquet_count': 0,
        'parquet_range': None,
        'arrow_count': 0,
        'arrow_range': None,
        'klines_cache_size_gb': 0,
    }

    # Parquet coverage: one file per date, sorted stems give the range.
    if PARQUET_DIR.exists():
        parquets = sorted(PARQUET_DIR.glob('*.parquet'))
        result['parquet_count'] = len(parquets)
        if parquets:
            stems = [f.stem for f in parquets]
            result['parquet_range'] = (stems[0], stems[-1])

    # Arrow coverage: one directory per date.
    if ARROW_DIR.exists():
        arrow_dates = sorted(d.name for d in ARROW_DIR.iterdir() if d.is_dir())
        result['arrow_count'] = len(arrow_dates)
        if arrow_dates:
            result['arrow_range'] = (arrow_dates[0], arrow_dates[-1])

    # Klines cache size.
    # BUGFIX: the original walked the tree via shutil.os.* (an accidental
    # reach into shutil's internal import of os); use pathlib directly.
    klines_cache = BACKFILL_DIR / 'klines_cache'
    if klines_cache.exists():
        total_size = sum(
            f.stat().st_size for f in klines_cache.rglob('*') if f.is_file()
        )
        result['klines_cache_size_gb'] = total_size / (1024 ** 3)

    return result


def run_preflight_checks() -> None:
    """Run all pre-flight checks and print a report."""
    log('=' * 70)
    log('PRE-FLIGHT CHECKS')
    log('=' * 70)

    # Disk space
    free_gb, status = check_disk_space()
    log('\nDisk Space:')
    log(f'  Free: {free_gb:.1f} GB')
    log(f'  Status: {status}')

    # Coverage
    cov = get_current_coverage()
    log('\nCurrent Coverage:')
    log(f'  Parquet files: {cov["parquet_count"]}')
    if cov['parquet_range']:
        log(f'  Parquet range: {cov["parquet_range"][0]} to {cov["parquet_range"][1]}')
    log(f'  Arrow directories: {cov["arrow_count"]}')
    if cov['arrow_range']:
        log(f'  Arrow range: {cov["arrow_range"][0]} to {cov["arrow_range"][1]}')
    log(f'  Klines cache size: {cov["klines_cache_size_gb"]:.2f} GB')

    # Check scripts exist
    log('\nScript Availability:')
    log(f'  Backfiller: {"OK" if BACKFILL_SCRIPT.exists() else "MISSING - " + str(BACKFILL_SCRIPT)}')
    log(f'  Converter: {"OK" if CONVERT_SCRIPT.exists() else "MISSING - " + str(CONVERT_SCRIPT)}')
    log('=' * 70)


def _stream_subprocess(cmd: List[str], cwd: Path, lf) -> int:
    """Run *cmd*, tee stdout to console and log-file handle *lf*.

    Returns the process exit code.
    """
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        cwd=str(cwd),
    )
    for line in process.stdout:
        print(line, end='')
        lf.write(line)
        lf.flush()
    # BUGFIX: the original read process.returncode without wait()ing;
    # after draining stdout the child may not be reaped yet, leaving
    # returncode == None and making success look like failure.
    return process.wait()


def run_backfill(start_date: str, end_date: str, phase: str = 'both') -> bool:
    """Run the backfill script for a date range.

    phase: 'fetch', 'compute', or 'both'
    Returns True on success, False on any failure.
    """
    if not BACKFILL_SCRIPT.exists():
        log(f'ERROR: Backfill script not found: {BACKFILL_SCRIPT}')
        return False

    LOG_DIR.mkdir(parents=True, exist_ok=True)
    log_file = LOG_DIR / f'backfill_{start_date}_{end_date}.log'

    log(f'Starting backfill: {start_date} to {end_date}')
    log(f'Log file: {log_file}')

    # BUGFIX: use the running interpreter rather than whatever 'python'
    # resolves to on PATH.
    cmd_base = [
        sys.executable, str(BACKFILL_SCRIPT),
        '--start', start_date,
        '--end', end_date,
    ]

    phases = []
    if phase == 'both':
        phases = [(['--fetch'], 'fetch'), (['--compute'], 'compute')]
    elif phase == 'fetch':
        phases = [(['--fetch'], 'fetch')]
    elif phase == 'compute':
        phases = [(['--compute'], 'compute')]

    for flags, name in phases:
        log(f'\n>>> Phase: {name.upper()}')
        cmd = cmd_base + flags
        log(f'Command: {" ".join(cmd)}')
        start_time = time.time()

        # Run with output tee to log file
        with open(log_file, 'a') as lf:
            lf.write(f'\n{"=" * 70}\n')
            lf.write(f'Phase: {name} started at {datetime.now()}\n')
            lf.write(f'{"=" * 70}\n')
            rc = _stream_subprocess(cmd, BACKFILL_DIR, lf)

        elapsed = time.time() - start_time
        log(f'Phase {name} completed in {elapsed / 3600:.2f} hours')
        if rc != 0:
            log(f'ERROR: Phase {name} failed with code {rc}')
            return False

    return True


def run_convert() -> bool:
    """Run the Arrow to Parquet conversion. Returns True on success."""
    if not CONVERT_SCRIPT.exists():
        log(f'ERROR: Converter script not found: {CONVERT_SCRIPT}')
        return False

    LOG_DIR.mkdir(parents=True, exist_ok=True)
    log_file = LOG_DIR / f'convert_{datetime.now():%Y%m%d_%H%M%S}.log'

    log('Starting conversion')
    log(f'Log file: {log_file}')

    cmd = [sys.executable, str(CONVERT_SCRIPT), '--all']
    log(f'Command: {" ".join(cmd)}')
    start_time = time.time()

    with open(log_file, 'a') as lf:
        lf.write(f'\n{"=" * 70}\n')
        lf.write(f'Conversion started at {datetime.now()}\n')
        lf.write(f'{"=" * 70}\n')
        rc = _stream_subprocess(cmd, HCM_DIR, lf)

    elapsed = time.time() - start_time
    log(f'Conversion completed in {elapsed / 60:.1f} minutes')
    # BUGFIX: rc comes from process.wait(); the original compared an
    # un-waited returncode which could still be None.
    return rc == 0


def run_validate() -> None:
    """Validate the output: counts, range, and a small random sample."""
    log('=' * 70)
    log('VALIDATION')
    log('=' * 70)

    cov = get_current_coverage()
    log(f'\nParquet files: {cov["parquet_count"]}')
    if cov['parquet_range']:
        log(f'Range: {cov["parquet_range"][0]} to {cov["parquet_range"][1]}')

    # Sample validation: read a few random files and sanity-check contents.
    if cov['parquet_count'] > 0:
        import random
        import pandas as pd

        parquets = sorted(PARQUET_DIR.glob('*.parquet'))
        samples = random.sample(parquets, min(5, len(parquets)))
        log(f'\nSample validation ({len(samples)} random files):')
        for p in sorted(samples):
            try:
                df = pd.read_parquet(p)
                price_cols = [c for c in df.columns if c.endswith('USDT')]
                log(f'  {p.stem}: {len(df)} rows, {len(price_cols)} price cols, '
                    f'vel_div std: {df["vel_div"].std():.4f}')
            except Exception as e:
                # Best-effort check: report per-file errors, keep going.
                log(f'  {p.stem}: ERROR - {e}')

    log('=' * 70)


def show_plan() -> None:
    """Show the execution plan."""
    log('=' * 70)
    log('EXECUTION PLAN')
    log('=' * 70)

    free_gb, status = check_disk_space()
    cov = get_current_coverage()

    log('\n1. DISK SPACE ANALYSIS')
    log(f'   Free space: {free_gb:.1f} GB')
    log(f'   Status: {status}')

    log('\n2. CURRENT STATE')
    log(f'   Parquet files: {cov["parquet_count"]}')
    if cov['parquet_range']:
        log(f'   Parquet range: {cov["parquet_range"][0]} to {cov["parquet_range"][1]}')
    log(f'   Arrow directories: {cov["arrow_count"]}')
    log(f'   Klines cache: {cov["klines_cache_size_gb"]:.2f} GB')

    log('\n3. RECOMMENDED BACKFILL RANGES')
    for name, (start, end) in DATE_RANGES.items():
        log(f'   {name}: {start} to {end}')

    log('\n4. EXECUTION STEPS')
    log('   Step 1: Fetch klines (longest - 6-12 hours)')
    log('   Step 2: Compute eigenvalues (2-4 hours)')
    log('   Step 3: Convert to Parquet (30-60 minutes)')
    log('   Step 4: Validate (5-10 minutes)')

    log('\n5. COMMANDS TO RUN')
    log('   Option A - Run everything (5-year):')
    log('     python klines_backfill_5y_10y.py --full-5y')
    log('   Option B - Step by step:')
    log('     python klines_backfill_5y_10y.py --backfill-5y')
    log('     python klines_backfill_5y_10y.py --convert')
    log('     python klines_backfill_5y_10y.py --validate')
    log('=' * 70)


def _resolve_phase(args) -> str:
    """Map --fetch-only/--compute-only flags onto a backfill phase name."""
    if args.fetch_only:
        return 'fetch'
    if args.compute_only:
        return 'compute'
    return 'both'


def main() -> None:
    parser = argparse.ArgumentParser(
        description='DOLPHIN NG5 - 5 Year / 10 Year Klines Dataset Builder',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  python klines_backfill_5y_10y.py --plan       # Show execution plan
  python klines_backfill_5y_10y.py --preflight  # Check prerequisites
  python klines_backfill_5y_10y.py --full-5y    # Run complete 5-year backfill
'''
    )
    parser.add_argument('--plan', action='store_true',
                        help='Show execution plan without running')
    parser.add_argument('--preflight', action='store_true',
                        help='Run pre-flight checks only')
    parser.add_argument('--backfill-5y', action='store_true',
                        help='Backfill 2021-2023 (5-year extension)')
    parser.add_argument('--backfill-10y', action='store_true',
                        help='Backfill 2017-2023 (10-year extension, needs 400GB)')
    parser.add_argument('--backfill-max', action='store_true',
                        help='Backfill 2017-2023 (max available from Binance)')
    parser.add_argument('--fetch-only', action='store_true',
                        help='Only fetch klines, skip compute')
    parser.add_argument('--compute-only', action='store_true',
                        help='Only compute from existing klines cache')
    parser.add_argument('--convert', action='store_true',
                        help='Convert Arrow to Parquet only')
    parser.add_argument('--validate', action='store_true',
                        help='Validate output only')
    parser.add_argument('--full-5y', action='store_true',
                        help='Run complete 5-year pipeline: backfill + convert + validate')
    parser.add_argument('--start', help='Custom start date (YYYY-MM-DD)')
    parser.add_argument('--end', help='Custom end date (YYYY-MM-DD)')

    args = parser.parse_args()

    # Default to plan if no action args were given.
    # BUGFIX: the original omitted args.end, so passing only --end silently
    # showed the plan instead of doing nothing useful.
    if not any([args.plan, args.preflight, args.backfill_5y,
                args.backfill_10y, args.backfill_max, args.fetch_only,
                args.compute_only, args.convert, args.validate,
                args.full_5y, args.start, args.end]):
        show_plan()
        return

    if args.plan:
        show_plan()

    if args.preflight:
        run_preflight_checks()

    if args.backfill_5y or args.full_5y:
        start, end = DATE_RANGES['5y']
        if not run_backfill(start, end, _resolve_phase(args)):
            sys.exit(1)

    if args.backfill_10y:
        free_gb, _ = check_disk_space()
        if free_gb < 400:
            log('ERROR: Insufficient disk space for 10-year backfill (need 400GB)')
            sys.exit(1)
        start, end = DATE_RANGES['max']
        if not run_backfill(start, end, _resolve_phase(args)):
            sys.exit(1)

    if args.backfill_max:
        start, end = DATE_RANGES['max']
        if not run_backfill(start, end, _resolve_phase(args)):
            sys.exit(1)

    if args.start and args.end:
        if not run_backfill(args.start, args.end, _resolve_phase(args)):
            sys.exit(1)

    if args.convert or args.full_5y:
        if not run_convert():
            sys.exit(1)

    if args.validate or args.full_5y:
        run_validate()


if __name__ == '__main__':
    main()