422 lines
15 KiB
Python
422 lines
15 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
DOLPHIN NG5 - 5 Year / 10 Year Klines Dataset Builder
|
||
|
|
======================================================
|
||
|
|
|
||
|
|
This script orchestrates the backfill of 1-minute klines data from Binance
|
||
|
|
to extend the DOLPHIN dataset from ~2 years to 5+ years of history.
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python klines_backfill_5y_10y.py --plan # Show execution plan
|
||
|
|
python klines_backfill_5y_10y.py --preflight # Run pre-flight checks only
|
||
|
|
python klines_backfill_5y_10y.py --backfill-5y # Run 5-year backfill (2021-2023)
|
||
|
|
python klines_backfill_5y_10y.py --backfill-10y # Run 10-year backfill (2017-2023)
|
||
|
|
python klines_backfill_5y_10y.py --convert # Convert Arrow to Parquet only
|
||
|
|
python klines_backfill_5y_10y.py --validate # Validate output only
|
||
|
|
python klines_backfill_5y_10y.py --full-5y # Run all: backfill + convert + validate
|
||
|
|
|
||
|
|
Expected Runtime (5-year):
|
||
|
|
Phase 1 (Fetch): 6-12 hours (depends on network, rate limits)
|
||
|
|
Phase 2 (Compute): 2-4 hours (depends on CPU)
|
||
|
|
Phase 3 (Convert): 30-60 minutes
|
||
|
|
Phase 4 (Validate): 5-10 minutes
|
||
|
|
Total: 10-18 hours
|
||
|
|
|
||
|
|
Disk Space Requirements:
|
||
|
|
5-year: ~150 GB free (including klines_cache, can delete after)
|
||
|
|
10-year: ~400 GB free (likely too large for most systems)
|
||
|
|
|
||
|
|
Resume Capability:
|
||
|
|
The backfiller is fully idempotent. If interrupted, simply re-run the
|
||
|
|
same command - already-fetched dates are skipped automatically.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import argparse
|
||
|
|
import subprocess
|
||
|
|
import sys
|
||
|
|
import shutil
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
from datetime import datetime
|
||
|
|
from typing import List, Tuple, Optional
|
||
|
|
|
||
|
|
# Configuration
# NOTE(review): absolute Windows paths — assumes this exact machine layout;
# confirm before running elsewhere.
BACKFILL_DIR = Path(r'C:\Users\Lenovo\Documents\- Dolphin NG Backfill')
HCM_DIR = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict')
# External scripts this orchestrator shells out to.
BACKFILL_SCRIPT = BACKFILL_DIR / 'historical_klines_backfiller.py'
CONVERT_SCRIPT = HCM_DIR / 'ng5_arrow_to_vbt_cache.py'
# Data locations: Arrow input (per-date dirs), Parquet output, run logs.
ARROW_DIR = BACKFILL_DIR / 'backfilled_data' / 'arrow_klines'
PARQUET_DIR = HCM_DIR / 'vbt_cache_klines'
LOG_DIR = HCM_DIR / 'run_logs'

# Date ranges
# Named (start, end) inclusive date-string pairs selectable from the CLI.
DATE_RANGES = {
    '5y': ('2021-07-01', '2023-12-31'),  # Reliable symbol coverage
    '7y': ('2019-01-01', '2023-12-31'),  # Most symbols available
    'max': ('2017-07-01', '2023-12-31'),  # Binance launch date
}
|
||
|
|
|
||
|
|
|
||
|
|
def log(msg: str):
    """Print *msg* to stdout prefixed with a 'YYYY-MM-DD HH:MM:SS' timestamp."""
    now = datetime.now()
    print(f'[{now:%Y-%m-%d %H:%M:%S}] {msg}')
|
||
|
|
|
||
|
|
|
||
|
|
def check_disk_space(path: str = 'C:\\') -> Tuple[float, str]:
    r"""Check free disk space and classify it for the backfill.

    Args:
        path: Any path on the disk to measure (default: the C:\ drive,
              matching the original hard-coded behavior).

    Returns:
        (free_gb, status): free space in GiB, and a human-readable verdict
        against the 5-year / 10-year extension requirements.
    """
    usage = shutil.disk_usage(path)
    free_gb = usage.free / (1024**3)

    # Thresholds mirror the disk-space requirements in the module docstring
    # (5-year ~150 GB, 10-year ~400 GB).
    if free_gb >= 400:
        status = 'SUFFICIENT for 10-year extension'
    elif free_gb >= 150:
        status = 'SUFFICIENT for 5-year extension'
    elif free_gb >= 100:
        status = 'MARGINAL - may need cache cleanup during run'
    else:
        status = 'INSUFFICIENT - free up disk space first'

    return free_gb, status
|
||
|
|
|
||
|
|
|
||
|
|
def get_current_coverage() -> dict:
    """Analyze current data coverage.

    Returns:
        dict with keys:
            parquet_count: number of *.parquet files in PARQUET_DIR
            parquet_range: (first_stem, last_stem) or None
            arrow_count: number of date directories under ARROW_DIR
            arrow_range: (first_date, last_date) or None
            klines_cache_size_gb: total size of the klines_cache tree in GiB
    """
    result = {
        'parquet_count': 0,
        'parquet_range': None,
        'arrow_count': 0,
        'arrow_range': None,
        'klines_cache_size_gb': 0,
    }

    # Parquet coverage
    if PARQUET_DIR.exists():
        parquets = sorted(PARQUET_DIR.glob('*.parquet'))
        result['parquet_count'] = len(parquets)
        if parquets:
            stems = [f.stem for f in parquets]
            result['parquet_range'] = (stems[0], stems[-1])

    # Arrow coverage (one directory per date)
    if ARROW_DIR.exists():
        arrow_dates = sorted([d.name for d in ARROW_DIR.iterdir() if d.is_dir()])
        result['arrow_count'] = len(arrow_dates)
        if arrow_dates:
            result['arrow_range'] = (arrow_dates[0], arrow_dates[-1])

    # Klines cache size.
    # FIX: walk the tree with pathlib instead of reaching through
    # shutil.os.walk / shutil.os.path — shutil's internal `os` import is
    # an implementation detail, not a public API.
    klines_cache = BACKFILL_DIR / 'klines_cache'
    if klines_cache.exists():
        total_size = sum(
            f.stat().st_size for f in klines_cache.rglob('*') if f.is_file()
        )
        result['klines_cache_size_gb'] = total_size / (1024**3)

    return result
|
||
|
|
|
||
|
|
|
||
|
|
def run_preflight_checks():
    """Run all pre-flight checks: disk space, current coverage, script presence.

    Purely informational — prints a report via log(); returns nothing.
    """
    log('=' * 70)
    log('PRE-FLIGHT CHECKS')
    log('=' * 70)

    # Disk space
    # FIX: the f-strings used '\\n' (a literal backslash-n) — replaced with
    # real newlines so section headers actually break the line.
    free_gb, status = check_disk_space()
    log(f'\nDisk Space:')
    log(f' Free: {free_gb:.1f} GB')
    log(f' Status: {status}')

    # Coverage
    cov = get_current_coverage()
    log(f'\nCurrent Coverage:')
    log(f' Parquet files: {cov["parquet_count"]}')
    if cov['parquet_range']:
        log(f' Parquet range: {cov["parquet_range"][0]} to {cov["parquet_range"][1]}')
    log(f' Arrow directories: {cov["arrow_count"]}')
    if cov['arrow_range']:
        log(f' Arrow range: {cov["arrow_range"][0]} to {cov["arrow_range"][1]}')
    log(f' Klines cache size: {cov["klines_cache_size_gb"]:.2f} GB')

    # Check scripts exist
    log(f'\nScript Availability:')
    log(f' Backfiller: {"OK" if BACKFILL_SCRIPT.exists() else "MISSING - " + str(BACKFILL_SCRIPT)}')
    log(f' Converter: {"OK" if CONVERT_SCRIPT.exists() else "MISSING - " + str(CONVERT_SCRIPT)}')

    log('=' * 70)
|
||
|
|
|
||
|
|
|
||
|
|
def run_backfill(start_date: str, end_date: str, phase: str = 'both'):
    """Run the backfill script for a date range.

    Args:
        start_date: Start date (YYYY-MM-DD), passed to the backfiller's --start.
        end_date: End date (YYYY-MM-DD), passed to the backfiller's --end.
        phase: 'fetch', 'compute', or 'both' (fetch then compute).

    Returns:
        True if every requested phase exited with code 0, False otherwise.
    """
    if not BACKFILL_SCRIPT.exists():
        log(f'ERROR: Backfill script not found: {BACKFILL_SCRIPT}')
        return False

    LOG_DIR.mkdir(exist_ok=True)
    log_file = LOG_DIR / f'backfill_{start_date}_{end_date}.log'

    log(f'Starting backfill: {start_date} to {end_date}')
    log(f'Log file: {log_file}')

    # FIX: use sys.executable instead of bare 'python' so the child runs
    # under the same interpreter/venv as this orchestrator.
    cmd_base = [
        sys.executable, str(BACKFILL_SCRIPT),
        '--start', start_date,
        '--end', end_date,
    ]

    phases = []
    if phase == 'both':
        phases = [(['--fetch'], 'fetch'), (['--compute'], 'compute')]
    elif phase == 'fetch':
        phases = [(['--fetch'], 'fetch')]
    elif phase == 'compute':
        phases = [(['--compute'], 'compute')]

    for flags, name in phases:
        log(f'\n>>> Phase: {name.upper()}')
        cmd = cmd_base + flags
        log(f'Command: {" ".join(cmd)}')

        start_time = time.time()

        # Run with output tee to log file
        # FIX: headers previously wrote literal '\n' text ('\\n' in source)
        # into the log file; now they write real newlines.
        with open(log_file, 'a') as lf:
            lf.write(f'\n{"="*70}\n')
            lf.write(f'Phase: {name} started at {datetime.now()}\n')
            lf.write(f'{"="*70}\n')

            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                cwd=str(BACKFILL_DIR)
            )

            for line in process.stdout:
                print(line, end='')
                lf.write(line)
                lf.flush()

            # BUG FIX: returncode stays None until the child is reaped, so
            # the old `returncode != 0` check reported failure even on
            # success. wait() after stdout EOF populates returncode.
            process.wait()

        elapsed = time.time() - start_time
        log(f'Phase {name} completed in {elapsed/3600:.2f} hours')

        if process.returncode != 0:
            log(f'ERROR: Phase {name} failed with code {process.returncode}')
            return False

    return True
|
||
|
|
|
||
|
|
|
||
|
|
def run_convert():
    """Run the Arrow-to-Parquet conversion script.

    Returns:
        True if the converter exited with code 0, False otherwise.
    """
    if not CONVERT_SCRIPT.exists():
        log(f'ERROR: Converter script not found: {CONVERT_SCRIPT}')
        return False

    LOG_DIR.mkdir(exist_ok=True)
    log_file = LOG_DIR / f'convert_{datetime.now():%Y%m%d_%H%M%S}.log'

    log(f'Starting conversion')
    log(f'Log file: {log_file}')

    # FIX: use sys.executable instead of bare 'python' so the child runs
    # under the same interpreter/venv as this orchestrator.
    cmd = [sys.executable, str(CONVERT_SCRIPT), '--all']
    log(f'Command: {" ".join(cmd)}')

    start_time = time.time()

    # FIX: headers previously wrote literal '\n' text ('\\n' in source)
    # into the log file; now they write real newlines.
    with open(log_file, 'a') as lf:
        lf.write(f'\n{"="*70}\n')
        lf.write(f'Conversion started at {datetime.now()}\n')
        lf.write(f'{"="*70}\n')

        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            cwd=str(HCM_DIR)
        )

        for line in process.stdout:
            print(line, end='')
            lf.write(line)
            lf.flush()

        # BUG FIX: without wait(), returncode is still None here and
        # `None == 0` is False -> a successful conversion was always
        # reported as failed. wait() reaps the child and sets returncode.
        process.wait()

    elapsed = time.time() - start_time
    log(f'Conversion completed in {elapsed/60:.1f} minutes')

    return process.returncode == 0
|
||
|
|
|
||
|
|
|
||
|
|
def run_validate():
    """Validate the output: report coverage and spot-check random Parquet files.

    Purely informational — prints a report via log(); returns nothing.
    Per-file read errors are caught and reported, not raised.
    """
    log('=' * 70)
    log('VALIDATION')
    log('=' * 70)

    cov = get_current_coverage()

    # FIX: '\\n' (literal backslash-n) in the f-strings replaced with real
    # newlines.
    log(f'\nParquet files: {cov["parquet_count"]}')
    if cov['parquet_range']:
        log(f'Range: {cov["parquet_range"][0]} to {cov["parquet_range"][1]}')

    # Sample validation
    if cov['parquet_count'] > 0:
        # Local imports keep pandas out of the non-validation code paths.
        import random
        import pandas as pd

        parquets = sorted(PARQUET_DIR.glob('*.parquet'))

        # Sample 5 random files
        samples = random.sample(parquets, min(5, len(parquets)))
        log(f'\nSample validation ({len(samples)} random files):')

        for p in sorted(samples):
            try:
                df = pd.read_parquet(p)
                # Price columns are assumed to be named like 'BTCUSDT' —
                # TODO confirm against the converter's output schema.
                price_cols = [c for c in df.columns if c.endswith('USDT')]
                log(f' {p.stem}: {len(df)} rows, {len(price_cols)} price cols, '
                    f'vel_div std: {df["vel_div"].std():.4f}')
            except Exception as e:
                # Best-effort spot check: report and continue with next file.
                log(f' {p.stem}: ERROR - {e}')

    log('=' * 70)
|
||
|
|
|
||
|
|
|
||
|
|
def show_plan():
    """Show the execution plan: disk state, coverage, ranges, steps, commands.

    Purely informational — prints via log(); returns nothing.
    """
    log('=' * 70)
    log('EXECUTION PLAN')
    log('=' * 70)

    free_gb, status = check_disk_space()
    cov = get_current_coverage()

    # FIX: '\\n' (literal backslash-n) in the section headers replaced with
    # real newlines.
    log(f'\n1. DISK SPACE ANALYSIS')
    log(f' Free space: {free_gb:.1f} GB')
    log(f' Status: {status}')

    log(f'\n2. CURRENT STATE')
    log(f' Parquet files: {cov["parquet_count"]}')
    if cov['parquet_range']:
        log(f' Parquet range: {cov["parquet_range"][0]} to {cov["parquet_range"][1]}')
    log(f' Arrow directories: {cov["arrow_count"]}')
    log(f' Klines cache: {cov["klines_cache_size_gb"]:.2f} GB')

    log(f'\n3. RECOMMENDED BACKFILL RANGES')
    for name, (start, end) in DATE_RANGES.items():
        log(f' {name}: {start} to {end}')

    log(f'\n4. EXECUTION STEPS')
    log(f' Step 1: Fetch klines (longest - 6-12 hours)')
    log(f' Step 2: Compute eigenvalues (2-4 hours)')
    log(f' Step 3: Convert to Parquet (30-60 minutes)')
    log(f' Step 4: Validate (5-10 minutes)')

    log(f'\n5. COMMANDS TO RUN')
    log(f' Option A - Run everything (5-year):')
    log(f' python klines_backfill_5y_10y.py --full-5y')
    log(f' Option B - Step by step:')
    log(f' python klines_backfill_5y_10y.py --backfill-5y')
    log(f' python klines_backfill_5y_10y.py --convert')
    log(f' python klines_backfill_5y_10y.py --validate')

    log('=' * 70)
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Parse CLI arguments and dispatch to the requested pipeline stages.

    With no arguments, shows the execution plan. Exits with status 1 if any
    backfill or conversion stage fails.
    """
    parser = argparse.ArgumentParser(
        description='DOLPHIN NG5 - 5 Year / 10 Year Klines Dataset Builder',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  python klines_backfill_5y_10y.py --plan       # Show execution plan
  python klines_backfill_5y_10y.py --preflight  # Check prerequisites
  python klines_backfill_5y_10y.py --full-5y    # Run complete 5-year backfill
'''
    )

    parser.add_argument('--plan', action='store_true',
                        help='Show execution plan without running')
    parser.add_argument('--preflight', action='store_true',
                        help='Run pre-flight checks only')
    parser.add_argument('--backfill-5y', action='store_true',
                        help='Backfill 2021-2023 (5-year extension)')
    parser.add_argument('--backfill-10y', action='store_true',
                        help='Backfill 2017-2023 (10-year extension, needs 400GB)')
    parser.add_argument('--backfill-max', action='store_true',
                        help='Backfill 2017-2023 (max available from Binance)')
    parser.add_argument('--fetch-only', action='store_true',
                        help='Only fetch klines, skip compute')
    parser.add_argument('--compute-only', action='store_true',
                        help='Only compute from existing klines cache')
    parser.add_argument('--convert', action='store_true',
                        help='Convert Arrow to Parquet only')
    parser.add_argument('--validate', action='store_true',
                        help='Validate output only')
    parser.add_argument('--full-5y', action='store_true',
                        help='Run complete 5-year pipeline: backfill + convert + validate')
    parser.add_argument('--start', help='Custom start date (YYYY-MM-DD)')
    parser.add_argument('--end', help='Custom end date (YYYY-MM-DD)')

    args = parser.parse_args()

    # Default to plan if no args
    if not any([args.plan, args.preflight, args.backfill_5y, args.backfill_10y,
                args.backfill_max, args.fetch_only, args.compute_only,
                args.convert, args.validate, args.full_5y, args.start]):
        show_plan()
        return

    # FIX: the phase selection was duplicated verbatim before each backfill
    # branch; compute it once here.
    phase = 'fetch' if args.fetch_only else 'compute' if args.compute_only else 'both'

    if args.plan:
        show_plan()

    if args.preflight:
        run_preflight_checks()

    if args.backfill_5y or args.full_5y:
        start, end = DATE_RANGES['5y']
        if not run_backfill(start, end, phase):
            sys.exit(1)

    if args.backfill_10y:
        # Guard: the 10-year run needs ~400 GB free before starting.
        free_gb, _ = check_disk_space()
        if free_gb < 400:
            log('ERROR: Insufficient disk space for 10-year backfill (need 400GB)')
            sys.exit(1)
        start, end = DATE_RANGES['max']
        if not run_backfill(start, end, phase):
            sys.exit(1)

    if args.backfill_max:
        start, end = DATE_RANGES['max']
        if not run_backfill(start, end, phase):
            sys.exit(1)

    if args.start and args.end:
        # Custom explicit date range overrides the named ranges.
        if not run_backfill(args.start, args.end, phase):
            sys.exit(1)

    if args.convert or args.full_5y:
        if not run_convert():
            sys.exit(1)

    if args.validate or args.full_5y:
        run_validate()
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: dispatch to main() only when run directly.
if __name__ == '__main__':
    main()
|