# Source: DOLPHIN/prod/klines_backfill_5y_10y.py

#!/usr/bin/env python3
"""
DOLPHIN NG5 - 5 Year / 10 Year Klines Dataset Builder
======================================================
This script orchestrates the backfill of 1-minute klines data from Binance
to extend the DOLPHIN dataset from ~2 years to 5+ years of history.
Usage:
python klines_backfill_5y_10y.py --plan # Show execution plan
python klines_backfill_5y_10y.py --preflight # Run pre-flight checks only
python klines_backfill_5y_10y.py --backfill-5y # Run 5-year backfill (2021-2023)
python klines_backfill_5y_10y.py --backfill-10y # Run 10-year backfill (2017-2023)
python klines_backfill_5y_10y.py --convert # Convert Arrow to Parquet only
python klines_backfill_5y_10y.py --validate # Validate output only
python klines_backfill_5y_10y.py --full-5y # Run all: backfill + convert + validate
Expected Runtime (5-year):
Phase 1 (Fetch): 6-12 hours (depends on network, rate limits)
Phase 2 (Compute): 2-4 hours (depends on CPU)
Phase 3 (Convert): 30-60 minutes
Phase 4 (Validate): 5-10 minutes
Total: 10-18 hours
Disk Space Requirements:
5-year: ~150 GB free (including klines_cache, can delete after)
10-year: ~400 GB free (likely too large for most systems)
Resume Capability:
The backfiller is fully idempotent. If interrupted, simply re-run the
same command - already-fetched dates are skipped automatically.
"""
import argparse
import subprocess
import sys
import shutil
import time
from pathlib import Path
from datetime import datetime
from typing import List, Tuple, Optional
# Configuration
# NOTE(review): hard-coded Windows per-user paths — adjust per machine / move to env vars.
BACKFILL_DIR = Path(r'C:\Users\Lenovo\Documents\- Dolphin NG Backfill')  # fetch/compute workspace
HCM_DIR = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict')  # downstream pipeline home
BACKFILL_SCRIPT = BACKFILL_DIR / 'historical_klines_backfiller.py'  # phase 1/2 worker (fetch, compute)
CONVERT_SCRIPT = HCM_DIR / 'ng5_arrow_to_vbt_cache.py'  # phase 3 worker (Arrow -> Parquet)
ARROW_DIR = BACKFILL_DIR / 'backfilled_data' / 'arrow_klines'  # one subdirectory per backfilled date
PARQUET_DIR = HCM_DIR / 'vbt_cache_klines'  # final per-day Parquet cache consumed by DOLPHIN
LOG_DIR = HCM_DIR / 'run_logs'  # tee'd subprocess logs land here
# Date ranges
# Named (start, end) windows selectable from the CLI; keys referenced in main().
DATE_RANGES = {
'5y': ('2021-07-01', '2023-12-31'), # Reliable symbol coverage
'7y': ('2019-01-01', '2023-12-31'), # Most symbols available
'max': ('2017-07-01', '2023-12-31'), # Binance launch date
}
def log(msg: str):
    """Echo *msg* to stdout, prefixed with a local timestamp."""
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f'[{stamp}] {msg}')
def check_disk_space() -> Tuple[float, str]:
    """Return free space on C:\\ as (gigabytes, human-readable status)."""
    free_gb = shutil.disk_usage('C:\\').free / (1024 ** 3)
    # Ordered high-to-low: first threshold met wins.
    thresholds = (
        (400, 'SUFFICIENT for 10-year extension'),
        (150, 'SUFFICIENT for 5-year extension'),
        (100, 'MARGINAL - may need cache cleanup during run'),
    )
    for floor, label in thresholds:
        if free_gb >= floor:
            return free_gb, label
    return free_gb, 'INSUFFICIENT - free up disk space first'
def get_current_coverage() -> dict:
    """Summarize what backfilled data already exists on disk.

    Returns a dict with:
      parquet_count / parquet_range  - daily Parquet outputs in PARQUET_DIR
      arrow_count / arrow_range      - per-date Arrow directories in ARROW_DIR
      klines_cache_size_gb           - total size of the raw klines cache
    Ranges are (first, last) stem/name tuples, or None when nothing exists.
    """
    result = {
        'parquet_count': 0,
        'parquet_range': None,
        'arrow_count': 0,
        'arrow_range': None,
        'klines_cache_size_gb': 0,
    }
    # Parquet coverage (stems sort lexicographically == chronologically,
    # presumably named by date — confirm against converter output)
    if PARQUET_DIR.exists():
        parquets = sorted(PARQUET_DIR.glob('*.parquet'))
        result['parquet_count'] = len(parquets)
        if parquets:
            stems = [f.stem for f in parquets]
            result['parquet_range'] = (stems[0], stems[-1])
    # Arrow coverage: one directory per backfilled date
    if ARROW_DIR.exists():
        arrow_dates = sorted(d.name for d in ARROW_DIR.iterdir() if d.is_dir())
        result['arrow_count'] = len(arrow_dates)
        if arrow_dates:
            result['arrow_range'] = (arrow_dates[0], arrow_dates[-1])
    # Klines cache size. Path.rglob replaces the original's shutil.os.walk /
    # shutil.os.path hack (reaching os through shutil's internal import).
    klines_cache = BACKFILL_DIR / 'klines_cache'
    if klines_cache.exists():
        total_size = sum(p.stat().st_size for p in klines_cache.rglob('*') if p.is_file())
        result['klines_cache_size_gb'] = total_size / (1024 ** 3)
    return result
def run_preflight_checks():
    """Print pre-flight checks: disk space, existing coverage, script availability.

    BUG FIX: the original strings contained '\\n' (literal backslash-n), which
    printed the two characters instead of a blank line; replaced with '\n'.
    """
    log('=' * 70)
    log('PRE-FLIGHT CHECKS')
    log('=' * 70)
    # Disk space
    free_gb, status = check_disk_space()
    log('\nDisk Space:')
    log(f' Free: {free_gb:.1f} GB')
    log(f' Status: {status}')
    # Coverage
    cov = get_current_coverage()
    log('\nCurrent Coverage:')
    log(f' Parquet files: {cov["parquet_count"]}')
    if cov['parquet_range']:
        log(f' Parquet range: {cov["parquet_range"][0]} to {cov["parquet_range"][1]}')
    log(f' Arrow directories: {cov["arrow_count"]}')
    if cov['arrow_range']:
        log(f' Arrow range: {cov["arrow_range"][0]} to {cov["arrow_range"][1]}')
    log(f' Klines cache size: {cov["klines_cache_size_gb"]:.2f} GB')
    # Check scripts exist
    log('\nScript Availability:')
    log(f' Backfiller: {"OK" if BACKFILL_SCRIPT.exists() else "MISSING - " + str(BACKFILL_SCRIPT)}')
    log(f' Converter: {"OK" if CONVERT_SCRIPT.exists() else "MISSING - " + str(CONVERT_SCRIPT)}')
    log('=' * 70)
def run_backfill(start_date: str, end_date: str, phase: str = 'both') -> bool:
    """Run the backfill script for a date range, teeing its output to a log file.

    Args:
        start_date / end_date: YYYY-MM-DD strings passed through to the worker.
        phase: 'fetch', 'compute', or 'both' (fetch then compute).
    Returns:
        True when every requested phase exits with code 0, else False.
    """
    if not BACKFILL_SCRIPT.exists():
        log(f'ERROR: Backfill script not found: {BACKFILL_SCRIPT}')
        return False
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    log_file = LOG_DIR / f'backfill_{start_date}_{end_date}.log'
    log(f'Starting backfill: {start_date} to {end_date}')
    log(f'Log file: {log_file}')
    # sys.executable guarantees the child runs under the same interpreter
    # (bare 'python' depends on PATH and may pick a different install).
    cmd_base = [
        sys.executable, str(BACKFILL_SCRIPT),
        '--start', start_date,
        '--end', end_date,
    ]
    phases = []
    if phase == 'both':
        phases = [(['--fetch'], 'fetch'), (['--compute'], 'compute')]
    elif phase == 'fetch':
        phases = [(['--fetch'], 'fetch')]
    elif phase == 'compute':
        phases = [(['--compute'], 'compute')]
    for flags, name in phases:
        log(f'\n>>> Phase: {name.upper()}')
        cmd = cmd_base + flags
        log(f'Command: {" ".join(cmd)}')
        start_time = time.time()
        # Tee child output to both the console and the log file.
        with open(log_file, 'a') as lf:
            lf.write(f'\n{"="*70}\n')
            lf.write(f'Phase: {name} started at {datetime.now()}\n')
            lf.write(f'{"="*70}\n')
            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                cwd=str(BACKFILL_DIR)
            )
            for line in process.stdout:
                print(line, end='')
                lf.write(line)
                lf.flush()
            # BUG FIX: returncode is None until the child is reaped; without
            # wait() the check below saw None != 0 and reported every phase
            # as failed even on success.
            process.wait()
        elapsed = time.time() - start_time
        log(f'Phase {name} completed in {elapsed/3600:.2f} hours')
        if process.returncode != 0:
            log(f'ERROR: Phase {name} failed with code {process.returncode}')
            return False
    return True
def run_convert() -> bool:
    """Run the Arrow -> Parquet conversion script, teeing output to a log file.

    Returns True when the converter exits with code 0.
    """
    if not CONVERT_SCRIPT.exists():
        log(f'ERROR: Converter script not found: {CONVERT_SCRIPT}')
        return False
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    log_file = LOG_DIR / f'convert_{datetime.now():%Y%m%d_%H%M%S}.log'
    log('Starting conversion')
    log(f'Log file: {log_file}')
    # Same interpreter as this script, not whatever 'python' is on PATH.
    cmd = [sys.executable, str(CONVERT_SCRIPT), '--all']
    log(f'Command: {" ".join(cmd)}')
    start_time = time.time()
    with open(log_file, 'a') as lf:
        lf.write(f'\n{"="*70}\n')
        lf.write(f'Conversion started at {datetime.now()}\n')
        lf.write(f'{"="*70}\n')
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            cwd=str(HCM_DIR)
        )
        for line in process.stdout:
            print(line, end='')
            lf.write(line)
            lf.flush()
        # BUG FIX: reap the child so returncode is populated; it is None
        # before wait(), so the original's 'returncode == 0' was always
        # False and conversion was reported as failed even on success.
        process.wait()
    elapsed = time.time() - start_time
    log(f'Conversion completed in {elapsed/60:.1f} minutes')
    return process.returncode == 0
def run_validate():
    """Spot-check the Parquet output: file count, date range, random samples.

    BUG FIX: '\\n' literals replaced with real '\n' newlines in log messages.
    """
    log('=' * 70)
    log('VALIDATION')
    log('=' * 70)
    cov = get_current_coverage()
    log(f'\nParquet files: {cov["parquet_count"]}')
    if cov['parquet_range']:
        log(f'Range: {cov["parquet_range"][0]} to {cov["parquet_range"][1]}')
    # Sample validation: read a handful of random files and report basic stats.
    if cov['parquet_count'] > 0:
        import random
        import pandas as pd
        parquets = sorted(PARQUET_DIR.glob('*.parquet'))
        # Sample 5 random files (or all of them when fewer exist)
        samples = random.sample(parquets, min(5, len(parquets)))
        log(f'\nSample validation ({len(samples)} random files):')
        for p in sorted(samples):
            try:
                df = pd.read_parquet(p)
                price_cols = [c for c in df.columns if c.endswith('USDT')]
                log(f' {p.stem}: {len(df)} rows, {len(price_cols)} price cols, '
                    f'vel_div std: {df["vel_div"].std():.4f}')
            except Exception as e:
                # Best-effort per-file check: report the failure, keep going.
                log(f' {p.stem}: ERROR - {e}')
    log('=' * 70)
def show_plan():
    """Print the execution plan: disk analysis, current state, ranges, commands.

    BUG FIX: '\\n' literals replaced with real '\n' newlines; f-prefixes
    dropped from strings with no placeholders.
    """
    log('=' * 70)
    log('EXECUTION PLAN')
    log('=' * 70)
    free_gb, status = check_disk_space()
    cov = get_current_coverage()
    log('\n1. DISK SPACE ANALYSIS')
    log(f' Free space: {free_gb:.1f} GB')
    log(f' Status: {status}')
    log('\n2. CURRENT STATE')
    log(f' Parquet files: {cov["parquet_count"]}')
    if cov['parquet_range']:
        log(f' Parquet range: {cov["parquet_range"][0]} to {cov["parquet_range"][1]}')
    log(f' Arrow directories: {cov["arrow_count"]}')
    log(f' Klines cache: {cov["klines_cache_size_gb"]:.2f} GB')
    log('\n3. RECOMMENDED BACKFILL RANGES')
    for name, (start, end) in DATE_RANGES.items():
        log(f' {name}: {start} to {end}')
    log('\n4. EXECUTION STEPS')
    log(' Step 1: Fetch klines (longest - 6-12 hours)')
    log(' Step 2: Compute eigenvalues (2-4 hours)')
    log(' Step 3: Convert to Parquet (30-60 minutes)')
    log(' Step 4: Validate (5-10 minutes)')
    log('\n5. COMMANDS TO RUN')
    log(' Option A - Run everything (5-year):')
    log(' python klines_backfill_5y_10y.py --full-5y')
    log(' Option B - Step by step:')
    log(' python klines_backfill_5y_10y.py --backfill-5y')
    log(' python klines_backfill_5y_10y.py --convert')
    log(' python klines_backfill_5y_10y.py --validate')
    log('=' * 70)
def main():
    """Parse CLI flags and dispatch the requested pipeline steps.

    Flags are not mutually exclusive: several actions may be combined in one
    invocation and run in the order coded below. With no flags at all the
    execution plan is shown.
    """
    parser = argparse.ArgumentParser(
        description='DOLPHIN NG5 - 5 Year / 10 Year Klines Dataset Builder',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
python klines_backfill_5y_10y.py --plan # Show execution plan
python klines_backfill_5y_10y.py --preflight # Check prerequisites
python klines_backfill_5y_10y.py --full-5y # Run complete 5-year backfill
'''
    )
    parser.add_argument('--plan', action='store_true',
                        help='Show execution plan without running')
    parser.add_argument('--preflight', action='store_true',
                        help='Run pre-flight checks only')
    parser.add_argument('--backfill-5y', action='store_true',
                        help='Backfill 2021-2023 (5-year extension)')
    parser.add_argument('--backfill-10y', action='store_true',
                        help='Backfill 2017-2023 (10-year extension, needs 400GB)')
    parser.add_argument('--backfill-max', action='store_true',
                        help='Backfill 2017-2023 (max available from Binance)')
    parser.add_argument('--fetch-only', action='store_true',
                        help='Only fetch klines, skip compute')
    parser.add_argument('--compute-only', action='store_true',
                        help='Only compute from existing klines cache')
    parser.add_argument('--convert', action='store_true',
                        help='Convert Arrow to Parquet only')
    parser.add_argument('--validate', action='store_true',
                        help='Validate output only')
    parser.add_argument('--full-5y', action='store_true',
                        help='Run complete 5-year pipeline: backfill + convert + validate')
    parser.add_argument('--start', help='Custom start date (YYYY-MM-DD)')
    parser.add_argument('--end', help='Custom end date (YYYY-MM-DD)')
    args = parser.parse_args()
    # ROBUSTNESS FIX: a lone --start or --end was previously ignored silently
    # (and a lone --end even fell through to the default plan).
    if bool(args.start) != bool(args.end):
        parser.error('--start and --end must be given together')
    # Default to plan if no action flag is given
    if not any([args.plan, args.preflight, args.backfill_5y, args.backfill_10y,
                args.backfill_max, args.fetch_only, args.compute_only,
                args.convert, args.validate, args.full_5y, args.start, args.end]):
        show_plan()
        return
    if args.plan:
        show_plan()
    if args.preflight:
        run_preflight_checks()
    if args.backfill_5y or args.full_5y:
        start, end = DATE_RANGES['5y']
        phase = 'fetch' if args.fetch_only else 'compute' if args.compute_only else 'both'
        if not run_backfill(start, end, phase):
            sys.exit(1)
    if args.backfill_10y:
        # Hard gate: the 10-year raw cache alone needs ~400 GB.
        free_gb, _ = check_disk_space()
        if free_gb < 400:
            log('ERROR: Insufficient disk space for 10-year backfill (need 400GB)')
            sys.exit(1)
        start, end = DATE_RANGES['max']
        phase = 'fetch' if args.fetch_only else 'compute' if args.compute_only else 'both'
        if not run_backfill(start, end, phase):
            sys.exit(1)
    if args.backfill_max:
        start, end = DATE_RANGES['max']
        phase = 'fetch' if args.fetch_only else 'compute' if args.compute_only else 'both'
        if not run_backfill(start, end, phase):
            sys.exit(1)
    if args.start and args.end:
        phase = 'fetch' if args.fetch_only else 'compute' if args.compute_only else 'both'
        if not run_backfill(args.start, args.end, phase):
            sys.exit(1)
    if args.convert or args.full_5y:
        if not run_convert():
            sys.exit(1)
    if args.validate or args.full_5y:
        run_validate()
if __name__ == '__main__':
    main()