initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
421
prod/klines_backfill_5y_10y.py
Executable file
421
prod/klines_backfill_5y_10y.py
Executable file
@@ -0,0 +1,421 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DOLPHIN NG5 - 5 Year / 10 Year Klines Dataset Builder
|
||||
======================================================
|
||||
|
||||
This script orchestrates the backfill of 1-minute klines data from Binance
|
||||
to extend the DOLPHIN dataset from ~2 years to 5+ years of history.
|
||||
|
||||
Usage:
|
||||
python klines_backfill_5y_10y.py --plan # Show execution plan
|
||||
python klines_backfill_5y_10y.py --preflight # Run pre-flight checks only
|
||||
python klines_backfill_5y_10y.py --backfill-5y # Run 5-year backfill (2021-2023)
|
||||
python klines_backfill_5y_10y.py --backfill-10y # Run 10-year backfill (2017-2023)
|
||||
python klines_backfill_5y_10y.py --convert # Convert Arrow to Parquet only
|
||||
python klines_backfill_5y_10y.py --validate # Validate output only
|
||||
python klines_backfill_5y_10y.py --full-5y # Run all: backfill + convert + validate
|
||||
|
||||
Expected Runtime (5-year):
|
||||
Phase 1 (Fetch): 6-12 hours (depends on network, rate limits)
|
||||
Phase 2 (Compute): 2-4 hours (depends on CPU)
|
||||
Phase 3 (Convert): 30-60 minutes
|
||||
Phase 4 (Validate): 5-10 minutes
|
||||
Total: 10-18 hours
|
||||
|
||||
Disk Space Requirements:
|
||||
5-year: ~150 GB free (including klines_cache, can delete after)
|
||||
10-year: ~400 GB free (likely too large for most systems)
|
||||
|
||||
Resume Capability:
|
||||
The backfiller is fully idempotent. If interrupted, simply re-run the
|
||||
same command - already-fetched dates are skipped automatically.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
import shutil
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
# Configuration
# NOTE(review): absolute Windows paths — this launcher assumes this exact
# machine layout; confirm before reusing elsewhere.
BACKFILL_DIR = Path(r'C:\Users\Lenovo\Documents\- Dolphin NG Backfill')
HCM_DIR = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict')
# External worker scripts this script orchestrates via subprocess.
BACKFILL_SCRIPT = BACKFILL_DIR / 'historical_klines_backfiller.py'
CONVERT_SCRIPT = HCM_DIR / 'ng5_arrow_to_vbt_cache.py'
# Data locations: per-date Arrow input dirs and the Parquet output cache.
ARROW_DIR = BACKFILL_DIR / 'backfilled_data' / 'arrow_klines'
PARQUET_DIR = HCM_DIR / 'vbt_cache_klines'
LOG_DIR = HCM_DIR / 'run_logs'

# Date ranges
# Named (start, end) ISO-date ranges selected by the CLI flags below.
DATE_RANGES = {
    '5y': ('2021-07-01', '2023-12-31'),  # Reliable symbol coverage
    '7y': ('2019-01-01', '2023-12-31'),  # Most symbols available
    'max': ('2017-07-01', '2023-12-31'),  # Binance launch date
}
|
||||
|
||||
|
||||
def log(msg: str):
    """Emit *msg* to stdout prefixed with a '[YYYY-MM-DD HH:MM:SS]' stamp."""
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('[' + stamp + '] ' + msg)
|
||||
|
||||
|
||||
def check_disk_space(root: str = 'C:\\') -> Tuple[float, str]:
    """Return free disk space and a sufficiency verdict.

    Fixes the original docstring's invalid ``\\ `` escape (SyntaxWarning on
    Python 3.12+) and generalizes the hard-coded drive into a defaulted
    parameter (backward-compatible: callers without arguments still check C:).

    Args:
        root: Path whose filesystem is measured (default: the C: drive,
            matching the Windows layout of the configured data dirs).

    Returns:
        Tuple of (free space in GiB, human-readable status string keyed to
        the 5y/10y disk-space estimates in the module docstring).
    """
    usage = shutil.disk_usage(root)
    free_gb = usage.free / (1024 ** 3)

    # Thresholds mirror the "Disk Space Requirements" in the module docstring.
    if free_gb >= 400:
        status = 'SUFFICIENT for 10-year extension'
    elif free_gb >= 150:
        status = 'SUFFICIENT for 5-year extension'
    elif free_gb >= 100:
        status = 'MARGINAL - may need cache cleanup during run'
    else:
        status = 'INSUFFICIENT - free up disk space first'

    return free_gb, status
|
||||
|
||||
|
||||
def get_current_coverage() -> dict:
    """Summarize what data already exists on disk.

    Returns:
        dict with keys:
            parquet_count / parquet_range: number of ``*.parquet`` files in
                PARQUET_DIR and (first_stem, last_stem) or None.
            arrow_count / arrow_range: number of per-date subdirectories in
                ARROW_DIR and (first, last) or None.
            klines_cache_size_gb: total size (GiB) of the raw klines cache.
    """
    result = {
        'parquet_count': 0,
        'parquet_range': None,
        'arrow_count': 0,
        'arrow_range': None,
        'klines_cache_size_gb': 0,
    }

    # Parquet coverage — stems are presumably date-named so lexical sort
    # equals chronological order; first/last give the covered range.
    if PARQUET_DIR.exists():
        parquets = sorted(PARQUET_DIR.glob('*.parquet'))
        result['parquet_count'] = len(parquets)
        if parquets:
            result['parquet_range'] = (parquets[0].stem, parquets[-1].stem)

    # Arrow coverage: one directory per backfilled date.
    if ARROW_DIR.exists():
        arrow_dates = sorted(d.name for d in ARROW_DIR.iterdir() if d.is_dir())
        result['arrow_count'] = len(arrow_dates)
        if arrow_dates:
            result['arrow_range'] = (arrow_dates[0], arrow_dates[-1])

    # Klines cache size. Idiom fix: the original walked via shutil.os.walk /
    # shutil.os.path (shutil merely re-exports os); use pathlib recursion.
    klines_cache = BACKFILL_DIR / 'klines_cache'
    if klines_cache.exists():
        total_size = sum(
            f.stat().st_size for f in klines_cache.rglob('*') if f.is_file()
        )
        result['klines_cache_size_gb'] = total_size / (1024 ** 3)

    return result
|
||||
|
||||
|
||||
def run_preflight_checks():
    """Run all pre-flight checks and log a report.

    Checks free disk space, current parquet/arrow coverage, and that both
    external worker scripts exist. Informational only — never exits.
    """
    log('=' * 70)
    log('PRE-FLIGHT CHECKS')
    log('=' * 70)

    # Disk space
    free_gb, status = check_disk_space()
    # BUG FIX: the original '\\n' printed a literal backslash-n; a real
    # newline was clearly intended.
    log('\nDisk Space:')
    log(f' Free: {free_gb:.1f} GB')
    log(f' Status: {status}')

    # Coverage
    cov = get_current_coverage()
    log('\nCurrent Coverage:')
    log(f' Parquet files: {cov["parquet_count"]}')
    if cov['parquet_range']:
        log(f' Parquet range: {cov["parquet_range"][0]} to {cov["parquet_range"][1]}')
    log(f' Arrow directories: {cov["arrow_count"]}')
    if cov['arrow_range']:
        log(f' Arrow range: {cov["arrow_range"][0]} to {cov["arrow_range"][1]}')
    log(f' Klines cache size: {cov["klines_cache_size_gb"]:.2f} GB')

    # Check scripts exist
    log('\nScript Availability:')
    log(f' Backfiller: {"OK" if BACKFILL_SCRIPT.exists() else "MISSING - " + str(BACKFILL_SCRIPT)}')
    log(f' Converter: {"OK" if CONVERT_SCRIPT.exists() else "MISSING - " + str(CONVERT_SCRIPT)}')

    log('=' * 70)
|
||||
|
||||
|
||||
def run_backfill(start_date: str, end_date: str, phase: str = 'both'):
    """Run the external backfill script for a date range, teeing output to a log.

    Args:
        start_date: Range start, 'YYYY-MM-DD', passed through to the backfiller.
        end_date: Range end, 'YYYY-MM-DD'.
        phase: 'fetch', 'compute', or 'both' (fetch then compute).

    Returns:
        True if every requested phase exited with code 0, else False.
    """
    if not BACKFILL_SCRIPT.exists():
        log(f'ERROR: Backfill script not found: {BACKFILL_SCRIPT}')
        return False

    LOG_DIR.mkdir(exist_ok=True)
    log_file = LOG_DIR / f'backfill_{start_date}_{end_date}.log'

    log(f'Starting backfill: {start_date} to {end_date}')
    log(f'Log file: {log_file}')

    # Robustness fix: launch with the interpreter running this script rather
    # than whatever 'python' resolves to on PATH.
    cmd_base = [
        sys.executable, str(BACKFILL_SCRIPT),
        '--start', start_date,
        '--end', end_date,
    ]

    phases = []
    if phase == 'both':
        phases = [(['--fetch'], 'fetch'), (['--compute'], 'compute')]
    elif phase == 'fetch':
        phases = [(['--fetch'], 'fetch')]
    elif phase == 'compute':
        phases = [(['--compute'], 'compute')]

    for flags, name in phases:
        # BUG FIX: '\\n' literals replaced by real newlines throughout.
        log(f'\n>>> Phase: {name.upper()}')
        cmd = cmd_base + flags
        log(f'Command: {" ".join(cmd)}')

        start_time = time.time()

        # Run with output tee to log file
        with open(log_file, 'a') as lf:
            lf.write(f'\n{"="*70}\n')
            lf.write(f'Phase: {name} started at {datetime.now()}\n')
            lf.write(f'{"="*70}\n')

            process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                cwd=str(BACKFILL_DIR)
            )

            for line in process.stdout:
                print(line, end='')
                lf.write(line)
                lf.flush()

            # BUG FIX: returncode is only populated after wait(); without it
            # the check below could see None and misreport a failure.
            process.wait()

        elapsed = time.time() - start_time
        log(f'Phase {name} completed in {elapsed/3600:.2f} hours')

        if process.returncode != 0:
            log(f'ERROR: Phase {name} failed with code {process.returncode}')
            return False

    return True
|
||||
|
||||
|
||||
def run_convert():
    """Run the external Arrow-to-Parquet conversion script, teeing to a log.

    Returns:
        True if the converter exited with code 0, else False.
    """
    if not CONVERT_SCRIPT.exists():
        log(f'ERROR: Converter script not found: {CONVERT_SCRIPT}')
        return False

    LOG_DIR.mkdir(exist_ok=True)
    log_file = LOG_DIR / f'convert_{datetime.now():%Y%m%d_%H%M%S}.log'

    log('Starting conversion')
    log(f'Log file: {log_file}')

    # Robustness fix: use the current interpreter, not a PATH lookup.
    cmd = [sys.executable, str(CONVERT_SCRIPT), '--all']
    log(f'Command: {" ".join(cmd)}')

    start_time = time.time()

    with open(log_file, 'a') as lf:
        # BUG FIX: '\\n' wrote a literal backslash-n into the log file;
        # real newlines were intended.
        lf.write(f'\n{"="*70}\n')
        lf.write(f'Conversion started at {datetime.now()}\n')
        lf.write(f'{"="*70}\n')

        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            cwd=str(HCM_DIR)
        )

        for line in process.stdout:
            print(line, end='')
            lf.write(line)
            lf.flush()

        # BUG FIX: without wait(), returncode may still be None below and the
        # success check would falsely return False.
        process.wait()

    elapsed = time.time() - start_time
    log(f'Conversion completed in {elapsed/60:.1f} minutes')

    return process.returncode == 0
|
||||
|
||||
|
||||
def run_validate():
    """Validate the Parquet output.

    Logs the file count/range, then spot-checks up to 5 random parquet files
    for readability, *USDT price-column presence, and vel_div variance.
    """
    log('=' * 70)
    log('VALIDATION')
    log('=' * 70)

    cov = get_current_coverage()

    # BUG FIX: the original '\\n' printed a literal backslash-n; a real
    # newline was intended.
    log(f'\nParquet files: {cov["parquet_count"]}')
    if cov['parquet_range']:
        log(f'Range: {cov["parquet_range"][0]} to {cov["parquet_range"][1]}')

    # Sample validation
    if cov['parquet_count'] > 0:
        import random
        import pandas as pd

        parquets = sorted(PARQUET_DIR.glob('*.parquet'))

        # Sample 5 random files
        samples = random.sample(parquets, min(5, len(parquets)))
        log(f'\nSample validation ({len(samples)} random files):')

        for p in sorted(samples):
            try:
                df = pd.read_parquet(p)
                price_cols = [c for c in df.columns if c.endswith('USDT')]
                log(f' {p.stem}: {len(df)} rows, {len(price_cols)} price cols, '
                    f'vel_div std: {df["vel_div"].std():.4f}')
            except Exception as e:
                # Best-effort spot check: report the bad file and continue.
                log(f' {p.stem}: ERROR - {e}')

    log('=' * 70)
|
||||
|
||||
|
||||
def show_plan():
    """Log the execution plan: disk space, current state, recommended date
    ranges, pipeline steps, and example commands. Read-only."""
    log('=' * 70)
    log('EXECUTION PLAN')
    log('=' * 70)

    free_gb, status = check_disk_space()
    cov = get_current_coverage()

    # BUG FIX: '\\n' literals printed a literal backslash-n; replaced with
    # real newlines. Placeholder-free f-prefixes also dropped (idiom).
    log('\n1. DISK SPACE ANALYSIS')
    log(f' Free space: {free_gb:.1f} GB')
    log(f' Status: {status}')

    log('\n2. CURRENT STATE')
    log(f' Parquet files: {cov["parquet_count"]}')
    if cov['parquet_range']:
        log(f' Parquet range: {cov["parquet_range"][0]} to {cov["parquet_range"][1]}')
    log(f' Arrow directories: {cov["arrow_count"]}')
    log(f' Klines cache: {cov["klines_cache_size_gb"]:.2f} GB')

    log('\n3. RECOMMENDED BACKFILL RANGES')
    for name, (start, end) in DATE_RANGES.items():
        log(f' {name}: {start} to {end}')

    log('\n4. EXECUTION STEPS')
    log(' Step 1: Fetch klines (longest - 6-12 hours)')
    log(' Step 2: Compute eigenvalues (2-4 hours)')
    log(' Step 3: Convert to Parquet (30-60 minutes)')
    log(' Step 4: Validate (5-10 minutes)')

    log('\n5. COMMANDS TO RUN')
    log(' Option A - Run everything (5-year):')
    log(' python klines_backfill_5y_10y.py --full-5y')
    log(' Option B - Step by step:')
    log(' python klines_backfill_5y_10y.py --backfill-5y')
    log(' python klines_backfill_5y_10y.py --convert')
    log(' python klines_backfill_5y_10y.py --validate')

    log('=' * 70)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='DOLPHIN NG5 - 5 Year / 10 Year Klines Dataset Builder',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog='''
|
||||
Examples:
|
||||
python klines_backfill_5y_10y.py --plan # Show execution plan
|
||||
python klines_backfill_5y_10y.py --preflight # Check prerequisites
|
||||
python klines_backfill_5y_10y.py --full-5y # Run complete 5-year backfill
|
||||
'''
|
||||
)
|
||||
|
||||
parser.add_argument('--plan', action='store_true',
|
||||
help='Show execution plan without running')
|
||||
parser.add_argument('--preflight', action='store_true',
|
||||
help='Run pre-flight checks only')
|
||||
parser.add_argument('--backfill-5y', action='store_true',
|
||||
help='Backfill 2021-2023 (5-year extension)')
|
||||
parser.add_argument('--backfill-10y', action='store_true',
|
||||
help='Backfill 2017-2023 (10-year extension, needs 400GB)')
|
||||
parser.add_argument('--backfill-max', action='store_true',
|
||||
help='Backfill 2017-2023 (max available from Binance)')
|
||||
parser.add_argument('--fetch-only', action='store_true',
|
||||
help='Only fetch klines, skip compute')
|
||||
parser.add_argument('--compute-only', action='store_true',
|
||||
help='Only compute from existing klines cache')
|
||||
parser.add_argument('--convert', action='store_true',
|
||||
help='Convert Arrow to Parquet only')
|
||||
parser.add_argument('--validate', action='store_true',
|
||||
help='Validate output only')
|
||||
parser.add_argument('--full-5y', action='store_true',
|
||||
help='Run complete 5-year pipeline: backfill + convert + validate')
|
||||
parser.add_argument('--start', help='Custom start date (YYYY-MM-DD)')
|
||||
parser.add_argument('--end', help='Custom end date (YYYY-MM-DD)')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Default to plan if no args
|
||||
if not any([args.plan, args.preflight, args.backfill_5y, args.backfill_10y,
|
||||
args.backfill_max, args.fetch_only, args.compute_only,
|
||||
args.convert, args.validate, args.full_5y, args.start]):
|
||||
show_plan()
|
||||
return
|
||||
|
||||
if args.plan:
|
||||
show_plan()
|
||||
|
||||
if args.preflight:
|
||||
run_preflight_checks()
|
||||
|
||||
if args.backfill_5y or args.full_5y:
|
||||
start, end = DATE_RANGES['5y']
|
||||
phase = 'fetch' if args.fetch_only else 'compute' if args.compute_only else 'both'
|
||||
if not run_backfill(start, end, phase):
|
||||
sys.exit(1)
|
||||
|
||||
if args.backfill_10y:
|
||||
free_gb, _ = check_disk_space()
|
||||
if free_gb < 400:
|
||||
log('ERROR: Insufficient disk space for 10-year backfill (need 400GB)')
|
||||
sys.exit(1)
|
||||
start, end = DATE_RANGES['max']
|
||||
phase = 'fetch' if args.fetch_only else 'compute' if args.compute_only else 'both'
|
||||
if not run_backfill(start, end, phase):
|
||||
sys.exit(1)
|
||||
|
||||
if args.backfill_max:
|
||||
start, end = DATE_RANGES['max']
|
||||
phase = 'fetch' if args.fetch_only else 'compute' if args.compute_only else 'both'
|
||||
if not run_backfill(start, end, phase):
|
||||
sys.exit(1)
|
||||
|
||||
if args.start and args.end:
|
||||
phase = 'fetch' if args.fetch_only else 'compute' if args.compute_only else 'both'
|
||||
if not run_backfill(args.start, args.end, phase):
|
||||
sys.exit(1)
|
||||
|
||||
if args.convert or args.full_5y:
|
||||
if not run_convert():
|
||||
sys.exit(1)
|
||||
|
||||
if args.validate or args.full_5y:
|
||||
run_validate()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user