Files
DOLPHIN/prod/convert_arrow_to_parquet_batch.py

118 lines
3.5 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Batch converter for Arrow to Parquet with progress tracking.
Processes in chunks to avoid timeout issues.
"""
import sys
import time
from pathlib import Path
from datetime import datetime
import json
# Add paths
# NOTE(review): prepends the prediction project root so that
# ng5_arrow_to_vbt_cache (imported inside convert_date) resolves.
sys.path.insert(0, str(Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict')))
# NOTE(review): time, json, pd, np, pa and ipc are never referenced in this
# script — confirm they are not needed by a consumer before removing.
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.ipc as ipc
# Configuration
# Source tree of per-date folders holding *.arrow files (one folder per date).
ARROW_BASE = Path(r'C:\Users\Lenovo\Documents\- Dolphin NG Backfill\backfilled_data\arrow_klines')
# Destination for one <date>.parquet file per converted date.
OUTPUT_DIR = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines')
# Append-only run log written by log() below.
LOG_FILE = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\prod\conversion_batch.log')
def log(msg):
    """Print *msg* with a timestamp and append the same line to LOG_FILE.

    The file is opened in append mode on every call, so repeated or
    overlapping runs never truncate earlier entries.
    """
    ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    line = f'[{ts}] {msg}'
    print(line)
    # encoding='utf-8' explicitly: the Windows default (cp1252) would raise
    # UnicodeEncodeError on non-ASCII characters in a message.
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(line + '\n')
def get_arrow_dates():
    """Return sorted YYYY-MM-DD folder names under ARROW_BASE that hold data.

    A folder counts only if its name looks like an ISO date (10 chars with a
    dash at index 4) and it contains at least one .arrow file.
    """
    if not ARROW_BASE.exists():
        return []

    def _looks_like_date(name):
        # Cheap shape check, not full validation — matches '2024-01-31' style.
        return len(name) == 10 and name[4] == '-'

    return sorted(
        entry.name
        for entry in ARROW_BASE.iterdir()
        if entry.is_dir()
        and _looks_like_date(entry.name)
        and any(entry.glob('*.arrow'))
    )
def get_parquet_dates():
    """Return the sorted date stems of every *.parquet file in OUTPUT_DIR."""
    if not OUTPUT_DIR.exists():
        return []
    stems = [entry.stem for entry in OUTPUT_DIR.glob('*.parquet')]
    stems.sort()
    return stems
def convert_date(date_str, force=False):
    """Convert one date's Arrow files into a single Parquet file.

    Returns a ``(success, status)`` pair where *status* is a short
    machine-readable tag describing what happened.
    """
    # Imported lazily: the module lives in the project tree added to sys.path.
    from ng5_arrow_to_vbt_cache import convert_arrow_date

    target = OUTPUT_DIR / f"{date_str}.parquet"
    # Skip dates that are already converted unless a rebuild is forced.
    if not force and target.exists():
        return True, "already_exists"
    try:
        result = convert_arrow_date(date_str, arrow_base=ARROW_BASE, out_dir=OUTPUT_DIR, force=force)
    except Exception as e:
        # Best-effort batch job: record the failure and let the caller continue.
        return False, f"error_{e}"
    if result is None:
        return False, "no_data"
    return True, f"converted_{len(result)}_rows"
def main():
    """Convert every Arrow date that does not yet have a Parquet file.

    Progress is logged every 10 files; per-date failures are collected and
    summarised at the end instead of aborting the whole batch.
    """
    log('='*70)
    log('BATCH CONVERSION START')
    log('='*70)
    arrow_dates = get_arrow_dates()
    parquet_dates = get_parquet_dates()
    log(f'Arrow dates: {len(arrow_dates)}')
    log(f'Parquet dates: {len(parquet_dates)}')
    # Pending work = dates present as Arrow input but missing a Parquet output.
    to_convert = sorted(set(arrow_dates) - set(parquet_dates))
    log(f'Dates to convert: {len(to_convert)}')
    if not to_convert:
        log('Nothing to convert - all up to date!')
        return
    log(f'Range: {to_convert[0]} to {to_convert[-1]}')
    # NOTE: dates are processed one by one; the old unused batch_size=50 local
    # (never referenced) has been removed.
    total = len(to_convert)
    converted = 0
    failed = []
    for i, date_str in enumerate(to_convert):
        success, status = convert_date(date_str)
        if success:
            converted += 1
        else:
            failed.append((date_str, status))
        # Progress report every 10 files (and always after the last one).
        if (i + 1) % 10 == 0 or i == total - 1:
            pct = (i + 1) / total * 100
            log(f'Progress: {i+1}/{total} ({pct:.1f}%) - Converted: {converted}, Failed: {len(failed)}')
    log('='*70)
    log(f'BATCH COMPLETE: {converted}/{total} converted')
    if failed:
        log(f'Failed dates: {len(failed)}')
        # Show at most the first 10 failures to keep the summary readable.
        for d, e in failed[:10]:
            log(f' {d}: {e}')
    log('='*70)
# Standard script entry guard: run the batch only when executed directly.
if __name__ == '__main__':
    main()