#!/usr/bin/env python3
"""Batch converter for Arrow klines to Parquet with progress tracking.

Scans the Arrow base directory for per-date subfolders (YYYY-MM-DD), compares
them against the Parquet output directory, and converts every missing date,
logging progress periodically so long runs can be monitored from the log file.
"""
import sys
import time
from pathlib import Path
from datetime import datetime
import json

# Make the project package importable (provides ng5_arrow_to_vbt_cache).
sys.path.insert(0, str(Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict')))

import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.ipc as ipc

# Configuration
ARROW_BASE = Path(r'C:\Users\Lenovo\Documents\- Dolphin NG Backfill\backfilled_data\arrow_klines')
OUTPUT_DIR = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines')
LOG_FILE = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\prod\conversion_batch.log')

# Emit a progress line every this many processed dates.
PROGRESS_EVERY = 10


def log(msg):
    """Print *msg* with a timestamp and append the same line to LOG_FILE.

    The file is opened in append mode so repeated runs accumulate in one log;
    encoding is pinned to UTF-8 so output does not depend on the OS locale.
    """
    ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    line = f'[{ts}] {msg}'
    print(line)
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(line + '\n')


def get_arrow_dates():
    """Return sorted date-named subdirs of ARROW_BASE that contain .arrow files."""
    if not ARROW_BASE.exists():
        return []
    dates = []
    for d in ARROW_BASE.iterdir():
        # Cheap shape check for YYYY-MM-DD: 10 chars with a dash at index 4.
        if d.is_dir() and len(d.name) == 10 and d.name[4] == '-':
            # Only count the date if it actually holds at least one Arrow file.
            if any(d.glob('*.arrow')):
                dates.append(d.name)
    return sorted(dates)


def get_parquet_dates():
    """Return sorted date stems of existing .parquet files in OUTPUT_DIR."""
    if not OUTPUT_DIR.exists():
        return []
    return sorted(f.stem for f in OUTPUT_DIR.glob('*.parquet'))


def convert_date(date_str, force=False):
    """Convert a single date from Arrow to Parquet.

    Args:
        date_str: Date in YYYY-MM-DD form, matching an ARROW_BASE subdir.
        force: When True, re-convert even if the Parquet file already exists.

    Returns:
        (success, status) — *status* is a short machine-readable tag
        describing the outcome ("already_exists", "converted_N_rows",
        "no_data", or "error_<exc>").
    """
    # Imported lazily so a broken project module fails per-date with a logged
    # status instead of crashing the whole batch at startup.
    from ng5_arrow_to_vbt_cache import convert_arrow_date

    out_file = OUTPUT_DIR / f"{date_str}.parquet"
    if out_file.exists() and not force:
        return True, "already_exists"
    try:
        df = convert_arrow_date(date_str, arrow_base=ARROW_BASE, out_dir=OUTPUT_DIR, force=force)
        if df is not None:
            return True, f"converted_{len(df)}_rows"
        return False, "no_data"
    except Exception as e:
        # Deliberate best-effort: one bad date must not abort the batch;
        # the failure is collected and reported by main().
        return False, f"error_{e}"


def main():
    """Convert every Arrow date that has no Parquet output yet."""
    log('=' * 70)
    log('BATCH CONVERSION START')
    log('=' * 70)

    arrow_dates = get_arrow_dates()
    parquet_dates = get_parquet_dates()
    log(f'Arrow dates: {len(arrow_dates)}')
    log(f'Parquet dates: {len(parquet_dates)}')

    to_convert = sorted(set(arrow_dates) - set(parquet_dates))
    log(f'Dates to convert: {len(to_convert)}')
    if not to_convert:
        log('Nothing to convert - all up to date!')
        return
    log(f'Range: {to_convert[0]} to {to_convert[-1]}')

    total = len(to_convert)
    converted = 0
    failed = []

    for i, date_str in enumerate(to_convert):
        success, status = convert_date(date_str)
        if success:
            converted += 1
        else:
            failed.append((date_str, status))

        # Periodic progress report (and always on the final item).
        if (i + 1) % PROGRESS_EVERY == 0 or i == total - 1:
            pct = (i + 1) / total * 100
            log(f'Progress: {i+1}/{total} ({pct:.1f}%) - Converted: {converted}, Failed: {len(failed)}')

    log('=' * 70)
    log(f'BATCH COMPLETE: {converted}/{total} converted')
    if failed:
        log(f'Failed dates: {len(failed)}')
        # Cap the detail dump at the first 10 failures to keep the log short.
        for d, e in failed[:10]:
            log(f'  {d}: {e}')
    log('=' * 70)


if __name__ == '__main__':
    main()