Files
DOLPHIN/prod/convert_arrow_to_parquet_batch.py

118 lines
3.5 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Batch converter for Arrow to Parquet with progress tracking.
Processes in chunks to avoid timeout issues.
"""
import sys
import time
from pathlib import Path
from datetime import datetime
import json
# Add paths
# NOTE(review): prepends the prediction project root so that
# ng5_arrow_to_vbt_cache (imported inside convert_date) resolves.
sys.path.insert(0, str(Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict')))
# NOTE(review): time, json, pd, np, pa and ipc are never referenced in this
# script — confirm they are not needed by a consumer before removing.
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.ipc as ipc
# Configuration
# Source tree of per-date folders holding *.arrow files (one folder per date).
ARROW_BASE = Path(r'C:\Users\Lenovo\Documents\- Dolphin NG Backfill\backfilled_data\arrow_klines')
# Destination for one <date>.parquet file per converted date.
OUTPUT_DIR = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines')
# Append-only run log written by log() below.
LOG_FILE = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\prod\conversion_batch.log')
def log(msg):
    """Print *msg* with a timestamp and append the same line to LOG_FILE.

    The file is opened in append mode on every call, so repeated or
    overlapping runs never truncate earlier entries.
    """
    ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    line = f'[{ts}] {msg}'
    print(line)
    # encoding='utf-8' explicitly: the Windows default (cp1252) would raise
    # UnicodeEncodeError on non-ASCII characters in a message.
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(line + '\n')
def get_arrow_dates():
    """Return sorted YYYY-MM-DD folder names under ARROW_BASE that hold data.

    A folder counts only if its name looks like an ISO date (10 chars with a
    dash at index 4) and it contains at least one .arrow file.
    """
    if not ARROW_BASE.exists():
        return []

    def _looks_like_date(name):
        # Cheap shape check, not full validation — matches '2024-01-31' style.
        return len(name) == 10 and name[4] == '-'

    return sorted(
        entry.name
        for entry in ARROW_BASE.iterdir()
        if entry.is_dir()
        and _looks_like_date(entry.name)
        and any(entry.glob('*.arrow'))
    )
def get_parquet_dates():
    """Return the sorted date stems of every *.parquet file in OUTPUT_DIR."""
    if not OUTPUT_DIR.exists():
        return []
    stems = [entry.stem for entry in OUTPUT_DIR.glob('*.parquet')]
    stems.sort()
    return stems
def convert_date(date_str, force=False):
    """Convert one date's Arrow files into a single Parquet file.

    Returns a ``(success, status)`` pair where *status* is a short
    machine-readable tag describing what happened.
    """
    # Imported lazily: the module lives in the project tree added to sys.path.
    from ng5_arrow_to_vbt_cache import convert_arrow_date

    target = OUTPUT_DIR / f"{date_str}.parquet"
    # Skip dates that are already converted unless a rebuild is forced.
    if not force and target.exists():
        return True, "already_exists"
    try:
        result = convert_arrow_date(date_str, arrow_base=ARROW_BASE, out_dir=OUTPUT_DIR, force=force)
    except Exception as e:
        # Best-effort batch job: record the failure and let the caller continue.
        return False, f"error_{e}"
    if result is None:
        return False, "no_data"
    return True, f"converted_{len(result)}_rows"
def main():
    """Convert every Arrow date that does not yet have a Parquet file.

    Progress is logged every 10 files; per-date failures are collected and
    summarised at the end instead of aborting the whole batch.
    """
    log('='*70)
    log('BATCH CONVERSION START')
    log('='*70)
    arrow_dates = get_arrow_dates()
    parquet_dates = get_parquet_dates()
    log(f'Arrow dates: {len(arrow_dates)}')
    log(f'Parquet dates: {len(parquet_dates)}')
    # Pending work = dates present as Arrow input but missing a Parquet output.
    to_convert = sorted(set(arrow_dates) - set(parquet_dates))
    log(f'Dates to convert: {len(to_convert)}')
    if not to_convert:
        log('Nothing to convert - all up to date!')
        return
    log(f'Range: {to_convert[0]} to {to_convert[-1]}')
    # NOTE: dates are processed one by one; the old unused batch_size=50 local
    # (never referenced) has been removed.
    total = len(to_convert)
    converted = 0
    failed = []
    for i, date_str in enumerate(to_convert):
        success, status = convert_date(date_str)
        if success:
            converted += 1
        else:
            failed.append((date_str, status))
        # Progress report every 10 files (and always after the last one).
        if (i + 1) % 10 == 0 or i == total - 1:
            pct = (i + 1) / total * 100
            log(f'Progress: {i+1}/{total} ({pct:.1f}%) - Converted: {converted}, Failed: {len(failed)}')
    log('='*70)
    log(f'BATCH COMPLETE: {converted}/{total} converted')
    if failed:
        log(f'Failed dates: {len(failed)}')
        # Show at most the first 10 failures to keep the summary readable.
        for d, e in failed[:10]:
            log(f' {d}: {e}')
    log('='*70)
# Standard script entry guard: run the batch only when executed directly.
if __name__ == '__main__':
    main()