118 lines
3.5 KiB
Python
118 lines
3.5 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Batch converter for Arrow to Parquet with progress tracking.
|
||
|
|
Processes in chunks to avoid timeout issues.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import sys
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
from datetime import datetime
|
||
|
|
import json
|
||
|
|
|
||
|
|
# Add paths
|
||
|
|
sys.path.insert(0, str(Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict')))
|
||
|
|
|
||
|
|
import pandas as pd
|
||
|
|
import numpy as np
|
||
|
|
import pyarrow as pa
|
||
|
|
import pyarrow.ipc as ipc
|
||
|
|
|
||
|
|
# Configuration
# Source tree: one YYYY-MM-DD subdirectory per day, each holding *.arrow files.
ARROW_BASE = Path(r'C:\Users\Lenovo\Documents\- Dolphin NG Backfill\backfilled_data\arrow_klines')
# Destination: one <date>.parquet file per converted day.
OUTPUT_DIR = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines')
# Append-only run log. NOTE(review): assumes the prod\ directory already exists — confirm.
LOG_FILE = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\prod\conversion_batch.log')
|
||
|
|
|
||
|
|
def log(msg):
    """Print *msg* with a timestamp prefix and append the same line to LOG_FILE.

    Args:
        msg: Message text to record.
    """
    ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    line = f'[{ts}] {msg}'
    print(line)
    # Explicit UTF-8: without it, open() uses the locale encoding (cp1252 on the
    # Windows hosts these paths target) and non-ASCII text in a message would
    # raise UnicodeEncodeError mid-run.
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(line + '\n')
|
||
|
|
|
||
|
|
def get_arrow_dates():
    """Return the sorted YYYY-MM-DD directory names under ARROW_BASE that
    contain at least one .arrow file; empty list if the base is missing."""
    if not ARROW_BASE.exists():
        return []
    # A date directory is exactly 10 chars with '-' at index 4 (e.g. 2024-01-31).
    found = [
        entry.name
        for entry in ARROW_BASE.iterdir()
        if entry.is_dir()
        and len(entry.name) == 10
        and entry.name[4] == '-'
        and any(entry.glob('*.arrow'))
    ]
    return sorted(found)
|
||
|
|
|
||
|
|
def get_parquet_dates():
    """Return the sorted date stems of every .parquet file in OUTPUT_DIR,
    or an empty list when the directory does not exist."""
    if not OUTPUT_DIR.exists():
        return []
    stems = (p.stem for p in OUTPUT_DIR.glob('*.parquet'))
    return sorted(stems)
|
||
|
|
|
||
|
|
def convert_date(date_str, force=False):
    """Convert one day's Arrow data to a Parquet file.

    Args:
        date_str: Date in YYYY-MM-DD form.
        force: Re-convert even when the output file already exists.

    Returns:
        A ``(success, status)`` tuple; ``status`` is a short tag for the log.
    """
    # Deferred import: the converter module lives on the path added at startup.
    from ng5_arrow_to_vbt_cache import convert_arrow_date

    target = OUTPUT_DIR / f"{date_str}.parquet"
    if target.exists() and not force:
        return True, "already_exists"

    try:
        frame = convert_arrow_date(
            date_str, arrow_base=ARROW_BASE, out_dir=OUTPUT_DIR, force=force
        )
        if frame is None:
            return False, "no_data"
        return True, f"converted_{len(frame)}_rows"
    except Exception as e:
        # Batch boundary: record the failure and let the caller keep going.
        return False, f"error_{e}"
|
||
|
|
|
||
|
|
def main():
    """Convert every Arrow date that has no Parquet output yet, logging progress.

    Compares the date sets on both sides, converts the missing ones one at a
    time, and reports progress every 10 files plus a final summary.
    """
    log('=' * 70)
    log('BATCH CONVERSION START')
    log('=' * 70)

    arrow_dates = get_arrow_dates()
    parquet_dates = get_parquet_dates()

    log(f'Arrow dates: {len(arrow_dates)}')
    log(f'Parquet dates: {len(parquet_dates)}')

    # Dates present on the Arrow side but missing a Parquet file.
    to_convert = sorted(set(arrow_dates) - set(parquet_dates))
    log(f'Dates to convert: {len(to_convert)}')

    if not to_convert:
        log('Nothing to convert - all up to date!')
        return

    log(f'Range: {to_convert[0]} to {to_convert[-1]}')

    # Fix: the original declared batch_size = 50 but never used it — the loop
    # below processes every date sequentially; the dead variable is removed.
    total = len(to_convert)
    converted = 0
    failed = []

    for i, date_str in enumerate(to_convert):
        success, status = convert_date(date_str)
        if success:
            converted += 1
        else:
            failed.append((date_str, status))

        # Progress report every 10 files, and always on the final one.
        if (i + 1) % 10 == 0 or i == total - 1:
            pct = (i + 1) / total * 100
            log(f'Progress: {i+1}/{total} ({pct:.1f}%) - Converted: {converted}, Failed: {len(failed)}')

    log('=' * 70)
    log(f'BATCH COMPLETE: {converted}/{total} converted')
    if failed:
        log(f'Failed dates: {len(failed)}')
        # Cap the detail at the first 10 failures to keep the log readable.
        for d, e in failed[:10]:
            log(f' {d}: {e}')
    log('=' * 70)
|
||
|
|
|
||
|
|
# Script entry point: run the full batch conversion when executed directly.
if __name__ == '__main__':
    main()
|