initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
117
prod/convert_arrow_to_parquet_batch.py
Executable file
117
prod/convert_arrow_to_parquet_batch.py
Executable file
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Batch converter for Arrow to Parquet with progress tracking.
|
||||
Processes in chunks to avoid timeout issues.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import json
|
||||
|
||||
# Add paths
|
||||
sys.path.insert(0, str(Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict')))
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
import pyarrow.ipc as ipc
|
||||
|
||||
# Configuration
|
||||
ARROW_BASE = Path(r'C:\Users\Lenovo\Documents\- Dolphin NG Backfill\backfilled_data\arrow_klines')
|
||||
OUTPUT_DIR = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines')
|
||||
LOG_FILE = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\prod\conversion_batch.log')
|
||||
|
||||
def log(msg):
    """Print *msg* with a timestamp and append the same line to LOG_FILE.

    Parameters
    ----------
    msg : str
        Message to record.
    """
    ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    line = f'[{ts}] {msg}'
    print(line)
    # encoding='utf-8' explicitly: this script hard-codes Windows paths, and
    # the Windows locale default (often cp1252) can raise UnicodeEncodeError
    # for non-ASCII log text (e.g. exception messages fed through here).
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(line + '\n')
|
||||
|
||||
def get_arrow_dates():
    """Return the sorted YYYY-MM-DD directory names under ARROW_BASE that
    actually contain at least one ``.arrow`` file."""
    if not ARROW_BASE.exists():
        return []
    # Date directories are identified by shape: 10 chars with a dash at
    # index 4 (e.g. '2026-04-21'); empty directories are skipped.
    return sorted(
        entry.name
        for entry in ARROW_BASE.iterdir()
        if entry.is_dir()
        and len(entry.name) == 10
        and entry.name[4] == '-'
        and any(entry.glob('*.arrow'))
    )
|
||||
|
||||
def get_parquet_dates():
    """Return the sorted date stems of existing ``.parquet`` files in OUTPUT_DIR."""
    if not OUTPUT_DIR.exists():
        return []
    stems = [parquet_file.stem for parquet_file in OUTPUT_DIR.glob('*.parquet')]
    stems.sort()
    return stems
|
||||
|
||||
def convert_date(date_str, force=False):
    """Convert a single date's Arrow data into one Parquet file.

    Parameters
    ----------
    date_str : str
        Date in YYYY-MM-DD form, naming both the Arrow directory and
        the output Parquet file.
    force : bool
        When True, re-convert even if the output already exists.

    Returns
    -------
    tuple[bool, str]
        (success flag, short status tag describing the outcome).
    """
    # Imported lazily so merely loading this module never requires the
    # converter package to be importable.
    from ng5_arrow_to_vbt_cache import convert_arrow_date

    target = OUTPUT_DIR / f"{date_str}.parquet"
    if target.exists() and not force:
        return True, "already_exists"

    try:
        frame = convert_arrow_date(
            date_str, arrow_base=ARROW_BASE, out_dir=OUTPUT_DIR, force=force
        )
    except Exception as e:
        return False, f"error_{e}"

    if frame is None:
        return False, "no_data"
    return True, f"converted_{len(frame)}_rows"
|
||||
|
||||
def main():
    """Convert every Arrow date that has no Parquet file yet, logging progress.

    Scans ARROW_BASE and OUTPUT_DIR, converts the missing dates one by one,
    and writes a progress line every 10 files plus a final summary to the log.
    """
    log('='*70)
    log('BATCH CONVERSION START')
    log('='*70)

    arrow_dates = get_arrow_dates()
    parquet_dates = get_parquet_dates()

    log(f'Arrow dates: {len(arrow_dates)}')
    log(f'Parquet dates: {len(parquet_dates)}')

    # Pending work = dates present as Arrow input but missing as Parquet output.
    to_convert = sorted(set(arrow_dates) - set(parquet_dates))
    log(f'Dates to convert: {len(to_convert)}')

    if not to_convert:
        log('Nothing to convert - all up to date!')
        return

    log(f'Range: {to_convert[0]} to {to_convert[-1]}')

    # NOTE: an unused `batch_size = 50` local was removed here — the batching
    # the module docstring mentions was never implemented; dates are simply
    # converted one at a time with periodic progress reports.
    total = len(to_convert)
    converted = 0
    failed = []

    for i, date_str in enumerate(to_convert):
        success, status = convert_date(date_str)
        if success:
            converted += 1
        else:
            failed.append((date_str, status))

        # Progress report every 10 files (and always on the last file).
        if (i + 1) % 10 == 0 or i == total - 1:
            pct = (i + 1) / total * 100
            log(f'Progress: {i+1}/{total} ({pct:.1f}%) - Converted: {converted}, Failed: {len(failed)}')

    log('='*70)
    log(f'BATCH COMPLETE: {converted}/{total} converted')
    if failed:
        log(f'Failed dates: {len(failed)}')
        # Cap the detailed failure listing at 10 to keep the log compact.
        for d, e in failed[:10]:
            log(f' {d}: {e}')
    log('='*70)
|
||||
|
||||
# Script entry point: run the batch conversion when executed directly.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user