initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
117
prod/convert_arrow_to_parquet_batch.py
Executable file
117
prod/convert_arrow_to_parquet_batch.py
Executable file
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Batch converter for Arrow to Parquet with progress tracking.
|
||||
Processes in chunks to avoid timeout issues.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import json
|
||||
|
||||
# Add paths
|
||||
sys.path.insert(0, str(Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict')))
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import pyarrow as pa
|
||||
import pyarrow.ipc as ipc
|
||||
|
||||
# Configuration
|
||||
ARROW_BASE = Path(r'C:\Users\Lenovo\Documents\- Dolphin NG Backfill\backfilled_data\arrow_klines')
|
||||
OUTPUT_DIR = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines')
|
||||
LOG_FILE = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\prod\conversion_batch.log')
|
||||
|
||||
def log(msg):
    """Print *msg* with a timestamp and append the same line to LOG_FILE.

    Parameters
    ----------
    msg : str
        Message to record.
    """
    ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    line = f'[{ts}] {msg}'
    print(line)
    # encoding='utf-8' explicitly: this script hard-codes Windows paths, and
    # the Windows locale default (often cp1252) can raise UnicodeEncodeError
    # for non-ASCII log text (e.g. exception messages fed through here).
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(line + '\n')
|
||||
|
||||
def get_arrow_dates():
    """Return the sorted YYYY-MM-DD directory names under ARROW_BASE that
    actually contain at least one ``.arrow`` file."""
    if not ARROW_BASE.exists():
        return []
    # Date directories are identified by shape: 10 chars with a dash at
    # index 4 (e.g. '2026-04-21'); empty directories are skipped.
    return sorted(
        entry.name
        for entry in ARROW_BASE.iterdir()
        if entry.is_dir()
        and len(entry.name) == 10
        and entry.name[4] == '-'
        and any(entry.glob('*.arrow'))
    )
|
||||
|
||||
def get_parquet_dates():
    """Return the sorted date stems of existing ``.parquet`` files in OUTPUT_DIR."""
    if not OUTPUT_DIR.exists():
        return []
    stems = [parquet_file.stem for parquet_file in OUTPUT_DIR.glob('*.parquet')]
    stems.sort()
    return stems
|
||||
|
||||
def convert_date(date_str, force=False):
    """Convert a single date's Arrow data into one Parquet file.

    Parameters
    ----------
    date_str : str
        Date in YYYY-MM-DD form, naming both the Arrow directory and
        the output Parquet file.
    force : bool
        When True, re-convert even if the output already exists.

    Returns
    -------
    tuple[bool, str]
        (success flag, short status tag describing the outcome).
    """
    # Imported lazily so merely loading this module never requires the
    # converter package to be importable.
    from ng5_arrow_to_vbt_cache import convert_arrow_date

    target = OUTPUT_DIR / f"{date_str}.parquet"
    if target.exists() and not force:
        return True, "already_exists"

    try:
        frame = convert_arrow_date(
            date_str, arrow_base=ARROW_BASE, out_dir=OUTPUT_DIR, force=force
        )
    except Exception as e:
        return False, f"error_{e}"

    if frame is None:
        return False, "no_data"
    return True, f"converted_{len(frame)}_rows"
|
||||
|
||||
def main():
    """Convert every Arrow date that has no Parquet file yet, logging progress.

    Scans ARROW_BASE and OUTPUT_DIR, converts the missing dates one by one,
    and writes a progress line every 10 files plus a final summary to the log.
    """
    log('='*70)
    log('BATCH CONVERSION START')
    log('='*70)

    arrow_dates = get_arrow_dates()
    parquet_dates = get_parquet_dates()

    log(f'Arrow dates: {len(arrow_dates)}')
    log(f'Parquet dates: {len(parquet_dates)}')

    # Pending work = dates present as Arrow input but missing as Parquet output.
    to_convert = sorted(set(arrow_dates) - set(parquet_dates))
    log(f'Dates to convert: {len(to_convert)}')

    if not to_convert:
        log('Nothing to convert - all up to date!')
        return

    log(f'Range: {to_convert[0]} to {to_convert[-1]}')

    # NOTE: an unused `batch_size = 50` local was removed here — the batching
    # the module docstring mentions was never implemented; dates are simply
    # converted one at a time with periodic progress reports.
    total = len(to_convert)
    converted = 0
    failed = []

    for i, date_str in enumerate(to_convert):
        success, status = convert_date(date_str)
        if success:
            converted += 1
        else:
            failed.append((date_str, status))

        # Progress report every 10 files (and always on the last file).
        if (i + 1) % 10 == 0 or i == total - 1:
            pct = (i + 1) / total * 100
            log(f'Progress: {i+1}/{total} ({pct:.1f}%) - Converted: {converted}, Failed: {len(failed)}')

    log('='*70)
    log(f'BATCH COMPLETE: {converted}/{total} converted')
    if failed:
        log(f'Failed dates: {len(failed)}')
        # Cap the detailed failure listing at 10 to keep the log compact.
        for d, e in failed[:10]:
            log(f' {d}: {e}')
    log('='*70)
|
||||
|
||||
# Script entry point: run the batch conversion when executed directly.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user