""" Klines Pipeline Orchestrator ============================= Watches the backfill job completion, validates files, then chains: [1] historical_klines_backfiller.py (already running as babejerbg) -> wait for: "Arrow files written:" in output log [2] Validate arrow files (random sample of 2026 dates) [3] ng5_arrow_to_vbt_cache.py --all (converts 2026 arrow_klines -> vbt_cache_klines) [4] Validate parquets (2026 dates in vbt_cache_klines) [5] test_pf_klines_2y_experiment.py (full ~795-day run) Run: python klines_pipeline_orchestrator.py """ import sys, time, subprocess, shutil, json, random from pathlib import Path from datetime import datetime sys.stdout.reconfigure(encoding='utf-8', errors='replace') PYTHON = r"C:\Users\Lenovo\Documents\- Siloqy\Scripts\python.exe" HCM = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict") BACKFILL = Path(r"C:\Users\Lenovo\Documents\- Dolphin NG Backfill") ARROW_KL = BACKFILL / "backfilled_data" / "arrow_klines" VBT_KL = HCM / "vbt_cache_klines" EXPERIMENT = HCM / "nautilus_dolphin" / "test_pf_klines_2y_experiment.py" CONVERTER = HCM / "ng5_arrow_to_vbt_cache.py" BACKFILL_LOG = Path(r"C:\Users\Lenovo\AppData\Local\Temp\claude\C--Users-Lenovo-Documents---Dolphin-NG-HD--NG3-\tasks\babejerbg.output") POLL_INTERVAL = 30 # seconds between polls # ── Helpers ──────────────────────────────────────────────────────────────────── def ts(): return datetime.now().strftime('%H:%M:%S') def log(msg): print(f"[{ts()}] {msg}", flush=True) def disk_free_gb(): usage = shutil.disk_usage(r"C:\\") return usage.free / (1024**3) def check_disk(): free = disk_free_gb() log(f"Disk free: {free:.1f} GB") if free < 10: log("WARNING: < 10 GB free — monitor closely") return free def dir_size_mb(path: Path): if not path.exists(): return 0.0 total = sum(f.stat().st_size for f in path.rglob('*') if f.is_file()) return total / (1024**2) # ── Step 1: Poll backfill log for completion ─────────────────────────────────── def wait_for_backfill(): log("=" * 60) log("STEP 1: Waiting for klines backfill to complete...") log(f" Polling: {BACKFILL_LOG}") log(f" Poll interval: {POLL_INTERVAL}s") last_line_count = 0 last_disk_check = time.time() while True: time.sleep(POLL_INTERVAL) # Read log tail if BACKFILL_LOG.exists(): try: lines = BACKFILL_LOG.read_text(encoding='utf-8', errors='replace').splitlines() except Exception: lines = [] new_lines = lines[last_line_count:] last_line_count = len(lines) for line in new_lines: if line.strip(): print(f" >> {line}", flush=True) # Completion markers full_text = '\n'.join(lines) if 'Arrow files written:' in full_text: log("Backfill COMPLETE — 'Arrow files written:' detected") # print final summary lines for line in lines[-10:]: if line.strip(): print(f" SUMMARY: {line}", flush=True) return True if 'Traceback' in full_text or 'Error' in full_text.split('Arrow files written:')[0]: last_error = [l for l in lines if 'Error' in l or 'Traceback' in l][-3:] log(f"ERROR detected in backfill log:") for l in last_error: print(f" ERROR: {l}", flush=True) # Continue watching — might be a non-fatal error else: log(f" Log file not found yet: {BACKFILL_LOG}") # Periodic disk + progress check now = time.time() if now - last_disk_check > 300: # every 5 min check_disk() n_2026 = len([d for d in ARROW_KL.iterdir() if d.is_dir() and d.name.startswith('2026')]) if ARROW_KL.exists() else 0 kl_mb = dir_size_mb(ARROW_KL) log(f" Progress: {n_2026}/64 2026 date dirs in arrow_klines | total dir size: {kl_mb:.0f} MB") last_disk_check = now # ── Step 2: Validate arrow files ─────────────────────────────────────────────── def validate_arrow_files(): import pyarrow as pa import pyarrow.ipc as ipc log("=" * 60) log("STEP 2: Validating 2026 arrow_klines files...") REQUIRED_FIELDS = [ 'timestamp_ns', 'w50_velocity', 'w150_velocity', 'w300_velocity', 'w750_velocity', 'vel_div', 'w50_instability', 'w150_instability', 'assets_json', 'asset_prices_json', ] dates_2026 = sorted([d.name for d in ARROW_KL.iterdir() if d.is_dir() and d.name.startswith('2026')]) if not dates_2026: log("ERROR: No 2026 date dirs found in arrow_klines!") return False log(f" Found {len(dates_2026)} 2026 date dirs: {dates_2026[0]} .. {dates_2026[-1]}") # Sample 5 dates spread across the range sample_dates = [dates_2026[i] for i in sorted(random.sample(range(len(dates_2026)), min(5, len(dates_2026))))] errors = [] for date_str in sample_dates: date_dir = ARROW_KL / date_str arrow_files = sorted(date_dir.glob('scan_*.arrow')) n_files = len(arrow_files) if n_files < 1400: errors.append(f"{date_str}: only {n_files} arrow files (expected ~1440)") continue # Sample 3 files from beginning, middle, end of day indices = [0, n_files // 2, n_files - 1] file_errors = [] for idx in indices: af = arrow_files[idx] try: with pa.memory_map(str(af), 'r') as src: table = ipc.open_file(src).read_all() if len(table) != 1: file_errors.append(f"{af.name}: {len(table)} rows (expected 1)") continue row = {col: table.column(col)[0].as_py() for col in table.column_names} missing = [f for f in REQUIRED_FIELDS if f not in row] if missing: file_errors.append(f"{af.name}: missing fields {missing}") continue ts_ns = row.get('timestamp_ns', 0) if not ts_ns or ts_ns < 1e15: file_errors.append(f"{af.name}: bad timestamp_ns={ts_ns}") continue import json as _json prices = _json.loads(row.get('asset_prices_json', '[]') or '[]') if not prices or prices[0] is None or float(prices[0]) < 1000: file_errors.append(f"{af.name}: BTC price suspicious: {prices[:3]}") continue v50 = row.get('w50_velocity', 0) v150 = row.get('w150_velocity', 0) if v50 == 0.0 and v150 == 0.0 and idx > 0: file_errors.append(f"{af.name}: both w50/w150 velocity = 0.0 (warmup issue?)") except Exception as e: file_errors.append(f"{af.name}: read error: {e}") if file_errors: errors.append(f"{date_str} ({n_files} files): " + "; ".join(file_errors)) else: log(f" {date_str}: OK ({n_files} files, BTC=${float(prices[0]):.0f}, " f"vel_div={float(row.get('vel_div',0)):.4f})") if errors: log(f"VALIDATION ERRORS ({len(errors)}):") for e in errors: log(f" ERR: {e}") return False log(f"Arrow validation PASSED — {len(sample_dates)} sampled dates all OK") return True # ── Step 3: VBT conversion ───────────────────────────────────────────────────── def run_vbt_conversion(): log("=" * 60) log("STEP 3: Converting 2026 arrow_klines -> vbt_cache_klines...") log(f" arrow_base : {ARROW_KL}") log(f" out_dir : {VBT_KL}") disk_before = disk_free_gb() before_count = len(list(VBT_KL.glob('*.parquet'))) log(f" Parquets before: {before_count} | Disk free: {disk_before:.1f} GB") cmd = [ PYTHON, str(CONVERTER), '--all', '--ng5-arrow-base', str(ARROW_KL), '--out-dir', str(VBT_KL), ] log(f" Running: {' '.join(cmd)}") result = subprocess.run(cmd, capture_output=True, text=True, encoding='utf-8', errors='replace') print(result.stdout, flush=True) if result.stderr: print("STDERR:", result.stderr[:2000], flush=True) after_count = len(list(VBT_KL.glob('*.parquet'))) disk_after = disk_free_gb() new_pqs = after_count - before_count log(f" Parquets after: {after_count} (+{new_pqs} new) | Disk free: {disk_after:.1f} GB") log(f" Disk used by conversion: {disk_before - disk_after:.2f} GB") if result.returncode != 0: log(f"ERROR: converter exited with code {result.returncode}") return False return True # ── Step 4: Validate parquets ────────────────────────────────────────────────── def validate_parquets(): import pandas as pd log("=" * 60) log("STEP 4: Validating 2026 vbt_cache_klines parquets...") REQUIRED_COLS = [ 'timestamp', 'v50_lambda_max_velocity', 'v150_lambda_max_velocity', 'v300_lambda_max_velocity', 'v750_lambda_max_velocity', 'vel_div', 'instability_50', 'instability_150', 'BTCUSDT', ] pqs_2026 = sorted(VBT_KL.glob('2026*.parquet')) if not pqs_2026: log("ERROR: No 2026 parquets found in vbt_cache_klines!") return False log(f" Found {len(pqs_2026)} 2026 parquets: {pqs_2026[0].stem} .. {pqs_2026[-1].stem}") sample = [pqs_2026[i] for i in sorted(random.sample(range(len(pqs_2026)), min(5, len(pqs_2026))))] errors = [] for pf in sample: try: df = pd.read_parquet(pf) except Exception as e: errors.append(f"{pf.name}: read error: {e}") continue missing_cols = [c for c in REQUIRED_COLS if c not in df.columns] if missing_cols: errors.append(f"{pf.name}: missing columns {missing_cols}") continue n_rows = len(df) if n_rows < 1400: errors.append(f"{pf.name}: only {n_rows} rows (expected ~1439)") continue null_vel = df['vel_div'].isna().sum() null_btc = df['BTCUSDT'].isna().sum() null_v50 = df['v50_lambda_max_velocity'].isna().sum() zero_vel = (df['v50_lambda_max_velocity'] == 0.0).sum() btc_min = df['BTCUSDT'].min() btc_max = df['BTCUSDT'].max() vd_p5 = df['vel_div'].quantile(0.05) vd_p95 = df['vel_div'].quantile(0.95) row_errors = [] if null_btc > 5: row_errors.append(f"BTCUSDT null={null_btc}") if btc_min < 1000: row_errors.append(f"BTC min={btc_min:.0f} (suspicious)") if zero_vel > n_rows * 0.10: # >10% zero velocities beyond warmup row_errors.append(f"w50 velocity zero={zero_vel}/{n_rows}") if abs(vd_p5) > 50 or abs(vd_p95) > 50: row_errors.append(f"vel_div range suspicious: p5={vd_p5:.3f}, p95={vd_p95:.3f}") if row_errors: errors.append(f"{pf.name}: " + "; ".join(row_errors)) else: log(f" {pf.stem}: OK {n_rows} rows | BTC=[{btc_min:.0f},{btc_max:.0f}] " f"| vd=[{vd_p5:.3f},{vd_p95:.3f}] | null_v50={null_v50}") if errors: log(f"VALIDATION ERRORS ({len(errors)}):") for e in errors: log(f" ERR: {e}") return False log(f"Parquet validation PASSED — {len(sample)} sampled dates all OK") return True # ── Step 5: Run experiment ───────────────────────────────────────────────────── def run_experiment(): log("=" * 60) log("STEP 5: Running klines fractal experiment (~795 days)...") log(f" Script: {EXPERIMENT}") disk_before = disk_free_gb() t0 = time.time() cmd = [PYTHON, str(EXPERIMENT)] result = subprocess.run( cmd, cwd=str(HCM / "nautilus_dolphin"), capture_output=False, # stream to stdout live text=True, ) elapsed = time.time() - t0 disk_after = disk_free_gb() log(f"Experiment finished in {elapsed/60:.1f} min | exit code: {result.returncode}") log(f"Disk free: {disk_after:.1f} GB (used {disk_before - disk_after:.2f} GB)") if result.returncode != 0: log("ERROR: experiment exited non-zero") return False return True # ── Main ────────────────────────────────────────────────────────────────────── def main(): log("Klines Pipeline Orchestrator started") check_disk() # Step 1: wait for backfill if not wait_for_backfill(): log("ABORT: backfill did not complete cleanly") sys.exit(1) check_disk() # Step 2: validate arrow files if not validate_arrow_files(): log("ABORT: arrow file validation failed — inspect errors before proceeding") sys.exit(1) # Step 3: VBT conversion if not run_vbt_conversion(): log("ABORT: VBT conversion failed") sys.exit(1) check_disk() # Step 4: validate parquets if not validate_parquets(): log("ABORT: parquet validation failed — inspect errors before proceeding") sys.exit(1) # Step 5: run experiment if not run_experiment(): log("ABORT: experiment failed") sys.exit(1) log("=" * 60) log("PIPELINE COMPLETE. Check run_logs/ for klines_2y_*.json results.") if __name__ == '__main__': main()