374 lines
14 KiB
Python
374 lines
14 KiB
Python
|
|
"""
|
||
|
|
Klines Pipeline Orchestrator
|
||
|
|
=============================
|
||
|
|
Watches the backfill job completion, validates files, then chains:
|
||
|
|
[1] historical_klines_backfiller.py (already running as babejerbg)
|
||
|
|
-> wait for: "Arrow files written:" in output log
|
||
|
|
[2] Validate arrow files (random sample of 2026 dates)
|
||
|
|
[3] ng5_arrow_to_vbt_cache.py --all (converts 2026 arrow_klines -> vbt_cache_klines)
|
||
|
|
[4] Validate parquets (2026 dates in vbt_cache_klines)
|
||
|
|
[5] test_pf_klines_2y_experiment.py (full ~795-day run)
|
||
|
|
|
||
|
|
Run: python klines_pipeline_orchestrator.py
|
||
|
|
"""
|
||
|
|
import sys, time, subprocess, shutil, json, random
|
||
|
|
from pathlib import Path
|
||
|
|
from datetime import datetime
|
||
|
|
|
||
|
|
# Force UTF-8 stdout so em-dashes/box characters survive the Windows console code page.
sys.stdout.reconfigure(encoding='utf-8', errors='replace')

# ── Machine-specific paths (adjust when relocating the pipeline) ──
PYTHON = r"C:\Users\Lenovo\Documents\- Siloqy\Scripts\python.exe"  # venv interpreter used for child processes
HCM = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict")
BACKFILL = Path(r"C:\Users\Lenovo\Documents\- Dolphin NG Backfill")
ARROW_KL = BACKFILL / "backfilled_data" / "arrow_klines"  # step-1 output: per-date dirs of scan_*.arrow
VBT_KL = HCM / "vbt_cache_klines"  # step-3 output: per-date parquet files
EXPERIMENT = HCM / "nautilus_dolphin" / "test_pf_klines_2y_experiment.py"  # step-5 script
CONVERTER = HCM / "ng5_arrow_to_vbt_cache.py"  # step-3 converter script
# Output log of the already-running backfill job (task id babejerbg) — polled in step 1.
BACKFILL_LOG = Path(r"C:\Users\Lenovo\AppData\Local\Temp\claude\C--Users-Lenovo-Documents---Dolphin-NG-HD--NG3-\tasks\babejerbg.output")

POLL_INTERVAL = 30  # seconds between polls
|
||
|
|
|
||
|
|
# ── Helpers ────────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
def ts():
    """Current wall-clock time formatted as HH:MM:SS (log-line prefix)."""
    return f"{datetime.now():%H:%M:%S}"
|
||
|
|
|
||
|
|
def log(msg):
    """Emit *msg* to stdout with a timestamp prefix, flushing immediately."""
    line = f"[{ts()}] {msg}"
    print(line, flush=True)
|
||
|
|
|
||
|
|
def disk_free_gb():
    """Free space on the C: drive, in gibibytes."""
    free_bytes = shutil.disk_usage(r"C:\\").free
    return free_bytes / 1024 / 1024 / 1024
|
||
|
|
|
||
|
|
def check_disk():
    """Log current free disk space, warn when below 10 GB, return the GB figure."""
    gb_free = disk_free_gb()
    log(f"Disk free: {gb_free:.1f} GB")
    if gb_free < 10:
        log("WARNING: < 10 GB free — monitor closely")
    return gb_free
|
||
|
|
|
||
|
|
def dir_size_mb(path: Path):
    """Total size of all regular files under *path*, in MiB (0.0 if absent)."""
    if not path.exists():
        return 0.0
    size_bytes = 0
    for entry in path.rglob('*'):
        if entry.is_file():
            size_bytes += entry.stat().st_size
    return size_bytes / (1024**2)
|
||
|
|
|
||
|
|
# ── Step 1: Poll backfill log for completion ───────────────────────────────────
|
||
|
|
|
||
|
|
def wait_for_backfill():
    """Step 1: block until the external backfill job reports completion.

    Polls BACKFILL_LOG every POLL_INTERVAL seconds, echoing newly appended
    lines.  Returns True once the completion marker 'Arrow files written:'
    appears in the log.  Error/Traceback lines are surfaced but do NOT
    abort the wait (they may be non-fatal); there is no timeout, so this
    loops until the marker shows up.
    """
    log("=" * 60)
    log("STEP 1: Waiting for klines backfill to complete...")
    log(f" Polling: {BACKFILL_LOG}")
    log(f" Poll interval: {POLL_INTERVAL}s")

    last_line_count = 0            # number of log lines already echoed
    last_disk_check = time.time()  # timestamp of the last disk/progress report

    while True:
        time.sleep(POLL_INTERVAL)

        # Read log tail
        if BACKFILL_LOG.exists():
            try:
                lines = BACKFILL_LOG.read_text(encoding='utf-8', errors='replace').splitlines()
            except Exception:
                # Transient read failure (e.g. writer holds the file); retry next poll.
                # NOTE(review): this also resets last_line_count below, so earlier
                # lines may be re-echoed on the next successful read.
                lines = []

            # Echo only the lines appended since the previous poll.
            new_lines = lines[last_line_count:]
            last_line_count = len(lines)

            for line in new_lines:
                if line.strip():
                    print(f" >> {line}", flush=True)

            # Completion markers
            full_text = '\n'.join(lines)
            if 'Arrow files written:' in full_text:
                log("Backfill COMPLETE — 'Arrow files written:' detected")
                # print final summary lines
                for line in lines[-10:]:
                    if line.strip():
                        print(f" SUMMARY: {line}", flush=True)
                return True
            # Scan for errors only in the text BEFORE the completion marker
            # (split()[0]); after completion we would have returned above.
            if 'Traceback' in full_text or 'Error' in full_text.split('Arrow files written:')[0]:
                last_error = [l for l in lines if 'Error' in l or 'Traceback' in l][-3:]
                log(f"ERROR detected in backfill log:")
                for l in last_error:
                    print(f" ERROR: {l}", flush=True)
                # Continue watching — might be a non-fatal error
        else:
            log(f" Log file not found yet: {BACKFILL_LOG}")

        # Periodic disk + progress check
        now = time.time()
        if now - last_disk_check > 300: # every 5 min
            check_disk()
            # Count finished 2026 date dirs as a rough progress indicator (~64 expected).
            n_2026 = len([d for d in ARROW_KL.iterdir() if d.is_dir() and d.name.startswith('2026')]) if ARROW_KL.exists() else 0
            kl_mb = dir_size_mb(ARROW_KL)
            log(f" Progress: {n_2026}/64 2026 date dirs in arrow_klines | total dir size: {kl_mb:.0f} MB")
            last_disk_check = now
|
||
|
|
|
||
|
|
# ── Step 2: Validate arrow files ───────────────────────────────────────────────
|
||
|
|
|
||
|
|
def validate_arrow_files():
    """Step 2: spot-check the 2026 arrow_klines output of the backfill.

    Samples up to 5 date directories spread across the range; for each,
    checks the per-day file count and reads 3 files (start/middle/end of
    day), verifying required schema fields, timestamp plausibility, BTC
    price sanity and non-zero velocities.

    Returns:
        True if every sampled date passes, False otherwise (errors logged).
    """
    import pyarrow as pa
    import pyarrow.ipc as ipc

    log("=" * 60)
    log("STEP 2: Validating 2026 arrow_klines files...")

    # Fields every scan_*.arrow record must carry for downstream conversion.
    REQUIRED_FIELDS = [
        'timestamp_ns', 'w50_velocity', 'w150_velocity', 'w300_velocity',
        'w750_velocity', 'vel_div', 'w50_instability', 'w150_instability',
        'assets_json', 'asset_prices_json',
    ]

    dates_2026 = sorted([d.name for d in ARROW_KL.iterdir()
                         if d.is_dir() and d.name.startswith('2026')])
    if not dates_2026:
        log("ERROR: No 2026 date dirs found in arrow_klines!")
        return False

    log(f" Found {len(dates_2026)} 2026 date dirs: {dates_2026[0]} .. {dates_2026[-1]}")

    # Sample 5 dates spread across the range
    sample_dates = [dates_2026[i] for i in
                    sorted(random.sample(range(len(dates_2026)), min(5, len(dates_2026))))]

    errors = []
    for date_str in sample_dates:
        date_dir = ARROW_KL / date_str
        arrow_files = sorted(date_dir.glob('scan_*.arrow'))
        n_files = len(arrow_files)

        if n_files < 1400:
            errors.append(f"{date_str}: only {n_files} arrow files (expected ~1440)")
            continue

        # Sample 3 files from beginning, middle, end of day
        indices = [0, n_files // 2, n_files - 1]
        file_errors = []
        for idx in indices:
            af = arrow_files[idx]
            try:
                # Keep all checks inside the memory_map context: read_all()
                # may hand back zero-copy views into the mapped buffer.
                with pa.memory_map(str(af), 'r') as src:
                    table = ipc.open_file(src).read_all()
                    if len(table) != 1:
                        file_errors.append(f"{af.name}: {len(table)} rows (expected 1)")
                        continue
                    row = {col: table.column(col)[0].as_py() for col in table.column_names}
                    missing = [f for f in REQUIRED_FIELDS if f not in row]
                    if missing:
                        file_errors.append(f"{af.name}: missing fields {missing}")
                        continue
                    ts_ns = row.get('timestamp_ns', 0)
                    # < 1e15 ns is well before any plausible capture date.
                    if not ts_ns or ts_ns < 1e15:
                        file_errors.append(f"{af.name}: bad timestamp_ns={ts_ns}")
                        continue
                    # Fix: use the module-level `json` import instead of the
                    # original redundant per-iteration `import json as _json`.
                    prices = json.loads(row.get('asset_prices_json', '[]') or '[]')
                    # Assumes index 0 of asset_prices_json is BTC — TODO confirm with writer.
                    if not prices or prices[0] is None or float(prices[0]) < 1000:
                        file_errors.append(f"{af.name}: BTC price suspicious: {prices[:3]}")
                        continue
                    v50 = row.get('w50_velocity', 0)
                    v150 = row.get('w150_velocity', 0)
                    # Zero velocity is expected only at warmup (first file of day).
                    if v50 == 0.0 and v150 == 0.0 and idx > 0:
                        file_errors.append(f"{af.name}: both w50/w150 velocity = 0.0 (warmup issue?)")
            except Exception as e:
                file_errors.append(f"{af.name}: read error: {e}")

        if file_errors:
            errors.append(f"{date_str} ({n_files} files): " + "; ".join(file_errors))
        else:
            # NOTE: prices/row here are from the LAST sampled file of the day.
            log(f" {date_str}: OK ({n_files} files, BTC=${float(prices[0]):.0f}, "
                f"vel_div={float(row.get('vel_div',0)):.4f})")

    if errors:
        log(f"VALIDATION ERRORS ({len(errors)}):")
        for e in errors:
            log(f" ERR: {e}")
        return False

    log(f"Arrow validation PASSED — {len(sample_dates)} sampled dates all OK")
    return True
|
||
|
|
|
||
|
|
# ── Step 3: VBT conversion ─────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
def run_vbt_conversion():
    """Step 3: invoke the arrow->parquet converter over every 2026 date.

    Runs CONVERTER as a child process with --all, reports parquet counts
    and disk usage before/after, and returns True only on exit code 0.
    """
    log("=" * 60)
    log("STEP 3: Converting 2026 arrow_klines -> vbt_cache_klines...")
    log(f" arrow_base : {ARROW_KL}")
    log(f" out_dir : {VBT_KL}")

    free_gb_start = disk_free_gb()
    pq_count_start = sum(1 for _ in VBT_KL.glob('*.parquet'))
    log(f" Parquets before: {pq_count_start} | Disk free: {free_gb_start:.1f} GB")

    cmd = [PYTHON, str(CONVERTER), '--all',
           '--ng5-arrow-base', str(ARROW_KL),
           '--out-dir', str(VBT_KL)]
    log(f" Running: {' '.join(cmd)}")
    proc = subprocess.run(cmd, capture_output=True, text=True,
                          encoding='utf-8', errors='replace')
    print(proc.stdout, flush=True)
    if proc.stderr:
        print("STDERR:", proc.stderr[:2000], flush=True)

    pq_count_end = sum(1 for _ in VBT_KL.glob('*.parquet'))
    free_gb_end = disk_free_gb()
    added = pq_count_end - pq_count_start
    log(f" Parquets after: {pq_count_end} (+{added} new) | Disk free: {free_gb_end:.1f} GB")
    log(f" Disk used by conversion: {free_gb_start - free_gb_end:.2f} GB")

    if proc.returncode != 0:
        log(f"ERROR: converter exited with code {proc.returncode}")
        return False
    return True
|
||
|
|
|
||
|
|
# ── Step 4: Validate parquets ──────────────────────────────────────────────────
|
||
|
|
|
||
|
|
def validate_parquets():
    """Step 4: spot-check the 2026 parquets produced by the VBT conversion.

    Samples up to 5 parquet files spread across the range and verifies the
    required columns, row count, BTC price sanity, null counts and the
    vel_div distribution.

    Returns:
        True if every sampled file passes, False otherwise (errors logged).
    """
    import pandas as pd

    log("=" * 60)
    log("STEP 4: Validating 2026 vbt_cache_klines parquets...")

    # Columns the downstream experiment script expects in each daily parquet.
    REQUIRED_COLS = [
        'timestamp', 'v50_lambda_max_velocity', 'v150_lambda_max_velocity',
        'v300_lambda_max_velocity', 'v750_lambda_max_velocity',
        'vel_div', 'instability_50', 'instability_150', 'BTCUSDT',
    ]

    pqs_2026 = sorted(VBT_KL.glob('2026*.parquet'))
    if not pqs_2026:
        log("ERROR: No 2026 parquets found in vbt_cache_klines!")
        return False

    log(f" Found {len(pqs_2026)} 2026 parquets: {pqs_2026[0].stem} .. {pqs_2026[-1].stem}")

    sample = [pqs_2026[i] for i in
              sorted(random.sample(range(len(pqs_2026)), min(5, len(pqs_2026))))]

    errors = []
    for pf in sample:
        try:
            df = pd.read_parquet(pf)
        except Exception as e:
            errors.append(f"{pf.name}: read error: {e}")
            continue

        missing_cols = [c for c in REQUIRED_COLS if c not in df.columns]
        if missing_cols:
            errors.append(f"{pf.name}: missing columns {missing_cols}")
            continue

        n_rows = len(df)
        if n_rows < 1400:
            errors.append(f"{pf.name}: only {n_rows} rows (expected ~1439)")
            continue

        # Per-file sanity metrics.  (Fix: dropped the original's unused
        # `null_vel = df['vel_div'].isna().sum()` local.)
        null_btc = df['BTCUSDT'].isna().sum()
        null_v50 = df['v50_lambda_max_velocity'].isna().sum()
        zero_vel = (df['v50_lambda_max_velocity'] == 0.0).sum()
        btc_min = df['BTCUSDT'].min()
        btc_max = df['BTCUSDT'].max()
        vd_p5 = df['vel_div'].quantile(0.05)
        vd_p95 = df['vel_div'].quantile(0.95)

        row_errors = []
        if null_btc > 5:
            row_errors.append(f"BTCUSDT null={null_btc}")
        if btc_min < 1000:
            row_errors.append(f"BTC min={btc_min:.0f} (suspicious)")
        if zero_vel > n_rows * 0.10: # >10% zero velocities beyond warmup
            row_errors.append(f"w50 velocity zero={zero_vel}/{n_rows}")
        if abs(vd_p5) > 50 or abs(vd_p95) > 50:
            row_errors.append(f"vel_div range suspicious: p5={vd_p5:.3f}, p95={vd_p95:.3f}")

        if row_errors:
            errors.append(f"{pf.name}: " + "; ".join(row_errors))
        else:
            log(f" {pf.stem}: OK {n_rows} rows | BTC=[{btc_min:.0f},{btc_max:.0f}] "
                f"| vd=[{vd_p5:.3f},{vd_p95:.3f}] | null_v50={null_v50}")

    if errors:
        log(f"VALIDATION ERRORS ({len(errors)}):")
        for e in errors:
            log(f" ERR: {e}")
        return False

    log(f"Parquet validation PASSED — {len(sample)} sampled dates all OK")
    return True
|
||
|
|
|
||
|
|
# ── Step 5: Run experiment ─────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
def run_experiment():
    """Step 5: run the 2-year klines fractal experiment script.

    Child output streams directly to our stdout/stderr; returns True only
    on exit code 0.
    """
    log("=" * 60)
    log("STEP 5: Running klines fractal experiment (~795 days)...")
    log(f" Script: {EXPERIMENT}")

    free_gb_start = disk_free_gb()
    started = time.time()

    proc = subprocess.run(
        [PYTHON, str(EXPERIMENT)],
        cwd=str(HCM / "nautilus_dolphin"),
        capture_output=False,  # stream to stdout live
        text=True,
    )

    minutes = (time.time() - started) / 60
    free_gb_end = disk_free_gb()

    log(f"Experiment finished in {minutes:.1f} min | exit code: {proc.returncode}")
    log(f"Disk free: {free_gb_end:.1f} GB (used {free_gb_start - free_gb_end:.2f} GB)")

    if proc.returncode != 0:
        log("ERROR: experiment exited non-zero")
        return False
    return True
|
||
|
|
|
||
|
|
# ── Main ──────────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
def main():
    """Drive the five pipeline stages in order, aborting on the first failure.

    Disk space is re-checked after the disk-heavy stages (backfill wait
    and VBT conversion), exactly as in the original sequencing.
    """
    log("Klines Pipeline Orchestrator started")
    check_disk()

    # (stage callable, abort message, re-check disk afterwards?)
    stages = [
        (wait_for_backfill, "ABORT: backfill did not complete cleanly", True),
        (validate_arrow_files, "ABORT: arrow file validation failed — inspect errors before proceeding", False),
        (run_vbt_conversion, "ABORT: VBT conversion failed", True),
        (validate_parquets, "ABORT: parquet validation failed — inspect errors before proceeding", False),
        (run_experiment, "ABORT: experiment failed", False),
    ]
    for stage, abort_msg, recheck_disk in stages:
        if not stage():
            log(abort_msg)
            sys.exit(1)
        if recheck_disk:
            check_disk()

    log("=" * 60)
    log("PIPELINE COMPLETE. Check run_logs/ for klines_2y_*.json results.")
|
||
|
|
|
||
|
|
# Entry point: run the full pipeline when executed as a script.
if __name__ == '__main__':
    main()
|