"""DOLPHIN ExF Backfill for Klines Dates
=========================================
Writes ExF Indicators NPZ files for all 1,710 klines parquet dates so that
ACBv6 can read funding_btc, dvol_btc, fng, and taker for those dates.

Problem:
    backfill_runner.py reads NG3 JSON scan directories to get timestamps.
    Klines dates (2021-2026) have no NG3 JSON scans → ACBv6
    _load_external_factors() returns neutral defaults → boost=1.0 always →
    inverse-boost component is dead.

Solution:
    For each klines date, call
    ExternalFactorsFetcher.fetch_sync(target_date=noon_UTC) and write a
    minimal NPZ to EIGENVALUES_PATH/YYYY-MM-DD/scan_000001__Indicators.npz
    in the exact format ACBv6 expects:
    api_names + api_indicators + api_success.

Output format (ACBv6 compatible):
    data['api_names']      : np.array of indicator name strings (N_INDICATORS)
    data['api_indicators'] : np.float64 array of values (N_INDICATORS)
    data['api_success']    : np.bool_ array (N_INDICATORS)

Idempotent: skips dates where the NPZ already exists. Each NPZ is written to
a temporary file and renamed into place, so an interrupted run never leaves a
truncated NPZ that later runs would skip.
Rate-limited: configurable delay between dates (default 1.0s).

Usage:
    cd "C:\\Users\\Lenovo\\Documents\\- DOLPHIN NG HD HCM TSF Predict\\external_factors"
    "C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py
    "C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py --dry-run
    "C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py --start 2022-01-01 --end 2022-12-31

Expected runtime: 2-5 hours for all 1710 dates (network-dependent).
Most of the value (funding_btc, dvol_btc, fng, taker) comes from a few API
calls per date. CURRENT-only indicators will fail gracefully
(api_success=False, value=0).
"""
import argparse
import asyncio  # noqa: F401  (not referenced in this file; import kept as-is)
import sys
import sys as _sys
import time
from datetime import datetime, timezone
from pathlib import Path

import numpy as np

# The progress output contains non-ASCII characters ("→", "—"). Force UTF-8
# where possible, but do not crash at import time when stdout is None or has
# been replaced by a stream that cannot be reconfigured.
try:
    sys.stdout.reconfigure(encoding='utf-8', errors='replace')
except (AttributeError, ValueError, OSError):
    pass

# -- Paths --
HCM_DIR = Path(__file__).parent.parent if _sys.platform == 'win32' else Path('/mnt/dolphin')
KLINES_DIR = HCM_DIR / "vbt_cache_klines"
EIGENVALUES_PATH = (Path(r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues")
                    if _sys.platform == 'win32' else Path('/mnt/ng6_data/eigenvalues'))
NPZ_FILENAME = "scan_000001__Indicators.npz"  # single synthetic scan per date

# Make sibling modules (external_factors_matrix) importable when run as a script.
sys.path.insert(0, str(Path(__file__).parent))


def _iso_date(value):
    """argparse ``type``: validate a YYYY-MM-DD date and return it zero-padded.

    Dates are later compared as strings against parquet file stems, so a
    non-padded value such as ``2022-1-5`` would silently filter incorrectly.

    Raises:
        argparse.ArgumentTypeError: if *value* is not a valid calendar date.
    """
    try:
        return datetime.strptime(value, "%Y-%m-%d").date().isoformat()
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid date {value!r}; expected YYYY-MM-DD"
        ) from None


def parse_args(argv=None) -> argparse.Namespace:
    """Parse command-line options.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        Namespace with ``start``, ``end``, ``dry_run``, ``delay`` and
        ``overwrite`` attributes.
    """
    p = argparse.ArgumentParser(description="Backfill ExF NPZ files for klines dates")
    p.add_argument("--start", type=_iso_date, default=None,
                   help="Start date YYYY-MM-DD (inclusive)")
    p.add_argument("--end", type=_iso_date, default=None,
                   help="End date YYYY-MM-DD (inclusive)")
    p.add_argument("--dry-run", action="store_true",
                   help="Print what would be done, skip writes")
    p.add_argument("--delay", type=float, default=1.0,
                   help="Seconds between date fetches (default 1.0)")
    p.add_argument("--overwrite", action="store_true",
                   help="Re-fetch and overwrite existing NPZ files")
    return p.parse_args(argv)


def _list_klines_dates(klines_dir, start=None, end=None):
    """Return the sorted parquet file stems in *klines_dir*, bounded by start/end.

    Files whose *name* contains ``catalog`` are ignored. Only the file name is
    tested, so a parent directory that happens to contain "catalog" does not
    exclude every date. Bounds are inclusive and compared as strings, which is
    correct for zero-padded YYYY-MM-DD stems.
    """
    stems = [
        p.stem
        for p in sorted(klines_dir.glob("*.parquet"))
        if 'catalog' not in p.name
    ]
    if start:
        stems = [d for d in stems if d >= start]
    if end:
        stems = [d for d in stems if d <= end]
    return stems


def _noon_utc(ds):
    """Return noon UTC for a ``YYYY-MM-DD...`` string; raise ValueError if malformed."""
    return datetime(int(ds[:4]), int(ds[5:7]), int(ds[8:10]), 12, 0, 0,
                    tzinfo=timezone.utc)


def _write_npz_atomic(out_npz, **arrays):
    """Write *arrays* as a compressed NPZ at *out_npz* without exposing partial files.

    The archive is written to ``<name>.tmp`` in the same directory and renamed
    over the final path only after it has been fully written and closed. The
    existence check that makes the backfill idempotent therefore never sees a
    truncated NPZ. An open file object is passed to numpy so that it does not
    append ``.npz`` to the temporary name.

    Raises:
        OSError: if the directory, temporary file, or rename fails.
    """
    out_npz.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = out_npz.with_name(out_npz.name + ".tmp")
    try:
        with open(tmp_path, "wb") as fh:
            np.savez_compressed(fh, **arrays)
        tmp_path.replace(out_npz)
    except BaseException:
        # Best-effort cleanup of the partial temp file, then propagate.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise


def main(argv=None):
    """Fetch ExF indicators for every klines date and write one NPZ per date.

    Args:
        argv: Optional argument list forwarded to :func:`parse_args`;
            ``None`` reads the process command line.

    Per-date failures (bad date string, fetch exception, write error) are
    reported, counted in the final summary, and do not abort the run.
    """
    args = parse_args(argv)

    # Import ExF infrastructure (deferred so --help works without it).
    from external_factors_matrix import ExternalFactorsFetcher, Config, INDICATORS, N_INDICATORS

    # Build ordered name list (matches matrix index: names[i] = INDICATORS[i].name)
    ind_names = np.array([ind.name for ind in INDICATORS], dtype=object)

    fetcher = ExternalFactorsFetcher(Config())

    # Enumerate klines dates, filtered by --start / --end
    date_strings = _list_klines_dates(KLINES_DIR, args.start, args.end)
    total = len(date_strings)

    # time.sleep() raises ValueError on negative values; clamp once up front.
    delay = max(args.delay, 0.0)

    print(f"Klines dates to process: {total}")
    print(f"EIGENVALUES_PATH: {EIGENVALUES_PATH}")
    print(f"Dry run: {args.dry_run} Overwrite: {args.overwrite} Delay: {args.delay}s\n")

    if args.dry_run:
        print("DRY RUN — no files will be written.\n")

    skipped = 0
    written = 0
    errors = 0
    t0 = time.time()

    for n, ds in enumerate(date_strings, start=1):
        out_npz = EIGENVALUES_PATH / ds / NPZ_FILENAME

        # Skip if exists and not overwriting
        if out_npz.exists() and not args.overwrite:
            skipped += 1
            continue

        # Fetch at noon UTC for this date
        try:
            target_dt = _noon_utc(ds)
        except ValueError:
            print(f" [{n}/{total}] {ds}: BAD DATE FORMAT — skip")
            errors += 1
            continue

        if args.dry_run:
            print(f" [{n}/{total}] {ds}: would fetch {target_dt.isoformat()} → {out_npz}")
            written += 1
            continue

        try:
            result = fetcher.fetch_sync(target_date=target_dt)
        except Exception as e:  # boundary: one bad date must not end the run
            print(f" [{n}/{total}] {ds}: FETCH ERROR — {e}")
            errors += 1
            time.sleep(delay)
            continue

        # Build NPZ arrays in ACBv6-compatible format
        matrix = result['matrix']    # np.float64 array, 0-indexed (matrix[id-1])
        details = result['details']  # {id: {'name': ..., 'value': ..., 'success': bool}}

        api_indicators = matrix.astype(np.float64)
        # details is keyed by 1-based indicator id; missing ids count as failures.
        api_success = np.array(
            [details.get(ind_id, {}).get('success', False)
             for ind_id in range(1, N_INDICATORS + 1)],
            dtype=np.bool_,
        )
        success_count = result.get('success_count', int(api_success.sum()))

        # Write NPZ (atomically, so a crash cannot leave a partial file behind)
        try:
            _write_npz_atomic(
                out_npz,
                api_names=ind_names,
                api_indicators=api_indicators,
                api_success=api_success,
            )
        except OSError as e:
            print(f" [{n}/{total}] {ds}: WRITE ERROR — {e}")
            errors += 1
            time.sleep(delay)
            continue
        written += 1

        if n % 10 == 0:
            # Progress every 10 dates
            elapsed = time.time() - t0
            rate = written / elapsed if elapsed > 0 else 1
            eta = (total - n) / rate if rate > 0 else 0
            print(f" [{n}/{total}] {ds} ok={success_count}/{N_INDICATORS}"
                  f" elapsed={elapsed/60:.1f}m eta={eta/60:.1f}m"
                  f" written={written} skipped={skipped} errors={errors}")
        else:
            # Brief per-date confirmation
            key_vals = {
                'funding': round(float(api_indicators[0]), 6),  # id=1 → idx 0
                'dvol': round(float(api_indicators[10]), 2),    # id=11 → idx 10
            }
            print(f" {ds} ok={success_count} funding={key_vals['funding']:+.4f} dvol={key_vals['dvol']:.1f}")

        time.sleep(delay)

    elapsed_total = time.time() - t0
    print(f"\n{'='*60}")
    print(f" ExF Klines Backfill COMPLETE")
    print(f" Written: {written}")
    print(f" Skipped: {skipped} (already existed)")
    print(f" Errors: {errors}")
    print(f" Runtime: {elapsed_total/60:.1f}m")
    print(f"{'='*60}")

    if written > 0 and not args.dry_run:
        print(f"\n ACBv6 will now find ExF data for klines dates.")
        print(f" Re-run test_pf_5y_klines.py to get the full-boost ACBv6 results.")


if __name__ == "__main__":
    main()