182 lines
7.5 KiB
Python
182 lines
7.5 KiB
Python
"""DOLPHIN ExF Backfill for Klines Dates
=========================================
Writes ExF Indicators NPZ files for all 1,710 klines parquet dates so that
ACBv6 can read funding_btc, dvol_btc, fng, and taker for those dates.

Problem:
  backfill_runner.py reads NG3 JSON scan directories to get timestamps.
  Klines dates (2021-2026) have no NG3 JSON scans → ACBv6 _load_external_factors()
  returns neutral defaults → boost=1.0 always → inverse-boost component is dead.

Solution:
  For each klines date, call ExternalFactorsFetcher.fetch_sync(target_date=noon_UTC)
  and write a minimal NPZ to EIGENVALUES_PATH/YYYY-MM-DD/scan_000001__Indicators.npz
  in the exact format ACBv6 expects: api_names + api_indicators + api_success.

Output format (ACBv6 compatible):
  data['api_names'] : np.array of indicator name strings (N_INDICATORS)
  data['api_indicators'] : np.float64 array of values (N_INDICATORS)
  data['api_success'] : np.bool_ array (N_INDICATORS)

Idempotent: skips dates where the NPZ already exists.
Rate-limited: configurable delay between dates (default 1.0s).

Usage:
  cd "C:\\Users\\Lenovo\\Documents\\- DOLPHIN NG HD HCM TSF Predict\\external_factors"
  "C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py
  "C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py --dry-run
  "C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py --start 2022-01-01 --end 2022-12-31

Expected runtime: 2-5 hours for all 1710 dates (network-dependent).
Most of the value (funding_btc, dvol_btc, fng, taker) comes from a few API calls
per date. CURRENT-only indicators will fail gracefully (api_success=False, value=0).
"""
|
||
|
|
import sys, time, argparse, asyncio
|
||
|
|
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
|
||
|
|
from pathlib import Path
|
||
|
|
from datetime import datetime, timezone
|
||
|
|
|
||
|
|
import numpy as np
|
||
|
|
|
||
|
|
# -- Paths --
# All locations are chosen once, at import time, from the running platform.
import sys as _sys

if _sys.platform == 'win32':
    # Windows: project root is two levels above this script; eigenvalues live
    # in the NG3 tree under the user's Documents folder.
    HCM_DIR = Path(__file__).parent.parent
    EIGENVALUES_PATH = Path(
        r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues"
    )
else:
    # Any other platform: fixed mount points.
    HCM_DIR = Path('/mnt/dolphin')
    EIGENVALUES_PATH = Path('/mnt/ng6_data/eigenvalues')

# Directory scanned for one parquet file per klines date.
KLINES_DIR = HCM_DIR / "vbt_cache_klines"

# single synthetic scan per date
NPZ_FILENAME = "scan_000001__Indicators.npz"

# Put this script's own directory first on the import search path.
sys.path.insert(0, str(Path(__file__).parent))
|
||
|
|
|
||
|
|
def _iso_date(value):
    """argparse ``type=`` callable: validate a date and return it as ``YYYY-MM-DD``.

    The caller filters date strings with plain string comparison, which is only
    correct for zero-padded ISO dates. Input such as ``2022-1-5`` is therefore
    normalised to ``2022-01-05``; anything that is not a real calendar date is
    rejected with a normal argparse usage error.
    """
    try:
        parsed = datetime.strptime(value, "%Y-%m-%d")
    except ValueError as exc:
        raise argparse.ArgumentTypeError(
            f"invalid date {value!r}: expected YYYY-MM-DD"
        ) from exc
    return parsed.date().isoformat()


def _non_negative_seconds(value):
    """argparse ``type=`` callable: parse a finite, non-negative number of seconds.

    ``time.sleep`` raises on negative, NaN or infinite values, so such input is
    rejected at parse time instead of failing part-way through a run.
    """
    try:
        seconds = float(value)
    except ValueError as exc:
        raise argparse.ArgumentTypeError(f"invalid delay {value!r}: not a number") from exc
    # Written as a chained comparison so NaN (which compares False) is rejected too.
    if not (0 <= seconds < float("inf")):
        raise argparse.ArgumentTypeError(
            f"invalid delay {value!r}: must be a finite number >= 0"
        )
    return seconds


def parse_args(argv=None):
    """Parse the command-line options of the backfill script.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        argparse.Namespace with:
            start (str | None): inclusive lower bound, normalised to YYYY-MM-DD.
            end (str | None): inclusive upper bound, normalised to YYYY-MM-DD.
            dry_run (bool): only report what would be done.
            delay (float): seconds to wait between date fetches (>= 0).
            overwrite (bool): re-fetch dates whose NPZ already exists.

    Raises:
        SystemExit: raised by argparse on invalid input (exit status 2) or ``--help``.
    """
    p = argparse.ArgumentParser(description="Backfill ExF NPZ files for klines dates")
    p.add_argument("--start", default=None, type=_iso_date,
                   help="Start date YYYY-MM-DD (inclusive)")
    p.add_argument("--end", default=None, type=_iso_date,
                   help="End date YYYY-MM-DD (inclusive)")
    p.add_argument("--dry-run", action="store_true",
                   help="Print what would be done, skip writes")
    p.add_argument("--delay", type=_non_negative_seconds, default=1.0,
                   help="Seconds between date fetches (default 1.0)")
    p.add_argument("--overwrite", action="store_true",
                   help="Re-fetch and overwrite existing NPZ files")
    return p.parse_args(argv)
|
||
|
|
|
||
|
|
|
||
|
|
def _write_npz_atomic(out_npz, **arrays):
    """Write ``arrays`` to ``out_npz`` as a compressed NPZ, never leaving a partial file.

    The data is first written to ``<name>.tmp`` in the same directory and then
    moved over the final path in a single replace. The backfill skips dates
    whose NPZ already exists, so a truncated file left by an interrupted write
    would otherwise never be repaired.

    Args:
        out_npz: pathlib.Path of the final ``.npz`` file; its directory must exist.
        **arrays: arrays to store, saved under their keyword names.

    Raises:
        Whatever the write or the replace raises; the temp file is removed first.
    """
    tmp_path = out_npz.with_name(out_npz.name + ".tmp")
    try:
        # An open file object is passed (not a path) so NumPy does not append
        # ".npz" to the temporary name.
        with open(tmp_path, "wb") as fh:
            np.savez_compressed(fh, **arrays)
        tmp_path.replace(out_npz)
    except BaseException:
        # Also covers Ctrl-C: drop the partial temp file, then re-raise unchanged.
        try:
            tmp_path.unlink()
        except OSError:
            pass  # temp file was never created or is already gone
        raise


def main():
    """Backfill one ExF indicators NPZ per klines date.

    Steps:
      1. Parse CLI options and list the ``*.parquet`` files in KLINES_DIR
         (paths containing 'catalog' are ignored); each file stem is a date.
      2. Optionally narrow the dates with --start / --end (string comparison).
      3. For every date: skip it if the NPZ exists (unless --overwrite), fetch
         the external factors for 12:00 UTC of that date, and write
         EIGENVALUES_PATH/<date>/NPZ_FILENAME with the arrays ``api_names``,
         ``api_indicators`` and ``api_success``.
      4. Sleep --delay seconds after each fetch attempt and print a summary.

    Fetch failures and unparsable dates are counted and reported; they do not
    stop the run. A failure while writing the NPZ propagates to the caller.

    Returns:
        None. Results are files on disk and progress lines on stdout.
    """
    args = parse_args()

    # Import ExF infrastructure (done here, after the module-level sys.path.insert has run)
    from external_factors_matrix import ExternalFactorsFetcher, Config, INDICATORS, N_INDICATORS

    # Build ordered name list (matches matrix index: names[i] = INDICATORS[i].name)
    ind_names = np.array([ind.name for ind in INDICATORS], dtype=object)

    fetcher = ExternalFactorsFetcher(Config())

    # Enumerate klines dates
    parquet_files = sorted(KLINES_DIR.glob("*.parquet"))
    parquet_files = [p for p in parquet_files if 'catalog' not in str(p)]
    date_strings = [p.stem for p in parquet_files]

    # Filter by --start / --end
    if args.start:
        date_strings = [d for d in date_strings if d >= args.start]
    if args.end:
        date_strings = [d for d in date_strings if d <= args.end]

    total = len(date_strings)
    print(f"Klines dates to process: {total}")
    print(f"EIGENVALUES_PATH: {EIGENVALUES_PATH}")
    print(f"Dry run: {args.dry_run} Overwrite: {args.overwrite} Delay: {args.delay}s\n")

    if args.dry_run:
        print("DRY RUN — no files will be written.\n")

    skipped = 0
    written = 0
    errors = 0
    t0 = time.time()

    for i, ds in enumerate(date_strings):
        out_dir = EIGENVALUES_PATH / ds
        out_npz = out_dir / NPZ_FILENAME

        # Skip if exists and not overwriting
        if out_npz.exists() and not args.overwrite:
            skipped += 1
            continue

        # Fetch at noon UTC for this date
        try:
            yr, mo, dy = int(ds[:4]), int(ds[5:7]), int(ds[8:10])
            target_dt = datetime(yr, mo, dy, 12, 0, 0, tzinfo=timezone.utc)
        except ValueError:
            print(f" [{i+1}/{total}] {ds}: BAD DATE FORMAT — skip")
            errors += 1
            continue

        if args.dry_run:
            print(f" [{i+1}/{total}] {ds}: would fetch {target_dt.isoformat()} → {out_npz}")
            written += 1
            continue

        try:
            result = fetcher.fetch_sync(target_date=target_dt)
        except Exception as e:
            # Best effort: one failed date must not abort a multi-hour run.
            print(f" [{i+1}/{total}] {ds}: FETCH ERROR — {e}")
            errors += 1
            time.sleep(args.delay)
            continue

        # Build NPZ arrays in ACBv6-compatible format
        matrix = result['matrix']    # np.float64 array, 0-indexed (matrix[id-1])
        details = result['details']  # {id: {'name': ..., 'value': ..., 'success': bool}}

        api_indicators = matrix.astype(np.float64)
        # Indicator ids are 1-based; an id missing from details counts as failed.
        api_success = np.array(
            [details.get(idx + 1, {}).get('success', False) for idx in range(N_INDICATORS)],
            dtype=np.bool_
        )
        success_count = result.get('success_count', int(api_success.sum()))

        # Write NPZ (temp file + replace, so an interrupted run leaves no partial NPZ)
        out_dir.mkdir(parents=True, exist_ok=True)
        _write_npz_atomic(
            out_npz,
            api_names=ind_names,
            api_indicators=api_indicators,
            api_success=api_success,
        )
        written += 1

        # Progress every 10 dates
        if (i + 1) % 10 == 0:
            elapsed = time.time() - t0
            rate = written / elapsed if elapsed > 0 else 1
            eta = (total - i - 1) / rate if rate > 0 else 0
            print(f" [{i+1}/{total}] {ds} ok={success_count}/{N_INDICATORS}"
                  f" elapsed={elapsed/60:.1f}m eta={eta/60:.1f}m"
                  f" written={written} skipped={skipped} errors={errors}")
        else:
            # Brief per-date confirmation
            key_vals = {
                'funding': round(float(api_indicators[0]), 6),  # id=1 → idx 0
                'dvol': round(float(api_indicators[10]), 2),    # id=11 → idx 10
            }
            print(f" {ds} ok={success_count} funding={key_vals['funding']:+.4f} dvol={key_vals['dvol']:.1f}")

        time.sleep(args.delay)

    elapsed_total = time.time() - t0
    print(f"\n{'='*60}")
    print(f" ExF Klines Backfill COMPLETE")
    print(f" Written: {written}")
    print(f" Skipped: {skipped} (already existed)")
    print(f" Errors: {errors}")
    print(f" Runtime: {elapsed_total/60:.1f}m")
    print(f"{'='*60}")

    if written > 0 and not args.dry_run:
        print(f"\n ACBv6 will now find ExF data for klines dates.")
        print(f" Re-run test_pf_5y_klines.py to get the full-boost ACBv6 results.")
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Run the backfill only when executed as a script, not when imported.
    main()
|