Files
DOLPHIN/external_factors/backfill_klines_exf.py

182 lines
7.5 KiB
Python
Raw Normal View History

"""DOLPHIN ExF Backfill for Klines Dates
=========================================
Writes ExF Indicators NPZ files for all 1,710 klines parquet dates so that
ACBv6 can read funding_btc, dvol_btc, fng, and taker for those dates.
Problem:
backfill_runner.py reads NG3 JSON scan directories to get timestamps.
Klines dates (2021-2026) have no NG3 JSON scans, so ACBv6 _load_external_factors()
returns neutral defaults: boost=1.0 always, i.e. the inverse-boost component is dead.
Solution:
For each klines date, call ExternalFactorsFetcher.fetch_sync(target_date=noon_UTC)
and write a minimal NPZ to EIGENVALUES_PATH/YYYY-MM-DD/scan_000001__Indicators.npz
in the exact format ACBv6 expects: api_names + api_indicators + api_success.
Output format (ACBv6 compatible):
data['api_names'] : np.array of indicator name strings (N_INDICATORS)
data['api_indicators'] : np.float64 array of values (N_INDICATORS)
data['api_success'] : np.bool_ array (N_INDICATORS)
Idempotent: skips dates where the NPZ already exists.
Rate-limited: configurable delay between dates (default 1.0s).
Usage:
cd "C:\\Users\\Lenovo\\Documents\\- DOLPHIN NG HD HCM TSF Predict\\external_factors"
"C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py
"C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py --dry-run
"C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py --start 2022-01-01 --end 2022-12-31
Expected runtime: 2-5 hours for all 1710 dates (network-dependent).
Most of the value (funding_btc, dvol_btc, fng, taker) comes from a few API calls
per date. CURRENT-only indicators will fail gracefully (api_success=False, value=0).
"""
import sys, time, argparse, asyncio
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
from pathlib import Path
from datetime import datetime, timezone
import numpy as np
# -- Paths --
import sys as _sys
# Root locations differ per platform: on Windows the project root is derived
# from this script's location and the eigenvalues tree is a fixed user path;
# everywhere else fixed /mnt mount points are used.
if _sys.platform == 'win32':
    HCM_DIR = Path(__file__).parent.parent
    EIGENVALUES_PATH = Path(r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues")
else:
    HCM_DIR = Path('/mnt/dolphin')
    EIGENVALUES_PATH = Path('/mnt/ng6_data/eigenvalues')

# Directory globbed for the per-date klines parquet files.
KLINES_DIR = HCM_DIR / "vbt_cache_klines"

# File name written inside each per-date output directory.
NPZ_FILENAME = "scan_000001__Indicators.npz"  # single synthetic scan per date

# Put this script's own directory first on the import path.
sys.path.insert(0, str(Path(__file__).parent))
def parse_args(argv=None):
    """Parse the command-line options of the backfill script.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the script entry
            point relies on.

    Returns:
        argparse.Namespace with ``start``, ``end`` (str or None), ``dry_run``,
        ``overwrite`` (bool) and ``delay`` (float, seconds).

    Raises:
        SystemExit: on unknown/invalid options, or when ``--delay`` is
            negative or not finite.
    """
    p = argparse.ArgumentParser(description="Backfill ExF NPZ files for klines dates")
    p.add_argument("--start", default=None, help="Start date YYYY-MM-DD (inclusive)")
    p.add_argument("--end", default=None, help="End date YYYY-MM-DD (inclusive)")
    p.add_argument("--dry-run", action="store_true", help="Print what would be done, skip writes")
    p.add_argument("--delay", type=float, default=1.0, help="Seconds between date fetches (default 1.0)")
    p.add_argument("--overwrite", action="store_true", help="Re-fetch and overwrite existing NPZ files")
    args = p.parse_args(argv)

    # The delay is handed to time.sleep(), which raises on negative / NaN / inf
    # values. Reject them up front instead of crashing part-way through a run.
    # The chained comparison is False for NaN, so NaN is rejected as well.
    if not (0 <= args.delay < float("inf")):
        p.error("--delay must be a finite, non-negative number of seconds")
    return args
def _save_npz_atomic(out_npz, **arrays):
    """Write *arrays* to *out_npz* as a compressed NPZ, never leaving a partial file there.

    The archive is first written to a sibling temp file and then moved over the
    final name with os.replace(). An interrupted run therefore cannot leave a
    truncated NPZ that the "skip if exists" check would later mistake for a
    finished date.

    Args:
        out_npz: pathlib.Path of the final NPZ file; its directory must exist.
        **arrays: name -> array pairs, forwarded to np.savez_compressed.

    Raises:
        Whatever the write or rename raises; the temp file is removed first.
    """
    import os  # local import so the file's top-level import block stays untouched

    tmp_path = out_npz.with_name(out_npz.name + ".tmp")
    try:
        # An open file object (rather than a path) keeps NumPy from appending
        # ".npz" to the temp name.
        with open(tmp_path, "wb") as fh:
            np.savez_compressed(fh, **arrays)
        os.replace(tmp_path, out_npz)
    except BaseException:
        # BaseException so that Ctrl-C during a long run also cleans up.
        try:
            tmp_path.unlink()
        except OSError:
            pass  # best-effort cleanup; the original error below is what matters
        raise


def main():
    """Backfill one ExF indicators NPZ per klines date.

    Steps:
      1. Parse CLI options and list the ``*.parquet`` files in KLINES_DIR
         (files whose name contains "catalog" are ignored); each file stem is
         taken as a ``YYYY-MM-DD`` date string.
      2. Optionally narrow the dates with --start / --end (string comparison).
      3. For every remaining date, unless the NPZ already exists and
         --overwrite is off, call ``fetcher.fetch_sync`` for 12:00 UTC of that
         date and write ``api_names`` / ``api_indicators`` / ``api_success``
         to ``EIGENVALUES_PATH/<date>/NPZ_FILENAME``.
      4. Print per-date progress and a final summary.

    Fetch failures and unparsable date stems are counted and reported but do
    not stop the run. Returns None.
    """
    args = parse_args()

    # Import ExF infrastructure (imported inside main rather than at module top;
    # presumably it lives next to this script, whose directory is put on
    # sys.path at module level).
    from external_factors_matrix import ExternalFactorsFetcher, Config, INDICATORS, N_INDICATORS

    # Build ordered name list (matches matrix index: names[i] = INDICATORS[i].name)
    ind_names = np.array([ind.name for ind in INDICATORS], dtype=object)
    fetcher = ExternalFactorsFetcher(Config())

    # Enumerate klines dates. The "catalog" filter looks at the file name only,
    # so a parent directory that happens to contain "catalog" cannot drop
    # every file.
    parquet_files = sorted(KLINES_DIR.glob("*.parquet"))
    parquet_files = [p for p in parquet_files if 'catalog' not in p.name]
    date_strings = [p.stem for p in parquet_files]

    # Filter by --start / --end (lexicographic compare works for YYYY-MM-DD)
    if args.start:
        date_strings = [d for d in date_strings if d >= args.start]
    if args.end:
        date_strings = [d for d in date_strings if d <= args.end]

    total = len(date_strings)
    print(f"Klines dates to process: {total}")
    print(f"EIGENVALUES_PATH: {EIGENVALUES_PATH}")
    print(f"Dry run: {args.dry_run} Overwrite: {args.overwrite} Delay: {args.delay}s\n")
    if args.dry_run:
        print("DRY RUN — no files will be written.\n")

    skipped = 0
    written = 0  # in dry-run mode this counts the files that *would* be written
    errors = 0
    t0 = time.time()

    for pos, ds in enumerate(date_strings, start=1):
        out_dir = EIGENVALUES_PATH / ds
        out_npz = out_dir / NPZ_FILENAME

        # Skip if exists and not overwriting
        if out_npz.exists() and not args.overwrite:
            skipped += 1
            continue

        # Fetch at noon UTC for this date
        try:
            yr, mo, dy = int(ds[:4]), int(ds[5:7]), int(ds[8:10])
            target_dt = datetime(yr, mo, dy, 12, 0, 0, tzinfo=timezone.utc)
        except ValueError:
            print(f" [{pos}/{total}] {ds}: BAD DATE FORMAT — skip")
            errors += 1
            continue

        if args.dry_run:
            print(f" [{pos}/{total}] {ds}: would fetch {target_dt.isoformat()} -> {out_npz}")
            written += 1
            continue

        try:
            result = fetcher.fetch_sync(target_date=target_dt)
        except Exception as e:
            # Top-level boundary: report, count, keep the rate limit, move on.
            print(f" [{pos}/{total}] {ds}: FETCH ERROR — {e}")
            errors += 1
            time.sleep(args.delay)
            continue

        # Build NPZ arrays in ACBv6-compatible format
        matrix = result['matrix']    # np.float64 array, 0-indexed (matrix[id-1])
        details = result['details']  # {id: {'name': ..., 'value': ..., 'success': bool}}
        api_indicators = matrix.astype(np.float64)
        # Indicator ids are 1-based, array positions 0-based; ids missing from
        # `details` count as failed.
        api_success = np.array(
            [details.get(idx + 1, {}).get('success', False) for idx in range(N_INDICATORS)],
            dtype=np.bool_
        )
        success_count = result.get('success_count', int(api_success.sum()))

        # Write NPZ (atomically, so an interrupted run cannot leave a truncated
        # file that a later run would skip as "already exists")
        out_dir.mkdir(parents=True, exist_ok=True)
        _save_npz_atomic(
            out_npz,
            api_names=ind_names,
            api_indicators=api_indicators,
            api_success=api_success,
        )
        written += 1

        # Progress every 10 dates
        if pos % 10 == 0:
            elapsed = time.time() - t0
            rate = written / elapsed if elapsed > 0 else 1
            eta = (total - pos) / rate if rate > 0 else 0
            print(f" [{pos}/{total}] {ds} ok={success_count}/{N_INDICATORS}"
                  f" elapsed={elapsed/60:.1f}m eta={eta/60:.1f}m"
                  f" written={written} skipped={skipped} errors={errors}")
        else:
            # Brief per-date confirmation
            key_vals = {
                'funding': round(float(api_indicators[0]), 6),   # id=1 → idx 0
                'dvol': round(float(api_indicators[10]), 2),     # id=11 → idx 10
            }
            print(f" {ds} ok={success_count} funding={key_vals['funding']:+.4f} dvol={key_vals['dvol']:.1f}")

        # Rate limit between dates
        time.sleep(args.delay)

    elapsed_total = time.time() - t0
    print(f"\n{'='*60}")
    print(f" ExF Klines Backfill COMPLETE")
    print(f" Written: {written}")
    print(f" Skipped: {skipped} (already existed)")
    print(f" Errors: {errors}")
    print(f" Runtime: {elapsed_total/60:.1f}m")
    print(f"{'='*60}")
    if written > 0 and not args.dry_run:
        print(f"\n ACBv6 will now find ExF data for klines dates.")
        print(f" Re-run test_pf_5y_klines.py to get the full-boost ACBv6 results.")
# Script entry point: run the backfill only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()