# DOLPHIN/external_factors/backfill_klines_exf.py

"""DOLPHIN ExF Backfill for Klines Dates
=========================================
Writes ExF Indicators NPZ files for all 1,710 klines parquet dates so that
ACBv6 can read funding_btc, dvol_btc, fng, and taker for those dates.

Problem:
  backfill_runner.py reads NG3 JSON scan directories to get timestamps.
  Klines dates (2021-2026) have no NG3 JSON scans → ACBv6 _load_external_factors()
  returns neutral defaults → boost=1.0 always → inverse-boost component is dead.

Solution:
  For each klines date, call ExternalFactorsFetcher.fetch_sync(target_date=noon_UTC)
  and write a minimal NPZ to EIGENVALUES_PATH/YYYY-MM-DD/scan_000001__Indicators.npz
  in the exact format ACBv6 expects: api_names + api_indicators + api_success.

Output format (ACBv6 compatible):
  data['api_names']      : np.array of indicator name strings (N_INDICATORS)
  data['api_indicators'] : np.float64 array of values (N_INDICATORS)
  data['api_success']    : np.bool_ array (N_INDICATORS)

Idempotent: skips dates where the NPZ already exists.
Rate-limited: configurable delay between dates (default 1.0s).

Usage:
  cd "C:\\Users\\Lenovo\\Documents\\- DOLPHIN NG HD HCM TSF Predict\\external_factors"
  "C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py
  "C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py --dry-run
  "C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py --start 2022-01-01 --end 2022-12-31

Expected runtime: 2-5 hours for all 1710 dates (network-dependent).
Most of the value (funding_btc, dvol_btc, fng, taker) comes from a few API calls
per date. CURRENT-only indicators will fail gracefully (api_success=False, value=0).
"""
import sys, time, argparse, asyncio
# Re-encode stdout as UTF-8 (substituting anything that still cannot be
# encoded) so the progress lines printed by main(), which contain characters
# such as '→' and '—', do not raise UnicodeEncodeError on consoles whose
# default encoding is narrower.
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
from pathlib import Path
from datetime import datetime, timezone

import numpy as np

# -- Paths --
# NOTE: sys is already imported above; the _sys alias refers to the same module.
import sys as _sys
# Base directory: two levels above this file on Windows, a fixed mount point
# on every other platform.
HCM_DIR          = Path(__file__).parent.parent if _sys.platform == 'win32' else Path('/mnt/dolphin')
# Directory that main() globs for the per-date *.parquet klines files.
KLINES_DIR       = HCM_DIR / "vbt_cache_klines"
# Output root: main() writes EIGENVALUES_PATH/<date>/NPZ_FILENAME for each date.
EIGENVALUES_PATH = (Path(r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues")
                    if _sys.platform == 'win32' else Path('/mnt/ng6_data/eigenvalues'))
NPZ_FILENAME  = "scan_000001__Indicators.npz"   # single synthetic scan per date

# Put this script's own directory first on the import path so that main() can
# import external_factors_matrix (presumably a sibling module — confirm).
sys.path.insert(0, str(Path(__file__).parent))

def parse_args(argv=None):
    """Parse the command-line options of the klines ExF backfill.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, exactly as before this parameter existed.

    Returns:
        argparse.Namespace with:
          start / end : str or None, inclusive YYYY-MM-DD bounds
          dry_run     : bool, report only and write nothing
          delay       : float, seconds to pause between date fetches
          overwrite   : bool, re-fetch dates whose NPZ already exists

    Raises:
        SystemExit: (status 2, via argparse) on unknown options, a
            non-numeric ``--delay``, or a negative / non-finite ``--delay``.
    """
    p = argparse.ArgumentParser(description="Backfill ExF NPZ files for klines dates")
    p.add_argument("--start",    default=None,  help="Start date YYYY-MM-DD (inclusive)")
    p.add_argument("--end",      default=None,  help="End date YYYY-MM-DD (inclusive)")
    p.add_argument("--dry-run",  action="store_true", help="Print what would be done, skip writes")
    p.add_argument("--delay",    type=float, default=1.0, help="Seconds between date fetches (default 1.0)")
    p.add_argument("--overwrite",action="store_true", help="Re-fetch and overwrite existing NPZ files")
    args = p.parse_args(argv)

    # The delay is handed straight to time.sleep(), which raises on negative
    # (and NaN / infinite) values. Reject those here with a clear usage error
    # instead of crashing after the first date has already been fetched.
    if not 0 <= args.delay < float("inf"):
        p.error("--delay must be a finite, non-negative number of seconds")
    return args


def _save_npz_atomically(out_npz, arrays):
    """Write ``arrays`` to ``out_npz`` without ever exposing a partial file.

    The archive is first written to a sibling ``<name>.tmp`` file and then
    renamed over the final path. An interrupted run therefore cannot leave a
    truncated NPZ under the final name, which the "skip if exists" check in
    ``main`` would otherwise treat as finished on every later run.

    Args:
        out_npz: pathlib.Path of the final ``.npz`` file; its parent
            directory must already exist.
        arrays: Mapping of array name -> array, stored under those keys.

    Raises:
        Any exception from opening, writing or renaming the file; the
        temporary file is removed before the exception is re-raised.
    """
    tmp_path = out_npz.with_name(out_npz.name + ".tmp")
    try:
        # Passing a file object (rather than a path) keeps numpy from
        # appending ".npz" to the temporary name.
        with open(tmp_path, "wb") as fh:
            np.savez_compressed(fh, **arrays)
        tmp_path.replace(out_npz)
    except BaseException:
        if tmp_path.exists():
            tmp_path.unlink()
        raise


def main():
    """Backfill one ExF Indicators NPZ per klines parquet date.

    For every ``*.parquet`` file in KLINES_DIR (optionally restricted by
    ``--start`` / ``--end``), fetch the external factors at 12:00 UTC of the
    date given by the file stem and write
    ``EIGENVALUES_PATH/<date>/NPZ_FILENAME`` containing ``api_names``,
    ``api_indicators`` and ``api_success``.

    Dates whose NPZ already exists are skipped unless ``--overwrite`` is
    given. With ``--dry-run`` nothing is fetched or written. Fetch failures
    and malformed date stems are counted and reported, not raised. A summary
    is printed at the end.
    """
    args = parse_args()

    # Import ExF infrastructure
    from external_factors_matrix import ExternalFactorsFetcher, Config, INDICATORS, N_INDICATORS

    # Build ordered name list (matches matrix index: names[i] = INDICATORS[i].name)
    # NOTE(review): dtype=object means this array is pickled inside the NPZ, so
    # readers need np.load(..., allow_pickle=True) — confirm ACBv6 loads it so.
    ind_names = np.array([ind.name for ind in INDICATORS], dtype=object)

    fetcher = ExternalFactorsFetcher(Config())

    # Enumerate klines dates. The 'catalog' exclusion looks at the file name
    # only: testing the whole path would drop every file as soon as any parent
    # directory happened to contain "catalog".
    parquet_files = sorted(KLINES_DIR.glob("*.parquet"))
    parquet_files = [p for p in parquet_files if 'catalog' not in p.name]
    date_strings  = [p.stem for p in parquet_files]

    # Filter by --start / --end (lexicographic compare, valid for YYYY-MM-DD)
    if args.start:
        date_strings = [d for d in date_strings if d >= args.start]
    if args.end:
        date_strings = [d for d in date_strings if d <= args.end]

    total = len(date_strings)
    print(f"Klines dates to process: {total}")
    print(f"EIGENVALUES_PATH: {EIGENVALUES_PATH}")
    print(f"Dry run: {args.dry_run}  Overwrite: {args.overwrite}  Delay: {args.delay}s\n")

    if args.dry_run:
        print("DRY RUN — no files will be written.\n")

    skipped = 0
    written = 0
    errors  = 0
    t0 = time.time()

    for i, ds in enumerate(date_strings):
        out_dir = EIGENVALUES_PATH / ds
        out_npz = out_dir / NPZ_FILENAME

        # Skip if exists and not overwriting
        if out_npz.exists() and not args.overwrite:
            skipped += 1
            continue

        # Fetch at noon UTC for this date. strptime validates the whole stem,
        # so names with trailing junk are reported instead of silently used.
        try:
            target_dt = datetime.strptime(ds, "%Y-%m-%d").replace(
                hour=12, tzinfo=timezone.utc
            )
        except ValueError:
            print(f"  [{i+1}/{total}] {ds}: BAD DATE FORMAT — skip")
            errors += 1
            continue

        if args.dry_run:
            print(f"  [{i+1}/{total}] {ds}: would fetch {target_dt.isoformat()} → {out_npz}")
            written += 1
            continue

        # Broad catch is deliberate: one failing date must not abort a
        # multi-hour backfill; the failure is reported and counted.
        try:
            result = fetcher.fetch_sync(target_date=target_dt)
        except Exception as e:
            print(f"  [{i+1}/{total}] {ds}: FETCH ERROR — {e}")
            errors += 1
            time.sleep(args.delay)
            continue

        # Build NPZ arrays in ACBv6-compatible format
        matrix  = result['matrix']       # np.float64 array, 0-indexed (matrix[id-1])
        details = result['details']      # {id: {'name': ..., 'value': ..., 'success': bool}}

        api_indicators = matrix.astype(np.float64)
        # Indicator ids are 1-based; an id missing from details counts as failed.
        api_success    = np.array(
            [details.get(idx + 1, {}).get('success', False) for idx in range(N_INDICATORS)],
            dtype=np.bool_
        )
        success_count  = result.get('success_count', int(api_success.sum()))

        # Write NPZ (temp file + rename, so a crash never leaves a partial file)
        out_dir.mkdir(parents=True, exist_ok=True)
        _save_npz_atomically(out_npz, {
            'api_names':      ind_names,
            'api_indicators': api_indicators,
            'api_success':    api_success,
        })
        written += 1

        # Progress every 10 dates
        if (i + 1) % 10 == 0:
            elapsed = time.time() - t0
            rate    = written / elapsed if elapsed > 0 else 1
            eta     = (total - i - 1) / rate if rate > 0 else 0
            print(f"  [{i+1}/{total}] {ds}  ok={success_count}/{N_INDICATORS}"
                  f"  elapsed={elapsed/60:.1f}m  eta={eta/60:.1f}m"
                  f"  written={written}  skipped={skipped}  errors={errors}")
        else:
            # Brief per-date confirmation
            key_vals = {
                'funding': round(float(api_indicators[0]),  6),   # id=1 → idx 0
                'dvol':    round(float(api_indicators[10]), 2),   # id=11 → idx 10
            }
            print(f"  {ds}  ok={success_count}  funding={key_vals['funding']:+.4f}  dvol={key_vals['dvol']:.1f}")

        # Rate limit between dates
        time.sleep(args.delay)

    elapsed_total = time.time() - t0
    print(f"\n{'='*60}")
    print(f"  ExF Klines Backfill COMPLETE")
    print(f"  Written:  {written}")
    print(f"  Skipped:  {skipped} (already existed)")
    print(f"  Errors:   {errors}")
    print(f"  Runtime:  {elapsed_total/60:.1f}m")
    print(f"{'='*60}")

    if written > 0 and not args.dry_run:
        print(f"\n  ACBv6 will now find ExF data for klines dates.")
        print(f"  Re-run test_pf_5y_klines.py to get the full-boost ACBv6 results.")

# Run the backfill only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore. 2026-04-21 16:58:38 +02:00			`"""DOLPHIN ExF Backfill for Klines Dates`
			`=========================================`
			`Writes ExF Indicators NPZ files for all 1,710 klines parquet dates so that`
			`ACBv6 can read funding_btc, dvol_btc, fng, and taker for those dates.`

			`Problem:`
			`backfill_runner.py reads NG3 JSON scan directories to get timestamps.`
			`Klines dates (2021-2026) have no NG3 JSON scans → ACBv6 _load_external_factors()`
			`returns neutral defaults → boost=1.0 always → inverse-boost component is dead.`

			`Solution:`
			`For each klines date, call ExternalFactorsFetcher.fetch_sync(target_date=noon_UTC)`
			`and write a minimal NPZ to EIGENVALUES_PATH/YYYY-MM-DD/scan_000001__Indicators.npz`
			`in the exact format ACBv6 expects: api_names + api_indicators + api_success.`

			`Output format (ACBv6 compatible):`
			`data['api_names'] : np.array of indicator name strings (N_INDICATORS)`
			`data['api_indicators'] : np.float64 array of values (N_INDICATORS)`
			`data['api_success'] : np.bool_ array (N_INDICATORS)`

			`Idempotent: skips dates where the NPZ already exists.`
			`Rate-limited: configurable delay between dates (default 1.0s).`

			`Usage:`
			`cd "C:\\Users\\Lenovo\\Documents\\- DOLPHIN NG HD HCM TSF Predict\\external_factors"`
			`"C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py`
			`"C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py --dry-run`
			`"C:\\Users\\Lenovo\\Documents\\- Siloqy\\Scripts\\python.exe" backfill_klines_exf.py --start 2022-01-01 --end 2022-12-31`

			`Expected runtime: 2-5 hours for all 1710 dates (network-dependent).`
			`Most of the value (funding_btc, dvol_btc, fng, taker) comes from a few API calls`
			`per date. CURRENT-only indicators will fail gracefully (api_success=False, value=0).`
			`"""`
			`import sys, time, argparse, asyncio`
			`sys.stdout.reconfigure(encoding='utf-8', errors='replace')`
			`from pathlib import Path`
			`from datetime import datetime, timezone`

			`import numpy as np`

			`# -- Paths --`
			`import sys as _sys`
			`HCM_DIR = Path(__file__).parent.parent if _sys.platform == 'win32' else Path('/mnt/dolphin')`
			`KLINES_DIR = HCM_DIR / "vbt_cache_klines"`
			`EIGENVALUES_PATH = (Path(r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues")`
			`if _sys.platform == 'win32' else Path('/mnt/ng6_data/eigenvalues'))`
			`NPZ_FILENAME = "scan_000001__Indicators.npz" # single synthetic scan per date`

			`sys.path.insert(0, str(Path(__file__).parent))`

			`def parse_args():`
			`p = argparse.ArgumentParser(description="Backfill ExF NPZ files for klines dates")`
			`p.add_argument("--start", default=None, help="Start date YYYY-MM-DD (inclusive)")`
			`p.add_argument("--end", default=None, help="End date YYYY-MM-DD (inclusive)")`
			`p.add_argument("--dry-run", action="store_true", help="Print what would be done, skip writes")`
			`p.add_argument("--delay", type=float, default=1.0, help="Seconds between date fetches (default 1.0)")`
			`p.add_argument("--overwrite",action="store_true", help="Re-fetch and overwrite existing NPZ files")`
			`return p.parse_args()`


			`def main():`
			`args = parse_args()`

			`# Import ExF infrastructure`
			`from external_factors_matrix import ExternalFactorsFetcher, Config, INDICATORS, N_INDICATORS`

			`# Build ordered name list (matches matrix index: names[i] = INDICATORS[i].name)`
			`ind_names = np.array([ind.name for ind in INDICATORS], dtype=object)`

			`fetcher = ExternalFactorsFetcher(Config())`

			`# Enumerate klines dates`
			`parquet_files = sorted(KLINES_DIR.glob("*.parquet"))`
			`parquet_files = [p for p in parquet_files if 'catalog' not in str(p)]`
			`date_strings = [p.stem for p in parquet_files]`

			`# Filter by --start / --end`
			`if args.start:`
			`date_strings = [d for d in date_strings if d >= args.start]`
			`if args.end:`
			`date_strings = [d for d in date_strings if d <= args.end]`

			`total = len(date_strings)`
			`print(f"Klines dates to process: {total}")`
			`print(f"EIGENVALUES_PATH: {EIGENVALUES_PATH}")`
			`print(f"Dry run: {args.dry_run} Overwrite: {args.overwrite} Delay: {args.delay}s\n")`

			`if args.dry_run:`
			`print("DRY RUN — no files will be written.\n")`

			`skipped = 0`
			`written = 0`
			`errors = 0`
			`t0 = time.time()`

			`for i, ds in enumerate(date_strings):`
			`out_dir = EIGENVALUES_PATH / ds`
			`out_npz = out_dir / NPZ_FILENAME`

			`# Skip if exists and not overwriting`
			`if out_npz.exists() and not args.overwrite:`
			`skipped += 1`
			`continue`

			`# Fetch at noon UTC for this date`
			`try:`
			`yr, mo, dy = int(ds[:4]), int(ds[5:7]), int(ds[8:10])`
			`target_dt = datetime(yr, mo, dy, 12, 0, 0, tzinfo=timezone.utc)`
			`except ValueError:`
			`print(f" [{i+1}/{total}] {ds}: BAD DATE FORMAT — skip")`
			`errors += 1`
			`continue`

			`if args.dry_run:`
			`print(f" [{i+1}/{total}] {ds}: would fetch {target_dt.isoformat()} → {out_npz}")`
			`written += 1`
			`continue`

			`try:`
			`result = fetcher.fetch_sync(target_date=target_dt)`
			`except Exception as e:`
			`print(f" [{i+1}/{total}] {ds}: FETCH ERROR — {e}")`
			`errors += 1`
			`time.sleep(args.delay)`
			`continue`

			`# Build NPZ arrays in ACBv6-compatible format`
			`matrix = result['matrix'] # np.float64 array, 0-indexed (matrix[id-1])`
			`details = result['details'] # {id: {'name': ..., 'value': ..., 'success': bool}}`

			`api_indicators = matrix.astype(np.float64)`
			`api_success = np.array(`
			`[details.get(i+1, {}).get('success', False) for i in range(N_INDICATORS)],`
			`dtype=np.bool_`
			`)`
			`success_count = result.get('success_count', int(api_success.sum()))`

			`# Write NPZ`
			`out_dir.mkdir(parents=True, exist_ok=True)`
			`np.savez_compressed(`
			`str(out_npz),`
			`api_names = ind_names,`
			`api_indicators = api_indicators,`
			`api_success = api_success,`
			`)`
			`written += 1`

			`# Progress every 10 dates`
			`if (i + 1) % 10 == 0:`
			`elapsed = time.time() - t0`
			`rate = written / elapsed if elapsed > 0 else 1`
			`eta = (total - i - 1) / rate if rate > 0 else 0`
			`print(f" [{i+1}/{total}] {ds} ok={success_count}/{N_INDICATORS}"`
			`f" elapsed={elapsed/60:.1f}m eta={eta/60:.1f}m"`
			`f" written={written} skipped={skipped} errors={errors}")`
			`else:`
			`# Brief per-date confirmation`
			`key_vals = {`
			`'funding': round(float(api_indicators[0]), 6), # id=1 → idx 0`
			`'dvol': round(float(api_indicators[10]), 2), # id=11 → idx 10`
			`}`
			`print(f" {ds} ok={success_count} funding={key_vals['funding']:+.4f} dvol={key_vals['dvol']:.1f}")`

			`time.sleep(args.delay)`

			`elapsed_total = time.time() - t0`
			`print(f"\n{'='*60}")`
			`print(f" ExF Klines Backfill COMPLETE")`
			`print(f" Written: {written}")`
			`print(f" Skipped: {skipped} (already existed)")`
			`print(f" Errors: {errors}")`
			`print(f" Runtime: {elapsed_total/60:.1f}m")`
			`print(f"{'='*60}")`

			`if written > 0 and not args.dry_run:`
			`print(f"\n ACBv6 will now find ExF data for klines dates.")`
			`print(f" Re-run test_pf_5y_klines.py to get the full-boost ACBv6 results.")`


			`if __name__ == "__main__":`
			`main()`