initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
398
external_factors/backfill_patch_npz.py
Executable file
398
external_factors/backfill_patch_npz.py
Executable file
@@ -0,0 +1,398 @@
|
||||
"""ExF NPZ Patcher — Supplemental Historical Backfill
|
||||
======================================================
|
||||
The initial backfill got ~41/85 indicators. This script patches the existing
|
||||
NPZ files with real historical values for indicators that were failing:
|
||||
|
||||
Priority 1 — fng (Alternative.me): one API call returns 2000+ days. EASY.
|
||||
Priority 2 — oi_btc/eth, ls_btc/eth, ls_top, taker (Binance hist endpoints)
|
||||
Priority 3 — vix, sp500, gold, dxy, us10y, ycurve, fedfunds (FRED — needs key)
|
||||
Priority 4 — mvrv, nvt, addr_btc (CoinMetrics community API)
|
||||
|
||||
Strategy: load each NPZ, replace failing indicator values with fetched historical
|
||||
data, re-save. Idempotent — re-run any time.
|
||||
|
||||
Usage:
|
||||
python backfill_patch_npz.py # patch all dates
|
||||
python backfill_patch_npz.py --dry-run # show what would change
|
||||
python backfill_patch_npz.py --fred-key YOUR_KEY_HERE # enable FRED
|
||||
python backfill_patch_npz.py --skip-binance # skip Binance OI/LS/taker
|
||||
"""
|
||||
import sys, time, argparse, json

# Force UTF-8 stdout so the arrow/box-drawing characters in progress output
# don't crash on Windows cp1252 consoles; undecodable chars are replaced.
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
from pathlib import Path
from datetime import datetime, timezone, date, timedelta
import numpy as np

# `requests` is the only non-stdlib dependency; degrade gracefully when it is
# missing (main() aborts with a clear message instead of an ImportError).
try:
    import requests
    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False
    print("WARNING: requests not installed. Install with: pip install requests")
|
||||
|
||||
# NOTE: `sys` is already imported at the top of the file; the former
# redundant `import sys as _sys` re-import has been removed.
# Platform-dependent data roots: Windows dev box vs Linux deployment host.
EIGENVALUES_PATH = (Path(r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues")
                    if sys.platform == 'win32' else Path('/mnt/ng6_data/eigenvalues'))
# Kline parquet cache; main() treats file stems as per-date keys (YYYY-MM-DD
# assumed — the stems are compared lexically against --start/--end).
KLINES_DIR = (Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines")
              if sys.platform == 'win32' else Path('/mnt/dolphin/vbt_cache_klines'))
# Per-date NPZ file holding the api_names / api_indicators / api_success arrays.
NPZ_FILENAME = "scan_000001__Indicators.npz"
# Seconds before any single HTTP request is abandoned.
REQUEST_TIMEOUT = 20
|
||||
|
||||
def parse_args():
    """Build and parse the CLI options for the NPZ patcher."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--fred-key", default="", help="FRED API key (free: fred.stlouisfed.org)")
    parser.add_argument("--skip-binance", action="store_true")
    parser.add_argument("--skip-fred", action="store_true")
    parser.add_argument("--skip-fng", action="store_true")
    parser.add_argument("--start", default=None, help="Start date YYYY-MM-DD")
    parser.add_argument("--end", default=None, help="End date YYYY-MM-DD")
    return parser.parse_args()
|
||||
|
||||
# ── FNG (Alternative.me) — one call, all history ─────────────────────────────
|
||||
|
||||
def fetch_fng_history():
    """Fetch the full Fear & Greed index history from Alternative.me.

    One API call returns 2000+ days. Returns dict: date_str ('YYYY-MM-DD')
    -> fng_value (int); {} on any network or parse failure.
    """
    url = "https://api.alternative.me/fng/?limit=2000&format=json&date_format=us"
    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        data = r.json()
        result = {}
        for entry in data.get('data', []):
            # date_format=us should give MM/DD/YYYY, but try each shape the
            # API has been seen to return.  (The old unused `raw_date`
            # lookup was dead code and has been removed.)
            ts_str = str(entry.get('timestamp', ''))
            parsed = False
            for fmt in ('%m-%d-%Y', '%m/%d/%Y', '%Y-%m-%d'):
                try:
                    dt = datetime.strptime(ts_str, fmt)
                    key = dt.strftime('%Y-%m-%d')
                    result[key] = int(entry['value'])
                    parsed = True
                    break
                except ValueError:
                    pass
            if not parsed:
                # Fallback: a raw unix epoch timestamp.  Use the tz-aware
                # form; datetime.utcfromtimestamp is deprecated (3.12+) and
                # fromtimestamp(..., tz=utc) yields the same UTC date.
                try:
                    ts = int(ts_str)
                    dt = datetime.fromtimestamp(ts, tz=timezone.utc)
                    key = dt.strftime('%Y-%m-%d')
                    result[key] = int(entry['value'])
                except Exception:
                    pass
        return result
    except Exception as e:
        print(f" FNG fetch failed: {e}")
        return {}
|
||||
|
||||
# ── Binance historical OI / LS / taker ───────────────────────────────────────
|
||||
|
||||
def fetch_binance_hist(url_template, symbol, date_str):
    """Fetch one record from a Binance futures-data hist endpoint.

    Queries a one-hour window starting at noon UTC on *date_str*
    ('YYYY-MM-DD').  Returns the first record dict, or None when the
    endpoint has no data (HTTP 400 = date too old) or the request fails.
    """
    year = int(date_str[:4])
    month = int(date_str[5:7])
    day = int(date_str[8:10])
    window_start = datetime(year, month, day, 12, 0, 0, tzinfo=timezone.utc)
    start_ms = int(window_start.timestamp() * 1000)
    end_ms = start_ms + 3_600_000  # one-hour query window
    url = url_template.format(SYMBOL=symbol, start_ms=start_ms, end_ms=end_ms)
    try:
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
        if resp.status_code == 400:
            # Binance answers 400 when the date predates endpoint coverage.
            return None
        resp.raise_for_status()
        payload = resp.json()
        if isinstance(payload, list) and len(payload) > 0:
            return payload[0]
        return None
    except Exception:
        return None
|
||||
|
||||
# Binance futures "hist" data endpoint templates (1h period, limit=1 record).
# Placeholders {SYMBOL}, {start_ms}, {end_ms} are filled by fetch_binance_hist.
OI_URL = "https://fapi.binance.com/futures/data/openInterestHist?symbol={SYMBOL}&period=1h&startTime={start_ms}&endTime={end_ms}&limit=1"
# Global (all-account) long/short account ratio.
LS_URL = "https://fapi.binance.com/futures/data/globalLongShortAccountRatio?symbol={SYMBOL}&period=1h&startTime={start_ms}&endTime={end_ms}&limit=1"
# Top-trader long/short account ratio.
LS_TOP = "https://fapi.binance.com/futures/data/topLongShortAccountRatio?symbol={SYMBOL}&period=1h&startTime={start_ms}&endTime={end_ms}&limit=1"
# Taker buy/sell volume ratio.
TAKER_URL = "https://fapi.binance.com/futures/data/takerlongshortRatio?symbol={SYMBOL}&period=1h&startTime={start_ms}&endTime={end_ms}&limit=1"
|
||||
|
||||
def get_binance_indicators(date_str):
    """Return {indicator_name: value} for the Binance hist metrics.

    Each value is a float, or None when that endpoint/field could not be
    fetched for *date_str*.
    """
    specs = (
        ('oi_btc', OI_URL, 'BTCUSDT', 'sumOpenInterest'),
        ('oi_eth', OI_URL, 'ETHUSDT', 'sumOpenInterest'),
        ('ls_btc', LS_URL, 'BTCUSDT', 'longShortRatio'),
        ('ls_eth', LS_URL, 'ETHUSDT', 'longShortRatio'),
        ('ls_top', LS_TOP, 'BTCUSDT', 'longShortRatio'),
        ('taker', TAKER_URL, 'BTCUSDT', 'buySellRatio'),
    )
    out = {}
    for name, url, sym, field in specs:
        record = fetch_binance_hist(url, sym, date_str)
        value = None
        if record is not None and field in record:
            try:
                value = float(record[field])
            except (TypeError, ValueError):
                value = None
        out[name] = value
        time.sleep(0.05)  # light rate limiting between endpoint calls
    return out
|
||||
|
||||
# ── FRED ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
# NPZ indicator name -> FRED series ID (free API key: fred.stlouisfed.org).
FRED_SERIES = {
    'vix': 'VIXCLS',
    'sp500': 'SP500',
    'gold': 'GOLDAMGBD228NLBM',
    'dxy': 'DTWEXBGS',
    'us10y': 'DGS10',
    'us2y': 'DGS2',
    'ycurve': 'T10Y2Y',
    'fedfunds': 'DFF',
    'hy_spread': 'BAMLH0A0HYM2',
    'be5y': 'T5YIE',
    'm2': 'WM2NS',
}

# Memoized FRED fetches: at most one HTTP call per series per run.
_fred_cache = {} # series_id -> {date_str -> value}
|
||||
|
||||
def fetch_fred_series(series_id, fred_key, lookback_years=6):
    """Fetch one FRED series over the last *lookback_years* years, cached.

    Returns {date_str: float}; dates with a missing observation ('.', '',
    'nd') carry the last known value forward.  Results are memoized in
    _fred_cache; a failed fetch caches {} so the series is not retried.
    """
    cached = _fred_cache.get(series_id)
    if cached is not None:
        return cached
    start = (date.today() - timedelta(days=lookback_years * 366)).strftime('%Y-%m-%d')
    url = (f"https://api.stlouisfed.org/fred/series/observations"
           f"?series_id={series_id}&api_key={fred_key}&file_type=json"
           f"&observation_start={start}")
    try:
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        payload = resp.json()
        series = {}
        last_value = None
        for obs in payload.get('observations', []):
            raw = obs.get('value', '.')
            if raw not in ('.', '', 'nd'):
                try:
                    last_value = float(raw)
                except ValueError:
                    pass
            if last_value is not None:
                series[obs['date']] = last_value  # forward-fill
        _fred_cache[series_id] = series
        return series
    except Exception as e:
        print(f" FRED {series_id} failed: {e}")
        _fred_cache[series_id] = {}
        return {}
|
||||
|
||||
def get_fred_indicators(date_str, fred_key):
    """Return {indicator_name: value} for every FRED series on *date_str*.

    Uses the most recent observation on or before the date (forward-fill);
    None when a series has no observation that early.
    """
    out = {}
    for name, series_id in FRED_SERIES.items():
        series = fetch_fred_series(series_id, fred_key)
        # Latest observation date <= date_str, if any (lexical order is
        # chronological for YYYY-MM-DD strings).
        eligible = [d for d in series if d <= date_str]
        out[name] = series[max(eligible)] if eligible else None
    return out
|
||||
|
||||
# ── CoinMetrics community ─────────────────────────────────────────────────────
|
||||
|
||||
# Memoized CoinMetrics fetches: the whole daily series per (asset, metric).
_cm_cache = {} # (asset, metric) -> {date_str -> value}
|
||||
|
||||
def fetch_coinmetrics(asset, metric, date_str):
    """Return the CoinMetrics community daily value for (asset, metric, date).

    The full daily series since 2021-01-01 is fetched once per (asset,
    metric) pair and memoized in _cm_cache; subsequent calls hit the cache.
    Returns a float, or None when the date is absent or the fetch failed.
    """
    cache_key = (asset, metric)
    if cache_key not in _cm_cache:
        url = (f"https://community-api.coinmetrics.io/v4/timeseries/asset-metrics"
               f"?assets={asset}&metrics={metric}&frequency=1d"
               f"&start_time=2021-01-01T00:00:00Z")
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            payload = resp.json()
            series = {}
            for row in payload.get('data', []):
                day = row.get('time', '')[:10]
                raw = row.get(metric)
                if raw is None:
                    continue
                try:
                    series[day] = float(raw)
                except (TypeError, ValueError):
                    pass
            _cm_cache[cache_key] = series
        except Exception as e:
            print(f" CoinMetrics {asset}/{metric} failed: {e}")
            _cm_cache[cache_key] = {}
    return _cm_cache.get(cache_key, {}).get(date_str)
|
||||
|
||||
# (npz_indicator_name, coinmetrics_asset, coinmetrics_metric) triples.
CM_INDICATORS = [
    # Only include metrics confirmed as accessible on community API
    ('mvrv', 'btc', 'CapMVRVCur'), # works (200 OK)
    ('addr_btc', 'btc', 'AdrActCnt'), # works
    ('txcnt', 'btc', 'TxCnt'), # works
]
|
||||
|
||||
# ── Main patcher ──────────────────────────────────────────────────────────────
|
||||
|
||||
def patch_npz(npz_path, updates, dry_run=False):
    """Load an NPZ, apply ``updates`` ({name -> value}), save in-place.

    Only indicators present in the file's ``api_names`` are touched; a value
    is applied when the stored reading previously failed (``api_success``
    False) or differs from the new one by more than 1e-9.  None and
    non-finite values are skipped, making re-runs idempotent.

    Returns the number of fields changed (0 when nothing to do).  With
    ``dry_run=True`` the count is returned but nothing is written.
    """
    data = np.load(str(npz_path), allow_pickle=True)
    names = list(data['api_names'])
    vals = data['api_indicators'].copy()
    success = data['api_success'].copy()
    # O(1) lookups instead of a list.index() scan per update.
    index_of = {n: i for i, n in enumerate(names)}

    changed = []
    for name, value in updates.items():
        if value is None:
            continue
        new_val = float(value)
        if not np.isfinite(new_val):
            continue
        idx = index_of.get(name)
        if idx is None:
            continue  # indicator not tracked in this NPZ
        old = float(vals[idx])
        old_ok = bool(success[idx])
        if not old_ok or abs(old - new_val) > 1e-9:
            vals[idx] = new_val
            success[idx] = True
            changed.append(f"{name}: {old:.4f}→{new_val:.4f}")

    if not changed:
        return 0

    if not dry_run:
        # Preserve any extra arrays the NPZ may carry: the previous re-save
        # wrote only the three api_* arrays, silently dropping other keys.
        out = {k: data[k] for k in data.files}
        out['api_names'] = np.array(names, dtype=object)
        out['api_indicators'] = vals
        out['api_success'] = success
        np.savez_compressed(str(npz_path), **out)
    return len(changed)
|
||||
|
||||
def main():
    """Patch every date's NPZ with real historical indicator values.

    Flow: parse CLI args -> enumerate dates from the kline parquet cache ->
    pre-fetch bulk histories (FNG, FRED, CoinMetrics) once -> per-date loop
    that assembles an updates dict and applies it via patch_npz().
    Honors --dry-run (nothing written) and the --skip-* flags.
    """
    args = parse_args()
    if not HAS_REQUESTS:
        print("ERROR: requests required. pip install requests"); return

    # Enumerate dates: one parquet per date; stems are compared lexically
    # against --start/--end (correct for YYYY-MM-DD stems); 'catalog' files
    # are excluded as non-date entries.
    dates = sorted(p.stem for p in KLINES_DIR.glob("*.parquet") if 'catalog' not in p.name)
    if args.start: dates = [d for d in dates if d >= args.start]
    if args.end: dates = [d for d in dates if d <= args.end]
    total = len(dates)
    print(f"Dates to patch: {total}")
    print(f"Dry run: {args.dry_run}")
    print(f"FNG: {'skip' if args.skip_fng else 'YES'}")
    print(f"Binance: {'skip' if args.skip_binance else 'YES'}")
    print(f"FRED: {'skip (no key)' if (args.skip_fred or not args.fred_key) else f'YES (key={args.fred_key[:6]}...)'}")
    print()

    # ── Fetch FNG all-history up front (one call) ─────────────────────────────
    fng_hist = {}
    if not args.skip_fng:
        print("Fetching FNG full history (one call)...")
        fng_hist = fetch_fng_history()
        print(f" Got {len(fng_hist)} dates "
              f"range={min(fng_hist) if fng_hist else 'n/a'} → {max(fng_hist) if fng_hist else 'n/a'}")
        if fng_hist:
            sample = {k: v for k, v in list(fng_hist.items())[:3]}
            print(f" Sample: {sample}")

    # ── Fetch FRED all-series up front ───────────────────────────────────────
    if args.fred_key and not args.skip_fred:
        print(f"\nPre-fetching FRED series ({len(FRED_SERIES)} series)...")
        for name, sid in FRED_SERIES.items():
            # fetch_fred_series memoizes, so the per-date loop hits the cache
            series = fetch_fred_series(sid, args.fred_key)
            print(f" {name:<12} ({sid}): {len(series)} observations")
            time.sleep(0.6) # FRED rate limit: 120/min

    # ── Fetch CoinMetrics up front ────────────────────────────────────────────
    print(f"\nPre-fetching CoinMetrics ({len(CM_INDICATORS)} metrics)...")
    for cm_name, asset, metric in CM_INDICATORS:
        fetch_coinmetrics(asset, metric, '2023-01-01') # warms cache for all dates
        n = len(_cm_cache.get((asset, metric), {}))
        print(f" {cm_name:<12}: {n} dates")
        time.sleep(0.8)

    # ── Per-date loop ─────────────────────────────────────────────────────────
    print(f"\nPatching NPZ files...")
    total_changed = 0
    # Counts consecutive dates where every Binance fetch failed; at 10 the
    # Binance calls are abandoned for the rest of the run (circuit breaker).
    binance_fail_streak = 0

    t0 = time.time()
    for i, ds in enumerate(dates):
        npz_path = EIGENVALUES_PATH / ds / NPZ_FILENAME
        if not npz_path.exists():
            continue

        # indicator_name -> new value; applied in one shot by patch_npz below
        updates = {}

        # FNG
        if not args.skip_fng and ds in fng_hist:
            updates['fng'] = float(fng_hist[ds])
            # Also try to get sub-components from same entry if available
            # (fng_prev is previous day's value)
            prev_day = (datetime.strptime(ds, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d')
            if prev_day in fng_hist:
                updates['fng_prev'] = float(fng_hist[prev_day])

        # FRED
        if args.fred_key and not args.skip_fred:
            fred_vals = get_fred_indicators(ds, args.fred_key)
            for name, val in fred_vals.items():
                if val is not None:
                    updates[name] = val

        # CoinMetrics
        for cm_name, asset, metric in CM_INDICATORS:
            val = fetch_coinmetrics(asset, metric, ds) # hits cache
            if val is not None:
                updates[cm_name] = val

        # Binance OI/LS/taker (network call per date — slowest)
        if not args.skip_binance and binance_fail_streak < 10:
            # Only call if these are currently failing in the NPZ
            d = np.load(str(npz_path), allow_pickle=True)
            names_in_npz = list(d['api_names'])
            ok_in_npz = d['api_success']
            taker_idx = names_in_npz.index('taker') if 'taker' in names_in_npz else -1
            taker_ok = bool(ok_in_npz[taker_idx]) if taker_idx >= 0 else False

            if not taker_ok: # proxy check: if taker failing, all Binance hist likely failing
                binance_vals = get_binance_indicators(ds)
                n_binance_ok = sum(1 for v in binance_vals.values() if v is not None)
                if n_binance_ok == 0:
                    binance_fail_streak += 1
                else:
                    binance_fail_streak = 0
                updates.update({k: v for k, v in binance_vals.items() if v is not None})

        # Patch
        n_changed = patch_npz(npz_path, updates, dry_run=args.dry_run)
        total_changed += n_changed

        # Progress line every 50 dates, or whenever something changed.
        if (i + 1) % 50 == 0 or n_changed > 0:
            elapsed = time.time() - t0
            rate = (i + 1) / elapsed
            eta = (total - i - 1) / rate if rate > 0 else 0
            tag = f" +{n_changed} fields" if n_changed else ""
            print(f" [{i+1}/{total}] {ds} {elapsed/60:.1f}m eta={eta/60:.1f}m{tag}")

    elapsed = time.time() - t0
    print(f"\n{'='*60}")
    print(f" Patch complete in {elapsed/60:.1f}m")
    print(f" Total fields updated: {total_changed}")
    print(f" {'DRY RUN — no files written' if args.dry_run else 'Files patched in-place'}")
    print(f"{'='*60}")

    # Post-run hints for the operator.
    if not args.fred_key:
        print(f"\n *** FRED indicators (vix, sp500, gold, dxy, us10y, ycurve, fedfunds)")
        print(f" *** were SKIPPED. Get a free API key at: https://fred.stlouisfed.org/docs/api/api_key.html")
        print(f" *** Then re-run with: --fred-key YOUR_KEY_HERE")
    if binance_fail_streak >= 10:
        print(f"\n *** Binance hist endpoints failed consistently.")
        print(f" *** OI data before 2020-09 is not available via Binance API.")
        print(f" *** Dates before that will remain FAIL for oi_btc, ls_btc, taker.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user