initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
305
nautilus_dolphin/test_nautilus_arrow_longrun.py
Executable file
@@ -0,0 +1,305 @@
"""
Nautilus Arrow Long-Running Alpha Engine Test
=============================================
Reads Arrow IPC files directly (no intermediate Parquet cache), converts
on-the-fly to VBT DataFrames, and runs run_full_backtest() day-by-day
using champion_5x_f20.

Optimized: reads all Arrow files per day in bulk (open/read, not mmap)
to avoid the extreme overhead of 8k+ individual memory-map calls.

Usage (activate Siloqy first):
    python test_nautilus_arrow_longrun.py
    python test_nautilus_arrow_longrun.py --start 2026-02-01 --end 2026-02-25
    python test_nautilus_arrow_longrun.py --poll   # continuous polling
"""

import sys
import json
import time
import argparse
import warnings
from pathlib import Path
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.ipc as ipc

warnings.filterwarnings('ignore')

PROJECT_ROOT = Path(r'C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict')
sys.path.insert(0, str(PROJECT_ROOT))
sys.path.insert(0, str(PROJECT_ROOT / 'nautilus_dolphin'))

DEFAULT_ARROW_BASE = PROJECT_ROOT / 'arrow_backfill'
EXCLUDED_ASSETS = {'TUSDUSDT', 'USDCUSDT'}

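# Schema sketch: column names below are taken from the reader code, not from
# a formal spec. Each scan_*.arrow file is expected to hold one snapshot row
# with (at least):
#   timestamp_ns                          int, epoch nanoseconds
#   scan_number                           int
#   w50/w150/w300/w750_velocity           float (lambda-max velocities)
#   vel_div                               float, falls back to w50 - w150
#   w50_instability, w150_instability     float
#   assets_json, asset_prices_json        JSON-encoded parallel lists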
def load_arrow_day_fast(date_dir: Path) -> pd.DataFrame:
    """
    Bulk-read all scan_*.arrow files for one day into a VBT-compatible DataFrame.
    Uses batched file reads (much faster than individual memory-maps).
    """
    arrow_files = sorted(date_dir.glob('scan_*.arrow'))
    if not arrow_files:
        return pd.DataFrame()

    rows = []
    last_prices = {}
    errors = 0

    for af in arrow_files:
        try:
            raw = af.read_bytes()
            reader = ipc.open_file(pa.BufferReader(raw))
            table = reader.read_all()
            if len(table) == 0:
                continue

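            # One snapshot per file: only the first row of each column is used.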
            row = {col: table.column(col)[0].as_py() for col in table.column_names}

            ts_ns = row.get('timestamp_ns') or 0
            if not ts_ns:
                continue
            ts = pd.Timestamp(ts_ns, unit='ns')

            v50 = float(row.get('w50_velocity', 0) or 0)
            v150 = float(row.get('w150_velocity', 0) or 0)
            if v50 == 0.0 and v150 == 0.0:
                continue

            v300 = row.get('w300_velocity')
            v750 = row.get('w750_velocity')
            vd = float(row.get('vel_div', v50 - v150) or (v50 - v150))
            i50 = row.get('w50_instability')
            i150 = row.get('w150_instability')

            assets_raw = json.loads(row.get('assets_json', '[]') or '[]')
            prices_raw = json.loads(row.get('asset_prices_json', '[]') or '[]')

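            # Build the per-scan price vector: stablecoin pairs are excluded,
            # and a missing/zero price falls back to that asset's last good
            # price seen earlier in the day.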
            price_map = {}
            for asset, price in zip(assets_raw, prices_raw):
                if asset in EXCLUDED_ASSETS:
                    continue
                if price is not None and float(price) > 0:
                    price_map[asset] = float(price)
                    last_prices[asset] = float(price)
                elif asset in last_prices:
                    price_map[asset] = last_prices[asset]

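            # BTCUSDT acts as the reference asset; scans without a BTC price
            # are presumed unusable downstream and dropped.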
            if 'BTCUSDT' not in price_map:
                continue

            rec = {
                'timestamp': ts,
                'scan_number': int(row.get('scan_number', 0) or 0),
                'v50_lambda_max_velocity': v50,
                'v150_lambda_max_velocity': v150,
                'v300_lambda_max_velocity': float(v300) if v300 is not None else np.nan,
                'v750_lambda_max_velocity': float(v750) if v750 is not None else np.nan,
                'vel_div': vd,
                'instability_50': float(i50) if i50 is not None else np.nan,
                'instability_150': float(i150) if i150 is not None else np.nan,
            }
            rec.update(price_map)
            rows.append(rec)
        except Exception:
            errors += 1
            continue

    if not rows:
        return pd.DataFrame()

    df = pd.DataFrame(rows).sort_values('timestamp').reset_index(drop=True)
    core = ['timestamp', 'scan_number', 'v50_lambda_max_velocity',
            'v150_lambda_max_velocity', 'v300_lambda_max_velocity',
            'v750_lambda_max_velocity', 'vel_div', 'instability_50', 'instability_150']
    price_cols = [c for c in df.columns if c not in core]
    if price_cols:
        df[price_cols] = df[price_cols].ffill()
    return df


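# Usage sketch (hypothetical date directory; output shape as built above):
#   df = load_arrow_day_fast(DEFAULT_ARROW_BASE / '2026-02-01')
#   -> columns: timestamp, scan_number, v*_lambda_max_velocity, vel_div,
#      instability_50/150, plus one forward-filled price column per asset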
def discover_arrow_dates(arrow_base: Path, start=None, end=None):
    dates = []
    if not arrow_base.exists():
        return dates
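    # A date directory is any YYYY-MM-DD-shaped name (cheap length/dash test,
    # not strict parsing) that contains at least one scan file; '_SKIP' names
    # are ignored.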
    for d in sorted(arrow_base.iterdir()):
        if d.is_dir() and len(d.name) == 10 and d.name[4] == '-':
            if '_SKIP' in d.name:
                continue
            if any(d.glob('scan_*.arrow')):
                if start and d.name < start:
                    continue
                if end and d.name > end:
                    continue
                dates.append(d.name)
    return dates


def run_longrun_test(arrow_base, start, end, poll=False, poll_interval=30):
    from dolphin_vbt_real import run_full_backtest
    from dolphin_paper_trade_adaptive_cb_v2 import STRATEGIES, INIT_CAPITAL
    champion = STRATEGIES['champion_5x_f20']

    print('=' * 70)
    print(' NAUTILUS ARROW LONG-RUNNING ALPHA ENGINE TEST')
    print(f' Strategy: champion_5x_f20 | Capital: ${INIT_CAPITAL:,.0f}')
    print(f' Arrow source: {arrow_base}')
    print(f' Date range: {start} -> {end}')
    print(f' Mode: {"POLL (continuous)" if poll else "BATCH (one-shot)"}')
    print('=' * 70)
    sys.stdout.flush()

    capital = INIT_CAPITAL
    total_tr = 0
    total_wins = 0
    total_fees = 0.0
    peak = capital
    max_dd = 0.0
    processed = set()
    day_results = []

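    # Day-by-day compounding: each day's backtest is seeded with the previous
    # day's ending capital, so cumulative ROI and drawdown span the whole run.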
    def process_date(date_str):
        nonlocal capital, total_tr, total_wins, total_fees, peak, max_dd

        date_dir = arrow_base / date_str
        t0 = time.time()
        df = load_arrow_day_fast(date_dir)
        load_time = time.time() - t0

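        # Sparse-day guard: fewer than 200 scans is treated as too little
        # signal history for a meaningful backtest (heuristic threshold).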
        if len(df) < 200:
            print(f' {date_str}: {len(df)} scans (< 200) -- SKIPPED')
            sys.stdout.flush()
            return None

        t0 = time.time()
        result = run_full_backtest(df, champion, init_cash=capital, seed=42, verbose=False)
        bt_time = time.time() - t0

        capital = result['capital']
        total_tr += result['trades']
        total_wins += result['wins']
        total_fees += result['total_fees']
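        # Peak and drawdown are tracked on end-of-day capital only; intraday
        # drawdown inside run_full_backtest is not visible at this level.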
        if capital > peak:
            peak = capital
        dd = (peak - capital) / peak * 100
        if dd > max_dd:
            max_dd = dd

        wr = total_wins / max(total_tr, 1) * 100
        roi = (capital - INIT_CAPITAL) / INIT_CAPITAL * 100

        day_rec = {
            'date': date_str, 'scans': len(df),
            'day_trades': result['trades'], 'day_wins': result['wins'],
            'capital': round(capital, 2),
            'cum_trades': total_tr, 'cum_wr': round(wr, 2),
            'cum_roi': round(roi, 4), 'max_dd': round(max_dd, 4),
            'load_ms': int(load_time * 1000), 'bt_ms': int(bt_time * 1000),
        }
        day_results.append(day_rec)

        print(f' {date_str}: {len(df):>5} scans | '
              f'{result["trades"]:>2} tr ({result["wins"]}W) | '
              f'cap=${capital:>10,.2f} | '
              f'WR={wr:.1f}% ROI={roi:+.2f}% DD={max_dd:.1f}% | '
              f'[{int(load_time*1000)}ms+{int(bt_time*1000)}ms]')
        sys.stdout.flush()
        return day_rec

    dates = discover_arrow_dates(arrow_base, start, end)
    print(f'\nFound {len(dates)} Arrow dates to process\n')
    sys.stdout.flush()

    for date_str in dates:
        process_date(date_str)
        processed.add(date_str)

    wr = total_wins / max(total_tr, 1) * 100
    roi = (capital - INIT_CAPITAL) / INIT_CAPITAL * 100

    print('\n' + '=' * 70)
    print(' FINAL RESULTS')
    print('=' * 70)
    print(f' Days processed: {len(day_results)}')
    print(f' Total trades: {total_tr}')
    print(f' Total wins: {total_wins}')
    print(f' Win rate: {wr:.2f}%')
    print(f' Final capital: ${capital:,.2f}')
    print(f' ROI: {roi:+.4f}%')
    print(f' Max drawdown: {max_dd:.4f}%')
    print(f' Total fees: ${total_fees:,.2f}')
    print('=' * 70)

    checks = [('WR >= 40%', wr >= 40.0), ('DD <= 20%', max_dd <= 20.0)]
    all_pass = True
    print('\n BENCHMARK CHECK:')
    for label, ok in checks:
        status = 'OK' if ok else 'FAIL'
        print(f' {label:20s} -> [{status}]')
        if not ok:
            all_pass = False

    if all_pass:
        print('\n VERDICT: PASS -- Arrow pipeline produces valid champion signals')
    else:
        print('\n VERDICT: WARN -- some benchmarks missed')

    ts_str = datetime.now().strftime('%Y%m%d_%H%M%S')
    results_dir = PROJECT_ROOT / 'vbt_results'
    results_dir.mkdir(exist_ok=True)
    out_path = results_dir / f'nautilus_arrow_longrun_{ts_str}.json'
    out = {
        'timestamp': datetime.now().isoformat(),
        'strategy': 'champion_5x_f20',
        'arrow_source': str(arrow_base),
        'date_range': [start, end],
        'summary': {
            'days': len(day_results), 'trades': total_tr, 'wins': total_wins,
            'win_rate': round(wr, 2), 'final_capital': round(capital, 2),
            'roi_pct': round(roi, 4), 'max_dd_pct': round(max_dd, 4),
            'total_fees': round(total_fees, 2),
        },
        'daily': day_results,
    }
    with open(out_path, 'w') as f:
        json.dump(out, f, indent=2)
    print(f'\n Results saved -> {out_path}')
    sys.stdout.flush()

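    # POLL mode keeps the process alive and backtests newly backfilled dates
    # as they appear. Note the JSON summary above is written once, before
    # polling begins; polled days update the in-memory totals only.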
    if poll:
        print(f'\n POLL mode: checking every {poll_interval}s for new dates...')
        sys.stdout.flush()
        while True:
            time.sleep(poll_interval)
            new_dates = discover_arrow_dates(arrow_base)
            for d in new_dates:
                if d not in processed:
                    print(f'\n [POLL] New date: {d}')
                    process_date(d)
                    processed.add(d)

    return out


def main():
    parser = argparse.ArgumentParser(description='Nautilus Arrow long-running test')
    parser.add_argument('--arrow-base', default=str(DEFAULT_ARROW_BASE))
    parser.add_argument('--start', default='2026-01-01')
    parser.add_argument('--end', default='2026-02-25')
    parser.add_argument('--poll', action='store_true')
    parser.add_argument('--poll-interval', type=int, default=30)
    args = parser.parse_args()

    run_longrun_test(
        arrow_base=Path(args.arrow_base),
        start=args.start, end=args.end,
        poll=args.poll, poll_interval=args.poll_interval,
    )


if __name__ == '__main__':
    main()