DOLPHIN/prod/tests/run_esof_backtest_sim.py

#!/usr/bin/env python3
"""
EsoF Gate Strategies — 56-Day Gold Backtest Simulation

Runs the gold-spec engine over all 56 vbt_cache parquet days, collects
~2000 trade records with real UTC entry timestamps, then evaluates all
EsoF gate strategies (A–E + S6) and overfitting guard tests against
that statistically substantial dataset.

Timestamp reconstruction:
  parquet 'timestamp' column → Unix seconds or nanoseconds
  NDTradeRecord.entry_bar → row index in the day's dataframe
  entry_ts = datetime.fromtimestamp(ts_col[entry_bar], UTC)

Caches trade data to /tmp/esof_bt_trades.json to avoid re-running the
56-day engine on subsequent test/analysis calls.

Run:
  source /home/dolphin/siloqy_env/bin/activate
  cd /mnt/dolphinng5_predict
  python prod/tests/run_esof_backtest_sim.py          # full run + report
  python prod/tests/run_esof_backtest_sim.py --cached # skip backtest, use cache
"""
from __future__ import annotations

import argparse
import json
import sys
import time
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional

import numpy as np
import pandas as pd

# ── paths ─────────────────────────────────────────────────────────────────────
# _ROOT is three directory levels above this file.
_ROOT     = Path(__file__).parent.parent.parent
# NOTE(review): _PROD_DIR is not referenced anywhere in the visible part of this file.
_PROD_DIR = _ROOT / "prod"
# Each insert goes to index 0, so the LAST path inserted is searched first:
# nautilus_dolphin, then Observability, then _ROOT. These must run before the
# project imports below.
sys.path.insert(0, str(_ROOT))
sys.path.insert(0, str(_ROOT / "Observability"))
sys.path.insert(0, str(_ROOT / "nautilus_dolphin"))

# Directory globbed for per-day *.parquet inputs by run_backtest().
PARQUET_DIR  = _ROOT / "vbt_cache"
# JSON trade cache read/written by load_or_run().
CACHE_FILE   = Path("/tmp/esof_bt_trades.json")

# ── reuse gold engine infrastructure ──────────────────────────────────────────
from prod.backtest_gold_verify import (
    _build_engine, _load_config, _META_COLS_SET, _compute_vol_ok, INITIAL_CAPITAL,
)

# ── EsoF advisory + gate ───────────────────────────────────────────────────────
from esof_advisor import compute_esof, BASELINE_WR
from esof_gate import apply_gate, get_s6_mult, get_bucket, S6_BASE, S6_MULT

# ── statistical helpers (reuse from overfitting test) ─────────────────────────
import math, random

def wr(trades):
    """Return the fraction of trades whose ``pnl`` is strictly positive.

    An empty ``trades`` list yields NaN rather than raising.
    """
    if not trades:
        return float("nan")
    winners = [trade for trade in trades if trade["pnl"] > 0]
    return len(winners) / len(trades)
def net(trades):
    """Return the total ``pnl`` across ``trades`` (0 for an empty list)."""
    # Keep the builtin sum(): its float accumulation differs from a manual loop
    # on newer interpreters, and results must stay identical.
    pnl_values = [trade["pnl"] for trade in trades]
    return sum(pnl_values)
def cohen_h(p1, p2):
    """Return Cohen's h (absolute value) between two proportions.

    Each proportion is clipped to [0, 1] before the arcsine transform.
    """
    def _arcsine(p):
        clipped = max(0, min(1, p))
        return 2 * math.asin(math.sqrt(clipped))

    return abs(_arcsine(p1) - _arcsine(p2))
def bootstrap_ci(vals, n_boot=3000, ci=0.95, seed=42):
    """Return a percentile-bootstrap confidence interval for the mean of ``vals``.

    Args:
        vals: Sequence of numbers to resample (with replacement).
        n_boot: Number of bootstrap resamples.
        ci: Central coverage of the interval, e.g. 0.95.
        seed: Seed for the private ``random.Random`` instance, so results are
            reproducible and the global RNG is untouched.

    Returns:
        ``(lo, hi)`` — the lower and upper percentile of the resampled means.
        ``(nan, nan)`` when ``vals`` is empty or ``n_boot`` is not positive.

    Note:
        An earlier version drew and sorted ``n_boot * len(vals)`` extra values
        that were never used. Dropping them removes the wasted work but shifts
        the seeded RNG stream, so the bounds differ slightly from older runs.
    """
    n = len(vals)
    if n == 0 or n_boot <= 0:
        return float("nan"), float("nan")

    rng = random.Random(seed)
    samples = sorted(
        sum(rng.choice(vals) for _ in range(n)) / n
        for _ in range(n_boot)
    )

    # Clamp both indices so ci == 1.0 (or float rounding) cannot run off the end.
    last = n_boot - 1
    lo = min(max(int((1 - ci) / 2 * n_boot), 0), last)
    hi = min(max(int((1 + ci) / 2 * n_boot), 0), last)
    return samples[lo], samples[hi]
def binomial_se(p, n):
    """Return the binomial standard error sqrt(p(1-p)/n); +inf when ``n`` is not positive."""
    if not n > 0:
        return float("inf")
    variance = p * (1 - p) / n
    return math.sqrt(variance)
def permutation_pvalue(trades, observed_delta, key, blocked_val, n_perm=2000, seed=42):
    """Estimate a one-sided permutation p-value for blocking one label value.

    The ``key`` labels are shuffled against the fixed ``pnl`` column ``n_perm``
    times. Each shuffle's statistic is minus the summed pnl of trades whose
    shuffled label equals ``blocked_val``. The result is the fraction of
    shuffles whose statistic is at least ``observed_delta``.
    """
    shuffler = random.Random(seed)
    shuffled_labels = [trade[key] for trade in trades]
    pnl_column = [trade["pnl"] for trade in trades]

    def _one_permutation():
        # Shuffles in place, so successive permutations chain off each other.
        shuffler.shuffle(shuffled_labels)
        return -sum(
            pnl
            for label, pnl in zip(shuffled_labels, pnl_column)
            if label == blocked_val
        )

    n_at_least = sum(_one_permutation() >= observed_delta for _ in range(n_perm))
    return n_at_least / n_perm


# ── Backtest runner ────────────────────────────────────────────────────────────

def _resolve_entry_ts(ts_raw, entry_bar, date_str) -> datetime:
    """Map a trade's entry bar to a UTC datetime, falling back to the day's midnight."""
    if ts_raw is not None and 0 <= entry_bar < len(ts_raw):
        raw = float(ts_raw[entry_bar])
        # NOTE(review): only seconds and nanoseconds are distinguished; a
        # millisecond/microsecond epoch would be read as nanoseconds — confirm
        # the parquet schema never uses those units.
        if raw > 1e12:        # nanoseconds
            return datetime.fromtimestamp(raw / 1e9, tz=timezone.utc)
        if raw > 1e9:         # seconds (Unix)
            return datetime.fromtimestamp(raw, tz=timezone.utc)
    # Missing column, out-of-range bar, or unrecognised magnitude → midnight UTC.
    return datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)


def run_backtest() -> List[dict]:
    """
    Run gold-spec engine over all vbt_cache parquets.

    Every ``*.parquet`` file in ``PARQUET_DIR`` whose path does not contain
    "catalog" is fed to the engine in sorted filename order. Trades whose
    ``exit_reason`` is HIBERNATE_HALT or SUBDAY_ACB_NORMALIZATION are dropped.

    Returns:
        List of trade dicts with keys ``ts`` (ISO-8601 UTC entry time),
        ``date``, ``asset``, ``pnl``, ``leverage``, ``exit_reason``,
        ``bucket_id``, ``session``, ``dow``, ``score``, ``label``, ``liq_bkt``.

    Raises:
        FileNotFoundError: if no parquet day files are found.
    """
    print("[BT] Loading config from blue.yml ...")
    cfg = _load_config()

    print("[BT] Building engine ...")
    engine = _build_engine(cfg, INITIAL_CAPITAL)
    engine.set_esoteric_hazard_multiplier(0.0)  # gold spec

    parquet_files = [p for p in sorted(PARQUET_DIR.glob("*.parquet"))
                     if "catalog" not in str(p)]
    if not parquet_files:
        # Fail with a clear message instead of an IndexError on parquet_files[0].
        raise FileNotFoundError(f"[BT] No parquet day files found in {PARQUET_DIR}")
    print(f"[BT] {len(parquet_files)} parquet days: {parquet_files[0].stem} → {parquet_files[-1].stem}")

    all_trades: List[dict] = []
    pkl_map: Optional[Dict[str, int]] = None
    try:
        import pickle
        # NOTE: pickle can execute arbitrary code on load; acceptable only
        # because this is a project-local artifact, never external input.
        with open(_ROOT / "adaptive_exit/models/bucket_assignments.pkl", "rb") as f:
            pkl_map = pickle.load(f).get("assignments", {})
    except Exception as exc:
        # Best-effort: continue with pkl_map=None, but do not hide the failure.
        print(f"[BT] WARNING: bucket assignments not loaded ({exc!r}); "
              f"passing pkl_map=None to get_bucket")

    # Loop-invariant: volatility threshold from config (with hard-coded default).
    vol_p60 = float(cfg.get("paper_trade", {}).get("vol_p60", 0.00009868))
    non_alpha_exits = ("HIBERNATE_HALT", "SUBDAY_ACB_NORMALIZATION")

    t_global = time.time()
    for pf in parquet_files:
        date_str = pf.stem
        df       = pd.read_parquet(pf)

        # Save timestamp array for this day before processing
        ts_raw = df["timestamp"].values if "timestamp" in df.columns else None

        asset_cols = [c for c in df.columns if c not in _META_COLS_SET]
        vol_ok     = _compute_vol_ok(df, vol_p60)

        # trade_history grows across days; remember where today's trades start.
        t_before = len(engine.trade_history)
        t0 = time.time()
        engine.process_day(date_str, df, asset_cols, vol_regime_ok=vol_ok,
                           direction=-1, posture="APEX")
        elapsed = time.time() - t0

        day_new = 0
        for tr in engine.trade_history[t_before:]:
            # Skip non-alpha exits before doing any timestamp / EsoF work
            if tr.exit_reason in non_alpha_exits:
                continue

            entry_ts = _resolve_entry_ts(ts_raw, tr.entry_bar, date_str)
            asset    = tr.asset
            bkt      = get_bucket(asset, pkl_map)
            adv      = compute_esof(entry_ts)

            all_trades.append({
                "ts":         entry_ts.isoformat(),
                "date":       date_str,
                "asset":      asset,
                "pnl":        round(tr.pnl_absolute, 4),
                "leverage":   round(tr.leverage, 3),
                "exit_reason":tr.exit_reason,
                "bucket_id":  bkt,
                "session":    adv["session"],
                "dow":        adv["dow"],
                "score":      round(adv["advisory_score"], 4),
                "label":      adv["advisory_label"],
                "liq_bkt":    adv["liq_bucket_3h"],
            })
            day_new += 1

        cum_T   = len(all_trades)
        cap_now = engine.capital
        roi     = (cap_now / INITIAL_CAPITAL - 1) * 100
        print(f"  {date_str}: +{day_new:3d} trades (cum={cum_T:4d})  "
              f"${cap_now:>10,.0f}  ROI={roi:+.1f}%  ({elapsed:.1f}s)", flush=True)

    total_elapsed = time.time() - t_global
    print(f"\n[BT] Done: {len(all_trades)} trades in {total_elapsed:.0f}s  "
          f"ROI={((engine.capital/INITIAL_CAPITAL)-1)*100:+.2f}%")
    return all_trades


def load_or_run(use_cache: bool) -> List[dict]:
    """Return the trade list, from the JSON cache when allowed, else from a fresh run.

    The cache is used only when ``use_cache`` is true AND ``CACHE_FILE`` exists.
    Otherwise the backtest runs and its result overwrites ``CACHE_FILE``.
    """
    cache_usable = use_cache and CACHE_FILE.exists()
    if not cache_usable:
        fresh = run_backtest()
        # Trade dicts are already JSON-serialisable ("ts" is an ISO string).
        with open(CACHE_FILE, "w") as fh:
            json.dump(fresh, fh)
        print(f"[CACHE] Saved to {CACHE_FILE}")
        return fresh

    print(f"[CACHE] Loading from {CACHE_FILE}")
    with open(CACHE_FILE) as fh:
        cached = json.load(fh)
    print(f"  {len(cached)} trades loaded.")
    return cached


# ── Strategy simulation ────────────────────────────────────────────────────────

def run_strategy(strategy: str, trades: List[dict]) -> dict:
    """Replay ``trades`` through one EsoF gate strategy and summarise the counterfactual.

    For strategy "F" every trade is scaled by the per-bucket multiplier from
    ``apply_gate(...).s6_mult`` (default 0.4 for unknown buckets); nothing is
    removed, so ``n_exec == n`` and ``n_blk`` counts multipliers below 1e-6.
    For all other strategies a blocked trade contributes nothing and an
    unblocked trade is scaled by ``lev_mult``.

    Args:
        strategy: Gate strategy code passed to ``apply_gate``.
        trades: Trade dicts with keys ``pnl``, ``label``, ``score``,
            ``session``, ``dow`` and ``bucket_id``.

    Returns:
        dict with ``strategy``, ``n``, ``n_exec``, ``n_blk``, ``n_scl``,
        ``act_pnl``, ``cf_pnl``, ``delta`` (cf − actual), ``wr_act`` and
        ``wr_cf`` (win rates in percent). Win rates are 0 for an empty
        trade list, for every strategy including "F".
    """
    is_s6 = strategy == "F"
    cf_pnl = 0.0
    act_pnl = 0.0
    n_blk = 0
    n_scl = 0
    n_win_cf = 0
    n_win_act = 0

    for t in trades:
        pnl = t["pnl"]
        act_pnl += pnl
        n_win_act += pnl > 0
        adv = {"advisory_label": t["label"], "advisory_score": t["score"],
               "session": t["session"], "dow": t["dow"]}
        r = apply_gate(strategy, adv)

        if is_s6:
            mult = r.s6_mult.get(t["bucket_id"], 0.4)
            scaled = pnl * mult
            cf_pnl += scaled
            n_win_cf += scaled > 0
            n_blk += mult < 1e-6
            n_scl += 0 < mult < 1.0
        elif r.is_blocked:
            n_blk += 1
        else:
            mult = r.lev_mult
            scaled = pnl * mult
            cf_pnl += scaled
            n_win_cf += scaled > 0
            n_scl += mult < 1.0

    n = len(trades)
    # "F" never removes trades, so its denominator is the full trade count.
    n_exec = n if is_s6 else n - n_blk
    wr_act = n_win_act / n * 100 if n else 0
    # max(..., 1) guards the empty / all-blocked case for every strategy; the
    # "F" branch used to divide by n unguarded and crashed on an empty list.
    wr_cf = n_win_cf / max(n_exec, 1) * 100
    return dict(strategy=strategy, n=n, n_exec=n_exec, n_blk=n_blk, n_scl=n_scl,
                act_pnl=round(act_pnl, 2), cf_pnl=round(cf_pnl, 2),
                delta=round(cf_pnl - act_pnl, 2),
                wr_act=round(wr_act, 1), wr_cf=round(wr_cf, 1))

def run_s6_base(trades):
    """Counterfactual for the flat ``S6_BASE`` bucket multipliers (no EsoF input).

    Each trade's pnl is scaled by ``S6_BASE[bucket_id]`` (0.4 when the bucket
    is unknown). Returns ``cf_pnl``, ``delta`` (cf − actual) and ``wr_cf`` in
    percent (0 for an empty trade list).
    """
    scaled_pnls = [t["pnl"] * S6_BASE.get(t["bucket_id"], 0.4) for t in trades]
    cf = sum(scaled_pnls)
    wins = sum(value > 0 for value in scaled_pnls)
    actual = sum(t["pnl"] for t in trades)

    if trades:
        wr_cf = round(wins / len(trades) * 100, 1)
    else:
        wr_cf = 0
    return dict(cf_pnl=round(cf, 2), delta=round(cf - actual, 2), wr_cf=wr_cf)


# ── Overfitting guard (adapted for large sample) ──────────────────────────────

def run_overfitting_report(trades: List[dict]):
    """Compute the overfitting-guard statistics for the NY_AFTERNOON / Monday effects.

    ``trades`` is split positionally into halves H1/H2 (the list is used in the
    order given). The report covers: per-half win rates, permutation p-values,
    Cohen's h effect sizes, z-scores against the baseline win rate, a bootstrap
    CI on the NY_AFTERNOON win rate, bottom- vs top-quartile win rate in H2
    ranked by advisory score, and a count of buckets where the NY_AFTERNOON
    win rate is lower than outside it (both cells need at least 5 trades).

    Args:
        trades: Trade dicts with keys ``pnl``, ``session``, ``dow``, ``score``
            and ``bucket_id``.

    Returns:
        dict of rounded statistics (keys as consumed by ``print_full_report``).
        Cells with no trades yield NaN win rates, a z-score of 0 and a
        ``(nan, nan)`` bootstrap CI instead of raising.
    """
    nan = float("nan")
    n      = len(trades)
    h1, h2 = trades[:n//2], trades[n//2:]
    base   = wr(trades)

    ny     = [t for t in trades if t["session"] == "NY_AFTERNOON"]
    mon    = [t for t in trades if t["dow"] == 0]
    ldn    = [t for t in trades if t["session"] == "LONDON_MORNING"]
    ny_h1  = [t for t in h1 if t["session"] == "NY_AFTERNOON"]
    ny_h2  = [t for t in h2 if t["session"] == "NY_AFTERNOON"]
    mon_h1 = [t for t in h1 if t["dow"] == 0]
    mon_h2 = [t for t in h2 if t["dow"] == 0]

    # Permutation tests: observed statistic is the pnl recovered by blocking the cell.
    p_ny  = permutation_pvalue(trades, -net(ny),  "session", "NY_AFTERNOON")
    p_mon = permutation_pvalue(trades, -net(mon), "dow",     0)

    # Effect sizes
    h_ny  = cohen_h(wr(ny),  base)
    h_mon = cohen_h(wr(mon), base)
    h_ldn = cohen_h(wr(ldn), base)

    def _z_vs_base(cell):
        """z-score of the cell's WR shortfall vs baseline; 0 when undefined."""
        if not cell:
            return 0
        se = binomial_se(base, len(cell))
        # se is 0 when the baseline WR is exactly 0 or 1 — avoid dividing by it.
        return (base - wr(cell)) / se if se > 0 else 0.0

    # Bonferroni z
    z_ny  = _z_vs_base(ny)
    z_mon = _z_vs_base(mon)

    # Walk-forward score prediction: bottom vs top score quartile within H2
    h2s   = sorted(h2, key=lambda t: t["score"])
    q     = max(1, len(h2s)//4)
    wr_bot, wr_top = wr(h2s[:q]), wr(h2s[-q:])

    # Bootstrap CI on WR (mean of 0/1 win indicators); skipped when the
    # NY_AFTERNOON cell is empty, since there is nothing to resample.
    if ny:
        ny_wrs = [1 if t["pnl"] > 0 else 0 for t in ny]
        ny_lo, ny_hi = bootstrap_ci(ny_wrs, n_boot=3000)
    else:
        ny_lo, ny_hi = nan, nan

    # Session-bucket confound check
    by_bkt_ny  = defaultdict(list)
    by_bkt_out = defaultdict(list)
    for t in ny:
        by_bkt_ny[t["bucket_id"]].append(t)
    for t in trades:
        if t["session"] != "NY_AFTERNOON":
            by_bkt_out[t["bucket_id"]].append(t)
    n_cross = sum(1 for b in by_bkt_ny if len(by_bkt_ny[b]) >= 5
                  and len(by_bkt_out.get(b, [])) >= 5
                  and wr(by_bkt_ny[b]) < wr(by_bkt_out[b]))

    return dict(
        n=n, base=base,
        ny_n=len(ny), ny_wr=round(wr(ny),3), ny_net=round(net(ny),0),
        mon_n=len(mon), mon_wr=round(wr(mon),3), mon_net=round(net(mon),0),
        ldn_n=len(ldn), ldn_wr=round(wr(ldn),3),
        ny_h1_wr=round(wr(ny_h1),3), ny_h2_wr=round(wr(ny_h2),3),
        mon_h1_wr=round(wr(mon_h1),3), mon_h2_wr=round(wr(mon_h2),3),
        p_ny=round(p_ny,4), p_mon=round(p_mon,4),
        h_ny=round(h_ny,3), h_mon=round(h_mon,3), h_ldn=round(h_ldn,3),
        z_ny=round(z_ny,2), z_mon=round(z_mon,2),
        ny_wr_ci=(round(ny_lo,3), round(ny_hi,3)),
        wf_top=round(wr_top,3), wf_bot=round(wr_bot,3),
        n_cross_bucket=n_cross,
    )


# ── Report printer ─────────────────────────────────────────────────────────────

# ANSI escape sequences used to colour the terminal report.
G = "\033[32m"  # green
R = "\033[31m"  # red
Y = "\033[33m"  # yellow
B = "\033[1m"   # bold
D = "\033[2m"   # dim
X = "\033[0m"   # reset

def col(v, good_if_positive=True):
    """Pick the colour code for a signed value.

    Returns ``G`` when the sign of ``v`` is the favourable one, ``R`` when it
    is the unfavourable one, and ``X`` (reset) when ``v`` is neither positive
    nor negative.
    """
    if v > 0:
        favourable = good_if_positive
    elif v < 0:
        favourable = not good_if_positive
    else:
        return X
    return G if favourable else R

def print_full_report(strategies, s6base, ov):
    """Print the gate-strategy table and the overfitting-guard summary to stdout.

    Args:
        strategies: Non-empty list of ``run_strategy`` result dicts; the first
            entry supplies the baseline WR / net PnL / trade count.
        s6base: Result dict from ``run_s6_base``.
        ov: Result dict from ``run_overfitting_report``.

    Notes:
        * The "56-Day" title and the period shown in the header are literal
          strings, not derived from the trade data.
        * The "F vs S6_BASE" uplift row is printed only when a strategy "F"
          result is present.
        * Section headers are plain strings: a quoted literal containing a
          backslash inside an f-string expression is a SyntaxError before
          Python 3.12.
    """
    base_pnl = strategies[0]["act_pnl"]
    base_wr  = strategies[0]["wr_act"]
    n        = strategies[0]["n"]

    print(f"\n{B}{'═'*74}{X}")
    print(f"{B}  EsoF Gate — 56-Day Gold Backtest  ({n} clean alpha trades){X}")
    print(f"  Baseline: WR={base_wr:.1f}%  Net=${base_pnl:+,.0f}  "
          f"Period: 2025-12-31 → 2026-02-25")
    print(f"{'═'*74}{X}")

    # Gate results table
    NAMES = {"A":"A: LEV_SCALE","B":"B: HARD_BLOCK","C":"C: DOW_BLOCK",
             "D":"D: SESSION_BLOCK","E":"E: COMBINED","F":"F: S6_BUCKET"}
    hdr = f"\n  {'Strategy':<22}│{'T_exec':>7}│{'T_blk':>6}│{'CF Net':>11}│{'ΔPnL':>10}│{'WR_cf':>7}│{'WR_Δ':>6}"
    sep = f"  {'─'*22}┼{'─'*7}┼{'─'*6}┼{'─'*11}┼{'─'*10}┼{'─'*7}┼{'─'*6}"
    print(f"{B}{hdr}{X}\n{sep}")
    for r in strategies:
        nm   = NAMES.get(r["strategy"], r["strategy"])
        dpnl = r["delta"]
        dwr  = r["wr_cf"] - r["wr_act"]
        c    = G if dpnl > 0 else R
        wc   = G if dwr  > 0 else R
        print(f"  {nm:<22}│{r['n_exec']:>7}│{r['n_blk']:>6}│"
              f"{c}{r['cf_pnl']:>+11,.0f}{X}│{c}{dpnl:>+10,.0f}{X}│"
              f"{wc}{r['wr_cf']:>6.1f}%{X}│{wc}{dwr:>+5.1f}pp{X}")
    print(sep)
    # Uplift of the EsoF-modulated S6 strategy over the flat S6 baseline.
    f_r = next((r for r in strategies if r["strategy"] == "F"), None)
    if f_r is not None:
        fvs = f_r["cf_pnl"] - s6base["cf_pnl"]
        c   = G if fvs > 0 else R
        print(f"  {'F vs S6_BASE (EsoF uplift)':<22}│{'':>7}│{'':>6}│{'':>11}│"
              f"{c}{fvs:>+10,.0f}{X}│{'':>7}│")
    print(f"  {'S6_BASE (flat, no EsoF)':<22}│{'':>7}│{'':>6}│"
          f"{s6base['cf_pnl']:>+11,.0f}│{s6base['delta']:>+10,.0f}│"
          f"{s6base['wr_cf']:>6.1f}%│")

    # Overfitting guard
    print(f"\n{B}  Overfitting Guard — Large-Sample Results{X}")
    print(f"  {'─'*68}")

    def orow(label, val, good=True, ref=None, fmt=".3f", suffix=""):
        # Green when (val < ref) matches `good`; uncoloured without a reference.
        v = f"{val:{fmt}}{suffix}"
        if ref is not None:
            c = G if (val < ref) == good else R
        else:
            c = X
        print(f"  {label:<42} {c}{v}{X}")

    print("  1. Temporal Stability")
    # NOTE(review): the "n=" shown for H1 is half the cell's total count, not
    # the actual number of cell trades that fall in H1.
    orow(f"  NY_AFT WR H1 (n={ov['ny_n']//2})", ov["ny_h1_wr"], ref=ov["base"])
    orow(f"  NY_AFT WR H2",                      ov["ny_h2_wr"], ref=ov["base"])
    orow(f"  Monday WR H1 (n={ov['mon_n']//2})", ov["mon_h1_wr"], ref=ov["base"])
    orow(f"  Monday WR H2",                      ov["mon_h2_wr"], ref=ov["base"])

    print("\n  2. Permutation p-values (n_perm=2000)")
    c_ny  = G if ov["p_ny"]  < 0.05 else Y if ov["p_ny"]  < 0.15 else R
    c_mon = G if ov["p_mon"] < 0.05 else Y if ov["p_mon"] < 0.15 else R
    print(f"  {'  NY_AFT p-value':<42} {c_ny}{ov['p_ny']:.4f}{X}  {D}(< 0.05 = significant){X}")
    print(f"  {'  Monday p-value':<42} {c_mon}{ov['p_mon']:.4f}{X}")

    print("\n  3. Effect sizes (Cohen's h)")
    for label, h, n_cell in [("NY_AFT",ov["h_ny"],ov["ny_n"]),
                              ("Monday",ov["h_mon"],ov["mon_n"]),
                              ("London",ov["h_ldn"],ov["ldn_n"])]:
        grade = "large" if h>=0.8 else "medium" if h>=0.5 else "small" if h>=0.2 else "trivial"
        c = G if h>=0.5 else Y if h>=0.2 else R
        print(f"  {'  '+label:<42} {c}{h:.3f}{X}  {D}{grade} (n={n_cell}){X}")

    print("\n  4. Bonferroni z-scores (35 cells, crit≈2.99)")
    crit = 2.99
    for label, z in [("NY_AFT", ov["z_ny"]), ("Monday", ov["z_mon"])]:
        c = G if z > crit else Y if z > 2.0 else R
        print(f"  {'  '+label:<42} {c}{z:.2f}{X}")

    print("\n  5. Bootstrap 95% CI on NY_AFT WR")
    lo, hi = ov["ny_wr_ci"]
    c = G if hi < ov["base"] else R
    print(f"  {'  NY_AFT WR CI':<42} {c}[{lo:.3f}, {hi:.3f}]{X}  "
          f"{D}({'below' if hi < ov['base'] else 'overlaps'} baseline {ov['base']:.3f}){X}")

    print("\n  6. Walk-forward: advisory score → H2 WR")
    c = G if ov["wf_top"] > ov["wf_bot"] else R
    print(f"  {'  Top-quartile WR (H2)':<42} {c}{ov['wf_top']:.3f}{X}")
    print(f"  {'  Bot-quartile WR (H2)':<42} {c}{ov['wf_bot']:.3f}{X}")
    print(f"  {'  Predictive?':<42} {c}{'YES' if ov['wf_top'] > ov['wf_bot'] else 'NO — overfit'}{X}")

    print("\n  7. Cross-bucket NY_AFT confound check")
    c = G if ov["n_cross_bucket"] >= 2 else Y if ov["n_cross_bucket"] == 1 else R
    print(f"  {'  Buckets confirming NY_AFT drag':<42} {c}{ov['n_cross_bucket']}{X}  "
          f"{D}(≥ 2 = session-driven, not bucket-proxy){X}")

    print(f"\n{'═'*74}\n")


# ── Main ───────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # CLI: --cached reuses the JSON trade cache instead of re-running the engine.
    parser = argparse.ArgumentParser()
    parser.add_argument("--cached", action="store_true", help="Use cached trades (skip backtest)")
    cli = parser.parse_args()

    trades = load_or_run(use_cache=cli.cached)
    n_trades = len(trades)

    # Fewer than 100 trades is treated as a setup failure, not a result.
    if n_trades < 100:
        print(f"{R}Too few trades ({n_trades}) — check engine setup.{X}")
        sys.exit(1)

    print(f"\n[SIM] Running gate strategies on {n_trades} trades ...")
    strategy_results = [run_strategy(code, trades) for code in "ABCDEF"]
    s6base = run_s6_base(trades)

    print("[OV]  Running overfitting guard ...")
    ov = run_overfitting_report(trades)

    print_full_report(strategy_results, s6base, ov)