Files
DOLPHIN/prod/tests/run_esof_backtest_sim.py

428 lines
18 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
EsoF Gate Strategies 56-Day Gold Backtest Simulation
Runs the gold-spec engine over all 56 vbt_cache parquet days, collects
~2000 trade records with real UTC entry timestamps, then evaluates all
EsoF gate strategies (AE + S6) and overfitting guard tests against
that statistically substantial dataset.
Timestamp reconstruction:
    parquet 'timestamp' column  -> Unix seconds or nanoseconds
    NDTradeRecord.entry_bar     -> row index in the day's dataframe
    entry_ts = datetime.fromtimestamp(ts_col[entry_bar], UTC)
Caches trade data to /tmp/esof_bt_trades.json to avoid re-running the
56-day engine on subsequent test/analysis calls.
Run:
source /home/dolphin/siloqy_env/bin/activate
cd /mnt/dolphinng5_predict
python prod/tests/run_esof_backtest_sim.py # full run + report
python prod/tests/run_esof_backtest_sim.py --cached # skip backtest, use cache
"""
from __future__ import annotations
import argparse
import json
import sys
import time
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional
import numpy as np
import pandas as pd
# ── paths ─────────────────────────────────────────────────────────────────────
# Repository root is three directory levels above this file.
_ROOT = Path(__file__).parent.parent.parent
_PROD_DIR = _ROOT.joinpath("prod")

# Every entry is pushed to the FRONT of sys.path, so the last one listed here
# ends up being searched first.
for _import_root in (_ROOT, _ROOT / "Observability", _ROOT / "nautilus_dolphin"):
    sys.path.insert(0, str(_import_root))
del _import_root

# Directory scanned for the per-day parquet files, and the JSON trade cache.
PARQUET_DIR = _ROOT.joinpath("vbt_cache")
CACHE_FILE = Path("/tmp/esof_bt_trades.json")
# ── reuse gold engine infrastructure ──────────────────────────────────────────
from prod.backtest_gold_verify import (
_build_engine, _load_config, _META_COLS_SET, _compute_vol_ok, INITIAL_CAPITAL,
)
# ── EsoF advisory + gate ───────────────────────────────────────────────────────
from esof_advisor import compute_esof, BASELINE_WR
from esof_gate import apply_gate, get_s6_mult, get_bucket, S6_BASE, S6_MULT
# ── statistical helpers (reuse from overfitting test) ─────────────────────────
import math, random
def wr(trades):
    """Return the win rate: the fraction of trades whose ``pnl`` is strictly positive.

    An empty (falsy) trade collection yields NaN instead of raising.
    """
    if not trades:
        return float("nan")
    winners = [trade for trade in trades if trade["pnl"] > 0]
    return len(winners) / len(trades)
def net(trades):
    """Return the total PnL: the sum of every trade's ``pnl`` field (0 when there are no trades)."""
    pnl_values = (trade["pnl"] for trade in trades)
    return sum(pnl_values)
def cohen_h(p1, p2):
    """Return Cohen's h, the effect size between two proportions.

    h = |2*asin(sqrt(p1)) - 2*asin(sqrt(p2))|.  Each proportion is clamped to
    [0, 1] before the transform so slightly out-of-range inputs do not raise.

    A NaN input (e.g. the win rate of an empty cell) yields NaN.  Without this
    guard ``max(0, min(1, nan))`` evaluates to 1, so an empty cell would be
    scored as if its win rate were 100%.
    """
    if math.isnan(p1) or math.isnan(p2):
        return float("nan")

    def _phi(p):
        # Arcsine-square-root transform of a proportion clamped to [0, 1].
        return 2 * math.asin(math.sqrt(max(0, min(1, p))))

    return abs(_phi(p1) - _phi(p2))
def bootstrap_ci(vals, n_boot=3000, ci=0.95, seed=42):
    """Return a percentile-bootstrap confidence interval for the mean of *vals*.

    Draws ``n_boot`` resamples of ``len(vals)`` items (with replacement) from a
    RNG seeded with *seed*, sorts the resample means, and returns the pair
    ``(lower, upper)`` found at the ``(1-ci)/2`` and ``(1+ci)/2`` positions.

    Returns ``(nan, nan)`` when *vals* is empty or ``n_boot`` is not positive,
    since no resample can be drawn in those cases.
    """
    n = len(vals)
    if n == 0 or n_boot <= 0:
        return float("nan"), float("nan")

    rng = random.Random(seed)
    means = sorted(sum(rng.choices(vals, k=n)) / n for _ in range(n_boot))

    # Clamp both positions into the valid index range; ci=1.0 would otherwise
    # ask for index n_boot, one past the end.
    lo = max(int((1 - ci) / 2 * n_boot), 0)
    hi = min(int((1 + ci) / 2 * n_boot), n_boot - 1)
    return means[lo], means[hi]
def binomial_se(p, n):
    """Return the binomial standard error ``sqrt(p * (1 - p) / n)``.

    A non-positive sample size *n* yields ``inf``.
    """
    if n > 0:
        variance = p * (1 - p) / n
        return math.sqrt(variance)
    return float("inf")
def permutation_pvalue(trades, observed_delta, key, blocked_val, n_perm=2000, seed=42):
    """Return a permutation p-value for the PnL removed by blocking one label.

    The ``key`` labels of *trades* are shuffled ``n_perm`` times with a RNG
    seeded by *seed* while the PnL sequence stays fixed.  After each shuffle
    the negated total PnL of the trades that now carry ``blocked_val`` is
    compared with *observed_delta*; the result is the fraction of shuffles in
    which it is greater than or equal to *observed_delta*.
    """
    shuffler = random.Random(seed)
    shuffled_labels = [trade[key] for trade in trades]
    pnl_values = [trade["pnl"] for trade in trades]

    hits = 0
    for _ in range(n_perm):
        # The same list is reshuffled in place each round, so every
        # permutation builds on the previous one.
        shuffler.shuffle(shuffled_labels)
        blocked_total = sum(
            pnl
            for lab, pnl in zip(shuffled_labels, pnl_values)
            if lab == blocked_val
        )
        if -blocked_total >= observed_delta:
            hits += 1
    return hits / n_perm
# ── Backtest runner ────────────────────────────────────────────────────────────
def _resolve_entry_ts(ts_raw, entry_bar: int, date_str: str) -> datetime:
    """Map a trade's entry bar to a timezone-aware UTC datetime.

    Values above 1e12 are read as nanoseconds since the epoch and values above
    1e9 as Unix seconds.  Anything else (missing column, out-of-range bar, or
    a smaller value) falls back to midnight UTC of *date_str*.
    """
    if ts_raw is not None and 0 <= entry_bar < len(ts_raw):
        raw = float(ts_raw[entry_bar])
        # NOTE(review): millisecond/microsecond epochs would also exceed 1e12
        # and be read as nanoseconds - confirm the column is only s or ns.
        if raw > 1e12:
            return datetime.fromtimestamp(raw / 1e9, tz=timezone.utc)
        if raw > 1e9:
            return datetime.fromtimestamp(raw, tz=timezone.utc)
    return datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)


def run_backtest() -> List[dict]:
    """Run the gold-spec engine over every parquet day and collect trade records.

    Each ``*.parquet`` file in ``PARQUET_DIR`` (sorted, skipping paths that
    contain "catalog") is fed to ``engine.process_day``.  Every new trade in
    ``engine.trade_history`` is turned into a JSON-serialisable dict holding
    its UTC entry timestamp, asset, PnL, leverage, exit reason, bucket id and
    the EsoF advisory fields computed for that timestamp.  Trades whose exit
    reason is ``HIBERNATE_HALT`` or ``SUBDAY_ACB_NORMALIZATION`` are skipped.

    Returns:
        The list of trade dicts, in processing order.

    Raises:
        FileNotFoundError: if ``PARQUET_DIR`` holds no usable parquet files.
    """
    print("[BT] Loading config from blue.yml ...")
    cfg = _load_config()
    print("[BT] Building engine ...")
    engine = _build_engine(cfg, INITIAL_CAPITAL)
    engine.set_esoteric_hazard_multiplier(0.0)  # gold spec

    parquet_files = [
        p for p in sorted(PARQUET_DIR.glob("*.parquet")) if "catalog" not in str(p)
    ]
    if not parquet_files:
        raise FileNotFoundError(f"No parquet day files found in {PARQUET_DIR}")
    print(f"[BT] {len(parquet_files)} parquet days: "
          f"{parquet_files[0].stem} → {parquet_files[-1].stem}")

    # Optional asset -> bucket mapping.  Loading is best effort: on failure the
    # run continues with pkl_map=None, but the reason is reported.
    pkl_map: Optional[Dict[str, int]] = None
    try:
        import pickle
        # NOTE: pickle.load can execute arbitrary code from the file; this
        # must only ever read a trusted, locally produced artefact.
        with open(_ROOT / "adaptive_exit/models/bucket_assignments.pkl", "rb") as f:
            pkl_map = pickle.load(f).get("assignments", {})
    except Exception as exc:
        print(f"[BT] WARNING: bucket assignments not loaded ({exc!r}); "
              f"continuing with pkl_map=None")

    # The volatility threshold comes from config and is the same for every day.
    vol_p60 = float(cfg.get("paper_trade", {}).get("vol_p60", 0.00009868))
    non_alpha_exits = ("HIBERNATE_HALT", "SUBDAY_ACB_NORMALIZATION")

    all_trades: List[dict] = []
    t_global = time.time()
    for pf in parquet_files:
        date_str = pf.stem
        df = pd.read_parquet(pf)
        # Keep the day's timestamp column so entry bars can be mapped to UTC.
        ts_raw = df["timestamp"].values if "timestamp" in df.columns else None
        asset_cols = [c for c in df.columns if c not in _META_COLS_SET]
        vol_ok = _compute_vol_ok(df, vol_p60)

        t_before = len(engine.trade_history)
        t0 = time.time()
        engine.process_day(date_str, df, asset_cols, vol_regime_ok=vol_ok,
                           direction=-1, posture="APEX")
        elapsed = time.time() - t0

        day_new = 0
        # Only the trades appended during this day's processing.
        for tr in engine.trade_history[t_before:]:
            if tr.exit_reason in non_alpha_exits:
                continue  # skip non-alpha exits
            entry_ts = _resolve_entry_ts(ts_raw, tr.entry_bar, date_str)
            asset = tr.asset
            adv = compute_esof(entry_ts)
            all_trades.append({
                "ts": entry_ts.isoformat(),
                "date": date_str,
                "asset": asset,
                "pnl": round(tr.pnl_absolute, 4),
                "leverage": round(tr.leverage, 3),
                "exit_reason": tr.exit_reason,
                "bucket_id": get_bucket(asset, pkl_map),
                "session": adv["session"],
                "dow": adv["dow"],
                "score": round(adv["advisory_score"], 4),
                "label": adv["advisory_label"],
                "liq_bkt": adv["liq_bucket_3h"],
            })
            day_new += 1

        cum_T = len(all_trades)
        cap_now = engine.capital
        roi = (cap_now / INITIAL_CAPITAL - 1) * 100
        print(f" {date_str}: +{day_new:3d} trades (cum={cum_T:4d}) "
              f"${cap_now:>10,.0f} ROI={roi:+.1f}% ({elapsed:.1f}s)", flush=True)

    total_elapsed = time.time() - t_global
    print(f"\n[BT] Done: {len(all_trades)} trades in {total_elapsed:.0f}s "
          f"ROI={((engine.capital/INITIAL_CAPITAL)-1)*100:+.2f}%")
    return all_trades
def load_or_run(use_cache: bool) -> List[dict]:
    """Return the trade list, from the JSON cache when allowed and present.

    With ``use_cache`` true and ``CACHE_FILE`` existing, the cached list is
    loaded and returned.  Otherwise the backtest is run, its result is written
    to ``CACHE_FILE`` as JSON, and the fresh list is returned.
    """
    if not (use_cache and CACHE_FILE.exists()):
        fresh = run_backtest()
        # Every trade's "ts" is already an ISO string, so the list dumps as-is.
        with open(CACHE_FILE, "w") as sink:
            json.dump(fresh, sink)
        print(f"[CACHE] Saved to {CACHE_FILE}")
        return fresh

    print(f"[CACHE] Loading from {CACHE_FILE}")
    with open(CACHE_FILE) as src:
        cached = json.load(src)
    print(f" {len(cached)} trades loaded.")
    return cached
# ── Strategy simulation ────────────────────────────────────────────────────────
def run_strategy(strategy: str, trades: List[dict]) -> dict:
    """Replay *trades* through one gate strategy and summarise the counterfactual.

    For strategy "F" each trade's PnL is scaled by the per-bucket multiplier
    from the gate result (0.4 when the bucket is absent); a multiplier below
    1e-6 counts as blocked, one strictly between 0 and 1 as scaled, and every
    trade counts as executed.  For any other strategy a blocked trade
    contributes nothing, while the rest are scaled by the gate's leverage
    multiplier.

    Returns a dict with the strategy code, trade counts (``n``, ``n_exec``,
    ``n_blk``, ``n_scl``), actual and counterfactual PnL, their difference,
    and both win rates in percent.
    """
    is_s6 = strategy == "F"
    actual_total = 0.0
    gated_total = 0.0
    blocked = 0
    scaled = 0
    wins_actual = 0
    wins_gated = 0

    for trade in trades:
        pnl = trade["pnl"]
        actual_total += pnl
        wins_actual += pnl > 0

        advisory = {
            "advisory_label": trade["label"],
            "advisory_score": trade["score"],
            "session": trade["session"],
            "dow": trade["dow"],
        }
        decision = apply_gate(strategy, advisory)

        if is_s6:
            mult = decision.s6_mult.get(trade["bucket_id"], 0.4)
            gated = pnl * mult
            gated_total += gated
            wins_gated += gated > 0
            blocked += mult < 1e-6
            scaled += 0 < mult < 1.0
            continue

        mult = decision.lev_mult
        if decision.is_blocked:
            blocked += 1
            continue
        gated = pnl * mult
        gated_total += gated
        wins_gated += gated > 0
        scaled += mult < 1.0

    n = len(trades)
    # Strategy F never removes trades from the executed count.
    n_exec = n if is_s6 else n - blocked
    wr_act = wins_actual / n * 100 if n else 0
    if is_s6:
        wr_cf = wins_gated / n * 100
    else:
        wr_cf = wins_gated / max(n_exec, 1) * 100

    return dict(
        strategy=strategy,
        n=n,
        n_exec=n_exec,
        n_blk=blocked,
        n_scl=scaled,
        act_pnl=round(actual_total, 2),
        cf_pnl=round(gated_total, 2),
        delta=round(gated_total - actual_total, 2),
        wr_act=round(wr_act, 1),
        wr_cf=round(wr_cf, 1),
    )
def run_s6_base(trades):
    """Summarise the flat S6 baseline: each PnL scaled by its bucket's ``S6_BASE`` value.

    Buckets missing from ``S6_BASE`` use a multiplier of 0.4.  Returns the
    scaled PnL total, its difference from the unscaled total, and the share of
    scaled PnLs that are positive, in percent (0 when there are no trades).
    """
    scaled_pnls = [t["pnl"] * S6_BASE.get(t["bucket_id"], 0.4) for t in trades]
    cf_total = sum(scaled_pnls)
    n_wins = sum(value > 0 for value in scaled_pnls)
    actual_total = sum(t["pnl"] for t in trades)

    if trades:
        win_pct = round(n_wins / len(trades) * 100, 1)
    else:
        win_pct = 0
    return dict(
        cf_pnl=round(cf_total, 2),
        delta=round(cf_total - actual_total, 2),
        wr_cf=win_pct,
    )
# ── Overfitting guard (adapted for large sample) ──────────────────────────────
def run_overfitting_report(trades: List[dict]):
    """Compute overfitting-guard statistics for the NY_AFTERNOON and Monday cells.

    The trade list is split into a first and second half by position
    (presumably chronological - verify against the caller).  The report has:

    * cell sizes, win rates and net PnL for NY_AFTERNOON, Monday (dow == 0)
      and LONDON_MORNING, plus per-half win rates for the first two;
    * permutation p-values for the PnL removed by blocking each cell;
    * Cohen's h of each cell's win rate against the overall win rate;
    * z-scores of the overall win rate minus the cell win rate;
    * a bootstrap 95% CI of the NY_AFTERNOON win rate;
    * top/bottom advisory-score quartile win rates on the second half;
    * the number of buckets (>= 5 trades inside and outside NY_AFTERNOON)
      whose win rate is lower inside NY_AFTERNOON.

    Degenerate inputs do not raise: an empty NY_AFTERNOON cell gives a
    ``(nan, nan)`` CI, and a zero standard error (overall win rate of 0 or 1)
    gives a z-score of 0.

    Returns:
        A flat dict of the rounded statistics described above.
    """
    nan = float("nan")
    n = len(trades)
    h1, h2 = trades[:n // 2], trades[n // 2:]
    base = wr(trades)

    def in_session(pool, name):
        # Trades from *pool* whose session label equals *name*.
        return [t for t in pool if t["session"] == name]

    def on_monday(pool):
        # Trades from *pool* with dow == 0.
        return [t for t in pool if t["dow"] == 0]

    ny = in_session(trades, "NY_AFTERNOON")
    mon = on_monday(trades)
    ldn = in_session(trades, "LONDON_MORNING")
    ny_h1, ny_h2 = in_session(h1, "NY_AFTERNOON"), in_session(h2, "NY_AFTERNOON")
    mon_h1, mon_h2 = on_monday(h1), on_monday(h2)

    # Permutation tests: the observed delta is the PnL regained by blocking the cell.
    ny_pnl = sum(t["pnl"] for t in ny)
    mon_pnl = sum(t["pnl"] for t in mon)
    p_ny = permutation_pvalue(trades, -ny_pnl, "session", "NY_AFTERNOON")
    p_mon = permutation_pvalue(trades, -mon_pnl, "dow", 0)

    # Effect sizes
    h_ny = cohen_h(wr(ny), base)
    h_mon = cohen_h(wr(mon), base)
    h_ldn = cohen_h(wr(ldn), base)

    def z_below_base(cell):
        # z-score of (overall WR - cell WR); 0 for an empty cell or when the
        # standard error is zero/undefined, which would otherwise divide by zero.
        if not cell:
            return 0
        se = binomial_se(base, len(cell))
        if not se > 0:
            return 0
        return (base - wr(cell)) / se

    z_ny = z_below_base(ny)
    z_mon = z_below_base(mon)

    # Walk-forward: win rate of the bottom vs top advisory-score quartile in H2.
    h2s = sorted(h2, key=lambda t: t["score"])
    q = max(1, len(h2s) // 4)
    wr_bot, wr_top = wr(h2s[:q]), wr(h2s[-q:])

    # Bootstrap CI on the NY_AFTERNOON win rate (mean of 0/1 outcomes).
    ny_wrs = [1 if t["pnl"] > 0 else 0 for t in ny]
    if ny_wrs:
        ny_lo, ny_hi = bootstrap_ci(ny_wrs, n_boot=3000)
    else:
        ny_lo, ny_hi = nan, nan  # nothing to resample

    # Session-bucket confound check
    by_bkt_ny = defaultdict(list)
    by_bkt_out = defaultdict(list)
    for t in trades:
        target = by_bkt_ny if t["session"] == "NY_AFTERNOON" else by_bkt_out
        target[t["bucket_id"]].append(t)
    n_cross = sum(
        1
        for b, inside in by_bkt_ny.items()
        if len(inside) >= 5
        and len(by_bkt_out.get(b, [])) >= 5
        and wr(inside) < wr(by_bkt_out[b])
    )

    return dict(
        n=n, base=base,
        ny_n=len(ny), ny_wr=round(wr(ny), 3), ny_net=round(net(ny), 0),
        mon_n=len(mon), mon_wr=round(wr(mon), 3), mon_net=round(net(mon), 0),
        ldn_n=len(ldn), ldn_wr=round(wr(ldn), 3),
        ny_h1_wr=round(wr(ny_h1), 3), ny_h2_wr=round(wr(ny_h2), 3),
        mon_h1_wr=round(wr(mon_h1), 3), mon_h2_wr=round(wr(mon_h2), 3),
        p_ny=round(p_ny, 4), p_mon=round(p_mon, 4),
        h_ny=round(h_ny, 3), h_mon=round(h_mon, 3), h_ldn=round(h_ldn, 3),
        z_ny=round(z_ny, 2), z_mon=round(z_mon, 2),
        ny_wr_ci=(round(ny_lo, 3), round(ny_hi, 3)),
        wf_top=round(wr_top, 3), wf_bot=round(wr_bot, 3),
        n_cross_bucket=n_cross,
    )
# ── Report printer ─────────────────────────────────────────────────────────────
# ANSI escape sequences used to style the terminal report.
G = "\033[32m"  # green
R = "\033[31m"  # red
Y = "\033[33m"  # yellow
B = "\033[1m"   # bold
D = "\033[2m"   # dim
X = "\033[0m"   # reset all attributes
def col(v, good_if_positive=True):
    """Return the colour code for a signed value.

    Positive values get ``G`` and negative ones ``R`` when ``good_if_positive``
    is truthy, and the reverse otherwise.  A value that is neither above nor
    below zero gets the reset code ``X``.
    """
    is_pos = v > 0
    if not (is_pos or v < 0):
        return X
    favourable = bool(is_pos) == bool(good_if_positive)
    return G if favourable else R
def print_full_report(strategies, s6base, ov):
    """Print the colourised gate-strategy table and overfitting-guard summary.

    Args:
        strategies: list of ``run_strategy`` result dicts; the first entry
            supplies the baseline win rate, net PnL and trade count.
        s6base: result dict of ``run_s6_base``.
        ov: result dict of ``run_overfitting_report``.

    The "F vs S6_BASE" uplift row is printed only when a strategy "F" result
    is present in *strategies*.
    """
    # Horizontal rules.  These must be non-empty characters, otherwise the
    # width multipliers below produce nothing.
    heavy_rule = "═" * 74
    sep = " " + "".join("─" * width for width in (22, 7, 6, 11, 10, 7, 6))

    base_pnl = strategies[0]["act_pnl"]
    base_wr = strategies[0]["wr_act"]
    n = strategies[0]["n"]
    print(f"\n{B}{heavy_rule}{X}")
    print(f"{B} EsoF Gate — 56-Day Gold Backtest ({n} clean alpha trades){X}")
    print(f" Baseline: WR={base_wr:.1f}% Net=${base_pnl:+,.0f} "
          f"Period: 2025-12-31 → 2026-02-25")
    print(f"{heavy_rule}{X}")

    # Gate results table
    NAMES = {"A": "A: LEV_SCALE", "B": "B: HARD_BLOCK", "C": "C: DOW_BLOCK",
             "D": "D: SESSION_BLOCK", "E": "E: COMBINED", "F": "F: S6_BUCKET"}
    hdr = (f"\n {'Strategy':<22}{'T_exec':>7}{'T_blk':>6}{'CF Net':>11}"
           f"{'ΔPnL':>10}{'WR_cf':>7}{'WR_Δ':>6}")
    print(f"{B}{hdr}{X}\n{sep}")
    for r in strategies:
        nm = NAMES.get(r["strategy"], r["strategy"])
        dpnl = r["delta"]
        dwr = r["wr_cf"] - r["wr_act"]
        c = G if dpnl > 0 else R
        wc = G if dwr > 0 else R
        print(f" {nm:<22}{r['n_exec']:>7}{r['n_blk']:>6}"
              f"{c}{r['cf_pnl']:>+11,.0f}{X}{c}{dpnl:>+10,.0f}{X}"
              f"{wc}{r['wr_cf']:>6.1f}%{X}{wc}{dwr:>+5.1f}pp{X}")
    print(sep)

    # EsoF uplift of strategy F over the flat S6 baseline, when F was run.
    f_r = next((r for r in strategies if r["strategy"] == "F"), None)
    if f_r is not None:
        fvs = f_r["cf_pnl"] - s6base["cf_pnl"]
        c = G if fvs > 0 else R
        print(f" {'F vs S6_BASE (EsoF uplift)':<22}{'':>7}{'':>6}{'':>11}"
              f"{c}{fvs:>+10,.0f}{X}{'':>7}")
    print(f" {'S6_BASE (flat, no EsoF)':<22}{'':>7}{'':>6}"
          f"{s6base['cf_pnl']:>+11,.0f}{s6base['delta']:>+10,.0f}"
          f"{s6base['wr_cf']:>6.1f}%│")

    # Overfitting guard
    print(f"\n{B} Overfitting Guard — Large-Sample Results{X}")
    print(f" {'─' * 68}")

    def orow(label, val, good=True, ref=None, fmt=".3f", suffix=""):
        # One labelled value row; green when (val < ref) matches `good`,
        # red otherwise, uncoloured when no reference is given.
        v = f"{val:{fmt}}{suffix}"
        if ref is not None:
            c = G if (val < ref) == good else R
        else:
            c = X
        print(f" {label:<42} {c}{v}{X}")

    print(" 1. Temporal Stability")
    # The n shown is half the full-sample cell size; `ov` carries no per-half counts.
    orow(f" NY_AFT WR H1 (n={ov['ny_n']//2})", ov["ny_h1_wr"], ref=ov["base"])
    orow(" NY_AFT WR H2", ov["ny_h2_wr"], ref=ov["base"])
    orow(f" Monday WR H1 (n={ov['mon_n']//2})", ov["mon_h1_wr"], ref=ov["base"])
    orow(" Monday WR H2", ov["mon_h2_wr"], ref=ov["base"])

    print("\n 2. Permutation p-values (n_perm=2000)")
    c_ny = G if ov["p_ny"] < 0.05 else Y if ov["p_ny"] < 0.15 else R
    c_mon = G if ov["p_mon"] < 0.05 else Y if ov["p_mon"] < 0.15 else R
    print(f" {' NY_AFT p-value':<42} {c_ny}{ov['p_ny']:.4f}{X} {D}(< 0.05 = significant){X}")
    print(f" {' Monday p-value':<42} {c_mon}{ov['p_mon']:.4f}{X}")

    # Plain string: an escaped quote inside an f-string expression is a
    # SyntaxError on Python versions before 3.12.
    print("\n 3. Effect sizes (Cohen's h)")
    for label, h, n_cell in [("NY_AFT", ov["h_ny"], ov["ny_n"]),
                             ("Monday", ov["h_mon"], ov["mon_n"]),
                             ("London", ov["h_ldn"], ov["ldn_n"])]:
        grade = "large" if h >= 0.8 else "medium" if h >= 0.5 else "small" if h >= 0.2 else "trivial"
        c = G if h >= 0.5 else Y if h >= 0.2 else R
        print(f" {' '+label:<42} {c}{h:.3f}{X} {D}{grade} (n={n_cell}){X}")

    print("\n 4. Bonferroni z-scores (35 cells, crit≈2.99)")
    crit = 2.99
    for label, z in [("NY_AFT", ov["z_ny"]), ("Monday", ov["z_mon"])]:
        c = G if z > crit else Y if z > 2.0 else R
        print(f" {' '+label:<42} {c}{z:.2f}{X}")

    print("\n 5. Bootstrap 95% CI on NY_AFT WR")
    lo, hi = ov["ny_wr_ci"]
    below = hi < ov["base"]
    c = G if below else R
    print(f" {' NY_AFT WR CI':<42} {c}[{lo:.3f}, {hi:.3f}]{X} "
          f"{D}({'below' if below else 'overlaps'} baseline {ov['base']:.3f}){X}")

    print("\n 6. Walk-forward: advisory score → H2 WR")
    predictive = ov["wf_top"] > ov["wf_bot"]
    c = G if predictive else R
    print(f" {' Top-quartile WR (H2)':<42} {c}{ov['wf_top']:.3f}{X}")
    print(f" {' Bot-quartile WR (H2)':<42} {c}{ov['wf_bot']:.3f}{X}")
    print(f" {' Predictive?':<42} {c}{'YES' if predictive else 'NO — overfit'}{X}")

    print("\n 7. Cross-bucket NY_AFT confound check")
    c = G if ov["n_cross_bucket"] >= 2 else Y if ov["n_cross_bucket"] == 1 else R
    print(f" {' Buckets confirming NY_AFT drag':<42} {c}{ov['n_cross_bucket']}{X} "
          f"{D}(≥ 2 = session-driven, not bucket-proxy){X}")
    print(f"\n{heavy_rule}\n")
# ── Main ───────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # CLI: --cached reuses the JSON trade cache instead of re-running the engine.
    parser = argparse.ArgumentParser()
    parser.add_argument("--cached", action="store_true",
                        help="Use cached trades (skip backtest)")
    cli = parser.parse_args()

    trades = load_or_run(use_cache=cli.cached)
    n_trades = len(trades)
    # Fewer than 100 trades is treated as a broken run; exit with status 1.
    if n_trades < 100:
        print(f"{R}Too few trades ({n_trades}) — check engine setup.{X}")
        sys.exit(1)

    print(f"\n[SIM] Running gate strategies on {n_trades} trades ...")
    strategy_results = [run_strategy(code, trades) for code in "ABCDEF"]
    s6base = run_s6_base(trades)

    print("[OV] Running overfitting guard ...")
    ov = run_overfitting_report(trades)
    print_full_report(strategy_results, s6base, ov)