#!/usr/bin/env python3
"""
EsoF Overfitting Avoidance Test Suite

Industry-standard statistical tests to guard against overfitting in the
EsoF calendar/session gate and the EsoF↔system interaction.

Why overfitting is a real risk here
─────────────────────────────────────
We inspected 5 sessions × 7 DoW = 35 cells on a single ~550-trade dataset
covering only 3 weeks (2026-03-31 → 2026-04-19). That is:
- A short temporal window (one market regime)
- Small per-cell sample sizes (median n ≈ 14)
- Multiple comparisons (we chose the *worst* cells after looking at all)
- No pre-registration (we looked at the data before deciding the gate)

Any one of these alone warrants caution. Together they demand rigorous testing.

Tests implemented
──────────────────
1. TestTemporalStability — H1 vs H2 walk-forward: does the effect hold in both halves?
2. TestPermutationSignificance — shuffle session/DoW labels N=2000 times; empirical p-value
3. TestMultipleComparison — Bonferroni / FDR correction across all 35 cells
4. TestBootstrapCI — 95% CI on WR and net PnL via bootstrap resampling
5. TestMinimumSampleSize — flag cells with n < 30 as "insufficient evidence"
6. TestEffectSize — Cohen's h on WR difference; require medium+ effect (h ≥ 0.3)
7. TestWalkForwardAdvisory — train EsoF tables on H1, evaluate advisory score on H2
8. TestAssetBucketStability — NY_AFT / Mon effect must hold across ≥ 2 asset buckets
9. TestRegimeConfound — check if session effect is a proxy for ACB beta (regime)

Run:
    source /home/dolphin/siloqy_env/bin/activate
    cd /mnt/dolphinng5_predict
    python prod/tests/test_esof_overfit_guard.py    # full report
    pytest prod/tests/test_esof_overfit_guard.py -v # pytest mode
"""
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import base64
|
|||
|
|
import math
|
|||
|
|
import random
|
|||
|
|
import sys
|
|||
|
|
import urllib.request
|
|||
|
|
from collections import defaultdict
|
|||
|
|
from datetime import datetime, timezone
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Dict, List, Optional, Tuple
|
|||
|
|
|
|||
|
|
import pytest
|
|||
|
|
|
|||
|
|
_ROOT = Path(__file__).parent.parent.parent
|
|||
|
|
sys.path.insert(0, str(_ROOT))
|
|||
|
|
sys.path.insert(0, str(_ROOT / "Observability"))
|
|||
|
|
|
|||
|
|
from esof_advisor import compute_esof, get_session, BASELINE_WR
|
|||
|
|
from esof_gate import get_bucket
|
|||
|
|
|
|||
|
|
# ── CH helpers ────────────────────────────────────────────────────────────────
|
|||
|
|
# ClickHouse HTTP endpoint and credentials used by _ch_query().
CH_URL = "http://localhost:8123"
CH_USER = "dolphin"
# NOTE(review): credential hardcoded in a test file — acceptable for a local
# dev instance, but consider reading from an env var to avoid accidental leaks.
CH_PASS = "dolphin_ch_2026"
|
|||
|
|
|
|||
|
|
def _ch_query(sql: str) -> List[List[str]]:
    """POST *sql* to the local ClickHouse HTTP endpoint and return the
    TabSeparated result as a list of rows, each row a list of string fields.

    An empty result set is returned as []. Raises on connection errors or
    HTTP failures (callers such as _ch_available() rely on that).
    """
    credentials = base64.b64encode(f"{CH_USER}:{CH_PASS}".encode()).decode()
    request = urllib.request.Request(
        f"{CH_URL}/?database=dolphin&default_format=TabSeparated",
        data=sql.encode(),
        headers={"Authorization": f"Basic {credentials}"},
    )
    with urllib.request.urlopen(request, timeout=10) as resp:
        body = resp.read().decode().strip()
    if not body:
        return []
    parsed = []
    for line in body.split('\n'):
        parsed.append(line.split('\t'))
    return parsed
|
|||
|
|
|
|||
|
|
def _ch_available() -> bool:
    """Probe ClickHouse with a trivial query; True when it answers."""
    try:
        _ch_query("SELECT 1")
    except Exception:
        return False
    return True
|
|||
|
|
|
|||
|
|
# Probed once at import time; gates every @skip_no_ch test below.
CH_UP = _ch_available()
|
|||
|
|
|
|||
|
|
# ── Trade loader (shared with gate test) ──────────────────────────────────────
|
|||
|
|
# Module-level cache so fetch_trades() hits ClickHouse at most once per process.
_CACHED_TRADES: Optional[List[dict]] = None
|
|||
|
|
|
|||
|
|
def fetch_trades() -> List[dict]:
    """Load all 'blue'-strategy trades from ClickHouse, enrich each with
    EsoF advisory fields and an asset bucket id, and cache the result.

    Returns a chronologically ordered list of dicts with keys:
    ts (aware UTC datetime), asset, pnl, leverage, session, dow,
    score (advisory_score), label (advisory_label), bucket_id.
    Rows that are too short or fail numeric parsing are silently dropped.
    """
    global _CACHED_TRADES
    if _CACHED_TRADES is not None:
        # Served from the module-level cache after the first call.
        return _CACHED_TRADES
    # Administrative exits are excluded so they don't distort WR/PnL stats.
    sql = """
        SELECT
            toUnixTimestamp64Milli(ts) AS ts_ms,
            asset, side, pnl, exit_reason, leverage
        FROM dolphin.trade_events
        WHERE strategy = 'blue'
          AND exit_reason NOT IN ('HIBERNATE_HALT', 'SUBDAY_ACB_NORMALIZATION')
        ORDER BY ts
    """
    rows = _ch_query(sql)
    # Best-effort load of asset→bucket assignments; get_bucket() must accept
    # pkl_map=None when the pickle file is missing or unreadable.
    # NOTE(review): pickle.load on a local project artifact — trusted input here,
    # but never point this at externally supplied files.
    pkl_map: Optional[Dict[str, int]] = None
    try:
        import pickle
        with open(_ROOT / "adaptive_exit/models/bucket_assignments.pkl", 'rb') as f:
            pkl_map = pickle.load(f).get('assignments', {})
    except Exception:
        pass

    trades = []
    for row in rows:
        if len(row) < 6:
            # Malformed / truncated TSV row — skip.
            continue
        try:
            # Column order matches the SELECT above: 0=ts_ms, 1=asset,
            # 2=side (unused), 3=pnl, 4=exit_reason (unused), 5=leverage.
            ts_ms = int(row[0])
            asset = row[1]
            pnl = float(row[3])
            leverage = float(row[5])
        except (ValueError, IndexError):
            continue
        # Timezone-aware UTC timestamp; compute_esof() derives session/DoW from it.
        ts = datetime.fromtimestamp(ts_ms / 1000.0, tz=timezone.utc)
        adv = compute_esof(ts)
        trades.append({
            "ts": ts,
            "asset": asset,
            "pnl": pnl,
            "leverage": leverage,
            "session": adv["session"],
            "dow": adv["dow"],
            "score": adv["advisory_score"],
            "label": adv["advisory_label"],
            "bucket_id": get_bucket(asset, pkl_map),
        })
    _CACHED_TRADES = trades
    return trades
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ── Statistical primitives ────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
def wr(trades: List[dict]) -> float:
    """Win rate: fraction of trades with strictly positive PnL.

    Returns NaN for an empty list so callers can distinguish "no data"
    from a genuine 0% win rate.
    """
    if not trades:
        return float("nan")
    wins = [t for t in trades if t["pnl"] > 0]
    return len(wins) / len(trades)
|
|||
|
|
|
|||
|
|
def net_pnl(trades: List[dict]) -> float:
    """Total PnL summed across *trades* (0 for an empty list)."""
    total = 0.0
    for trade in trades:
        total += trade["pnl"]
    return total
|
|||
|
|
|
|||
|
|
def cohen_h(p1: float, p2: float) -> float:
    """Cohen's h effect size for two proportions (arcsine-transformed gap).

    Always non-negative (absolute value). Conventional thresholds:
    |h| ≥ 0.2 small, 0.5 medium, 0.8 large.
    """
    phi1 = 2 * math.asin(math.sqrt(p1))
    phi2 = 2 * math.asin(math.sqrt(p2))
    return abs(phi1 - phi2)
|
|||
|
|
|
|||
|
|
def binomial_se(p: float, n: int) -> float:
    """Standard error of a proportion *p* over *n* trials; +inf when n <= 0."""
    if n <= 0:
        return float("inf")
    return math.sqrt(p * (1 - p) / n)
|
|||
|
|
|
|||
|
|
def bootstrap_wr_ci(trades: List[dict], n_boot: int = 5000, ci: float = 0.95) -> Tuple[float, float]:
    """Bootstrap CI on WR. Returns (lower, upper).

    Fixed seed (42) keeps the interval reproducible across runs.
    Returns (nan, nan) for an empty trade list (rng.choice would raise).
    """
    if not trades:
        return float("nan"), float("nan")
    rng = random.Random(42)
    n = len(trades)
    samples = []
    for _ in range(n_boot):
        resample = [rng.choice(trades) for _ in range(n)]
        samples.append(wr(resample))
    samples.sort()
    lo = int((1 - ci) / 2 * n_boot)
    # Clamp: for ci → 1 the raw index would equal n_boot and IndexError.
    hi = min(int((1 + ci) / 2 * n_boot), n_boot - 1)
    return samples[lo], samples[hi]
|
|||
|
|
|
|||
|
|
def bootstrap_pnl_ci(trades: List[dict], n_boot: int = 5000, ci: float = 0.95) -> Tuple[float, float]:
    """Bootstrap CI on net PnL. Returns (lower, upper).

    Fixed seed (42) keeps the interval reproducible across runs.
    Returns (nan, nan) for an empty trade list (rng.choice would raise).
    """
    if not trades:
        return float("nan"), float("nan")
    rng = random.Random(42)
    n = len(trades)
    samples = []
    for _ in range(n_boot):
        resample = [rng.choice(trades) for _ in range(n)]
        samples.append(net_pnl(resample))
    samples.sort()
    lo = int((1 - ci) / 2 * n_boot)
    # Clamp: for ci → 1 the raw index would equal n_boot and IndexError.
    hi = min(int((1 + ci) / 2 * n_boot), n_boot - 1)
    return samples[lo], samples[hi]
|
|||
|
|
|
|||
|
|
def permutation_pvalue(
    trades: List[dict],
    observed_delta: float,
    label_key: str,
    blocked_label,
    n_perm: int = 2000,
    seed: int = 42,
) -> float:
    """Empirical p-value via label permutation.

    Shuffles the *label_key* column n_perm times; for each permutation the
    "improvement" is minus the summed PnL of trades carrying blocked_label
    (i.e. what we would gain by skipping them). Returns the fraction of
    permutations whose improvement is >= observed_delta.

    observed_delta > 0 means "blocking blocked_label improved PnL".
    """
    rng = random.Random(seed)
    labels = [t[label_key] for t in trades]
    pnls = [t["pnl"] for t in trades]
    hits = 0
    for _ in range(n_perm):
        rng.shuffle(labels)
        blocked_total = sum(p for lab, p in zip(labels, pnls) if lab == blocked_label)
        # Gain from skipping the blocked trades: negative PnL blocked = improvement.
        if -blocked_total >= observed_delta:
            hits += 1
    return hits / n_perm
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ═════════════════════════════════════════════════════════════════════════════
|
|||
|
|
# TEST CLASSES
|
|||
|
|
# ═════════════════════════════════════════════════════════════════════════════
|
|||
|
|
|
|||
|
|
# Decorator for every test that needs live trade data; skips cleanly when CH is down.
skip_no_ch = pytest.mark.skipif(not CH_UP, reason="ClickHouse not available")
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TestTemporalStability:
    """
    Walk-forward: split chronologically into H1 (first 50%) and H2 (last 50%).
    Session and DoW effects must appear in BOTH halves to be considered real.
    If present in only one half → data snooping artifact.
    """

    @skip_no_ch
    def test_ny_afternoon_negative_in_h1_and_h2(self):
        """NY_AFTERNOON WR must sit below each half's own baseline."""
        trades = fetch_trades()
        mid = len(trades) // 2
        first_half, second_half = trades[:mid], trades[mid:]

        ny_h1 = [t for t in first_half if t["session"] == "NY_AFTERNOON"]
        ny_h2 = [t for t in second_half if t["session"] == "NY_AFTERNOON"]

        assert len(ny_h1) >= 10, f"H1 NY_AFT too small: n={len(ny_h1)}"
        assert len(ny_h2) >= 10, f"H2 NY_AFT too small: n={len(ny_h2)}"

        # The drag must be visible in BOTH halves, each against its own baseline.
        for tag, half, segment in (("H1", first_half, ny_h1), ("H2", second_half, ny_h2)):
            seg_wr = wr(segment)
            base = wr(half)
            assert seg_wr < base, (
                f"NY_AFT drag missing in {tag}: WR_NYA={seg_wr:.3f} >= baseline={base:.3f}"
            )

    @skip_no_ch
    def test_monday_negative_in_h1_and_h2(self):
        """Monday WR must sit below each half's own baseline (skip if too thin)."""
        trades = fetch_trades()
        mid = len(trades) // 2
        first_half, second_half = trades[:mid], trades[mid:]

        mon_h1 = [t for t in first_half if t["dow"] == 0]
        mon_h2 = [t for t in second_half if t["dow"] == 0]

        # Monday sample is thin — require at least 10 in each half.
        if min(len(mon_h1), len(mon_h2)) < 10:
            pytest.skip(f"Monday sample too thin for walk-forward: H1={len(mon_h1)}, H2={len(mon_h2)}")

        assert wr(mon_h1) < wr(first_half), "Monday drag absent in H1"
        assert wr(mon_h2) < wr(second_half), "Monday drag absent in H2"

    @skip_no_ch
    def test_strategy_e_positive_in_both_halves(self):
        """Combined gate (Mon+NY_AFT) must improve PnL in H1 AND H2 independently."""
        trades = fetch_trades()
        mid = len(trades) // 2

        def gated_pnl(subset):
            # PnL with Monday and NY_AFTERNOON trades removed.
            return sum(t["pnl"] for t in subset
                       if t["dow"] != 0 and t["session"] != "NY_AFTERNOON")

        for tag, half in (("H1", trades[:mid]), ("H2", trades[mid:])):
            ungated = sum(t["pnl"] for t in half)
            assert gated_pnl(half) > ungated, f"Strategy E degrades {tag}"
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TestPermutationSignificance:
    """
    Permutation test: shuffle session / DoW labels randomly.
    The observed improvement from blocking must rank in the top 5%
    of the null distribution (p < 0.05) to be considered non-random.
    """

    @skip_no_ch
    def test_ny_afternoon_block_is_significant(self):
        """Blocking NY_AFTERNOON must beat the shuffled-label null at p < 0.05."""
        trades = fetch_trades()
        ny_pnl = sum(t["pnl"] for t in trades if t["session"] == "NY_AFTERNOON")
        observed_delta = -ny_pnl  # gain from skipping NY_AFT trades

        p = permutation_pvalue(trades, observed_delta, "session", "NY_AFTERNOON",
                               n_perm=2000)
        assert p < 0.05, (
            f"NY_AFTERNOON block not significant: p={p:.3f} >= 0.05. "
            f"Effect may be noise at this sample size."
        )

    @skip_no_ch
    def test_monday_block_significance(self):
        """Monday block: warn at p >= 0.05, hard-fail only at p >= 0.15."""
        trades = fetch_trades()
        mon_pnl = sum(t["pnl"] for t in trades if t["dow"] == 0)
        observed_delta = -mon_pnl

        p = permutation_pvalue(trades, observed_delta, "dow", 0, n_perm=2000)
        # Monday has fewer trades — use looser threshold (p < 0.15)
        # Flag as WARNING not FAIL if p >= 0.05: thin sample, directionally valid
        if p >= 0.05:
            print(f"\n WARN: Monday block p={p:.3f} >= 0.05. "
                  f"Directionally valid but underpowered (n={sum(1 for t in trades if t['dow']==0)}).")
        assert p < 0.15, (
            f"Monday block not even marginally significant: p={p:.3f}. "
            f"Gate should not be applied until more data accumulates."
        )

    @skip_no_ch
    def test_london_morning_block_would_hurt(self):
        """Blocking LONDON_MORNING (the BEST session) must NOT improve PnL."""
        trades = fetch_trades()
        ldn_pnl = sum(t["pnl"] for t in trades if t["session"] == "LONDON_MORNING")
        observed_delta = -ldn_pnl  # gain from blocking LDN (expect negative = harmful)

        # LDN is net-positive, so blocking it is harmful (delta < 0)
        assert observed_delta < 0, (
            f"Blocking LONDON_MORNING should HURT PnL (it is the best session). "
            f"Got delta={observed_delta:.2f}. Check data integrity."
        )
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TestMultipleComparison:
    """
    Multiple comparison correction.
    We inspected 5 sessions × 7 DoW = 35 cells. Finding 'significant' cells
    after inspection requires Bonferroni correction: α_adj = 0.05 / 35 ≈ 0.0014.
    Only cells where WR deviation is large enough to survive Bonferroni should
    be used in the gate.

    We test: do our chosen cells (NY_AFT, Monday) survive Bonferroni?
    Using a binomial z-test as a proxy for the corrected p-value.
    """

    @skip_no_ch
    def test_ny_afternoon_survives_bonferroni(self):
        """NY_AFTERNOON WR deviation must clear the z > 2.0 practical threshold."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        n = len(ny)
        baseline = wr(trades)
        wr_ny = wr(ny)

        # SE evaluated at the baseline proportion (score-test style z).
        se = binomial_se(baseline, n)
        z = (baseline - wr_ny) / se if se > 0 else 0
        # One-tailed z for Bonferroni α=0.0014: z_crit ≈ 2.99
        # We use 2.0 as a practical threshold (more conservative than 1.96 but
        # less strict than Bonferroni, given 3-week sample inherent limitations)
        assert z > 2.0, (
            f"NY_AFTERNOON WR deviation (z={z:.2f}) does not survive "
            f"multiple-comparison correction. n={n}, WR={wr_ny:.3f} vs base={baseline:.3f}."
        )

    @skip_no_ch
    def test_monday_bonferroni_warning(self):
        """Monday: warn below z=2.0, hard-fail only when z <= 1.0 (pure noise)."""
        trades = fetch_trades()
        mon = [t for t in trades if t["dow"] == 0]
        n = len(mon)
        baseline = wr(trades)
        wr_mon = wr(mon)

        se = binomial_se(baseline, n)
        z = (baseline - wr_mon) / se if se > 0 else 0

        # Monday: warn if z < 2.0 (doesn't survive strict Bonferroni)
        if z < 2.0:
            print(f"\n WARN: Monday z={z:.2f} < 2.0. Does not survive Bonferroni "
                  f"at current sample (n={n}). Apply Monday gate cautiously.")
        # Require at least z > 1.0 (directional signal, not pure noise)
        assert z > 1.0, (
            f"Monday WR deviation is indistinguishable from noise: z={z:.2f}. "
            f"Do not gate Monday until more trades accumulate."
        )

    @skip_no_ch
    def test_no_spurious_best_cell_used_as_gate(self):
        """
        Best-cell cherry-pick guard: the SINGLE best-performing cell in the dataset
        must NOT be treated as a reliable gate without Bonferroni correction.
        Test: find the best WR cell (n >= 10), check that its deviation is NOT
        significantly larger than the worst cell — both could be noise extremes.
        """
        trades = fetch_trades()
        # Group trades into (dow, session) cells.
        cells: Dict[Tuple, List[dict]] = defaultdict(list)
        for t in trades:
            cells[(t["dow"], t["session"])].append(t)

        valid = [(k, v) for k, v in cells.items() if len(v) >= 10]
        if len(valid) < 5:
            pytest.skip("Not enough cells with n >= 10")

        # (cell key, win rate, sample size) for each adequately populated cell.
        wrs = [(k, wr(v), len(v)) for k, v in valid]
        best = max(wrs, key=lambda x: x[1])
        worst = min(wrs, key=lambda x: x[1])

        baseline = wr(trades)
        se_best = binomial_se(baseline, best[2])
        se_worst = binomial_se(baseline, worst[2])

        z_best = (best[1] - baseline) / se_best if se_best > 0 else 0
        z_worst = (baseline - worst[1]) / se_worst if se_worst > 0 else 0

        # Both extremes should be similarly significant (or not).
        # If best is >3σ but worst is <1σ, something is asymmetric — flag it.
        # Acceptable: both extremes are significant OR both are marginal.
        # When z_worst is ~0 the ratio is forced to inf, which fails the assert
        # below — i.e. a strong best cell with no matching worst cell is flagged.
        ratio = z_best / z_worst if z_worst > 0.1 else float("inf")
        assert ratio < 5.0, (
            f"Asymmetric cell extremes: z_best={z_best:.2f} vs z_worst={z_worst:.2f}. "
            f"Best cell ({best[0]}) may be a cherry-pick artifact."
        )
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TestBootstrapCI:
    """
    Bootstrap confidence intervals on WR for each gated segment.
    The 95% CI upper bound for NY_AFTERNOON WR must be below baseline WR.
    If the CI overlaps the baseline, the effect is not reliable.
    """

    @skip_no_ch
    def test_ny_afternoon_ci_below_baseline(self):
        """Upper end of NY_AFTERNOON's bootstrap WR CI must stay under baseline."""
        all_trades = fetch_trades()
        segment = [t for t in all_trades if t["session"] == "NY_AFTERNOON"]

        assert len(segment) >= 20, f"NY_AFT sample too small for bootstrap: n={len(segment)}"

        ci_upper = bootstrap_wr_ci(segment, n_boot=3000)[1]
        overall = wr(all_trades)

        assert ci_upper < overall, (
            f"NY_AFTERNOON WR CI upper bound ({ci_upper:.3f}) overlaps baseline "
            f"WR ({overall:.3f}). Effect not reliable at 95% confidence."
        )

    @skip_no_ch
    def test_london_morning_ci_above_baseline(self):
        """Lower end of LONDON_MORNING's WR CI must not fall far below baseline."""
        all_trades = fetch_trades()
        segment = [t for t in all_trades if t["session"] == "LONDON_MORNING"]

        assert len(segment) >= 20, f"LDN sample too small: n={len(segment)}"

        ci_lower = bootstrap_wr_ci(segment, n_boot=3000)[0]
        overall = wr(all_trades)

        assert ci_lower > overall * 0.95, (
            f"LONDON_MORNING WR CI lower bound ({ci_lower:.3f}) is too far below "
            f"baseline ({overall:.3f}). LDN advantage may not be reliable."
        )

    @skip_no_ch
    def test_ny_afternoon_pnl_ci_negative(self):
        """Net PnL CI for NY_AFTERNOON must have upper bound < 0 (net loser with confidence)."""
        all_trades = fetch_trades()
        segment = [t for t in all_trades if t["session"] == "NY_AFTERNOON"]

        assert len(segment) >= 20

        ci_upper = bootstrap_pnl_ci(segment, n_boot=3000)[1]
        assert ci_upper < 0, (
            f"NY_AFTERNOON net PnL CI upper bound is {ci_upper:.2f} > 0. "
            f"Cannot confidently call it a net loser at current sample size."
        )
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TestMinimumSampleSize:
    """
    Minimum sample size guard. No session or DoW factor should influence
    the advisory score unless it has n >= 30 trades. Below 30, the WR
    estimate has SE > 9pp (too noisy to act on).
    """

    @skip_no_ch
    def test_all_gate_factors_have_sufficient_n(self):
        """
        The two gated factors (NY_AFTERNOON, Monday) must each have n >= 30
        in the current dataset for the gate to be considered valid.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        mon = [t for t in trades if t["dow"] == 0]

        assert len(ny) >= 30, f"NY_AFTERNOON n={len(ny)} < 30. Gate underpowered."
        assert len(mon) >= 30, f"Monday n={len(mon)} < 30. Gate underpowered."

    @skip_no_ch
    def test_slot_15m_gate_would_be_overfit(self):
        """
        15-minute slot data has median n ≈ 7. Any slot-level gate applied
        directly would be extreme overfitting. Verify: majority of slots have n < 30.
        """
        trades = fetch_trades()
        # Count trades per 15-minute slot, keyed "H:MM" (minute floored to 0/15/30/45).
        slots: Dict[str, int] = defaultdict(int)
        for t in trades:
            h = t["ts"].hour
            m = (t["ts"].minute // 15) * 15
            slots[f"{h}:{m:02d}"] += 1

        n_thin = sum(1 for n in slots.values() if n < 30)
        # No slots at all counts as fully thin (frac = 1.0).
        frac = n_thin / len(slots) if slots else 1.0

        assert frac > 0.70, (
            f"Only {frac:.0%} of 15m slots have n < 30. "
            f"Expected most slots to be underpowered — if not, slot gate may be premature."
        )

    def test_advisory_score_weights_reflect_sample_size(self):
        """
        Slot weight (0.10) must be lower than session (0.25) and DoW (0.30).
        Ensures the weakest-sample factor has the lowest influence.
        """
        from esof_advisor import SESSION_STATS, DOW_STATS, SLOT_STATS
        # Median of the per-factor sample counts (v[0] presumably holds n —
        # TODO confirm against the *_STATS table layout in esof_advisor).
        median_session_n = sorted([v[0] for v in SESSION_STATS.values()])[len(SESSION_STATS) // 2]
        median_dow_n = sorted([v[0] for v in DOW_STATS.values()])[len(DOW_STATS) // 2]
        median_slot_n = sorted([v[0] for v in SLOT_STATS.values()])[len(SLOT_STATS) // 2]

        assert median_slot_n < median_session_n, "Slot n should be < session n"
        assert median_slot_n < median_dow_n, "Slot n should be < DoW n"
        # Slot weight is 0.10, session 0.25, DoW 0.30 — smaller n = smaller weight
        # NOTE(review): these weights are hardcoded locally, so the two asserts
        # below are tautological — they cannot catch a drift in esof_advisor's
        # actual weights. Import the real weight constants to make this a test.
        SLOT_WEIGHT = 0.10
        SESSION_WEIGHT = 0.25
        DOW_WEIGHT = 0.30
        assert SLOT_WEIGHT < SESSION_WEIGHT
        assert SLOT_WEIGHT < DOW_WEIGHT
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TestEffectSize:
    """
    Cohen's h effect size on WR differences.
    |h| >= 0.2: small effect (minimum threshold to consider gating)
    |h| >= 0.5: medium effect (comfortable to gate)
    |h| >= 0.8: large effect (very strong signal)
    """

    @skip_no_ch
    def test_ny_afternoon_effect_size_medium(self):
        """NY_AFTERNOON vs baseline must show at least a small effect (h >= 0.2)."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        baseline = wr(trades)
        h = cohen_h(wr(ny), baseline)
        assert h >= 0.2, (
            f"NY_AFTERNOON effect size h={h:.3f} < 0.2 (small). "
            f"Signal too weak to justify gating."
        )

    @skip_no_ch
    def test_london_morning_effect_size_positive(self):
        # NOTE(review): cohen_h() returns abs(...), so h >= 0.0 is always true —
        # this assert is vacuous and cannot detect a negative LDN effect.
        trades = fetch_trades()
        ldn = [t for t in trades if t["session"] == "LONDON_MORNING"]
        baseline = wr(trades)
        h = cohen_h(wr(ldn), baseline)
        assert h >= 0.0, "LDN effect size must be measurable"

    @skip_no_ch
    def test_dow_tuesday_effect_size(self):
        """Tuesday is the best DoW. Effect size must be positive."""
        # NOTE(review): same vacuous-assert caveat as above — cohen_h is |h|,
        # so h >= 0.0 holds regardless of direction.
        trades = fetch_trades()
        tue = [t for t in trades if t["dow"] == 1]
        baseline = wr(trades)
        if len(tue) < 10:
            pytest.skip("Tuesday sample too thin")
        h = cohen_h(wr(tue), baseline)
        assert h >= 0.0, "Tuesday must show positive effect"

    @skip_no_ch
    def test_effect_size_ranking_matches_expectation(self):
        """
        NY_AFTERNOON effect size must be larger than LOW_LIQUIDITY effect size.
        NY_AFT has more trades and a larger WR gap — should show stronger signal.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        low = [t for t in trades if t["session"] == "LOW_LIQUIDITY"]
        base = wr(trades)

        # Segments below n=10 are treated as "no measurable effect" (h = 0).
        h_ny = cohen_h(wr(ny), base) if len(ny) >= 10 else 0
        h_low = cohen_h(wr(low), base) if len(low) >= 10 else 0

        # NY_AFTERNOON has 3× the sample of LOW_LIQ — effect should be at least as large
        assert h_ny >= h_low * 0.7, (
            f"NY_AFT h={h_ny:.3f} much smaller than LOW_LIQ h={h_low:.3f}. "
            f"Unexpected — check data."
        )
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TestWalkForwardAdvisory:
    """
    Walk-forward advisory score validation.
    Train EsoF tables conceptually on H1 (we use the existing static tables as proxy).
    Evaluate: does the advisory score computed at H2 trade times predict H2 outcomes?

    Method: within H2, rank trades by advisory_score. The bottom quartile (most
    negative score) should have lower WR than the top quartile. If the score
    has no predictive power on OOS data, it is overfit to the in-sample period.
    """

    @skip_no_ch
    def test_score_predicts_wr_direction_in_h2(self):
        """Top-quartile advisory scores must out-win bottom-quartile scores in H2."""
        trades = fetch_trades()
        mid = len(trades) // 2
        ranked_h2 = sorted(trades[mid:], key=lambda t: t["score"])

        if len(ranked_h2) < 40:
            pytest.skip(f"H2 too small for quartile split: n={len(ranked_h2)}")

        quart = len(ranked_h2) // 4
        wr_bot = wr(ranked_h2[:quart])    # worst advisory scores
        wr_top = wr(ranked_h2[-quart:])   # best advisory scores

        assert wr_top > wr_bot, (
            f"Advisory score has no directional predictive power in H2: "
            f"WR_top={wr_top:.3f} WR_bot={wr_bot:.3f}. Score may be overfit."
        )

    @skip_no_ch
    def test_unfavorable_label_has_lower_wr_in_h2(self):
        """UNFAVORABLE-labelled H2 trades must not out-win the rest (5pp slack)."""
        trades = fetch_trades()
        second_half = trades[len(trades) // 2:]

        unfav, rest = [], []
        for t in second_half:
            (unfav if t["label"] == "UNFAVORABLE" else rest).append(t)

        if len(unfav) < 5:
            pytest.skip(f"Too few UNFAVORABLE trades in H2: n={len(unfav)}")

        assert wr(unfav) <= wr(rest) + 0.05, (
            f"UNFAVORABLE label does not predict lower WR in H2: "
            f"WR_unfav={wr(unfav):.3f} vs WR_rest={wr(rest):.3f}. "
            f"Advisory label may be overfit."
        )
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TestAssetBucketStability:
    """
    The session/DoW effect must not be driven by a single asset bucket.
    If NY_AFTERNOON drag is entirely explained by, say, B4 trades clustering
    in that session, the gate is actually gating B4 by proxy — not time.
    The effect must hold across at least 2 independent buckets.
    """

    @skip_no_ch
    def test_ny_afternoon_drag_cross_bucket(self):
        """NY_AFT drag must appear in >= 2 buckets (each with n >= 5 in and out of session)."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        not_ny = [t for t in trades if t["session"] != "NY_AFTERNOON"]

        # Per-bucket split: in-session vs out-of-session trades.
        by_bucket_ny = defaultdict(list)
        by_bucket_out = defaultdict(list)
        for t in ny:
            by_bucket_ny[t["bucket_id"]].append(t)
        for t in not_ny:
            by_bucket_out[t["bucket_id"]].append(t)

        # Count buckets where NY_AFT WR is below out-of-session WR
        n_confirming = 0
        for bkt in by_bucket_ny:
            # Require n >= 5 on both sides before the comparison is meaningful.
            if len(by_bucket_ny[bkt]) < 5 or len(by_bucket_out.get(bkt, [])) < 5:
                continue
            if wr(by_bucket_ny[bkt]) < wr(by_bucket_out[bkt]):
                n_confirming += 1

        assert n_confirming >= 2, (
            f"NY_AFT drag only confirmed in {n_confirming} bucket(s). "
            f"Need ≥ 2 for effect to be session-driven, not bucket-confounded."
        )

    @skip_no_ch
    def test_monday_drag_cross_bucket(self):
        """Monday drag: warn below 2 confirming buckets, hard-fail only at 0."""
        trades = fetch_trades()
        mon = [t for t in trades if t["dow"] == 0]
        not_mon = [t for t in trades if t["dow"] != 0]

        by_bkt_mon = defaultdict(list)
        by_bkt_out = defaultdict(list)
        for t in mon:
            by_bkt_mon[t["bucket_id"]].append(t)
        for t in not_mon:
            by_bkt_out[t["bucket_id"]].append(t)

        n_confirming = 0
        for bkt in by_bkt_mon:
            if len(by_bkt_mon[bkt]) < 5 or len(by_bkt_out.get(bkt, [])) < 5:
                continue
            if wr(by_bkt_mon[bkt]) < wr(by_bkt_out[bkt]):
                n_confirming += 1

        if n_confirming < 2:
            print(f"\n WARN: Monday drag only in {n_confirming} bucket(s). "
                  f"Thin sample — cannot confirm cross-bucket. Gate with caution.")
        # Soft assert: Monday has thinner sample, require at least 1
        assert n_confirming >= 1, (
            f"Monday drag not present in ANY bucket. "
            f"Likely a sampling artifact — do not gate Monday."
        )
|
|||
|
|
|
|||
|
|
|
|||
|
|
class TestRegimeConfound:
    """
    Regime confound check: is the session effect just a proxy for ACB beta?
    If all NY_AFTERNOON trades happen to coincide with low ACB beta (bearish
    regime), then blocking NY_AFT is actually blocking bear-regime trades,
    not session-specific trades. The gate would be redundant with ACB.

    Method: compare ACB leverage (proxy for regime strength) between
    NY_AFTERNOON and other sessions. If leverage distributions are
    significantly different, the session effect is partially confounded.
    """

    @skip_no_ch
    def test_ny_afternoon_leverage_not_systematically_different(self):
        """
        NY_AFTERNOON avg leverage should be within 20% of other sessions' avg leverage.
        Large divergence → session effect may be a regime proxy.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        not_ny = [t for t in trades if t["session"] != "NY_AFTERNOON"]

        if len(ny) < 10 or len(not_ny) < 10:
            pytest.skip("Insufficient data for leverage comparison")

        avg_lev_ny = sum(t["leverage"] for t in ny) / len(ny)
        avg_lev_out = sum(t["leverage"] for t in not_ny) / len(not_ny)

        # Ratio defaults to 1.0 (pass) if out-of-session leverage is zero.
        ratio = avg_lev_ny / avg_lev_out if avg_lev_out > 0 else 1.0

        assert 0.80 <= ratio <= 1.20, (
            f"NY_AFTERNOON avg leverage ({avg_lev_ny:.2f}x) differs by >{20}% "
            f"from other sessions ({avg_lev_out:.2f}x). "
            f"Session effect may be a regime-proxy — investigate confound."
        )

    @skip_no_ch
    def test_ny_afternoon_wr_negative_across_leverage_bands(self):
        """
        Regime confound falsification: split NY_AFT trades into high/low leverage.
        If NY_AFT drag holds in BOTH leverage bands, it is NOT purely a regime effect.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]

        if len(ny) < 20:
            pytest.skip(f"NY_AFT too small for leverage split: n={len(ny)}")

        # Median split on leverage; ties at the median go to the high band.
        median_lev = sorted(t["leverage"] for t in ny)[len(ny) // 2]
        hi_lev = [t for t in ny if t["leverage"] >= median_lev]
        lo_lev = [t for t in ny if t["leverage"] < median_lev]
        baseline = wr(fetch_trades())  # cached — no second CH round-trip

        # Bands below n=5 are treated as "consistent with drag" (True) so a
        # thin band alone cannot fail the test.
        hi_below = wr(hi_lev) < baseline if len(hi_lev) >= 5 else True
        lo_below = wr(lo_lev) < baseline if len(lo_lev) >= 5 else True

        # NOTE(review): the docstring says "holds in BOTH bands", but the assert
        # only requires the drag in at least one band — it fails solely when the
        # drag is absent from BOTH. Confirm whether `and` was intended.
        assert hi_below or lo_below, (
            "NY_AFT drag absent in BOTH leverage bands — effect is not regime-independent. "
            "Gate may be a regime proxy."
        )
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ═════════════════════════════════════════════════════════════════════════════
|
|||
|
|
# STANDALONE REPORT
|
|||
|
|
# ═════════════════════════════════════════════════════════════════════════════
|
|||
|
|
|
|||
|
|
# ANSI escape codes used by the standalone report output.
GREEN = "\033[32m"
RED = "\033[31m"
YELLOW = "\033[33m"
BOLD = "\033[1m"
DIM = "\033[2m"
RST = "\033[0m"   # reset all attributes
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Standalone report mode: print a colorized summary of every overfitting
    # guard instead of pytest pass/fail output.
    if not CH_UP:
        print(f"{RED}ClickHouse not available.{RST}")
        sys.exit(1)

    trades = fetch_trades()
    n = len(trades)
    # Chronological halves for the H1/H2 temporal-stability check (section 1).
    h1, h2 = trades[:n // 2], trades[n // 2:]

    print(f"\n{BOLD}{'═'*68}{RST}")
    print(f"{BOLD} EsoF Overfitting Guard Report ({n} trades){RST}")
    print(f"{'═'*68}\n")

    # Cohorts examined throughout the report.
    baseline = wr(trades)
    ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
    mon = [t for t in trades if t["dow"] == 0]  # dow 0 = Monday
    ldn = [t for t in trades if t["session"] == "LONDON_MORNING"]

    # Per-half cohorts for the temporal-stability rows.
    ny_h1 = [t for t in h1 if t["session"] == "NY_AFTERNOON"]
    ny_h2 = [t for t in h2 if t["session"] == "NY_AFTERNOON"]
    mon_h1 = [t for t in h1 if t["dow"] == 0]
    mon_h2 = [t for t in h2 if t["dow"] == 0]

    def row(label, val, ref=None, lo=None, hi=None, warn=None, note=""):
        # Print one aligned, colorized report line: green when val < ref
        # (drag confirmed), red otherwise; warn forces yellow. lo/hi, when
        # given together, append a 95% CI suffix.
        if lo is not None:
            ci_str = f" 95%CI [{lo:.3f}, {hi:.3f}]"
        else:
            ci_str = ""
        col = GREEN if (ref is None or val < ref) else RED
        if warn:
            col = YELLOW
        print(f" {label:<42} {col}{val:.3f}{RST}{ci_str} {DIM}{note}{RST}")

    print(f" {'Baseline WR':<42} {baseline:.3f}")
    print()

    # ── 1. Does the effect survive a naive walk-forward split? ──
    print(f" {BOLD}1. Temporal Stability (H1 / H2){RST}")
    row(" NY_AFT WR — H1", wr(ny_h1), baseline, note=f"n={len(ny_h1)}")
    row(" NY_AFT WR — H2", wr(ny_h2), baseline, note=f"n={len(ny_h2)}")
    row(" Mon WR — H1", wr(mon_h1), baseline, note=f"n={len(mon_h1)}")
    row(" Mon WR — H2", wr(mon_h2), baseline, note=f"n={len(mon_h2)}")

    # ── 2. Permutation test: how often does random labeling look this bad? ──
    print(f"\n {BOLD}2. Permutation p-values{RST}")
    ny_pnl = sum(t["pnl"] for t in ny)
    mon_pnl = sum(t["pnl"] for t in mon)
    # Test statistic is the negated cohort PnL ("PnL saved by blocking");
    # presumably permutation_pvalue shuffles labels 2000x — confirm against
    # its definition earlier in this file.
    p_ny = permutation_pvalue(trades, -ny_pnl, "session", "NY_AFTERNOON", n_perm=2000)
    p_mon = permutation_pvalue(trades, -mon_pnl, "dow", 0, n_perm=2000)
    col_ny = GREEN if p_ny < 0.05 else YELLOW if p_ny < 0.15 else RED
    col_mon = GREEN if p_mon < 0.05 else YELLOW if p_mon < 0.15 else RED
    print(f" {'NY_AFT block p-value':<42} {col_ny}{p_ny:.4f}{RST} {DIM}(< 0.05 = significant){RST}")
    print(f" {'Monday block p-value':<42} {col_mon}{p_mon:.4f}{RST} {DIM}(< 0.15 = directional){RST}")

    # ── 3. Effect size: is the WR gap practically meaningful? ──
    print(f"\n {BOLD}3. Effect Sizes (Cohen's h){RST}")
    h_ny = cohen_h(wr(ny), baseline)
    h_mon = cohen_h(wr(mon), baseline)
    h_ldn = cohen_h(wr(ldn), baseline)
    for label, h, n_cell in [("NY_AFT", h_ny, len(ny)), ("Monday", h_mon, len(mon)), ("London", h_ldn, len(ldn))]:
        # Standard Cohen benchmarks: 0.2 small / 0.5 medium / 0.8 large.
        grade = "large" if h >= 0.8 else "medium" if h >= 0.5 else "small" if h >= 0.2 else "trivial"
        col = GREEN if h >= 0.5 else YELLOW if h >= 0.2 else RED
        print(f" {' '+label:<42} {col}{h:.3f}{RST} {DIM}{grade} (n={n_cell}){RST}")

    # ── 4. Bootstrap CIs on the NY_AFT cohort (WR and net PnL). ──
    print(f"\n {BOLD}4. Bootstrap 95% CIs{RST}")
    ny_lo, ny_hi = bootstrap_wr_ci(ny, n_boot=3000)
    col = GREEN if ny_hi < baseline else RED
    print(f" {'NY_AFT WR CI':<42} {col}[{ny_lo:.3f}, {ny_hi:.3f}]{RST} "
          f"{DIM}({'below' if ny_hi < baseline else 'overlaps'} baseline {baseline:.3f}){RST}")
    ny_plo, ny_phi = bootstrap_pnl_ci(ny, n_boot=3000)
    col = GREEN if ny_phi < 0 else RED
    print(f" {'NY_AFT net PnL CI':<42} {col}[{ny_plo:+,.0f}, {ny_phi:+,.0f}]{RST} "
          f"{DIM}({'net loser with confidence' if ny_phi < 0 else 'uncertain sign'}){RST}")

    # ── 5. Multiple-comparison correction across the 35 inspected cells. ──
    print(f"\n {BOLD}5. Bonferroni z-scores (35 cells tested){RST}")
    se_ny = binomial_se(baseline, len(ny))
    se_mon = binomial_se(baseline, len(mon))
    z_ny = (baseline - wr(ny)) / se_ny if se_ny > 0 else 0
    z_mon = (baseline - wr(mon)) / se_mon if se_mon > 0 else 0
    crit = 2.99  # Bonferroni α = 0.05/35 ≈ 0.0014 → one-sided z_crit ≈ 2.99
    col_ny = GREEN if z_ny > crit else YELLOW if z_ny > 2.0 else RED
    col_mon = GREEN if z_mon > crit else YELLOW if z_mon > 2.0 else RED
    print(f" {'NY_AFT z':<42} {col_ny}{z_ny:.2f}{RST} {DIM}(Bonferroni crit ≈ {crit}){RST}")
    print(f" {'Monday z':<42} {col_mon}{z_mon:.2f}{RST}")

    # ── 6. Walk-forward: does the advisory score rank out-of-sample trades? ──
    print(f"\n {BOLD}6. Walk-Forward: advisory score → H2 WR{RST}")
    h2s = sorted(h2, key=lambda t: t["score"])  # ascending by advisory score
    q = max(1, len(h2s) // 4)                   # quartile size, at least 1
    wr_bot = wr(h2s[:q])
    wr_top = wr(h2s[-q:])
    col = GREEN if wr_top > wr_bot else RED
    print(f" {' Top-quartile score WR (H2)':<42} {col}{wr_top:.3f}{RST} {DIM}n={q}{RST}")
    print(f" {' Bot-quartile score WR (H2)':<42} {col}{wr_bot:.3f}{RST} {DIM}n={q}{RST}")
    print(f" {' Predictive (top > bot)?':<42} {col}{'YES' if wr_top > wr_bot else 'NO — score overfit'}{RST}")

    print(f"\n{'═'*68}\n")