Files
DOLPHIN/prod/tests/test_esof_overfit_guard.py

872 lines
35 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
EsoF Overfitting Avoidance Test Suite
Industry-standard statistical tests to guard against overfitting in the
EsoF calendar/session gate and the EsoFsystem interaction.
Why overfitting is a real risk here
We inspected 5 sessions × 7 DoW = 35 cells on a single ~550-trade dataset
covering only 3 weeks (2026-03-31 → 2026-04-19). That is:
- A short temporal window (one market regime)
- Small per-cell sample sizes (median n ≈ 14)
- Multiple comparisons (we chose the *worst* cells after looking at all)
- No pre-registration (we looked at the data before deciding the gate)
Any one of these alone warrants caution. Together they demand rigorous testing.
Tests implemented
1. TestTemporalStability — H1 vs H2 walk-forward: does the effect hold in both halves?
2. TestPermutationSignificance — shuffle session/DoW labels N=2000 times; empirical p-value
3. TestMultipleComparison — Bonferroni / FDR correction across all 35 cells
4. TestBootstrapCI — 95% CI on WR and net PnL via bootstrap resampling
5. TestMinimumSampleSize — flag cells with n < 30 as "insufficient evidence"
6. TestEffectSize — Cohen's h on WR difference; require medium+ effect (h ≥ 0.3)
7. TestWalkForwardAdvisory — train EsoF tables on H1, evaluate advisory score on H2
8. TestAssetBucketStability — NY_AFT / Mon effect must hold across 2 asset buckets
9. TestRegimeConfound — check if session effect is a proxy for ACB beta (regime)
Run:
source /home/dolphin/siloqy_env/bin/activate
cd /mnt/dolphinng5_predict
python prod/tests/test_esof_overfit_guard.py # full report
pytest prod/tests/test_esof_overfit_guard.py -v # pytest mode
"""
from __future__ import annotations
import base64
import math
import random
import sys
import urllib.request
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import pytest
_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(_ROOT))
sys.path.insert(0, str(_ROOT / "Observability"))
from esof_advisor import compute_esof, get_session, BASELINE_WR
from esof_gate import get_bucket
# ── CH helpers ────────────────────────────────────────────────────────────────
# ClickHouse HTTP endpoint and credentials used by _ch_query.
# NOTE(review): credentials are hard-coded in a test file; consider reading
# them from environment variables if this repository is ever shared.
CH_URL = "http://localhost:8123"
CH_USER = "dolphin"
CH_PASS = "dolphin_ch_2026"
def _ch_query(sql: str) -> List[List[str]]:
    """POST *sql* to the ClickHouse HTTP endpoint and return TSV rows.

    Each returned row is a list of string fields (one per tab-separated
    column). An empty response body yields an empty list.
    """
    credentials = f"{CH_USER}:{CH_PASS}".encode()
    auth = base64.b64encode(credentials).decode()
    req = urllib.request.Request(
        f"{CH_URL}/?database=dolphin&default_format=TabSeparated",
        data=sql.encode(),
        headers={"Authorization": f"Basic {auth}"},
    )
    with urllib.request.urlopen(req, timeout=10) as resp:
        body = resp.read().decode().strip()
    if not body:
        return []
    return [line.split('\t') for line in body.split('\n')]
def _ch_available() -> bool:
    """Return True when the ClickHouse endpoint answers a trivial query."""
    try:
        _ch_query("SELECT 1")
    except Exception:
        return False
    return True
# Probe once at import time; tests skip themselves when the DB is down.
CH_UP = _ch_available()
# ── Trade loader (shared with gate test) ──────────────────────────────────────
# Module-level cache: ClickHouse is queried at most once per process.
_CACHED_TRADES: Optional[List[dict]] = None
def fetch_trades() -> List[dict]:
    """Load all 'blue'-strategy trades from ClickHouse, enriched with EsoF fields.

    Returns a time-ordered (ORDER BY ts) list of dicts with keys:
    ts, asset, pnl, leverage, session, dow, score, label, bucket_id.
    Results are cached in _CACHED_TRADES for the process lifetime.
    Malformed rows (short, or with unparsable numeric fields) are skipped.
    """
    global _CACHED_TRADES
    if _CACHED_TRADES is not None:
        return _CACHED_TRADES
    # Administrative exits are excluded — presumably they are not
    # signal-driven trades; confirm against trade_events semantics.
    sql = """
    SELECT
        toUnixTimestamp64Milli(ts) AS ts_ms,
        asset, side, pnl, exit_reason, leverage
    FROM dolphin.trade_events
    WHERE strategy = 'blue'
      AND exit_reason NOT IN ('HIBERNATE_HALT', 'SUBDAY_ACB_NORMALIZATION')
    ORDER BY ts
    """
    rows = _ch_query(sql)
    # Best-effort asset→bucket map; on any failure pkl_map stays None and
    # get_bucket is expected to fall back to its own default — TODO confirm.
    pkl_map: Optional[Dict[str, int]] = None
    try:
        import pickle
        # NOTE(review): pickle.load on a local model file — trusted path only;
        # never point this at untrusted input.
        with open(_ROOT / "adaptive_exit/models/bucket_assignments.pkl", 'rb') as f:
            pkl_map = pickle.load(f).get('assignments', {})
    except Exception:
        pass
    trades = []
    for row in rows:
        # Defensive: need at least the 6 selected columns.
        if len(row) < 6:
            continue
        try:
            ts_ms = int(row[0])
            asset = row[1]
            pnl = float(row[3])
            leverage = float(row[5])
        except (ValueError, IndexError):
            continue
        # Timestamps are epoch milliseconds; convert to aware UTC datetime.
        ts = datetime.fromtimestamp(ts_ms / 1000.0, tz=timezone.utc)
        # EsoF advisory computed per-trade from the trade's open time.
        adv = compute_esof(ts)
        trades.append({
            "ts": ts,
            "asset": asset,
            "pnl": pnl,
            "leverage": leverage,
            "session": adv["session"],
            "dow": adv["dow"],
            "score": adv["advisory_score"],
            "label": adv["advisory_label"],
            "bucket_id": get_bucket(asset, pkl_map),
        })
    _CACHED_TRADES = trades
    return trades
# ── Statistical primitives ────────────────────────────────────────────────────
def wr(trades: List[dict]) -> float:
    """Win rate: fraction of trades with pnl > 0; NaN for an empty list."""
    if not trades:
        return float("nan")
    wins = sum(t["pnl"] > 0 for t in trades)
    return wins / len(trades)
def net_pnl(trades: List[dict]) -> float:
    """Total PnL across *trades* (zero for an empty list)."""
    total = 0.0
    for trade in trades:
        total += trade["pnl"]
    return total
def cohen_h(p1: float, p2: float) -> float:
    """Cohen's h effect size for two proportions. |h| ≥ 0.2 small, 0.5 medium, 0.8 large."""
    phi1 = 2 * math.asin(math.sqrt(p1))
    phi2 = 2 * math.asin(math.sqrt(p2))
    return abs(phi1 - phi2)
def binomial_se(p: float, n: int) -> float:
    """Standard error of a proportion estimate; infinite when n is not positive."""
    if n <= 0:
        return float("inf")
    return math.sqrt(p * (1 - p) / n)
def bootstrap_wr_ci(trades: List[dict], n_boot: int = 5000, ci: float = 0.95) -> Tuple[float, float]:
    """Bootstrap CI on WR. Returns (lower, upper).

    Resamples *trades* with replacement n_boot times using a fixed seed (42)
    so results are reproducible. Empty input returns (nan, nan).
    """
    if not trades:
        # Original behavior: every resample WR is NaN, so the CI is (nan, nan).
        return float("nan"), float("nan")
    rng = random.Random(42)
    n = len(trades)
    samples = []
    for _ in range(n_boot):
        resample = [rng.choice(trades) for _ in range(n)]
        samples.append(sum(1 for t in resample if t["pnl"] > 0) / n)
    samples.sort()
    lo = int((1 - ci) / 2 * n_boot)
    # Clamp the upper index: ci == 1.0 would otherwise index one past the end.
    hi = min(int((1 + ci) / 2 * n_boot), n_boot - 1)
    return samples[lo], samples[hi]
def bootstrap_pnl_ci(trades: List[dict], n_boot: int = 5000, ci: float = 0.95) -> Tuple[float, float]:
    """Bootstrap CI on net PnL. Returns (lower, upper).

    Seeded (42) for reproducibility; empty input yields (0, 0) since every
    resample sums to zero.
    """
    rng = random.Random(42)
    n = len(trades)
    samples = []
    for _ in range(n_boot):
        resample = [rng.choice(trades) for _ in range(n)]
        samples.append(sum(t["pnl"] for t in resample))
    samples.sort()
    lo = int((1 - ci) / 2 * n_boot)
    # Clamp the upper index: ci == 1.0 would otherwise index one past the end.
    hi = min(int((1 + ci) / 2 * n_boot), n_boot - 1)
    return samples[lo], samples[hi]
def permutation_pvalue(
    trades: List[dict],
    observed_delta: float,
    label_key: str,
    blocked_label,
    n_perm: int = 2000,
    seed: int = 42,
) -> float:
    """
    Permutation test: shuffle label_key randomly, compute strategy improvement
    each time. Return fraction of permutations that produce >= observed_delta.
    observed_delta > 0 means "blocking blocked_label improved PnL".
    """
    rng = random.Random(seed)
    labels = [t[label_key] for t in trades]
    pnls = [t["pnl"] for t in trades]
    hits = 0
    for _ in range(n_perm):
        # In-place shuffle: each iteration permutes the previous arrangement,
        # which is still a uniform random permutation under the seeded RNG.
        rng.shuffle(labels)
        blocked_total = 0.0
        for lbl, p in zip(labels, pnls):
            if lbl == blocked_label:
                blocked_total += p
        # Gain from blocking = minus the blocked group's PnL.
        if -blocked_total >= observed_delta:
            hits += 1
    return hits / n_perm
# ═════════════════════════════════════════════════════════════════════════════
# TEST CLASSES
# ═════════════════════════════════════════════════════════════════════════════

# Shared skip marker: any test needing live trade data is skipped when
# ClickHouse is unreachable (CH_UP is probed once at import time).
skip_no_ch = pytest.mark.skipif(not CH_UP, reason="ClickHouse not available")


class TestTemporalStability:
    """
    Walk-forward: split chronologically into H1 (first 50%) and H2 (last 50%).
    Session and DoW effects must appear in BOTH halves to be considered real.
    If present in only one half → data snooping artifact.
    """

    @skip_no_ch
    def test_ny_afternoon_negative_in_h1_and_h2(self):
        """NY_AFTERNOON win rate must sit below each half's own baseline."""
        trades = fetch_trades()
        n = len(trades)
        # Trades are returned ORDER BY ts, so a midpoint slice is a time split.
        h1 = trades[:n // 2]
        h2 = trades[n // 2:]
        ny_h1 = [t for t in h1 if t["session"] == "NY_AFTERNOON"]
        ny_h2 = [t for t in h2 if t["session"] == "NY_AFTERNOON"]
        # Compare against each half's own baseline (not global) so a regime
        # shift between halves cannot fake the effect.
        base_h1 = wr(h1)
        base_h2 = wr(h2)
        assert len(ny_h1) >= 10, f"H1 NY_AFT too small: n={len(ny_h1)}"
        assert len(ny_h2) >= 10, f"H2 NY_AFT too small: n={len(ny_h2)}"
        # NY_AFTERNOON WR must be below baseline in BOTH halves
        wr_h1 = wr(ny_h1)
        wr_h2 = wr(ny_h2)
        assert wr_h1 < base_h1, (
            f"NY_AFT drag missing in H1: WR_NYA={wr_h1:.3f} >= baseline={base_h1:.3f}"
        )
        assert wr_h2 < base_h2, (
            f"NY_AFT drag missing in H2: WR_NYA={wr_h2:.3f} >= baseline={base_h2:.3f}"
        )

    @skip_no_ch
    def test_monday_negative_in_h1_and_h2(self):
        """Monday (dow == 0) win-rate drag must appear in both halves."""
        trades = fetch_trades()
        n = len(trades)
        h1 = trades[:n // 2]
        h2 = trades[n // 2:]
        mon_h1 = [t for t in h1 if t["dow"] == 0]
        mon_h2 = [t for t in h2 if t["dow"] == 0]
        # Monday sample is thin — require at least 10 in each half
        if len(mon_h1) < 10 or len(mon_h2) < 10:
            pytest.skip(f"Monday sample too thin for walk-forward: H1={len(mon_h1)}, H2={len(mon_h2)}")
        assert wr(mon_h1) < wr(h1), "Monday drag absent in H1"
        assert wr(mon_h2) < wr(h2), "Monday drag absent in H2"

    @skip_no_ch
    def test_strategy_e_positive_in_both_halves(self):
        """Combined gate (Mon+NY_AFT) must improve PnL in H1 AND H2 independently."""
        trades = fetch_trades()
        n = len(trades)
        h1 = trades[:n // 2]
        h2 = trades[n // 2:]
        def gate_e_pnl(subset):
            # PnL retained after blocking Monday AND NY_AFTERNOON trades.
            return sum(t["pnl"] for t in subset
                       if t["dow"] != 0 and t["session"] != "NY_AFTERNOON")
        def base_pnl(subset):
            return sum(t["pnl"] for t in subset)
        assert gate_e_pnl(h1) > base_pnl(h1), "Strategy E degrades H1"
        assert gate_e_pnl(h2) > base_pnl(h2), "Strategy E degrades H2"
class TestPermutationSignificance:
    """
    Permutation test: shuffle session / DoW labels randomly.
    The observed improvement from blocking must rank in the top 5%
    of the null distribution (p < 0.05) to be considered non-random.
    """

    @skip_no_ch
    def test_ny_afternoon_block_is_significant(self):
        """Gain from skipping NY_AFTERNOON must beat shuffled labels at p < 0.05."""
        trades = fetch_trades()
        ny_pnl = sum(t["pnl"] for t in trades if t["session"] == "NY_AFTERNOON")
        observed_delta = -ny_pnl  # gain from skipping NY_AFT trades
        p = permutation_pvalue(trades, observed_delta, "session", "NY_AFTERNOON",
                               n_perm=2000)
        assert p < 0.05, (
            f"NY_AFTERNOON block not significant: p={p:.3f} >= 0.05. "
            f"Effect may be noise at this sample size."
        )

    @skip_no_ch
    def test_monday_block_significance(self):
        """Monday block: p < 0.15 required; p >= 0.05 is warned, not failed."""
        trades = fetch_trades()
        mon_pnl = sum(t["pnl"] for t in trades if t["dow"] == 0)
        observed_delta = -mon_pnl
        p = permutation_pvalue(trades, observed_delta, "dow", 0, n_perm=2000)
        # Monday has fewer trades — use looser threshold (p < 0.15)
        # Flag as WARNING not FAIL if p >= 0.05: thin sample, directionally valid
        if p >= 0.05:
            print(f"\n WARN: Monday block p={p:.3f} >= 0.05. "
                  f"Directionally valid but underpowered (n={sum(1 for t in trades if t['dow']==0)}).")
        assert p < 0.15, (
            f"Monday block not even marginally significant: p={p:.3f}. "
            f"Gate should not be applied until more data accumulates."
        )

    @skip_no_ch
    def test_london_morning_block_would_hurt(self):
        """Blocking LONDON_MORNING (the BEST session) must NOT improve PnL."""
        trades = fetch_trades()
        ldn_pnl = sum(t["pnl"] for t in trades if t["session"] == "LONDON_MORNING")
        observed_delta = -ldn_pnl  # gain from blocking LDN (expect negative = harmful)
        # LDN is net-positive, so blocking it is harmful (delta < 0)
        assert observed_delta < 0, (
            f"Blocking LONDON_MORNING should HURT PnL (it is the best session). "
            f"Got delta={observed_delta:.2f}. Check data integrity."
        )
class TestMultipleComparison:
    """
    Multiple comparison correction.
    We inspected 5 sessions × 7 DoW = 35 cells. Finding 'significant' cells
    after inspection requires Bonferroni correction: α_adj = 0.05 / 35 ≈ 0.0014.
    Only cells where WR deviation is large enough to survive Bonferroni should
    be used in the gate.
    We test: do our chosen cells (NY_AFT, Monday) survive Bonferroni?
    Using a binomial z-test as a proxy for the corrected p-value.
    """

    @skip_no_ch
    def test_ny_afternoon_survives_bonferroni(self):
        """NY_AFTERNOON WR deficit must clear z > 2.0 under the null SE."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        n = len(ny)
        baseline = wr(trades)
        wr_ny = wr(ny)
        # SE computed under the null (baseline p), one-sample z-test style.
        se = binomial_se(baseline, n)
        z = (baseline - wr_ny) / se if se > 0 else 0
        # One-tailed z for Bonferroni α=0.0014: z_crit ≈ 2.99
        # We use 2.0 as a practical threshold (more conservative than 1.96 but
        # less strict than Bonferroni, given 3-week sample inherent limitations)
        assert z > 2.0, (
            f"NY_AFTERNOON WR deviation (z={z:.2f}) does not survive "
            f"multiple-comparison correction. n={n}, WR={wr_ny:.3f} vs base={baseline:.3f}."
        )

    @skip_no_ch
    def test_monday_bonferroni_warning(self):
        """Monday must show at least a directional signal (z > 1.0)."""
        trades = fetch_trades()
        mon = [t for t in trades if t["dow"] == 0]
        n = len(mon)
        baseline = wr(trades)
        wr_mon = wr(mon)
        se = binomial_se(baseline, n)
        z = (baseline - wr_mon) / se if se > 0 else 0
        # Monday: warn if z < 2.0 (doesn't survive strict Bonferroni)
        if z < 2.0:
            print(f"\n WARN: Monday z={z:.2f} < 2.0. Does not survive Bonferroni "
                  f"at current sample (n={n}). Apply Monday gate cautiously.")
        # Require at least z > 1.0 (directional signal, not pure noise)
        assert z > 1.0, (
            f"Monday WR deviation is indistinguishable from noise: z={z:.2f}. "
            f"Do not gate Monday until more trades accumulate."
        )

    @skip_no_ch
    def test_no_spurious_best_cell_used_as_gate(self):
        """
        Best-cell cherry-pick guard: the SINGLE best-performing cell in the dataset
        must NOT be treated as a reliable gate without Bonferroni correction.
        Test: find the best WR cell (n >= 10), check that its deviation is NOT
        significantly larger than the worst cell — both could be noise extremes.
        """
        trades = fetch_trades()
        # Cell key = (day-of-week, session).
        cells: Dict[Tuple, List[dict]] = defaultdict(list)
        for t in trades:
            cells[(t["dow"], t["session"])].append(t)
        valid = [(k, v) for k, v in cells.items() if len(v) >= 10]
        if len(valid) < 5:
            pytest.skip("Not enough cells with n >= 10")
        wrs = [(k, wr(v), len(v)) for k, v in valid]
        best = max(wrs, key=lambda x: x[1])
        worst = min(wrs, key=lambda x: x[1])
        baseline = wr(trades)
        se_best = binomial_se(baseline, best[2])
        se_worst = binomial_se(baseline, worst[2])
        z_best = (best[1] - baseline) / se_best if se_best > 0 else 0
        z_worst = (baseline - worst[1]) / se_worst if se_worst > 0 else 0
        # Both extremes should be similarly significant (or not).
        # If best is >3σ but worst is <1σ, something is asymmetric — flag it.
        # Acceptable: both extremes are significant OR both are marginal.
        ratio = z_best / z_worst if z_worst > 0.1 else float("inf")
        assert ratio < 5.0, (
            f"Asymmetric cell extremes: z_best={z_best:.2f} vs z_worst={z_worst:.2f}. "
            f"Best cell ({best[0]}) may be a cherry-pick artifact."
        )
class TestBootstrapCI:
    """
    Bootstrap confidence intervals on WR for each gated segment.
    The 95% CI upper bound for NY_AFTERNOON WR must be below baseline WR.
    If the CI overlaps the baseline, the effect is not reliable.
    """

    @skip_no_ch
    def test_ny_afternoon_ci_below_baseline(self):
        """Entire 95% WR CI for NY_AFTERNOON must sit below baseline WR."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        assert len(ny) >= 20, f"NY_AFT sample too small for bootstrap: n={len(ny)}"
        _, upper = bootstrap_wr_ci(ny, n_boot=3000)
        baseline = wr(trades)
        assert upper < baseline, (
            f"NY_AFTERNOON WR CI upper bound ({upper:.3f}) overlaps baseline "
            f"WR ({baseline:.3f}). Effect not reliable at 95% confidence."
        )

    @skip_no_ch
    def test_london_morning_ci_above_baseline(self):
        """LDN lower CI bound may dip slightly (×0.95 tolerance) but no further."""
        trades = fetch_trades()
        ldn = [t for t in trades if t["session"] == "LONDON_MORNING"]
        assert len(ldn) >= 20, f"LDN sample too small: n={len(ldn)}"
        lower, _ = bootstrap_wr_ci(ldn, n_boot=3000)
        baseline = wr(trades)
        assert lower > baseline * 0.95, (
            f"LONDON_MORNING WR CI lower bound ({lower:.3f}) is too far below "
            f"baseline ({baseline:.3f}). LDN advantage may not be reliable."
        )

    @skip_no_ch
    def test_ny_afternoon_pnl_ci_negative(self):
        """Net PnL CI for NY_AFTERNOON must have upper bound < 0 (net loser with confidence)."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        assert len(ny) >= 20
        _, upper = bootstrap_pnl_ci(ny, n_boot=3000)
        assert upper < 0, (
            f"NY_AFTERNOON net PnL CI upper bound is {upper:.2f} > 0. "
            f"Cannot confidently call it a net loser at current sample size."
        )
class TestMinimumSampleSize:
    """
    Minimum sample size guard. No session or DoW factor should influence
    the advisory score unless it has n >= 30 trades. Below 30, the WR
    estimate has SE > 9pp (too noisy to act on).
    """

    @skip_no_ch
    def test_all_gate_factors_have_sufficient_n(self):
        """
        The two gated factors (NY_AFTERNOON, Monday) must each have n >= 30
        in the current dataset for the gate to be considered valid.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        mon = [t for t in trades if t["dow"] == 0]
        assert len(ny) >= 30, f"NY_AFTERNOON n={len(ny)} < 30. Gate underpowered."
        assert len(mon) >= 30, f"Monday n={len(mon)} < 30. Gate underpowered."

    @skip_no_ch
    def test_slot_15m_gate_would_be_overfit(self):
        """
        15-minute slot data has median n ≈ 7. Any slot-level gate applied
        directly would be extreme overfitting. Verify: majority of slots have n < 30.
        """
        trades = fetch_trades()
        # Count trades per 15-minute UTC slot, keyed "H:MM".
        slots: Dict[str, int] = defaultdict(int)
        for t in trades:
            h = t["ts"].hour
            m = (t["ts"].minute // 15) * 15
            slots[f"{h}:{m:02d}"] += 1
        n_thin = sum(1 for n in slots.values() if n < 30)
        frac = n_thin / len(slots) if slots else 1.0
        assert frac > 0.70, (
            f"Only {frac:.0%} of 15m slots have n < 30. "
            f"Expected most slots to be underpowered — if not, slot gate may be premature."
        )

    def test_advisory_score_weights_reflect_sample_size(self):
        """
        Slot weight (0.10) must be lower than session (0.25) and DoW (0.30).
        Ensures the weakest-sample factor has the lowest influence.
        """
        from esof_advisor import SESSION_STATS, DOW_STATS, SLOT_STATS
        # NOTE(review): assumes each *_STATS value is a tuple whose first
        # element is the sample count — confirm against esof_advisor.
        median_session_n = sorted([v[0] for v in SESSION_STATS.values()])[len(SESSION_STATS) // 2]
        median_dow_n = sorted([v[0] for v in DOW_STATS.values()])[len(DOW_STATS) // 2]
        median_slot_n = sorted([v[0] for v in SLOT_STATS.values()])[len(SLOT_STATS) // 2]
        assert median_slot_n < median_session_n, "Slot n should be < session n"
        assert median_slot_n < median_dow_n, "Slot n should be < DoW n"
        # Slot weight is 0.10, session 0.25, DoW 0.30 — smaller n = smaller weight
        # NOTE(review): these mirror constants inside esof_advisor; keep in sync.
        SLOT_WEIGHT = 0.10
        SESSION_WEIGHT = 0.25
        DOW_WEIGHT = 0.30
        assert SLOT_WEIGHT < SESSION_WEIGHT
        assert SLOT_WEIGHT < DOW_WEIGHT
class TestEffectSize:
    """
    Cohen's h effect size on WR differences.
    |h| >= 0.2: small effect (minimum threshold to consider gating)
    |h| >= 0.5: medium effect (comfortable to gate)
    |h| >= 0.8: large effect (very strong signal)
    """

    @skip_no_ch
    def test_ny_afternoon_effect_size_medium(self):
        """NY_AFTERNOON WR deviation must reach at least a small effect (h >= 0.2)."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        baseline = wr(trades)
        h = cohen_h(wr(ny), baseline)
        assert h >= 0.2, (
            f"NY_AFTERNOON effect size h={h:.3f} < 0.2 (small). "
            f"Signal too weak to justify gating."
        )

    @skip_no_ch
    def test_london_morning_effect_size_positive(self):
        # cohen_h returns |h| >= 0 by construction; this is a smoke check
        # that the computation runs on the LDN segment.
        trades = fetch_trades()
        ldn = [t for t in trades if t["session"] == "LONDON_MORNING"]
        baseline = wr(trades)
        h = cohen_h(wr(ldn), baseline)
        assert h >= 0.0, "LDN effect size must be measurable"

    @skip_no_ch
    def test_dow_tuesday_effect_size(self):
        """Tuesday is the best DoW. Effect size must be positive."""
        trades = fetch_trades()
        tue = [t for t in trades if t["dow"] == 1]
        baseline = wr(trades)
        if len(tue) < 10:
            pytest.skip("Tuesday sample too thin")
        h = cohen_h(wr(tue), baseline)
        assert h >= 0.0, "Tuesday must show positive effect"

    @skip_no_ch
    def test_effect_size_ranking_matches_expectation(self):
        """
        NY_AFTERNOON effect size must be larger than LOW_LIQUIDITY effect size.
        NY_AFT has more trades and a larger WR gap — should show stronger signal.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        low = [t for t in trades if t["session"] == "LOW_LIQUIDITY"]
        base = wr(trades)
        # Segments with n < 10 are treated as h = 0 (no measurable effect).
        h_ny = cohen_h(wr(ny), base) if len(ny) >= 10 else 0
        h_low = cohen_h(wr(low), base) if len(low) >= 10 else 0
        # NY_AFTERNOON has 3× the sample of LOW_LIQ — effect should be at least as large
        assert h_ny >= h_low * 0.7, (
            f"NY_AFT h={h_ny:.3f} much smaller than LOW_LIQ h={h_low:.3f}. "
            f"Unexpected — check data."
        )
class TestWalkForwardAdvisory:
    """
    Walk-forward advisory score validation.
    Train EsoF tables conceptually on H1 (we use the existing static tables as proxy).
    Evaluate: does the advisory score computed at H2 trade times predict H2 outcomes?
    Method: within H2, rank trades by advisory_score. The bottom quartile (most
    negative score) should have lower WR than the top quartile. If the score
    has no predictive power on OOS data, it is overfit to the in-sample period.
    """

    @skip_no_ch
    def test_score_predicts_wr_direction_in_h2(self):
        """Top-quartile advisory scores in H2 must out-WR the bottom quartile."""
        trades = fetch_trades()
        n = len(trades)
        # Only the out-of-sample half is ranked by advisory score.
        h2 = sorted(trades[n // 2:], key=lambda t: t["score"])
        if len(h2) < 40:
            pytest.skip(f"H2 too small for quartile split: n={len(h2)}")
        q = len(h2) // 4
        bottom = h2[:q]   # worst advisory scores
        top = h2[-q:]     # best advisory scores
        wr_bot = wr(bottom)
        wr_top = wr(top)
        assert wr_top > wr_bot, (
            f"Advisory score has no directional predictive power in H2: "
            f"WR_top={wr_top:.3f} WR_bot={wr_bot:.3f}. Score may be overfit."
        )

    @skip_no_ch
    def test_unfavorable_label_has_lower_wr_in_h2(self):
        """UNFAVORABLE-labeled H2 trades may match, but not beat, the rest (5pp slack)."""
        trades = fetch_trades()
        n = len(trades)
        h2 = trades[n // 2:]
        unfav = [t for t in h2 if t["label"] == "UNFAVORABLE"]
        rest = [t for t in h2 if t["label"] != "UNFAVORABLE"]
        if len(unfav) < 5:
            pytest.skip(f"Too few UNFAVORABLE trades in H2: n={len(unfav)}")
        assert wr(unfav) <= wr(rest) + 0.05, (
            f"UNFAVORABLE label does not predict lower WR in H2: "
            f"WR_unfav={wr(unfav):.3f} vs WR_rest={wr(rest):.3f}. "
            f"Advisory label may be overfit."
        )
class TestAssetBucketStability:
    """
    The session/DoW effect must not be driven by a single asset bucket.
    If NY_AFTERNOON drag is entirely explained by, say, B4 trades clustering
    in that session, the gate is actually gating B4 by proxy — not time.
    The effect must hold across at least 2 independent buckets.
    """

    @skip_no_ch
    def test_ny_afternoon_drag_cross_bucket(self):
        """NY_AFTERNOON drag must be visible within at least 2 asset buckets."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        not_ny = [t for t in trades if t["session"] != "NY_AFTERNOON"]
        # Group in-session and out-of-session trades per bucket.
        by_bucket_ny = defaultdict(list)
        by_bucket_out = defaultdict(list)
        for t in ny:
            by_bucket_ny[t["bucket_id"]].append(t)
        for t in not_ny:
            by_bucket_out[t["bucket_id"]].append(t)
        # Count buckets where NY_AFT WR is below out-of-session WR
        n_confirming = 0
        for bkt in by_bucket_ny:
            # Require >= 5 trades on both sides of the comparison per bucket.
            if len(by_bucket_ny[bkt]) < 5 or len(by_bucket_out.get(bkt, [])) < 5:
                continue
            if wr(by_bucket_ny[bkt]) < wr(by_bucket_out[bkt]):
                n_confirming += 1
        assert n_confirming >= 2, (
            f"NY_AFT drag only confirmed in {n_confirming} bucket(s). "
            f"Need ≥ 2 for effect to be session-driven, not bucket-confounded."
        )

    @skip_no_ch
    def test_monday_drag_cross_bucket(self):
        """Monday drag must appear in >= 1 bucket (soft bar due to thin sample)."""
        trades = fetch_trades()
        mon = [t for t in trades if t["dow"] == 0]
        not_mon = [t for t in trades if t["dow"] != 0]
        by_bkt_mon = defaultdict(list)
        by_bkt_out = defaultdict(list)
        for t in mon:
            by_bkt_mon[t["bucket_id"]].append(t)
        for t in not_mon:
            by_bkt_out[t["bucket_id"]].append(t)
        n_confirming = 0
        for bkt in by_bkt_mon:
            if len(by_bkt_mon[bkt]) < 5 or len(by_bkt_out.get(bkt, [])) < 5:
                continue
            if wr(by_bkt_mon[bkt]) < wr(by_bkt_out[bkt]):
                n_confirming += 1
        if n_confirming < 2:
            print(f"\n WARN: Monday drag only in {n_confirming} bucket(s). "
                  f"Thin sample — cannot confirm cross-bucket. Gate with caution.")
        # Soft assert: Monday has thinner sample, require at least 1
        assert n_confirming >= 1, (
            f"Monday drag not present in ANY bucket. "
            f"Likely a sampling artifact — do not gate Monday."
        )
class TestRegimeConfound:
    """
    Regime confound check: is the session effect just a proxy for ACB beta?
    If all NY_AFTERNOON trades happen to coincide with low ACB beta (bearish
    regime), then blocking NY_AFT is actually blocking bear-regime trades,
    not session-specific trades. The gate would be redundant with ACB.
    Method: compare ACB leverage (proxy for regime strength) between
    NY_AFTERNOON and other sessions. If leverage distributions are
    significantly different, the session effect is partially confounded.
    """

    @skip_no_ch
    def test_ny_afternoon_leverage_not_systematically_different(self):
        """
        NY_AFTERNOON avg leverage should be within 20% of other sessions' avg leverage.
        Large divergence → session effect may be a regime proxy.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        not_ny = [t for t in trades if t["session"] != "NY_AFTERNOON"]
        if len(ny) < 10 or len(not_ny) < 10:
            pytest.skip("Insufficient data for leverage comparison")
        avg_lev_ny = sum(t["leverage"] for t in ny) / len(ny)
        avg_lev_out = sum(t["leverage"] for t in not_ny) / len(not_ny)
        ratio = avg_lev_ny / avg_lev_out if avg_lev_out > 0 else 1.0
        assert 0.80 <= ratio <= 1.20, (
            f"NY_AFTERNOON avg leverage ({avg_lev_ny:.2f}x) differs by >{20}% "
            f"from other sessions ({avg_lev_out:.2f}x). "
            f"Session effect may be a regime-proxy — investigate confound."
        )

    @skip_no_ch
    def test_ny_afternoon_wr_negative_across_leverage_bands(self):
        """
        Regime confound falsification: split NY_AFT trades into high/low leverage.
        If NY_AFT drag holds in BOTH leverage bands, it is NOT purely a regime effect.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        if len(ny) < 20:
            pytest.skip(f"NY_AFT too small for leverage split: n={len(ny)}")
        # Median split on leverage; ties go to the high band.
        median_lev = sorted(t["leverage"] for t in ny)[len(ny) // 2]
        hi_lev = [t for t in ny if t["leverage"] >= median_lev]
        lo_lev = [t for t in ny if t["leverage"] < median_lev]
        baseline = wr(fetch_trades())
        # Bands with < 5 trades count as vacuously confirming.
        hi_below = wr(hi_lev) < baseline if len(hi_lev) >= 5 else True
        lo_below = wr(lo_lev) < baseline if len(lo_lev) >= 5 else True
        # NOTE(review): the docstring says the drag must hold in BOTH bands,
        # but the assertion only requires one band (`or`) — confirm intent.
        assert hi_below or lo_below, (
            "NY_AFT drag absent in BOTH leverage bands — effect is not regime-independent. "
            "Gate may be a regime proxy."
        )
# ═════════════════════════════════════════════════════════════════════════════
# STANDALONE REPORT
# ═════════════════════════════════════════════════════════════════════════════
# ANSI colour/format escape codes for the terminal report.
GREEN = "\033[32m"; RED = "\033[31m"; YELLOW = "\033[33m"
BOLD = "\033[1m"; DIM = "\033[2m"; RST = "\033[0m"

if __name__ == "__main__":
    # Human-readable report mirroring the pytest checks above.
    if not CH_UP:
        print(f"{RED}ClickHouse not available.{RST}")
        sys.exit(1)
    trades = fetch_trades()
    n = len(trades)
    h1, h2 = trades[:n // 2], trades[n // 2:]
    # 68-char '═' horizontal rule, matching the section banners in this file
    # (the original printed ''*68 — an empty separator; the rule character
    # was evidently lost).
    print(f"\n{BOLD}{'═' * 68}{RST}")
    print(f"{BOLD} EsoF Overfitting Guard Report ({n} trades){RST}")
    print(f"{'═' * 68}\n")
    baseline = wr(trades)
    # Segments referenced throughout the report.
    ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
    mon = [t for t in trades if t["dow"] == 0]
    ldn = [t for t in trades if t["session"] == "LONDON_MORNING"]
    ny_h1 = [t for t in h1 if t["session"] == "NY_AFTERNOON"]
    ny_h2 = [t for t in h2 if t["session"] == "NY_AFTERNOON"]
    mon_h1 = [t for t in h1 if t["dow"] == 0]
    mon_h2 = [t for t in h2 if t["dow"] == 0]

    def row(label, val, ref=None, lo=None, hi=None, warn=None, note=""):
        """Print one aligned report row.

        Green when val < ref (expected drag confirmed), red otherwise;
        warn forces yellow. lo/hi (optional, both required together)
        append a bootstrap 95% CI.
        """
        if lo is not None:
            ci_str = f" 95%CI [{lo:.3f}, {hi:.3f}]"
        else:
            ci_str = ""
        col = GREEN if (ref is None or val < ref) else RED
        if warn:
            col = YELLOW
        print(f" {label:<42} {col}{val:.3f}{RST}{ci_str} {DIM}{note}{RST}")

    print(f" {'Baseline WR':<42} {baseline:.3f}")
    print()
    print(f" {BOLD}1. Temporal Stability (H1 / H2){RST}")
    row(" NY_AFT WR — H1", wr(ny_h1), baseline, note=f"n={len(ny_h1)}")
    row(" NY_AFT WR — H2", wr(ny_h2), baseline, note=f"n={len(ny_h2)}")
    row(" Mon WR — H1", wr(mon_h1), baseline, note=f"n={len(mon_h1)}")
    row(" Mon WR — H2", wr(mon_h2), baseline, note=f"n={len(mon_h2)}")
    print(f"\n {BOLD}2. Permutation p-values{RST}")
    ny_pnl = sum(t["pnl"] for t in ny)
    mon_pnl = sum(t["pnl"] for t in mon)
    p_ny = permutation_pvalue(trades, -ny_pnl, "session", "NY_AFTERNOON", n_perm=2000)
    p_mon = permutation_pvalue(trades, -mon_pnl, "dow", 0, n_perm=2000)
    col_ny = GREEN if p_ny < 0.05 else YELLOW if p_ny < 0.15 else RED
    col_mon = GREEN if p_mon < 0.05 else YELLOW if p_mon < 0.15 else RED
    print(f" {'NY_AFT block p-value':<42} {col_ny}{p_ny:.4f}{RST} {DIM}(< 0.05 = significant){RST}")
    print(f" {'Monday block p-value':<42} {col_mon}{p_mon:.4f}{RST} {DIM}(< 0.15 = directional){RST}")
    print(f"\n {BOLD}3. Effect Sizes (Cohen's h){RST}")
    h_ny = cohen_h(wr(ny), baseline)
    h_mon = cohen_h(wr(mon), baseline)
    h_ldn = cohen_h(wr(ldn), baseline)
    for label, h, n_cell in [("NY_AFT", h_ny, len(ny)), ("Monday", h_mon, len(mon)), ("London", h_ldn, len(ldn))]:
        grade = "large" if h >= 0.8 else "medium" if h >= 0.5 else "small" if h >= 0.2 else "trivial"
        col = GREEN if h >= 0.5 else YELLOW if h >= 0.2 else RED
        print(f" {' '+label:<42} {col}{h:.3f}{RST} {DIM}{grade} (n={n_cell}){RST}")
    print(f"\n {BOLD}4. Bootstrap 95% CIs{RST}")
    ny_lo, ny_hi = bootstrap_wr_ci(ny, n_boot=3000)
    col = GREEN if ny_hi < baseline else RED
    print(f" {'NY_AFT WR CI':<42} {col}[{ny_lo:.3f}, {ny_hi:.3f}]{RST} "
          f"{DIM}({'below' if ny_hi < baseline else 'overlaps'} baseline {baseline:.3f}){RST}")
    ny_plo, ny_phi = bootstrap_pnl_ci(ny, n_boot=3000)
    col = GREEN if ny_phi < 0 else RED
    print(f" {'NY_AFT net PnL CI':<42} {col}[{ny_plo:+,.0f}, {ny_phi:+,.0f}]{RST} "
          f"{DIM}({'net loser with confidence' if ny_phi < 0 else 'uncertain sign'}){RST}")
    print(f"\n {BOLD}5. Bonferroni z-scores (35 cells tested){RST}")
    se_ny = binomial_se(baseline, len(ny))
    se_mon = binomial_se(baseline, len(mon))
    z_ny = (baseline - wr(ny)) / se_ny if se_ny > 0 else 0
    z_mon = (baseline - wr(mon)) / se_mon if se_mon > 0 else 0
    crit = 2.99  # Bonferroni α=0.0014 → z_crit≈2.99
    col_ny = GREEN if z_ny > crit else YELLOW if z_ny > 2.0 else RED
    col_mon = GREEN if z_mon > crit else YELLOW if z_mon > 2.0 else RED
    print(f" {'NY_AFT z':<42} {col_ny}{z_ny:.2f}{RST} {DIM}(Bonferroni crit ≈ {crit}){RST}")
    print(f" {'Monday z':<42} {col_mon}{z_mon:.2f}{RST}")
    print(f"\n {BOLD}6. Walk-Forward: advisory score → H2 WR{RST}")
    h2s = sorted(h2, key=lambda t: t["score"])
    # max(1, ...) guards the degenerate case of fewer than 4 H2 trades.
    q = max(1, len(h2s) // 4)
    wr_bot = wr(h2s[:q])
    wr_top = wr(h2s[-q:])
    col = GREEN if wr_top > wr_bot else RED
    print(f" {' Top-quartile score WR (H2)':<42} {col}{wr_top:.3f}{RST} {DIM}n={q}{RST}")
    print(f" {' Bot-quartile score WR (H2)':<42} {col}{wr_bot:.3f}{RST} {DIM}n={q}{RST}")
    print(f" {' Predictive (top > bot)?':<42} {col}{'YES' if wr_top > wr_bot else 'NO — score overfit'}{RST}")
    print(f"\n{'═' * 68}\n")