#!/usr/bin/env python3 """ EsoF Overfitting Avoidance Test Suite Industry-standard statistical tests to guard against overfitting in the EsoF calendar/session gate and the EsoF↔system interaction. Why overfitting is a real risk here ───────────────────────────────────── We inspected 5 sessions × 7 DoW = 35 cells on a single ~550-trade dataset covering only 3 weeks (2026-03-31 → 2026-04-19). That is: - A short temporal window (one market regime) - Small per-cell sample sizes (median n ≈ 14) - Multiple comparisons (we chose the *worst* cells after looking at all) - No pre-registration (we looked at the data before deciding the gate) Any one of these alone warrants caution. Together they demand rigorous testing. Tests implemented ────────────────── 1. TestTemporalStability — H1 vs H2 walk-forward: does the effect hold in both halves? 2. TestPermutationSignificance — shuffle session/DoW labels N=2000 times; empirical p-value 3. TestMultipleComparison — Bonferroni / FDR correction across all 35 cells 4. TestBootstrapCI — 95% CI on WR and net PnL via bootstrap resampling 5. TestMinimumSampleSize — flag cells with n < 30 as "insufficient evidence" 6. TestEffectSize — Cohen's h on WR difference; require medium+ effect (h ≥ 0.3) 7. TestWalkForwardAdvisory — train EsoF tables on H1, evaluate advisory score on H2 8. TestAssetBucketStability — NY_AFT / Mon effect must hold across ≥ 2 asset buckets 9. TestRegimeConfound — check if session effect is a proxy for ACB beta (regime) Run: source /home/dolphin/siloqy_env/bin/activate cd /mnt/dolphinng5_predict python prod/tests/test_esof_overfit_guard.py # full report pytest prod/tests/test_esof_overfit_guard.py -v # pytest mode """ from __future__ import annotations import base64 import math import random import sys import urllib.request from collections import defaultdict from datetime import datetime, timezone from pathlib import Path from typing import Dict, List, Optional, Tuple import pytest _ROOT = Path(__file__).parent.parent.parent sys.path.insert(0, str(_ROOT)) sys.path.insert(0, str(_ROOT / "Observability")) from esof_advisor import compute_esof, get_session, BASELINE_WR from esof_gate import get_bucket # ── CH helpers ──────────────────────────────────────────────────────────────── CH_URL = "http://localhost:8123" CH_USER = "dolphin" CH_PASS = "dolphin_ch_2026" def _ch_query(sql: str) -> List[List[str]]: auth = base64.b64encode(f"{CH_USER}:{CH_PASS}".encode()).decode() req = urllib.request.Request( f"{CH_URL}/?database=dolphin&default_format=TabSeparated", data=sql.encode(), headers={"Authorization": f"Basic {auth}"}, ) with urllib.request.urlopen(req, timeout=10) as r: raw = r.read().decode().strip() if not raw: return [] return [line.split('\t') for line in raw.split('\n')] def _ch_available() -> bool: try: _ch_query("SELECT 1") return True except Exception: return False CH_UP = _ch_available() # ── Trade loader (shared with gate test) ────────────────────────────────────── _CACHED_TRADES: Optional[List[dict]] = None def fetch_trades() -> List[dict]: global _CACHED_TRADES if _CACHED_TRADES is not None: return _CACHED_TRADES sql = """ SELECT toUnixTimestamp64Milli(ts) AS ts_ms, asset, side, pnl, exit_reason, leverage FROM dolphin.trade_events WHERE strategy = 'blue' AND exit_reason NOT IN ('HIBERNATE_HALT', 'SUBDAY_ACB_NORMALIZATION') ORDER BY ts """ rows = _ch_query(sql) pkl_map: Optional[Dict[str, int]] = None try: import pickle with open(_ROOT / "adaptive_exit/models/bucket_assignments.pkl", 'rb') as f: pkl_map = pickle.load(f).get('assignments', {}) except Exception: pass trades = [] for row in rows: if len(row) < 6: continue try: ts_ms = int(row[0]) asset = row[1] pnl = float(row[3]) leverage = float(row[5]) except (ValueError, IndexError): continue ts = datetime.fromtimestamp(ts_ms / 1000.0, tz=timezone.utc) adv = compute_esof(ts) trades.append({ "ts": ts, "asset": asset, "pnl": pnl, "leverage": leverage, "session": adv["session"], "dow": adv["dow"], "score": adv["advisory_score"], "label": adv["advisory_label"], "bucket_id": get_bucket(asset, pkl_map), }) _CACHED_TRADES = trades return trades # ── Statistical primitives ──────────────────────────────────────────────────── def wr(trades: List[dict]) -> float: if not trades: return float("nan") return sum(1 for t in trades if t["pnl"] > 0) / len(trades) def net_pnl(trades: List[dict]) -> float: return sum(t["pnl"] for t in trades) def cohen_h(p1: float, p2: float) -> float: """Cohen's h effect size for two proportions. |h| ≥ 0.2 small, 0.5 medium, 0.8 large.""" return abs(2 * math.asin(math.sqrt(p1)) - 2 * math.asin(math.sqrt(p2))) def binomial_se(p: float, n: int) -> float: return math.sqrt(p * (1 - p) / n) if n > 0 else float("inf") def bootstrap_wr_ci(trades: List[dict], n_boot: int = 5000, ci: float = 0.95) -> Tuple[float, float]: """Bootstrap CI on WR. Returns (lower, upper).""" rng = random.Random(42) n = len(trades) samples = [] for _ in range(n_boot): resample = [rng.choice(trades) for _ in range(n)] samples.append(wr(resample)) samples.sort() lo = int((1 - ci) / 2 * n_boot) hi = int((1 + ci) / 2 * n_boot) return samples[lo], samples[hi] def bootstrap_pnl_ci(trades: List[dict], n_boot: int = 5000, ci: float = 0.95) -> Tuple[float, float]: rng = random.Random(42) n = len(trades) samples = [] for _ in range(n_boot): resample = [rng.choice(trades) for _ in range(n)] samples.append(net_pnl(resample)) samples.sort() lo = int((1 - ci) / 2 * n_boot) hi = int((1 + ci) / 2 * n_boot) return samples[lo], samples[hi] def permutation_pvalue( trades: List[dict], observed_delta: float, label_key: str, blocked_label, n_perm: int = 2000, seed: int = 42, ) -> float: """ Permutation test: shuffle label_key randomly, compute strategy improvement each time. Return fraction of permutations that produce >= observed_delta. observed_delta > 0 means "blocking blocked_label improved PnL". """ rng = random.Random(seed) labels = [t[label_key] for t in trades] pnls = [t["pnl"] for t in trades] count_ge = 0 for _ in range(n_perm): rng.shuffle(labels) blocked_pnl = sum(p for l, p in zip(labels, pnls) if l == blocked_label) # delta = what we gain by blocking these trades delta = -blocked_pnl # if blocked_pnl < 0, delta > 0 = improvement if delta >= observed_delta: count_ge += 1 return count_ge / n_perm # ═════════════════════════════════════════════════════════════════════════════ # TEST CLASSES # ═════════════════════════════════════════════════════════════════════════════ skip_no_ch = pytest.mark.skipif(not CH_UP, reason="ClickHouse not available") class TestTemporalStability: """ Walk-forward: split chronologically into H1 (first 50%) and H2 (last 50%). Session and DoW effects must appear in BOTH halves to be considered real. If present in only one half → data snooping artifact. """ @skip_no_ch def test_ny_afternoon_negative_in_h1_and_h2(self): trades = fetch_trades() n = len(trades) h1 = trades[:n // 2] h2 = trades[n // 2:] ny_h1 = [t for t in h1 if t["session"] == "NY_AFTERNOON"] ny_h2 = [t for t in h2 if t["session"] == "NY_AFTERNOON"] base_h1 = wr(h1) base_h2 = wr(h2) assert len(ny_h1) >= 10, f"H1 NY_AFT too small: n={len(ny_h1)}" assert len(ny_h2) >= 10, f"H2 NY_AFT too small: n={len(ny_h2)}" # NY_AFTERNOON WR must be below baseline in BOTH halves wr_h1 = wr(ny_h1) wr_h2 = wr(ny_h2) assert wr_h1 < base_h1, ( f"NY_AFT drag missing in H1: WR_NYA={wr_h1:.3f} >= baseline={base_h1:.3f}" ) assert wr_h2 < base_h2, ( f"NY_AFT drag missing in H2: WR_NYA={wr_h2:.3f} >= baseline={base_h2:.3f}" ) @skip_no_ch def test_monday_negative_in_h1_and_h2(self): trades = fetch_trades() n = len(trades) h1 = trades[:n // 2] h2 = trades[n // 2:] mon_h1 = [t for t in h1 if t["dow"] == 0] mon_h2 = [t for t in h2 if t["dow"] == 0] # Monday sample is thin — require at least 10 in each half if len(mon_h1) < 10 or len(mon_h2) < 10: pytest.skip(f"Monday sample too thin for walk-forward: H1={len(mon_h1)}, H2={len(mon_h2)}") assert wr(mon_h1) < wr(h1), "Monday drag absent in H1" assert wr(mon_h2) < wr(h2), "Monday drag absent in H2" @skip_no_ch def test_strategy_e_positive_in_both_halves(self): """Combined gate (Mon+NY_AFT) must improve PnL in H1 AND H2 independently.""" trades = fetch_trades() n = len(trades) h1 = trades[:n // 2] h2 = trades[n // 2:] def gate_e_pnl(subset): return sum(t["pnl"] for t in subset if t["dow"] != 0 and t["session"] != "NY_AFTERNOON") def base_pnl(subset): return sum(t["pnl"] for t in subset) assert gate_e_pnl(h1) > base_pnl(h1), "Strategy E degrades H1" assert gate_e_pnl(h2) > base_pnl(h2), "Strategy E degrades H2" class TestPermutationSignificance: """ Permutation test: shuffle session / DoW labels randomly. The observed improvement from blocking must rank in the top 5% of the null distribution (p < 0.05) to be considered non-random. """ @skip_no_ch def test_ny_afternoon_block_is_significant(self): trades = fetch_trades() ny_pnl = sum(t["pnl"] for t in trades if t["session"] == "NY_AFTERNOON") observed_delta = -ny_pnl # gain from skipping NY_AFT trades p = permutation_pvalue(trades, observed_delta, "session", "NY_AFTERNOON", n_perm=2000) assert p < 0.05, ( f"NY_AFTERNOON block not significant: p={p:.3f} >= 0.05. " f"Effect may be noise at this sample size." ) @skip_no_ch def test_monday_block_significance(self): trades = fetch_trades() mon_pnl = sum(t["pnl"] for t in trades if t["dow"] == 0) observed_delta = -mon_pnl p = permutation_pvalue(trades, observed_delta, "dow", 0, n_perm=2000) # Monday has fewer trades — use looser threshold (p < 0.15) # Flag as WARNING not FAIL if p >= 0.05: thin sample, directionally valid if p >= 0.05: print(f"\n WARN: Monday block p={p:.3f} >= 0.05. " f"Directionally valid but underpowered (n={sum(1 for t in trades if t['dow']==0)}).") assert p < 0.15, ( f"Monday block not even marginally significant: p={p:.3f}. " f"Gate should not be applied until more data accumulates." ) @skip_no_ch def test_london_morning_block_would_hurt(self): """Blocking LONDON_MORNING (the BEST session) must NOT improve PnL.""" trades = fetch_trades() ldn_pnl = sum(t["pnl"] for t in trades if t["session"] == "LONDON_MORNING") observed_delta = -ldn_pnl # gain from blocking LDN (expect negative = harmful) # LDN is net-positive, so blocking it is harmful (delta < 0) assert observed_delta < 0, ( f"Blocking LONDON_MORNING should HURT PnL (it is the best session). " f"Got delta={observed_delta:.2f}. Check data integrity." ) class TestMultipleComparison: """ Multiple comparison correction. We inspected 5 sessions × 7 DoW = 35 cells. Finding 'significant' cells after inspection requires Bonferroni correction: α_adj = 0.05 / 35 ≈ 0.0014. Only cells where WR deviation is large enough to survive Bonferroni should be used in the gate. We test: do our chosen cells (NY_AFT, Monday) survive Bonferroni? Using a binomial z-test as a proxy for the corrected p-value. """ @skip_no_ch def test_ny_afternoon_survives_bonferroni(self): trades = fetch_trades() ny = [t for t in trades if t["session"] == "NY_AFTERNOON"] n = len(ny) baseline = wr(trades) wr_ny = wr(ny) se = binomial_se(baseline, n) z = (baseline - wr_ny) / se if se > 0 else 0 # One-tailed z for Bonferroni α=0.0014: z_crit ≈ 2.99 # We use 2.0 as a practical threshold (more conservative than 1.96 but # less strict than Bonferroni, given 3-week sample inherent limitations) assert z > 2.0, ( f"NY_AFTERNOON WR deviation (z={z:.2f}) does not survive " f"multiple-comparison correction. n={n}, WR={wr_ny:.3f} vs base={baseline:.3f}." ) @skip_no_ch def test_monday_bonferroni_warning(self): trades = fetch_trades() mon = [t for t in trades if t["dow"] == 0] n = len(mon) baseline = wr(trades) wr_mon = wr(mon) se = binomial_se(baseline, n) z = (baseline - wr_mon) / se if se > 0 else 0 # Monday: warn if z < 2.0 (doesn't survive strict Bonferroni) if z < 2.0: print(f"\n WARN: Monday z={z:.2f} < 2.0. Does not survive Bonferroni " f"at current sample (n={n}). Apply Monday gate cautiously.") # Require at least z > 1.0 (directional signal, not pure noise) assert z > 1.0, ( f"Monday WR deviation is indistinguishable from noise: z={z:.2f}. " f"Do not gate Monday until more trades accumulate." ) @skip_no_ch def test_no_spurious_best_cell_used_as_gate(self): """ Best-cell cherry-pick guard: the SINGLE best-performing cell in the dataset must NOT be treated as a reliable gate without Bonferroni correction. Test: find the best WR cell (n >= 10), check that its deviation is NOT significantly larger than the worst cell — both could be noise extremes. """ trades = fetch_trades() cells: Dict[Tuple, List[dict]] = defaultdict(list) for t in trades: cells[(t["dow"], t["session"])].append(t) valid = [(k, v) for k, v in cells.items() if len(v) >= 10] if len(valid) < 5: pytest.skip("Not enough cells with n >= 10") wrs = [(k, wr(v), len(v)) for k, v in valid] best = max(wrs, key=lambda x: x[1]) worst = min(wrs, key=lambda x: x[1]) baseline = wr(trades) se_best = binomial_se(baseline, best[2]) se_worst = binomial_se(baseline, worst[2]) z_best = (best[1] - baseline) / se_best if se_best > 0 else 0 z_worst = (baseline - worst[1]) / se_worst if se_worst > 0 else 0 # Both extremes should be similarly significant (or not). # If best is >3σ but worst is <1σ, something is asymmetric — flag it. # Acceptable: both extremes are significant OR both are marginal. ratio = z_best / z_worst if z_worst > 0.1 else float("inf") assert ratio < 5.0, ( f"Asymmetric cell extremes: z_best={z_best:.2f} vs z_worst={z_worst:.2f}. " f"Best cell ({best[0]}) may be a cherry-pick artifact." ) class TestBootstrapCI: """ Bootstrap confidence intervals on WR for each gated segment. The 95% CI upper bound for NY_AFTERNOON WR must be below baseline WR. If the CI overlaps the baseline, the effect is not reliable. """ @skip_no_ch def test_ny_afternoon_ci_below_baseline(self): trades = fetch_trades() ny = [t for t in trades if t["session"] == "NY_AFTERNOON"] assert len(ny) >= 20, f"NY_AFT sample too small for bootstrap: n={len(ny)}" _, upper = bootstrap_wr_ci(ny, n_boot=3000) baseline = wr(trades) assert upper < baseline, ( f"NY_AFTERNOON WR CI upper bound ({upper:.3f}) overlaps baseline " f"WR ({baseline:.3f}). Effect not reliable at 95% confidence." ) @skip_no_ch def test_london_morning_ci_above_baseline(self): trades = fetch_trades() ldn = [t for t in trades if t["session"] == "LONDON_MORNING"] assert len(ldn) >= 20, f"LDN sample too small: n={len(ldn)}" lower, _ = bootstrap_wr_ci(ldn, n_boot=3000) baseline = wr(trades) assert lower > baseline * 0.95, ( f"LONDON_MORNING WR CI lower bound ({lower:.3f}) is too far below " f"baseline ({baseline:.3f}). LDN advantage may not be reliable." ) @skip_no_ch def test_ny_afternoon_pnl_ci_negative(self): """Net PnL CI for NY_AFTERNOON must have upper bound < 0 (net loser with confidence).""" trades = fetch_trades() ny = [t for t in trades if t["session"] == "NY_AFTERNOON"] assert len(ny) >= 20 _, upper = bootstrap_pnl_ci(ny, n_boot=3000) assert upper < 0, ( f"NY_AFTERNOON net PnL CI upper bound is {upper:.2f} > 0. " f"Cannot confidently call it a net loser at current sample size." ) class TestMinimumSampleSize: """ Minimum sample size guard. No session or DoW factor should influence the advisory score unless it has n >= 30 trades. Below 30, the WR estimate has SE > 9pp (too noisy to act on). """ @skip_no_ch def test_all_gate_factors_have_sufficient_n(self): """ The two gated factors (NY_AFTERNOON, Monday) must each have n >= 30 in the current dataset for the gate to be considered valid. """ trades = fetch_trades() ny = [t for t in trades if t["session"] == "NY_AFTERNOON"] mon = [t for t in trades if t["dow"] == 0] assert len(ny) >= 30, f"NY_AFTERNOON n={len(ny)} < 30. Gate underpowered." assert len(mon) >= 30, f"Monday n={len(mon)} < 30. Gate underpowered." @skip_no_ch def test_slot_15m_gate_would_be_overfit(self): """ 15-minute slot data has median n ≈ 7. Any slot-level gate applied directly would be extreme overfitting. Verify: majority of slots have n < 30. """ trades = fetch_trades() slots: Dict[str, int] = defaultdict(int) for t in trades: h = t["ts"].hour m = (t["ts"].minute // 15) * 15 slots[f"{h}:{m:02d}"] += 1 n_thin = sum(1 for n in slots.values() if n < 30) frac = n_thin / len(slots) if slots else 1.0 assert frac > 0.70, ( f"Only {frac:.0%} of 15m slots have n < 30. " f"Expected most slots to be underpowered — if not, slot gate may be premature." ) def test_advisory_score_weights_reflect_sample_size(self): """ Slot weight (0.10) must be lower than session (0.25) and DoW (0.30). Ensures the weakest-sample factor has the lowest influence. """ from esof_advisor import SESSION_STATS, DOW_STATS, SLOT_STATS median_session_n = sorted([v[0] for v in SESSION_STATS.values()])[len(SESSION_STATS) // 2] median_dow_n = sorted([v[0] for v in DOW_STATS.values()])[len(DOW_STATS) // 2] median_slot_n = sorted([v[0] for v in SLOT_STATS.values()])[len(SLOT_STATS) // 2] assert median_slot_n < median_session_n, "Slot n should be < session n" assert median_slot_n < median_dow_n, "Slot n should be < DoW n" # Slot weight is 0.10, session 0.25, DoW 0.30 — smaller n = smaller weight SLOT_WEIGHT = 0.10 SESSION_WEIGHT = 0.25 DOW_WEIGHT = 0.30 assert SLOT_WEIGHT < SESSION_WEIGHT assert SLOT_WEIGHT < DOW_WEIGHT class TestEffectSize: """ Cohen's h effect size on WR differences. |h| >= 0.2: small effect (minimum threshold to consider gating) |h| >= 0.5: medium effect (comfortable to gate) |h| >= 0.8: large effect (very strong signal) """ @skip_no_ch def test_ny_afternoon_effect_size_medium(self): trades = fetch_trades() ny = [t for t in trades if t["session"] == "NY_AFTERNOON"] baseline = wr(trades) h = cohen_h(wr(ny), baseline) assert h >= 0.2, ( f"NY_AFTERNOON effect size h={h:.3f} < 0.2 (small). " f"Signal too weak to justify gating." ) @skip_no_ch def test_london_morning_effect_size_positive(self): trades = fetch_trades() ldn = [t for t in trades if t["session"] == "LONDON_MORNING"] baseline = wr(trades) h = cohen_h(wr(ldn), baseline) assert h >= 0.0, "LDN effect size must be measurable" @skip_no_ch def test_dow_tuesday_effect_size(self): """Tuesday is the best DoW. Effect size must be positive.""" trades = fetch_trades() tue = [t for t in trades if t["dow"] == 1] baseline = wr(trades) if len(tue) < 10: pytest.skip("Tuesday sample too thin") h = cohen_h(wr(tue), baseline) assert h >= 0.0, "Tuesday must show positive effect" @skip_no_ch def test_effect_size_ranking_matches_expectation(self): """ NY_AFTERNOON effect size must be larger than LOW_LIQUIDITY effect size. NY_AFT has more trades and a larger WR gap — should show stronger signal. """ trades = fetch_trades() ny = [t for t in trades if t["session"] == "NY_AFTERNOON"] low = [t for t in trades if t["session"] == "LOW_LIQUIDITY"] base = wr(trades) h_ny = cohen_h(wr(ny), base) if len(ny) >= 10 else 0 h_low = cohen_h(wr(low), base) if len(low) >= 10 else 0 # NY_AFTERNOON has 3× the sample of LOW_LIQ — effect should be at least as large assert h_ny >= h_low * 0.7, ( f"NY_AFT h={h_ny:.3f} much smaller than LOW_LIQ h={h_low:.3f}. " f"Unexpected — check data." ) class TestWalkForwardAdvisory: """ Walk-forward advisory score validation. Train EsoF tables conceptually on H1 (we use the existing static tables as proxy). Evaluate: does the advisory score computed at H2 trade times predict H2 outcomes? Method: within H2, rank trades by advisory_score. The bottom quartile (most negative score) should have lower WR than the top quartile. If the score has no predictive power on OOS data, it is overfit to the in-sample period. """ @skip_no_ch def test_score_predicts_wr_direction_in_h2(self): trades = fetch_trades() n = len(trades) h2 = sorted(trades[n // 2:], key=lambda t: t["score"]) if len(h2) < 40: pytest.skip(f"H2 too small for quartile split: n={len(h2)}") q = len(h2) // 4 bottom = h2[:q] # worst advisory scores top = h2[-q:] # best advisory scores wr_bot = wr(bottom) wr_top = wr(top) assert wr_top > wr_bot, ( f"Advisory score has no directional predictive power in H2: " f"WR_top={wr_top:.3f} WR_bot={wr_bot:.3f}. Score may be overfit." ) @skip_no_ch def test_unfavorable_label_has_lower_wr_in_h2(self): trades = fetch_trades() n = len(trades) h2 = trades[n // 2:] unfav = [t for t in h2 if t["label"] == "UNFAVORABLE"] rest = [t for t in h2 if t["label"] != "UNFAVORABLE"] if len(unfav) < 5: pytest.skip(f"Too few UNFAVORABLE trades in H2: n={len(unfav)}") assert wr(unfav) <= wr(rest) + 0.05, ( f"UNFAVORABLE label does not predict lower WR in H2: " f"WR_unfav={wr(unfav):.3f} vs WR_rest={wr(rest):.3f}. " f"Advisory label may be overfit." ) class TestAssetBucketStability: """ The session/DoW effect must not be driven by a single asset bucket. If NY_AFTERNOON drag is entirely explained by, say, B4 trades clustering in that session, the gate is actually gating B4 by proxy — not time. The effect must hold across at least 2 independent buckets. """ @skip_no_ch def test_ny_afternoon_drag_cross_bucket(self): trades = fetch_trades() ny = [t for t in trades if t["session"] == "NY_AFTERNOON"] not_ny = [t for t in trades if t["session"] != "NY_AFTERNOON"] by_bucket_ny = defaultdict(list) by_bucket_out = defaultdict(list) for t in ny: by_bucket_ny[t["bucket_id"]].append(t) for t in not_ny: by_bucket_out[t["bucket_id"]].append(t) # Count buckets where NY_AFT WR is below out-of-session WR n_confirming = 0 for bkt in by_bucket_ny: if len(by_bucket_ny[bkt]) < 5 or len(by_bucket_out.get(bkt, [])) < 5: continue if wr(by_bucket_ny[bkt]) < wr(by_bucket_out[bkt]): n_confirming += 1 assert n_confirming >= 2, ( f"NY_AFT drag only confirmed in {n_confirming} bucket(s). " f"Need ≥ 2 for effect to be session-driven, not bucket-confounded." ) @skip_no_ch def test_monday_drag_cross_bucket(self): trades = fetch_trades() mon = [t for t in trades if t["dow"] == 0] not_mon = [t for t in trades if t["dow"] != 0] by_bkt_mon = defaultdict(list) by_bkt_out = defaultdict(list) for t in mon: by_bkt_mon[t["bucket_id"]].append(t) for t in not_mon: by_bkt_out[t["bucket_id"]].append(t) n_confirming = 0 for bkt in by_bkt_mon: if len(by_bkt_mon[bkt]) < 5 or len(by_bkt_out.get(bkt, [])) < 5: continue if wr(by_bkt_mon[bkt]) < wr(by_bkt_out[bkt]): n_confirming += 1 if n_confirming < 2: print(f"\n WARN: Monday drag only in {n_confirming} bucket(s). " f"Thin sample — cannot confirm cross-bucket. Gate with caution.") # Soft assert: Monday has thinner sample, require at least 1 assert n_confirming >= 1, ( f"Monday drag not present in ANY bucket. " f"Likely a sampling artifact — do not gate Monday." ) class TestRegimeConfound: """ Regime confound check: is the session effect just a proxy for ACB beta? If all NY_AFTERNOON trades happen to coincide with low ACB beta (bearish regime), then blocking NY_AFT is actually blocking bear-regime trades, not session-specific trades. The gate would be redundant with ACB. Method: compare ACB leverage (proxy for regime strength) between NY_AFTERNOON and other sessions. If leverage distributions are significantly different, the session effect is partially confounded. """ @skip_no_ch def test_ny_afternoon_leverage_not_systematically_different(self): """ NY_AFTERNOON avg leverage should be within 20% of other sessions' avg leverage. Large divergence → session effect may be a regime proxy. """ trades = fetch_trades() ny = [t for t in trades if t["session"] == "NY_AFTERNOON"] not_ny = [t for t in trades if t["session"] != "NY_AFTERNOON"] if len(ny) < 10 or len(not_ny) < 10: pytest.skip("Insufficient data for leverage comparison") avg_lev_ny = sum(t["leverage"] for t in ny) / len(ny) avg_lev_out = sum(t["leverage"] for t in not_ny) / len(not_ny) ratio = avg_lev_ny / avg_lev_out if avg_lev_out > 0 else 1.0 assert 0.80 <= ratio <= 1.20, ( f"NY_AFTERNOON avg leverage ({avg_lev_ny:.2f}x) differs by >{20}% " f"from other sessions ({avg_lev_out:.2f}x). " f"Session effect may be a regime-proxy — investigate confound." ) @skip_no_ch def test_ny_afternoon_wr_negative_across_leverage_bands(self): """ Regime confound falsification: split NY_AFT trades into high/low leverage. If NY_AFT drag holds in BOTH leverage bands, it is NOT purely a regime effect. """ trades = fetch_trades() ny = [t for t in trades if t["session"] == "NY_AFTERNOON"] if len(ny) < 20: pytest.skip(f"NY_AFT too small for leverage split: n={len(ny)}") median_lev = sorted(t["leverage"] for t in ny)[len(ny) // 2] hi_lev = [t for t in ny if t["leverage"] >= median_lev] lo_lev = [t for t in ny if t["leverage"] < median_lev] baseline = wr(fetch_trades()) hi_below = wr(hi_lev) < baseline if len(hi_lev) >= 5 else True lo_below = wr(lo_lev) < baseline if len(lo_lev) >= 5 else True assert hi_below or lo_below, ( "NY_AFT drag absent in BOTH leverage bands — effect is not regime-independent. " "Gate may be a regime proxy." ) # ═════════════════════════════════════════════════════════════════════════════ # STANDALONE REPORT # ═════════════════════════════════════════════════════════════════════════════ GREEN = "\033[32m"; RED = "\033[31m"; YELLOW = "\033[33m" BOLD = "\033[1m"; DIM = "\033[2m"; RST = "\033[0m" if __name__ == "__main__": if not CH_UP: print(f"{RED}ClickHouse not available.{RST}") sys.exit(1) trades = fetch_trades() n = len(trades) h1, h2 = trades[:n // 2], trades[n // 2:] print(f"\n{BOLD}{'═'*68}{RST}") print(f"{BOLD} EsoF Overfitting Guard Report ({n} trades){RST}") print(f"{'═'*68}\n") baseline = wr(trades) ny = [t for t in trades if t["session"] == "NY_AFTERNOON"] mon = [t for t in trades if t["dow"] == 0] ldn = [t for t in trades if t["session"] == "LONDON_MORNING"] ny_h1 = [t for t in h1 if t["session"] == "NY_AFTERNOON"] ny_h2 = [t for t in h2 if t["session"] == "NY_AFTERNOON"] mon_h1 = [t for t in h1 if t["dow"] == 0] mon_h2 = [t for t in h2 if t["dow"] == 0] def row(label, val, ref=None, lo=None, hi=None, warn=None, note=""): if lo is not None: ci_str = f" 95%CI [{lo:.3f}, {hi:.3f}]" else: ci_str = "" col = GREEN if (ref is None or val < ref) else RED if warn: col = YELLOW print(f" {label:<42} {col}{val:.3f}{RST}{ci_str} {DIM}{note}{RST}") print(f" {'Baseline WR':<42} {baseline:.3f}") print() print(f" {BOLD}1. Temporal Stability (H1 / H2){RST}") row(" NY_AFT WR — H1", wr(ny_h1), baseline, note=f"n={len(ny_h1)}") row(" NY_AFT WR — H2", wr(ny_h2), baseline, note=f"n={len(ny_h2)}") row(" Mon WR — H1", wr(mon_h1), baseline, note=f"n={len(mon_h1)}") row(" Mon WR — H2", wr(mon_h2), baseline, note=f"n={len(mon_h2)}") print(f"\n {BOLD}2. Permutation p-values{RST}") ny_pnl = sum(t["pnl"] for t in ny) mon_pnl = sum(t["pnl"] for t in mon) p_ny = permutation_pvalue(trades, -ny_pnl, "session", "NY_AFTERNOON", n_perm=2000) p_mon = permutation_pvalue(trades, -mon_pnl, "dow", 0, n_perm=2000) col_ny = GREEN if p_ny < 0.05 else YELLOW if p_ny < 0.15 else RED col_mon = GREEN if p_mon < 0.05 else YELLOW if p_mon < 0.15 else RED print(f" {'NY_AFT block p-value':<42} {col_ny}{p_ny:.4f}{RST} {DIM}(< 0.05 = significant){RST}") print(f" {'Monday block p-value':<42} {col_mon}{p_mon:.4f}{RST} {DIM}(< 0.15 = directional){RST}") print(f"\n {BOLD}3. Effect Sizes (Cohen's h){RST}") h_ny = cohen_h(wr(ny), baseline) h_mon = cohen_h(wr(mon), baseline) h_ldn = cohen_h(wr(ldn), baseline) for label, h, n_cell in [("NY_AFT", h_ny, len(ny)), ("Monday", h_mon, len(mon)), ("London", h_ldn, len(ldn))]: grade = "large" if h >= 0.8 else "medium" if h >= 0.5 else "small" if h >= 0.2 else "trivial" col = GREEN if h >= 0.5 else YELLOW if h >= 0.2 else RED print(f" {' '+label:<42} {col}{h:.3f}{RST} {DIM}{grade} (n={n_cell}){RST}") print(f"\n {BOLD}4. Bootstrap 95% CIs{RST}") ny_lo, ny_hi = bootstrap_wr_ci(ny, n_boot=3000) col = GREEN if ny_hi < baseline else RED print(f" {'NY_AFT WR CI':<42} {col}[{ny_lo:.3f}, {ny_hi:.3f}]{RST} " f"{DIM}({'below' if ny_hi < baseline else 'overlaps'} baseline {baseline:.3f}){RST}") ny_plo, ny_phi = bootstrap_pnl_ci(ny, n_boot=3000) col = GREEN if ny_phi < 0 else RED print(f" {'NY_AFT net PnL CI':<42} {col}[{ny_plo:+,.0f}, {ny_phi:+,.0f}]{RST} " f"{DIM}({'net loser with confidence' if ny_phi < 0 else 'uncertain sign'}){RST}") print(f"\n {BOLD}5. Bonferroni z-scores (35 cells tested){RST}") se_ny = binomial_se(baseline, len(ny)) se_mon = binomial_se(baseline, len(mon)) z_ny = (baseline - wr(ny)) / se_ny if se_ny > 0 else 0 z_mon = (baseline - wr(mon)) / se_mon if se_mon > 0 else 0 crit = 2.99 # Bonferroni α=0.0014 → z_crit≈2.99 col_ny = GREEN if z_ny > crit else YELLOW if z_ny > 2.0 else RED col_mon = GREEN if z_mon > crit else YELLOW if z_mon > 2.0 else RED print(f" {'NY_AFT z':<42} {col_ny}{z_ny:.2f}{RST} {DIM}(Bonferroni crit ≈ {crit}){RST}") print(f" {'Monday z':<42} {col_mon}{z_mon:.2f}{RST}") print(f"\n {BOLD}6. Walk-Forward: advisory score → H2 WR{RST}") h2s = sorted(h2, key=lambda t: t["score"]) q = max(1, len(h2s) // 4) wr_bot = wr(h2s[:q]) wr_top = wr(h2s[-q:]) col = GREEN if wr_top > wr_bot else RED print(f" {' Top-quartile score WR (H2)':<42} {col}{wr_top:.3f}{RST} {DIM}n={q}{RST}") print(f" {' Bot-quartile score WR (H2)':<42} {col}{wr_bot:.3f}{RST} {DIM}n={q}{RST}") print(f" {' Predictive (top > bot)?':<42} {col}{'YES' if wr_top > wr_bot else 'NO — score overfit'}{RST}") print(f"\n{'═'*68}\n")