initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
871
prod/tests/test_esof_overfit_guard.py
Executable file
871
prod/tests/test_esof_overfit_guard.py
Executable file
@@ -0,0 +1,871 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
EsoF Overfitting Avoidance Test Suite
|
||||
|
||||
Industry-standard statistical tests to guard against overfitting in the
|
||||
EsoF calendar/session gate and the EsoF↔system interaction.
|
||||
|
||||
Why overfitting is a real risk here
|
||||
─────────────────────────────────────
|
||||
We inspected 5 sessions × 7 DoW = 35 cells on a single ~550-trade dataset
|
||||
covering only 3 weeks (2026-03-31 → 2026-04-19). That is:
|
||||
- A short temporal window (one market regime)
|
||||
- Small per-cell sample sizes (median n ≈ 14)
|
||||
- Multiple comparisons (we chose the *worst* cells after looking at all)
|
||||
- No pre-registration (we looked at the data before deciding the gate)
|
||||
|
||||
Any one of these alone warrants caution. Together they demand rigorous testing.
|
||||
|
||||
Tests implemented
|
||||
──────────────────
|
||||
1. TestTemporalStability — H1 vs H2 walk-forward: does the effect hold in both halves?
|
||||
2. TestPermutationSignificance — shuffle session/DoW labels N=2000 times; empirical p-value
|
||||
3. TestMultipleComparison — Bonferroni / FDR correction across all 35 cells
|
||||
4. TestBootstrapCI — 95% CI on WR and net PnL via bootstrap resampling
|
||||
5. TestMinimumSampleSize — flag cells with n < 30 as "insufficient evidence"
|
||||
6. TestEffectSize — Cohen's h on WR difference; require medium+ effect (h ≥ 0.3)
|
||||
7. TestWalkForwardAdvisory — train EsoF tables on H1, evaluate advisory score on H2
|
||||
8. TestAssetBucketStability — NY_AFT / Mon effect must hold across ≥ 2 asset buckets
|
||||
9. TestRegimeConfound — check if session effect is a proxy for ACB beta (regime)
|
||||
|
||||
Run:
|
||||
source /home/dolphin/siloqy_env/bin/activate
|
||||
cd /mnt/dolphinng5_predict
|
||||
python prod/tests/test_esof_overfit_guard.py # full report
|
||||
pytest prod/tests/test_esof_overfit_guard.py -v # pytest mode
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import math
|
||||
import random
|
||||
import sys
|
||||
import urllib.request
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
|
||||
_ROOT = Path(__file__).parent.parent.parent
|
||||
sys.path.insert(0, str(_ROOT))
|
||||
sys.path.insert(0, str(_ROOT / "Observability"))
|
||||
|
||||
from esof_advisor import compute_esof, get_session, BASELINE_WR
|
||||
from esof_gate import get_bucket
|
||||
|
||||
# ── CH helpers ────────────────────────────────────────────────────────────────
|
||||
# ── CH helpers ────────────────────────────────────────────────────────────────
# Connection parameters for the local ClickHouse HTTP interface.
CH_URL = "http://localhost:8123"
CH_USER = "dolphin"
CH_PASS = "dolphin_ch_2026"


def _ch_query(sql: str) -> List[List[str]]:
    """Run *sql* against the local ClickHouse instance and return the rows.

    Uses HTTP basic auth and TabSeparated output; each returned row is a
    list of string fields. An empty response body yields an empty list.
    """
    credentials = base64.b64encode(f"{CH_USER}:{CH_PASS}".encode()).decode()
    request = urllib.request.Request(
        f"{CH_URL}/?database=dolphin&default_format=TabSeparated",
        data=sql.encode(),
        headers={"Authorization": f"Basic {credentials}"},
    )
    with urllib.request.urlopen(request, timeout=10) as response:
        body = response.read().decode().strip()
    if not body:
        return []
    return [line.split('\t') for line in body.split('\n')]
|
||||
|
||||
def _ch_available() -> bool:
    """Probe ClickHouse with a trivial query; True when it is reachable."""
    try:
        _ch_query("SELECT 1")
    except Exception:
        return False
    return True


# Evaluated once at import time; gates every ClickHouse-backed test below.
CH_UP = _ch_available()
|
||||
|
||||
# ── Trade loader (shared with gate test) ──────────────────────────────────────
|
||||
# Module-level cache so the many tests below hit ClickHouse only once.
_CACHED_TRADES: Optional[List[dict]] = None


def fetch_trades() -> List[dict]:
    """Load BLUE trade events from ClickHouse, enriched with EsoF context.

    Each returned dict carries the trade timestamp (UTC), asset, pnl,
    leverage, EsoF session/DoW, advisory score/label, and the asset's
    AEM bucket id. Rows that cannot be parsed are skipped silently.
    Results are cached module-wide after the first call.
    """
    global _CACHED_TRADES
    if _CACHED_TRADES is not None:
        return _CACHED_TRADES

    sql = """
    SELECT
        toUnixTimestamp64Milli(ts) AS ts_ms,
        asset, side, pnl, exit_reason, leverage
    FROM dolphin.trade_events
    WHERE strategy = 'blue'
      AND exit_reason NOT IN ('HIBERNATE_HALT', 'SUBDAY_ACB_NORMALIZATION')
    ORDER BY ts
    """
    rows = _ch_query(sql)

    # Bucket assignments are optional best-effort: stay None if the pickle
    # is missing or unreadable (get_bucket must tolerate a None map).
    assignments: Optional[Dict[str, int]] = None
    try:
        import pickle
        with open(_ROOT / "adaptive_exit/models/bucket_assignments.pkl", 'rb') as f:
            assignments = pickle.load(f).get('assignments', {})
    except Exception:
        pass

    result: List[dict] = []
    for row in rows:
        if len(row) < 6:
            continue
        try:
            ts_ms = int(row[0])
            asset = row[1]
            pnl = float(row[3])
            leverage = float(row[5])
        except (ValueError, IndexError):
            continue
        ts = datetime.fromtimestamp(ts_ms / 1000.0, tz=timezone.utc)
        advisory = compute_esof(ts)
        result.append({
            "ts": ts,
            "asset": asset,
            "pnl": pnl,
            "leverage": leverage,
            "session": advisory["session"],
            "dow": advisory["dow"],
            "score": advisory["advisory_score"],
            "label": advisory["advisory_label"],
            "bucket_id": get_bucket(asset, assignments),
        })
    _CACHED_TRADES = result
    return result
|
||||
|
||||
|
||||
# ── Statistical primitives ────────────────────────────────────────────────────
|
||||
|
||||
def wr(trades: List[dict]) -> float:
    """Win rate: fraction of trades with pnl > 0. NaN for an empty list."""
    if not trades:
        return float("nan")
    return sum(1 for t in trades if t["pnl"] > 0) / len(trades)


def net_pnl(trades: List[dict]) -> float:
    """Net PnL: sum of per-trade pnl (0 for an empty list)."""
    return sum(t["pnl"] for t in trades)


def cohen_h(p1: float, p2: float) -> float:
    """Cohen's h effect size for two proportions. |h| ≥ 0.2 small, 0.5 medium, 0.8 large."""
    return abs(2 * math.asin(math.sqrt(p1)) - 2 * math.asin(math.sqrt(p2)))


def binomial_se(p: float, n: int) -> float:
    """Standard error of a proportion estimate at sample size n (inf when n <= 0)."""
    return math.sqrt(p * (1 - p) / n) if n > 0 else float("inf")


def _bootstrap_ci(trades: List[dict], stat, n_boot: int, ci: float) -> Tuple[float, float]:
    """Percentile bootstrap CI of *stat* over *trades*. Returns (lower, upper).

    Resamples with replacement n_boot times (fixed seed 42 for reproducible
    test runs) and takes the two-sided *ci* percentile bounds of the sorted
    statistic values. The upper index is clamped to n_boot - 1 so ci values
    at or near 1.0 cannot index one past the end of the sample list.
    """
    rng = random.Random(42)
    n = len(trades)
    samples = [stat([rng.choice(trades) for _ in range(n)]) for _ in range(n_boot)]
    samples.sort()
    lo = int((1 - ci) / 2 * n_boot)
    hi = min(int((1 + ci) / 2 * n_boot), n_boot - 1)
    return samples[lo], samples[hi]


def bootstrap_wr_ci(trades: List[dict], n_boot: int = 5000, ci: float = 0.95) -> Tuple[float, float]:
    """Bootstrap CI on WR. Returns (lower, upper)."""
    return _bootstrap_ci(trades, wr, n_boot, ci)


def bootstrap_pnl_ci(trades: List[dict], n_boot: int = 5000, ci: float = 0.95) -> Tuple[float, float]:
    """Bootstrap CI on net PnL. Returns (lower, upper)."""
    return _bootstrap_ci(trades, net_pnl, n_boot, ci)
|
||||
|
||||
def permutation_pvalue(
    trades: List[dict],
    observed_delta: float,
    label_key: str,
    blocked_label,
    n_perm: int = 2000,
    seed: int = 42,
) -> float:
    """
    Permutation test: shuffle label_key randomly, compute strategy improvement
    each time. Return fraction of permutations that produce >= observed_delta.
    observed_delta > 0 means "blocking blocked_label improved PnL".
    """
    rng = random.Random(seed)
    labels = [t[label_key] for t in trades]
    pnls = [t["pnl"] for t in trades]
    hits = 0
    for _ in range(n_perm):
        rng.shuffle(labels)
        # PnL that would have been avoided under this random labelling;
        # negating it gives the improvement from blocking those trades.
        blocked_pnl = sum(p for lbl, p in zip(labels, pnls) if lbl == blocked_label)
        if -blocked_pnl >= observed_delta:
            hits += 1
    return hits / n_perm
|
||||
|
||||
|
||||
# ═════════════════════════════════════════════════════════════════════════════
|
||||
# TEST CLASSES
|
||||
# ═════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Shared skip marker: every data-backed test is skipped when ClickHouse is down.
skip_no_ch = pytest.mark.skipif(not CH_UP, reason="ClickHouse not available")


class TestTemporalStability:
    """
    Walk-forward: split chronologically into H1 (first 50%) and H2 (last 50%).
    Session and DoW effects must appear in BOTH halves to be considered real.
    If present in only one half → data snooping artifact.
    """

    @skip_no_ch
    def test_ny_afternoon_negative_in_h1_and_h2(self):
        trades = fetch_trades()
        n = len(trades)
        # Chronological split (fetch_trades orders by ts), not a random split.
        h1 = trades[:n // 2]
        h2 = trades[n // 2:]

        ny_h1 = [t for t in h1 if t["session"] == "NY_AFTERNOON"]
        ny_h2 = [t for t in h2 if t["session"] == "NY_AFTERNOON"]

        # Baseline WR is computed per-half so a regime shift between halves
        # does not masquerade as a session effect.
        base_h1 = wr(h1)
        base_h2 = wr(h2)

        assert len(ny_h1) >= 10, f"H1 NY_AFT too small: n={len(ny_h1)}"
        assert len(ny_h2) >= 10, f"H2 NY_AFT too small: n={len(ny_h2)}"

        # NY_AFTERNOON WR must be below baseline in BOTH halves
        wr_h1 = wr(ny_h1)
        wr_h2 = wr(ny_h2)
        assert wr_h1 < base_h1, (
            f"NY_AFT drag missing in H1: WR_NYA={wr_h1:.3f} >= baseline={base_h1:.3f}"
        )
        assert wr_h2 < base_h2, (
            f"NY_AFT drag missing in H2: WR_NYA={wr_h2:.3f} >= baseline={base_h2:.3f}"
        )

    @skip_no_ch
    def test_monday_negative_in_h1_and_h2(self):
        trades = fetch_trades()
        n = len(trades)
        h1 = trades[:n // 2]
        h2 = trades[n // 2:]

        # dow == 0 is Monday (datetime.weekday() convention).
        mon_h1 = [t for t in h1 if t["dow"] == 0]
        mon_h2 = [t for t in h2 if t["dow"] == 0]

        # Monday sample is thin — require at least 10 in each half
        if len(mon_h1) < 10 or len(mon_h2) < 10:
            pytest.skip(f"Monday sample too thin for walk-forward: H1={len(mon_h1)}, H2={len(mon_h2)}")

        assert wr(mon_h1) < wr(h1), "Monday drag absent in H1"
        assert wr(mon_h2) < wr(h2), "Monday drag absent in H2"

    @skip_no_ch
    def test_strategy_e_positive_in_both_halves(self):
        """Combined gate (Mon+NY_AFT) must improve PnL in H1 AND H2 independently."""
        trades = fetch_trades()
        n = len(trades)
        h1 = trades[:n // 2]
        h2 = trades[n // 2:]

        # PnL of the subset that survives the combined gate.
        def gate_e_pnl(subset):
            return sum(t["pnl"] for t in subset
                       if t["dow"] != 0 and t["session"] != "NY_AFTERNOON")

        def base_pnl(subset):
            return sum(t["pnl"] for t in subset)

        # Gated PnL > ungated PnL ⟺ the blocked trades were net losers.
        assert gate_e_pnl(h1) > base_pnl(h1), "Strategy E degrades H1"
        assert gate_e_pnl(h2) > base_pnl(h2), "Strategy E degrades H2"
|
||||
|
||||
|
||||
class TestPermutationSignificance:
    """
    Permutation test: shuffle session / DoW labels randomly.
    The observed improvement from blocking must rank in the top 5%
    of the null distribution (p < 0.05) to be considered non-random.
    """

    @skip_no_ch
    def test_ny_afternoon_block_is_significant(self):
        trades = fetch_trades()
        ny_pnl = sum(t["pnl"] for t in trades if t["session"] == "NY_AFTERNOON")
        observed_delta = -ny_pnl  # gain from skipping NY_AFT trades

        p = permutation_pvalue(trades, observed_delta, "session", "NY_AFTERNOON",
                               n_perm=2000)
        assert p < 0.05, (
            f"NY_AFTERNOON block not significant: p={p:.3f} >= 0.05. "
            f"Effect may be noise at this sample size."
        )

    @skip_no_ch
    def test_monday_block_significance(self):
        trades = fetch_trades()
        mon_pnl = sum(t["pnl"] for t in trades if t["dow"] == 0)
        observed_delta = -mon_pnl

        p = permutation_pvalue(trades, observed_delta, "dow", 0, n_perm=2000)
        # Monday has fewer trades — use looser threshold (p < 0.15)
        # Flag as WARNING not FAIL if p >= 0.05: thin sample, directionally valid
        if p >= 0.05:
            print(f"\n WARN: Monday block p={p:.3f} >= 0.05. "
                  f"Directionally valid but underpowered (n={sum(1 for t in trades if t['dow']==0)}).")
        assert p < 0.15, (
            f"Monday block not even marginally significant: p={p:.3f}. "
            f"Gate should not be applied until more data accumulates."
        )

    @skip_no_ch
    def test_london_morning_block_would_hurt(self):
        """Blocking LONDON_MORNING (the BEST session) must NOT improve PnL."""
        trades = fetch_trades()
        ldn_pnl = sum(t["pnl"] for t in trades if t["session"] == "LONDON_MORNING")
        observed_delta = -ldn_pnl  # gain from blocking LDN (expect negative = harmful)

        # LDN is net-positive, so blocking it is harmful (delta < 0).
        # No permutation needed — this is a sanity check on the sign alone.
        assert observed_delta < 0, (
            f"Blocking LONDON_MORNING should HURT PnL (it is the best session). "
            f"Got delta={observed_delta:.2f}. Check data integrity."
        )
|
||||
|
||||
|
||||
class TestMultipleComparison:
    """
    Multiple comparison correction.
    We inspected 5 sessions × 7 DoW = 35 cells. Finding 'significant' cells
    after inspection requires Bonferroni correction: α_adj = 0.05 / 35 ≈ 0.0014.
    Only cells where WR deviation is large enough to survive Bonferroni should
    be used in the gate.

    We test: do our chosen cells (NY_AFT, Monday) survive Bonferroni?
    Using a binomial z-test as a proxy for the corrected p-value.
    """

    @skip_no_ch
    def test_ny_afternoon_survives_bonferroni(self):
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        n = len(ny)
        baseline = wr(trades)
        wr_ny = wr(ny)

        # SE of a baseline-rate proportion at the cell's sample size.
        se = binomial_se(baseline, n)
        z = (baseline - wr_ny) / se if se > 0 else 0
        # One-tailed z for Bonferroni α=0.0014: z_crit ≈ 2.99
        # We use 2.0 as a practical threshold (more conservative than 1.96 but
        # less strict than Bonferroni, given 3-week sample inherent limitations)
        assert z > 2.0, (
            f"NY_AFTERNOON WR deviation (z={z:.2f}) does not survive "
            f"multiple-comparison correction. n={n}, WR={wr_ny:.3f} vs base={baseline:.3f}."
        )

    @skip_no_ch
    def test_monday_bonferroni_warning(self):
        trades = fetch_trades()
        mon = [t for t in trades if t["dow"] == 0]
        n = len(mon)
        baseline = wr(trades)
        wr_mon = wr(mon)

        se = binomial_se(baseline, n)
        z = (baseline - wr_mon) / se if se > 0 else 0

        # Monday: warn if z < 2.0 (doesn't survive strict Bonferroni)
        if z < 2.0:
            print(f"\n WARN: Monday z={z:.2f} < 2.0. Does not survive Bonferroni "
                  f"at current sample (n={n}). Apply Monday gate cautiously.")
        # Require at least z > 1.0 (directional signal, not pure noise)
        assert z > 1.0, (
            f"Monday WR deviation is indistinguishable from noise: z={z:.2f}. "
            f"Do not gate Monday until more trades accumulate."
        )

    @skip_no_ch
    def test_no_spurious_best_cell_used_as_gate(self):
        """
        Best-cell cherry-pick guard: the SINGLE best-performing cell in the dataset
        must NOT be treated as a reliable gate without Bonferroni correction.
        Test: find the best WR cell (n >= 10), check that its deviation is NOT
        significantly larger than the worst cell — both could be noise extremes.
        """
        trades = fetch_trades()
        # Cell key = (day of week, session) — the 35-cell grid from the docstring.
        cells: Dict[Tuple, List[dict]] = defaultdict(list)
        for t in trades:
            cells[(t["dow"], t["session"])].append(t)

        valid = [(k, v) for k, v in cells.items() if len(v) >= 10]
        if len(valid) < 5:
            pytest.skip("Not enough cells with n >= 10")

        wrs = [(k, wr(v), len(v)) for k, v in valid]
        best = max(wrs, key=lambda x: x[1])
        worst = min(wrs, key=lambda x: x[1])

        baseline = wr(trades)
        se_best = binomial_se(baseline, best[2])
        se_worst = binomial_se(baseline, worst[2])

        z_best = (best[1] - baseline) / se_best if se_best > 0 else 0
        z_worst = (baseline - worst[1]) / se_worst if se_worst > 0 else 0

        # Both extremes should be similarly significant (or not).
        # If best is >3σ but worst is <1σ, something is asymmetric — flag it.
        # Acceptable: both extremes are significant OR both are marginal.
        # NOTE: when z_worst <= 0.1 the ratio is inf and the assert fails by
        # design — a negligible worst-cell deviation makes any best-cell
        # extreme suspect.
        ratio = z_best / z_worst if z_worst > 0.1 else float("inf")
        assert ratio < 5.0, (
            f"Asymmetric cell extremes: z_best={z_best:.2f} vs z_worst={z_worst:.2f}. "
            f"Best cell ({best[0]}) may be a cherry-pick artifact."
        )
|
||||
|
||||
|
||||
class TestBootstrapCI:
    """
    Bootstrap confidence intervals on WR for each gated segment.
    The 95% CI upper bound for NY_AFTERNOON WR must be below baseline WR.
    If the CI overlaps the baseline, the effect is not reliable.
    """

    @skip_no_ch
    def test_ny_afternoon_ci_below_baseline(self):
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]

        assert len(ny) >= 20, f"NY_AFT sample too small for bootstrap: n={len(ny)}"

        # Only the upper bound matters: it must clear the baseline from below.
        _, upper = bootstrap_wr_ci(ny, n_boot=3000)
        baseline = wr(trades)

        assert upper < baseline, (
            f"NY_AFTERNOON WR CI upper bound ({upper:.3f}) overlaps baseline "
            f"WR ({baseline:.3f}). Effect not reliable at 95% confidence."
        )

    @skip_no_ch
    def test_london_morning_ci_above_baseline(self):
        trades = fetch_trades()
        ldn = [t for t in trades if t["session"] == "LONDON_MORNING"]

        assert len(ldn) >= 20, f"LDN sample too small: n={len(ldn)}"

        lower, _ = bootstrap_wr_ci(ldn, n_boot=3000)
        baseline = wr(trades)

        # Softer criterion than NY_AFT: the lower bound only needs to stay
        # within 5% below baseline, since LDN is an advantage claim not a gate.
        assert lower > baseline * 0.95, (
            f"LONDON_MORNING WR CI lower bound ({lower:.3f}) is too far below "
            f"baseline ({baseline:.3f}). LDN advantage may not be reliable."
        )

    @skip_no_ch
    def test_ny_afternoon_pnl_ci_negative(self):
        """Net PnL CI for NY_AFTERNOON must have upper bound < 0 (net loser with confidence)."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]

        assert len(ny) >= 20

        _, upper = bootstrap_pnl_ci(ny, n_boot=3000)
        assert upper < 0, (
            f"NY_AFTERNOON net PnL CI upper bound is {upper:.2f} > 0. "
            f"Cannot confidently call it a net loser at current sample size."
        )
|
||||
|
||||
|
||||
class TestMinimumSampleSize:
    """
    Minimum sample size guard. No session or DoW factor should influence
    the advisory score unless it has n >= 30 trades. Below 30, the WR
    estimate has SE > 9pp (too noisy to act on).
    """

    @skip_no_ch
    def test_all_gate_factors_have_sufficient_n(self):
        """
        The two gated factors (NY_AFTERNOON, Monday) must each have n >= 30
        in the current dataset for the gate to be considered valid.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        mon = [t for t in trades if t["dow"] == 0]

        assert len(ny) >= 30, f"NY_AFTERNOON n={len(ny)} < 30. Gate underpowered."
        assert len(mon) >= 30, f"Monday n={len(mon)} < 30. Gate underpowered."

    @skip_no_ch
    def test_slot_15m_gate_would_be_overfit(self):
        """
        15-minute slot data has median n ≈ 7. Any slot-level gate applied
        directly would be extreme overfitting. Verify: majority of slots have n < 30.
        """
        trades = fetch_trades()
        # Count trades per 15-minute slot of the day ("H:MM" keys).
        slots: Dict[str, int] = defaultdict(int)
        for t in trades:
            h = t["ts"].hour
            m = (t["ts"].minute // 15) * 15
            slots[f"{h}:{m:02d}"] += 1

        n_thin = sum(1 for n in slots.values() if n < 30)
        frac = n_thin / len(slots) if slots else 1.0

        # Inverted expectation: the test PASSES when slots are underpowered,
        # documenting that a slot-level gate would be overfit.
        assert frac > 0.70, (
            f"Only {frac:.0%} of 15m slots have n < 30. "
            f"Expected most slots to be underpowered — if not, slot gate may be premature."
        )

    def test_advisory_score_weights_reflect_sample_size(self):
        """
        Slot weight (0.10) must be lower than session (0.25) and DoW (0.30).
        Ensures the weakest-sample factor has the lowest influence.
        """
        # Imported here so this test runs even without ClickHouse.
        from esof_advisor import SESSION_STATS, DOW_STATS, SLOT_STATS
        # v[0] is assumed to be the per-cell sample count — TODO confirm against esof_advisor.
        median_session_n = sorted([v[0] for v in SESSION_STATS.values()])[len(SESSION_STATS) // 2]
        median_dow_n = sorted([v[0] for v in DOW_STATS.values()])[len(DOW_STATS) // 2]
        median_slot_n = sorted([v[0] for v in SLOT_STATS.values()])[len(SLOT_STATS) // 2]

        assert median_slot_n < median_session_n, "Slot n should be < session n"
        assert median_slot_n < median_dow_n, "Slot n should be < DoW n"
        # Slot weight is 0.10, session 0.25, DoW 0.30 — smaller n = smaller weight
        # NOTE(review): these constants mirror esof_advisor's weights by hand;
        # if the advisor's weights change, this test will not notice.
        SLOT_WEIGHT = 0.10
        SESSION_WEIGHT = 0.25
        DOW_WEIGHT = 0.30
        assert SLOT_WEIGHT < SESSION_WEIGHT
        assert SLOT_WEIGHT < DOW_WEIGHT
|
||||
|
||||
|
||||
class TestEffectSize:
    """
    Cohen's h effect size on WR differences.
    |h| >= 0.2: small effect (minimum threshold to consider gating)
    |h| >= 0.5: medium effect (comfortable to gate)
    |h| >= 0.8: large effect (very strong signal)
    """

    @skip_no_ch
    def test_ny_afternoon_effect_size_medium(self):
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        baseline = wr(trades)
        h = cohen_h(wr(ny), baseline)
        # Gate requires at least a small effect (h >= 0.2).
        assert h >= 0.2, (
            f"NY_AFTERNOON effect size h={h:.3f} < 0.2 (small). "
            f"Signal too weak to justify gating."
        )

    @skip_no_ch
    def test_london_morning_effect_size_positive(self):
        trades = fetch_trades()
        ldn = [t for t in trades if t["session"] == "LONDON_MORNING"]
        baseline = wr(trades)
        # cohen_h returns an absolute value, so h >= 0 is a NaN/sanity check,
        # not a direction check.
        h = cohen_h(wr(ldn), baseline)
        assert h >= 0.0, "LDN effect size must be measurable"

    @skip_no_ch
    def test_dow_tuesday_effect_size(self):
        """Tuesday is the best DoW. Effect size must be positive."""
        trades = fetch_trades()
        tue = [t for t in trades if t["dow"] == 1]
        baseline = wr(trades)
        if len(tue) < 10:
            pytest.skip("Tuesday sample too thin")
        h = cohen_h(wr(tue), baseline)
        assert h >= 0.0, "Tuesday must show positive effect"

    @skip_no_ch
    def test_effect_size_ranking_matches_expectation(self):
        """
        NY_AFTERNOON effect size must be larger than LOW_LIQUIDITY effect size.
        NY_AFT has more trades and a larger WR gap — should show stronger signal.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        low = [t for t in trades if t["session"] == "LOW_LIQUIDITY"]
        base = wr(trades)

        # Cells below n=10 contribute h=0 rather than a noisy estimate.
        h_ny = cohen_h(wr(ny), base) if len(ny) >= 10 else 0
        h_low = cohen_h(wr(low), base) if len(low) >= 10 else 0

        # NY_AFTERNOON has 3× the sample of LOW_LIQ — effect should be at least as large
        assert h_ny >= h_low * 0.7, (
            f"NY_AFT h={h_ny:.3f} much smaller than LOW_LIQ h={h_low:.3f}. "
            f"Unexpected — check data."
        )
|
||||
|
||||
|
||||
class TestWalkForwardAdvisory:
    """
    Walk-forward advisory score validation.
    Train EsoF tables conceptually on H1 (we use the existing static tables as proxy).
    Evaluate: does the advisory score computed at H2 trade times predict H2 outcomes?

    Method: within H2, rank trades by advisory_score. The bottom quartile (most
    negative score) should have lower WR than the top quartile. If the score
    has no predictive power on OOS data, it is overfit to the in-sample period.
    """

    @skip_no_ch
    def test_score_predicts_wr_direction_in_h2(self):
        trades = fetch_trades()
        n = len(trades)
        # Out-of-sample half only, ranked by advisory score ascending.
        h2 = sorted(trades[n // 2:], key=lambda t: t["score"])

        if len(h2) < 40:
            pytest.skip(f"H2 too small for quartile split: n={len(h2)}")

        q = len(h2) // 4
        bottom = h2[:q]   # worst advisory scores
        top = h2[-q:]     # best advisory scores

        wr_bot = wr(bottom)
        wr_top = wr(top)

        assert wr_top > wr_bot, (
            f"Advisory score has no directional predictive power in H2: "
            f"WR_top={wr_top:.3f} WR_bot={wr_bot:.3f}. Score may be overfit."
        )

    @skip_no_ch
    def test_unfavorable_label_has_lower_wr_in_h2(self):
        trades = fetch_trades()
        n = len(trades)
        h2 = trades[n // 2:]

        unfav = [t for t in h2 if t["label"] == "UNFAVORABLE"]
        rest = [t for t in h2 if t["label"] != "UNFAVORABLE"]

        if len(unfav) < 5:
            pytest.skip(f"Too few UNFAVORABLE trades in H2: n={len(unfav)}")

        # 5pp tolerance: the label must not be clearly WORSE than random;
        # strict inferiority is not demanded at this sample size.
        assert wr(unfav) <= wr(rest) + 0.05, (
            f"UNFAVORABLE label does not predict lower WR in H2: "
            f"WR_unfav={wr(unfav):.3f} vs WR_rest={wr(rest):.3f}. "
            f"Advisory label may be overfit."
        )
|
||||
|
||||
|
||||
class TestAssetBucketStability:
    """
    The session/DoW effect must not be driven by a single asset bucket.
    If NY_AFTERNOON drag is entirely explained by, say, B4 trades clustering
    in that session, the gate is actually gating B4 by proxy — not time.
    The effect must hold across at least 2 independent buckets.
    """

    @skip_no_ch
    def test_ny_afternoon_drag_cross_bucket(self):
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        not_ny = [t for t in trades if t["session"] != "NY_AFTERNOON"]

        # Group in-session and out-of-session trades by AEM bucket id.
        by_bucket_ny = defaultdict(list)
        by_bucket_out = defaultdict(list)
        for t in ny:
            by_bucket_ny[t["bucket_id"]].append(t)
        for t in not_ny:
            by_bucket_out[t["bucket_id"]].append(t)

        # Count buckets where NY_AFT WR is below out-of-session WR
        n_confirming = 0
        for bkt in by_bucket_ny:
            # Require at least 5 trades on each side for the comparison to count.
            if len(by_bucket_ny[bkt]) < 5 or len(by_bucket_out.get(bkt, [])) < 5:
                continue
            if wr(by_bucket_ny[bkt]) < wr(by_bucket_out[bkt]):
                n_confirming += 1

        assert n_confirming >= 2, (
            f"NY_AFT drag only confirmed in {n_confirming} bucket(s). "
            f"Need ≥ 2 for effect to be session-driven, not bucket-confounded."
        )

    @skip_no_ch
    def test_monday_drag_cross_bucket(self):
        trades = fetch_trades()
        mon = [t for t in trades if t["dow"] == 0]
        not_mon = [t for t in trades if t["dow"] != 0]

        by_bkt_mon = defaultdict(list)
        by_bkt_out = defaultdict(list)
        for t in mon:
            by_bkt_mon[t["bucket_id"]].append(t)
        for t in not_mon:
            by_bkt_out[t["bucket_id"]].append(t)

        n_confirming = 0
        for bkt in by_bkt_mon:
            if len(by_bkt_mon[bkt]) < 5 or len(by_bkt_out.get(bkt, [])) < 5:
                continue
            if wr(by_bkt_mon[bkt]) < wr(by_bkt_out[bkt]):
                n_confirming += 1

        if n_confirming < 2:
            print(f"\n WARN: Monday drag only in {n_confirming} bucket(s). "
                  f"Thin sample — cannot confirm cross-bucket. Gate with caution.")
        # Soft assert: Monday has thinner sample, require at least 1
        assert n_confirming >= 1, (
            f"Monday drag not present in ANY bucket. "
            f"Likely a sampling artifact — do not gate Monday."
        )
|
||||
|
||||
|
||||
class TestRegimeConfound:
    """
    Regime confound check: is the session effect just a proxy for ACB beta?
    If all NY_AFTERNOON trades happen to coincide with low ACB beta (bearish
    regime), then blocking NY_AFT is actually blocking bear-regime trades,
    not session-specific trades. The gate would be redundant with ACB.

    Method: compare ACB leverage (proxy for regime strength) between
    NY_AFTERNOON and other sessions. If leverage distributions are
    significantly different, the session effect is partially confounded.
    """

    @skip_no_ch
    def test_ny_afternoon_leverage_not_systematically_different(self):
        """
        NY_AFTERNOON avg leverage should be within 20% of other sessions' avg leverage.
        Large divergence → session effect may be a regime proxy.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        not_ny = [t for t in trades if t["session"] != "NY_AFTERNOON"]

        if len(ny) < 10 or len(not_ny) < 10:
            pytest.skip("Insufficient data for leverage comparison")

        avg_lev_ny = sum(t["leverage"] for t in ny) / len(ny)
        avg_lev_out = sum(t["leverage"] for t in not_ny) / len(not_ny)

        # ratio ~1.0 means NY_AFT leverage is regime-neutral vs other sessions.
        ratio = avg_lev_ny / avg_lev_out if avg_lev_out > 0 else 1.0

        assert 0.80 <= ratio <= 1.20, (
            f"NY_AFTERNOON avg leverage ({avg_lev_ny:.2f}x) differs by >{20}% "
            f"from other sessions ({avg_lev_out:.2f}x). "
            f"Session effect may be a regime-proxy — investigate confound."
        )

    @skip_no_ch
    def test_ny_afternoon_wr_negative_across_leverage_bands(self):
        """
        Regime confound falsification: split NY_AFT trades into high/low leverage.
        If NY_AFT drag holds in BOTH leverage bands, it is NOT purely a regime effect.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]

        if len(ny) < 20:
            pytest.skip(f"NY_AFT too small for leverage split: n={len(ny)}")

        # Median split of in-session trades by leverage.
        median_lev = sorted(t["leverage"] for t in ny)[len(ny) // 2]
        hi_lev = [t for t in ny if t["leverage"] >= median_lev]
        lo_lev = [t for t in ny if t["leverage"] < median_lev]
        baseline = wr(fetch_trades())

        # Bands below n=5 default to True (counted as confirming).
        hi_below = wr(hi_lev) < baseline if len(hi_lev) >= 5 else True
        lo_below = wr(lo_lev) < baseline if len(lo_lev) >= 5 else True

        # NOTE(review): the docstring says drag should hold in BOTH bands, but
        # the assertion only requires at least ONE band (fails only when the
        # drag is absent in both) — confirm whether `or` is the intended bar.
        assert hi_below or lo_below, (
            "NY_AFT drag absent in BOTH leverage bands — effect is not regime-independent. "
            "Gate may be a regime proxy."
        )
|
||||
|
||||
|
||||
# ═════════════════════════════════════════════════════════════════════════════
|
||||
# STANDALONE REPORT
|
||||
# ═════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# ANSI escape codes for the standalone terminal report.
GREEN = "\033[32m"; RED = "\033[31m"; YELLOW = "\033[33m"
BOLD = "\033[1m"; DIM = "\033[2m"; RST = "\033[0m"

if __name__ == "__main__":
    # Standalone mode: print a human-readable summary of every guard metric
    # instead of running under pytest.
    if not CH_UP:
        print(f"{RED}ClickHouse not available.{RST}")
        sys.exit(1)

    trades = fetch_trades()
    n = len(trades)
    # Chronological halves for the walk-forward sections.
    h1, h2 = trades[:n // 2], trades[n // 2:]

    print(f"\n{BOLD}{'═'*68}{RST}")
    print(f"{BOLD} EsoF Overfitting Guard Report ({n} trades){RST}")
    print(f"{'═'*68}\n")

    baseline = wr(trades)
    ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
    mon = [t for t in trades if t["dow"] == 0]
    ldn = [t for t in trades if t["session"] == "LONDON_MORNING"]

    ny_h1 = [t for t in h1 if t["session"] == "NY_AFTERNOON"]
    ny_h2 = [t for t in h2 if t["session"] == "NY_AFTERNOON"]
    mon_h1 = [t for t in h1 if t["dow"] == 0]
    mon_h2 = [t for t in h2 if t["dow"] == 0]

    def row(label, val, ref=None, lo=None, hi=None, warn=None, note=""):
        # Print one aligned report line; green when val < ref (drag present),
        # red otherwise, yellow when warn is truthy.
        if lo is not None:
            ci_str = f" 95%CI [{lo:.3f}, {hi:.3f}]"
        else:
            ci_str = ""
        col = GREEN if (ref is None or val < ref) else RED
        if warn:
            col = YELLOW
        print(f" {label:<42} {col}{val:.3f}{RST}{ci_str} {DIM}{note}{RST}")

    print(f" {'Baseline WR':<42} {baseline:.3f}")
    print()

    print(f" {BOLD}1. Temporal Stability (H1 / H2){RST}")
    row(" NY_AFT WR — H1", wr(ny_h1), baseline, note=f"n={len(ny_h1)}")
    row(" NY_AFT WR — H2", wr(ny_h2), baseline, note=f"n={len(ny_h2)}")
    row(" Mon WR — H1", wr(mon_h1), baseline, note=f"n={len(mon_h1)}")
    row(" Mon WR — H2", wr(mon_h2), baseline, note=f"n={len(mon_h2)}")

    print(f"\n {BOLD}2. Permutation p-values{RST}")
    ny_pnl = sum(t["pnl"] for t in ny)
    mon_pnl = sum(t["pnl"] for t in mon)
    p_ny = permutation_pvalue(trades, -ny_pnl, "session", "NY_AFTERNOON", n_perm=2000)
    p_mon = permutation_pvalue(trades, -mon_pnl, "dow", 0, n_perm=2000)
    col_ny = GREEN if p_ny < 0.05 else YELLOW if p_ny < 0.15 else RED
    col_mon = GREEN if p_mon < 0.05 else YELLOW if p_mon < 0.15 else RED
    print(f" {'NY_AFT block p-value':<42} {col_ny}{p_ny:.4f}{RST} {DIM}(< 0.05 = significant){RST}")
    print(f" {'Monday block p-value':<42} {col_mon}{p_mon:.4f}{RST} {DIM}(< 0.15 = directional){RST}")

    print(f"\n {BOLD}3. Effect Sizes (Cohen's h){RST}")
    h_ny = cohen_h(wr(ny), baseline)
    h_mon = cohen_h(wr(mon), baseline)
    h_ldn = cohen_h(wr(ldn), baseline)
    for label, h, n_cell in [("NY_AFT", h_ny, len(ny)), ("Monday", h_mon, len(mon)), ("London", h_ldn, len(ldn))]:
        grade = "large" if h >= 0.8 else "medium" if h >= 0.5 else "small" if h >= 0.2 else "trivial"
        col = GREEN if h >= 0.5 else YELLOW if h >= 0.2 else RED
        print(f" {' '+label:<42} {col}{h:.3f}{RST} {DIM}{grade} (n={n_cell}){RST}")

    print(f"\n {BOLD}4. Bootstrap 95% CIs{RST}")
    ny_lo, ny_hi = bootstrap_wr_ci(ny, n_boot=3000)
    col = GREEN if ny_hi < baseline else RED
    print(f" {'NY_AFT WR CI':<42} {col}[{ny_lo:.3f}, {ny_hi:.3f}]{RST} "
          f"{DIM}({'below' if ny_hi < baseline else 'overlaps'} baseline {baseline:.3f}){RST}")
    ny_plo, ny_phi = bootstrap_pnl_ci(ny, n_boot=3000)
    col = GREEN if ny_phi < 0 else RED
    print(f" {'NY_AFT net PnL CI':<42} {col}[{ny_plo:+,.0f}, {ny_phi:+,.0f}]{RST} "
          f"{DIM}({'net loser with confidence' if ny_phi < 0 else 'uncertain sign'}){RST}")

    print(f"\n {BOLD}5. Bonferroni z-scores (35 cells tested){RST}")
    se_ny = binomial_se(baseline, len(ny))
    se_mon = binomial_se(baseline, len(mon))
    z_ny = (baseline - wr(ny)) / se_ny if se_ny > 0 else 0
    z_mon = (baseline - wr(mon)) / se_mon if se_mon > 0 else 0
    crit = 2.99  # Bonferroni α=0.0014 → z_crit≈2.99
    col_ny = GREEN if z_ny > crit else YELLOW if z_ny > 2.0 else RED
    col_mon = GREEN if z_mon > crit else YELLOW if z_mon > 2.0 else RED
    print(f" {'NY_AFT z':<42} {col_ny}{z_ny:.2f}{RST} {DIM}(Bonferroni crit ≈ {crit}){RST}")
    print(f" {'Monday z':<42} {col_mon}{z_mon:.2f}{RST}")

    print(f"\n {BOLD}6. Walk-Forward: advisory score → H2 WR{RST}")
    h2s = sorted(h2, key=lambda t: t["score"])
    q = max(1, len(h2s) // 4)
    wr_bot = wr(h2s[:q])
    wr_top = wr(h2s[-q:])
    col = GREEN if wr_top > wr_bot else RED
    print(f" {' Top-quartile score WR (H2)':<42} {col}{wr_top:.3f}{RST} {DIM}n={q}{RST}")
    print(f" {' Bot-quartile score WR (H2)':<42} {col}{wr_bot:.3f}{RST} {DIM}n={q}{RST}")
    print(f" {' Predictive (top > bot)?':<42} {col}{'YES' if wr_top > wr_bot else 'NO — score overfit'}{RST}")

    print(f"\n{'═'*68}\n")
|
||||
Reference in New Issue
Block a user