Files
DOLPHIN/prod/tests/test_esof_overfit_guard.py

872 lines
35 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
EsoF Overfitting Avoidance Test Suite
Industry-standard statistical tests to guard against overfitting in the
EsoF calendar/session gate and the EsoFsystem interaction.
Why overfitting is a real risk here
We inspected 5 sessions × 7 DoW = 35 cells on a single ~550-trade dataset
covering only 3 weeks (2026-03-31 → 2026-04-19). That is:
- A short temporal window (one market regime)
- Small per-cell sample sizes (median n ≈ 14)
- Multiple comparisons (we chose the *worst* cells after looking at all)
- No pre-registration (we looked at the data before deciding the gate)
Any one of these alone warrants caution. Together they demand rigorous testing.
Tests implemented
1. TestTemporalStability — H1 vs H2 walk-forward: does the effect hold in both halves?
2. TestPermutationSignificance — shuffle session/DoW labels N=2000 times; empirical p-value
3. TestMultipleComparison — Bonferroni / FDR correction across all 35 cells
4. TestBootstrapCI — 95% CI on WR and net PnL via bootstrap resampling
5. TestMinimumSampleSize — flag cells with n < 30 as "insufficient evidence"
6. TestEffectSize — Cohen's h on WR difference; require medium+ effect (h ≥ 0.3)
7. TestWalkForwardAdvisory — train EsoF tables on H1, evaluate advisory score on H2
8. TestAssetBucketStability — NY_AFT / Mon effect must hold across 2 asset buckets
9. TestRegimeConfound — check if session effect is a proxy for ACB beta (regime)
Run:
source /home/dolphin/siloqy_env/bin/activate
cd /mnt/dolphinng5_predict
python prod/tests/test_esof_overfit_guard.py # full report
pytest prod/tests/test_esof_overfit_guard.py -v # pytest mode
"""
from __future__ import annotations
import base64
import math
import random
import sys
import urllib.request
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import pytest
_ROOT = Path(__file__).parent.parent.parent
sys.path.insert(0, str(_ROOT))
sys.path.insert(0, str(_ROOT / "Observability"))
from esof_advisor import compute_esof, get_session, BASELINE_WR
from esof_gate import get_bucket
# ── CH helpers ────────────────────────────────────────────────────────────────
# ClickHouse HTTP endpoint and credentials used by _ch_query.
# NOTE(review): credentials are hard-coded in a test file; consider reading
# them from environment variables if this repository is ever shared.
CH_URL = "http://localhost:8123"
CH_USER = "dolphin"
CH_PASS = "dolphin_ch_2026"
def _ch_query(sql: str) -> List[List[str]]:
    """POST *sql* to the ClickHouse HTTP endpoint and return TSV rows.

    Each returned row is a list of string fields (one per tab-separated
    column). An empty response body yields an empty list.
    """
    credentials = f"{CH_USER}:{CH_PASS}".encode()
    auth = base64.b64encode(credentials).decode()
    req = urllib.request.Request(
        f"{CH_URL}/?database=dolphin&default_format=TabSeparated",
        data=sql.encode(),
        headers={"Authorization": f"Basic {auth}"},
    )
    with urllib.request.urlopen(req, timeout=10) as resp:
        body = resp.read().decode().strip()
    if not body:
        return []
    return [line.split('\t') for line in body.split('\n')]
def _ch_available() -> bool:
    """Return True when the ClickHouse endpoint answers a trivial query."""
    try:
        _ch_query("SELECT 1")
    except Exception:
        return False
    return True
# Probe once at import time; tests skip themselves when the DB is down.
CH_UP = _ch_available()
# ── Trade loader (shared with gate test) ──────────────────────────────────────
# Module-level cache: ClickHouse is queried at most once per process.
_CACHED_TRADES: Optional[List[dict]] = None
def fetch_trades() -> List[dict]:
    """Load all 'blue'-strategy trades from ClickHouse, enriched with EsoF fields.

    Returns a time-ordered (ORDER BY ts) list of dicts with keys:
    ts, asset, pnl, leverage, session, dow, score, label, bucket_id.
    Results are cached in _CACHED_TRADES for the process lifetime.
    Malformed rows (short, or with unparsable numeric fields) are skipped.
    """
    global _CACHED_TRADES
    if _CACHED_TRADES is not None:
        return _CACHED_TRADES
    # Administrative exits are excluded — presumably they are not
    # signal-driven trades; confirm against trade_events semantics.
    sql = """
    SELECT
        toUnixTimestamp64Milli(ts) AS ts_ms,
        asset, side, pnl, exit_reason, leverage
    FROM dolphin.trade_events
    WHERE strategy = 'blue'
      AND exit_reason NOT IN ('HIBERNATE_HALT', 'SUBDAY_ACB_NORMALIZATION')
    ORDER BY ts
    """
    rows = _ch_query(sql)
    # Best-effort asset→bucket map; on any failure pkl_map stays None and
    # get_bucket is expected to fall back to its own default — TODO confirm.
    pkl_map: Optional[Dict[str, int]] = None
    try:
        import pickle
        # NOTE(review): pickle.load on a local model file — trusted path only;
        # never point this at untrusted input.
        with open(_ROOT / "adaptive_exit/models/bucket_assignments.pkl", 'rb') as f:
            pkl_map = pickle.load(f).get('assignments', {})
    except Exception:
        pass
    trades = []
    for row in rows:
        # Defensive: need at least the 6 selected columns.
        if len(row) < 6:
            continue
        try:
            ts_ms = int(row[0])
            asset = row[1]
            pnl = float(row[3])
            leverage = float(row[5])
        except (ValueError, IndexError):
            continue
        # Timestamps are epoch milliseconds; convert to aware UTC datetime.
        ts = datetime.fromtimestamp(ts_ms / 1000.0, tz=timezone.utc)
        # EsoF advisory computed per-trade from the trade's open time.
        adv = compute_esof(ts)
        trades.append({
            "ts": ts,
            "asset": asset,
            "pnl": pnl,
            "leverage": leverage,
            "session": adv["session"],
            "dow": adv["dow"],
            "score": adv["advisory_score"],
            "label": adv["advisory_label"],
            "bucket_id": get_bucket(asset, pkl_map),
        })
    _CACHED_TRADES = trades
    return trades
# ── Statistical primitives ────────────────────────────────────────────────────
def wr(trades: List[dict]) -> float:
    """Win rate: fraction of trades with pnl > 0; NaN for an empty list."""
    if not trades:
        return float("nan")
    wins = sum(t["pnl"] > 0 for t in trades)
    return wins / len(trades)
def net_pnl(trades: List[dict]) -> float:
    """Total PnL across *trades* (zero for an empty list)."""
    total = 0.0
    for trade in trades:
        total += trade["pnl"]
    return total
def cohen_h(p1: float, p2: float) -> float:
    """Cohen's h effect size for two proportions. |h| ≥ 0.2 small, 0.5 medium, 0.8 large."""
    phi1 = 2 * math.asin(math.sqrt(p1))
    phi2 = 2 * math.asin(math.sqrt(p2))
    return abs(phi1 - phi2)
def binomial_se(p: float, n: int) -> float:
    """Standard error of a proportion estimate; infinite when n is not positive."""
    if n <= 0:
        return float("inf")
    return math.sqrt(p * (1 - p) / n)
def bootstrap_wr_ci(trades: List[dict], n_boot: int = 5000, ci: float = 0.95) -> Tuple[float, float]:
    """Bootstrap CI on WR. Returns (lower, upper).

    Resamples *trades* with replacement n_boot times using a fixed seed (42)
    so results are reproducible. Empty input returns (nan, nan).
    """
    if not trades:
        # Original behavior: every resample WR is NaN, so the CI is (nan, nan).
        return float("nan"), float("nan")
    rng = random.Random(42)
    n = len(trades)
    samples = []
    for _ in range(n_boot):
        resample = [rng.choice(trades) for _ in range(n)]
        samples.append(sum(1 for t in resample if t["pnl"] > 0) / n)
    samples.sort()
    lo = int((1 - ci) / 2 * n_boot)
    # Clamp the upper index: ci == 1.0 would otherwise index one past the end.
    hi = min(int((1 + ci) / 2 * n_boot), n_boot - 1)
    return samples[lo], samples[hi]
def bootstrap_pnl_ci(trades: List[dict], n_boot: int = 5000, ci: float = 0.95) -> Tuple[float, float]:
    """Bootstrap CI on net PnL. Returns (lower, upper).

    Seeded (42) for reproducibility; empty input yields (0, 0) since every
    resample sums to zero.
    """
    rng = random.Random(42)
    n = len(trades)
    samples = []
    for _ in range(n_boot):
        resample = [rng.choice(trades) for _ in range(n)]
        samples.append(sum(t["pnl"] for t in resample))
    samples.sort()
    lo = int((1 - ci) / 2 * n_boot)
    # Clamp the upper index: ci == 1.0 would otherwise index one past the end.
    hi = min(int((1 + ci) / 2 * n_boot), n_boot - 1)
    return samples[lo], samples[hi]
def permutation_pvalue(
    trades: List[dict],
    observed_delta: float,
    label_key: str,
    blocked_label,
    n_perm: int = 2000,
    seed: int = 42,
) -> float:
    """
    Permutation test: shuffle label_key randomly, compute strategy improvement
    each time. Return fraction of permutations that produce >= observed_delta.
    observed_delta > 0 means "blocking blocked_label improved PnL".
    """
    rng = random.Random(seed)
    labels = [t[label_key] for t in trades]
    pnls = [t["pnl"] for t in trades]
    hits = 0
    for _ in range(n_perm):
        # In-place shuffle: each iteration permutes the previous arrangement,
        # which is still a uniform random permutation under the seeded RNG.
        rng.shuffle(labels)
        blocked_total = 0.0
        for lbl, p in zip(labels, pnls):
            if lbl == blocked_label:
                blocked_total += p
        # Gain from blocking = minus the blocked group's PnL.
        if -blocked_total >= observed_delta:
            hits += 1
    return hits / n_perm
# ═════════════════════════════════════════════════════════════════════════════
# TEST CLASSES
# ═════════════════════════════════════════════════════════════════════════════

# Shared skip marker: any test needing live trade data is skipped when
# ClickHouse is unreachable (CH_UP is probed once at import time).
skip_no_ch = pytest.mark.skipif(not CH_UP, reason="ClickHouse not available")


class TestTemporalStability:
    """
    Walk-forward: split chronologically into H1 (first 50%) and H2 (last 50%).
    Session and DoW effects must appear in BOTH halves to be considered real.
    If present in only one half → data snooping artifact.
    """

    @skip_no_ch
    def test_ny_afternoon_negative_in_h1_and_h2(self):
        """NY_AFTERNOON win rate must sit below each half's own baseline."""
        trades = fetch_trades()
        n = len(trades)
        # Trades are returned ORDER BY ts, so a midpoint slice is a time split.
        h1 = trades[:n // 2]
        h2 = trades[n // 2:]
        ny_h1 = [t for t in h1 if t["session"] == "NY_AFTERNOON"]
        ny_h2 = [t for t in h2 if t["session"] == "NY_AFTERNOON"]
        # Compare against each half's own baseline (not global) so a regime
        # shift between halves cannot fake the effect.
        base_h1 = wr(h1)
        base_h2 = wr(h2)
        assert len(ny_h1) >= 10, f"H1 NY_AFT too small: n={len(ny_h1)}"
        assert len(ny_h2) >= 10, f"H2 NY_AFT too small: n={len(ny_h2)}"
        # NY_AFTERNOON WR must be below baseline in BOTH halves
        wr_h1 = wr(ny_h1)
        wr_h2 = wr(ny_h2)
        assert wr_h1 < base_h1, (
            f"NY_AFT drag missing in H1: WR_NYA={wr_h1:.3f} >= baseline={base_h1:.3f}"
        )
        assert wr_h2 < base_h2, (
            f"NY_AFT drag missing in H2: WR_NYA={wr_h2:.3f} >= baseline={base_h2:.3f}"
        )

    @skip_no_ch
    def test_monday_negative_in_h1_and_h2(self):
        """Monday (dow == 0) win-rate drag must appear in both halves."""
        trades = fetch_trades()
        n = len(trades)
        h1 = trades[:n // 2]
        h2 = trades[n // 2:]
        mon_h1 = [t for t in h1 if t["dow"] == 0]
        mon_h2 = [t for t in h2 if t["dow"] == 0]
        # Monday sample is thin — require at least 10 in each half
        if len(mon_h1) < 10 or len(mon_h2) < 10:
            pytest.skip(f"Monday sample too thin for walk-forward: H1={len(mon_h1)}, H2={len(mon_h2)}")
        assert wr(mon_h1) < wr(h1), "Monday drag absent in H1"
        assert wr(mon_h2) < wr(h2), "Monday drag absent in H2"

    @skip_no_ch
    def test_strategy_e_positive_in_both_halves(self):
        """Combined gate (Mon+NY_AFT) must improve PnL in H1 AND H2 independently."""
        trades = fetch_trades()
        n = len(trades)
        h1 = trades[:n // 2]
        h2 = trades[n // 2:]
        def gate_e_pnl(subset):
            # PnL retained after blocking Monday AND NY_AFTERNOON trades.
            return sum(t["pnl"] for t in subset
                       if t["dow"] != 0 and t["session"] != "NY_AFTERNOON")
        def base_pnl(subset):
            return sum(t["pnl"] for t in subset)
        assert gate_e_pnl(h1) > base_pnl(h1), "Strategy E degrades H1"
        assert gate_e_pnl(h2) > base_pnl(h2), "Strategy E degrades H2"
class TestPermutationSignificance:
    """
    Permutation test: shuffle session / DoW labels randomly.
    The observed improvement from blocking must rank in the top 5%
    of the null distribution (p < 0.05) to be considered non-random.
    """

    @skip_no_ch
    def test_ny_afternoon_block_is_significant(self):
        """Gain from skipping NY_AFTERNOON must beat shuffled labels at p < 0.05."""
        trades = fetch_trades()
        ny_pnl = sum(t["pnl"] for t in trades if t["session"] == "NY_AFTERNOON")
        observed_delta = -ny_pnl  # gain from skipping NY_AFT trades
        p = permutation_pvalue(trades, observed_delta, "session", "NY_AFTERNOON",
                               n_perm=2000)
        assert p < 0.05, (
            f"NY_AFTERNOON block not significant: p={p:.3f} >= 0.05. "
            f"Effect may be noise at this sample size."
        )

    @skip_no_ch
    def test_monday_block_significance(self):
        """Monday block: p < 0.15 required; p >= 0.05 is warned, not failed."""
        trades = fetch_trades()
        mon_pnl = sum(t["pnl"] for t in trades if t["dow"] == 0)
        observed_delta = -mon_pnl
        p = permutation_pvalue(trades, observed_delta, "dow", 0, n_perm=2000)
        # Monday has fewer trades — use looser threshold (p < 0.15)
        # Flag as WARNING not FAIL if p >= 0.05: thin sample, directionally valid
        if p >= 0.05:
            print(f"\n WARN: Monday block p={p:.3f} >= 0.05. "
                  f"Directionally valid but underpowered (n={sum(1 for t in trades if t['dow']==0)}).")
        assert p < 0.15, (
            f"Monday block not even marginally significant: p={p:.3f}. "
            f"Gate should not be applied until more data accumulates."
        )

    @skip_no_ch
    def test_london_morning_block_would_hurt(self):
        """Blocking LONDON_MORNING (the BEST session) must NOT improve PnL."""
        trades = fetch_trades()
        ldn_pnl = sum(t["pnl"] for t in trades if t["session"] == "LONDON_MORNING")
        observed_delta = -ldn_pnl  # gain from blocking LDN (expect negative = harmful)
        # LDN is net-positive, so blocking it is harmful (delta < 0)
        assert observed_delta < 0, (
            f"Blocking LONDON_MORNING should HURT PnL (it is the best session). "
            f"Got delta={observed_delta:.2f}. Check data integrity."
        )
class TestMultipleComparison:
    """
    Multiple comparison correction.
    We inspected 5 sessions × 7 DoW = 35 cells. Finding 'significant' cells
    after inspection requires Bonferroni correction: α_adj = 0.05 / 35 ≈ 0.0014.
    Only cells where WR deviation is large enough to survive Bonferroni should
    be used in the gate.
    We test: do our chosen cells (NY_AFT, Monday) survive Bonferroni?
    Using a binomial z-test as a proxy for the corrected p-value.
    """

    @skip_no_ch
    def test_ny_afternoon_survives_bonferroni(self):
        """NY_AFTERNOON WR deficit must clear z > 2.0 under the null SE."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        n = len(ny)
        baseline = wr(trades)
        wr_ny = wr(ny)
        # SE computed under the null (baseline p), one-sample z-test style.
        se = binomial_se(baseline, n)
        z = (baseline - wr_ny) / se if se > 0 else 0
        # One-tailed z for Bonferroni α=0.0014: z_crit ≈ 2.99
        # We use 2.0 as a practical threshold (more conservative than 1.96 but
        # less strict than Bonferroni, given 3-week sample inherent limitations)
        assert z > 2.0, (
            f"NY_AFTERNOON WR deviation (z={z:.2f}) does not survive "
            f"multiple-comparison correction. n={n}, WR={wr_ny:.3f} vs base={baseline:.3f}."
        )

    @skip_no_ch
    def test_monday_bonferroni_warning(self):
        """Monday must show at least a directional signal (z > 1.0)."""
        trades = fetch_trades()
        mon = [t for t in trades if t["dow"] == 0]
        n = len(mon)
        baseline = wr(trades)
        wr_mon = wr(mon)
        se = binomial_se(baseline, n)
        z = (baseline - wr_mon) / se if se > 0 else 0
        # Monday: warn if z < 2.0 (doesn't survive strict Bonferroni)
        if z < 2.0:
            print(f"\n WARN: Monday z={z:.2f} < 2.0. Does not survive Bonferroni "
                  f"at current sample (n={n}). Apply Monday gate cautiously.")
        # Require at least z > 1.0 (directional signal, not pure noise)
        assert z > 1.0, (
            f"Monday WR deviation is indistinguishable from noise: z={z:.2f}. "
            f"Do not gate Monday until more trades accumulate."
        )

    @skip_no_ch
    def test_no_spurious_best_cell_used_as_gate(self):
        """
        Best-cell cherry-pick guard: the SINGLE best-performing cell in the dataset
        must NOT be treated as a reliable gate without Bonferroni correction.
        Test: find the best WR cell (n >= 10), check that its deviation is NOT
        significantly larger than the worst cell — both could be noise extremes.
        """
        trades = fetch_trades()
        # Cell key = (day-of-week, session).
        cells: Dict[Tuple, List[dict]] = defaultdict(list)
        for t in trades:
            cells[(t["dow"], t["session"])].append(t)
        valid = [(k, v) for k, v in cells.items() if len(v) >= 10]
        if len(valid) < 5:
            pytest.skip("Not enough cells with n >= 10")
        wrs = [(k, wr(v), len(v)) for k, v in valid]
        best = max(wrs, key=lambda x: x[1])
        worst = min(wrs, key=lambda x: x[1])
        baseline = wr(trades)
        se_best = binomial_se(baseline, best[2])
        se_worst = binomial_se(baseline, worst[2])
        z_best = (best[1] - baseline) / se_best if se_best > 0 else 0
        z_worst = (baseline - worst[1]) / se_worst if se_worst > 0 else 0
        # Both extremes should be similarly significant (or not).
        # If best is >3σ but worst is <1σ, something is asymmetric — flag it.
        # Acceptable: both extremes are significant OR both are marginal.
        ratio = z_best / z_worst if z_worst > 0.1 else float("inf")
        assert ratio < 5.0, (
            f"Asymmetric cell extremes: z_best={z_best:.2f} vs z_worst={z_worst:.2f}. "
            f"Best cell ({best[0]}) may be a cherry-pick artifact."
        )
class TestBootstrapCI:
    """
    Bootstrap confidence intervals on WR for each gated segment.
    The 95% CI upper bound for NY_AFTERNOON WR must be below baseline WR.
    If the CI overlaps the baseline, the effect is not reliable.
    """

    @skip_no_ch
    def test_ny_afternoon_ci_below_baseline(self):
        """Entire 95% WR CI for NY_AFTERNOON must sit below baseline WR."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        assert len(ny) >= 20, f"NY_AFT sample too small for bootstrap: n={len(ny)}"
        _, upper = bootstrap_wr_ci(ny, n_boot=3000)
        baseline = wr(trades)
        assert upper < baseline, (
            f"NY_AFTERNOON WR CI upper bound ({upper:.3f}) overlaps baseline "
            f"WR ({baseline:.3f}). Effect not reliable at 95% confidence."
        )

    @skip_no_ch
    def test_london_morning_ci_above_baseline(self):
        """LDN lower CI bound may dip slightly (×0.95 tolerance) but no further."""
        trades = fetch_trades()
        ldn = [t for t in trades if t["session"] == "LONDON_MORNING"]
        assert len(ldn) >= 20, f"LDN sample too small: n={len(ldn)}"
        lower, _ = bootstrap_wr_ci(ldn, n_boot=3000)
        baseline = wr(trades)
        assert lower > baseline * 0.95, (
            f"LONDON_MORNING WR CI lower bound ({lower:.3f}) is too far below "
            f"baseline ({baseline:.3f}). LDN advantage may not be reliable."
        )

    @skip_no_ch
    def test_ny_afternoon_pnl_ci_negative(self):
        """Net PnL CI for NY_AFTERNOON must have upper bound < 0 (net loser with confidence)."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        assert len(ny) >= 20
        _, upper = bootstrap_pnl_ci(ny, n_boot=3000)
        assert upper < 0, (
            f"NY_AFTERNOON net PnL CI upper bound is {upper:.2f} > 0. "
            f"Cannot confidently call it a net loser at current sample size."
        )
class TestMinimumSampleSize:
    """
    Minimum sample size guard. No session or DoW factor should influence
    the advisory score unless it has n >= 30 trades. Below 30, the WR
    estimate has SE > 9pp (too noisy to act on).
    """

    @skip_no_ch
    def test_all_gate_factors_have_sufficient_n(self):
        """
        The two gated factors (NY_AFTERNOON, Monday) must each have n >= 30
        in the current dataset for the gate to be considered valid.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        mon = [t for t in trades if t["dow"] == 0]
        assert len(ny) >= 30, f"NY_AFTERNOON n={len(ny)} < 30. Gate underpowered."
        assert len(mon) >= 30, f"Monday n={len(mon)} < 30. Gate underpowered."

    @skip_no_ch
    def test_slot_15m_gate_would_be_overfit(self):
        """
        15-minute slot data has median n ≈ 7. Any slot-level gate applied
        directly would be extreme overfitting. Verify: majority of slots have n < 30.
        """
        trades = fetch_trades()
        # Count trades per 15-minute UTC slot, keyed "H:MM".
        slots: Dict[str, int] = defaultdict(int)
        for t in trades:
            h = t["ts"].hour
            m = (t["ts"].minute // 15) * 15
            slots[f"{h}:{m:02d}"] += 1
        n_thin = sum(1 for n in slots.values() if n < 30)
        frac = n_thin / len(slots) if slots else 1.0
        assert frac > 0.70, (
            f"Only {frac:.0%} of 15m slots have n < 30. "
            f"Expected most slots to be underpowered — if not, slot gate may be premature."
        )

    def test_advisory_score_weights_reflect_sample_size(self):
        """
        Slot weight (0.10) must be lower than session (0.25) and DoW (0.30).
        Ensures the weakest-sample factor has the lowest influence.
        """
        from esof_advisor import SESSION_STATS, DOW_STATS, SLOT_STATS
        # NOTE(review): assumes each *_STATS value is a tuple whose first
        # element is the sample count — confirm against esof_advisor.
        median_session_n = sorted([v[0] for v in SESSION_STATS.values()])[len(SESSION_STATS) // 2]
        median_dow_n = sorted([v[0] for v in DOW_STATS.values()])[len(DOW_STATS) // 2]
        median_slot_n = sorted([v[0] for v in SLOT_STATS.values()])[len(SLOT_STATS) // 2]
        assert median_slot_n < median_session_n, "Slot n should be < session n"
        assert median_slot_n < median_dow_n, "Slot n should be < DoW n"
        # Slot weight is 0.10, session 0.25, DoW 0.30 — smaller n = smaller weight
        # NOTE(review): these mirror constants inside esof_advisor; keep in sync.
        SLOT_WEIGHT = 0.10
        SESSION_WEIGHT = 0.25
        DOW_WEIGHT = 0.30
        assert SLOT_WEIGHT < SESSION_WEIGHT
        assert SLOT_WEIGHT < DOW_WEIGHT
class TestEffectSize:
    """
    Cohen's h effect size on WR differences.
    |h| >= 0.2: small effect (minimum threshold to consider gating)
    |h| >= 0.5: medium effect (comfortable to gate)
    |h| >= 0.8: large effect (very strong signal)
    """

    @skip_no_ch
    def test_ny_afternoon_effect_size_medium(self):
        """NY_AFTERNOON WR deviation must reach at least a small effect (h >= 0.2)."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        baseline = wr(trades)
        h = cohen_h(wr(ny), baseline)
        assert h >= 0.2, (
            f"NY_AFTERNOON effect size h={h:.3f} < 0.2 (small). "
            f"Signal too weak to justify gating."
        )

    @skip_no_ch
    def test_london_morning_effect_size_positive(self):
        # cohen_h returns |h| >= 0 by construction; this is a smoke check
        # that the computation runs on the LDN segment.
        trades = fetch_trades()
        ldn = [t for t in trades if t["session"] == "LONDON_MORNING"]
        baseline = wr(trades)
        h = cohen_h(wr(ldn), baseline)
        assert h >= 0.0, "LDN effect size must be measurable"

    @skip_no_ch
    def test_dow_tuesday_effect_size(self):
        """Tuesday is the best DoW. Effect size must be positive."""
        trades = fetch_trades()
        tue = [t for t in trades if t["dow"] == 1]
        baseline = wr(trades)
        if len(tue) < 10:
            pytest.skip("Tuesday sample too thin")
        h = cohen_h(wr(tue), baseline)
        assert h >= 0.0, "Tuesday must show positive effect"

    @skip_no_ch
    def test_effect_size_ranking_matches_expectation(self):
        """
        NY_AFTERNOON effect size must be larger than LOW_LIQUIDITY effect size.
        NY_AFT has more trades and a larger WR gap — should show stronger signal.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        low = [t for t in trades if t["session"] == "LOW_LIQUIDITY"]
        base = wr(trades)
        # Segments with n < 10 are treated as h = 0 (no measurable effect).
        h_ny = cohen_h(wr(ny), base) if len(ny) >= 10 else 0
        h_low = cohen_h(wr(low), base) if len(low) >= 10 else 0
        # NY_AFTERNOON has 3× the sample of LOW_LIQ — effect should be at least as large
        assert h_ny >= h_low * 0.7, (
            f"NY_AFT h={h_ny:.3f} much smaller than LOW_LIQ h={h_low:.3f}. "
            f"Unexpected — check data."
        )
class TestWalkForwardAdvisory:
    """
    Walk-forward advisory score validation.
    Train EsoF tables conceptually on H1 (we use the existing static tables as proxy).
    Evaluate: does the advisory score computed at H2 trade times predict H2 outcomes?
    Method: within H2, rank trades by advisory_score. The bottom quartile (most
    negative score) should have lower WR than the top quartile. If the score
    has no predictive power on OOS data, it is overfit to the in-sample period.
    """

    @skip_no_ch
    def test_score_predicts_wr_direction_in_h2(self):
        """Top-quartile advisory scores in H2 must out-WR the bottom quartile."""
        trades = fetch_trades()
        n = len(trades)
        # Only the out-of-sample half is ranked by advisory score.
        h2 = sorted(trades[n // 2:], key=lambda t: t["score"])
        if len(h2) < 40:
            pytest.skip(f"H2 too small for quartile split: n={len(h2)}")
        q = len(h2) // 4
        bottom = h2[:q]   # worst advisory scores
        top = h2[-q:]     # best advisory scores
        wr_bot = wr(bottom)
        wr_top = wr(top)
        assert wr_top > wr_bot, (
            f"Advisory score has no directional predictive power in H2: "
            f"WR_top={wr_top:.3f} WR_bot={wr_bot:.3f}. Score may be overfit."
        )

    @skip_no_ch
    def test_unfavorable_label_has_lower_wr_in_h2(self):
        """UNFAVORABLE-labeled H2 trades may match, but not beat, the rest (5pp slack)."""
        trades = fetch_trades()
        n = len(trades)
        h2 = trades[n // 2:]
        unfav = [t for t in h2 if t["label"] == "UNFAVORABLE"]
        rest = [t for t in h2 if t["label"] != "UNFAVORABLE"]
        if len(unfav) < 5:
            pytest.skip(f"Too few UNFAVORABLE trades in H2: n={len(unfav)}")
        assert wr(unfav) <= wr(rest) + 0.05, (
            f"UNFAVORABLE label does not predict lower WR in H2: "
            f"WR_unfav={wr(unfav):.3f} vs WR_rest={wr(rest):.3f}. "
            f"Advisory label may be overfit."
        )
class TestAssetBucketStability:
    """
    The session/DoW effect must not be driven by a single asset bucket.
    If NY_AFTERNOON drag is entirely explained by, say, B4 trades clustering
    in that session, the gate is actually gating B4 by proxy — not time.
    The effect must hold across at least 2 independent buckets.
    """

    @skip_no_ch
    def test_ny_afternoon_drag_cross_bucket(self):
        """NY_AFTERNOON drag must be visible within at least 2 asset buckets."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        not_ny = [t for t in trades if t["session"] != "NY_AFTERNOON"]
        # Group in-session and out-of-session trades per bucket.
        by_bucket_ny = defaultdict(list)
        by_bucket_out = defaultdict(list)
        for t in ny:
            by_bucket_ny[t["bucket_id"]].append(t)
        for t in not_ny:
            by_bucket_out[t["bucket_id"]].append(t)
        # Count buckets where NY_AFT WR is below out-of-session WR
        n_confirming = 0
        for bkt in by_bucket_ny:
            # Require >= 5 trades on both sides of the comparison per bucket.
            if len(by_bucket_ny[bkt]) < 5 or len(by_bucket_out.get(bkt, [])) < 5:
                continue
            if wr(by_bucket_ny[bkt]) < wr(by_bucket_out[bkt]):
                n_confirming += 1
        assert n_confirming >= 2, (
            f"NY_AFT drag only confirmed in {n_confirming} bucket(s). "
            f"Need ≥ 2 for effect to be session-driven, not bucket-confounded."
        )

    @skip_no_ch
    def test_monday_drag_cross_bucket(self):
        """Monday drag must appear in >= 1 bucket (soft bar due to thin sample)."""
        trades = fetch_trades()
        mon = [t for t in trades if t["dow"] == 0]
        not_mon = [t for t in trades if t["dow"] != 0]
        by_bkt_mon = defaultdict(list)
        by_bkt_out = defaultdict(list)
        for t in mon:
            by_bkt_mon[t["bucket_id"]].append(t)
        for t in not_mon:
            by_bkt_out[t["bucket_id"]].append(t)
        n_confirming = 0
        for bkt in by_bkt_mon:
            if len(by_bkt_mon[bkt]) < 5 or len(by_bkt_out.get(bkt, [])) < 5:
                continue
            if wr(by_bkt_mon[bkt]) < wr(by_bkt_out[bkt]):
                n_confirming += 1
        if n_confirming < 2:
            print(f"\n WARN: Monday drag only in {n_confirming} bucket(s). "
                  f"Thin sample — cannot confirm cross-bucket. Gate with caution.")
        # Soft assert: Monday has thinner sample, require at least 1
        assert n_confirming >= 1, (
            f"Monday drag not present in ANY bucket. "
            f"Likely a sampling artifact — do not gate Monday."
        )
class TestRegimeConfound:
    """
    Regime confound check: is the session effect just a proxy for ACB beta?
    If all NY_AFTERNOON trades happen to coincide with low ACB beta (bearish
    regime), then blocking NY_AFT is actually blocking bear-regime trades,
    not session-specific trades. The gate would be redundant with ACB.
    Method: compare ACB leverage (proxy for regime strength) between
    NY_AFTERNOON and other sessions. If leverage distributions are
    significantly different, the session effect is partially confounded.
    """

    @skip_no_ch
    def test_ny_afternoon_leverage_not_systematically_different(self):
        """
        NY_AFTERNOON avg leverage should be within 20% of other sessions' avg leverage.
        Large divergence → session effect may be a regime proxy.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        not_ny = [t for t in trades if t["session"] != "NY_AFTERNOON"]
        if len(ny) < 10 or len(not_ny) < 10:
            pytest.skip("Insufficient data for leverage comparison")
        avg_lev_ny = sum(t["leverage"] for t in ny) / len(ny)
        avg_lev_out = sum(t["leverage"] for t in not_ny) / len(not_ny)
        ratio = avg_lev_ny / avg_lev_out if avg_lev_out > 0 else 1.0
        assert 0.80 <= ratio <= 1.20, (
            f"NY_AFTERNOON avg leverage ({avg_lev_ny:.2f}x) differs by >{20}% "
            f"from other sessions ({avg_lev_out:.2f}x). "
            f"Session effect may be a regime-proxy — investigate confound."
        )

    @skip_no_ch
    def test_ny_afternoon_wr_negative_across_leverage_bands(self):
        """
        Regime confound falsification: split NY_AFT trades into high/low leverage.
        If NY_AFT drag holds in BOTH leverage bands, it is NOT purely a regime effect.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        if len(ny) < 20:
            pytest.skip(f"NY_AFT too small for leverage split: n={len(ny)}")
        # Median split on leverage; ties go to the high band.
        median_lev = sorted(t["leverage"] for t in ny)[len(ny) // 2]
        hi_lev = [t for t in ny if t["leverage"] >= median_lev]
        lo_lev = [t for t in ny if t["leverage"] < median_lev]
        baseline = wr(fetch_trades())
        # Bands with < 5 trades count as vacuously confirming.
        hi_below = wr(hi_lev) < baseline if len(hi_lev) >= 5 else True
        lo_below = wr(lo_lev) < baseline if len(lo_lev) >= 5 else True
        # NOTE(review): the docstring says the drag must hold in BOTH bands,
        # but the assertion only requires one band (`or`) — confirm intent.
        assert hi_below or lo_below, (
            "NY_AFT drag absent in BOTH leverage bands — effect is not regime-independent. "
            "Gate may be a regime proxy."
        )
# ═════════════════════════════════════════════════════════════════════════════
# STANDALONE REPORT
# ═════════════════════════════════════════════════════════════════════════════
# ANSI colour/format escape codes for the terminal report.
GREEN = "\033[32m"; RED = "\033[31m"; YELLOW = "\033[33m"
BOLD = "\033[1m"; DIM = "\033[2m"; RST = "\033[0m"

if __name__ == "__main__":
    # Human-readable report mirroring the pytest checks above.
    if not CH_UP:
        print(f"{RED}ClickHouse not available.{RST}")
        sys.exit(1)
    trades = fetch_trades()
    n = len(trades)
    h1, h2 = trades[:n // 2], trades[n // 2:]
    # 68-char '═' horizontal rule, matching the section banners in this file
    # (the original printed ''*68 — an empty separator; the rule character
    # was evidently lost).
    print(f"\n{BOLD}{'═' * 68}{RST}")
    print(f"{BOLD} EsoF Overfitting Guard Report ({n} trades){RST}")
    print(f"{'═' * 68}\n")
    baseline = wr(trades)
    # Segments referenced throughout the report.
    ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
    mon = [t for t in trades if t["dow"] == 0]
    ldn = [t for t in trades if t["session"] == "LONDON_MORNING"]
    ny_h1 = [t for t in h1 if t["session"] == "NY_AFTERNOON"]
    ny_h2 = [t for t in h2 if t["session"] == "NY_AFTERNOON"]
    mon_h1 = [t for t in h1 if t["dow"] == 0]
    mon_h2 = [t for t in h2 if t["dow"] == 0]

    def row(label, val, ref=None, lo=None, hi=None, warn=None, note=""):
        """Print one aligned report row.

        Green when val < ref (expected drag confirmed), red otherwise;
        warn forces yellow. lo/hi (optional, both required together)
        append a bootstrap 95% CI.
        """
        if lo is not None:
            ci_str = f" 95%CI [{lo:.3f}, {hi:.3f}]"
        else:
            ci_str = ""
        col = GREEN if (ref is None or val < ref) else RED
        if warn:
            col = YELLOW
        print(f" {label:<42} {col}{val:.3f}{RST}{ci_str} {DIM}{note}{RST}")

    print(f" {'Baseline WR':<42} {baseline:.3f}")
    print()
    print(f" {BOLD}1. Temporal Stability (H1 / H2){RST}")
    row(" NY_AFT WR — H1", wr(ny_h1), baseline, note=f"n={len(ny_h1)}")
    row(" NY_AFT WR — H2", wr(ny_h2), baseline, note=f"n={len(ny_h2)}")
    row(" Mon WR — H1", wr(mon_h1), baseline, note=f"n={len(mon_h1)}")
    row(" Mon WR — H2", wr(mon_h2), baseline, note=f"n={len(mon_h2)}")
    print(f"\n {BOLD}2. Permutation p-values{RST}")
    ny_pnl = sum(t["pnl"] for t in ny)
    mon_pnl = sum(t["pnl"] for t in mon)
    p_ny = permutation_pvalue(trades, -ny_pnl, "session", "NY_AFTERNOON", n_perm=2000)
    p_mon = permutation_pvalue(trades, -mon_pnl, "dow", 0, n_perm=2000)
    col_ny = GREEN if p_ny < 0.05 else YELLOW if p_ny < 0.15 else RED
    col_mon = GREEN if p_mon < 0.05 else YELLOW if p_mon < 0.15 else RED
    print(f" {'NY_AFT block p-value':<42} {col_ny}{p_ny:.4f}{RST} {DIM}(< 0.05 = significant){RST}")
    print(f" {'Monday block p-value':<42} {col_mon}{p_mon:.4f}{RST} {DIM}(< 0.15 = directional){RST}")
    print(f"\n {BOLD}3. Effect Sizes (Cohen's h){RST}")
    h_ny = cohen_h(wr(ny), baseline)
    h_mon = cohen_h(wr(mon), baseline)
    h_ldn = cohen_h(wr(ldn), baseline)
    for label, h, n_cell in [("NY_AFT", h_ny, len(ny)), ("Monday", h_mon, len(mon)), ("London", h_ldn, len(ldn))]:
        grade = "large" if h >= 0.8 else "medium" if h >= 0.5 else "small" if h >= 0.2 else "trivial"
        col = GREEN if h >= 0.5 else YELLOW if h >= 0.2 else RED
        print(f" {' '+label:<42} {col}{h:.3f}{RST} {DIM}{grade} (n={n_cell}){RST}")
    print(f"\n {BOLD}4. Bootstrap 95% CIs{RST}")
    ny_lo, ny_hi = bootstrap_wr_ci(ny, n_boot=3000)
    col = GREEN if ny_hi < baseline else RED
    print(f" {'NY_AFT WR CI':<42} {col}[{ny_lo:.3f}, {ny_hi:.3f}]{RST} "
          f"{DIM}({'below' if ny_hi < baseline else 'overlaps'} baseline {baseline:.3f}){RST}")
    ny_plo, ny_phi = bootstrap_pnl_ci(ny, n_boot=3000)
    col = GREEN if ny_phi < 0 else RED
    print(f" {'NY_AFT net PnL CI':<42} {col}[{ny_plo:+,.0f}, {ny_phi:+,.0f}]{RST} "
          f"{DIM}({'net loser with confidence' if ny_phi < 0 else 'uncertain sign'}){RST}")
    print(f"\n {BOLD}5. Bonferroni z-scores (35 cells tested){RST}")
    se_ny = binomial_se(baseline, len(ny))
    se_mon = binomial_se(baseline, len(mon))
    z_ny = (baseline - wr(ny)) / se_ny if se_ny > 0 else 0
    z_mon = (baseline - wr(mon)) / se_mon if se_mon > 0 else 0
    crit = 2.99  # Bonferroni α=0.0014 → z_crit≈2.99
    col_ny = GREEN if z_ny > crit else YELLOW if z_ny > 2.0 else RED
    col_mon = GREEN if z_mon > crit else YELLOW if z_mon > 2.0 else RED
    print(f" {'NY_AFT z':<42} {col_ny}{z_ny:.2f}{RST} {DIM}(Bonferroni crit ≈ {crit}){RST}")
    print(f" {'Monday z':<42} {col_mon}{z_mon:.2f}{RST}")
    print(f"\n {BOLD}6. Walk-Forward: advisory score → H2 WR{RST}")
    h2s = sorted(h2, key=lambda t: t["score"])
    # max(1, ...) guards the degenerate case of fewer than 4 H2 trades.
    q = max(1, len(h2s) // 4)
    wr_bot = wr(h2s[:q])
    wr_top = wr(h2s[-q:])
    col = GREEN if wr_top > wr_bot else RED
    print(f" {' Top-quartile score WR (H2)':<42} {col}{wr_top:.3f}{RST} {DIM}n={q}{RST}")
    print(f" {' Bot-quartile score WR (H2)':<42} {col}{wr_bot:.3f}{RST} {DIM}n={q}{RST}")
    print(f" {' Predictive (top > bot)?':<42} {col}{'YES' if wr_top > wr_bot else 'NO — score overfit'}{RST}")
    print(f"\n{'═' * 68}\n")