initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
871
prod/tests/test_esof_overfit_guard.py
Executable file
871
prod/tests/test_esof_overfit_guard.py
Executable file
@@ -0,0 +1,871 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
EsoF Overfitting Avoidance Test Suite
|
||||
|
||||
Industry-standard statistical tests to guard against overfitting in the
|
||||
EsoF calendar/session gate and the EsoF↔system interaction.
|
||||
|
||||
Why overfitting is a real risk here
|
||||
─────────────────────────────────────
|
||||
We inspected 5 sessions × 7 DoW = 35 cells on a single ~550-trade dataset
|
||||
covering only 3 weeks (2026-03-31 → 2026-04-19). That is:
|
||||
- A short temporal window (one market regime)
|
||||
- Small per-cell sample sizes (median n ≈ 14)
|
||||
- Multiple comparisons (we chose the *worst* cells after looking at all)
|
||||
- No pre-registration (we looked at the data before deciding the gate)
|
||||
|
||||
Any one of these alone warrants caution. Together they demand rigorous testing.
|
||||
|
||||
Tests implemented
|
||||
──────────────────
|
||||
1. TestTemporalStability — H1 vs H2 walk-forward: does the effect hold in both halves?
|
||||
2. TestPermutationSignificance — shuffle session/DoW labels N=2000 times; empirical p-value
|
||||
3. TestMultipleComparison — Bonferroni / FDR correction across all 35 cells
|
||||
4. TestBootstrapCI — 95% CI on WR and net PnL via bootstrap resampling
|
||||
5. TestMinimumSampleSize — flag cells with n < 30 as "insufficient evidence"
|
||||
6. TestEffectSize — Cohen's h on WR difference; require medium+ effect (h ≥ 0.3)
|
||||
7. TestWalkForwardAdvisory — train EsoF tables on H1, evaluate advisory score on H2
|
||||
8. TestAssetBucketStability — NY_AFT / Mon effect must hold across ≥ 2 asset buckets
|
||||
9. TestRegimeConfound — check if session effect is a proxy for ACB beta (regime)
|
||||
|
||||
Run:
|
||||
source /home/dolphin/siloqy_env/bin/activate
|
||||
cd /mnt/dolphinng5_predict
|
||||
python prod/tests/test_esof_overfit_guard.py # full report
|
||||
pytest prod/tests/test_esof_overfit_guard.py -v # pytest mode
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import math
|
||||
import random
|
||||
import sys
|
||||
import urllib.request
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
|
||||
_ROOT = Path(__file__).parent.parent.parent
|
||||
sys.path.insert(0, str(_ROOT))
|
||||
sys.path.insert(0, str(_ROOT / "Observability"))
|
||||
|
||||
from esof_advisor import compute_esof, get_session, BASELINE_WR
|
||||
from esof_gate import get_bucket
|
||||
|
||||
# ── CH helpers ────────────────────────────────────────────────────────────────
|
||||
# ── CH helpers ────────────────────────────────────────────────────────────────
# Connection parameters for the local ClickHouse HTTP interface.
CH_URL = "http://localhost:8123"
CH_USER = "dolphin"
CH_PASS = "dolphin_ch_2026"


def _ch_query(sql: str) -> List[List[str]]:
    """Run *sql* against the local ClickHouse instance and return the rows.

    Uses HTTP basic auth and TabSeparated output; each returned row is a
    list of string fields. An empty response body yields an empty list.
    """
    credentials = base64.b64encode(f"{CH_USER}:{CH_PASS}".encode()).decode()
    request = urllib.request.Request(
        f"{CH_URL}/?database=dolphin&default_format=TabSeparated",
        data=sql.encode(),
        headers={"Authorization": f"Basic {credentials}"},
    )
    with urllib.request.urlopen(request, timeout=10) as response:
        body = response.read().decode().strip()
    if not body:
        return []
    return [line.split('\t') for line in body.split('\n')]
|
||||
|
||||
def _ch_available() -> bool:
    """Probe ClickHouse with a trivial query; True when it is reachable."""
    try:
        _ch_query("SELECT 1")
    except Exception:
        return False
    return True


# Evaluated once at import time; gates every ClickHouse-backed test below.
CH_UP = _ch_available()
|
||||
|
||||
# ── Trade loader (shared with gate test) ──────────────────────────────────────
|
||||
# Module-level cache so the many tests below hit ClickHouse only once.
_CACHED_TRADES: Optional[List[dict]] = None


def fetch_trades() -> List[dict]:
    """Load BLUE trade events from ClickHouse, enriched with EsoF context.

    Each returned dict carries the trade timestamp (UTC), asset, pnl,
    leverage, EsoF session/DoW, advisory score/label, and the asset's
    AEM bucket id. Rows that cannot be parsed are skipped silently.
    Results are cached module-wide after the first call.
    """
    global _CACHED_TRADES
    if _CACHED_TRADES is not None:
        return _CACHED_TRADES

    sql = """
    SELECT
        toUnixTimestamp64Milli(ts) AS ts_ms,
        asset, side, pnl, exit_reason, leverage
    FROM dolphin.trade_events
    WHERE strategy = 'blue'
      AND exit_reason NOT IN ('HIBERNATE_HALT', 'SUBDAY_ACB_NORMALIZATION')
    ORDER BY ts
    """
    rows = _ch_query(sql)

    # Bucket assignments are optional best-effort: stay None if the pickle
    # is missing or unreadable (get_bucket must tolerate a None map).
    assignments: Optional[Dict[str, int]] = None
    try:
        import pickle
        with open(_ROOT / "adaptive_exit/models/bucket_assignments.pkl", 'rb') as f:
            assignments = pickle.load(f).get('assignments', {})
    except Exception:
        pass

    result: List[dict] = []
    for row in rows:
        if len(row) < 6:
            continue
        try:
            ts_ms = int(row[0])
            asset = row[1]
            pnl = float(row[3])
            leverage = float(row[5])
        except (ValueError, IndexError):
            continue
        ts = datetime.fromtimestamp(ts_ms / 1000.0, tz=timezone.utc)
        advisory = compute_esof(ts)
        result.append({
            "ts": ts,
            "asset": asset,
            "pnl": pnl,
            "leverage": leverage,
            "session": advisory["session"],
            "dow": advisory["dow"],
            "score": advisory["advisory_score"],
            "label": advisory["advisory_label"],
            "bucket_id": get_bucket(asset, assignments),
        })
    _CACHED_TRADES = result
    return result
|
||||
|
||||
|
||||
# ── Statistical primitives ────────────────────────────────────────────────────
|
||||
|
||||
def wr(trades: List[dict]) -> float:
    """Win rate: fraction of trades with pnl > 0. NaN for an empty list."""
    if not trades:
        return float("nan")
    return sum(1 for t in trades if t["pnl"] > 0) / len(trades)


def net_pnl(trades: List[dict]) -> float:
    """Net PnL: sum of per-trade pnl (0 for an empty list)."""
    return sum(t["pnl"] for t in trades)


def cohen_h(p1: float, p2: float) -> float:
    """Cohen's h effect size for two proportions. |h| ≥ 0.2 small, 0.5 medium, 0.8 large."""
    return abs(2 * math.asin(math.sqrt(p1)) - 2 * math.asin(math.sqrt(p2)))


def binomial_se(p: float, n: int) -> float:
    """Standard error of a proportion estimate at sample size n (inf when n <= 0)."""
    return math.sqrt(p * (1 - p) / n) if n > 0 else float("inf")


def _bootstrap_ci(trades: List[dict], stat, n_boot: int, ci: float) -> Tuple[float, float]:
    """Percentile bootstrap CI of *stat* over *trades*. Returns (lower, upper).

    Resamples with replacement n_boot times (fixed seed 42 for reproducible
    test runs) and takes the two-sided *ci* percentile bounds of the sorted
    statistic values. The upper index is clamped to n_boot - 1 so ci values
    at or near 1.0 cannot index one past the end of the sample list.
    """
    rng = random.Random(42)
    n = len(trades)
    samples = [stat([rng.choice(trades) for _ in range(n)]) for _ in range(n_boot)]
    samples.sort()
    lo = int((1 - ci) / 2 * n_boot)
    hi = min(int((1 + ci) / 2 * n_boot), n_boot - 1)
    return samples[lo], samples[hi]


def bootstrap_wr_ci(trades: List[dict], n_boot: int = 5000, ci: float = 0.95) -> Tuple[float, float]:
    """Bootstrap CI on WR. Returns (lower, upper)."""
    return _bootstrap_ci(trades, wr, n_boot, ci)


def bootstrap_pnl_ci(trades: List[dict], n_boot: int = 5000, ci: float = 0.95) -> Tuple[float, float]:
    """Bootstrap CI on net PnL. Returns (lower, upper)."""
    return _bootstrap_ci(trades, net_pnl, n_boot, ci)
|
||||
|
||||
def permutation_pvalue(
    trades: List[dict],
    observed_delta: float,
    label_key: str,
    blocked_label,
    n_perm: int = 2000,
    seed: int = 42,
) -> float:
    """
    Permutation test: shuffle label_key randomly, compute strategy improvement
    each time. Return fraction of permutations that produce >= observed_delta.
    observed_delta > 0 means "blocking blocked_label improved PnL".
    """
    rng = random.Random(seed)
    labels = [t[label_key] for t in trades]
    pnls = [t["pnl"] for t in trades]
    hits = 0
    for _ in range(n_perm):
        rng.shuffle(labels)
        # PnL that would have been avoided under this random labelling;
        # negating it gives the improvement from blocking those trades.
        blocked_pnl = sum(p for lbl, p in zip(labels, pnls) if lbl == blocked_label)
        if -blocked_pnl >= observed_delta:
            hits += 1
    return hits / n_perm
|
||||
|
||||
|
||||
# ═════════════════════════════════════════════════════════════════════════════
|
||||
# TEST CLASSES
|
||||
# ═════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Shared skip marker: every data-backed test is skipped when ClickHouse is down.
skip_no_ch = pytest.mark.skipif(not CH_UP, reason="ClickHouse not available")


class TestTemporalStability:
    """
    Walk-forward: split chronologically into H1 (first 50%) and H2 (last 50%).
    Session and DoW effects must appear in BOTH halves to be considered real.
    If present in only one half → data snooping artifact.
    """

    @skip_no_ch
    def test_ny_afternoon_negative_in_h1_and_h2(self):
        trades = fetch_trades()
        n = len(trades)
        # Chronological split (fetch_trades orders by ts), not a random split.
        h1 = trades[:n // 2]
        h2 = trades[n // 2:]

        ny_h1 = [t for t in h1 if t["session"] == "NY_AFTERNOON"]
        ny_h2 = [t for t in h2 if t["session"] == "NY_AFTERNOON"]

        # Baseline WR is computed per-half so a regime shift between halves
        # does not masquerade as a session effect.
        base_h1 = wr(h1)
        base_h2 = wr(h2)

        assert len(ny_h1) >= 10, f"H1 NY_AFT too small: n={len(ny_h1)}"
        assert len(ny_h2) >= 10, f"H2 NY_AFT too small: n={len(ny_h2)}"

        # NY_AFTERNOON WR must be below baseline in BOTH halves
        wr_h1 = wr(ny_h1)
        wr_h2 = wr(ny_h2)
        assert wr_h1 < base_h1, (
            f"NY_AFT drag missing in H1: WR_NYA={wr_h1:.3f} >= baseline={base_h1:.3f}"
        )
        assert wr_h2 < base_h2, (
            f"NY_AFT drag missing in H2: WR_NYA={wr_h2:.3f} >= baseline={base_h2:.3f}"
        )

    @skip_no_ch
    def test_monday_negative_in_h1_and_h2(self):
        trades = fetch_trades()
        n = len(trades)
        h1 = trades[:n // 2]
        h2 = trades[n // 2:]

        # dow == 0 is Monday (datetime.weekday() convention).
        mon_h1 = [t for t in h1 if t["dow"] == 0]
        mon_h2 = [t for t in h2 if t["dow"] == 0]

        # Monday sample is thin — require at least 10 in each half
        if len(mon_h1) < 10 or len(mon_h2) < 10:
            pytest.skip(f"Monday sample too thin for walk-forward: H1={len(mon_h1)}, H2={len(mon_h2)}")

        assert wr(mon_h1) < wr(h1), "Monday drag absent in H1"
        assert wr(mon_h2) < wr(h2), "Monday drag absent in H2"

    @skip_no_ch
    def test_strategy_e_positive_in_both_halves(self):
        """Combined gate (Mon+NY_AFT) must improve PnL in H1 AND H2 independently."""
        trades = fetch_trades()
        n = len(trades)
        h1 = trades[:n // 2]
        h2 = trades[n // 2:]

        # PnL of the subset that survives the combined gate.
        def gate_e_pnl(subset):
            return sum(t["pnl"] for t in subset
                       if t["dow"] != 0 and t["session"] != "NY_AFTERNOON")

        def base_pnl(subset):
            return sum(t["pnl"] for t in subset)

        # Gated PnL > ungated PnL ⟺ the blocked trades were net losers.
        assert gate_e_pnl(h1) > base_pnl(h1), "Strategy E degrades H1"
        assert gate_e_pnl(h2) > base_pnl(h2), "Strategy E degrades H2"
|
||||
|
||||
|
||||
class TestPermutationSignificance:
    """
    Permutation test: shuffle session / DoW labels randomly.
    The observed improvement from blocking must rank in the top 5%
    of the null distribution (p < 0.05) to be considered non-random.
    """

    @skip_no_ch
    def test_ny_afternoon_block_is_significant(self):
        trades = fetch_trades()
        ny_pnl = sum(t["pnl"] for t in trades if t["session"] == "NY_AFTERNOON")
        observed_delta = -ny_pnl  # gain from skipping NY_AFT trades

        p = permutation_pvalue(trades, observed_delta, "session", "NY_AFTERNOON",
                               n_perm=2000)
        assert p < 0.05, (
            f"NY_AFTERNOON block not significant: p={p:.3f} >= 0.05. "
            f"Effect may be noise at this sample size."
        )

    @skip_no_ch
    def test_monday_block_significance(self):
        trades = fetch_trades()
        mon_pnl = sum(t["pnl"] for t in trades if t["dow"] == 0)
        observed_delta = -mon_pnl

        p = permutation_pvalue(trades, observed_delta, "dow", 0, n_perm=2000)
        # Monday has fewer trades — use looser threshold (p < 0.15)
        # Flag as WARNING not FAIL if p >= 0.05: thin sample, directionally valid
        if p >= 0.05:
            print(f"\n WARN: Monday block p={p:.3f} >= 0.05. "
                  f"Directionally valid but underpowered (n={sum(1 for t in trades if t['dow']==0)}).")
        assert p < 0.15, (
            f"Monday block not even marginally significant: p={p:.3f}. "
            f"Gate should not be applied until more data accumulates."
        )

    @skip_no_ch
    def test_london_morning_block_would_hurt(self):
        """Blocking LONDON_MORNING (the BEST session) must NOT improve PnL."""
        trades = fetch_trades()
        ldn_pnl = sum(t["pnl"] for t in trades if t["session"] == "LONDON_MORNING")
        observed_delta = -ldn_pnl  # gain from blocking LDN (expect negative = harmful)

        # LDN is net-positive, so blocking it is harmful (delta < 0).
        # No permutation needed — this is a sanity check on the sign alone.
        assert observed_delta < 0, (
            f"Blocking LONDON_MORNING should HURT PnL (it is the best session). "
            f"Got delta={observed_delta:.2f}. Check data integrity."
        )
|
||||
|
||||
|
||||
class TestMultipleComparison:
    """
    Multiple comparison correction.
    We inspected 5 sessions × 7 DoW = 35 cells. Finding 'significant' cells
    after inspection requires Bonferroni correction: α_adj = 0.05 / 35 ≈ 0.0014.
    Only cells where WR deviation is large enough to survive Bonferroni should
    be used in the gate.

    We test: do our chosen cells (NY_AFT, Monday) survive Bonferroni?
    Using a binomial z-test as a proxy for the corrected p-value.
    """

    @skip_no_ch
    def test_ny_afternoon_survives_bonferroni(self):
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        n = len(ny)
        baseline = wr(trades)
        wr_ny = wr(ny)

        # SE of a baseline-rate proportion at the cell's sample size.
        se = binomial_se(baseline, n)
        z = (baseline - wr_ny) / se if se > 0 else 0
        # One-tailed z for Bonferroni α=0.0014: z_crit ≈ 2.99
        # We use 2.0 as a practical threshold (more conservative than 1.96 but
        # less strict than Bonferroni, given 3-week sample inherent limitations)
        assert z > 2.0, (
            f"NY_AFTERNOON WR deviation (z={z:.2f}) does not survive "
            f"multiple-comparison correction. n={n}, WR={wr_ny:.3f} vs base={baseline:.3f}."
        )

    @skip_no_ch
    def test_monday_bonferroni_warning(self):
        trades = fetch_trades()
        mon = [t for t in trades if t["dow"] == 0]
        n = len(mon)
        baseline = wr(trades)
        wr_mon = wr(mon)

        se = binomial_se(baseline, n)
        z = (baseline - wr_mon) / se if se > 0 else 0

        # Monday: warn if z < 2.0 (doesn't survive strict Bonferroni)
        if z < 2.0:
            print(f"\n WARN: Monday z={z:.2f} < 2.0. Does not survive Bonferroni "
                  f"at current sample (n={n}). Apply Monday gate cautiously.")
        # Require at least z > 1.0 (directional signal, not pure noise)
        assert z > 1.0, (
            f"Monday WR deviation is indistinguishable from noise: z={z:.2f}. "
            f"Do not gate Monday until more trades accumulate."
        )

    @skip_no_ch
    def test_no_spurious_best_cell_used_as_gate(self):
        """
        Best-cell cherry-pick guard: the SINGLE best-performing cell in the dataset
        must NOT be treated as a reliable gate without Bonferroni correction.
        Test: find the best WR cell (n >= 10), check that its deviation is NOT
        significantly larger than the worst cell — both could be noise extremes.
        """
        trades = fetch_trades()
        # Cell key = (day of week, session) — the 35-cell grid from the docstring.
        cells: Dict[Tuple, List[dict]] = defaultdict(list)
        for t in trades:
            cells[(t["dow"], t["session"])].append(t)

        valid = [(k, v) for k, v in cells.items() if len(v) >= 10]
        if len(valid) < 5:
            pytest.skip("Not enough cells with n >= 10")

        wrs = [(k, wr(v), len(v)) for k, v in valid]
        best = max(wrs, key=lambda x: x[1])
        worst = min(wrs, key=lambda x: x[1])

        baseline = wr(trades)
        se_best = binomial_se(baseline, best[2])
        se_worst = binomial_se(baseline, worst[2])

        z_best = (best[1] - baseline) / se_best if se_best > 0 else 0
        z_worst = (baseline - worst[1]) / se_worst if se_worst > 0 else 0

        # Both extremes should be similarly significant (or not).
        # If best is >3σ but worst is <1σ, something is asymmetric — flag it.
        # Acceptable: both extremes are significant OR both are marginal.
        # NOTE: when z_worst <= 0.1 the ratio is inf and the assert fails by
        # design — a negligible worst-cell deviation makes any best-cell
        # extreme suspect.
        ratio = z_best / z_worst if z_worst > 0.1 else float("inf")
        assert ratio < 5.0, (
            f"Asymmetric cell extremes: z_best={z_best:.2f} vs z_worst={z_worst:.2f}. "
            f"Best cell ({best[0]}) may be a cherry-pick artifact."
        )
|
||||
|
||||
|
||||
class TestBootstrapCI:
    """
    Bootstrap confidence intervals on WR for each gated segment.
    The 95% CI upper bound for NY_AFTERNOON WR must be below baseline WR.
    If the CI overlaps the baseline, the effect is not reliable.
    """

    @skip_no_ch
    def test_ny_afternoon_ci_below_baseline(self):
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]

        assert len(ny) >= 20, f"NY_AFT sample too small for bootstrap: n={len(ny)}"

        # Only the upper bound matters: it must clear the baseline from below.
        _, upper = bootstrap_wr_ci(ny, n_boot=3000)
        baseline = wr(trades)

        assert upper < baseline, (
            f"NY_AFTERNOON WR CI upper bound ({upper:.3f}) overlaps baseline "
            f"WR ({baseline:.3f}). Effect not reliable at 95% confidence."
        )

    @skip_no_ch
    def test_london_morning_ci_above_baseline(self):
        trades = fetch_trades()
        ldn = [t for t in trades if t["session"] == "LONDON_MORNING"]

        assert len(ldn) >= 20, f"LDN sample too small: n={len(ldn)}"

        lower, _ = bootstrap_wr_ci(ldn, n_boot=3000)
        baseline = wr(trades)

        # Softer criterion than NY_AFT: the lower bound only needs to stay
        # within 5% below baseline, since LDN is an advantage claim not a gate.
        assert lower > baseline * 0.95, (
            f"LONDON_MORNING WR CI lower bound ({lower:.3f}) is too far below "
            f"baseline ({baseline:.3f}). LDN advantage may not be reliable."
        )

    @skip_no_ch
    def test_ny_afternoon_pnl_ci_negative(self):
        """Net PnL CI for NY_AFTERNOON must have upper bound < 0 (net loser with confidence)."""
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]

        assert len(ny) >= 20

        _, upper = bootstrap_pnl_ci(ny, n_boot=3000)
        assert upper < 0, (
            f"NY_AFTERNOON net PnL CI upper bound is {upper:.2f} > 0. "
            f"Cannot confidently call it a net loser at current sample size."
        )
|
||||
|
||||
|
||||
class TestMinimumSampleSize:
    """
    Minimum sample size guard. No session or DoW factor should influence
    the advisory score unless it has n >= 30 trades. Below 30, the WR
    estimate has SE > 9pp (too noisy to act on).
    """

    @skip_no_ch
    def test_all_gate_factors_have_sufficient_n(self):
        """
        The two gated factors (NY_AFTERNOON, Monday) must each have n >= 30
        in the current dataset for the gate to be considered valid.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        mon = [t for t in trades if t["dow"] == 0]

        assert len(ny) >= 30, f"NY_AFTERNOON n={len(ny)} < 30. Gate underpowered."
        assert len(mon) >= 30, f"Monday n={len(mon)} < 30. Gate underpowered."

    @skip_no_ch
    def test_slot_15m_gate_would_be_overfit(self):
        """
        15-minute slot data has median n ≈ 7. Any slot-level gate applied
        directly would be extreme overfitting. Verify: majority of slots have n < 30.
        """
        trades = fetch_trades()
        # Count trades per 15-minute slot of the day ("H:MM" keys).
        slots: Dict[str, int] = defaultdict(int)
        for t in trades:
            h = t["ts"].hour
            m = (t["ts"].minute // 15) * 15
            slots[f"{h}:{m:02d}"] += 1

        n_thin = sum(1 for n in slots.values() if n < 30)
        frac = n_thin / len(slots) if slots else 1.0

        # Inverted expectation: the test PASSES when slots are underpowered,
        # documenting that a slot-level gate would be overfit.
        assert frac > 0.70, (
            f"Only {frac:.0%} of 15m slots have n < 30. "
            f"Expected most slots to be underpowered — if not, slot gate may be premature."
        )

    def test_advisory_score_weights_reflect_sample_size(self):
        """
        Slot weight (0.10) must be lower than session (0.25) and DoW (0.30).
        Ensures the weakest-sample factor has the lowest influence.
        """
        # Imported here so this test runs even without ClickHouse.
        from esof_advisor import SESSION_STATS, DOW_STATS, SLOT_STATS
        # v[0] is assumed to be the per-cell sample count — TODO confirm against esof_advisor.
        median_session_n = sorted([v[0] for v in SESSION_STATS.values()])[len(SESSION_STATS) // 2]
        median_dow_n = sorted([v[0] for v in DOW_STATS.values()])[len(DOW_STATS) // 2]
        median_slot_n = sorted([v[0] for v in SLOT_STATS.values()])[len(SLOT_STATS) // 2]

        assert median_slot_n < median_session_n, "Slot n should be < session n"
        assert median_slot_n < median_dow_n, "Slot n should be < DoW n"
        # Slot weight is 0.10, session 0.25, DoW 0.30 — smaller n = smaller weight
        # NOTE(review): these constants mirror esof_advisor's weights by hand;
        # if the advisor's weights change, this test will not notice.
        SLOT_WEIGHT = 0.10
        SESSION_WEIGHT = 0.25
        DOW_WEIGHT = 0.30
        assert SLOT_WEIGHT < SESSION_WEIGHT
        assert SLOT_WEIGHT < DOW_WEIGHT
|
||||
|
||||
|
||||
class TestEffectSize:
    """
    Cohen's h effect size on WR differences.
    |h| >= 0.2: small effect (minimum threshold to consider gating)
    |h| >= 0.5: medium effect (comfortable to gate)
    |h| >= 0.8: large effect (very strong signal)
    """

    @skip_no_ch
    def test_ny_afternoon_effect_size_medium(self):
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        baseline = wr(trades)
        h = cohen_h(wr(ny), baseline)
        # Gate requires at least a small effect (h >= 0.2).
        assert h >= 0.2, (
            f"NY_AFTERNOON effect size h={h:.3f} < 0.2 (small). "
            f"Signal too weak to justify gating."
        )

    @skip_no_ch
    def test_london_morning_effect_size_positive(self):
        trades = fetch_trades()
        ldn = [t for t in trades if t["session"] == "LONDON_MORNING"]
        baseline = wr(trades)
        # cohen_h returns an absolute value, so h >= 0 is a NaN/sanity check,
        # not a direction check.
        h = cohen_h(wr(ldn), baseline)
        assert h >= 0.0, "LDN effect size must be measurable"

    @skip_no_ch
    def test_dow_tuesday_effect_size(self):
        """Tuesday is the best DoW. Effect size must be positive."""
        trades = fetch_trades()
        tue = [t for t in trades if t["dow"] == 1]
        baseline = wr(trades)
        if len(tue) < 10:
            pytest.skip("Tuesday sample too thin")
        h = cohen_h(wr(tue), baseline)
        assert h >= 0.0, "Tuesday must show positive effect"

    @skip_no_ch
    def test_effect_size_ranking_matches_expectation(self):
        """
        NY_AFTERNOON effect size must be larger than LOW_LIQUIDITY effect size.
        NY_AFT has more trades and a larger WR gap — should show stronger signal.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        low = [t for t in trades if t["session"] == "LOW_LIQUIDITY"]
        base = wr(trades)

        # Cells below n=10 contribute h=0 rather than a noisy estimate.
        h_ny = cohen_h(wr(ny), base) if len(ny) >= 10 else 0
        h_low = cohen_h(wr(low), base) if len(low) >= 10 else 0

        # NY_AFTERNOON has 3× the sample of LOW_LIQ — effect should be at least as large
        assert h_ny >= h_low * 0.7, (
            f"NY_AFT h={h_ny:.3f} much smaller than LOW_LIQ h={h_low:.3f}. "
            f"Unexpected — check data."
        )
|
||||
|
||||
|
||||
class TestWalkForwardAdvisory:
    """
    Walk-forward advisory score validation.
    Train EsoF tables conceptually on H1 (we use the existing static tables as proxy).
    Evaluate: does the advisory score computed at H2 trade times predict H2 outcomes?

    Method: within H2, rank trades by advisory_score. The bottom quartile (most
    negative score) should have lower WR than the top quartile. If the score
    has no predictive power on OOS data, it is overfit to the in-sample period.
    """

    @skip_no_ch
    def test_score_predicts_wr_direction_in_h2(self):
        trades = fetch_trades()
        n = len(trades)
        # Out-of-sample half only, ranked by advisory score ascending.
        h2 = sorted(trades[n // 2:], key=lambda t: t["score"])

        if len(h2) < 40:
            pytest.skip(f"H2 too small for quartile split: n={len(h2)}")

        q = len(h2) // 4
        bottom = h2[:q]   # worst advisory scores
        top = h2[-q:]     # best advisory scores

        wr_bot = wr(bottom)
        wr_top = wr(top)

        assert wr_top > wr_bot, (
            f"Advisory score has no directional predictive power in H2: "
            f"WR_top={wr_top:.3f} WR_bot={wr_bot:.3f}. Score may be overfit."
        )

    @skip_no_ch
    def test_unfavorable_label_has_lower_wr_in_h2(self):
        trades = fetch_trades()
        n = len(trades)
        h2 = trades[n // 2:]

        unfav = [t for t in h2 if t["label"] == "UNFAVORABLE"]
        rest = [t for t in h2 if t["label"] != "UNFAVORABLE"]

        if len(unfav) < 5:
            pytest.skip(f"Too few UNFAVORABLE trades in H2: n={len(unfav)}")

        # 5pp tolerance: the label must not be clearly WORSE than random;
        # strict inferiority is not demanded at this sample size.
        assert wr(unfav) <= wr(rest) + 0.05, (
            f"UNFAVORABLE label does not predict lower WR in H2: "
            f"WR_unfav={wr(unfav):.3f} vs WR_rest={wr(rest):.3f}. "
            f"Advisory label may be overfit."
        )
|
||||
|
||||
|
||||
class TestAssetBucketStability:
    """
    The session/DoW effect must not be driven by a single asset bucket.
    If NY_AFTERNOON drag is entirely explained by, say, B4 trades clustering
    in that session, the gate is actually gating B4 by proxy — not time.
    The effect must hold across at least 2 independent buckets.
    """

    @skip_no_ch
    def test_ny_afternoon_drag_cross_bucket(self):
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        not_ny = [t for t in trades if t["session"] != "NY_AFTERNOON"]

        # Group in-session and out-of-session trades by AEM bucket id.
        by_bucket_ny = defaultdict(list)
        by_bucket_out = defaultdict(list)
        for t in ny:
            by_bucket_ny[t["bucket_id"]].append(t)
        for t in not_ny:
            by_bucket_out[t["bucket_id"]].append(t)

        # Count buckets where NY_AFT WR is below out-of-session WR
        n_confirming = 0
        for bkt in by_bucket_ny:
            # Require at least 5 trades on each side for the comparison to count.
            if len(by_bucket_ny[bkt]) < 5 or len(by_bucket_out.get(bkt, [])) < 5:
                continue
            if wr(by_bucket_ny[bkt]) < wr(by_bucket_out[bkt]):
                n_confirming += 1

        assert n_confirming >= 2, (
            f"NY_AFT drag only confirmed in {n_confirming} bucket(s). "
            f"Need ≥ 2 for effect to be session-driven, not bucket-confounded."
        )

    @skip_no_ch
    def test_monday_drag_cross_bucket(self):
        trades = fetch_trades()
        mon = [t for t in trades if t["dow"] == 0]
        not_mon = [t for t in trades if t["dow"] != 0]

        by_bkt_mon = defaultdict(list)
        by_bkt_out = defaultdict(list)
        for t in mon:
            by_bkt_mon[t["bucket_id"]].append(t)
        for t in not_mon:
            by_bkt_out[t["bucket_id"]].append(t)

        n_confirming = 0
        for bkt in by_bkt_mon:
            if len(by_bkt_mon[bkt]) < 5 or len(by_bkt_out.get(bkt, [])) < 5:
                continue
            if wr(by_bkt_mon[bkt]) < wr(by_bkt_out[bkt]):
                n_confirming += 1

        if n_confirming < 2:
            print(f"\n WARN: Monday drag only in {n_confirming} bucket(s). "
                  f"Thin sample — cannot confirm cross-bucket. Gate with caution.")
        # Soft assert: Monday has thinner sample, require at least 1
        assert n_confirming >= 1, (
            f"Monday drag not present in ANY bucket. "
            f"Likely a sampling artifact — do not gate Monday."
        )
|
||||
|
||||
|
||||
class TestRegimeConfound:
    """
    Regime confound check: is the session effect just a proxy for ACB beta?
    If all NY_AFTERNOON trades happen to coincide with low ACB beta (bearish
    regime), then blocking NY_AFT is actually blocking bear-regime trades,
    not session-specific trades. The gate would be redundant with ACB.

    Method: compare ACB leverage (proxy for regime strength) between
    NY_AFTERNOON and other sessions. If leverage distributions are
    significantly different, the session effect is partially confounded.
    """

    @skip_no_ch
    def test_ny_afternoon_leverage_not_systematically_different(self):
        """
        NY_AFTERNOON avg leverage should be within 20% of other sessions' avg leverage.
        Large divergence → session effect may be a regime proxy.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
        not_ny = [t for t in trades if t["session"] != "NY_AFTERNOON"]

        if len(ny) < 10 or len(not_ny) < 10:
            pytest.skip("Insufficient data for leverage comparison")

        avg_lev_ny = sum(t["leverage"] for t in ny) / len(ny)
        avg_lev_out = sum(t["leverage"] for t in not_ny) / len(not_ny)

        # ratio ~1.0 means NY_AFT leverage is regime-neutral vs other sessions.
        ratio = avg_lev_ny / avg_lev_out if avg_lev_out > 0 else 1.0

        assert 0.80 <= ratio <= 1.20, (
            f"NY_AFTERNOON avg leverage ({avg_lev_ny:.2f}x) differs by >{20}% "
            f"from other sessions ({avg_lev_out:.2f}x). "
            f"Session effect may be a regime-proxy — investigate confound."
        )

    @skip_no_ch
    def test_ny_afternoon_wr_negative_across_leverage_bands(self):
        """
        Regime confound falsification: split NY_AFT trades into high/low leverage.
        If NY_AFT drag holds in BOTH leverage bands, it is NOT purely a regime effect.
        """
        trades = fetch_trades()
        ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]

        if len(ny) < 20:
            pytest.skip(f"NY_AFT too small for leverage split: n={len(ny)}")

        # Median split of in-session trades by leverage.
        median_lev = sorted(t["leverage"] for t in ny)[len(ny) // 2]
        hi_lev = [t for t in ny if t["leverage"] >= median_lev]
        lo_lev = [t for t in ny if t["leverage"] < median_lev]
        baseline = wr(fetch_trades())

        # Bands below n=5 default to True (counted as confirming).
        hi_below = wr(hi_lev) < baseline if len(hi_lev) >= 5 else True
        lo_below = wr(lo_lev) < baseline if len(lo_lev) >= 5 else True

        # NOTE(review): the docstring says drag should hold in BOTH bands, but
        # the assertion only requires at least ONE band (fails only when the
        # drag is absent in both) — confirm whether `or` is the intended bar.
        assert hi_below or lo_below, (
            "NY_AFT drag absent in BOTH leverage bands — effect is not regime-independent. "
            "Gate may be a regime proxy."
        )
|
||||
|
||||
|
||||
# ═════════════════════════════════════════════════════════════════════════════
|
||||
# STANDALONE REPORT
|
||||
# ═════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# ANSI escape codes for the standalone terminal report.
GREEN = "\033[32m"; RED = "\033[31m"; YELLOW = "\033[33m"
BOLD = "\033[1m"; DIM = "\033[2m"; RST = "\033[0m"

if __name__ == "__main__":
    # Standalone mode: print a human-readable summary of every guard metric
    # instead of running under pytest.
    if not CH_UP:
        print(f"{RED}ClickHouse not available.{RST}")
        sys.exit(1)

    trades = fetch_trades()
    n = len(trades)
    # Chronological halves for the walk-forward sections.
    h1, h2 = trades[:n // 2], trades[n // 2:]

    print(f"\n{BOLD}{'═'*68}{RST}")
    print(f"{BOLD} EsoF Overfitting Guard Report ({n} trades){RST}")
    print(f"{'═'*68}\n")

    baseline = wr(trades)
    ny = [t for t in trades if t["session"] == "NY_AFTERNOON"]
    mon = [t for t in trades if t["dow"] == 0]
    ldn = [t for t in trades if t["session"] == "LONDON_MORNING"]

    ny_h1 = [t for t in h1 if t["session"] == "NY_AFTERNOON"]
    ny_h2 = [t for t in h2 if t["session"] == "NY_AFTERNOON"]
    mon_h1 = [t for t in h1 if t["dow"] == 0]
    mon_h2 = [t for t in h2 if t["dow"] == 0]

    def row(label, val, ref=None, lo=None, hi=None, warn=None, note=""):
        # Print one aligned report line; green when val < ref (drag present),
        # red otherwise, yellow when warn is truthy.
        if lo is not None:
            ci_str = f" 95%CI [{lo:.3f}, {hi:.3f}]"
        else:
            ci_str = ""
        col = GREEN if (ref is None or val < ref) else RED
        if warn:
            col = YELLOW
        print(f" {label:<42} {col}{val:.3f}{RST}{ci_str} {DIM}{note}{RST}")

    print(f" {'Baseline WR':<42} {baseline:.3f}")
    print()

    print(f" {BOLD}1. Temporal Stability (H1 / H2){RST}")
    row(" NY_AFT WR — H1", wr(ny_h1), baseline, note=f"n={len(ny_h1)}")
    row(" NY_AFT WR — H2", wr(ny_h2), baseline, note=f"n={len(ny_h2)}")
    row(" Mon WR — H1", wr(mon_h1), baseline, note=f"n={len(mon_h1)}")
    row(" Mon WR — H2", wr(mon_h2), baseline, note=f"n={len(mon_h2)}")

    print(f"\n {BOLD}2. Permutation p-values{RST}")
    ny_pnl = sum(t["pnl"] for t in ny)
    mon_pnl = sum(t["pnl"] for t in mon)
    p_ny = permutation_pvalue(trades, -ny_pnl, "session", "NY_AFTERNOON", n_perm=2000)
    p_mon = permutation_pvalue(trades, -mon_pnl, "dow", 0, n_perm=2000)
    col_ny = GREEN if p_ny < 0.05 else YELLOW if p_ny < 0.15 else RED
    col_mon = GREEN if p_mon < 0.05 else YELLOW if p_mon < 0.15 else RED
    print(f" {'NY_AFT block p-value':<42} {col_ny}{p_ny:.4f}{RST} {DIM}(< 0.05 = significant){RST}")
    print(f" {'Monday block p-value':<42} {col_mon}{p_mon:.4f}{RST} {DIM}(< 0.15 = directional){RST}")

    print(f"\n {BOLD}3. Effect Sizes (Cohen's h){RST}")
    h_ny = cohen_h(wr(ny), baseline)
    h_mon = cohen_h(wr(mon), baseline)
    h_ldn = cohen_h(wr(ldn), baseline)
    for label, h, n_cell in [("NY_AFT", h_ny, len(ny)), ("Monday", h_mon, len(mon)), ("London", h_ldn, len(ldn))]:
        grade = "large" if h >= 0.8 else "medium" if h >= 0.5 else "small" if h >= 0.2 else "trivial"
        col = GREEN if h >= 0.5 else YELLOW if h >= 0.2 else RED
        print(f" {' '+label:<42} {col}{h:.3f}{RST} {DIM}{grade} (n={n_cell}){RST}")

    print(f"\n {BOLD}4. Bootstrap 95% CIs{RST}")
    ny_lo, ny_hi = bootstrap_wr_ci(ny, n_boot=3000)
    col = GREEN if ny_hi < baseline else RED
    print(f" {'NY_AFT WR CI':<42} {col}[{ny_lo:.3f}, {ny_hi:.3f}]{RST} "
          f"{DIM}({'below' if ny_hi < baseline else 'overlaps'} baseline {baseline:.3f}){RST}")
    ny_plo, ny_phi = bootstrap_pnl_ci(ny, n_boot=3000)
    col = GREEN if ny_phi < 0 else RED
    print(f" {'NY_AFT net PnL CI':<42} {col}[{ny_plo:+,.0f}, {ny_phi:+,.0f}]{RST} "
          f"{DIM}({'net loser with confidence' if ny_phi < 0 else 'uncertain sign'}){RST}")

    print(f"\n {BOLD}5. Bonferroni z-scores (35 cells tested){RST}")
    se_ny = binomial_se(baseline, len(ny))
    se_mon = binomial_se(baseline, len(mon))
    z_ny = (baseline - wr(ny)) / se_ny if se_ny > 0 else 0
    z_mon = (baseline - wr(mon)) / se_mon if se_mon > 0 else 0
    crit = 2.99  # Bonferroni α=0.0014 → z_crit≈2.99
    col_ny = GREEN if z_ny > crit else YELLOW if z_ny > 2.0 else RED
    col_mon = GREEN if z_mon > crit else YELLOW if z_mon > 2.0 else RED
    print(f" {'NY_AFT z':<42} {col_ny}{z_ny:.2f}{RST} {DIM}(Bonferroni crit ≈ {crit}){RST}")
    print(f" {'Monday z':<42} {col_mon}{z_mon:.2f}{RST}")

    print(f"\n {BOLD}6. Walk-Forward: advisory score → H2 WR{RST}")
    h2s = sorted(h2, key=lambda t: t["score"])
    q = max(1, len(h2s) // 4)
    wr_bot = wr(h2s[:q])
    wr_top = wr(h2s[-q:])
    col = GREEN if wr_top > wr_bot else RED
    print(f" {' Top-quartile score WR (H2)':<42} {col}{wr_top:.3f}{RST} {DIM}n={q}{RST}")
    print(f" {' Bot-quartile score WR (H2)':<42} {col}{wr_bot:.3f}{RST} {DIM}n={q}{RST}")
    print(f" {' Predictive (top > bot)?':<42} {col}{'YES' if wr_top > wr_bot else 'NO — score overfit'}{RST}")

    print(f"\n{'═'*68}\n")
|
||||
Reference in New Issue
Block a user