Files
DOLPHIN/prod/tests/test_degradational.py

479 lines
21 KiB
Python
Raw Normal View History

"""
DOLPHIN Degradational / Chaos Test Suite
=========================================
Triggers real failure modes against live Docker containers and supervisord processes,
then asserts correct healing/restart within time budgets.
REQUIRES:
- Docker running (dolphin-hazelcast, dolphin-prefect, dolphin-hazelcast-mc)
- supervisord running with dolphin group
- MHS (meta_health) running
- nautilus_trader running
Run as root (docker commands require it):
/home/dolphin/siloqy_env/bin/python3 -m pytest prod/tests/test_degradational.py -v -s --timeout=120
"""
import json
import math
import re
import subprocess
import time
import urllib.request
from pathlib import Path

import pytest
# ── Constants ────────────────────────────────────────────────────────────────
# Supervisord config defining the dolphin process groups used by _supervisord()
SUPERVISORD_CONF = "/mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf"
# Health-probe endpoints for the three Docker containers
HZ_HEALTH_URL = "http://127.0.0.1:5701/hazelcast/health"
PREFECT_HEALTH_URL = "http://127.0.0.1:4200/api/health"
MC_HEALTH_URL = "http://127.0.0.1:8080/"
# Log file written by nautilus_trader; tests watch it for growth / restore lines
TRADER_LOG = "/tmp/nautilus_trader.log"
# On-disk capital checkpoint (JSON with 'capital' and 'ts' keys)
CAPITAL_DISK = Path("/tmp/dolphin_capital_checkpoint.json")
# Recovery time budgets (seconds) asserted by the tests below
HZ_RESTART_BUDGET_S = 25 # worst-case: ~19s + 6s buffer
PREFECT_RESTART_BUDGET_S = 40
MC_RESTART_BUDGET_S = 90 # MC is non-critical, slower tolerance
# ── Helpers ───────────────────────────────────────────────────────────────────
def _http_ok(url, timeout=1.0):
try:
with urllib.request.urlopen(url, timeout=timeout) as r:
return r.status == 200
except Exception:
return False
def _hz_active(timeout=0.5):
    """True iff the Hazelcast health endpoint reports nodeState == 'ACTIVE'."""
    try:
        with urllib.request.urlopen(HZ_HEALTH_URL, timeout=timeout) as resp:
            state = json.loads(resp.read()).get('nodeState')
    except Exception:
        return False
    return state == 'ACTIVE'
def _prefect_ok(timeout=0.5):
    """True iff the Prefect health endpoint returns the literal body b'true'."""
    try:
        with urllib.request.urlopen(PREFECT_HEALTH_URL, timeout=timeout) as resp:
            body = resp.read()
    except Exception:
        return False
    return body.strip() == b'true'
def _wait_until(predicate, budget_s, poll=0.3):
t0 = time.time()
while time.time() - t0 < budget_s:
if predicate():
return time.time() - t0
time.sleep(poll)
raise TimeoutError(f"Not recovered within {budget_s}s")
def _supervisord(cmd):
    """Run `supervisorctl -c <conf> <cmd...>` and return the CompletedProcess."""
    argv = ["supervisorctl", "-c", SUPERVISORD_CONF, *cmd.split()]
    return subprocess.run(argv, capture_output=True, text=True)
def _trader_pid():
    """Return the nautilus_trader PID from supervisorctl status, or None if absent.

    Expected output shape: "dolphin:nautilus_trader RUNNING pid 12345, uptime ..."

    Fix: the original did `import re` inside the function body on every call;
    `re` is now imported once at module level (see imports block).
    """
    status = _supervisord("status dolphin:nautilus_trader")
    m = re.search(r'pid\s+(\d+)', status.stdout)
    return int(m.group(1)) if m else None
def _wait_hz_cooldown_clear(max_wait=8):
    """Block until HZ is confirmed healthy again, letting MHS reset its cooldown."""
    _wait_until(_hz_active, max_wait)
def _docker_kill(name):
    """SIGKILL the named container; raises CalledProcessError if docker fails."""
    cmd = ["docker", "kill", name]
    subprocess.run(cmd, check=True, capture_output=True)
def _docker_stop(name):
    """Gracefully stop the named container (SIGTERM, 2s grace before SIGKILL)."""
    cmd = ["docker", "stop", "-t", "2", name]
    subprocess.run(cmd, check=True, capture_output=True)
def _docker_running(name):
    """True iff `docker inspect` reports the container's State.Running as true."""
    proc = subprocess.run(
        ["docker", "inspect", "--format", "{{.State.Running}}", name],
        capture_output=True, text=True,
    )
    return proc.stdout.strip() == "true"
def _assert_hz_was_healthy():
    """Precondition guard: fail fast if Hazelcast is not healthy going in."""
    healthy = _hz_active(timeout=2.0)
    assert healthy, "Precondition: HZ must be healthy before test"
def _assert_prefect_was_healthy():
    """Precondition guard: fail fast if Prefect is not healthy going in."""
    healthy = _prefect_ok(timeout=2.0)
    assert healthy, "Precondition: Prefect must be healthy before test"
# ── Fixtures ──────────────────────────────────────────────────────────────────
@pytest.fixture(autouse=True)
def ensure_baseline_healthy():
    """Wait for all services healthy + trader running before each test."""
    def _all_healthy():
        return _hz_active() and _prefect_ok() and _trader_pid() is not None

    # Pre-test: up to 90s for the full baseline; skip (don't fail) if unmet.
    deadline = time.time() + 90
    baseline_ok = False
    while time.time() < deadline:
        if _all_healthy():
            baseline_ok = True
            break
        time.sleep(1)
    if not baseline_ok:
        pytest.skip("Baseline services not healthy — skipping chaos test")

    yield

    # Post-test: wait for any killed containers to fully recover before next test
    deadline = time.time() + 90
    while time.time() < deadline:
        if _all_healthy():
            # Extra 2s for MHS cooldown reset (it resets on healthy probe, ~0.5s after recovery)
            time.sleep(2)
            break
        time.sleep(1)
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 1: Hazelcast container killed (SIGKILL)
# ══════════════════════════════════════════════════════════════════════════════
class TestHZContainerKill:
    """FAILURE MODE 1: Hazelcast container killed (SIGKILL).

    Fixes vs. original: removed unused local `kill_time`; restored the lost
    separator in the capital-drift assertion message (f-string had the two
    values fused together).
    """

    def test_hz_kill_mhs_heals_within_budget(self):
        """SIGKILL HZ → MHS HTTP probe detects in ~1s → docker restart → HZ healthy."""
        _assert_hz_was_healthy()
        _docker_kill("dolphin-hazelcast")
        # Immediately confirm it's dead
        time.sleep(0.5)
        assert not _hz_active(timeout=0.3), "HZ should be down after kill"
        # Wait for recovery
        recovery_s = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        print(f"\n HZ kill→recovered in {recovery_s:.1f}s (budget {HZ_RESTART_BUDGET_S}s)")
        assert recovery_s <= HZ_RESTART_BUDGET_S

    def test_hz_kill_trader_reconnects(self):
        """After HZ kill+recovery, nautilus_trader must be processing scans again within 45s."""
        _assert_hz_was_healthy()
        pre_log_size = Path(TRADER_LOG).stat().st_size
        _docker_kill("dolphin-hazelcast")
        time.sleep(1)
        # Wait for HZ recovery
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)

        # Then wait for trader to log a new LATENCY line (>100 bytes of new output)
        def _new_latency_line():
            try:
                return Path(TRADER_LOG).stat().st_size > pre_log_size + 100
            except Exception:
                return False

        reconnect_s = _wait_until(_new_latency_line, 45)
        print(f"\n Trader reconnected and logging within {reconnect_s:.1f}s of kill")
        assert reconnect_s <= 45

    def test_hz_kill_capital_survives_on_disk(self):
        """Kill HZ (loses in-memory maps) → disk checkpoint must still have valid capital."""
        _assert_hz_was_healthy()
        # Ensure there is a disk checkpoint (trader must have written one)
        assert CAPITAL_DISK.exists(), "Disk checkpoint must exist before kill"
        data = json.loads(CAPITAL_DISK.read_text())
        pre_capital = float(data['capital'])
        assert pre_capital >= 1.0, f"Pre-kill capital invalid: {pre_capital}"
        _docker_kill("dolphin-hazelcast")
        time.sleep(1)
        # Disk checkpoint must be unchanged (not corrupted by kill)
        data2 = json.loads(CAPITAL_DISK.read_text())
        post_capital = float(data2['capital'])
        assert math.isfinite(post_capital) and post_capital >= 1.0
        # Within 1% of pre-kill (may have advanced slightly from a scan just before kill)
        assert abs(post_capital - pre_capital) / pre_capital < 0.01, \
            f"Capital changed unexpectedly: {pre_capital} → {post_capital}"
        # Wait for recovery
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 2: Hazelcast container graceful stop
# ══════════════════════════════════════════════════════════════════════════════
class TestHZContainerStop:
    """FAILURE MODE 2: graceful `docker stop` of the Hazelcast container."""

    def test_hz_stop_recovers_within_budget(self):
        """Graceful stop (SIGTERM) — same recovery path as kill."""
        _assert_hz_was_healthy()
        _docker_stop("dolphin-hazelcast")
        time.sleep(0.5)
        assert not _hz_active(timeout=0.3)
        elapsed = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        print(f"\n HZ stop→recovered in {elapsed:.1f}s")
        assert elapsed <= HZ_RESTART_BUDGET_S
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 3: Prefect container killed
# ══════════════════════════════════════════════════════════════════════════════
class TestPrefectContainerKill:
    """FAILURE MODE 3: Prefect container killed."""

    def test_prefect_kill_recovers_within_budget(self):
        """SIGKILL Prefect → MHS probe detects → docker restart → Prefect healthy."""
        _assert_prefect_was_healthy()
        _docker_kill("dolphin-prefect")
        time.sleep(0.5)
        assert not _prefect_ok(timeout=0.3), "Prefect should be down"
        elapsed = _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)
        print(f"\n Prefect kill→recovered in {elapsed:.1f}s (budget {PREFECT_RESTART_BUDGET_S}s)")
        assert elapsed <= PREFECT_RESTART_BUDGET_S

    def test_prefect_kill_hz_unaffected(self):
        """Killing Prefect must not affect HZ or the trader."""
        _assert_hz_was_healthy()
        _assert_prefect_was_healthy()
        _docker_kill("dolphin-prefect")
        time.sleep(2)
        # HZ must still be healthy
        assert _hz_active(timeout=1.0), "HZ must be unaffected by Prefect kill"
        # Trader must still be running
        trader_pid = _trader_pid()
        assert trader_pid is not None and trader_pid > 0, "Trader must still be running"
        # Wait for Prefect to recover
        _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 4: Simultaneous HZ + Prefect kill
# ══════════════════════════════════════════════════════════════════════════════
class TestSimultaneousKill:
    """FAILURE MODE 4: simultaneous HZ + Prefect kill.

    Fix vs. original: removed unused local `kill_time`.
    """

    def test_hz_and_prefect_simultaneous_kill(self):
        """Both killed simultaneously — both must recover independently."""
        _assert_hz_was_healthy()
        _assert_prefect_was_healthy()
        _docker_kill("dolphin-hazelcast")
        _docker_kill("dolphin-prefect")
        time.sleep(0.5)
        assert not _hz_active(timeout=0.3)
        assert not _prefect_ok(timeout=0.3)
        # Both must recover — HZ first (faster restart), then Prefect.
        # Note: prefect_recovery is measured from after HZ recovered, so it is a
        # lower bound on Prefect's true downtime — conservative in our favor.
        hz_recovery = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        prefect_recovery = _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)
        print(f"\n Simultaneous kill: HZ recovered in {hz_recovery:.1f}s, "
              f"Prefect in {prefect_recovery:.1f}s")
        assert hz_recovery <= HZ_RESTART_BUDGET_S
        assert prefect_recovery <= PREFECT_RESTART_BUDGET_S
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 5: nautilus_trader process killed (supervisord restarts)
# ══════════════════════════════════════════════════════════════════════════════
class TestTraderProcessKill:
    """FAILURE MODE 5: nautilus_trader process killed (supervisord restarts).

    Fixes vs. original: removed dead inner function `_trader_log_shows_restored`
    (defined but never called); guard against `_trader_pid()` returning None
    (would otherwise run `kill -9 None`); restored the lost separator in the
    PID-transition print.
    """

    def test_trader_kill_supervisord_restarts(self):
        """Kill trader process — supervisord must restart it and it must connect to HZ."""
        pid_before = _trader_pid()
        assert pid_before is not None
        subprocess.run(["kill", "-9", str(pid_before)], check=True)
        time.sleep(2)

        # Wait for supervisord to restart and new process to connect
        def _new_pid_running():
            r = _supervisord("status dolphin:nautilus_trader")
            return "RUNNING" in r.stdout

        recovery_s = _wait_until(_new_pid_running, 30)
        pid_after = _trader_pid()
        assert pid_after != pid_before, "supervisord must have assigned new PID"
        print(f"\n Trader killed+restarted in {recovery_s:.1f}s (PID {pid_before}{pid_after})")

    def test_trader_restart_capital_restored_from_disk(self):
        """After trader restart, capital must be restored from disk checkpoint."""
        assert CAPITAL_DISK.exists(), "Disk checkpoint required"
        data = json.loads(CAPITAL_DISK.read_text())
        # NOTE(review): expected_capital is only range-checked here, never compared
        # against the restored value below — confirm whether a numeric comparison
        # was intended.
        expected_capital = float(data['capital'])
        assert expected_capital >= 1.0
        pid_before = _trader_pid()
        assert pid_before is not None, "Trader must be running before kill"
        subprocess.run(["kill", "-9", str(pid_before)], check=True)
        # Wait for restart + first scan processed
        _wait_until(lambda: _supervisord("status dolphin:nautilus_trader").stdout.count("RUNNING") > 0, 20)
        time.sleep(5)
        # Inspect only the log segment after the most recent trader startup banner
        log_tail = Path(TRADER_LOG).read_text().split("🐬 DOLPHIN")[-1]
        if "no valid checkpoint" in log_tail:
            pytest.fail("Trader started without capital checkpoint — disk restore failed")
        if "Capital restored" in log_tail:
            # Extract restored value
            for line in log_tail.splitlines():
                if "Capital restored" in line:
                    print(f"\n {line.strip()}")
                    break
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 6: scan_bridge process killed
# ══════════════════════════════════════════════════════════════════════════════
class TestScanBridgeKill:
    """FAILURE MODE 6: scan_bridge process killed — supervisord must restart it.

    Fix vs. original: the PID was parsed by scanning whitespace tokens with
    str.isdigit(), but supervisorctl prints "pid 12345, uptime ..." — the
    trailing comma means no token is ever all-digits, so the test always
    skipped. Parse with the same regex `_trader_pid()` uses.
    """

    def test_scan_bridge_kill_supervisord_restarts(self):
        r = _supervisord("status dolphin:scan_bridge")
        assert "RUNNING" in r.stdout, "scan_bridge must be running"
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse scan_bridge PID")
        pid = int(m.group(1))
        subprocess.run(["kill", "-9", str(pid)], check=True)
        time.sleep(2)

        def _sb_running():
            return "RUNNING" in _supervisord("status dolphin:scan_bridge").stdout

        recovery_s = _wait_until(_sb_running, 20)
        print(f"\n scan_bridge restarted in {recovery_s:.1f}s")
        assert recovery_s <= 20
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 7: Rapid repeated HZ kills (stress resilience)
# ══════════════════════════════════════════════════════════════════════════════
class TestHZRapidKills:
    """FAILURE MODE 7: rapid repeated HZ kills (stress resilience)."""

    def test_hz_three_rapid_kills(self):
        """Kill HZ 3 times — each must recover. Waits for MHS cooldown reset between kills."""
        for attempt in range(1, 4):
            _assert_hz_was_healthy()
            _docker_kill("dolphin-hazelcast")
            recovery_s = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
            print(f"\n Kill #{attempt}: recovered in {recovery_s:.1f}s")
            assert recovery_s <= HZ_RESTART_BUDGET_S
            # Wait for MHS to confirm healthy (resets cooldown) before next kill
            time.sleep(1.5)
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 8: Capital checkpoint integrity under concurrent writes
# ══════════════════════════════════════════════════════════════════════════════
class TestCapitalCheckpointIntegrity:
    """FAILURE MODE 8: capital checkpoint integrity under concurrent writes."""

    def test_disk_checkpoint_always_valid_json(self):
        """Disk checkpoint must be valid JSON with capital >= 1.0 and finite ts."""
        assert CAPITAL_DISK.exists()
        checkpoint = json.loads(CAPITAL_DISK.read_text())
        capital = float(checkpoint['capital'])
        ts = float(checkpoint['ts'])
        assert math.isfinite(capital) and capital >= 1.0
        assert math.isfinite(ts) and ts > 1_700_000_000  # post-2023 epoch

    def test_disk_checkpoint_survives_hz_restart(self):
        """Restart HZ (clears in-memory maps) — disk checkpoint must still be valid."""
        assert CAPITAL_DISK.exists()
        pre = json.loads(CAPITAL_DISK.read_text())
        subprocess.run(
            ["docker", "restart", "dolphin-hazelcast"],
            check=True, capture_output=True,
        )
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        post = json.loads(CAPITAL_DISK.read_text())
        # Disk checkpoint should not have been corrupted
        post_capital = float(post['capital'])
        assert math.isfinite(post_capital)
        assert post_capital >= 1.0
        print(f"\n Capital pre={pre['capital']:.2f} post={post['capital']:.2f}")
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 9: MHS (meta_health) killed — supervisord restarts it
# ══════════════════════════════════════════════════════════════════════════════
class TestMHSKill:
    """FAILURE MODE 9: MHS (meta_health) killed — supervisord restarts it.

    Fix vs. original: both tests parsed the PID by scanning tokens with
    str.isdigit(), which never matches supervisorctl's "pid 12345," token
    (trailing comma) — so both tests always skipped. Parse with the same
    regex `_trader_pid()` uses.
    """

    def test_mhs_kill_supervisord_restarts(self):
        r = _supervisord("status dolphin_data:meta_health")
        assert "RUNNING" in r.stdout
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse meta_health PID")
        pid = int(m.group(1))
        subprocess.run(["kill", "-9", str(pid)], check=True)
        time.sleep(2)

        def _mhs_running():
            return "RUNNING" in _supervisord("status dolphin_data:meta_health").stdout

        recovery_s = _wait_until(_mhs_running, 20)
        print(f"\n MHS restarted in {recovery_s:.1f}s")
        assert recovery_s <= 20

    def test_hz_heals_even_without_mhs(self):
        """Kill MHS then kill HZ — autoheal (Docker layer) must still recover HZ."""
        _assert_hz_was_healthy()
        # Kill MHS so only the Docker-level autoheal path remains
        r = _supervisord("status dolphin_data:meta_health")
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse MHS PID")
        mhs_pid = int(m.group(1))
        subprocess.run(["kill", "-9", str(mhs_pid)], check=True)
        time.sleep(1)
        # Now kill HZ — autoheal must recover it without MHS
        _docker_kill("dolphin-hazelcast")
        time.sleep(1)
        # autoheal polls every 10s, Docker healthcheck interval 10s → worst case ~45s
        recovery_s = _wait_until(_hz_active, 60)
        print(f"\n HZ healed without MHS in {recovery_s:.1f}s (autoheal layer)")
        # Let MHS restart on its own via supervisord
        _wait_until(lambda: "RUNNING" in _supervisord("status dolphin_data:meta_health").stdout, 20)