Files
DOLPHIN/prod/tests/test_degradational.py

479 lines
21 KiB
Python
Raw Normal View History

"""
DOLPHIN Degradational / Chaos Test Suite
=========================================
Triggers real failure modes against live Docker containers and supervisord processes,
then asserts correct healing/restart within time budgets.
REQUIRES:
- Docker running (dolphin-hazelcast, dolphin-prefect, dolphin-hazelcast-mc)
- supervisord running with dolphin group
- MHS (meta_health) running
- nautilus_trader running
Run as root (docker commands require it):
/home/dolphin/siloqy_env/bin/python3 -m pytest prod/tests/test_degradational.py -v -s --timeout=120
"""
import json
import math
import re
import subprocess
import time
import urllib.request
from pathlib import Path

import pytest
# ── Constants ────────────────────────────────────────────────────────────────
# Supervisord config defining the dolphin process groups used by _supervisord()
SUPERVISORD_CONF = "/mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf"
# Health-probe endpoints for the three Docker containers
HZ_HEALTH_URL = "http://127.0.0.1:5701/hazelcast/health"
PREFECT_HEALTH_URL = "http://127.0.0.1:4200/api/health"
MC_HEALTH_URL = "http://127.0.0.1:8080/"
# Log file written by nautilus_trader; tests watch it for growth / restore lines
TRADER_LOG = "/tmp/nautilus_trader.log"
# On-disk capital checkpoint (JSON with 'capital' and 'ts' keys)
CAPITAL_DISK = Path("/tmp/dolphin_capital_checkpoint.json")
# Recovery time budgets (seconds) asserted by the tests below
HZ_RESTART_BUDGET_S = 25 # worst-case: ~19s + 6s buffer
PREFECT_RESTART_BUDGET_S = 40
MC_RESTART_BUDGET_S = 90 # MC is non-critical, slower tolerance
# ── Helpers ───────────────────────────────────────────────────────────────────
def _http_ok(url, timeout=1.0):
try:
with urllib.request.urlopen(url, timeout=timeout) as r:
return r.status == 200
except Exception:
return False
def _hz_active(timeout=0.5):
    """True iff the Hazelcast health endpoint reports nodeState == 'ACTIVE'."""
    try:
        with urllib.request.urlopen(HZ_HEALTH_URL, timeout=timeout) as resp:
            state = json.loads(resp.read()).get('nodeState')
    except Exception:
        return False
    return state == 'ACTIVE'
def _prefect_ok(timeout=0.5):
    """True iff the Prefect health endpoint returns the literal body b'true'."""
    try:
        with urllib.request.urlopen(PREFECT_HEALTH_URL, timeout=timeout) as resp:
            body = resp.read()
    except Exception:
        return False
    return body.strip() == b'true'
def _wait_until(predicate, budget_s, poll=0.3):
t0 = time.time()
while time.time() - t0 < budget_s:
if predicate():
return time.time() - t0
time.sleep(poll)
raise TimeoutError(f"Not recovered within {budget_s}s")
def _supervisord(cmd):
    """Run `supervisorctl -c <conf> <cmd...>` and return the CompletedProcess."""
    argv = ["supervisorctl", "-c", SUPERVISORD_CONF, *cmd.split()]
    return subprocess.run(argv, capture_output=True, text=True)
def _trader_pid():
    """Return the nautilus_trader PID from supervisorctl status, or None if absent.

    Expected output shape: "dolphin:nautilus_trader RUNNING pid 12345, uptime ..."

    Fix: the original did `import re` inside the function body on every call;
    `re` is now imported once at module level (see imports block).
    """
    status = _supervisord("status dolphin:nautilus_trader")
    m = re.search(r'pid\s+(\d+)', status.stdout)
    return int(m.group(1)) if m else None
def _wait_hz_cooldown_clear(max_wait=8):
    """Block until HZ is confirmed healthy again, letting MHS reset its cooldown."""
    _wait_until(_hz_active, max_wait)
def _docker_kill(name):
    """SIGKILL the named container; raises CalledProcessError if docker fails."""
    cmd = ["docker", "kill", name]
    subprocess.run(cmd, check=True, capture_output=True)
def _docker_stop(name):
    """Gracefully stop the named container (SIGTERM, 2s grace before SIGKILL)."""
    cmd = ["docker", "stop", "-t", "2", name]
    subprocess.run(cmd, check=True, capture_output=True)
def _docker_running(name):
    """True iff `docker inspect` reports the container's State.Running as true."""
    proc = subprocess.run(
        ["docker", "inspect", "--format", "{{.State.Running}}", name],
        capture_output=True, text=True,
    )
    return proc.stdout.strip() == "true"
def _assert_hz_was_healthy():
    """Precondition guard: fail fast if Hazelcast is not healthy going in."""
    healthy = _hz_active(timeout=2.0)
    assert healthy, "Precondition: HZ must be healthy before test"
def _assert_prefect_was_healthy():
    """Precondition guard: fail fast if Prefect is not healthy going in."""
    healthy = _prefect_ok(timeout=2.0)
    assert healthy, "Precondition: Prefect must be healthy before test"
# ── Fixtures ──────────────────────────────────────────────────────────────────
@pytest.fixture(autouse=True)
def ensure_baseline_healthy():
    """Wait for all services healthy + trader running before each test."""
    def _all_healthy():
        return _hz_active() and _prefect_ok() and _trader_pid() is not None

    # Pre-test: up to 90s for the full baseline; skip (don't fail) if unmet.
    deadline = time.time() + 90
    baseline_ok = False
    while time.time() < deadline:
        if _all_healthy():
            baseline_ok = True
            break
        time.sleep(1)
    if not baseline_ok:
        pytest.skip("Baseline services not healthy — skipping chaos test")

    yield

    # Post-test: wait for any killed containers to fully recover before next test
    deadline = time.time() + 90
    while time.time() < deadline:
        if _all_healthy():
            # Extra 2s for MHS cooldown reset (it resets on healthy probe, ~0.5s after recovery)
            time.sleep(2)
            break
        time.sleep(1)
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 1: Hazelcast container killed (SIGKILL)
# ══════════════════════════════════════════════════════════════════════════════
class TestHZContainerKill:
    """FAILURE MODE 1: Hazelcast container killed (SIGKILL).

    Fixes vs. original: removed unused local `kill_time`; restored the lost
    separator in the capital-drift assertion message (f-string had the two
    values fused together).
    """

    def test_hz_kill_mhs_heals_within_budget(self):
        """SIGKILL HZ → MHS HTTP probe detects in ~1s → docker restart → HZ healthy."""
        _assert_hz_was_healthy()
        _docker_kill("dolphin-hazelcast")
        # Immediately confirm it's dead
        time.sleep(0.5)
        assert not _hz_active(timeout=0.3), "HZ should be down after kill"
        # Wait for recovery
        recovery_s = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        print(f"\n HZ kill→recovered in {recovery_s:.1f}s (budget {HZ_RESTART_BUDGET_S}s)")
        assert recovery_s <= HZ_RESTART_BUDGET_S

    def test_hz_kill_trader_reconnects(self):
        """After HZ kill+recovery, nautilus_trader must be processing scans again within 45s."""
        _assert_hz_was_healthy()
        pre_log_size = Path(TRADER_LOG).stat().st_size
        _docker_kill("dolphin-hazelcast")
        time.sleep(1)
        # Wait for HZ recovery
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)

        # Then wait for trader to log a new LATENCY line (>100 bytes of new output)
        def _new_latency_line():
            try:
                return Path(TRADER_LOG).stat().st_size > pre_log_size + 100
            except Exception:
                return False

        reconnect_s = _wait_until(_new_latency_line, 45)
        print(f"\n Trader reconnected and logging within {reconnect_s:.1f}s of kill")
        assert reconnect_s <= 45

    def test_hz_kill_capital_survives_on_disk(self):
        """Kill HZ (loses in-memory maps) → disk checkpoint must still have valid capital."""
        _assert_hz_was_healthy()
        # Ensure there is a disk checkpoint (trader must have written one)
        assert CAPITAL_DISK.exists(), "Disk checkpoint must exist before kill"
        data = json.loads(CAPITAL_DISK.read_text())
        pre_capital = float(data['capital'])
        assert pre_capital >= 1.0, f"Pre-kill capital invalid: {pre_capital}"
        _docker_kill("dolphin-hazelcast")
        time.sleep(1)
        # Disk checkpoint must be unchanged (not corrupted by kill)
        data2 = json.loads(CAPITAL_DISK.read_text())
        post_capital = float(data2['capital'])
        assert math.isfinite(post_capital) and post_capital >= 1.0
        # Within 1% of pre-kill (may have advanced slightly from a scan just before kill)
        assert abs(post_capital - pre_capital) / pre_capital < 0.01, \
            f"Capital changed unexpectedly: {pre_capital} → {post_capital}"
        # Wait for recovery
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 2: Hazelcast container graceful stop
# ══════════════════════════════════════════════════════════════════════════════
class TestHZContainerStop:
    """FAILURE MODE 2: graceful `docker stop` of the Hazelcast container."""

    def test_hz_stop_recovers_within_budget(self):
        """Graceful stop (SIGTERM) — same recovery path as kill."""
        _assert_hz_was_healthy()
        _docker_stop("dolphin-hazelcast")
        time.sleep(0.5)
        assert not _hz_active(timeout=0.3)
        elapsed = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        print(f"\n HZ stop→recovered in {elapsed:.1f}s")
        assert elapsed <= HZ_RESTART_BUDGET_S
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 3: Prefect container killed
# ══════════════════════════════════════════════════════════════════════════════
class TestPrefectContainerKill:
    """FAILURE MODE 3: Prefect container killed."""

    def test_prefect_kill_recovers_within_budget(self):
        """SIGKILL Prefect → MHS probe detects → docker restart → Prefect healthy."""
        _assert_prefect_was_healthy()
        _docker_kill("dolphin-prefect")
        time.sleep(0.5)
        assert not _prefect_ok(timeout=0.3), "Prefect should be down"
        elapsed = _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)
        print(f"\n Prefect kill→recovered in {elapsed:.1f}s (budget {PREFECT_RESTART_BUDGET_S}s)")
        assert elapsed <= PREFECT_RESTART_BUDGET_S

    def test_prefect_kill_hz_unaffected(self):
        """Killing Prefect must not affect HZ or the trader."""
        _assert_hz_was_healthy()
        _assert_prefect_was_healthy()
        _docker_kill("dolphin-prefect")
        time.sleep(2)
        # HZ must still be healthy
        assert _hz_active(timeout=1.0), "HZ must be unaffected by Prefect kill"
        # Trader must still be running
        trader_pid = _trader_pid()
        assert trader_pid is not None and trader_pid > 0, "Trader must still be running"
        # Wait for Prefect to recover
        _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 4: Simultaneous HZ + Prefect kill
# ══════════════════════════════════════════════════════════════════════════════
class TestSimultaneousKill:
    """FAILURE MODE 4: simultaneous HZ + Prefect kill.

    Fix vs. original: removed unused local `kill_time`.
    """

    def test_hz_and_prefect_simultaneous_kill(self):
        """Both killed simultaneously — both must recover independently."""
        _assert_hz_was_healthy()
        _assert_prefect_was_healthy()
        _docker_kill("dolphin-hazelcast")
        _docker_kill("dolphin-prefect")
        time.sleep(0.5)
        assert not _hz_active(timeout=0.3)
        assert not _prefect_ok(timeout=0.3)
        # Both must recover — HZ first (faster restart), then Prefect.
        # Note: prefect_recovery is measured from after HZ recovered, so it is a
        # lower bound on Prefect's true downtime — conservative in our favor.
        hz_recovery = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        prefect_recovery = _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)
        print(f"\n Simultaneous kill: HZ recovered in {hz_recovery:.1f}s, "
              f"Prefect in {prefect_recovery:.1f}s")
        assert hz_recovery <= HZ_RESTART_BUDGET_S
        assert prefect_recovery <= PREFECT_RESTART_BUDGET_S
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 5: nautilus_trader process killed (supervisord restarts)
# ══════════════════════════════════════════════════════════════════════════════
class TestTraderProcessKill:
    """FAILURE MODE 5: nautilus_trader process killed (supervisord restarts).

    Fixes vs. original: removed dead inner function `_trader_log_shows_restored`
    (defined but never called); guard against `_trader_pid()` returning None
    (would otherwise run `kill -9 None`); restored the lost separator in the
    PID-transition print.
    """

    def test_trader_kill_supervisord_restarts(self):
        """Kill trader process — supervisord must restart it and it must connect to HZ."""
        pid_before = _trader_pid()
        assert pid_before is not None
        subprocess.run(["kill", "-9", str(pid_before)], check=True)
        time.sleep(2)

        # Wait for supervisord to restart and new process to connect
        def _new_pid_running():
            r = _supervisord("status dolphin:nautilus_trader")
            return "RUNNING" in r.stdout

        recovery_s = _wait_until(_new_pid_running, 30)
        pid_after = _trader_pid()
        assert pid_after != pid_before, "supervisord must have assigned new PID"
        print(f"\n Trader killed+restarted in {recovery_s:.1f}s (PID {pid_before}{pid_after})")

    def test_trader_restart_capital_restored_from_disk(self):
        """After trader restart, capital must be restored from disk checkpoint."""
        assert CAPITAL_DISK.exists(), "Disk checkpoint required"
        data = json.loads(CAPITAL_DISK.read_text())
        # NOTE(review): expected_capital is only range-checked here, never compared
        # against the restored value below — confirm whether a numeric comparison
        # was intended.
        expected_capital = float(data['capital'])
        assert expected_capital >= 1.0
        pid_before = _trader_pid()
        assert pid_before is not None, "Trader must be running before kill"
        subprocess.run(["kill", "-9", str(pid_before)], check=True)
        # Wait for restart + first scan processed
        _wait_until(lambda: _supervisord("status dolphin:nautilus_trader").stdout.count("RUNNING") > 0, 20)
        time.sleep(5)
        # Inspect only the log segment after the most recent trader startup banner
        log_tail = Path(TRADER_LOG).read_text().split("🐬 DOLPHIN")[-1]
        if "no valid checkpoint" in log_tail:
            pytest.fail("Trader started without capital checkpoint — disk restore failed")
        if "Capital restored" in log_tail:
            # Extract restored value
            for line in log_tail.splitlines():
                if "Capital restored" in line:
                    print(f"\n {line.strip()}")
                    break
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 6: scan_bridge process killed
# ══════════════════════════════════════════════════════════════════════════════
class TestScanBridgeKill:
    """FAILURE MODE 6: scan_bridge process killed — supervisord must restart it.

    Fix vs. original: the PID was parsed by scanning whitespace tokens with
    str.isdigit(), but supervisorctl prints "pid 12345, uptime ..." — the
    trailing comma means no token is ever all-digits, so the test always
    skipped. Parse with the same regex `_trader_pid()` uses.
    """

    def test_scan_bridge_kill_supervisord_restarts(self):
        r = _supervisord("status dolphin:scan_bridge")
        assert "RUNNING" in r.stdout, "scan_bridge must be running"
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse scan_bridge PID")
        pid = int(m.group(1))
        subprocess.run(["kill", "-9", str(pid)], check=True)
        time.sleep(2)

        def _sb_running():
            return "RUNNING" in _supervisord("status dolphin:scan_bridge").stdout

        recovery_s = _wait_until(_sb_running, 20)
        print(f"\n scan_bridge restarted in {recovery_s:.1f}s")
        assert recovery_s <= 20
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 7: Rapid repeated HZ kills (stress resilience)
# ══════════════════════════════════════════════════════════════════════════════
class TestHZRapidKills:
    """FAILURE MODE 7: rapid repeated HZ kills (stress resilience)."""

    def test_hz_three_rapid_kills(self):
        """Kill HZ 3 times — each must recover. Waits for MHS cooldown reset between kills."""
        for attempt in range(1, 4):
            _assert_hz_was_healthy()
            _docker_kill("dolphin-hazelcast")
            recovery_s = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
            print(f"\n Kill #{attempt}: recovered in {recovery_s:.1f}s")
            assert recovery_s <= HZ_RESTART_BUDGET_S
            # Wait for MHS to confirm healthy (resets cooldown) before next kill
            time.sleep(1.5)
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 8: Capital checkpoint integrity under concurrent writes
# ══════════════════════════════════════════════════════════════════════════════
class TestCapitalCheckpointIntegrity:
    """FAILURE MODE 8: capital checkpoint integrity under concurrent writes."""

    def test_disk_checkpoint_always_valid_json(self):
        """Disk checkpoint must be valid JSON with capital >= 1.0 and finite ts."""
        assert CAPITAL_DISK.exists()
        checkpoint = json.loads(CAPITAL_DISK.read_text())
        capital = float(checkpoint['capital'])
        ts = float(checkpoint['ts'])
        assert math.isfinite(capital) and capital >= 1.0
        assert math.isfinite(ts) and ts > 1_700_000_000  # post-2023 epoch

    def test_disk_checkpoint_survives_hz_restart(self):
        """Restart HZ (clears in-memory maps) — disk checkpoint must still be valid."""
        assert CAPITAL_DISK.exists()
        pre = json.loads(CAPITAL_DISK.read_text())
        subprocess.run(
            ["docker", "restart", "dolphin-hazelcast"],
            check=True, capture_output=True,
        )
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        post = json.loads(CAPITAL_DISK.read_text())
        # Disk checkpoint should not have been corrupted
        post_capital = float(post['capital'])
        assert math.isfinite(post_capital)
        assert post_capital >= 1.0
        print(f"\n Capital pre={pre['capital']:.2f} post={post['capital']:.2f}")
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 9: MHS (meta_health) killed — supervisord restarts it
# ══════════════════════════════════════════════════════════════════════════════
class TestMHSKill:
    """FAILURE MODE 9: MHS (meta_health) killed — supervisord restarts it.

    Fix vs. original: both tests parsed the PID by scanning tokens with
    str.isdigit(), which never matches supervisorctl's "pid 12345," token
    (trailing comma) — so both tests always skipped. Parse with the same
    regex `_trader_pid()` uses.
    """

    def test_mhs_kill_supervisord_restarts(self):
        r = _supervisord("status dolphin_data:meta_health")
        assert "RUNNING" in r.stdout
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse meta_health PID")
        pid = int(m.group(1))
        subprocess.run(["kill", "-9", str(pid)], check=True)
        time.sleep(2)

        def _mhs_running():
            return "RUNNING" in _supervisord("status dolphin_data:meta_health").stdout

        recovery_s = _wait_until(_mhs_running, 20)
        print(f"\n MHS restarted in {recovery_s:.1f}s")
        assert recovery_s <= 20

    def test_hz_heals_even_without_mhs(self):
        """Kill MHS then kill HZ — autoheal (Docker layer) must still recover HZ."""
        _assert_hz_was_healthy()
        # Kill MHS so only the Docker-level autoheal path remains
        r = _supervisord("status dolphin_data:meta_health")
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse MHS PID")
        mhs_pid = int(m.group(1))
        subprocess.run(["kill", "-9", str(mhs_pid)], check=True)
        time.sleep(1)
        # Now kill HZ — autoheal must recover it without MHS
        _docker_kill("dolphin-hazelcast")
        time.sleep(1)
        # autoheal polls every 10s, Docker healthcheck interval 10s → worst case ~45s
        recovery_s = _wait_until(_hz_active, 60)
        print(f"\n HZ healed without MHS in {recovery_s:.1f}s (autoheal layer)")
        # Let MHS restart on its own via supervisord
        _wait_until(lambda: "RUNNING" in _supervisord("status dolphin_data:meta_health").stdout, 20)