479 lines
21 KiB
Python
479 lines
21 KiB
Python
|
|
"""
|
||
|
|
DOLPHIN Degradational / Chaos Test Suite
|
||
|
|
=========================================
|
||
|
|
Triggers real failure modes against live Docker containers and supervisord processes,
|
||
|
|
then asserts correct healing/restart within time budgets.
|
||
|
|
|
||
|
|
REQUIRES:
|
||
|
|
- Docker running (dolphin-hazelcast, dolphin-prefect, dolphin-hazelcast-mc)
|
||
|
|
- supervisord running with dolphin group
|
||
|
|
- MHS (meta_health) running
|
||
|
|
- nautilus_trader running
|
||
|
|
|
||
|
|
Run as root (docker commands require it):
|
||
|
|
/home/dolphin/siloqy_env/bin/python3 -m pytest prod/tests/test_degradational.py -v -s --timeout=120
|
||
|
|
"""
|
||
|
|
import json
|
||
|
|
import math
|
||
|
|
import subprocess
|
||
|
|
import time
|
||
|
|
import urllib.request
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
# ── Constants ────────────────────────────────────────────────────────────────
# supervisord config file that declares the dolphin process groups
SUPERVISORD_CONF = "/mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf"
# Health probe endpoints for the three Docker containers
HZ_HEALTH_URL = "http://127.0.0.1:5701/hazelcast/health"
PREFECT_HEALTH_URL = "http://127.0.0.1:4200/api/health"
MC_HEALTH_URL = "http://127.0.0.1:8080/"
# Log file written by the nautilus_trader process (growth = liveness signal)
TRADER_LOG = "/tmp/nautilus_trader.log"
# On-disk capital checkpoint (JSON with 'capital' and 'ts' keys)
CAPITAL_DISK = Path("/tmp/dolphin_capital_checkpoint.json")
# Recovery time budgets (seconds) asserted by the tests below
HZ_RESTART_BUDGET_S = 25 # worst-case: ~19s + 6s buffer
PREFECT_RESTART_BUDGET_S = 40
MC_RESTART_BUDGET_S = 90 # MC is non-critical, slower tolerance
|
||
|
|
|
||
|
|
|
||
|
|
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
def _http_ok(url, timeout=1.0):
|
||
|
|
try:
|
||
|
|
with urllib.request.urlopen(url, timeout=timeout) as r:
|
||
|
|
return r.status == 200
|
||
|
|
except Exception:
|
||
|
|
return False
|
||
|
|
|
||
|
|
|
||
|
|
def _hz_active(timeout=0.5):
    """Return True iff the Hazelcast health endpoint reports nodeState ACTIVE."""
    try:
        with urllib.request.urlopen(HZ_HEALTH_URL, timeout=timeout) as resp:
            payload = json.loads(resp.read())
            return payload.get('nodeState') == 'ACTIVE'
    except Exception:
        # Unreachable / malformed response → treat as down
        return False
|
||
|
|
|
||
|
|
|
||
|
|
def _prefect_ok(timeout=0.5):
    """Return True iff the Prefect health endpoint answers the literal body 'true'."""
    try:
        with urllib.request.urlopen(PREFECT_HEALTH_URL, timeout=timeout) as resp:
            body = resp.read()
        return body.strip() == b'true'
    except Exception:
        # Unreachable / errored → not healthy
        return False
|
||
|
|
|
||
|
|
|
||
|
|
def _wait_until(predicate, budget_s, poll=0.3):
|
||
|
|
t0 = time.time()
|
||
|
|
while time.time() - t0 < budget_s:
|
||
|
|
if predicate():
|
||
|
|
return time.time() - t0
|
||
|
|
time.sleep(poll)
|
||
|
|
raise TimeoutError(f"Not recovered within {budget_s}s")
|
||
|
|
|
||
|
|
|
||
|
|
def _supervisord(cmd):
    """Run `supervisorctl -c <conf> <cmd...>` and return the CompletedProcess.

    *cmd* is a whitespace-separated command string, e.g. "status dolphin:foo".
    Output is captured as text; the caller inspects .stdout.
    """
    argv = ["supervisorctl", "-c", SUPERVISORD_CONF, *cmd.split()]
    return subprocess.run(argv, capture_output=True, text=True)
|
||
|
|
|
||
|
|
|
||
|
|
def _trader_pid():
    """Return nautilus_trader's PID as reported by supervisorctl, or None."""
    import re
    status = _supervisord("status dolphin:nautilus_trader").stdout
    # supervisorctl output: "dolphin:nautilus_trader RUNNING pid 12345, uptime ..."
    match = re.search(r'pid\s+(\d+)', status)
    return int(match.group(1)) if match else None
|
||
|
|
|
||
|
|
|
||
|
|
def _wait_hz_cooldown_clear(max_wait=8):
    """Wait for HZ to be confirmed healthy so MHS resets cooldown.

    Raises TimeoutError (from _wait_until) if HZ is not ACTIVE within
    *max_wait* seconds.
    """
    _wait_until(_hz_active, max_wait)
|
||
|
|
|
||
|
|
|
||
|
|
def _docker_kill(name):
    """SIGKILL container *name* (hard stop, no graceful shutdown). Raises on failure."""
    subprocess.run(["docker", "kill", name], check=True, capture_output=True)
|
||
|
|
|
||
|
|
|
||
|
|
def _docker_stop(name):
    """Gracefully stop container *name*: SIGTERM with a 2s grace period, then SIGKILL."""
    subprocess.run(["docker", "stop", "-t", "2", name], check=True, capture_output=True)
|
||
|
|
|
||
|
|
|
||
|
|
def _docker_running(name):
    """Return True iff Docker reports container *name* as currently running.

    A missing container (inspect error) yields empty stdout → False.
    """
    proc = subprocess.run(
        ["docker", "inspect", "--format", "{{.State.Running}}", name],
        capture_output=True, text=True,
    )
    return proc.stdout.strip() == "true"
|
||
|
|
|
||
|
|
|
||
|
|
def _assert_hz_was_healthy():
    """Precondition guard: fail fast (not skip) if HZ isn't ACTIVE before a chaos test."""
    assert _hz_active(timeout=2.0), "Precondition: HZ must be healthy before test"
|
||
|
|
|
||
|
|
|
||
|
|
def _assert_prefect_was_healthy():
    """Precondition guard: fail fast (not skip) if Prefect isn't healthy before a chaos test."""
    assert _prefect_ok(timeout=2.0), "Precondition: Prefect must be healthy before test"
|
||
|
|
|
||
|
|
|
||
|
|
# ── Fixtures ──────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
@pytest.fixture(autouse=True)
def ensure_baseline_healthy():
    """Wait for all services healthy + trader running before each test."""

    def _baseline_ok():
        return _hz_active() and _prefect_ok() and _trader_pid() is not None

    # Pre-test: give the stack up to 90s to be fully healthy, else skip.
    start = time.time()
    while not _baseline_ok():
        if time.time() - start >= 90:
            pytest.skip("Baseline services not healthy — skipping chaos test")
        time.sleep(1)

    yield

    # Post-test: wait for any killed containers to fully recover before next test
    start = time.time()
    while time.time() - start < 90:
        if _baseline_ok():
            # Extra 2s for MHS cooldown reset (it resets on healthy probe, ~0.5s after recovery)
            time.sleep(2)
            break
        time.sleep(1)
|
||
|
|
|
||
|
|
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
# FAILURE MODE 1: Hazelcast container killed (SIGKILL)
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
|
||
|
|
class TestHZContainerKill:
    """Failure mode 1: Hazelcast container SIGKILLed (loses in-memory state)."""

    def test_hz_kill_mhs_heals_within_budget(self):
        """SIGKILL HZ → MHS HTTP probe detects in ~1s → docker restart → HZ healthy."""
        _assert_hz_was_healthy()

        _docker_kill("dolphin-hazelcast")

        # Immediately confirm it's dead
        time.sleep(0.5)
        assert not _hz_active(timeout=0.3), "HZ should be down after kill"

        # Wait for recovery
        recovery_s = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        print(f"\n HZ kill→recovered in {recovery_s:.1f}s (budget {HZ_RESTART_BUDGET_S}s)")
        assert recovery_s <= HZ_RESTART_BUDGET_S

    def test_hz_kill_trader_reconnects(self):
        """After HZ kill+recovery, nautilus_trader must be processing scans again within 45s."""
        _assert_hz_was_healthy()
        # Baseline log size; growth beyond this proves the trader is logging again
        pre_log_size = Path(TRADER_LOG).stat().st_size

        _docker_kill("dolphin-hazelcast")
        time.sleep(1)

        # Wait for HZ recovery
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)

        # Then wait for trader to log a new LATENCY line (>100 bytes of growth)
        def _new_latency_line():
            try:
                return Path(TRADER_LOG).stat().st_size > pre_log_size + 100
            except Exception:
                return False

        reconnect_s = _wait_until(_new_latency_line, 45)
        print(f"\n Trader reconnected and logging within {reconnect_s:.1f}s of kill")
        assert reconnect_s <= 45

    def test_hz_kill_capital_survives_on_disk(self):
        """Kill HZ (loses in-memory maps) → disk checkpoint must still have valid capital."""
        _assert_hz_was_healthy()

        # Ensure there is a disk checkpoint (trader must have written one)
        assert CAPITAL_DISK.exists(), "Disk checkpoint must exist before kill"
        data = json.loads(CAPITAL_DISK.read_text())
        pre_capital = float(data['capital'])
        assert pre_capital >= 1.0, f"Pre-kill capital invalid: {pre_capital}"

        _docker_kill("dolphin-hazelcast")
        time.sleep(1)

        # Disk checkpoint must be unchanged (not corrupted by kill)
        data2 = json.loads(CAPITAL_DISK.read_text())
        post_capital = float(data2['capital'])
        assert math.isfinite(post_capital) and post_capital >= 1.0
        # Within 1% of pre-kill (may have advanced slightly from a scan just before kill)
        assert abs(post_capital - pre_capital) / pre_capital < 0.01, \
            f"Capital changed unexpectedly: {pre_capital} → {post_capital}"

        # Wait for recovery so the next test starts from a healthy baseline
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
|
||
|
|
|
||
|
|
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
# FAILURE MODE 2: Hazelcast container graceful stop
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
|
||
|
|
class TestHZContainerStop:
    """Failure mode 2: graceful `docker stop` of the Hazelcast container."""

    def test_hz_stop_recovers_within_budget(self):
        """Graceful stop (SIGTERM) — same recovery path as kill."""
        _assert_hz_was_healthy()

        _docker_stop("dolphin-hazelcast")

        # Confirm it actually went down before measuring recovery
        time.sleep(0.5)
        assert not _hz_active(timeout=0.3)

        recovery_s = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        print(f"\n HZ stop→recovered in {recovery_s:.1f}s")
        assert recovery_s <= HZ_RESTART_BUDGET_S
|
||
|
|
|
||
|
|
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
# FAILURE MODE 3: Prefect container killed
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
|
||
|
|
class TestPrefectContainerKill:
    """Failure mode 3: SIGKILL of the Prefect container."""

    def test_prefect_kill_recovers_within_budget(self):
        """SIGKILL Prefect → MHS probe detects → docker restart → Prefect healthy."""
        _assert_prefect_was_healthy()

        _docker_kill("dolphin-prefect")

        # Confirm the kill landed before timing the recovery
        time.sleep(0.5)
        assert not _prefect_ok(timeout=0.3), "Prefect should be down"

        recovery_s = _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)
        print(f"\n Prefect kill→recovered in {recovery_s:.1f}s (budget {PREFECT_RESTART_BUDGET_S}s)")
        assert recovery_s <= PREFECT_RESTART_BUDGET_S

    def test_prefect_kill_hz_unaffected(self):
        """Killing Prefect must not affect HZ or the trader."""
        _assert_hz_was_healthy()
        _assert_prefect_was_healthy()

        _docker_kill("dolphin-prefect")
        time.sleep(2)

        # HZ must still be healthy
        assert _hz_active(timeout=1.0), "HZ must be unaffected by Prefect kill"

        # Trader must still be running
        trader_pid = _trader_pid()
        assert trader_pid is not None and trader_pid > 0, "Trader must still be running"

        # Wait for Prefect to recover
        _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)
|
||
|
|
|
||
|
|
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
# FAILURE MODE 4: Simultaneous HZ + Prefect kill
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
|
||
|
|
class TestSimultaneousKill:
    """Failure mode 4: HZ and Prefect killed at the same instant."""

    def test_hz_and_prefect_simultaneous_kill(self):
        """Both killed simultaneously — both must recover independently."""
        _assert_hz_was_healthy()
        _assert_prefect_was_healthy()

        _docker_kill("dolphin-hazelcast")
        _docker_kill("dolphin-prefect")

        time.sleep(0.5)
        assert not _hz_active(timeout=0.3)
        assert not _prefect_ok(timeout=0.3)

        # Both must recover — HZ first (faster restart), then Prefect.
        # Sequential waits are fine: Prefect restarts in the background while
        # we watch HZ, so its budget is not consumed twice.
        hz_recovery = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        prefect_recovery = _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)

        print(f"\n Simultaneous kill: HZ recovered in {hz_recovery:.1f}s, "
              f"Prefect in {prefect_recovery:.1f}s")
        assert hz_recovery <= HZ_RESTART_BUDGET_S
        assert prefect_recovery <= PREFECT_RESTART_BUDGET_S
|
||
|
|
|
||
|
|
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
# FAILURE MODE 5: nautilus_trader process killed (supervisord restarts)
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
|
||
|
|
class TestTraderProcessKill:
    """Failure mode 5: nautilus_trader process SIGKILLed — supervisord restarts it."""

    def test_trader_kill_supervisord_restarts(self):
        """Kill trader process — supervisord must restart it and it must connect to HZ."""
        pid_before = _trader_pid()
        assert pid_before is not None

        subprocess.run(["kill", "-9", str(pid_before)], check=True)
        time.sleep(2)

        # Wait for supervisord to restart and new process to connect
        def _new_pid_running():
            r = _supervisord("status dolphin:nautilus_trader")
            return "RUNNING" in r.stdout

        recovery_s = _wait_until(_new_pid_running, 30)
        pid_after = _trader_pid()
        assert pid_after != pid_before, "supervisord must have assigned new PID"
        print(f"\n Trader killed+restarted in {recovery_s:.1f}s (PID {pid_before}→{pid_after})")

    def test_trader_restart_capital_restored_from_disk(self):
        """After trader restart, capital must be restored from disk checkpoint."""
        assert CAPITAL_DISK.exists(), "Disk checkpoint required"
        data = json.loads(CAPITAL_DISK.read_text())
        expected_capital = float(data['capital'])
        assert expected_capital >= 1.0

        pid_before = _trader_pid()
        subprocess.run(["kill", "-9", str(pid_before)], check=True)

        # Wait for supervisord to restart the trader, then give the new process
        # a few seconds to log its startup banner and restore line.
        _wait_until(lambda: _supervisord("status dolphin:nautilus_trader").stdout.count("RUNNING") > 0, 20)
        time.sleep(5)

        # Only the log segment after the most recent startup banner belongs to
        # the new process instance.
        log_tail = Path(TRADER_LOG).read_text().split("🐬 DOLPHIN")[-1]
        if "no valid checkpoint" in log_tail:
            pytest.fail("Trader started without capital checkpoint — disk restore failed")
        if "Capital restored" in log_tail:
            # Extract restored value
            for line in log_tail.splitlines():
                if "Capital restored" in line:
                    print(f"\n {line.strip()}")
                    break
|
||
|
|
|
||
|
|
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
# FAILURE MODE 6: scan_bridge process killed
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
|
||
|
|
class TestScanBridgeKill:
    """Failure mode 6: scan_bridge process SIGKILLed — supervisord restarts it."""

    def test_scan_bridge_kill_supervisord_restarts(self):
        """Kill scan_bridge — supervisord must restart it within 20s."""
        r = _supervisord("status dolphin:scan_bridge")
        assert "RUNNING" in r.stdout, "scan_bridge must be running"

        # supervisorctl prints "... RUNNING pid 12345, uptime ...". The pid token
        # carries a trailing comma, so scanning stdout.split() with isdigit()
        # never matched — parse with a regex instead (same pattern as _trader_pid).
        import re
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse scan_bridge PID")
        pid = int(m.group(1))

        subprocess.run(["kill", "-9", str(pid)], check=True)
        time.sleep(2)

        def _sb_running():
            return "RUNNING" in _supervisord("status dolphin:scan_bridge").stdout

        recovery_s = _wait_until(_sb_running, 20)
        print(f"\n scan_bridge restarted in {recovery_s:.1f}s")
        assert recovery_s <= 20
|
||
|
|
|
||
|
|
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
# FAILURE MODE 7: Rapid repeated HZ kills (stress resilience)
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
|
||
|
|
class TestHZRapidKills:
    """Failure mode 7: repeated SIGKILLs in quick succession (stress resilience)."""

    def test_hz_three_rapid_kills(self):
        """Kill HZ 3 times — each must recover. Waits for MHS cooldown reset between kills."""
        for attempt in range(1, 4):
            _assert_hz_was_healthy()
            _docker_kill("dolphin-hazelcast")
            elapsed = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
            print(f"\n Kill #{attempt}: recovered in {elapsed:.1f}s")
            assert elapsed <= HZ_RESTART_BUDGET_S
            # Wait for MHS to confirm healthy (resets cooldown) before next kill
            time.sleep(1.5)
|
||
|
|
|
||
|
|
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
# FAILURE MODE 8: Capital checkpoint integrity under concurrent writes
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
|
||
|
|
class TestCapitalCheckpointIntegrity:
    """Failure mode 8: the on-disk capital checkpoint must remain valid and sane."""

    def test_disk_checkpoint_always_valid_json(self):
        """Disk checkpoint must be valid JSON with capital >= 1.0 and finite ts."""
        assert CAPITAL_DISK.exists()
        checkpoint = json.loads(CAPITAL_DISK.read_text())
        capital = float(checkpoint['capital'])
        ts = float(checkpoint['ts'])
        assert math.isfinite(capital) and capital >= 1.0
        assert math.isfinite(ts) and ts > 1_700_000_000  # post-2023 epoch

    def test_disk_checkpoint_survives_hz_restart(self):
        """Restart HZ (clears in-memory maps) — disk checkpoint must still be valid."""
        assert CAPITAL_DISK.exists()
        pre = json.loads(CAPITAL_DISK.read_text())

        subprocess.run(["docker", "restart", "dolphin-hazelcast"],
                       check=True, capture_output=True)
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)

        # Disk checkpoint should not have been corrupted
        post = json.loads(CAPITAL_DISK.read_text())
        post_capital = float(post['capital'])
        assert math.isfinite(post_capital)
        assert post_capital >= 1.0
        print(f"\n Capital pre={pre['capital']:.2f} post={post['capital']:.2f}")
|
||
|
|
|
||
|
|
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
# FAILURE MODE 9: MHS (meta_health) killed — supervisord restarts it
|
||
|
|
# ══════════════════════════════════════════════════════════════════════════════
|
||
|
|
|
||
|
|
class TestMHSKill:
    """Failure mode 9: MHS (meta_health) killed — supervisord restarts it, and
    the Docker autoheal layer must still recover HZ while MHS is down."""

    def test_mhs_kill_supervisord_restarts(self):
        """Kill MHS — supervisord must restart it within 20s."""
        r = _supervisord("status dolphin_data:meta_health")
        assert "RUNNING" in r.stdout

        # supervisorctl prints "... RUNNING pid 12345, uptime ..." — the pid
        # token carries a trailing comma, so the old isdigit() scan over
        # stdout.split() never matched. Use a regex (same as _trader_pid).
        import re
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse meta_health PID")
        pid = int(m.group(1))

        subprocess.run(["kill", "-9", str(pid)], check=True)
        time.sleep(2)

        def _mhs_running():
            return "RUNNING" in _supervisord("status dolphin_data:meta_health").stdout

        recovery_s = _wait_until(_mhs_running, 20)
        print(f"\n MHS restarted in {recovery_s:.1f}s")
        assert recovery_s <= 20

    def test_hz_heals_even_without_mhs(self):
        """Kill MHS then kill HZ — autoheal (Docker layer) must still recover HZ."""
        _assert_hz_was_healthy()

        # Kill MHS. Parse its PID with a regex — "pid 12345," defeats the
        # naive isdigit() token scan because of the trailing comma.
        import re
        r = _supervisord("status dolphin_data:meta_health")
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse MHS PID")
        mhs_pid = int(m.group(1))
        subprocess.run(["kill", "-9", str(mhs_pid)], check=True)
        time.sleep(1)

        # Now kill HZ — autoheal must recover it without MHS
        _docker_kill("dolphin-hazelcast")
        time.sleep(1)

        # autoheal polls every 10s, Docker healthcheck interval 10s → worst case ~45s
        recovery_s = _wait_until(_hz_active, 60)
        print(f"\n HZ healed without MHS in {recovery_s:.1f}s (autoheal layer)")

        # Let MHS restart on its own via supervisord
        _wait_until(lambda: "RUNNING" in _supervisord("status dolphin_data:meta_health").stdout, 20)
|