""" DOLPHIN Degradational / Chaos Test Suite ========================================= Triggers real failure modes against live Docker containers and supervisord processes, then asserts correct healing/restart within time budgets. REQUIRES: - Docker running (dolphin-hazelcast, dolphin-prefect, dolphin-hazelcast-mc) - supervisord running with dolphin group - MHS (meta_health) running - nautilus_trader running Run as root (docker commands require it): /home/dolphin/siloqy_env/bin/python3 -m pytest prod/tests/test_degradational.py -v -s --timeout=120 """ import json import math import subprocess import time import urllib.request from pathlib import Path import pytest # ── Constants ──────────────────────────────────────────────────────────────── SUPERVISORD_CONF = "/mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf" HZ_HEALTH_URL = "http://127.0.0.1:5701/hazelcast/health" PREFECT_HEALTH_URL = "http://127.0.0.1:4200/api/health" MC_HEALTH_URL = "http://127.0.0.1:8080/" TRADER_LOG = "/tmp/nautilus_trader.log" CAPITAL_DISK = Path("/tmp/dolphin_capital_checkpoint.json") HZ_RESTART_BUDGET_S = 25 # worst-case: ~19s + 6s buffer PREFECT_RESTART_BUDGET_S = 40 MC_RESTART_BUDGET_S = 90 # MC is non-critical, slower tolerance # ── Helpers ─────────────────────────────────────────────────────────────────── def _http_ok(url, timeout=1.0): try: with urllib.request.urlopen(url, timeout=timeout) as r: return r.status == 200 except Exception: return False def _hz_active(timeout=0.5): try: with urllib.request.urlopen(HZ_HEALTH_URL, timeout=timeout) as r: return json.loads(r.read()).get('nodeState') == 'ACTIVE' except Exception: return False def _prefect_ok(timeout=0.5): try: with urllib.request.urlopen(PREFECT_HEALTH_URL, timeout=timeout) as r: return r.read().strip() == b'true' except Exception: return False def _wait_until(predicate, budget_s, poll=0.3): t0 = time.time() while time.time() - t0 < budget_s: if predicate(): return time.time() - t0 time.sleep(poll) raise 
TimeoutError(f"Not recovered within {budget_s}s") def _supervisord(cmd): return subprocess.run( ["supervisorctl", "-c", SUPERVISORD_CONF] + cmd.split(), capture_output=True, text=True ) def _trader_pid(): r = _supervisord("status dolphin:nautilus_trader") # supervisorctl output: "dolphin:nautilus_trader RUNNING pid 12345, uptime ..." import re m = re.search(r'pid\s+(\d+)', r.stdout) if m: return int(m.group(1)) return None def _wait_hz_cooldown_clear(max_wait=8): """Wait for HZ to be confirmed healthy so MHS resets cooldown.""" _wait_until(_hz_active, max_wait) def _docker_kill(name): subprocess.run(["docker", "kill", name], check=True, capture_output=True) def _docker_stop(name): subprocess.run(["docker", "stop", "-t", "2", name], check=True, capture_output=True) def _docker_running(name): r = subprocess.run(["docker", "inspect", "--format", "{{.State.Running}}", name], capture_output=True, text=True) return r.stdout.strip() == "true" def _assert_hz_was_healthy(): assert _hz_active(timeout=2.0), "Precondition: HZ must be healthy before test" def _assert_prefect_was_healthy(): assert _prefect_ok(timeout=2.0), "Precondition: Prefect must be healthy before test" # ── Fixtures ────────────────────────────────────────────────────────────────── @pytest.fixture(autouse=True) def ensure_baseline_healthy(): """Wait for all services healthy + trader running before each test.""" deadline = time.time() + 90 while time.time() < deadline: trader_ok = _trader_pid() is not None if _hz_active() and _prefect_ok() and trader_ok: break time.sleep(1) else: pytest.skip("Baseline services not healthy — skipping chaos test") yield # Post-test: wait for any killed containers to fully recover before next test deadline2 = time.time() + 90 while time.time() < deadline2: if _hz_active() and _prefect_ok() and _trader_pid() is not None: # Extra 2s for MHS cooldown reset (it resets on healthy probe, ~0.5s after recovery) time.sleep(2) break time.sleep(1) # 
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 1: Hazelcast container killed (SIGKILL)
# ══════════════════════════════════════════════════════════════════════════════
class TestHZContainerKill:

    def test_hz_kill_mhs_heals_within_budget(self):
        """SIGKILL HZ → MHS HTTP probe detects in ~1s → docker restart → HZ healthy."""
        _assert_hz_was_healthy()
        _docker_kill("dolphin-hazelcast")
        # Immediately confirm it's dead
        time.sleep(0.5)
        assert not _hz_active(timeout=0.3), "HZ should be down after kill"
        # Wait for recovery
        recovery_s = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        print(f"\n HZ kill→recovered in {recovery_s:.1f}s (budget {HZ_RESTART_BUDGET_S}s)")
        assert recovery_s <= HZ_RESTART_BUDGET_S

    def test_hz_kill_trader_reconnects(self):
        """After HZ kill+recovery, nautilus_trader must be processing scans again within 45s."""
        _assert_hz_was_healthy()
        pre_log_size = Path(TRADER_LOG).stat().st_size
        _docker_kill("dolphin-hazelcast")
        time.sleep(1)
        # Wait for HZ recovery
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)

        # Then wait for trader to log a new LATENCY line (>100 bytes of new log
        # output is used as the "trader is alive and scanning again" signal).
        def _new_latency_line():
            try:
                return Path(TRADER_LOG).stat().st_size > pre_log_size + 100
            except Exception:
                return False

        reconnect_s = _wait_until(_new_latency_line, 45)
        print(f"\n Trader reconnected and logging within {reconnect_s:.1f}s of kill")
        assert reconnect_s <= 45

    def test_hz_kill_capital_survives_on_disk(self):
        """Kill HZ (loses in-memory maps) → disk checkpoint must still have valid capital."""
        _assert_hz_was_healthy()
        # Ensure there is a disk checkpoint (trader must have written one)
        assert CAPITAL_DISK.exists(), "Disk checkpoint must exist before kill"
        data = json.loads(CAPITAL_DISK.read_text())
        pre_capital = float(data['capital'])
        assert pre_capital >= 1.0, f"Pre-kill capital invalid: {pre_capital}"

        _docker_kill("dolphin-hazelcast")
        time.sleep(1)

        # Disk checkpoint must be unchanged (not corrupted by kill)
        data2 = json.loads(CAPITAL_DISK.read_text())
        post_capital = float(data2['capital'])
        assert math.isfinite(post_capital) and post_capital >= 1.0
        # Within 1% of pre-kill (may have advanced slightly from a scan just before kill)
        assert abs(post_capital - pre_capital) / pre_capital < 0.01, \
            f"Capital changed unexpectedly: {pre_capital} → {post_capital}"

        # Wait for recovery
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)


# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 2: Hazelcast container graceful stop
# ══════════════════════════════════════════════════════════════════════════════
class TestHZContainerStop:

    def test_hz_stop_recovers_within_budget(self):
        """Graceful stop (SIGTERM) — same recovery path as kill."""
        _assert_hz_was_healthy()
        _docker_stop("dolphin-hazelcast")
        time.sleep(0.5)
        assert not _hz_active(timeout=0.3)
        recovery_s = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        print(f"\n HZ stop→recovered in {recovery_s:.1f}s")
        assert recovery_s <= HZ_RESTART_BUDGET_S


# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 3: Prefect container killed
# ══════════════════════════════════════════════════════════════════════════════
class TestPrefectContainerKill:

    def test_prefect_kill_recovers_within_budget(self):
        """SIGKILL Prefect → MHS probe detects → docker restart → Prefect healthy."""
        _assert_prefect_was_healthy()
        _docker_kill("dolphin-prefect")
        time.sleep(0.5)
        assert not _prefect_ok(timeout=0.3), "Prefect should be down"
        recovery_s = _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)
        print(f"\n Prefect kill→recovered in {recovery_s:.1f}s (budget {PREFECT_RESTART_BUDGET_S}s)")
        assert recovery_s <= PREFECT_RESTART_BUDGET_S

    def test_prefect_kill_hz_unaffected(self):
        """Killing Prefect must not affect HZ or the trader."""
        _assert_hz_was_healthy()
        _assert_prefect_was_healthy()
        _docker_kill("dolphin-prefect")
        time.sleep(2)
        # HZ must still be healthy
        assert _hz_active(timeout=1.0), "HZ must be unaffected by Prefect kill"
        # Trader must still be running
        pid = _trader_pid()
        assert pid is not None and pid > 0, "Trader must still be running"
        # Wait for Prefect to recover
        _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)


# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 4: Simultaneous HZ + Prefect kill
# ══════════════════════════════════════════════════════════════════════════════
class TestSimultaneousKill:

    def test_hz_and_prefect_simultaneous_kill(self):
        """Both killed simultaneously — both must recover independently."""
        _assert_hz_was_healthy()
        _assert_prefect_was_healthy()

        _docker_kill("dolphin-hazelcast")
        _docker_kill("dolphin-prefect")
        time.sleep(0.5)
        assert not _hz_active(timeout=0.3)
        assert not _prefect_ok(timeout=0.3)

        # Both must recover — HZ first (faster restart), then Prefect.
        # NOTE: prefect_recovery is measured from the moment HZ recovered,
        # so total wall time is hz_recovery + prefect_recovery.
        hz_recovery = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        prefect_recovery = _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)
        print(f"\n Simultaneous kill: HZ recovered in {hz_recovery:.1f}s, "
              f"Prefect in {prefect_recovery:.1f}s")
        assert hz_recovery <= HZ_RESTART_BUDGET_S
        assert prefect_recovery <= PREFECT_RESTART_BUDGET_S


# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 5: nautilus_trader process killed (supervisord restarts)
# ══════════════════════════════════════════════════════════════════════════════
class TestTraderProcessKill:

    def test_trader_kill_supervisord_restarts(self):
        """Kill trader process — supervisord must restart it and it must connect to HZ."""
        pid_before = _trader_pid()
        assert pid_before is not None
        subprocess.run(["kill", "-9", str(pid_before)], check=True)
        time.sleep(2)

        # Wait for supervisord to restart and new process to connect
        def _new_pid_running():
            r = _supervisord("status dolphin:nautilus_trader")
            return "RUNNING" in r.stdout

        recovery_s = _wait_until(_new_pid_running, 30)
        pid_after = _trader_pid()
        assert pid_after != pid_before, "supervisord must have assigned new PID"
        print(f"\n Trader killed+restarted in {recovery_s:.1f}s (PID {pid_before}→{pid_after})")

    def test_trader_restart_capital_restored_from_disk(self):
        """After trader restart, capital must be restored from disk checkpoint."""
        assert CAPITAL_DISK.exists(), "Disk checkpoint required"
        data = json.loads(CAPITAL_DISK.read_text())
        expected_capital = float(data['capital'])
        assert expected_capital >= 1.0

        pid_before = _trader_pid()
        subprocess.run(["kill", "-9", str(pid_before)], check=True)

        # Wait for restart, then give the restarted trader a few seconds to
        # log its startup banner + restore message before inspecting the log.
        _wait_until(lambda: _supervisord("status dolphin:nautilus_trader").stdout.count("RUNNING") > 0, 20)
        time.sleep(5)
        # Only inspect the log segment after the most recent startup banner.
        log_tail = Path(TRADER_LOG).read_text().split("🐬 DOLPHIN")[-1]
        if "no valid checkpoint" in log_tail:
            pytest.fail("Trader started without capital checkpoint — disk restore failed")
        if "Capital restored" in log_tail:
            # Extract restored value
            for line in log_tail.splitlines():
                if "Capital restored" in line:
                    print(f"\n {line.strip()}")
                    break


# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 6: scan_bridge process killed
# ══════════════════════════════════════════════════════════════════════════════
class TestScanBridgeKill:

    def test_scan_bridge_kill_supervisord_restarts(self):
        """Kill scan_bridge — supervisord must restart it within 20s."""
        import re
        r = _supervisord("status dolphin:scan_bridge")
        assert "RUNNING" in r.stdout, "scan_bridge must be running"
        # BUGFIX: supervisorctl prints "pid 12345," with a trailing comma, so a
        # token-wise isdigit() scan never matched and this test always skipped.
        # Parse with the same regex _trader_pid() uses.
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse scan_bridge PID")
        pid = int(m.group(1))
        subprocess.run(["kill", "-9", str(pid)], check=True)
        time.sleep(2)

        def _sb_running():
            return "RUNNING" in _supervisord("status dolphin:scan_bridge").stdout

        recovery_s = _wait_until(_sb_running, 20)
        print(f"\n scan_bridge restarted in {recovery_s:.1f}s")
        assert recovery_s <= 20
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 7: Rapid repeated HZ kills (stress resilience)
# ══════════════════════════════════════════════════════════════════════════════
class TestHZRapidKills:

    def test_hz_three_rapid_kills(self):
        """Kill HZ 3 times — each must recover. Waits for MHS cooldown reset between kills."""
        i = 0
        while i < 3:
            # Every round starts from a verified-healthy cluster.
            _assert_hz_was_healthy()
            _docker_kill("dolphin-hazelcast")
            recovery_s = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
            print(f"\n Kill #{i+1}: recovered in {recovery_s:.1f}s")
            assert recovery_s <= HZ_RESTART_BUDGET_S
            # Give MHS time to observe a healthy probe (which resets its
            # cooldown) before the next kill.
            time.sleep(1.5)
            i += 1


# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 8: Capital checkpoint integrity under concurrent writes
# ══════════════════════════════════════════════════════════════════════════════
class TestCapitalCheckpointIntegrity:

    def test_disk_checkpoint_always_valid_json(self):
        """Disk checkpoint must be valid JSON with capital >= 1.0 and finite ts."""
        assert CAPITAL_DISK.exists()
        checkpoint = json.loads(CAPITAL_DISK.read_text())
        capital = float(checkpoint['capital'])
        ts = float(checkpoint['ts'])
        assert math.isfinite(capital)
        assert capital >= 1.0
        assert math.isfinite(ts)
        assert ts > 1_700_000_000  # post-2023 epoch

    def test_disk_checkpoint_survives_hz_restart(self):
        """Restart HZ (clears in-memory maps) — disk checkpoint must still be valid."""
        assert CAPITAL_DISK.exists()
        pre = json.loads(CAPITAL_DISK.read_text())
        subprocess.run(["docker", "restart", "dolphin-hazelcast"],
                       check=True, capture_output=True)
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        post = json.loads(CAPITAL_DISK.read_text())
        # The restart must not have corrupted the on-disk checkpoint.
        assert math.isfinite(float(post['capital']))
        assert float(post['capital']) >= 1.0
        print(f"\n Capital pre={pre['capital']:.2f} post={post['capital']:.2f}")


# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 9: MHS (meta_health) killed — supervisord restarts it
# ══════════════════════════════════════════════════════════════════════════════
class TestMHSKill:

    def test_mhs_kill_supervisord_restarts(self):
        """Kill MHS — supervisord must restart it within 20s."""
        import re
        r = _supervisord("status dolphin_data:meta_health")
        assert "RUNNING" in r.stdout
        # BUGFIX: supervisorctl prints "pid 12345," with a trailing comma, so a
        # token-wise isdigit() scan never matched and this test always skipped.
        # Parse with the same regex _trader_pid() uses.
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse meta_health PID")
        pid = int(m.group(1))
        subprocess.run(["kill", "-9", str(pid)], check=True)
        time.sleep(2)

        def _mhs_running():
            return "RUNNING" in _supervisord("status dolphin_data:meta_health").stdout

        recovery_s = _wait_until(_mhs_running, 20)
        print(f"\n MHS restarted in {recovery_s:.1f}s")
        assert recovery_s <= 20

    def test_hz_heals_even_without_mhs(self):
        """Kill MHS then kill HZ — autoheal (Docker layer) must still recover HZ."""
        import re
        _assert_hz_was_healthy()
        # Kill MHS (comma-safe PID parse — supervisorctl output is "pid 12345, uptime ...")
        r = _supervisord("status dolphin_data:meta_health")
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse MHS PID")
        mhs_pid = int(m.group(1))
        subprocess.run(["kill", "-9", str(mhs_pid)], check=True)
        time.sleep(1)

        # Now kill HZ — autoheal must recover it without MHS
        _docker_kill("dolphin-hazelcast")
        time.sleep(1)
        # autoheal polls every 10s, Docker healthcheck interval 10s → worst case ~45s
        recovery_s = _wait_until(_hz_active, 60)
        print(f"\n HZ healed without MHS in {recovery_s:.1f}s (autoheal layer)")

        # Let MHS restart on its own via supervisord
        _wait_until(lambda: "RUNNING" in _supervisord("status dolphin_data:meta_health").stdout, 20)