""" DOLPHIN Degradational / Chaos Test Suite ========================================= Triggers real failure modes against live Docker containers and supervisord processes, then asserts correct healing/restart within time budgets. REQUIRES: - Docker running (dolphin-hazelcast, dolphin-prefect, dolphin-hazelcast-mc) - supervisord running with dolphin group - MHS (meta_health) running - nautilus_trader running Run as root (docker commands require it): /home/dolphin/siloqy_env/bin/python3 -m pytest prod/tests/test_degradational.py -v -s --timeout=120 """ import json import math import subprocess import time import urllib.request from pathlib import Path import pytest # ── Constants ──────────────────────────────────────────────────────────────── SUPERVISORD_CONF = "/mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf" HZ_HEALTH_URL = "http://127.0.0.1:5701/hazelcast/health" PREFECT_HEALTH_URL = "http://127.0.0.1:4200/api/health" MC_HEALTH_URL = "http://127.0.0.1:8080/" TRADER_LOG = "/tmp/nautilus_trader.log" CAPITAL_DISK = Path("/tmp/dolphin_capital_checkpoint.json") HZ_RESTART_BUDGET_S = 25 # worst-case: ~19s + 6s buffer PREFECT_RESTART_BUDGET_S = 40 MC_RESTART_BUDGET_S = 90 # MC is non-critical, slower tolerance # ── Helpers ─────────────────────────────────────────────────────────────────── def _http_ok(url, timeout=1.0): try: with urllib.request.urlopen(url, timeout=timeout) as r: return r.status == 200 except Exception: return False def _hz_active(timeout=0.5): try: with urllib.request.urlopen(HZ_HEALTH_URL, timeout=timeout) as r: return json.loads(r.read()).get('nodeState') == 'ACTIVE' except Exception: return False def _prefect_ok(timeout=0.5): try: with urllib.request.urlopen(PREFECT_HEALTH_URL, timeout=timeout) as r: return r.read().strip() == b'true' except Exception: return False def _wait_until(predicate, budget_s, poll=0.3): t0 = time.time() while time.time() - t0 < budget_s: if predicate(): return time.time() - t0 time.sleep(poll) raise 
TimeoutError(f"Not recovered within {budget_s}s") def _supervisord(cmd): return subprocess.run( ["supervisorctl", "-c", SUPERVISORD_CONF] + cmd.split(), capture_output=True, text=True ) def _trader_pid(): r = _supervisord("status dolphin:nautilus_trader") # supervisorctl output: "dolphin:nautilus_trader RUNNING pid 12345, uptime ..." import re m = re.search(r'pid\s+(\d+)', r.stdout) if m: return int(m.group(1)) return None def _wait_hz_cooldown_clear(max_wait=8): """Wait for HZ to be confirmed healthy so MHS resets cooldown.""" _wait_until(_hz_active, max_wait) def _docker_kill(name): subprocess.run(["docker", "kill", name], check=True, capture_output=True) def _docker_stop(name): subprocess.run(["docker", "stop", "-t", "2", name], check=True, capture_output=True) def _docker_running(name): r = subprocess.run(["docker", "inspect", "--format", "{{.State.Running}}", name], capture_output=True, text=True) return r.stdout.strip() == "true" def _assert_hz_was_healthy(): assert _hz_active(timeout=2.0), "Precondition: HZ must be healthy before test" def _assert_prefect_was_healthy(): assert _prefect_ok(timeout=2.0), "Precondition: Prefect must be healthy before test" # ── Fixtures ────────────────────────────────────────────────────────────────── @pytest.fixture(autouse=True) def ensure_baseline_healthy(): """Wait for all services healthy + trader running before each test.""" deadline = time.time() + 90 while time.time() < deadline: trader_ok = _trader_pid() is not None if _hz_active() and _prefect_ok() and trader_ok: break time.sleep(1) else: pytest.skip("Baseline services not healthy — skipping chaos test") yield # Post-test: wait for any killed containers to fully recover before next test deadline2 = time.time() + 90 while time.time() < deadline2: if _hz_active() and _prefect_ok() and _trader_pid() is not None: # Extra 2s for MHS cooldown reset (it resets on healthy probe, ~0.5s after recovery) time.sleep(2) break time.sleep(1) # 
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 1: Hazelcast container killed (SIGKILL)
# ══════════════════════════════════════════════════════════════════════════════
class TestHZContainerKill:

    def test_hz_kill_mhs_heals_within_budget(self):
        """SIGKILL HZ → MHS HTTP probe detects in ~1s → docker restart → HZ healthy."""
        _assert_hz_was_healthy()
        _docker_kill("dolphin-hazelcast")
        # Immediately confirm it's dead
        time.sleep(0.5)
        assert not _hz_active(timeout=0.3), "HZ should be down after kill"
        # Wait for recovery
        recovery_s = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        print(f"\n HZ kill→recovered in {recovery_s:.1f}s (budget {HZ_RESTART_BUDGET_S}s)")
        assert recovery_s <= HZ_RESTART_BUDGET_S

    def test_hz_kill_trader_reconnects(self):
        """After HZ kill+recovery, nautilus_trader must be processing scans again within 45s."""
        _assert_hz_was_healthy()
        pre_log_size = Path(TRADER_LOG).stat().st_size
        _docker_kill("dolphin-hazelcast")
        time.sleep(1)
        # Wait for HZ recovery
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)

        # Then wait for trader to log a new LATENCY line (>100 bytes of new log
        # output is used as the "trader is alive and scanning again" signal).
        def _new_latency_line():
            try:
                return Path(TRADER_LOG).stat().st_size > pre_log_size + 100
            except Exception:
                return False

        reconnect_s = _wait_until(_new_latency_line, 45)
        print(f"\n Trader reconnected and logging within {reconnect_s:.1f}s of kill")
        assert reconnect_s <= 45

    def test_hz_kill_capital_survives_on_disk(self):
        """Kill HZ (loses in-memory maps) → disk checkpoint must still have valid capital."""
        _assert_hz_was_healthy()
        # Ensure there is a disk checkpoint (trader must have written one)
        assert CAPITAL_DISK.exists(), "Disk checkpoint must exist before kill"
        data = json.loads(CAPITAL_DISK.read_text())
        pre_capital = float(data['capital'])
        assert pre_capital >= 1.0, f"Pre-kill capital invalid: {pre_capital}"

        _docker_kill("dolphin-hazelcast")
        time.sleep(1)

        # Disk checkpoint must be unchanged (not corrupted by kill)
        data2 = json.loads(CAPITAL_DISK.read_text())
        post_capital = float(data2['capital'])
        assert math.isfinite(post_capital) and post_capital >= 1.0
        # Within 1% of pre-kill (may have advanced slightly from a scan just before kill)
        assert abs(post_capital - pre_capital) / pre_capital < 0.01, \
            f"Capital changed unexpectedly: {pre_capital} → {post_capital}"

        # Wait for recovery
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)


# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 2: Hazelcast container graceful stop
# ══════════════════════════════════════════════════════════════════════════════
class TestHZContainerStop:

    def test_hz_stop_recovers_within_budget(self):
        """Graceful stop (SIGTERM) — same recovery path as kill."""
        _assert_hz_was_healthy()
        _docker_stop("dolphin-hazelcast")
        time.sleep(0.5)
        assert not _hz_active(timeout=0.3)
        recovery_s = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        print(f"\n HZ stop→recovered in {recovery_s:.1f}s")
        assert recovery_s <= HZ_RESTART_BUDGET_S


# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 3: Prefect container killed
# ══════════════════════════════════════════════════════════════════════════════
class TestPrefectContainerKill:

    def test_prefect_kill_recovers_within_budget(self):
        """SIGKILL Prefect → MHS probe detects → docker restart → Prefect healthy."""
        _assert_prefect_was_healthy()
        _docker_kill("dolphin-prefect")
        time.sleep(0.5)
        assert not _prefect_ok(timeout=0.3), "Prefect should be down"
        recovery_s = _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)
        print(f"\n Prefect kill→recovered in {recovery_s:.1f}s (budget {PREFECT_RESTART_BUDGET_S}s)")
        assert recovery_s <= PREFECT_RESTART_BUDGET_S

    def test_prefect_kill_hz_unaffected(self):
        """Killing Prefect must not affect HZ or the trader."""
        _assert_hz_was_healthy()
        _assert_prefect_was_healthy()
        _docker_kill("dolphin-prefect")
        time.sleep(2)
        # HZ must still be healthy
        assert _hz_active(timeout=1.0), "HZ must be unaffected by Prefect kill"
        # Trader must still be running
        pid = _trader_pid()
        assert pid is not None and pid > 0, "Trader must still be running"
        # Wait for Prefect to recover
        _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)


# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 4: Simultaneous HZ + Prefect kill
# ══════════════════════════════════════════════════════════════════════════════
class TestSimultaneousKill:

    def test_hz_and_prefect_simultaneous_kill(self):
        """Both killed simultaneously — both must recover independently."""
        _assert_hz_was_healthy()
        _assert_prefect_was_healthy()

        _docker_kill("dolphin-hazelcast")
        _docker_kill("dolphin-prefect")
        time.sleep(0.5)
        assert not _hz_active(timeout=0.3)
        assert not _prefect_ok(timeout=0.3)

        # Both must recover — HZ first (faster restart), then Prefect.
        # NOTE: prefect_recovery is measured from the moment HZ recovered,
        # so total wall time is hz_recovery + prefect_recovery.
        hz_recovery = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        prefect_recovery = _wait_until(_prefect_ok, PREFECT_RESTART_BUDGET_S)
        print(f"\n Simultaneous kill: HZ recovered in {hz_recovery:.1f}s, "
              f"Prefect in {prefect_recovery:.1f}s")
        assert hz_recovery <= HZ_RESTART_BUDGET_S
        assert prefect_recovery <= PREFECT_RESTART_BUDGET_S


# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 5: nautilus_trader process killed (supervisord restarts)
# ══════════════════════════════════════════════════════════════════════════════
class TestTraderProcessKill:

    def test_trader_kill_supervisord_restarts(self):
        """Kill trader process — supervisord must restart it and it must connect to HZ."""
        pid_before = _trader_pid()
        assert pid_before is not None
        subprocess.run(["kill", "-9", str(pid_before)], check=True)
        time.sleep(2)

        # Wait for supervisord to restart and new process to connect
        def _new_pid_running():
            r = _supervisord("status dolphin:nautilus_trader")
            return "RUNNING" in r.stdout

        recovery_s = _wait_until(_new_pid_running, 30)
        pid_after = _trader_pid()
        assert pid_after != pid_before, "supervisord must have assigned new PID"
        print(f"\n Trader killed+restarted in {recovery_s:.1f}s (PID {pid_before}→{pid_after})")

    def test_trader_restart_capital_restored_from_disk(self):
        """After trader restart, capital must be restored from disk checkpoint."""
        assert CAPITAL_DISK.exists(), "Disk checkpoint required"
        data = json.loads(CAPITAL_DISK.read_text())
        expected_capital = float(data['capital'])
        assert expected_capital >= 1.0

        pid_before = _trader_pid()
        subprocess.run(["kill", "-9", str(pid_before)], check=True)

        # Wait for restart, then give the restarted trader a few seconds to
        # log its startup banner + restore message before inspecting the log.
        _wait_until(lambda: _supervisord("status dolphin:nautilus_trader").stdout.count("RUNNING") > 0, 20)
        time.sleep(5)
        # Only inspect the log segment after the most recent startup banner.
        log_tail = Path(TRADER_LOG).read_text().split("🐬 DOLPHIN")[-1]
        if "no valid checkpoint" in log_tail:
            pytest.fail("Trader started without capital checkpoint — disk restore failed")
        if "Capital restored" in log_tail:
            # Extract restored value
            for line in log_tail.splitlines():
                if "Capital restored" in line:
                    print(f"\n {line.strip()}")
                    break


# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 6: scan_bridge process killed
# ══════════════════════════════════════════════════════════════════════════════
class TestScanBridgeKill:

    def test_scan_bridge_kill_supervisord_restarts(self):
        """Kill scan_bridge — supervisord must restart it within 20s."""
        import re
        r = _supervisord("status dolphin:scan_bridge")
        assert "RUNNING" in r.stdout, "scan_bridge must be running"
        # BUGFIX: supervisorctl prints "pid 12345," with a trailing comma, so a
        # token-wise isdigit() scan never matched and this test always skipped.
        # Parse with the same regex _trader_pid() uses.
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse scan_bridge PID")
        pid = int(m.group(1))
        subprocess.run(["kill", "-9", str(pid)], check=True)
        time.sleep(2)

        def _sb_running():
            return "RUNNING" in _supervisord("status dolphin:scan_bridge").stdout

        recovery_s = _wait_until(_sb_running, 20)
        print(f"\n scan_bridge restarted in {recovery_s:.1f}s")
        assert recovery_s <= 20
# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 7: Rapid repeated HZ kills (stress resilience)
# ══════════════════════════════════════════════════════════════════════════════
class TestHZRapidKills:

    def test_hz_three_rapid_kills(self):
        """Kill HZ 3 times — each must recover. Waits for MHS cooldown reset between kills."""
        i = 0
        while i < 3:
            # Every round starts from a verified-healthy cluster.
            _assert_hz_was_healthy()
            _docker_kill("dolphin-hazelcast")
            recovery_s = _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
            print(f"\n Kill #{i+1}: recovered in {recovery_s:.1f}s")
            assert recovery_s <= HZ_RESTART_BUDGET_S
            # Give MHS time to observe a healthy probe (which resets its
            # cooldown) before the next kill.
            time.sleep(1.5)
            i += 1


# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 8: Capital checkpoint integrity under concurrent writes
# ══════════════════════════════════════════════════════════════════════════════
class TestCapitalCheckpointIntegrity:

    def test_disk_checkpoint_always_valid_json(self):
        """Disk checkpoint must be valid JSON with capital >= 1.0 and finite ts."""
        assert CAPITAL_DISK.exists()
        checkpoint = json.loads(CAPITAL_DISK.read_text())
        capital = float(checkpoint['capital'])
        ts = float(checkpoint['ts'])
        assert math.isfinite(capital)
        assert capital >= 1.0
        assert math.isfinite(ts)
        assert ts > 1_700_000_000  # post-2023 epoch

    def test_disk_checkpoint_survives_hz_restart(self):
        """Restart HZ (clears in-memory maps) — disk checkpoint must still be valid."""
        assert CAPITAL_DISK.exists()
        pre = json.loads(CAPITAL_DISK.read_text())
        subprocess.run(["docker", "restart", "dolphin-hazelcast"],
                       check=True, capture_output=True)
        _wait_until(_hz_active, HZ_RESTART_BUDGET_S)
        post = json.loads(CAPITAL_DISK.read_text())
        # The restart must not have corrupted the on-disk checkpoint.
        assert math.isfinite(float(post['capital']))
        assert float(post['capital']) >= 1.0
        print(f"\n Capital pre={pre['capital']:.2f} post={post['capital']:.2f}")


# ══════════════════════════════════════════════════════════════════════════════
# FAILURE MODE 9: MHS (meta_health) killed — supervisord restarts it
# ══════════════════════════════════════════════════════════════════════════════
class TestMHSKill:

    def test_mhs_kill_supervisord_restarts(self):
        """Kill MHS — supervisord must restart it within 20s."""
        import re
        r = _supervisord("status dolphin_data:meta_health")
        assert "RUNNING" in r.stdout
        # BUGFIX: supervisorctl prints "pid 12345," with a trailing comma, so a
        # token-wise isdigit() scan never matched and this test always skipped.
        # Parse with the same regex _trader_pid() uses.
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse meta_health PID")
        pid = int(m.group(1))
        subprocess.run(["kill", "-9", str(pid)], check=True)
        time.sleep(2)

        def _mhs_running():
            return "RUNNING" in _supervisord("status dolphin_data:meta_health").stdout

        recovery_s = _wait_until(_mhs_running, 20)
        print(f"\n MHS restarted in {recovery_s:.1f}s")
        assert recovery_s <= 20

    def test_hz_heals_even_without_mhs(self):
        """Kill MHS then kill HZ — autoheal (Docker layer) must still recover HZ."""
        import re
        _assert_hz_was_healthy()
        # Kill MHS (comma-safe PID parse — supervisorctl output is "pid 12345, uptime ...")
        r = _supervisord("status dolphin_data:meta_health")
        m = re.search(r'pid\s+(\d+)', r.stdout)
        if not m:
            pytest.skip("Could not parse MHS PID")
        mhs_pid = int(m.group(1))
        subprocess.run(["kill", "-9", str(mhs_pid)], check=True)
        time.sleep(1)

        # Now kill HZ — autoheal must recover it without MHS
        _docker_kill("dolphin-hazelcast")
        time.sleep(1)
        # autoheal polls every 10s, Docker healthcheck interval 10s → worst case ~45s
        recovery_s = _wait_until(_hz_active, 60)
        print(f"\n HZ healed without MHS in {recovery_s:.1f}s (autoheal layer)")

        # Let MHS restart on its own via supervisord
        _wait_until(lambda: "RUNNING" in _supervisord("status dolphin_data:meta_health").stdout, 20)