"""DOLPHIN MIG3 — High-Frequency System Watchdog Service
Runs a continuous loop (< 1s tick) to verify system health.
Writes the health state to the Hazelcast map DOLPHIN_SYSTEM_HEALTH (blocking IMap proxy, no near-cache).
"""
import os
import time
import json
import logging
import argparse
import subprocess
import urllib.request
from pathlib import Path
from datetime import datetime, timezone, timedelta
import sys

import hazelcast

# Directory two levels above this file; used as the base for the sys.path entries below.
HCM_DIR = Path(__file__).parent.parent
# Must run before the import on the next line: it puts <HCM_DIR>/nautilus_dolphin on sys.path.
sys.path.insert(0, str(HCM_DIR / 'nautilus_dolphin'))
from nautilus_dolphin.nautilus.survival_stack import SurvivalStack

# Root logging configuration for the process; watchdog messages use the "Watchdog" logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(name)s - %(message)s')
logger = logging.getLogger("Watchdog")

# NOTE(review): identical to the HCM_DIR assignment above — this re-assignment looks redundant.
HCM_DIR = Path(__file__).parent.parent

# Use platform-independent paths: HCM_DIR and HCM_DIR/prod are added to sys.path before
# importing dolphin_paths (presumably located in one of them — confirm), whose helpers
# supply the data locations used below.
sys.path.insert(0, str(HCM_DIR))
sys.path.insert(0, str(HCM_DIR / 'prod'))
from dolphin_paths import get_eigenvalues_path, get_project_root

# Directory inspected by SystemWatchdog._check_scans for date-named sub-directories.
SCANS_DIR = get_eigenvalues_path()
# Directory searched by SystemWatchdog._check_logs for paper_pnl_*.jsonl files.
LOGS_DIR = get_project_root() / 'paper_logs'

# Hazelcast member address and cluster name handed to the client in SystemWatchdog._init_hz.
HZ_HOST = "localhost:5701"
HZ_CLUSTER = "dolphin"

class SystemWatchdog:
    def __init__(self, tick_rate=0.5):
        """Initialise loop timing, probe state, heal counters, the survival stack and Hazelcast.

        Args:
            tick_rate: Target duration in seconds of one fast-loop iteration.
        """
        # Loop control: fast probes run every tick, slow probes every `slow_tick_rate` seconds.
        self.running = False
        self.tick_rate = tick_rate
        self.slow_tick_rate = 10.0
        self.last_slow_tick = 0

        # Latest verdict of every probe. Key order is kept stable because the dict is
        # serialised as-is by _write_health().
        self.health = dict(
            hz='FAIL',
            hz_container='UNKNOWN',
            prefect='FAIL',
            prefect_container='UNKNOWN',
            mc_container='UNKNOWN',
            scans='UNKNOWN',
            logs='UNKNOWN',
            timestamp='',
            overall='DEGRADED',
        )

        # Hazelcast container — probed on the fast tick; 2 consecutive failures (~1s)
        # trigger a restart (~19s worst-case).
        self._hz_container_heal_threshold = 2
        self._hz_container_fail_streak = 0
        self._hz_container_last_restart = 0.0

        # Prefect container — probed on the fast tick; 4 consecutive failures (~2s)
        # trigger a restart (~35s worst-case).
        self._prefect_heal_threshold = 4
        self._prefect_fail_streak = 0
        self._prefect_last_restart = 0.0

        # Management Center container — non-critical, probed on the slow tick only;
        # 5 consecutive failures (50s) trigger a restart.
        self._mc_heal_threshold = 5
        self._mc_fail_streak = 0
        self._mc_last_restart = 0.0

        # DOLPHIN_FAST_RECOVERY accepts 1/true/yes (case-insensitive) to enable fast recovery.
        recovery_flag = os.environ.get('DOLPHIN_FAST_RECOVERY', '').lower()
        self.survival_stack = SurvivalStack(fast_recovery=recovery_flag in ('1', 'true', 'yes'))

        # Connect last so every attribute above exists even if Hazelcast is unreachable.
        self._init_hz()

    def _init_hz(self):
        """(Re)connect to Hazelcast and bind the map proxies the watchdog uses.

        On success ``hz_client``, ``health_map``, ``features_map`` and ``state_map``
        are populated. On any failure the error is logged and all four attributes are
        set to ``None``; a client that was created before the failure is shut down so
        its resources are not leaked.
        """
        client = None
        try:
            # NOTE(review): the constructor blocks while it connects; consider bounding
            # it with the client's cluster-connect timeout option — confirm against the
            # installed hazelcast-python-client version.
            client = hazelcast.HazelcastClient(
                cluster_name=HZ_CLUSTER,
                cluster_members=[HZ_HOST],
                connection_timeout=2.0
            )
            # Blocking IMap proxies; no near-cache is configured on this client.
            health_map = client.get_map('DOLPHIN_SYSTEM_HEALTH').blocking()
            features_map = client.get_map('DOLPHIN_FEATURES').blocking()
            state_map = client.get_map('DOLPHIN_STATE_BLUE').blocking()
        except Exception as e:
            logger.error(f"HZ Init failed: {e}")
            if client is not None:
                # The client connected but a proxy could not be obtained: release it
                # instead of dropping the only reference to a live client.
                try:
                    client.shutdown()
                except Exception as shutdown_err:
                    logger.debug(f"HZ client shutdown after failed init raised: {shutdown_err}")
            self.hz_client = None
            self.health_map = None
            self.features_map = None
            self.state_map = None
            return

        self.hz_client = client
        self.health_map = health_map
        self.features_map = features_map
        self.state_map = state_map

    def start(self):
        """Run the watchdog loop until ``running`` is cleared; blocks the caller.

        Every tick runs the fast probes (HZ client, HZ container, Prefect container).
        Every ``slow_tick_rate`` seconds the slow probes run as well (Prefect API,
        scans, logs, MC container). After probing, the overall status is derived
        ('GREEN' only when every probe reports exactly 'OK'), the health snapshot is
        written to Hazelcast and the survival stack is processed.

        Loop timing uses ``time.monotonic()`` rather than the wall clock, so a clock
        step (NTP correction, manual change) can neither stall the slow checks nor
        stretch the sleep between ticks.
        """
        self.running = True
        logger.info(f"Starting System Watchdog (tick={self.tick_rate}s)")

        # The monotonic clock has an undefined reference point, so back-date the last
        # slow tick explicitly to make the slow probes run on the first iteration.
        self.last_slow_tick = time.monotonic() - self.slow_tick_rate

        while self.running:
            t0 = time.monotonic()

            self._check_hz()
            self._check_hz_container()      # fast-tick: ~1ms HTTP probe
            self._check_prefect_container() # fast-tick: ~1ms HTTP probe

            if t0 - self.last_slow_tick >= self.slow_tick_rate:
                self._check_prefect()
                self._check_scans()
                self._check_logs()
                self._check_mc_container()  # slow-tick: MC is non-critical
                self.last_slow_tick = t0

            self.health['timestamp'] = datetime.now(timezone.utc).isoformat()

            # 'timestamp' and 'overall' are metadata, not probe results.
            probes_ok = all(
                status == 'OK'
                for name, status in self.health.items()
                if name not in ('timestamp', 'overall')
            )
            overall = 'GREEN' if probes_ok else 'DEGRADED'

            if overall != self.health['overall']:
                logger.info(f"Health transitioned {self.health['overall']} -> {overall}. Details: {self.health}")

            self.health['overall'] = overall
            self._write_health()
            self._process_survival_stack()

            # Sleep for the remainder of the tick, but always yield for at least 10ms.
            elapsed = time.monotonic() - t0
            time.sleep(max(0.01, self.tick_rate - elapsed))
            
    def stop(self):
        self.running = False
        if self.hz_client:
            self.hz_client.shutdown()

    def _check_hz(self):
        if self.hz_client and self.hz_client.lifecycle_service.is_running():
            if self.hz_client.cluster_service.get_members():
                self.health['hz'] = 'OK'
                return
        
        self.health['hz'] = 'FAIL'
        if not self.hz_client or not self.hz_client.lifecycle_service.is_running():
            self._init_hz()

    def _check_prefect(self, url="http://localhost:4200/api/health"):
        """Slow-tick probe of the Prefect API health endpoint.

        Sets ``health['prefect']`` to 'OK' when the endpoint answers with the bare
        JSON value ``true`` (the same body the fast-tick probe
        ``_check_prefect_container`` treats as healthy) or with an object whose
        ``status`` is ``"ok"``. Any other payload, a non-JSON body, or a request
        error yields 'FAIL'.

        Args:
            url: Health endpoint to query; defaults to the local Prefect API.
        """
        try:
            req = urllib.request.Request(url)
            with urllib.request.urlopen(req, timeout=1) as resp:
                payload = json.loads(resp.read().decode('utf-8'))
        except Exception:
            self.health['prefect'] = 'FAIL'
            return

        # `payload is True` (not truthiness) so that e.g. a non-empty error object
        # or the number 1 is not mistaken for a healthy answer.
        healthy = payload is True or (
            isinstance(payload, dict) and payload.get('status') == 'ok'
        )
        self.health['prefect'] = 'OK' if healthy else 'FAIL'

    def _check_scans(self):
        try:
            if not SCANS_DIR.exists():
                self.health['scans'] = 'FAIL (No directory)'
                return
            dirs = sorted(d.name for d in SCANS_DIR.iterdir() if d.is_dir() and len(d.name) == 10 and d.name.startswith('20'))
            if not dirs:
                self.health['scans'] = 'FAIL (No scans)'
                return
            latest_date = datetime.strptime(dirs[-1], '%Y-%m-%d').replace(tzinfo=timezone.utc)
            age = datetime.now(timezone.utc) - latest_date
            if age > timedelta(days=2):
                self.health['scans'] = f"FAIL (Stale {age.days} days)"
            else:
                self.health['scans'] = 'OK'
        except Exception as e:
            self.health['scans'] = f"FAIL ({str(e)})"

    def _check_logs(self):
        try:
            if not LOGS_DIR.exists():
                self.health['logs'] = 'FAIL (No directory)'
                return
            log_files = sorted(LOGS_DIR.glob("paper_pnl_*.jsonl"))
            if not log_files:
                self.health['logs'] = 'FAIL (No logs)'
                return
            last_line = ""
            with open(log_files[-1], 'r') as f:
                for line in f:
                    if line.strip():
                        last_line = line
            if not last_line:
                self.health['logs'] = 'FAIL (Empty log)'
                return
            data = json.loads(last_line)
            log_date = datetime.fromisoformat(data['logged_at'])
            age = datetime.now(timezone.utc) - log_date
            if age > timedelta(days=2):
                self.health['logs'] = f"FAIL (Stale {age.days} days)"
            else:
                self.health['logs'] = 'OK'
        except Exception as e:
            self.health['logs'] = f"FAIL ({str(e)})"

    def _check_hz_container(self):
        """Probe the Hazelcast HTTP health endpoint on every fast tick.

        Stores 'OK' when the reported ``nodeState`` is 'ACTIVE', 'WARN (<state>)'
        for any other state (e.g. PASSIVE, FROZEN) and 'FAIL' when the request or
        parsing errors. Every non-OK result extends the failure streak; once the
        streak reaches the heal threshold ``_heal_hz_container`` is invoked.
        """
        try:
            probe = urllib.request.Request('http://127.0.0.1:5701/hazelcast/health')
            with urllib.request.urlopen(probe, timeout=0.3) as reply:
                payload = json.loads(reply.read())
            node_state = payload.get('nodeState', 'UNKNOWN')
        except Exception:
            outcome = 'FAIL'
        else:
            outcome = 'OK' if node_state == 'ACTIVE' else f'WARN ({node_state})'

        self.health['hz_container'] = outcome
        if outcome == 'OK':
            self._hz_container_fail_streak = 0
            return

        self._hz_container_fail_streak += 1
        if self._hz_container_fail_streak >= self._hz_container_heal_threshold:
            self._heal_hz_container()

    def _heal_hz_container(self):
        """Restart the ``dolphin-hazelcast`` container, at most once every 30 seconds.

        Called by ``_check_hz_container`` once the failure streak reaches the heal
        threshold. A successful restart resets the streak and starts the cooldown;
        a failed ``docker restart`` is logged and leaves both untouched.

        The cooldown is measured with ``time.monotonic()`` so a wall-clock step
        backwards cannot suppress healing.
        """
        now = time.monotonic()
        last_restart = self._hz_container_last_restart
        # 0.0 means "never restarted by this process". The monotonic clock has an
        # undefined reference point, so that case must not be compared against it.
        if last_restart > 0 and now - last_restart < 30:   # minimum 30s between restarts (restart takes ~15s)
            logger.warning("HZ heal suppressed — last restart was <30s ago")
            return

        logger.critical("AUTO-HEAL: restarting dolphin-hazelcast (sustained unhealthy)")
        try:
            subprocess.run(['docker', 'restart', 'dolphin-hazelcast'], timeout=30, check=True)
        except Exception as e:
            logger.error(f"AUTO-HEAL: docker restart failed: {e}")
            return

        self._hz_container_fail_streak = 0
        self._hz_container_last_restart = now
        logger.info("AUTO-HEAL: dolphin-hazelcast restarted successfully")

    def _check_prefect_container(self):
        """Probe the Prefect API health endpoint on every fast tick.

        Stores 'OK' when the response body is exactly ``true``, 'WARN (body=…)' for
        any other body and 'FAIL' when the request errors. Every non-OK result
        extends the failure streak; once the streak reaches the heal threshold
        ``_heal_prefect_container`` is invoked.
        """
        try:
            probe = urllib.request.Request('http://127.0.0.1:4200/api/health')
            with urllib.request.urlopen(probe, timeout=0.3) as reply:
                payload = reply.read().strip()
        except Exception:
            outcome = 'FAIL'
        else:
            # Only the first 20 bytes of an unexpected body are echoed into the status.
            outcome = 'OK' if payload == b'true' else f'WARN (body={payload[:20]})'

        self.health['prefect_container'] = outcome
        if outcome == 'OK':
            self._prefect_fail_streak = 0
            return

        self._prefect_fail_streak += 1
        if self._prefect_fail_streak >= self._prefect_heal_threshold:
            self._heal_prefect_container()

    def _heal_prefect_container(self):
        """Restart the ``dolphin-prefect`` container, at most once every 60 seconds.

        Called by ``_check_prefect_container`` once the failure streak reaches the
        heal threshold. A successful restart resets the streak and starts the
        cooldown; a failed ``docker restart`` is logged and leaves both untouched.

        The cooldown is measured with ``time.monotonic()`` so a wall-clock step
        backwards cannot suppress healing.
        """
        now = time.monotonic()
        last_restart = self._prefect_last_restart
        # 0.0 means "never restarted by this process". The monotonic clock has an
        # undefined reference point, so that case must not be compared against it.
        if last_restart > 0 and now - last_restart < 60:
            logger.warning("Prefect heal suppressed — last restart <60s ago")
            return

        logger.critical("AUTO-HEAL: restarting dolphin-prefect (sustained unhealthy)")
        try:
            subprocess.run(['docker', 'restart', 'dolphin-prefect'], timeout=45, check=True)
        except Exception as e:
            logger.error(f"AUTO-HEAL: prefect restart failed: {e}")
            return

        self._prefect_fail_streak = 0
        self._prefect_last_restart = now
        logger.info("AUTO-HEAL: dolphin-prefect restarted successfully")

    def _check_mc_container(self):
        """Probe the Hazelcast Management Center root page on the slow tick (non-critical).

        Stores 'OK' for HTTP 200, 'WARN (status=…)' for any other status that
        ``urlopen`` returns without raising, and 'FAIL' when the request errors.
        Every non-OK result extends the failure streak; once the streak reaches the
        heal threshold ``_heal_mc_container`` is invoked.
        """
        try:
            probe = urllib.request.Request('http://127.0.0.1:8080/')
            with urllib.request.urlopen(probe, timeout=1.0) as reply:
                http_status = reply.status
        except Exception:
            outcome = 'FAIL'
        else:
            outcome = 'OK' if http_status == 200 else f'WARN (status={http_status})'

        self.health['mc_container'] = outcome
        if outcome == 'OK':
            self._mc_fail_streak = 0
            return

        self._mc_fail_streak += 1
        if self._mc_fail_streak >= self._mc_heal_threshold:
            self._heal_mc_container()

    def _heal_mc_container(self):
        """Restart the ``dolphin-hazelcast-mc`` container, at most once every 120 seconds.

        Called by ``_check_mc_container`` once the failure streak reaches the heal
        threshold. A successful restart resets the streak and starts the cooldown;
        a failed ``docker restart`` is logged and leaves both untouched.

        The cooldown is measured with ``time.monotonic()`` so a wall-clock step
        backwards cannot suppress healing.
        """
        now = time.monotonic()
        last_restart = self._mc_last_restart
        # 0.0 means "never restarted by this process". The monotonic clock has an
        # undefined reference point, so that case must not be compared against it.
        if last_restart > 0 and now - last_restart < 120:
            logger.warning("MC heal suppressed — last restart <120s ago")
            return

        logger.critical("AUTO-HEAL: restarting dolphin-hazelcast-mc (sustained unhealthy)")
        try:
            subprocess.run(['docker', 'restart', 'dolphin-hazelcast-mc'], timeout=60, check=True)
        except Exception as e:
            logger.error(f"AUTO-HEAL: mc restart failed: {e}")
            return

        self._mc_fail_streak = 0
        self._mc_last_restart = now
        logger.info("AUTO-HEAL: dolphin-hazelcast-mc restarted successfully")

    def _process_survival_stack(self):
        if not self.features_map:
            return
            
        try:
            # Gather state safely
            hz_nodes = len(self.hz_client.cluster_service.get_members()) if self.hz_client else 0
            heartbeat_age = 0.0 # Handled by the watchdog directly being alive
            
            mc_raw = self.features_map.get('mc_forewarner_latest')
            mc_state = json.loads(mc_raw) if mc_raw else {}
            mc_status = mc_state.get('status', 'ORANGE') # default ORANGE if missing
            mc_ts = mc_state.get('timestamp')
            
            if mc_ts:
                mc_age = (datetime.now(timezone.utc) - datetime.fromisoformat(mc_ts)).total_seconds() / 3600.0
            else:
                mc_age = 999.0
            
            ob_raw = self.features_map.get('asset_BTCUSDT_ob')
            ob_state = json.loads(ob_raw) if ob_raw else {}
            
            ob_depth = ob_state.get('depth_quality', 0.5)
            ob_fill = ob_state.get('fill_prob', 0.5)
            ob_stale = ob_state.get('stale', True)
            
            dvol_spike = False # Extracted from ExF, currently hardcoded neutral
            t_since_spike = 999.0
            
            state_raw = self.state_map.get('latest')
            state = json.loads(state_raw) if state_raw else {}
            drawdown = state.get('drawdown', 0.0)
            
            rm, breakdown = self.survival_stack.compute_rm(
                hz_nodes=hz_nodes, heartbeat_age_s=heartbeat_age,
                mc_status=mc_status, mc_staleness_hours=mc_age,
                ob_depth_quality=ob_depth, ob_fill_prob=ob_fill, ob_stale=ob_stale,
                dvol_spike=dvol_spike, t_since_spike_min=t_since_spike,
                drawdown=drawdown
            )
            
            posture = self.survival_stack.update_posture(rm)
            self.survival_stack.write_to_hz(rm, breakdown, self.hz_client)
            
        except Exception as e:
            logger.error(f"Failed to process Survival Stack: {e}")

    def _write_health(self):
        if self.health_map:
            try:
                self.health_map.put('latest', json.dumps(self.health))
            except Exception as e:
                logger.debug(f"Failed to write health to HZ: {e}")

if __name__ == '__main__':
    # CLI entry point: run the watchdog until it is interrupted or fails, and always
    # release the Hazelcast client on the way out.
    parser = argparse.ArgumentParser(description="DOLPHIN system watchdog service")
    parser.add_argument(
        '--tick-rate', type=float, default=0.5,
        help="fast-loop tick duration in seconds (default: 0.5)",
    )
    args = parser.parse_args()
    watchdog = SystemWatchdog(tick_rate=args.tick_rate)
    try:
        watchdog.start()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the service; fall through to cleanup.
        pass
    finally:
        # Also runs when start() raises, so the HZ client is shut down on every exit path.
        watchdog.stop()
        logger.info("Watchdog stopped.")