"""DOLPHIN MIG3 — High-Frequency System Watchdog Service

Runs a continuous loop (< 1s tick) to verify system health.
Writes DOLPHIN_SYSTEM_HEALTH state to Hazelcast strongly consistent structure.
"""
import os
import time
import json
import logging
import argparse
import subprocess
import urllib.request
from pathlib import Path
from datetime import datetime, timezone, timedelta
import sys

import hazelcast

HCM_DIR = Path(__file__).parent.parent
# The path must be extended before the project import below can resolve.
sys.path.insert(0, str(HCM_DIR / 'nautilus_dolphin'))
from nautilus_dolphin.nautilus.survival_stack import SurvivalStack

logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(name)s - %(message)s')
logger = logging.getLogger("Watchdog")

# Use platform-independent paths
sys.path.insert(0, str(HCM_DIR))
sys.path.insert(0, str(HCM_DIR / 'prod'))
from dolphin_paths import get_eigenvalues_path, get_project_root

SCANS_DIR = get_eigenvalues_path()
LOGS_DIR = get_project_root() / 'paper_logs'
HZ_HOST = "localhost:5701"
HZ_CLUSTER = "dolphin"


def _parse_iso_utc(value: str) -> datetime:
    """Parse an ISO-8601 timestamp into a timezone-aware datetime.

    A trailing ``Z`` is rewritten to ``+00:00`` because
    ``datetime.fromisoformat`` only accepts it on newer Python versions.
    Naive timestamps are tagged as UTC so they can be subtracted from an
    aware "now" without raising ``TypeError``.

    NOTE(review): treating naive timestamps as UTC is an assumption about the
    producers of these timestamps — confirm against the writers.
    """
    text = value[:-1] + '+00:00' if value.endswith('Z') else value
    parsed = datetime.fromisoformat(text)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


class SystemWatchdog:
    """Periodic health prober that publishes its findings to Hazelcast.

    Each tick runs the fast probes (Hazelcast client, Hazelcast HTTP health,
    Prefect HTTP health). Every ``slow_tick_rate`` seconds it additionally
    runs the slow probes (Prefect API, scan directories, paper logs,
    Hazelcast MC). Results are stored in ``self.health`` and written under the
    key ``'latest'`` of the ``DOLPHIN_SYSTEM_HEALTH`` map. Container probes
    that fail repeatedly trigger a ``docker restart`` of that container.
    """

    def __init__(self, tick_rate=0.5):
        """Set up probe state and attempt the initial Hazelcast connection.

        Args:
            tick_rate: Target duration of one loop iteration, in seconds.
        """
        self.tick_rate = tick_rate
        self.slow_tick_rate = 10.0
        self.last_slow_tick = 0  # 0 => the first loop iteration runs the slow probes
        self.running = False
        self.health = {
            'hz': 'FAIL',
            'hz_container': 'UNKNOWN',
            'prefect': 'FAIL',
            'prefect_container': 'UNKNOWN',
            'mc_container': 'UNKNOWN',
            'scans': 'UNKNOWN',
            'logs': 'UNKNOWN',
            'timestamp': '',
            'overall': 'DEGRADED',
        }

        # HZ auto-heal: fast-tick HTTP probe, 2 failures (~1s) → restart (~19s worst-case)
        self._hz_container_fail_streak = 0
        self._hz_container_heal_threshold = 2
        self._hz_container_last_restart = 0.0

        # Prefect auto-heal: fast-tick HTTP probe, 4 failures (~2s) → restart (~35s worst-case)
        self._prefect_fail_streak = 0
        self._prefect_heal_threshold = 4
        self._prefect_last_restart = 0.0

        # MC auto-heal: slow-tick only (non-critical), 5 failures (50s) → restart
        self._mc_fail_streak = 0
        self._mc_heal_threshold = 5
        self._mc_last_restart = 0.0

        _fast_recovery = os.environ.get('DOLPHIN_FAST_RECOVERY', '').lower() in ('1', 'true', 'yes')
        self.survival_stack = SurvivalStack(fast_recovery=_fast_recovery)

        self._init_hz()

    def _init_hz(self):
        """Create the Hazelcast client and map proxies, or reset them to None.

        The client is built in a local variable and only published on
        ``self`` once every map proxy was obtained. If anything fails
        part-way, the half-initialised client is shut down instead of being
        dropped while still holding its connections.
        """
        # NOTE(review): the constructor may block while the cluster is
        # unreachable, depending on the client's cluster-connect timeout
        # default — confirm, since that would stall the probe loop.
        client = None
        try:
            client = hazelcast.HazelcastClient(
                cluster_name=HZ_CLUSTER,
                cluster_members=[HZ_HOST],
                connection_timeout=2.0,
            )
            # Use IMap for strong consistency, no near-cache
            health_map = client.get_map('DOLPHIN_SYSTEM_HEALTH').blocking()
            features_map = client.get_map('DOLPHIN_FEATURES').blocking()
            state_map = client.get_map('DOLPHIN_STATE_BLUE').blocking()
        except Exception as e:
            logger.error(f"HZ Init failed: {e}")
            if client is not None:
                try:
                    client.shutdown()
                except Exception as shutdown_err:
                    logger.debug(f"HZ client shutdown after failed init raised: {shutdown_err}")
            self.hz_client = None
            self.health_map = None
            self.features_map = None
            self.state_map = None
            return

        self.hz_client = client
        self.health_map = health_map
        self.features_map = features_map
        self.state_map = state_map

    def start(self):
        """Run the probe loop until ``stop()`` clears ``self.running``."""
        self.running = True
        logger.info(f"Starting System Watchdog (tick={self.tick_rate}s)")

        while self.running:
            t0 = time.time()

            self._check_hz()
            self._check_hz_container()        # fast-tick: ~1ms HTTP probe
            self._check_prefect_container()   # fast-tick: ~1ms HTTP probe

            if t0 - self.last_slow_tick >= self.slow_tick_rate:
                self._check_prefect()
                self._check_scans()
                self._check_logs()
                self._check_mc_container()    # slow-tick: MC is non-critical
                self.last_slow_tick = t0

            self.health['timestamp'] = datetime.now(timezone.utc).isoformat()

            # GREEN only when every component entry is exactly 'OK'.
            component_states = (
                v for k, v in self.health.items() if k not in ('timestamp', 'overall')
            )
            overall = 'GREEN' if all(state == 'OK' for state in component_states) else 'DEGRADED'

            if overall != self.health['overall']:
                logger.info(f"Health transitioned {self.health['overall']} -> {overall}. Details: {self.health}")

            self.health['overall'] = overall
            self._write_health()
            self._process_survival_stack()

            # Sleep the remainder of the tick, but always yield at least 10ms.
            elapsed = time.time() - t0
            time.sleep(max(0.01, self.tick_rate - elapsed))

    def stop(self):
        """Ask the loop to exit and shut down the Hazelcast client, if any."""
        self.running = False
        if self.hz_client:
            self.hz_client.shutdown()

    def _check_hz(self):
        """Update ``health['hz']`` from the client's lifecycle and member list.

        Re-runs ``_init_hz`` when there is no client or it is not running.
        Errors raised by the client are reported as ``FAIL`` rather than
        allowed to terminate the watchdog loop.
        """
        client = self.hz_client
        running = False
        try:
            running = client is not None and client.lifecycle_service.is_running()
            if running and client.cluster_service.get_members():
                self.health['hz'] = 'OK'
                return
        except Exception as e:
            logger.error(f"HZ client status check failed: {e}")

        self.health['hz'] = 'FAIL'
        if not running:
            self._init_hz()

    def _check_prefect(self):
        """Slow-tick probe of the Prefect health endpoint (1s timeout).

        Accepts either the bare JSON ``true`` (the body the fast-tick probe
        in this class also expects from the same endpoint) or an object with
        ``"status": "ok"``. Calling ``.get`` on the boolean body used to raise
        and made this check report ``FAIL`` permanently.
        """
        try:
            req = urllib.request.Request("http://localhost:4200/api/health")
            with urllib.request.urlopen(req, timeout=1) as resp:
                data = json.loads(resp.read().decode('utf-8'))
        except Exception:
            self.health['prefect'] = 'FAIL'
            return

        healthy = data is True or (isinstance(data, dict) and data.get('status') == 'ok')
        self.health['prefect'] = 'OK' if healthy else 'FAIL'

    def _check_scans(self):
        """Report ``scans`` as OK when the newest dated scan dir is <= 2 days old.

        Considers sub-directories of ``SCANS_DIR`` whose 10-character name
        starts with ``20`` and parses as ``YYYY-MM-DD``. Names that merely
        look similar are skipped instead of failing the whole check.
        """
        try:
            if not SCANS_DIR.exists():
                self.health['scans'] = 'FAIL (No directory)'
                return

            scan_dates = []
            for entry in SCANS_DIR.iterdir():
                name = entry.name
                if not (entry.is_dir() and len(name) == 10 and name.startswith('20')):
                    continue
                try:
                    scan_dates.append(datetime.strptime(name, '%Y-%m-%d'))
                except ValueError:
                    continue  # not a date-named directory

            if not scan_dates:
                self.health['scans'] = 'FAIL (No scans)'
                return

            latest_date = max(scan_dates).replace(tzinfo=timezone.utc)
            age = datetime.now(timezone.utc) - latest_date
            if age > timedelta(days=2):
                self.health['scans'] = f"FAIL (Stale {age.days} days)"
            else:
                self.health['scans'] = 'OK'
        except Exception as e:
            self.health['scans'] = f"FAIL ({str(e)})"

    def _check_logs(self):
        """Report ``logs`` as OK when the newest paper-PnL record is <= 2 days old.

        Reads the last non-blank line of the lexicographically last
        ``paper_pnl_*.jsonl`` file in ``LOGS_DIR`` and compares its
        ``logged_at`` field with the current UTC time.
        """
        try:
            if not LOGS_DIR.exists():
                self.health['logs'] = 'FAIL (No directory)'
                return

            log_files = sorted(LOGS_DIR.glob("paper_pnl_*.jsonl"))
            if not log_files:
                self.health['logs'] = 'FAIL (No logs)'
                return

            last_line = ""
            with open(log_files[-1], 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():
                        last_line = line

            if not last_line:
                self.health['logs'] = 'FAIL (Empty log)'
                return

            data = json.loads(last_line)
            # Normalised to an aware datetime so the subtraction cannot raise.
            log_date = _parse_iso_utc(data['logged_at'])
            age = datetime.now(timezone.utc) - log_date
            if age > timedelta(days=2):
                self.health['logs'] = f"FAIL (Stale {age.days} days)"
            else:
                self.health['logs'] = 'OK'
        except Exception as e:
            self.health['logs'] = f"FAIL ({str(e)})"

    def _check_hz_container(self):
        """Fast-tick HTTP health probe (~1ms when healthy). Triggers auto-heal after 2 failures."""
        try:
            req = urllib.request.Request('http://127.0.0.1:5701/hazelcast/health')
            with urllib.request.urlopen(req, timeout=0.3) as resp:
                data = json.loads(resp.read())
            if data.get('nodeState') == 'ACTIVE':
                self.health['hz_container'] = 'OK'
                self._hz_container_fail_streak = 0
                return
            # Node up but not ACTIVE (e.g. PASSIVE, FROZEN)
            state = data.get('nodeState', 'UNKNOWN')
            self.health['hz_container'] = f'WARN ({state})'
            self._hz_container_fail_streak += 1
        except Exception:
            self.health['hz_container'] = 'FAIL'
            self._hz_container_fail_streak += 1

        if self._hz_container_fail_streak >= self._hz_container_heal_threshold:
            self._heal_hz_container()

    def _heal_hz_container(self):
        """Restart the HZ container after a sustained unhealthy state.

        Suppressed when the last successful restart was under 30s ago. A
        failed restart does not start that cooldown, so the next failing
        probe retries immediately. Blocks the loop for up to 30s.
        """
        now = time.time()
        if now - self._hz_container_last_restart < 30:
            # minimum 30s between restarts (restart takes ~15s)
            logger.warning("HZ heal suppressed — last restart was <30s ago")
            return
        logger.critical("AUTO-HEAL: restarting dolphin-hazelcast (sustained unhealthy)")
        try:
            subprocess.run(['docker', 'restart', 'dolphin-hazelcast'], timeout=30, check=True)
            self._hz_container_fail_streak = 0
            self._hz_container_last_restart = now
            logger.info("AUTO-HEAL: dolphin-hazelcast restarted successfully")
        except Exception as e:
            logger.error(f"AUTO-HEAL: docker restart failed: {e}")

    def _check_prefect_container(self):
        """Fast-tick HTTP probe to Prefect API. Heals after 4 consecutive failures (~2s)."""
        try:
            req = urllib.request.Request('http://127.0.0.1:4200/api/health')
            with urllib.request.urlopen(req, timeout=0.3) as resp:
                body = resp.read().strip()
            if body == b'true':
                self.health['prefect_container'] = 'OK'
                self._prefect_fail_streak = 0
                return
            self.health['prefect_container'] = f'WARN (body={body[:20]})'
            self._prefect_fail_streak += 1
        except Exception:
            self.health['prefect_container'] = 'FAIL'
            self._prefect_fail_streak += 1

        if self._prefect_fail_streak >= self._prefect_heal_threshold:
            self._heal_prefect_container()

    def _heal_prefect_container(self):
        """Restart the Prefect container; suppressed within 60s of the last successful restart."""
        now = time.time()
        if now - self._prefect_last_restart < 60:
            logger.warning("Prefect heal suppressed — last restart <60s ago")
            return
        logger.critical("AUTO-HEAL: restarting dolphin-prefect (sustained unhealthy)")
        try:
            subprocess.run(['docker', 'restart', 'dolphin-prefect'], timeout=45, check=True)
            self._prefect_fail_streak = 0
            self._prefect_last_restart = now
            logger.info("AUTO-HEAL: dolphin-prefect restarted successfully")
        except Exception as e:
            logger.error(f"AUTO-HEAL: prefect restart failed: {e}")

    def _check_mc_container(self):
        """Slow-tick HTTP probe to Hazelcast MC (non-critical). Heals after 5 slow-tick failures."""
        try:
            req = urllib.request.Request('http://127.0.0.1:8080/')
            with urllib.request.urlopen(req, timeout=1.0) as resp:
                status = resp.status
            if status == 200:
                self.health['mc_container'] = 'OK'
                self._mc_fail_streak = 0
                return
            self.health['mc_container'] = f'WARN (status={status})'
            self._mc_fail_streak += 1
        except Exception:
            self.health['mc_container'] = 'FAIL'
            self._mc_fail_streak += 1

        if self._mc_fail_streak >= self._mc_heal_threshold:
            self._heal_mc_container()

    def _heal_mc_container(self):
        """Restart the Hazelcast MC container; suppressed within 120s of the last successful restart."""
        now = time.time()
        if now - self._mc_last_restart < 120:
            logger.warning("MC heal suppressed — last restart <120s ago")
            return
        logger.critical("AUTO-HEAL: restarting dolphin-hazelcast-mc (sustained unhealthy)")
        try:
            subprocess.run(['docker', 'restart', 'dolphin-hazelcast-mc'], timeout=60, check=True)
            self._mc_fail_streak = 0
            self._mc_last_restart = now
            logger.info("AUTO-HEAL: dolphin-hazelcast-mc restarted successfully")
        except Exception as e:
            logger.error(f"AUTO-HEAL: mc restart failed: {e}")

    def _process_survival_stack(self):
        """Feed current Hazelcast state into the SurvivalStack and publish its result.

        Reads ``mc_forewarner_latest`` and ``asset_BTCUSDT_ob`` from the
        features map and ``latest`` from the state map, substituting neutral
        defaults for missing entries. Does nothing without a features map.
        Any failure is logged and swallowed so the probe loop keeps running.
        """
        if not self.features_map:
            return

        try:
            # Gather state safely
            hz_nodes = len(self.hz_client.cluster_service.get_members()) if self.hz_client else 0
            heartbeat_age = 0.0  # Handled by the watchdog directly being alive

            mc_raw = self.features_map.get('mc_forewarner_latest')
            mc_state = json.loads(mc_raw) if mc_raw else {}
            mc_status = mc_state.get('status', 'ORANGE')  # default ORANGE if missing

            mc_ts = mc_state.get('timestamp')
            if mc_ts:
                # Staleness in hours; timestamp normalised so the subtraction cannot raise.
                mc_age = (datetime.now(timezone.utc) - _parse_iso_utc(mc_ts)).total_seconds() / 3600.0
            else:
                mc_age = 999.0

            ob_raw = self.features_map.get('asset_BTCUSDT_ob')
            ob_state = json.loads(ob_raw) if ob_raw else {}
            ob_depth = ob_state.get('depth_quality', 0.5)
            ob_fill = ob_state.get('fill_prob', 0.5)
            ob_stale = ob_state.get('stale', True)

            dvol_spike = False  # Extracted from ExF, currently hardcoded neutral
            t_since_spike = 999.0

            state_raw = self.state_map.get('latest')
            state = json.loads(state_raw) if state_raw else {}
            drawdown = state.get('drawdown', 0.0)

            rm, breakdown = self.survival_stack.compute_rm(
                hz_nodes=hz_nodes,
                heartbeat_age_s=heartbeat_age,
                mc_status=mc_status,
                mc_staleness_hours=mc_age,
                ob_depth_quality=ob_depth,
                ob_fill_prob=ob_fill,
                ob_stale=ob_stale,
                dvol_spike=dvol_spike,
                t_since_spike_min=t_since_spike,
                drawdown=drawdown,
            )

            # Called for its effect on the stack; the returned value is not used here.
            self.survival_stack.update_posture(rm)
            self.survival_stack.write_to_hz(rm, breakdown, self.hz_client)
        except Exception as e:
            logger.error(f"Failed to process Survival Stack: {e}")

    def _write_health(self):
        """Publish ``self.health`` as JSON under key ``'latest'``; best-effort."""
        if self.health_map:
            try:
                self.health_map.put('latest', json.dumps(self.health))
            except Exception as e:
                logger.debug(f"Failed to write health to HZ: {e}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--tick-rate',
        type=float,
        default=0.5,
        help='Target duration of one watchdog loop iteration, in seconds (default: 0.5)',
    )
    args = parser.parse_args()

    watchdog = SystemWatchdog(tick_rate=args.tick_rate)
    try:
        watchdog.start()
    except KeyboardInterrupt:
        pass
    finally:
        # Release the Hazelcast client on any exit path, not only on Ctrl-C.
        watchdog.stop()
        logger.info("Watchdog stopped.")