360 lines
14 KiB
Python
360 lines
14 KiB
Python
|
|
"""DOLPHIN MIG3 — High-Frequency System Watchdog Service
|
||
|
|
Runs a continuous loop (< 1s tick) to verify system health.
|
||
|
|
Writes DOLPHIN_SYSTEM_HEALTH state to Hazelcast strongly consistent structure.
|
||
|
|
"""
|
||
|
|
import os
|
||
|
|
import time
|
||
|
|
import json
|
||
|
|
import logging
|
||
|
|
import argparse
|
||
|
|
import subprocess
|
||
|
|
import urllib.request
|
||
|
|
from pathlib import Path
|
||
|
|
from datetime import datetime, timezone, timedelta
|
||
|
|
import sys
|
||
|
|
|
||
|
|
import hazelcast
|
||
|
|
|
||
|
|
# --- Path bootstrap --------------------------------------------------------
# This file lives one level below the HCM root: <HCM_DIR>/<subdir>/this_file.
# (Fix: HCM_DIR was previously assigned twice; the duplicate is removed.)
HCM_DIR = Path(__file__).parent.parent

sys.path.insert(0, str(HCM_DIR / 'nautilus_dolphin'))
from nautilus_dolphin.nautilus.survival_stack import SurvivalStack

logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(name)s - %(message)s')
logger = logging.getLogger("Watchdog")

# Use platform-independent paths
sys.path.insert(0, str(HCM_DIR))
sys.path.insert(0, str(HCM_DIR / 'prod'))
from dolphin_paths import get_eigenvalues_path, get_project_root

# Data locations probed by the watchdog.
SCANS_DIR = get_eigenvalues_path()            # eigenvalue scans (YYYY-MM-DD subdirs)
LOGS_DIR = get_project_root() / 'paper_logs'  # paper-trading PnL jsonl logs

# Hazelcast connection target.
HZ_HOST = "localhost:5701"
HZ_CLUSTER = "dolphin"
|
||
|
|
|
||
|
|
class SystemWatchdog:
    """High-frequency watchdog: probes DOLPHIN infrastructure every tick and
    publishes the aggregate health state to Hazelcast."""

    def __init__(self, tick_rate=0.5):
        """Set up tick cadences, the health ledger, auto-heal counters and the
        Hazelcast client.

        Args:
            tick_rate: seconds between fast-path probe iterations.
        """
        self.tick_rate = tick_rate
        self.slow_tick_rate = 10.0  # cadence (s) of the expensive checks
        self.last_slow_tick = 0
        self.running = False

        # One status string per monitored component; 'overall' is derived
        # from the others on every tick.
        self.health = {
            'hz': 'FAIL',
            'hz_container': 'UNKNOWN',
            'prefect': 'FAIL',
            'prefect_container': 'UNKNOWN',
            'mc_container': 'UNKNOWN',
            'scans': 'UNKNOWN',
            'logs': 'UNKNOWN',
            'timestamp': '',
            'overall': 'DEGRADED'
        }

        # HZ auto-heal: fast-tick HTTP probe, 2 failures (~1s) → restart (~19s worst-case)
        self._hz_container_fail_streak = 0
        self._hz_container_heal_threshold = 2
        self._hz_container_last_restart = 0.0

        # Prefect auto-heal: fast-tick HTTP probe, 4 failures (~2s) → restart (~35s worst-case)
        self._prefect_fail_streak = 0
        self._prefect_heal_threshold = 4
        self._prefect_last_restart = 0.0

        # MC auto-heal: slow-tick only (non-critical), 5 failures (50s) → restart
        self._mc_fail_streak = 0
        self._mc_heal_threshold = 5
        self._mc_last_restart = 0.0

        # Optional fast-recovery mode, toggled via environment variable.
        fast_recovery_enabled = os.environ.get('DOLPHIN_FAST_RECOVERY', '').lower() in {'1', 'true', 'yes'}
        self.survival_stack = SurvivalStack(fast_recovery=fast_recovery_enabled)

        self._init_hz()
|
||
|
|
|
||
|
|
def _init_hz(self):
|
||
|
|
try:
|
||
|
|
self.hz_client = hazelcast.HazelcastClient(
|
||
|
|
cluster_name=HZ_CLUSTER,
|
||
|
|
cluster_members=[HZ_HOST],
|
||
|
|
connection_timeout=2.0
|
||
|
|
)
|
||
|
|
# Use IMap for strong consistency, no near-cache
|
||
|
|
self.health_map = self.hz_client.get_map('DOLPHIN_SYSTEM_HEALTH').blocking()
|
||
|
|
self.features_map = self.hz_client.get_map('DOLPHIN_FEATURES').blocking()
|
||
|
|
self.state_map = self.hz_client.get_map('DOLPHIN_STATE_BLUE').blocking()
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"HZ Init failed: {e}")
|
||
|
|
self.hz_client = None
|
||
|
|
self.health_map = None
|
||
|
|
self.features_map = None
|
||
|
|
self.state_map = None
|
||
|
|
|
||
|
|
def start(self):
|
||
|
|
self.running = True
|
||
|
|
logger.info(f"Starting System Watchdog (tick={self.tick_rate}s)")
|
||
|
|
|
||
|
|
while self.running:
|
||
|
|
t0 = time.time()
|
||
|
|
|
||
|
|
self._check_hz()
|
||
|
|
self._check_hz_container() # fast-tick: ~1ms HTTP probe
|
||
|
|
self._check_prefect_container() # fast-tick: ~1ms HTTP probe
|
||
|
|
|
||
|
|
if t0 - self.last_slow_tick >= self.slow_tick_rate:
|
||
|
|
self._check_prefect()
|
||
|
|
self._check_scans()
|
||
|
|
self._check_logs()
|
||
|
|
self._check_mc_container() # slow-tick: MC is non-critical
|
||
|
|
self.last_slow_tick = t0
|
||
|
|
|
||
|
|
self.health['timestamp'] = datetime.now(timezone.utc).isoformat()
|
||
|
|
|
||
|
|
overall = 'GREEN' if all(v == 'OK' for k, v in self.health.items() if k not in ('timestamp', 'overall')) else 'DEGRADED'
|
||
|
|
|
||
|
|
if overall != self.health['overall']:
|
||
|
|
logger.info(f"Health transitioned {self.health['overall']} -> {overall}. Details: {self.health}")
|
||
|
|
|
||
|
|
self.health['overall'] = overall
|
||
|
|
self._write_health()
|
||
|
|
self._process_survival_stack()
|
||
|
|
|
||
|
|
elapsed = time.time() - t0
|
||
|
|
time.sleep(max(0.01, self.tick_rate - elapsed))
|
||
|
|
|
||
|
|
def stop(self):
|
||
|
|
self.running = False
|
||
|
|
if self.hz_client:
|
||
|
|
self.hz_client.shutdown()
|
||
|
|
|
||
|
|
def _check_hz(self):
|
||
|
|
if self.hz_client and self.hz_client.lifecycle_service.is_running():
|
||
|
|
if self.hz_client.cluster_service.get_members():
|
||
|
|
self.health['hz'] = 'OK'
|
||
|
|
return
|
||
|
|
|
||
|
|
self.health['hz'] = 'FAIL'
|
||
|
|
if not self.hz_client or not self.hz_client.lifecycle_service.is_running():
|
||
|
|
self._init_hz()
|
||
|
|
|
||
|
|
def _check_prefect(self):
|
||
|
|
try:
|
||
|
|
req = urllib.request.Request("http://localhost:4200/api/health")
|
||
|
|
with urllib.request.urlopen(req, timeout=1) as resp:
|
||
|
|
data = json.loads(resp.read().decode('utf-8'))
|
||
|
|
self.health['prefect'] = 'OK' if data.get('status') == 'ok' else 'FAIL'
|
||
|
|
except Exception:
|
||
|
|
self.health['prefect'] = 'FAIL'
|
||
|
|
|
||
|
|
def _check_scans(self):
|
||
|
|
try:
|
||
|
|
if not SCANS_DIR.exists():
|
||
|
|
self.health['scans'] = 'FAIL (No directory)'
|
||
|
|
return
|
||
|
|
dirs = sorted(d.name for d in SCANS_DIR.iterdir() if d.is_dir() and len(d.name) == 10 and d.name.startswith('20'))
|
||
|
|
if not dirs:
|
||
|
|
self.health['scans'] = 'FAIL (No scans)'
|
||
|
|
return
|
||
|
|
latest_date = datetime.strptime(dirs[-1], '%Y-%m-%d').replace(tzinfo=timezone.utc)
|
||
|
|
age = datetime.now(timezone.utc) - latest_date
|
||
|
|
if age > timedelta(days=2):
|
||
|
|
self.health['scans'] = f"FAIL (Stale {age.days} days)"
|
||
|
|
else:
|
||
|
|
self.health['scans'] = 'OK'
|
||
|
|
except Exception as e:
|
||
|
|
self.health['scans'] = f"FAIL ({str(e)})"
|
||
|
|
|
||
|
|
def _check_logs(self):
|
||
|
|
try:
|
||
|
|
if not LOGS_DIR.exists():
|
||
|
|
self.health['logs'] = 'FAIL (No directory)'
|
||
|
|
return
|
||
|
|
log_files = sorted(LOGS_DIR.glob("paper_pnl_*.jsonl"))
|
||
|
|
if not log_files:
|
||
|
|
self.health['logs'] = 'FAIL (No logs)'
|
||
|
|
return
|
||
|
|
last_line = ""
|
||
|
|
with open(log_files[-1], 'r') as f:
|
||
|
|
for line in f:
|
||
|
|
if line.strip():
|
||
|
|
last_line = line
|
||
|
|
if not last_line:
|
||
|
|
self.health['logs'] = 'FAIL (Empty log)'
|
||
|
|
return
|
||
|
|
data = json.loads(last_line)
|
||
|
|
log_date = datetime.fromisoformat(data['logged_at'])
|
||
|
|
age = datetime.now(timezone.utc) - log_date
|
||
|
|
if age > timedelta(days=2):
|
||
|
|
self.health['logs'] = f"FAIL (Stale {age.days} days)"
|
||
|
|
else:
|
||
|
|
self.health['logs'] = 'OK'
|
||
|
|
except Exception as e:
|
||
|
|
self.health['logs'] = f"FAIL ({str(e)})"
|
||
|
|
|
||
|
|
def _check_hz_container(self):
|
||
|
|
"""Fast-tick HTTP health probe (~1ms when healthy). Triggers auto-heal after 2 failures."""
|
||
|
|
try:
|
||
|
|
req = urllib.request.Request('http://127.0.0.1:5701/hazelcast/health')
|
||
|
|
with urllib.request.urlopen(req, timeout=0.3) as resp:
|
||
|
|
data = json.loads(resp.read())
|
||
|
|
if data.get('nodeState') == 'ACTIVE':
|
||
|
|
self.health['hz_container'] = 'OK'
|
||
|
|
self._hz_container_fail_streak = 0
|
||
|
|
return
|
||
|
|
# Node up but not ACTIVE (e.g. PASSIVE, FROZEN)
|
||
|
|
state = data.get('nodeState', 'UNKNOWN')
|
||
|
|
self.health['hz_container'] = f'WARN ({state})'
|
||
|
|
self._hz_container_fail_streak += 1
|
||
|
|
except Exception:
|
||
|
|
self.health['hz_container'] = 'FAIL'
|
||
|
|
self._hz_container_fail_streak += 1
|
||
|
|
|
||
|
|
if self._hz_container_fail_streak >= self._hz_container_heal_threshold:
|
||
|
|
self._heal_hz_container()
|
||
|
|
|
||
|
|
def _heal_hz_container(self):
|
||
|
|
"""Restart the HZ container after sustained unhealthy state. Idempotent."""
|
||
|
|
now = time.time()
|
||
|
|
if now - self._hz_container_last_restart < 30: # minimum 30s between restarts (restart takes ~15s)
|
||
|
|
logger.warning("HZ heal suppressed — last restart was <30s ago")
|
||
|
|
return
|
||
|
|
logger.critical("AUTO-HEAL: restarting dolphin-hazelcast (sustained unhealthy)")
|
||
|
|
try:
|
||
|
|
subprocess.run(['docker', 'restart', 'dolphin-hazelcast'], timeout=30, check=True)
|
||
|
|
self._hz_container_fail_streak = 0
|
||
|
|
self._hz_container_last_restart = now
|
||
|
|
logger.info("AUTO-HEAL: dolphin-hazelcast restarted successfully")
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"AUTO-HEAL: docker restart failed: {e}")
|
||
|
|
|
||
|
|
def _check_prefect_container(self):
|
||
|
|
"""Fast-tick HTTP probe to Prefect API. Heals after 4 consecutive failures (~2s)."""
|
||
|
|
try:
|
||
|
|
req = urllib.request.Request('http://127.0.0.1:4200/api/health')
|
||
|
|
with urllib.request.urlopen(req, timeout=0.3) as resp:
|
||
|
|
body = resp.read().strip()
|
||
|
|
if body == b'true':
|
||
|
|
self.health['prefect_container'] = 'OK'
|
||
|
|
self._prefect_fail_streak = 0
|
||
|
|
return
|
||
|
|
self.health['prefect_container'] = f'WARN (body={body[:20]})'
|
||
|
|
self._prefect_fail_streak += 1
|
||
|
|
except Exception:
|
||
|
|
self.health['prefect_container'] = 'FAIL'
|
||
|
|
self._prefect_fail_streak += 1
|
||
|
|
|
||
|
|
if self._prefect_fail_streak >= self._prefect_heal_threshold:
|
||
|
|
self._heal_prefect_container()
|
||
|
|
|
||
|
|
def _heal_prefect_container(self):
|
||
|
|
now = time.time()
|
||
|
|
if now - self._prefect_last_restart < 60:
|
||
|
|
logger.warning("Prefect heal suppressed — last restart <60s ago")
|
||
|
|
return
|
||
|
|
logger.critical("AUTO-HEAL: restarting dolphin-prefect (sustained unhealthy)")
|
||
|
|
try:
|
||
|
|
subprocess.run(['docker', 'restart', 'dolphin-prefect'], timeout=45, check=True)
|
||
|
|
self._prefect_fail_streak = 0
|
||
|
|
self._prefect_last_restart = now
|
||
|
|
logger.info("AUTO-HEAL: dolphin-prefect restarted successfully")
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"AUTO-HEAL: prefect restart failed: {e}")
|
||
|
|
|
||
|
|
def _check_mc_container(self):
|
||
|
|
"""Slow-tick HTTP probe to Hazelcast MC (non-critical). Heals after 5 slow-tick failures."""
|
||
|
|
try:
|
||
|
|
req = urllib.request.Request('http://127.0.0.1:8080/')
|
||
|
|
with urllib.request.urlopen(req, timeout=1.0) as resp:
|
||
|
|
if resp.status == 200:
|
||
|
|
self.health['mc_container'] = 'OK'
|
||
|
|
self._mc_fail_streak = 0
|
||
|
|
return
|
||
|
|
self.health['mc_container'] = f'WARN (status={resp.status})'
|
||
|
|
self._mc_fail_streak += 1
|
||
|
|
except Exception:
|
||
|
|
self.health['mc_container'] = 'FAIL'
|
||
|
|
self._mc_fail_streak += 1
|
||
|
|
|
||
|
|
if self._mc_fail_streak >= self._mc_heal_threshold:
|
||
|
|
self._heal_mc_container()
|
||
|
|
|
||
|
|
def _heal_mc_container(self):
|
||
|
|
now = time.time()
|
||
|
|
if now - self._mc_last_restart < 120:
|
||
|
|
logger.warning("MC heal suppressed — last restart <120s ago")
|
||
|
|
return
|
||
|
|
logger.critical("AUTO-HEAL: restarting dolphin-hazelcast-mc (sustained unhealthy)")
|
||
|
|
try:
|
||
|
|
subprocess.run(['docker', 'restart', 'dolphin-hazelcast-mc'], timeout=60, check=True)
|
||
|
|
self._mc_fail_streak = 0
|
||
|
|
self._mc_last_restart = now
|
||
|
|
logger.info("AUTO-HEAL: dolphin-hazelcast-mc restarted successfully")
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"AUTO-HEAL: mc restart failed: {e}")
|
||
|
|
|
||
|
|
def _process_survival_stack(self):
|
||
|
|
if not self.features_map:
|
||
|
|
return
|
||
|
|
|
||
|
|
try:
|
||
|
|
# Gather state safely
|
||
|
|
hz_nodes = len(self.hz_client.cluster_service.get_members()) if self.hz_client else 0
|
||
|
|
heartbeat_age = 0.0 # Handled by the watchdog directly being alive
|
||
|
|
|
||
|
|
mc_raw = self.features_map.get('mc_forewarner_latest')
|
||
|
|
mc_state = json.loads(mc_raw) if mc_raw else {}
|
||
|
|
mc_status = mc_state.get('status', 'ORANGE') # default ORANGE if missing
|
||
|
|
mc_ts = mc_state.get('timestamp')
|
||
|
|
|
||
|
|
if mc_ts:
|
||
|
|
mc_age = (datetime.now(timezone.utc) - datetime.fromisoformat(mc_ts)).total_seconds() / 3600.0
|
||
|
|
else:
|
||
|
|
mc_age = 999.0
|
||
|
|
|
||
|
|
ob_raw = self.features_map.get('asset_BTCUSDT_ob')
|
||
|
|
ob_state = json.loads(ob_raw) if ob_raw else {}
|
||
|
|
|
||
|
|
ob_depth = ob_state.get('depth_quality', 0.5)
|
||
|
|
ob_fill = ob_state.get('fill_prob', 0.5)
|
||
|
|
ob_stale = ob_state.get('stale', True)
|
||
|
|
|
||
|
|
dvol_spike = False # Extracted from ExF, currently hardcoded neutral
|
||
|
|
t_since_spike = 999.0
|
||
|
|
|
||
|
|
state_raw = self.state_map.get('latest')
|
||
|
|
state = json.loads(state_raw) if state_raw else {}
|
||
|
|
drawdown = state.get('drawdown', 0.0)
|
||
|
|
|
||
|
|
rm, breakdown = self.survival_stack.compute_rm(
|
||
|
|
hz_nodes=hz_nodes, heartbeat_age_s=heartbeat_age,
|
||
|
|
mc_status=mc_status, mc_staleness_hours=mc_age,
|
||
|
|
ob_depth_quality=ob_depth, ob_fill_prob=ob_fill, ob_stale=ob_stale,
|
||
|
|
dvol_spike=dvol_spike, t_since_spike_min=t_since_spike,
|
||
|
|
drawdown=drawdown
|
||
|
|
)
|
||
|
|
|
||
|
|
posture = self.survival_stack.update_posture(rm)
|
||
|
|
self.survival_stack.write_to_hz(rm, breakdown, self.hz_client)
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"Failed to process Survival Stack: {e}")
|
||
|
|
|
||
|
|
def _write_health(self):
|
||
|
|
if self.health_map:
|
||
|
|
try:
|
||
|
|
self.health_map.put('latest', json.dumps(self.health))
|
||
|
|
except Exception as e:
|
||
|
|
logger.debug(f"Failed to write health to HZ: {e}")
|
||
|
|
|
||
|
|
if __name__ == '__main__':
    # No CLI options yet; argparse still provides --help and rejects stray args.
    argparse.ArgumentParser().parse_args()

    watchdog = SystemWatchdog()
    try:
        watchdog.start()
    except KeyboardInterrupt:
        # Ctrl-C: shut down the HZ client cleanly before exiting.
        watchdog.stop()
        logger.info("Watchdog stopped.")
|