# File: DOLPHIN/prod/system_watchdog_service.py
"""DOLPHIN MIG3 — High-Frequency System Watchdog Service
Runs a continuous loop (< 1s tick) to verify system health.
Writes DOLPHIN_SYSTEM_HEALTH state to Hazelcast strongly consistent structure.
"""
import os
import time
import json
import logging
import argparse
import subprocess
import urllib.request
from pathlib import Path
from datetime import datetime, timezone, timedelta
import sys
import hazelcast
HCM_DIR = Path(__file__).parent.parent
sys.path.insert(0, str(HCM_DIR / 'nautilus_dolphin'))
from nautilus_dolphin.nautilus.survival_stack import SurvivalStack
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(name)s - %(message)s')
logger = logging.getLogger("Watchdog")
HCM_DIR = Path(__file__).parent.parent
# Use platform-independent paths
sys.path.insert(0, str(HCM_DIR))
sys.path.insert(0, str(HCM_DIR / 'prod'))
from dolphin_paths import get_eigenvalues_path, get_project_root
SCANS_DIR = get_eigenvalues_path()
LOGS_DIR = get_project_root() / 'paper_logs'
HZ_HOST = "localhost:5701"
HZ_CLUSTER = "dolphin"
class SystemWatchdog:
    """Continuously probes DOLPHIN infrastructure and publishes health to HZ."""

    def __init__(self, tick_rate=0.5):
        """Initialize health state, auto-heal counters, and the HZ connection.

        tick_rate: seconds between fast-loop iterations (default 0.5s).
        """
        self.tick_rate = tick_rate
        self.slow_tick_rate = 10.0
        self.last_slow_tick = 0
        self.running = False
        # Per-subsystem status; everything starts pessimistic until probed.
        self.health = dict(
            hz='FAIL',
            hz_container='UNKNOWN',
            prefect='FAIL',
            prefect_container='UNKNOWN',
            mc_container='UNKNOWN',
            scans='UNKNOWN',
            logs='UNKNOWN',
            timestamp='',
            overall='DEGRADED',
        )
        # HZ auto-heal: fast-tick HTTP probe, 2 failures (~1s) → restart (~19s worst-case)
        self._hz_container_fail_streak = 0
        self._hz_container_heal_threshold = 2
        self._hz_container_last_restart = 0.0
        # Prefect auto-heal: fast-tick HTTP probe, 4 failures (~2s) → restart (~35s worst-case)
        self._prefect_fail_streak = 0
        self._prefect_heal_threshold = 4
        self._prefect_last_restart = 0.0
        # MC auto-heal: slow-tick only (non-critical), 5 failures (50s) → restart
        self._mc_fail_streak = 0
        self._mc_heal_threshold = 5
        self._mc_last_restart = 0.0
        # Opt-in fast recovery mode via environment flag.
        flag = os.environ.get('DOLPHIN_FAST_RECOVERY', '').lower()
        self.survival_stack = SurvivalStack(fast_recovery=flag in ('1', 'true', 'yes'))
        self._init_hz()
def _init_hz(self):
try:
self.hz_client = hazelcast.HazelcastClient(
cluster_name=HZ_CLUSTER,
cluster_members=[HZ_HOST],
connection_timeout=2.0
)
# Use IMap for strong consistency, no near-cache
self.health_map = self.hz_client.get_map('DOLPHIN_SYSTEM_HEALTH').blocking()
self.features_map = self.hz_client.get_map('DOLPHIN_FEATURES').blocking()
self.state_map = self.hz_client.get_map('DOLPHIN_STATE_BLUE').blocking()
except Exception as e:
logger.error(f"HZ Init failed: {e}")
self.hz_client = None
self.health_map = None
self.features_map = None
self.state_map = None
    def start(self):
        """Run the watchdog loop until stop() is called.

        Fast probes run every tick (~tick_rate s); slow probes run every
        slow_tick_rate seconds. Each tick ends by publishing the health
        snapshot to Hazelcast and feeding the survival stack.
        """
        self.running = True
        logger.info(f"Starting System Watchdog (tick={self.tick_rate}s)")
        while self.running:
            t0 = time.time()
            # Fast-tick probes (cheap when healthy).
            self._check_hz()
            self._check_hz_container() # fast-tick: ~1ms HTTP probe
            self._check_prefect_container() # fast-tick: ~1ms HTTP probe
            # Slow-tick probes: filesystem freshness and non-critical services.
            if t0 - self.last_slow_tick >= self.slow_tick_rate:
                self._check_prefect()
                self._check_scans()
                self._check_logs()
                self._check_mc_container() # slow-tick: MC is non-critical
                self.last_slow_tick = t0
            self.health['timestamp'] = datetime.now(timezone.utc).isoformat()
            # GREEN only when every component (ignoring metadata keys) is exactly 'OK'.
            overall = 'GREEN' if all(v == 'OK' for k, v in self.health.items() if k not in ('timestamp', 'overall')) else 'DEGRADED'
            if overall != self.health['overall']:
                logger.info(f"Health transitioned {self.health['overall']} -> {overall}. Details: {self.health}")
            self.health['overall'] = overall
            self._write_health()
            self._process_survival_stack()
            # Sleep out the remainder of the tick; floor of 10ms prevents a
            # long tick from turning this into a busy loop.
            elapsed = time.time() - t0
            time.sleep(max(0.01, self.tick_rate - elapsed))
def stop(self):
self.running = False
if self.hz_client:
self.hz_client.shutdown()
def _check_hz(self):
if self.hz_client and self.hz_client.lifecycle_service.is_running():
if self.hz_client.cluster_service.get_members():
self.health['hz'] = 'OK'
return
self.health['hz'] = 'FAIL'
if not self.hz_client or not self.hz_client.lifecycle_service.is_running():
self._init_hz()
def _check_prefect(self):
try:
req = urllib.request.Request("http://localhost:4200/api/health")
with urllib.request.urlopen(req, timeout=1) as resp:
data = json.loads(resp.read().decode('utf-8'))
self.health['prefect'] = 'OK' if data.get('status') == 'ok' else 'FAIL'
except Exception:
self.health['prefect'] = 'FAIL'
def _check_scans(self):
try:
if not SCANS_DIR.exists():
self.health['scans'] = 'FAIL (No directory)'
return
dirs = sorted(d.name for d in SCANS_DIR.iterdir() if d.is_dir() and len(d.name) == 10 and d.name.startswith('20'))
if not dirs:
self.health['scans'] = 'FAIL (No scans)'
return
latest_date = datetime.strptime(dirs[-1], '%Y-%m-%d').replace(tzinfo=timezone.utc)
age = datetime.now(timezone.utc) - latest_date
if age > timedelta(days=2):
self.health['scans'] = f"FAIL (Stale {age.days} days)"
else:
self.health['scans'] = 'OK'
except Exception as e:
self.health['scans'] = f"FAIL ({str(e)})"
def _check_logs(self):
try:
if not LOGS_DIR.exists():
self.health['logs'] = 'FAIL (No directory)'
return
log_files = sorted(LOGS_DIR.glob("paper_pnl_*.jsonl"))
if not log_files:
self.health['logs'] = 'FAIL (No logs)'
return
last_line = ""
with open(log_files[-1], 'r') as f:
for line in f:
if line.strip():
last_line = line
if not last_line:
self.health['logs'] = 'FAIL (Empty log)'
return
data = json.loads(last_line)
log_date = datetime.fromisoformat(data['logged_at'])
age = datetime.now(timezone.utc) - log_date
if age > timedelta(days=2):
self.health['logs'] = f"FAIL (Stale {age.days} days)"
else:
self.health['logs'] = 'OK'
except Exception as e:
self.health['logs'] = f"FAIL ({str(e)})"
def _check_hz_container(self):
"""Fast-tick HTTP health probe (~1ms when healthy). Triggers auto-heal after 2 failures."""
try:
req = urllib.request.Request('http://127.0.0.1:5701/hazelcast/health')
with urllib.request.urlopen(req, timeout=0.3) as resp:
data = json.loads(resp.read())
if data.get('nodeState') == 'ACTIVE':
self.health['hz_container'] = 'OK'
self._hz_container_fail_streak = 0
return
# Node up but not ACTIVE (e.g. PASSIVE, FROZEN)
state = data.get('nodeState', 'UNKNOWN')
self.health['hz_container'] = f'WARN ({state})'
self._hz_container_fail_streak += 1
except Exception:
self.health['hz_container'] = 'FAIL'
self._hz_container_fail_streak += 1
if self._hz_container_fail_streak >= self._hz_container_heal_threshold:
self._heal_hz_container()
def _heal_hz_container(self):
"""Restart the HZ container after sustained unhealthy state. Idempotent."""
now = time.time()
if now - self._hz_container_last_restart < 30: # minimum 30s between restarts (restart takes ~15s)
logger.warning("HZ heal suppressed — last restart was <30s ago")
return
logger.critical("AUTO-HEAL: restarting dolphin-hazelcast (sustained unhealthy)")
try:
subprocess.run(['docker', 'restart', 'dolphin-hazelcast'], timeout=30, check=True)
self._hz_container_fail_streak = 0
self._hz_container_last_restart = now
logger.info("AUTO-HEAL: dolphin-hazelcast restarted successfully")
except Exception as e:
logger.error(f"AUTO-HEAL: docker restart failed: {e}")
def _check_prefect_container(self):
"""Fast-tick HTTP probe to Prefect API. Heals after 4 consecutive failures (~2s)."""
try:
req = urllib.request.Request('http://127.0.0.1:4200/api/health')
with urllib.request.urlopen(req, timeout=0.3) as resp:
body = resp.read().strip()
if body == b'true':
self.health['prefect_container'] = 'OK'
self._prefect_fail_streak = 0
return
self.health['prefect_container'] = f'WARN (body={body[:20]})'
self._prefect_fail_streak += 1
except Exception:
self.health['prefect_container'] = 'FAIL'
self._prefect_fail_streak += 1
if self._prefect_fail_streak >= self._prefect_heal_threshold:
self._heal_prefect_container()
def _heal_prefect_container(self):
now = time.time()
if now - self._prefect_last_restart < 60:
logger.warning("Prefect heal suppressed — last restart <60s ago")
return
logger.critical("AUTO-HEAL: restarting dolphin-prefect (sustained unhealthy)")
try:
subprocess.run(['docker', 'restart', 'dolphin-prefect'], timeout=45, check=True)
self._prefect_fail_streak = 0
self._prefect_last_restart = now
logger.info("AUTO-HEAL: dolphin-prefect restarted successfully")
except Exception as e:
logger.error(f"AUTO-HEAL: prefect restart failed: {e}")
def _check_mc_container(self):
"""Slow-tick HTTP probe to Hazelcast MC (non-critical). Heals after 5 slow-tick failures."""
try:
req = urllib.request.Request('http://127.0.0.1:8080/')
with urllib.request.urlopen(req, timeout=1.0) as resp:
if resp.status == 200:
self.health['mc_container'] = 'OK'
self._mc_fail_streak = 0
return
self.health['mc_container'] = f'WARN (status={resp.status})'
self._mc_fail_streak += 1
except Exception:
self.health['mc_container'] = 'FAIL'
self._mc_fail_streak += 1
if self._mc_fail_streak >= self._mc_heal_threshold:
self._heal_mc_container()
def _heal_mc_container(self):
now = time.time()
if now - self._mc_last_restart < 120:
logger.warning("MC heal suppressed — last restart <120s ago")
return
logger.critical("AUTO-HEAL: restarting dolphin-hazelcast-mc (sustained unhealthy)")
try:
subprocess.run(['docker', 'restart', 'dolphin-hazelcast-mc'], timeout=60, check=True)
self._mc_fail_streak = 0
self._mc_last_restart = now
logger.info("AUTO-HEAL: dolphin-hazelcast-mc restarted successfully")
except Exception as e:
logger.error(f"AUTO-HEAL: mc restart failed: {e}")
def _process_survival_stack(self):
if not self.features_map:
return
try:
# Gather state safely
hz_nodes = len(self.hz_client.cluster_service.get_members()) if self.hz_client else 0
heartbeat_age = 0.0 # Handled by the watchdog directly being alive
mc_raw = self.features_map.get('mc_forewarner_latest')
mc_state = json.loads(mc_raw) if mc_raw else {}
mc_status = mc_state.get('status', 'ORANGE') # default ORANGE if missing
mc_ts = mc_state.get('timestamp')
if mc_ts:
mc_age = (datetime.now(timezone.utc) - datetime.fromisoformat(mc_ts)).total_seconds() / 3600.0
else:
mc_age = 999.0
ob_raw = self.features_map.get('asset_BTCUSDT_ob')
ob_state = json.loads(ob_raw) if ob_raw else {}
ob_depth = ob_state.get('depth_quality', 0.5)
ob_fill = ob_state.get('fill_prob', 0.5)
ob_stale = ob_state.get('stale', True)
dvol_spike = False # Extracted from ExF, currently hardcoded neutral
t_since_spike = 999.0
state_raw = self.state_map.get('latest')
state = json.loads(state_raw) if state_raw else {}
drawdown = state.get('drawdown', 0.0)
rm, breakdown = self.survival_stack.compute_rm(
hz_nodes=hz_nodes, heartbeat_age_s=heartbeat_age,
mc_status=mc_status, mc_staleness_hours=mc_age,
ob_depth_quality=ob_depth, ob_fill_prob=ob_fill, ob_stale=ob_stale,
dvol_spike=dvol_spike, t_since_spike_min=t_since_spike,
drawdown=drawdown
)
posture = self.survival_stack.update_posture(rm)
self.survival_stack.write_to_hz(rm, breakdown, self.hz_client)
except Exception as e:
logger.error(f"Failed to process Survival Stack: {e}")
def _write_health(self):
if self.health_map:
try:
self.health_map.put('latest', json.dumps(self.health))
except Exception as e:
logger.debug(f"Failed to write health to HZ: {e}")
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="DOLPHIN high-frequency system watchdog")
    # Backward-compatible: default matches the previous hard-coded tick rate.
    parser.add_argument('--tick', type=float, default=0.5,
                        help="fast-loop tick interval in seconds (default: 0.5)")
    args = parser.parse_args()
    watchdog = SystemWatchdog(tick_rate=args.tick)
    try:
        watchdog.start()
    except KeyboardInterrupt:
        pass
    finally:
        # Release the HZ client on ANY exit path, not just Ctrl-C.
        watchdog.stop()
        logger.info("Watchdog stopped.")