initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
359
prod/system_watchdog_service.py
Executable file
359
prod/system_watchdog_service.py
Executable file
@@ -0,0 +1,359 @@
|
||||
"""DOLPHIN MIG3 — High-Frequency System Watchdog Service
|
||||
Runs a continuous loop (< 1s tick) to verify system health.
|
||||
Writes DOLPHIN_SYSTEM_HEALTH state to Hazelcast strongly consistent structure.
|
||||
"""
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import logging
|
||||
import argparse
|
||||
import subprocess
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone, timedelta
|
||||
import sys
|
||||
|
||||
import hazelcast
|
||||
|
||||
# Repo root: two levels up from this file (prod/ -> repo). Used both for
# sys.path bootstrapping and for locating data directories.
# NOTE: the original assigned HCM_DIR twice with the identical expression;
# the duplicate has been removed.
HCM_DIR = Path(__file__).parent.parent
sys.path.insert(0, str(HCM_DIR / 'nautilus_dolphin'))
from nautilus_dolphin.nautilus.survival_stack import SurvivalStack

logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(name)s - %(message)s')
logger = logging.getLogger("Watchdog")

# Use platform-independent paths
sys.path.insert(0, str(HCM_DIR))
sys.path.insert(0, str(HCM_DIR / 'prod'))
from dolphin_paths import get_eigenvalues_path, get_project_root

SCANS_DIR = get_eigenvalues_path()               # eigenvalue scan output root
LOGS_DIR = get_project_root() / 'paper_logs'     # paper-trading PnL logs

# Hazelcast cluster endpoint for the watchdog's client connection.
HZ_HOST = "localhost:5701"
HZ_CLUSTER = "dolphin"
|
||||
|
||||
class SystemWatchdog:
    """Continuous health monitor for the DOLPHIN runtime.

    Fast tick (default 0.5s): verifies the Hazelcast client connection and
    HTTP-probes the Hazelcast and Prefect containers, auto-restarting them
    via ``docker restart`` after a sustained failure streak.  Slow tick
    (every 10s): Prefect API check, scan/log freshness, and the
    non-critical Management Center container.

    Aggregated per-subsystem status is published to the
    ``DOLPHIN_SYSTEM_HEALTH`` IMap every fast tick, and the SurvivalStack
    risk model is refreshed from live feature/state maps.
    """

    def __init__(self, tick_rate=0.5):
        self.tick_rate = tick_rate      # fast-tick period, seconds
        self.slow_tick_rate = 10.0      # slow-tick period, seconds
        self.last_slow_tick = 0
        self.running = False

        # Per-subsystem status strings. 'OK' on every key except the meta
        # keys 'timestamp'/'overall' yields overall GREEN (see start()).
        self.health = {
            'hz': 'FAIL',
            'hz_container': 'UNKNOWN',
            'prefect': 'FAIL',
            'prefect_container': 'UNKNOWN',
            'mc_container': 'UNKNOWN',
            'scans': 'UNKNOWN',
            'logs': 'UNKNOWN',
            'timestamp': '',
            'overall': 'DEGRADED'
        }

        # HZ auto-heal: fast-tick HTTP probe, 2 failures (~1s) → restart (~19s worst-case)
        self._hz_container_fail_streak = 0
        self._hz_container_heal_threshold = 2
        self._hz_container_last_restart = 0.0

        # Prefect auto-heal: fast-tick HTTP probe, 4 failures (~2s) → restart (~35s worst-case)
        self._prefect_fail_streak = 0
        self._prefect_heal_threshold = 4
        self._prefect_last_restart = 0.0

        # MC auto-heal: slow-tick only (non-critical), 5 failures (50s) → restart
        self._mc_fail_streak = 0
        self._mc_heal_threshold = 5
        self._mc_last_restart = 0.0

        # DOLPHIN_FAST_RECOVERY env flag opts the survival stack into its
        # fast-recovery mode (semantics owned by SurvivalStack).
        _fast_recovery = os.environ.get('DOLPHIN_FAST_RECOVERY', '').lower() in ('1', 'true', 'yes')
        self.survival_stack = SurvivalStack(fast_recovery=_fast_recovery)

        self._init_hz()

    def _init_hz(self):
        """(Re)connect the Hazelcast client and bind the IMaps used here.

        On any failure all handles are set to None; _check_hz retries the
        connection on later ticks.
        """
        try:
            self.hz_client = hazelcast.HazelcastClient(
                cluster_name=HZ_CLUSTER,
                cluster_members=[HZ_HOST],
                connection_timeout=2.0
            )
            # Use IMap for strong consistency, no near-cache
            self.health_map = self.hz_client.get_map('DOLPHIN_SYSTEM_HEALTH').blocking()
            self.features_map = self.hz_client.get_map('DOLPHIN_FEATURES').blocking()
            self.state_map = self.hz_client.get_map('DOLPHIN_STATE_BLUE').blocking()
        except Exception as e:
            logger.error(f"HZ Init failed: {e}")
            self.hz_client = None
            self.health_map = None
            self.features_map = None
            self.state_map = None

    def start(self):
        """Run the watchdog loop until stop() flips self.running."""
        self.running = True
        logger.info(f"Starting System Watchdog (tick={self.tick_rate}s)")

        while self.running:
            t0 = time.time()

            self._check_hz()
            self._check_hz_container()       # fast-tick: ~1ms HTTP probe
            self._check_prefect_container()  # fast-tick: ~1ms HTTP probe

            if t0 - self.last_slow_tick >= self.slow_tick_rate:
                self._check_prefect()
                self._check_scans()
                self._check_logs()
                self._check_mc_container()   # slow-tick: MC is non-critical
                self.last_slow_tick = t0

            self.health['timestamp'] = datetime.now(timezone.utc).isoformat()

            # GREEN only when every non-meta key reads exactly 'OK';
            # WARN/UNKNOWN/FAIL all count as DEGRADED.
            overall = 'GREEN' if all(v == 'OK' for k, v in self.health.items() if k not in ('timestamp', 'overall')) else 'DEGRADED'

            if overall != self.health['overall']:
                logger.info(f"Health transitioned {self.health['overall']} -> {overall}. Details: {self.health}")

            self.health['overall'] = overall
            self._write_health()
            self._process_survival_stack()

            # Sleep out the remainder of the tick; 0.01s floor keeps the
            # loop yielding even when a tick overruns its budget.
            elapsed = time.time() - t0
            time.sleep(max(0.01, self.tick_rate - elapsed))

    def stop(self):
        """Stop the loop and shut down the Hazelcast client."""
        self.running = False
        if self.hz_client:
            self.hz_client.shutdown()

    def _check_hz(self):
        """Verify the client-side Hazelcast connection; re-init if dead."""
        if self.hz_client and self.hz_client.lifecycle_service.is_running():
            if self.hz_client.cluster_service.get_members():
                self.health['hz'] = 'OK'
                return

        self.health['hz'] = 'FAIL'
        # Only rebuild the client when it is missing or fully stopped;
        # a running client with zero members is left to recover on its own.
        if not self.hz_client or not self.hz_client.lifecycle_service.is_running():
            self._init_hz()

    def _check_prefect(self):
        """Slow-tick probe of the Prefect API /api/health endpoint."""
        try:
            req = urllib.request.Request("http://localhost:4200/api/health")
            with urllib.request.urlopen(req, timeout=1) as resp:
                data = json.loads(resp.read().decode('utf-8'))
                self.health['prefect'] = 'OK' if data.get('status') == 'ok' else 'FAIL'
        except Exception:
            self.health['prefect'] = 'FAIL'

    def _check_scans(self):
        """Check that the newest YYYY-MM-DD scan directory is < 2 days old."""
        try:
            if not SCANS_DIR.exists():
                self.health['scans'] = 'FAIL (No directory)'
                return
            # Date-named dirs sort lexicographically == chronologically.
            dirs = sorted(d.name for d in SCANS_DIR.iterdir() if d.is_dir() and len(d.name) == 10 and d.name.startswith('20'))
            if not dirs:
                self.health['scans'] = 'FAIL (No scans)'
                return
            latest_date = datetime.strptime(dirs[-1], '%Y-%m-%d').replace(tzinfo=timezone.utc)
            age = datetime.now(timezone.utc) - latest_date
            if age > timedelta(days=2):
                self.health['scans'] = f"FAIL (Stale {age.days} days)"
            else:
                self.health['scans'] = 'OK'
        except Exception as e:
            self.health['scans'] = f"FAIL ({str(e)})"

    def _check_logs(self):
        """Check that the last entry of the newest paper-PnL log is < 2 days old."""
        try:
            if not LOGS_DIR.exists():
                self.health['logs'] = 'FAIL (No directory)'
                return
            log_files = sorted(LOGS_DIR.glob("paper_pnl_*.jsonl"))
            if not log_files:
                self.health['logs'] = 'FAIL (No logs)'
                return
            last_line = ""
            with open(log_files[-1], 'r') as f:
                for line in f:
                    if line.strip():
                        last_line = line
            if not last_line:
                self.health['logs'] = 'FAIL (Empty log)'
                return
            data = json.loads(last_line)
            log_date = datetime.fromisoformat(data['logged_at'])
            # BUGFIX: a naive 'logged_at' previously made the aware-naive
            # subtraction below raise TypeError, mis-reporting a generic
            # FAIL instead of a staleness verdict. Assume naive == UTC.
            if log_date.tzinfo is None:
                log_date = log_date.replace(tzinfo=timezone.utc)
            age = datetime.now(timezone.utc) - log_date
            if age > timedelta(days=2):
                self.health['logs'] = f"FAIL (Stale {age.days} days)"
            else:
                self.health['logs'] = 'OK'
        except Exception as e:
            self.health['logs'] = f"FAIL ({str(e)})"

    def _check_hz_container(self):
        """Fast-tick HTTP health probe (~1ms when healthy). Triggers auto-heal after 2 failures."""
        try:
            req = urllib.request.Request('http://127.0.0.1:5701/hazelcast/health')
            with urllib.request.urlopen(req, timeout=0.3) as resp:
                data = json.loads(resp.read())
                if data.get('nodeState') == 'ACTIVE':
                    self.health['hz_container'] = 'OK'
                    self._hz_container_fail_streak = 0
                    return
                # Node up but not ACTIVE (e.g. PASSIVE, FROZEN)
                state = data.get('nodeState', 'UNKNOWN')
                self.health['hz_container'] = f'WARN ({state})'
                self._hz_container_fail_streak += 1
        except Exception:
            self.health['hz_container'] = 'FAIL'
            self._hz_container_fail_streak += 1

        if self._hz_container_fail_streak >= self._hz_container_heal_threshold:
            self._heal_hz_container()

    def _heal_hz_container(self):
        """Restart the HZ container after sustained unhealthy state. Idempotent."""
        now = time.time()
        if now - self._hz_container_last_restart < 30:  # minimum 30s between restarts (restart takes ~15s)
            logger.warning("HZ heal suppressed — last restart was <30s ago")
            return
        logger.critical("AUTO-HEAL: restarting dolphin-hazelcast (sustained unhealthy)")
        try:
            subprocess.run(['docker', 'restart', 'dolphin-hazelcast'], timeout=30, check=True)
            self._hz_container_fail_streak = 0
            self._hz_container_last_restart = now
            logger.info("AUTO-HEAL: dolphin-hazelcast restarted successfully")
        except Exception as e:
            # Failed restart leaves the streak intact so the next tick retries
            # (subject to the 30s suppression window).
            logger.error(f"AUTO-HEAL: docker restart failed: {e}")

    def _check_prefect_container(self):
        """Fast-tick HTTP probe to Prefect API. Heals after 4 consecutive failures (~2s)."""
        try:
            req = urllib.request.Request('http://127.0.0.1:4200/api/health')
            with urllib.request.urlopen(req, timeout=0.3) as resp:
                body = resp.read().strip()
                # Prefect's health endpoint returns a bare JSON literal `true`.
                if body == b'true':
                    self.health['prefect_container'] = 'OK'
                    self._prefect_fail_streak = 0
                    return
                self.health['prefect_container'] = f'WARN (body={body[:20]})'
                self._prefect_fail_streak += 1
        except Exception:
            self.health['prefect_container'] = 'FAIL'
            self._prefect_fail_streak += 1

        if self._prefect_fail_streak >= self._prefect_heal_threshold:
            self._heal_prefect_container()

    def _heal_prefect_container(self):
        """Restart the Prefect container; suppressed within 60s of the last restart."""
        now = time.time()
        if now - self._prefect_last_restart < 60:
            logger.warning("Prefect heal suppressed — last restart <60s ago")
            return
        logger.critical("AUTO-HEAL: restarting dolphin-prefect (sustained unhealthy)")
        try:
            subprocess.run(['docker', 'restart', 'dolphin-prefect'], timeout=45, check=True)
            self._prefect_fail_streak = 0
            self._prefect_last_restart = now
            logger.info("AUTO-HEAL: dolphin-prefect restarted successfully")
        except Exception as e:
            logger.error(f"AUTO-HEAL: prefect restart failed: {e}")

    def _check_mc_container(self):
        """Slow-tick HTTP probe to Hazelcast MC (non-critical). Heals after 5 slow-tick failures."""
        try:
            req = urllib.request.Request('http://127.0.0.1:8080/')
            with urllib.request.urlopen(req, timeout=1.0) as resp:
                if resp.status == 200:
                    self.health['mc_container'] = 'OK'
                    self._mc_fail_streak = 0
                    return
                self.health['mc_container'] = f'WARN (status={resp.status})'
                self._mc_fail_streak += 1
        except Exception:
            self.health['mc_container'] = 'FAIL'
            self._mc_fail_streak += 1

        if self._mc_fail_streak >= self._mc_heal_threshold:
            self._heal_mc_container()

    def _heal_mc_container(self):
        """Restart the MC container; suppressed within 120s of the last restart."""
        now = time.time()
        if now - self._mc_last_restart < 120:
            logger.warning("MC heal suppressed — last restart <120s ago")
            return
        logger.critical("AUTO-HEAL: restarting dolphin-hazelcast-mc (sustained unhealthy)")
        try:
            subprocess.run(['docker', 'restart', 'dolphin-hazelcast-mc'], timeout=60, check=True)
            self._mc_fail_streak = 0
            self._mc_last_restart = now
            logger.info("AUTO-HEAL: dolphin-hazelcast-mc restarted successfully")
        except Exception as e:
            logger.error(f"AUTO-HEAL: mc restart failed: {e}")

    def _process_survival_stack(self):
        """Feed live cluster/MC/orderbook/state inputs into SurvivalStack.

        Best-effort: any failure is logged and skipped so the watchdog
        loop is never blocked by the risk model.
        """
        if not self.features_map:
            return

        try:
            # Gather state safely
            hz_nodes = len(self.hz_client.cluster_service.get_members()) if self.hz_client else 0
            heartbeat_age = 0.0  # Handled by the watchdog directly being alive

            mc_raw = self.features_map.get('mc_forewarner_latest')
            mc_state = json.loads(mc_raw) if mc_raw else {}
            mc_status = mc_state.get('status', 'ORANGE')  # default ORANGE if missing
            mc_ts = mc_state.get('timestamp')

            if mc_ts:
                mc_age = (datetime.now(timezone.utc) - datetime.fromisoformat(mc_ts)).total_seconds() / 3600.0
            else:
                mc_age = 999.0  # sentinel: treat missing timestamp as very stale

            ob_raw = self.features_map.get('asset_BTCUSDT_ob')
            ob_state = json.loads(ob_raw) if ob_raw else {}

            ob_depth = ob_state.get('depth_quality', 0.5)
            ob_fill = ob_state.get('fill_prob', 0.5)
            ob_stale = ob_state.get('stale', True)

            dvol_spike = False  # Extracted from ExF, currently hardcoded neutral
            t_since_spike = 999.0

            state_raw = self.state_map.get('latest')
            state = json.loads(state_raw) if state_raw else {}
            drawdown = state.get('drawdown', 0.0)

            rm, breakdown = self.survival_stack.compute_rm(
                hz_nodes=hz_nodes, heartbeat_age_s=heartbeat_age,
                mc_status=mc_status, mc_staleness_hours=mc_age,
                ob_depth_quality=ob_depth, ob_fill_prob=ob_fill, ob_stale=ob_stale,
                dvol_spike=dvol_spike, t_since_spike_min=t_since_spike,
                drawdown=drawdown
            )

            # update_posture is called for its (presumed) internal state
            # update; the original bound its return to an unused variable.
            self.survival_stack.update_posture(rm)
            self.survival_stack.write_to_hz(rm, breakdown, self.hz_client)

        except Exception as e:
            logger.error(f"Failed to process Survival Stack: {e}")

    def _write_health(self):
        """Publish the health dict to the DOLPHIN_SYSTEM_HEALTH IMap (best-effort)."""
        if self.health_map:
            try:
                self.health_map.put('latest', json.dumps(self.health))
            except Exception as e:
                logger.debug(f"Failed to write health to HZ: {e}")
|
||||
|
||||
if __name__ == '__main__':
    # No CLI options yet; argparse still provides --help and rejects stray args.
    parser = argparse.ArgumentParser(description="DOLPHIN high-frequency system watchdog")
    parser.parse_args()
    watchdog = SystemWatchdog()
    try:
        watchdog.start()
    except KeyboardInterrupt:
        logger.info("Watchdog stopped.")
    finally:
        # BUGFIX: stop() previously ran only on KeyboardInterrupt, leaking
        # the Hazelcast client if start() died with any other exception.
        watchdog.stop()
|
||||
Reference in New Issue
Block a user