360 lines
14 KiB
Python
360 lines
14 KiB
Python
|
|
"""DOLPHIN MIG3 — High-Frequency System Watchdog Service
|
||
|
|
Runs a continuous loop (< 1s tick) to verify system health.
|
||
|
|
Writes DOLPHIN_SYSTEM_HEALTH state to Hazelcast strongly consistent structure.
|
||
|
|
"""
|
||
|
|
import os
|
||
|
|
import time
|
||
|
|
import json
|
||
|
|
import logging
|
||
|
|
import argparse
|
||
|
|
import subprocess
|
||
|
|
import urllib.request
|
||
|
|
from pathlib import Path
|
||
|
|
from datetime import datetime, timezone, timedelta
|
||
|
|
import sys
|
||
|
|
|
||
|
|
import hazelcast
|
||
|
|
|
||
|
|
# --- Path bootstrap --------------------------------------------------------
# This file lives one level below the HCM root: <HCM_DIR>/<subdir>/this_file.
# (Fix: HCM_DIR was previously assigned twice; the duplicate is removed.)
HCM_DIR = Path(__file__).parent.parent

sys.path.insert(0, str(HCM_DIR / 'nautilus_dolphin'))
from nautilus_dolphin.nautilus.survival_stack import SurvivalStack

logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(name)s - %(message)s')
logger = logging.getLogger("Watchdog")

# Use platform-independent paths
sys.path.insert(0, str(HCM_DIR))
sys.path.insert(0, str(HCM_DIR / 'prod'))
from dolphin_paths import get_eigenvalues_path, get_project_root

# Data locations probed by the watchdog.
SCANS_DIR = get_eigenvalues_path()            # eigenvalue scans (YYYY-MM-DD subdirs)
LOGS_DIR = get_project_root() / 'paper_logs'  # paper-trading PnL jsonl logs

# Hazelcast connection target.
HZ_HOST = "localhost:5701"
HZ_CLUSTER = "dolphin"
|
||
|
|
|
||
|
|
class SystemWatchdog:
    """High-frequency watchdog: probes DOLPHIN infrastructure every tick and
    publishes the aggregate health state to Hazelcast."""

    def __init__(self, tick_rate=0.5):
        """Set up tick cadences, the health ledger, auto-heal counters and the
        Hazelcast client.

        Args:
            tick_rate: seconds between fast-path probe iterations.
        """
        self.tick_rate = tick_rate
        self.slow_tick_rate = 10.0  # cadence (s) of the expensive checks
        self.last_slow_tick = 0
        self.running = False

        # One status string per monitored component; 'overall' is derived
        # from the others on every tick.
        self.health = {
            'hz': 'FAIL',
            'hz_container': 'UNKNOWN',
            'prefect': 'FAIL',
            'prefect_container': 'UNKNOWN',
            'mc_container': 'UNKNOWN',
            'scans': 'UNKNOWN',
            'logs': 'UNKNOWN',
            'timestamp': '',
            'overall': 'DEGRADED'
        }

        # HZ auto-heal: fast-tick HTTP probe, 2 failures (~1s) → restart (~19s worst-case)
        self._hz_container_fail_streak = 0
        self._hz_container_heal_threshold = 2
        self._hz_container_last_restart = 0.0

        # Prefect auto-heal: fast-tick HTTP probe, 4 failures (~2s) → restart (~35s worst-case)
        self._prefect_fail_streak = 0
        self._prefect_heal_threshold = 4
        self._prefect_last_restart = 0.0

        # MC auto-heal: slow-tick only (non-critical), 5 failures (50s) → restart
        self._mc_fail_streak = 0
        self._mc_heal_threshold = 5
        self._mc_last_restart = 0.0

        # Optional fast-recovery mode, toggled via environment variable.
        fast_recovery_enabled = os.environ.get('DOLPHIN_FAST_RECOVERY', '').lower() in {'1', 'true', 'yes'}
        self.survival_stack = SurvivalStack(fast_recovery=fast_recovery_enabled)

        self._init_hz()
|
||
|
|
|
||
|
|
def _init_hz(self):
|
||
|
|
try:
|
||
|
|
self.hz_client = hazelcast.HazelcastClient(
|
||
|
|
cluster_name=HZ_CLUSTER,
|
||
|
|
cluster_members=[HZ_HOST],
|
||
|
|
connection_timeout=2.0
|
||
|
|
)
|
||
|
|
# Use IMap for strong consistency, no near-cache
|
||
|
|
self.health_map = self.hz_client.get_map('DOLPHIN_SYSTEM_HEALTH').blocking()
|
||
|
|
self.features_map = self.hz_client.get_map('DOLPHIN_FEATURES').blocking()
|
||
|
|
self.state_map = self.hz_client.get_map('DOLPHIN_STATE_BLUE').blocking()
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"HZ Init failed: {e}")
|
||
|
|
self.hz_client = None
|
||
|
|
self.health_map = None
|
||
|
|
self.features_map = None
|
||
|
|
self.state_map = None
|
||
|
|
|
||
|
|
def start(self):
|
||
|
|
self.running = True
|
||
|
|
logger.info(f"Starting System Watchdog (tick={self.tick_rate}s)")
|
||
|
|
|
||
|
|
while self.running:
|
||
|
|
t0 = time.time()
|
||
|
|
|
||
|
|
self._check_hz()
|
||
|
|
self._check_hz_container() # fast-tick: ~1ms HTTP probe
|
||
|
|
self._check_prefect_container() # fast-tick: ~1ms HTTP probe
|
||
|
|
|
||
|
|
if t0 - self.last_slow_tick >= self.slow_tick_rate:
|
||
|
|
self._check_prefect()
|
||
|
|
self._check_scans()
|
||
|
|
self._check_logs()
|
||
|
|
self._check_mc_container() # slow-tick: MC is non-critical
|
||
|
|
self.last_slow_tick = t0
|
||
|
|
|
||
|
|
self.health['timestamp'] = datetime.now(timezone.utc).isoformat()
|
||
|
|
|
||
|
|
overall = 'GREEN' if all(v == 'OK' for k, v in self.health.items() if k not in ('timestamp', 'overall')) else 'DEGRADED'
|
||
|
|
|
||
|
|
if overall != self.health['overall']:
|
||
|
|
logger.info(f"Health transitioned {self.health['overall']} -> {overall}. Details: {self.health}")
|
||
|
|
|
||
|
|
self.health['overall'] = overall
|
||
|
|
self._write_health()
|
||
|
|
self._process_survival_stack()
|
||
|
|
|
||
|
|
elapsed = time.time() - t0
|
||
|
|
time.sleep(max(0.01, self.tick_rate - elapsed))
|
||
|
|
|
||
|
|
def stop(self):
|
||
|
|
self.running = False
|
||
|
|
if self.hz_client:
|
||
|
|
self.hz_client.shutdown()
|
||
|
|
|
||
|
|
def _check_hz(self):
|
||
|
|
if self.hz_client and self.hz_client.lifecycle_service.is_running():
|
||
|
|
if self.hz_client.cluster_service.get_members():
|
||
|
|
self.health['hz'] = 'OK'
|
||
|
|
return
|
||
|
|
|
||
|
|
self.health['hz'] = 'FAIL'
|
||
|
|
if not self.hz_client or not self.hz_client.lifecycle_service.is_running():
|
||
|
|
self._init_hz()
|
||
|
|
|
||
|
|
def _check_prefect(self):
|
||
|
|
try:
|
||
|
|
req = urllib.request.Request("http://localhost:4200/api/health")
|
||
|
|
with urllib.request.urlopen(req, timeout=1) as resp:
|
||
|
|
data = json.loads(resp.read().decode('utf-8'))
|
||
|
|
self.health['prefect'] = 'OK' if data.get('status') == 'ok' else 'FAIL'
|
||
|
|
except Exception:
|
||
|
|
self.health['prefect'] = 'FAIL'
|
||
|
|
|
||
|
|
def _check_scans(self):
|
||
|
|
try:
|
||
|
|
if not SCANS_DIR.exists():
|
||
|
|
self.health['scans'] = 'FAIL (No directory)'
|
||
|
|
return
|
||
|
|
dirs = sorted(d.name for d in SCANS_DIR.iterdir() if d.is_dir() and len(d.name) == 10 and d.name.startswith('20'))
|
||
|
|
if not dirs:
|
||
|
|
self.health['scans'] = 'FAIL (No scans)'
|
||
|
|
return
|
||
|
|
latest_date = datetime.strptime(dirs[-1], '%Y-%m-%d').replace(tzinfo=timezone.utc)
|
||
|
|
age = datetime.now(timezone.utc) - latest_date
|
||
|
|
if age > timedelta(days=2):
|
||
|
|
self.health['scans'] = f"FAIL (Stale {age.days} days)"
|
||
|
|
else:
|
||
|
|
self.health['scans'] = 'OK'
|
||
|
|
except Exception as e:
|
||
|
|
self.health['scans'] = f"FAIL ({str(e)})"
|
||
|
|
|
||
|
|
def _check_logs(self):
|
||
|
|
try:
|
||
|
|
if not LOGS_DIR.exists():
|
||
|
|
self.health['logs'] = 'FAIL (No directory)'
|
||
|
|
return
|
||
|
|
log_files = sorted(LOGS_DIR.glob("paper_pnl_*.jsonl"))
|
||
|
|
if not log_files:
|
||
|
|
self.health['logs'] = 'FAIL (No logs)'
|
||
|
|
return
|
||
|
|
last_line = ""
|
||
|
|
with open(log_files[-1], 'r') as f:
|
||
|
|
for line in f:
|
||
|
|
if line.strip():
|
||
|
|
last_line = line
|
||
|
|
if not last_line:
|
||
|
|
self.health['logs'] = 'FAIL (Empty log)'
|
||
|
|
return
|
||
|
|
data = json.loads(last_line)
|
||
|
|
log_date = datetime.fromisoformat(data['logged_at'])
|
||
|
|
age = datetime.now(timezone.utc) - log_date
|
||
|
|
if age > timedelta(days=2):
|
||
|
|
self.health['logs'] = f"FAIL (Stale {age.days} days)"
|
||
|
|
else:
|
||
|
|
self.health['logs'] = 'OK'
|
||
|
|
except Exception as e:
|
||
|
|
self.health['logs'] = f"FAIL ({str(e)})"
|
||
|
|
|
||
|
|
def _check_hz_container(self):
|
||
|
|
"""Fast-tick HTTP health probe (~1ms when healthy). Triggers auto-heal after 2 failures."""
|
||
|
|
try:
|
||
|
|
req = urllib.request.Request('http://127.0.0.1:5701/hazelcast/health')
|
||
|
|
with urllib.request.urlopen(req, timeout=0.3) as resp:
|
||
|
|
data = json.loads(resp.read())
|
||
|
|
if data.get('nodeState') == 'ACTIVE':
|
||
|
|
self.health['hz_container'] = 'OK'
|
||
|
|
self._hz_container_fail_streak = 0
|
||
|
|
return
|
||
|
|
# Node up but not ACTIVE (e.g. PASSIVE, FROZEN)
|
||
|
|
state = data.get('nodeState', 'UNKNOWN')
|
||
|
|
self.health['hz_container'] = f'WARN ({state})'
|
||
|
|
self._hz_container_fail_streak += 1
|
||
|
|
except Exception:
|
||
|
|
self.health['hz_container'] = 'FAIL'
|
||
|
|
self._hz_container_fail_streak += 1
|
||
|
|
|
||
|
|
if self._hz_container_fail_streak >= self._hz_container_heal_threshold:
|
||
|
|
self._heal_hz_container()
|
||
|
|
|
||
|
|
def _heal_hz_container(self):
|
||
|
|
"""Restart the HZ container after sustained unhealthy state. Idempotent."""
|
||
|
|
now = time.time()
|
||
|
|
if now - self._hz_container_last_restart < 30: # minimum 30s between restarts (restart takes ~15s)
|
||
|
|
logger.warning("HZ heal suppressed — last restart was <30s ago")
|
||
|
|
return
|
||
|
|
logger.critical("AUTO-HEAL: restarting dolphin-hazelcast (sustained unhealthy)")
|
||
|
|
try:
|
||
|
|
subprocess.run(['docker', 'restart', 'dolphin-hazelcast'], timeout=30, check=True)
|
||
|
|
self._hz_container_fail_streak = 0
|
||
|
|
self._hz_container_last_restart = now
|
||
|
|
logger.info("AUTO-HEAL: dolphin-hazelcast restarted successfully")
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"AUTO-HEAL: docker restart failed: {e}")
|
||
|
|
|
||
|
|
def _check_prefect_container(self):
|
||
|
|
"""Fast-tick HTTP probe to Prefect API. Heals after 4 consecutive failures (~2s)."""
|
||
|
|
try:
|
||
|
|
req = urllib.request.Request('http://127.0.0.1:4200/api/health')
|
||
|
|
with urllib.request.urlopen(req, timeout=0.3) as resp:
|
||
|
|
body = resp.read().strip()
|
||
|
|
if body == b'true':
|
||
|
|
self.health['prefect_container'] = 'OK'
|
||
|
|
self._prefect_fail_streak = 0
|
||
|
|
return
|
||
|
|
self.health['prefect_container'] = f'WARN (body={body[:20]})'
|
||
|
|
self._prefect_fail_streak += 1
|
||
|
|
except Exception:
|
||
|
|
self.health['prefect_container'] = 'FAIL'
|
||
|
|
self._prefect_fail_streak += 1
|
||
|
|
|
||
|
|
if self._prefect_fail_streak >= self._prefect_heal_threshold:
|
||
|
|
self._heal_prefect_container()
|
||
|
|
|
||
|
|
def _heal_prefect_container(self):
|
||
|
|
now = time.time()
|
||
|
|
if now - self._prefect_last_restart < 60:
|
||
|
|
logger.warning("Prefect heal suppressed — last restart <60s ago")
|
||
|
|
return
|
||
|
|
logger.critical("AUTO-HEAL: restarting dolphin-prefect (sustained unhealthy)")
|
||
|
|
try:
|
||
|
|
subprocess.run(['docker', 'restart', 'dolphin-prefect'], timeout=45, check=True)
|
||
|
|
self._prefect_fail_streak = 0
|
||
|
|
self._prefect_last_restart = now
|
||
|
|
logger.info("AUTO-HEAL: dolphin-prefect restarted successfully")
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"AUTO-HEAL: prefect restart failed: {e}")
|
||
|
|
|
||
|
|
def _check_mc_container(self):
|
||
|
|
"""Slow-tick HTTP probe to Hazelcast MC (non-critical). Heals after 5 slow-tick failures."""
|
||
|
|
try:
|
||
|
|
req = urllib.request.Request('http://127.0.0.1:8080/')
|
||
|
|
with urllib.request.urlopen(req, timeout=1.0) as resp:
|
||
|
|
if resp.status == 200:
|
||
|
|
self.health['mc_container'] = 'OK'
|
||
|
|
self._mc_fail_streak = 0
|
||
|
|
return
|
||
|
|
self.health['mc_container'] = f'WARN (status={resp.status})'
|
||
|
|
self._mc_fail_streak += 1
|
||
|
|
except Exception:
|
||
|
|
self.health['mc_container'] = 'FAIL'
|
||
|
|
self._mc_fail_streak += 1
|
||
|
|
|
||
|
|
if self._mc_fail_streak >= self._mc_heal_threshold:
|
||
|
|
self._heal_mc_container()
|
||
|
|
|
||
|
|
def _heal_mc_container(self):
|
||
|
|
now = time.time()
|
||
|
|
if now - self._mc_last_restart < 120:
|
||
|
|
logger.warning("MC heal suppressed — last restart <120s ago")
|
||
|
|
return
|
||
|
|
logger.critical("AUTO-HEAL: restarting dolphin-hazelcast-mc (sustained unhealthy)")
|
||
|
|
try:
|
||
|
|
subprocess.run(['docker', 'restart', 'dolphin-hazelcast-mc'], timeout=60, check=True)
|
||
|
|
self._mc_fail_streak = 0
|
||
|
|
self._mc_last_restart = now
|
||
|
|
logger.info("AUTO-HEAL: dolphin-hazelcast-mc restarted successfully")
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"AUTO-HEAL: mc restart failed: {e}")
|
||
|
|
|
||
|
|
def _process_survival_stack(self):
|
||
|
|
if not self.features_map:
|
||
|
|
return
|
||
|
|
|
||
|
|
try:
|
||
|
|
# Gather state safely
|
||
|
|
hz_nodes = len(self.hz_client.cluster_service.get_members()) if self.hz_client else 0
|
||
|
|
heartbeat_age = 0.0 # Handled by the watchdog directly being alive
|
||
|
|
|
||
|
|
mc_raw = self.features_map.get('mc_forewarner_latest')
|
||
|
|
mc_state = json.loads(mc_raw) if mc_raw else {}
|
||
|
|
mc_status = mc_state.get('status', 'ORANGE') # default ORANGE if missing
|
||
|
|
mc_ts = mc_state.get('timestamp')
|
||
|
|
|
||
|
|
if mc_ts:
|
||
|
|
mc_age = (datetime.now(timezone.utc) - datetime.fromisoformat(mc_ts)).total_seconds() / 3600.0
|
||
|
|
else:
|
||
|
|
mc_age = 999.0
|
||
|
|
|
||
|
|
ob_raw = self.features_map.get('asset_BTCUSDT_ob')
|
||
|
|
ob_state = json.loads(ob_raw) if ob_raw else {}
|
||
|
|
|
||
|
|
ob_depth = ob_state.get('depth_quality', 0.5)
|
||
|
|
ob_fill = ob_state.get('fill_prob', 0.5)
|
||
|
|
ob_stale = ob_state.get('stale', True)
|
||
|
|
|
||
|
|
dvol_spike = False # Extracted from ExF, currently hardcoded neutral
|
||
|
|
t_since_spike = 999.0
|
||
|
|
|
||
|
|
state_raw = self.state_map.get('latest')
|
||
|
|
state = json.loads(state_raw) if state_raw else {}
|
||
|
|
drawdown = state.get('drawdown', 0.0)
|
||
|
|
|
||
|
|
rm, breakdown = self.survival_stack.compute_rm(
|
||
|
|
hz_nodes=hz_nodes, heartbeat_age_s=heartbeat_age,
|
||
|
|
mc_status=mc_status, mc_staleness_hours=mc_age,
|
||
|
|
ob_depth_quality=ob_depth, ob_fill_prob=ob_fill, ob_stale=ob_stale,
|
||
|
|
dvol_spike=dvol_spike, t_since_spike_min=t_since_spike,
|
||
|
|
drawdown=drawdown
|
||
|
|
)
|
||
|
|
|
||
|
|
posture = self.survival_stack.update_posture(rm)
|
||
|
|
self.survival_stack.write_to_hz(rm, breakdown, self.hz_client)
|
||
|
|
|
||
|
|
except Exception as e:
|
||
|
|
logger.error(f"Failed to process Survival Stack: {e}")
|
||
|
|
|
||
|
|
def _write_health(self):
|
||
|
|
if self.health_map:
|
||
|
|
try:
|
||
|
|
self.health_map.put('latest', json.dumps(self.health))
|
||
|
|
except Exception as e:
|
||
|
|
logger.debug(f"Failed to write health to HZ: {e}")
|
||
|
|
|
||
|
|
if __name__ == '__main__':
    # No CLI options yet; argparse still provides --help and rejects stray args.
    argparse.ArgumentParser().parse_args()

    watchdog = SystemWatchdog()
    try:
        watchdog.start()
    except KeyboardInterrupt:
        # Ctrl-C: shut down the HZ client cleanly before exiting.
        watchdog.stop()
        logger.info("Watchdog stopped.")
|