initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
359
prod/system_watchdog_service.py
Executable file
359
prod/system_watchdog_service.py
Executable file
@@ -0,0 +1,359 @@
|
||||
"""DOLPHIN MIG3 — High-Frequency System Watchdog Service
|
||||
Runs a continuous loop (< 1s tick) to verify system health.
|
||||
Writes DOLPHIN_SYSTEM_HEALTH state to Hazelcast strongly consistent structure.
|
||||
"""
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import logging
|
||||
import argparse
|
||||
import subprocess
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone, timedelta
|
||||
import sys
|
||||
|
||||
import hazelcast
|
||||
|
||||
# Repo root: two levels up from this file (prod/ -> repo). Used both for
# sys.path bootstrapping and for locating data directories.
# NOTE: the original assigned HCM_DIR twice with the identical expression;
# the duplicate has been removed.
HCM_DIR = Path(__file__).parent.parent
sys.path.insert(0, str(HCM_DIR / 'nautilus_dolphin'))
from nautilus_dolphin.nautilus.survival_stack import SurvivalStack

logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(name)s - %(message)s')
logger = logging.getLogger("Watchdog")

# Use platform-independent paths
sys.path.insert(0, str(HCM_DIR))
sys.path.insert(0, str(HCM_DIR / 'prod'))
from dolphin_paths import get_eigenvalues_path, get_project_root

SCANS_DIR = get_eigenvalues_path()               # eigenvalue scan output root
LOGS_DIR = get_project_root() / 'paper_logs'     # paper-trading PnL logs

# Hazelcast cluster endpoint for the watchdog's client connection.
HZ_HOST = "localhost:5701"
HZ_CLUSTER = "dolphin"
|
||||
|
||||
class SystemWatchdog:
    """Continuous health monitor for the DOLPHIN runtime.

    Fast tick (default 0.5s): verifies the Hazelcast client connection and
    HTTP-probes the Hazelcast and Prefect containers, auto-restarting them
    via ``docker restart`` after a sustained failure streak.  Slow tick
    (every 10s): Prefect API check, scan/log freshness, and the
    non-critical Management Center container.

    Aggregated per-subsystem status is published to the
    ``DOLPHIN_SYSTEM_HEALTH`` IMap every fast tick, and the SurvivalStack
    risk model is refreshed from live feature/state maps.
    """

    def __init__(self, tick_rate=0.5):
        self.tick_rate = tick_rate      # fast-tick period, seconds
        self.slow_tick_rate = 10.0      # slow-tick period, seconds
        self.last_slow_tick = 0
        self.running = False

        # Per-subsystem status strings. 'OK' on every key except the meta
        # keys 'timestamp'/'overall' yields overall GREEN (see start()).
        self.health = {
            'hz': 'FAIL',
            'hz_container': 'UNKNOWN',
            'prefect': 'FAIL',
            'prefect_container': 'UNKNOWN',
            'mc_container': 'UNKNOWN',
            'scans': 'UNKNOWN',
            'logs': 'UNKNOWN',
            'timestamp': '',
            'overall': 'DEGRADED'
        }

        # HZ auto-heal: fast-tick HTTP probe, 2 failures (~1s) → restart (~19s worst-case)
        self._hz_container_fail_streak = 0
        self._hz_container_heal_threshold = 2
        self._hz_container_last_restart = 0.0

        # Prefect auto-heal: fast-tick HTTP probe, 4 failures (~2s) → restart (~35s worst-case)
        self._prefect_fail_streak = 0
        self._prefect_heal_threshold = 4
        self._prefect_last_restart = 0.0

        # MC auto-heal: slow-tick only (non-critical), 5 failures (50s) → restart
        self._mc_fail_streak = 0
        self._mc_heal_threshold = 5
        self._mc_last_restart = 0.0

        # DOLPHIN_FAST_RECOVERY env flag opts the survival stack into its
        # fast-recovery mode (semantics owned by SurvivalStack).
        _fast_recovery = os.environ.get('DOLPHIN_FAST_RECOVERY', '').lower() in ('1', 'true', 'yes')
        self.survival_stack = SurvivalStack(fast_recovery=_fast_recovery)

        self._init_hz()

    def _init_hz(self):
        """(Re)connect the Hazelcast client and bind the IMaps used here.

        On any failure all handles are set to None; _check_hz retries the
        connection on later ticks.
        """
        try:
            self.hz_client = hazelcast.HazelcastClient(
                cluster_name=HZ_CLUSTER,
                cluster_members=[HZ_HOST],
                connection_timeout=2.0
            )
            # Use IMap for strong consistency, no near-cache
            self.health_map = self.hz_client.get_map('DOLPHIN_SYSTEM_HEALTH').blocking()
            self.features_map = self.hz_client.get_map('DOLPHIN_FEATURES').blocking()
            self.state_map = self.hz_client.get_map('DOLPHIN_STATE_BLUE').blocking()
        except Exception as e:
            logger.error(f"HZ Init failed: {e}")
            self.hz_client = None
            self.health_map = None
            self.features_map = None
            self.state_map = None

    def start(self):
        """Run the watchdog loop until stop() flips self.running."""
        self.running = True
        logger.info(f"Starting System Watchdog (tick={self.tick_rate}s)")

        while self.running:
            t0 = time.time()

            self._check_hz()
            self._check_hz_container()       # fast-tick: ~1ms HTTP probe
            self._check_prefect_container()  # fast-tick: ~1ms HTTP probe

            if t0 - self.last_slow_tick >= self.slow_tick_rate:
                self._check_prefect()
                self._check_scans()
                self._check_logs()
                self._check_mc_container()   # slow-tick: MC is non-critical
                self.last_slow_tick = t0

            self.health['timestamp'] = datetime.now(timezone.utc).isoformat()

            # GREEN only when every non-meta key reads exactly 'OK';
            # WARN/UNKNOWN/FAIL all count as DEGRADED.
            overall = 'GREEN' if all(v == 'OK' for k, v in self.health.items() if k not in ('timestamp', 'overall')) else 'DEGRADED'

            if overall != self.health['overall']:
                logger.info(f"Health transitioned {self.health['overall']} -> {overall}. Details: {self.health}")

            self.health['overall'] = overall
            self._write_health()
            self._process_survival_stack()

            # Sleep out the remainder of the tick; 0.01s floor keeps the
            # loop yielding even when a tick overruns its budget.
            elapsed = time.time() - t0
            time.sleep(max(0.01, self.tick_rate - elapsed))

    def stop(self):
        """Stop the loop and shut down the Hazelcast client."""
        self.running = False
        if self.hz_client:
            self.hz_client.shutdown()

    def _check_hz(self):
        """Verify the client-side Hazelcast connection; re-init if dead."""
        if self.hz_client and self.hz_client.lifecycle_service.is_running():
            if self.hz_client.cluster_service.get_members():
                self.health['hz'] = 'OK'
                return

        self.health['hz'] = 'FAIL'
        # Only rebuild the client when it is missing or fully stopped;
        # a running client with zero members is left to recover on its own.
        if not self.hz_client or not self.hz_client.lifecycle_service.is_running():
            self._init_hz()

    def _check_prefect(self):
        """Slow-tick probe of the Prefect API /api/health endpoint."""
        try:
            req = urllib.request.Request("http://localhost:4200/api/health")
            with urllib.request.urlopen(req, timeout=1) as resp:
                data = json.loads(resp.read().decode('utf-8'))
                self.health['prefect'] = 'OK' if data.get('status') == 'ok' else 'FAIL'
        except Exception:
            self.health['prefect'] = 'FAIL'

    def _check_scans(self):
        """Check that the newest YYYY-MM-DD scan directory is < 2 days old."""
        try:
            if not SCANS_DIR.exists():
                self.health['scans'] = 'FAIL (No directory)'
                return
            # Date-named dirs sort lexicographically == chronologically.
            dirs = sorted(d.name for d in SCANS_DIR.iterdir() if d.is_dir() and len(d.name) == 10 and d.name.startswith('20'))
            if not dirs:
                self.health['scans'] = 'FAIL (No scans)'
                return
            latest_date = datetime.strptime(dirs[-1], '%Y-%m-%d').replace(tzinfo=timezone.utc)
            age = datetime.now(timezone.utc) - latest_date
            if age > timedelta(days=2):
                self.health['scans'] = f"FAIL (Stale {age.days} days)"
            else:
                self.health['scans'] = 'OK'
        except Exception as e:
            self.health['scans'] = f"FAIL ({str(e)})"

    def _check_logs(self):
        """Check that the last entry of the newest paper-PnL log is < 2 days old."""
        try:
            if not LOGS_DIR.exists():
                self.health['logs'] = 'FAIL (No directory)'
                return
            log_files = sorted(LOGS_DIR.glob("paper_pnl_*.jsonl"))
            if not log_files:
                self.health['logs'] = 'FAIL (No logs)'
                return
            last_line = ""
            with open(log_files[-1], 'r') as f:
                for line in f:
                    if line.strip():
                        last_line = line
            if not last_line:
                self.health['logs'] = 'FAIL (Empty log)'
                return
            data = json.loads(last_line)
            log_date = datetime.fromisoformat(data['logged_at'])
            # BUGFIX: a naive 'logged_at' previously made the aware-naive
            # subtraction below raise TypeError, mis-reporting a generic
            # FAIL instead of a staleness verdict. Assume naive == UTC.
            if log_date.tzinfo is None:
                log_date = log_date.replace(tzinfo=timezone.utc)
            age = datetime.now(timezone.utc) - log_date
            if age > timedelta(days=2):
                self.health['logs'] = f"FAIL (Stale {age.days} days)"
            else:
                self.health['logs'] = 'OK'
        except Exception as e:
            self.health['logs'] = f"FAIL ({str(e)})"

    def _check_hz_container(self):
        """Fast-tick HTTP health probe (~1ms when healthy). Triggers auto-heal after 2 failures."""
        try:
            req = urllib.request.Request('http://127.0.0.1:5701/hazelcast/health')
            with urllib.request.urlopen(req, timeout=0.3) as resp:
                data = json.loads(resp.read())
                if data.get('nodeState') == 'ACTIVE':
                    self.health['hz_container'] = 'OK'
                    self._hz_container_fail_streak = 0
                    return
                # Node up but not ACTIVE (e.g. PASSIVE, FROZEN)
                state = data.get('nodeState', 'UNKNOWN')
                self.health['hz_container'] = f'WARN ({state})'
                self._hz_container_fail_streak += 1
        except Exception:
            self.health['hz_container'] = 'FAIL'
            self._hz_container_fail_streak += 1

        if self._hz_container_fail_streak >= self._hz_container_heal_threshold:
            self._heal_hz_container()

    def _heal_hz_container(self):
        """Restart the HZ container after sustained unhealthy state. Idempotent."""
        now = time.time()
        if now - self._hz_container_last_restart < 30:  # minimum 30s between restarts (restart takes ~15s)
            logger.warning("HZ heal suppressed — last restart was <30s ago")
            return
        logger.critical("AUTO-HEAL: restarting dolphin-hazelcast (sustained unhealthy)")
        try:
            subprocess.run(['docker', 'restart', 'dolphin-hazelcast'], timeout=30, check=True)
            self._hz_container_fail_streak = 0
            self._hz_container_last_restart = now
            logger.info("AUTO-HEAL: dolphin-hazelcast restarted successfully")
        except Exception as e:
            # Failed restart leaves the streak intact so the next tick retries
            # (subject to the 30s suppression window).
            logger.error(f"AUTO-HEAL: docker restart failed: {e}")

    def _check_prefect_container(self):
        """Fast-tick HTTP probe to Prefect API. Heals after 4 consecutive failures (~2s)."""
        try:
            req = urllib.request.Request('http://127.0.0.1:4200/api/health')
            with urllib.request.urlopen(req, timeout=0.3) as resp:
                body = resp.read().strip()
                # Prefect's health endpoint returns a bare JSON literal `true`.
                if body == b'true':
                    self.health['prefect_container'] = 'OK'
                    self._prefect_fail_streak = 0
                    return
                self.health['prefect_container'] = f'WARN (body={body[:20]})'
                self._prefect_fail_streak += 1
        except Exception:
            self.health['prefect_container'] = 'FAIL'
            self._prefect_fail_streak += 1

        if self._prefect_fail_streak >= self._prefect_heal_threshold:
            self._heal_prefect_container()

    def _heal_prefect_container(self):
        """Restart the Prefect container; suppressed within 60s of the last restart."""
        now = time.time()
        if now - self._prefect_last_restart < 60:
            logger.warning("Prefect heal suppressed — last restart <60s ago")
            return
        logger.critical("AUTO-HEAL: restarting dolphin-prefect (sustained unhealthy)")
        try:
            subprocess.run(['docker', 'restart', 'dolphin-prefect'], timeout=45, check=True)
            self._prefect_fail_streak = 0
            self._prefect_last_restart = now
            logger.info("AUTO-HEAL: dolphin-prefect restarted successfully")
        except Exception as e:
            logger.error(f"AUTO-HEAL: prefect restart failed: {e}")

    def _check_mc_container(self):
        """Slow-tick HTTP probe to Hazelcast MC (non-critical). Heals after 5 slow-tick failures."""
        try:
            req = urllib.request.Request('http://127.0.0.1:8080/')
            with urllib.request.urlopen(req, timeout=1.0) as resp:
                if resp.status == 200:
                    self.health['mc_container'] = 'OK'
                    self._mc_fail_streak = 0
                    return
                self.health['mc_container'] = f'WARN (status={resp.status})'
                self._mc_fail_streak += 1
        except Exception:
            self.health['mc_container'] = 'FAIL'
            self._mc_fail_streak += 1

        if self._mc_fail_streak >= self._mc_heal_threshold:
            self._heal_mc_container()

    def _heal_mc_container(self):
        """Restart the MC container; suppressed within 120s of the last restart."""
        now = time.time()
        if now - self._mc_last_restart < 120:
            logger.warning("MC heal suppressed — last restart <120s ago")
            return
        logger.critical("AUTO-HEAL: restarting dolphin-hazelcast-mc (sustained unhealthy)")
        try:
            subprocess.run(['docker', 'restart', 'dolphin-hazelcast-mc'], timeout=60, check=True)
            self._mc_fail_streak = 0
            self._mc_last_restart = now
            logger.info("AUTO-HEAL: dolphin-hazelcast-mc restarted successfully")
        except Exception as e:
            logger.error(f"AUTO-HEAL: mc restart failed: {e}")

    def _process_survival_stack(self):
        """Feed live cluster/MC/orderbook/state inputs into SurvivalStack.

        Best-effort: any failure is logged and skipped so the watchdog
        loop is never blocked by the risk model.
        """
        if not self.features_map:
            return

        try:
            # Gather state safely
            hz_nodes = len(self.hz_client.cluster_service.get_members()) if self.hz_client else 0
            heartbeat_age = 0.0  # Handled by the watchdog directly being alive

            mc_raw = self.features_map.get('mc_forewarner_latest')
            mc_state = json.loads(mc_raw) if mc_raw else {}
            mc_status = mc_state.get('status', 'ORANGE')  # default ORANGE if missing
            mc_ts = mc_state.get('timestamp')

            if mc_ts:
                mc_age = (datetime.now(timezone.utc) - datetime.fromisoformat(mc_ts)).total_seconds() / 3600.0
            else:
                mc_age = 999.0  # sentinel: treat missing timestamp as very stale

            ob_raw = self.features_map.get('asset_BTCUSDT_ob')
            ob_state = json.loads(ob_raw) if ob_raw else {}

            ob_depth = ob_state.get('depth_quality', 0.5)
            ob_fill = ob_state.get('fill_prob', 0.5)
            ob_stale = ob_state.get('stale', True)

            dvol_spike = False  # Extracted from ExF, currently hardcoded neutral
            t_since_spike = 999.0

            state_raw = self.state_map.get('latest')
            state = json.loads(state_raw) if state_raw else {}
            drawdown = state.get('drawdown', 0.0)

            rm, breakdown = self.survival_stack.compute_rm(
                hz_nodes=hz_nodes, heartbeat_age_s=heartbeat_age,
                mc_status=mc_status, mc_staleness_hours=mc_age,
                ob_depth_quality=ob_depth, ob_fill_prob=ob_fill, ob_stale=ob_stale,
                dvol_spike=dvol_spike, t_since_spike_min=t_since_spike,
                drawdown=drawdown
            )

            # update_posture is called for its (presumed) internal state
            # update; the original bound its return to an unused variable.
            self.survival_stack.update_posture(rm)
            self.survival_stack.write_to_hz(rm, breakdown, self.hz_client)

        except Exception as e:
            logger.error(f"Failed to process Survival Stack: {e}")

    def _write_health(self):
        """Publish the health dict to the DOLPHIN_SYSTEM_HEALTH IMap (best-effort)."""
        if self.health_map:
            try:
                self.health_map.put('latest', json.dumps(self.health))
            except Exception as e:
                logger.debug(f"Failed to write health to HZ: {e}")
|
||||
|
||||
if __name__ == '__main__':
    # No CLI options yet; argparse still provides --help and rejects stray args.
    parser = argparse.ArgumentParser(description="DOLPHIN high-frequency system watchdog")
    parser.parse_args()
    watchdog = SystemWatchdog()
    try:
        watchdog.start()
    except KeyboardInterrupt:
        logger.info("Watchdog stopped.")
    finally:
        # BUGFIX: stop() previously ran only on KeyboardInterrupt, leaking
        # the Hazelcast client if start() died with any other exception.
        watchdog.stop()
|
||||
Reference in New Issue
Block a user