#!/usr/bin/env python3
"""
DOLPHIN Meta Health Service (MHS) v2
=====================================

Enhanced monitoring for all subsystems:
- Process integrity (M1)
- Heartbeat freshness (M2) - Hz heartbeats from all services
- Data freshness (M3) - Per-subsystem Hz key timestamps
- Control plane (M4) - Ports + Hz connectivity
- Data coherence (M5) - Integrity checks, posture validity

Monitored Subsystems:
1. Scan Bridge      -> Hz DOLPHIN_FEATURES["latest_eigen_scan"]
2. OBF              -> Hz DOLPHIN_FEATURES_SHARD_*
3. ExtF             -> Hz DOLPHIN_FEATURES["exf_latest"]
4. EsoF             -> Hz DOLPHIN_FEATURES["esof_latest"]
5. Nautilus Trader  -> Hz DOLPHIN_PNL_BLUE, DOLPHIN_STATE_BLUE
6. System Watchdog  -> Hz DOLPHIN_SAFETY

Outputs:
- Local JSON: /mnt/dolphinng5_predict/run_logs/meta_health.json
- Hz: DOLPHIN_META_HEALTH["latest"]
- Logs: /mnt/dolphinng5_predict/run_logs/meta_health.log
"""

import os
import sys
import time
import json
import socket
import logging
import platform
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from dataclasses import dataclass, asdict

# Optional deps
try:
    import psutil
    PSUTIL_AVAILABLE = True
except ImportError:
    PSUTIL_AVAILABLE = False

try:
    from hazelcast import HazelcastClient
    HZ_CLIENT_AVAILABLE = True
except ImportError:
    HZ_CLIENT_AVAILABLE = False

# --- CONFIGURATION ---
PROJECT_ROOT = Path("/mnt/dolphinng5_predict")
LOG_DIR = PROJECT_ROOT / "run_logs"
LOG_FILE = LOG_DIR / "meta_health.log"
STATUS_JSON = LOG_DIR / "meta_health.json"

CHECK_INTERVAL = 5.0          # seconds between health sweeps
DATA_STALE_THRESHOLD = 30.0   # seconds before data considered stale
DATA_DEAD_THRESHOLD = 120.0   # seconds before data considered dead

# Critical processes to monitor:
# (service_name, substrings matched case-insensitively against proc name/cmdline)
CRITICAL_PROCESSES = [
    ("scan_bridge", ["scan_bridge"]),  # scan_bridge_prefect_flow or direct
    ("nautilus_trader", ["nautilus_event_trader"]),
    ("extf", ["exf_prefect_final"]),
    ("obf", ["obf_prefect_flow"]),
    ("esof", ["esof_prefect_flow"]),
    ("hazelcast", ["hazelcast", "HzMember"]),
]

# Hz keys to monitor for freshness: name -> (map name, entry key, timestamp field)
HZ_DATA_SOURCES = {
    "scan": ("DOLPHIN_FEATURES", "latest_eigen_scan", "bridge_ts"),
    "obf": ("DOLPHIN_FEATURES", "ob_features_latest", "_pushed_at"),
    "extf": ("DOLPHIN_FEATURES", "exf_latest", "_pushed_at"),
    "esof": ("DOLPHIN_FEATURES", "esof_latest", "_pushed_at"),
    "safety": ("DOLPHIN_SAFETY", "latest", "ts"),
    "state": ("DOLPHIN_STATE_BLUE", "latest_nautilus", "updated_at"),
}

PORTS = {
    "hazelcast": 5701,
    "prefect_api": 4200,
}

# --- LOGGING ---
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger("MHS")


@dataclass
class HealthReport:
    """One snapshot of overall system health, serialized to JSON and Hz."""
    rm_meta: float            # aggregate score in [0, 1]: product of M1..M5
    status: str               # GREEN / DEGRADED / CRITICAL / DEAD
    m1_proc: float            # process integrity sensor score
    m2_heartbeat: float       # Hz heartbeat freshness sensor score
    m3_data_freshness: float  # averaged per-source data freshness score
    m4_control_plane: float   # port reachability score (Hz + Prefect)
    m5_coherence: float       # data integrity / posture validity score
    subsystem_health: dict    # {"processes": {...}, "data_sources": {...}}
    timestamp: str            # UTC ISO-8601 of report creation


class MetaHealthService:
    """Periodically scores system health via five sensors (M1-M5), publishes
    the report locally and to Hazelcast, and attempts systemd recovery when
    the system is CRITICAL or DEAD."""

    def __init__(self):
        self.platform = platform.system().lower()
        self.hz_client = None     # lazily connected by _get_hz()
        self.last_report = None   # most recent HealthReport, for inspection
        logger.info(f"MHS v2 starting. PID: {os.getpid()}")
        if not PSUTIL_AVAILABLE:
            logger.warning("psutil not available - process checks limited")
        if not HZ_CLIENT_AVAILABLE:
            logger.error("Hazelcast not available - critical failure")

    def _get_hz(self):
        """Lazy Hz connection with retry.

        Returns the cached client, or None if hazelcast is unavailable or
        the connection attempt fails (logged at debug; retried next call).
        """
        if not HZ_CLIENT_AVAILABLE:
            return None
        if self.hz_client:
            return self.hz_client
        try:
            self.hz_client = HazelcastClient(
                cluster_name="dolphin",
                cluster_members=["127.0.0.1:5701"],
                connect_timeout=1.0,
                connection_retry_limit=1
            )
            return self.hz_client
        except Exception as e:
            logger.debug(f"Hz connection failed: {e}")
            return None

    # --- SENSOR M1: Process Integrity ---
    def m1_process_integrity(self):
        """Check critical processes are running.

        Returns (score, details): score is 1.0 only when every entry of
        CRITICAL_PROCESSES matches a live process (by name or cmdline
        substring), else 0.0; details maps service name -> bool found.
        Degrades to (1.0, {}) when psutil is missing.
        """
        if not PSUTIL_AVAILABLE:
            return 1.0, {}
        results = {}
        all_ok = True
        for service_name, patterns in CRITICAL_PROCESSES:
            found = False
            for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
                try:
                    cmdline = ' '.join(proc.info['cmdline'] or [])
                    for pattern in patterns:
                        if pattern.lower() in proc.info['name'].lower() or pattern.lower() in cmdline.lower():
                            found = True
                            break
                    if found:
                        break
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    # Process vanished or is inaccessible mid-scan; skip it.
                    continue
            results[service_name] = found
            if not found:
                all_ok = False
                logger.warning(f"M1: {service_name} not running")
        return 1.0 if all_ok else 0.0, results

    # --- SENSOR M2: Hz Heartbeat Freshness ---
    def m2_heartbeat_freshness(self):
        """Check Hz heartbeats from services.

        Scores the age of DOLPHIN_HEARTBEAT["nautilus_flow_heartbeat"]:
        1.0 fresh (<=30s), 0.5 aging (<=60s) or heartbeat missing/unreadable
        while Hz itself is up, 0.0 when stale (>60s) or Hz is unreachable.
        """
        hz = self._get_hz()
        if not hz:
            return 0.0
        try:
            hb_map = hz.get_map("DOLPHIN_HEARTBEAT").blocking()
            latest = hb_map.get("nautilus_flow_heartbeat")
            if not latest:
                return 0.5  # No heartbeat but Hz is up
            data = json.loads(latest) if isinstance(latest, str) else latest
            ts = data.get("ts", 0)
            age = time.time() - ts
            if age > 60:
                return 0.0
            elif age > 30:
                return 0.5
            return 1.0
        except Exception as e:
            logger.debug(f"M2 error: {e}")
            return 0.5

    # --- SENSOR M3: Data Freshness ---
    def m3_data_freshness(self):
        """Check all Hz data sources are fresh.

        For each HZ_DATA_SOURCES entry, reads the configured timestamp field
        (epoch seconds or ISO-8601) and scores it: 1.0 fresh, 0.5 stale /
        missing timestamp / unparseable timestamp, 0.0 dead / missing / error.
        Returns (mean score, per-source details dict).
        """
        hz = self._get_hz()
        if not hz:
            return 0.0, {}
        results = {}
        scores = []
        for name, (map_name, key, ts_field) in HZ_DATA_SOURCES.items():
            try:
                map_obj = hz.get_map(map_name).blocking()
                data_raw = map_obj.get(key)
                if not data_raw:
                    results[name] = {"status": "missing", "score": 0.0}
                    scores.append(0.0)
                    continue
                data = json.loads(data_raw) if isinstance(data_raw, str) else data_raw
                ts_str = data.get(ts_field) if isinstance(data, dict) else None
                if not ts_str:
                    results[name] = {"status": "no_timestamp", "score": 0.5}
                    scores.append(0.5)
                    continue
                # Parse timestamp: numeric epoch passes through, else ISO format.
                try:
                    if isinstance(ts_str, (int, float)):
                        ts = ts_str
                    else:
                        # Try ISO format ('Z' suffix normalized for fromisoformat)
                        ts = datetime.fromisoformat(ts_str.replace('Z', '+00:00')).timestamp()
                except (ValueError, TypeError, AttributeError):
                    # FIX: was a bare `except:` that also swallowed
                    # KeyboardInterrupt/SystemExit; narrowed to parse failures.
                    results[name] = {"status": "bad_timestamp", "score": 0.5}
                    scores.append(0.5)
                    continue
                age = time.time() - ts
                if age > DATA_DEAD_THRESHOLD:
                    score = 0.0
                    status = "dead"
                elif age > DATA_STALE_THRESHOLD:
                    score = 0.5
                    status = "stale"
                else:
                    score = 1.0
                    status = "fresh"
                results[name] = {"status": status, "age_s": round(age, 1), "score": score}
                scores.append(score)
                if status == "dead":
                    logger.warning(f"M3: {name} data dead ({age:.0f}s old)")
                elif status == "stale":
                    logger.debug(f"M3: {name} data stale ({age:.0f}s old)")
            except Exception as e:
                logger.debug(f"M3 error for {name}: {e}")
                results[name] = {"status": "error", "score": 0.0}
                scores.append(0.0)
        # Average score across all configured sources
        avg_score = sum(scores) / len(scores) if scores else 0.0
        return avg_score, results

    # --- SENSOR M4: Control Plane ---
    def m4_control_plane(self):
        """Check Hz and Prefect ports.

        Returns 1.0 when both TCP ports accept connections on localhost,
        0.5 when exactly one does, 0.0 when neither does.
        """
        def check_port(port):
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.settimeout(1.0)
                    return s.connect_ex(('127.0.0.1', port)) == 0
            except OSError:
                # FIX: was a bare `except:`; socket failures raise OSError.
                return False

        hz_up = check_port(PORTS["hazelcast"])
        prefect_up = check_port(PORTS["prefect_api"])
        if hz_up and prefect_up:
            return 1.0
        elif hz_up or prefect_up:
            return 0.5
        return 0.0

    # --- SENSOR M5: Data Coherence ---
    def m5_coherence(self):
        """Check data integrity and posture validity.

        Two sub-checks, averaged: (1) DOLPHIN_SAFETY["latest"] has Rm in
        [0, 1] and a recognized posture; (2) latest_eigen_scan parses and
        carries scan_number plus a timestamp field. Each sub-check scores
        1.0 pass / 0.5 data absent or incomplete / 0.0 invalid or error.
        """
        hz = self._get_hz()
        if not hz:
            return 0.0
        checks = []

        # Check DOLPHIN_SAFETY coherence
        try:
            safety_map = hz.get_map("DOLPHIN_SAFETY").blocking()
            safety_raw = safety_map.get("latest")
            if safety_raw:
                safety = json.loads(safety_raw) if isinstance(safety_raw, str) else safety_raw
                rm = safety.get("Rm", -1)
                posture = safety.get("posture", "UNKNOWN")
                valid_postures = ["APEX", "STALKER", "TURTLE", "HIBERNATE"]
                if 0.0 <= rm <= 1.0 and posture in valid_postures:
                    checks.append(1.0)
                else:
                    checks.append(0.0)
                    logger.warning(f"M5: Invalid safety data - Rm={rm}, posture={posture}")
            else:
                checks.append(0.5)
        except Exception as e:
            logger.debug(f"M5 safety error: {e}")
            checks.append(0.0)

        # Check scan data integrity (basic JSON parse test)
        try:
            features_map = hz.get_map("DOLPHIN_FEATURES").blocking()
            scan_raw = features_map.get("latest_eigen_scan")
            if scan_raw:
                # FIX: guard the str/deserialized-dict case like every other
                # Hz read in this file, instead of json.loads() unconditionally.
                scan = json.loads(scan_raw) if isinstance(scan_raw, str) else scan_raw
                # Basic sanity checks
                has_scan_num = "scan_number" in scan
                has_timestamp = "timestamp" in scan or "bridge_ts" in scan
                checks.append(1.0 if (has_scan_num and has_timestamp) else 0.5)
            else:
                checks.append(0.5)
        except Exception as e:
            logger.debug(f"M5 scan error: {e}")
            checks.append(0.0)

        return sum(checks) / len(checks) if checks else 0.0

    # --- MAIN ENGINE ---
    def compute_health(self):
        """Compute overall health score.

        Runs all five sensors, multiplies their scores into rm_meta (so any
        dead sensor zeroes the aggregate), maps it to a status band, and
        returns a populated HealthReport.
        """
        m1_proc, proc_details = self.m1_process_integrity()
        m2_hb = self.m2_heartbeat_freshness()
        m3_data, data_details = self.m3_data_freshness()
        m4_cp = self.m4_control_plane()
        m5_coh = self.m5_coherence()

        # Compute Rm_meta (product of all sensors)
        rm_meta = m1_proc * m2_hb * m3_data * m4_cp * m5_coh

        # Status mapping
        if rm_meta > 0.8:
            status = "GREEN"
        elif rm_meta > 0.5:
            status = "DEGRADED"
        elif rm_meta > 0.2:
            status = "CRITICAL"
        else:
            status = "DEAD"

        report = HealthReport(
            rm_meta=round(rm_meta, 3),
            status=status,
            m1_proc=round(m1_proc, 2),
            m2_heartbeat=round(m2_hb, 2),
            m3_data_freshness=round(m3_data, 2),
            m4_control_plane=round(m4_cp, 2),
            m5_coherence=round(m5_coh, 2),
            subsystem_health={
                "processes": proc_details,
                "data_sources": data_details
            },
            timestamp=datetime.now(timezone.utc).isoformat()
        )
        return report

    def emit_outputs(self, report: HealthReport):
        """Write health report to all outputs: local JSON, Hz map, log line."""
        report_dict = asdict(report)

        # Local JSON
        try:
            with open(STATUS_JSON, 'w') as f:
                json.dump(report_dict, f, indent=2)
        except Exception as e:
            logger.error(f"Failed to write status JSON: {e}")

        # Hz push. FIX: was `> 0.5`, which skipped the push when Hz was up but
        # Prefect was down (m4 == 0.5) even though Hz is reachable; a failed
        # put is still caught and logged at debug.
        hz = self._get_hz()
        if hz and report.m4_control_plane >= 0.5:
            try:
                meta_map = hz.get_map("DOLPHIN_META_HEALTH").blocking()
                meta_map.put("latest", json.dumps(report_dict))
            except Exception as e:
                logger.debug(f"Hz push failed: {e}")

        # Log summary
        logger.info(
            f"RM_META={report.rm_meta} [{report.status}] "
            f"M1={report.m1_proc} M2={report.m2_heartbeat} M3={report.m3_data_freshness} "
            f"M4={report.m4_control_plane} M5={report.m5_coherence}"
        )

    def attempt_recovery(self, report: HealthReport):
        """Attempt to recover from degraded states.

        No-op for GREEN/DEGRADED. For CRITICAL/DEAD, restarts hazelcast when
        the control plane is down and the systemd units for scan bridge /
        nautilus trader when their processes are missing.
        """
        if report.status in ["GREEN", "DEGRADED"]:
            return
        logger.critical(f"RECOVERY: System status is {report.status}")

        # Restart services based on failures
        services_to_restart = []
        if report.m4_control_plane < 0.5:
            services_to_restart.extend(["hazelcast"])
        if report.m1_proc < 1.0:
            # Check which processes are missing
            for service, running in report.subsystem_health.get("processes", {}).items():
                if not running:
                    if service == "scan_bridge":
                        services_to_restart.append("dolphin-scan-bridge")
                    elif service == "nautilus_trader":
                        services_to_restart.append("dolphin-nautilus-trader")

        for svc in services_to_restart:
            try:
                subprocess.run(["systemctl", "restart", svc], check=True, timeout=30)
                logger.info(f"RECOVERY: Restarted {svc}")
            except Exception as e:
                logger.error(f"RECOVERY: Failed to restart {svc}: {e}")

    def run(self):
        """Main monitoring loop: compute, emit, recover, sleep; never exits
        on its own (errors are logged and the loop continues)."""
        while True:
            try:
                report = self.compute_health()
                self.emit_outputs(report)
                self.attempt_recovery(report)
                self.last_report = report
            except Exception as e:
                logger.error(f"Error in main loop: {e}")
            time.sleep(CHECK_INTERVAL)


if __name__ == "__main__":
    mhs = MetaHealthService()
    try:
        mhs.run()
    except KeyboardInterrupt:
        logger.info("MHS stopped by user")
        sys.exit(0)