import os
import sys
import time
import json
import socket
import logging
import platform
import subprocess
from datetime import datetime
from pathlib import Path

# Try to import psutil for process and system metrics.
try:
    import psutil
    PSUTIL_AVAILABLE = True
except ImportError:
    PSUTIL_AVAILABLE = False

# External dependencies (optional/lazy-loaded).
try:
    from hazelcast import HazelcastClient
    from hazelcast.serialization.api import Portable
    HZ_CLIENT_AVAILABLE = True
except ImportError:
    HZ_CLIENT_AVAILABLE = False

# --- CONFIGURATION (Canonical Paths from SYSTEM_FILE_MAP) ---
PROJECT_ROOT = Path("C:/Users/Lenovo/Documents/- DOLPHIN NG HD HCM TSF Predict")
if not PROJECT_ROOT.exists():
    # Fallback for Linux/Production path if running in a different env
    PROJECT_ROOT = Path("/mnt/dolphinng5_predict")

LOG_DIR = PROJECT_ROOT / "run_logs"
LOG_FILE = LOG_DIR / "meta_health.log"
STATUS_JSON = LOG_DIR / "meta_health.json"
OB_CACHE_FILE = PROJECT_ROOT / "ob_cache" / "latest_ob_features.json"

CHECK_INTERVAL = 2.0     # seconds between health sweeps
WATCHDOG_TIMEOUT = 10.0  # seconds for systemd/watchdogd

# Monitor Targets
CRITICAL_PROCESSES = [
    "system_watchdog_service.py",
    "acb_processor_service.py",
    "obf_prefect_flow.py",
    "scan-bridge-flow",  # NEW: Scan bridge under Prefect management
    "prefect",           # Catching the prefect worker/server
    "hazelcast",         # Catching the HZ process
]
PORTS = {
    "hazelcast": 5701,
    "prefect_api": 4200,
    "prefect_ui": 8080,  # Management Center for HZ is usually 8080 too, check bible vs spec
}

# --- LOGGING SETUP ---
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger("MetaHealth")


class MetaHealthDaemon:
    """Meta-health watchdog daemon.

    Periodically samples five independent sensors (M1..M5), multiplies them
    into a single ``rm_meta`` score, maps that score to a status (GREEN /
    DEGRADED / CRITICAL / DEAD), publishes the report to a local JSON file
    and (best-effort) to Hazelcast, and attempts platform-specific service
    restarts when the system looks DEAD.
    """

    def __init__(self):
        self.platform = platform.system().lower()  # 'linux' or 'freebsd'
        self.start_time = time.time()
        self.hz_client = None        # lazily-connected Hazelcast client (or None)
        self.last_rm_meta = 1.0
        self.status = "INITIALIZING"
        logger.info(f"MHD starting on {self.platform}. Process ID: {os.getpid()}")
        if not PSUTIL_AVAILABLE:
            logger.warning("psutil NOT found. Process checks will be limited.")

    def _lazy_get_hz(self):
        """Best-effort Hazelcast connection; returns a client or None."""
        if not HZ_CLIENT_AVAILABLE:
            return None
        if self.hz_client is not None:
            return self.hz_client
        try:
            # Short timeout to avoid blocking the main loop
            self.hz_client = HazelcastClient(
                cluster_name="dolphin",
                cluster_members=["127.0.0.1:5701"],
                connect_timeout=0.5,
                connection_retry_limit=1,
            )
            logger.info("MHD connected to Hazelcast cluster 'dolphin'")
            return self.hz_client
        except Exception:
            self.hz_client = None
            return None

    # --- SENSORS ---

    def m1_process_integrity(self):
        """Check if critical processes are running.

        Returns 1.0 when every CRITICAAL target is found (or psutil is
        unavailable and the check cannot run), 0.0 when any is missing.

        NOTE(fix): with ``process_iter(attrs=...)`` psutil substitutes None
        for attributes it could not read, so ``p.info['name']`` may be None;
        the original ``p.info['name'].lower()`` could raise AttributeError,
        which the NoSuchProcess/AccessDenied handler did not catch.
        """
        if not PSUTIL_AVAILABLE:
            return 1.0  # Cannot check, assume OK for Rm math

        # Snapshot the process table ONCE (the original rescanned it per
        # target name), normalizing missing attrs to empty values.
        snapshot = []
        for p in psutil.process_iter(['name', 'cmdline']):
            try:
                name = (p.info.get('name') or '').lower()
                cmdline = [arg.lower() for arg in (p.info.get('cmdline') or [])]
                snapshot.append((name, cmdline))
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue

        missing = []
        for proc_name in CRITICAL_PROCESSES:
            # Match against the process name or any cmdline arg (python scripts).
            found = any(
                proc_name in name or any(proc_name in arg for arg in cmdline)
                for name, cmdline in snapshot
            )
            if not found:
                missing.append(proc_name)

        if missing:
            logger.warning(f"M1 MISSING: {missing}")
            return 0.0  # Total failure of a critical component
        return 1.0

    def m2_heartbeat_freshness(self):
        """Check HZ heartbeats (SILOQY-style). 1.0 fresh, 0.5 stale, 0.0 dead."""
        hz = self._lazy_get_hz()
        if not hz:
            return 0.0  # If we can't connect, heartbeats are effectively missing
        try:
            hb_map = hz.get_map("DOLPHIN_HEARTBEAT").blocking()
            latest = hb_map.get("nautilus_flow_heartbeat")
            if not latest:
                return 0.0
            # Expecting JSON string or dict
            if isinstance(latest, str):
                data = json.loads(latest)
            else:
                data = latest
            ts = data.get("ts", 0)
            age = time.time() - ts
            if age > 30:
                return 0.0
            if age > 10:
                return 0.5
            return 1.0
        except Exception as e:
            logger.debug(f"M2 HZ Read Error: {e}")
            return 0.0

    def m3_data_freshness(self):
        """Check file-based cache freshness via the OB cache file's mtime."""
        if not OB_CACHE_FILE.exists():
            return 0.0
        try:
            mtime = OB_CACHE_FILE.stat().st_mtime
            age = time.time() - mtime
            if age > 10:
                return 0.0
            if age > 5:
                return 0.3
            return 1.0
        except Exception:
            return 0.0

    def m4_control_plane(self):
        """Check if the Hazelcast and Prefect API ports are listening."""

        def check_port(port):
            # connect_ex returns 0 on success; short timeout keeps the loop snappy.
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.settimeout(0.5)
                return s.connect_ex(('127.0.0.1', port)) == 0

        hz_up = check_port(PORTS["hazelcast"])
        prefect_up = check_port(PORTS["prefect_api"])

        if not hz_up and not prefect_up:
            return 0.2
        if not hz_up or not prefect_up:
            return 0.6
        return 1.0

    def m5_health_coherence(self):
        """Check if internal DOLPHIN_SAFETY is updating correctly."""
        hz = self._lazy_get_hz()
        if not hz:
            return 0.0
        try:
            safety_map = hz.get_map("DOLPHIN_SAFETY").blocking()
            latest = safety_map.get("latest")
            if not latest:
                return 0.0
            if isinstance(latest, str):
                data = json.loads(latest)
            else:
                data = latest

            rm = data.get("Rm", -1)
            posture = data.get("posture", "UNKNOWN")
            ts = data.get("ts", 0)

            # Coherence checks
            age = time.time() - ts
            valid_postures = ["APEX", "STALKER", "TURTLE", "HIBERNATE"]
            if age > 60:
                return 0.0  # Safety system is dead
            if not (0.0 <= rm <= 1.0):
                return 0.0  # Garbage Rm
            if posture not in valid_postures:
                return 0.0  # Corrupt posture
            return 1.0
        except Exception:
            return 0.0

    # --- ENGINE ---

    def compute_rm_meta(self):
        """Run all sensors, derive rm_meta (product) and status; return report dict."""
        m1 = self.m1_process_integrity()
        m2 = self.m2_heartbeat_freshness()
        m3 = self.m3_data_freshness()
        m4 = self.m4_control_plane()
        m5 = self.m5_health_coherence()

        # Multiplicative: any hard-zero sensor drives the whole score to 0.
        rm_meta = m1 * m2 * m3 * m4 * m5

        # State mapping
        if rm_meta > 0.8:
            status = "GREEN"
        elif rm_meta > 0.5:
            status = "DEGRADED"
        elif rm_meta > 0.2:
            status = "CRITICAL"
        else:
            status = "DEAD"

        self.last_rm_meta = rm_meta
        self.status = status

        return {
            "rm_meta": round(rm_meta, 3),
            "status": status,
            "sensors": {
                "m1_proc": m1,
                "m2_hb": m2,
                "m3_data": m3,
                "m4_cp": m4,
                "m5_coh": m5,
            },
            "timestamp": time.time(),
            "iso": datetime.now().isoformat(),
        }

    def emit_outputs(self, report):
        """Persist the report to local JSON and (best-effort) to Hazelcast."""
        # Local JSON state
        try:
            with open(STATUS_JSON, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2)
        except Exception as e:
            logger.error(f"Failed to write state JSON: {e}")

        # HZ Push (best effort) — only when the control plane looks alive
        hz = self._lazy_get_hz()
        if hz and report['sensors']['m4_cp'] > 0.5:
            try:
                meta_map = hz.get_map("DOLPHIN_META_HEALTH").blocking()
                meta_map.put("latest", json.dumps(report))
            except Exception:
                pass  # Silent fail if HZ is struggling

        logger.info(
            f"RM_META: {report['rm_meta']} | STATUS: {report['status']} | "
            f"HB: {report['sensors']['m2_hb']}"
        )

    def attempt_restart(self, report):
        """Platform-independent service restart logic.

        Only fires when rm_meta <= 0.2 (DEAD / deeply CRITICAL).
        """
        if report['rm_meta'] > 0.2:
            return  # Don't auto-restart unless DEAD or deeply CRITICAL

        logger.critical("DEAD STATE detected. Attempting component restarts.")

        def restart_svc(name):
            # systemctl on Linux, service(8) on *BSD; anything else is unsupported.
            if self.platform == "linux":
                cmd = ["systemctl", "restart", name]
            elif "bsd" in self.platform:
                cmd = ["service", name, "restart"]
            else:
                logger.error(f"Unsupported platform for restart: {self.platform}")
                return
            try:
                subprocess.run(cmd, check=True)
                logger.info(f"Executed restart for {name}")
            except Exception as e:
                logger.error(f"Failed to restart {name}: {e}")

        # If M4 (Control Plane) is failed, try restarting primary infrastructure
        if report['sensors']['m4_cp'] <= 0.2:
            restart_svc("hazelcast")
            restart_svc("dolphin-prefect-worker")  # This will restart all Prefect flows

        # If scan-bridge-flow is not running, trigger a new deployment run
        if report['sensors']['m1_proc'] < 1.0:
            logger.warning("Scan bridge or critical process missing - triggering Prefect deployment")
            self._trigger_scan_bridge_deploy()

    def _trigger_scan_bridge_deploy(self):
        """Trigger scan-bridge-flow deployment via Prefect API."""
        try:
            env = os.environ.copy()
            env["PREFECT_API_URL"] = "http://localhost:4200/api"
            cmd = [
                "/home/dolphin/siloqy_env/bin/prefect",
                "deployment", "run",
                "scan-bridge-flow/scan-bridge",
            ]
            subprocess.run(cmd, env=env, check=True, capture_output=True, timeout=30)
            logger.info("Triggered scan-bridge-flow deployment run")
        except Exception as e:
            logger.error(f"Failed to trigger scan-bridge deployment: {e}")

    def run(self):
        """Main loop: sense, emit, (maybe) restart, sleep CHECK_INTERVAL."""
        while True:
            try:
                report = self.compute_rm_meta()
                self.emit_outputs(report)
                self.attempt_restart(report)

                # systemd Watchdog Notify (Linux only)
                if self.platform == "linux" and 'NOTIFY_SOCKET' in os.environ:
                    # In a real impl, we'd use sd_notify. Here we can use subprocess
                    # or a library. For now, we skip but note its place.
                    pass
            except Exception as e:
                logger.error(f"Error in MHD loop: {e}")

            time.sleep(CHECK_INTERVAL)


if __name__ == "__main__":
    daemon = MetaHealthDaemon()
    try:
        daemon.run()
    except KeyboardInterrupt:
        logger.info("MHD stopped by user.")
        sys.exit(0)