# DOLPHIN/prod/meta_health_daemon.py
# Meta-health daemon for the DOLPHIN stack (~360 lines, ~12 KiB, Python).
import os
import sys
import time
import json
import socket
import logging
import platform
import subprocess
from datetime import datetime
from pathlib import Path

# Optional dependency: psutil enables the M1 process-integrity sensor.
# Without it, process checks degrade to an optimistic no-op (see m1_*).
try:
    import psutil
    PSUTIL_AVAILABLE = True
except ImportError:
    PSUTIL_AVAILABLE = False

# Optional dependency: the Hazelcast client enables the M2/M5 sensors and
# the meta-health push; everything else works without it.
try:
    from hazelcast import HazelcastClient
    from hazelcast.serialization.api import Portable
    HZ_CLIENT_AVAILABLE = True
except ImportError:
    HZ_CLIENT_AVAILABLE = False
# --- CONFIGURATION (Canonical Paths from SYSTEM_FILE_MAP) ---
PROJECT_ROOT = Path("C:/Users/Lenovo/Documents/- DOLPHIN NG HD HCM TSF Predict")
if not PROJECT_ROOT.exists():
    # Fallback for Linux/Production path if running in a different env
    PROJECT_ROOT = Path("/mnt/dolphinng5_predict")

LOG_DIR = PROJECT_ROOT / "run_logs"
LOG_FILE = LOG_DIR / "meta_health.log"
STATUS_JSON = LOG_DIR / "meta_health.json"          # latest report, JSON
OB_CACHE_FILE = PROJECT_ROOT / "ob_cache" / "latest_ob_features.json"
CHECK_INTERVAL = 2.0    # seconds between sensor sweeps
WATCHDOG_TIMEOUT = 10.0 # seconds for systemd/watchdogd

# Monitor Targets: any of these missing drives sensor M1 to 0.0.
CRITICAL_PROCESSES = [
    "system_watchdog_service.py",
    "acb_processor_service.py",
    "obf_prefect_flow.py",
    "scan-bridge-flow",  # NEW: Scan bridge under Prefect management
    "prefect",           # Catching the prefect worker/server
    "hazelcast"          # Catching the HZ process
]

# TCP ports probed by sensor M4 (control plane).
PORTS = {
    "hazelcast": 5701,
    "prefect_api": 4200,
    "prefect_ui": 8080  # Management Center for HZ is usually 8080 too, check bible vs spec
}
# --- LOGGING SETUP ---
# Ensure the log directory exists before FileHandler tries to open it.
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),  # persistent log in run_logs/
        logging.StreamHandler()         # mirror to stderr for journald/console
    ]
)
logger = logging.getLogger("MetaHealth")
class MetaHealthDaemon:
    """Meta-health monitor (MHD) for the DOLPHIN stack.

    Every CHECK_INTERVAL seconds the daemon samples five sensors, each
    scored in [0, 1]:

      M1 process integrity   - all CRITICAL_PROCESSES present
      M2 heartbeat freshness - HZ DOLPHIN_HEARTBEAT recency
      M3 data freshness      - mtime of the order-book feature cache
      M4 control plane       - Hazelcast / Prefect TCP ports listening
      M5 health coherence    - HZ DOLPHIN_SAFETY payload sanity

    Their product (Rm_meta) maps to GREEN/DEGRADED/CRITICAL/DEAD, is
    written to STATUS_JSON and pushed to Hazelcast, and a DEAD reading
    triggers best-effort service restarts.
    """

    def __init__(self):
        # Lowercased OS name (e.g. 'linux', 'freebsd'); selects restart commands.
        self.platform = platform.system().lower()
        self.start_time = time.time()
        self.hz_client = None       # lazily-connected Hazelcast client, or None
        self.last_rm_meta = 1.0     # last composite score (optimistic default)
        self.status = "INITIALIZING"
        logger.info(f"MHD starting on {self.platform}. Process ID: {os.getpid()}")
        if not PSUTIL_AVAILABLE:
            logger.warning("psutil NOT found. Process checks will be limited.")

    def _lazy_get_hz(self):
        """Best-effort Hazelcast connection.

        Returns the cached client if one exists; otherwise attempts a short,
        single-retry connect so the main loop is never blocked for long.
        Returns None when the client library is missing or the connect fails.
        """
        if not HZ_CLIENT_AVAILABLE:
            return None
        if self.hz_client is not None:
            return self.hz_client
        try:
            # Short timeout to avoid blocking the main loop
            self.hz_client = HazelcastClient(
                cluster_name="dolphin",
                cluster_members=["127.0.0.1:5701"],
                connect_timeout=0.5,
                connection_retry_limit=1
            )
            logger.info("MHD connected to Hazelcast cluster 'dolphin'")
            return self.hz_client
        except Exception:
            self.hz_client = None
            return None

    # --- SENSORS ---

    def m1_process_integrity(self):
        """Return 1.0 if every CRITICAL_PROCESSES entry is running, else 0.0."""
        if not PSUTIL_AVAILABLE:
            return 1.0  # Cannot check, assume OK for Rm math
        missing = []
        for proc_name in CRITICAL_PROCESSES:
            found = False
            for p in psutil.process_iter(['name', 'cmdline']):
                try:
                    # Check process name or cmdline (for python scripts).
                    # BUGFIX: p.info['name'] can be None (zombie/short-lived
                    # processes), which used to raise an uncaught AttributeError.
                    name = p.info['name'] or ""
                    if proc_name in name.lower():
                        found = True
                        break
                    if p.info['cmdline'] and any(proc_name in arg.lower() for arg in p.info['cmdline']):
                        found = True
                        break
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    continue
            if not found:
                missing.append(proc_name)
        if missing:
            logger.warning(f"M1 MISSING: {missing}")
            return 0.0  # Total failure of a critical component
        return 1.0

    def m2_heartbeat_freshness(self):
        """Check HZ heartbeats (SILOQY-style).

        Scores the age of DOLPHIN_HEARTBEAT['nautilus_flow_heartbeat']:
        <=10s -> 1.0, <=30s -> 0.5, older/missing/unreachable -> 0.0.
        """
        hz = self._lazy_get_hz()
        if not hz:
            return 0.0  # If we can't connect, heartbeats are effectively missing
        try:
            hb_map = hz.get_map("DOLPHIN_HEARTBEAT").blocking()
            latest = hb_map.get("nautilus_flow_heartbeat")
            if not latest:
                return 0.0
            # Payload may arrive as a JSON string or an already-decoded dict.
            data = json.loads(latest) if isinstance(latest, str) else latest
            # NOTE(review): assumes 'ts' is a Unix epoch in seconds — confirm producer.
            ts = data.get("ts", 0)
            age = time.time() - ts
            if age > 30:
                return 0.0
            if age > 10:
                return 0.5
            return 1.0
        except Exception as e:
            logger.debug(f"M2 HZ Read Error: {e}")
            return 0.0

    def m3_data_freshness(self):
        """Score the OB feature cache by mtime: <=5s 1.0, <=10s 0.3, else 0.0."""
        if not OB_CACHE_FILE.exists():
            return 0.0
        try:
            mtime = OB_CACHE_FILE.stat().st_mtime
        except Exception:
            # Race: file deleted between exists() and stat(), or perms issue.
            return 0.0
        age = time.time() - mtime
        if age > 10:
            return 0.0
        if age > 5:
            return 0.3
        return 1.0

    def m4_control_plane(self):
        """Check that Hazelcast and Prefect API ports accept TCP connects.

        Both down -> 0.2, exactly one down -> 0.6, both up -> 1.0.
        """
        def check_port(port):
            # connect_ex returns 0 on success; 0.5s cap keeps the loop responsive.
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.settimeout(0.5)
                return s.connect_ex(('127.0.0.1', port)) == 0

        hz_up = check_port(PORTS["hazelcast"])
        prefect_up = check_port(PORTS["prefect_api"])
        if not hz_up and not prefect_up:
            return 0.2
        if not hz_up or not prefect_up:
            return 0.6
        return 1.0

    def m5_health_coherence(self):
        """Check if internal DOLPHIN_SAFETY is updating correctly.

        Validates the 'latest' safety record: it must be <60s old, carry an
        Rm in [0, 1], and a posture from the known set. Any violation -> 0.0.
        """
        hz = self._lazy_get_hz()
        if not hz:
            return 0.0
        try:
            safety_map = hz.get_map("DOLPHIN_SAFETY").blocking()
            latest = safety_map.get("latest")
            if not latest:
                return 0.0
            data = json.loads(latest) if isinstance(latest, str) else latest
            rm = data.get("Rm", -1)
            posture = data.get("posture", "UNKNOWN")
            ts = data.get("ts", 0)
            # Coherence checks
            age = time.time() - ts
            valid_postures = ["APEX", "STALKER", "TURTLE", "HIBERNATE"]
            if age > 60:
                return 0.0  # Safety system is dead
            if not (0.0 <= rm <= 1.0):
                return 0.0  # Garbage Rm
            if posture not in valid_postures:
                return 0.0  # Corrupt posture
            return 1.0
        except Exception:
            return 0.0

    # --- ENGINE ---

    def compute_rm_meta(self):
        """Sample all sensors, multiply into Rm_meta, and build the report dict.

        Multiplication means any single dead sensor zeroes the composite.
        Also updates self.last_rm_meta / self.status as a side effect.
        """
        m1 = self.m1_process_integrity()
        m2 = self.m2_heartbeat_freshness()
        m3 = self.m3_data_freshness()
        m4 = self.m4_control_plane()
        m5 = self.m5_health_coherence()
        rm_meta = m1 * m2 * m3 * m4 * m5
        # State mapping
        if rm_meta > 0.8:
            status = "GREEN"
        elif rm_meta > 0.5:
            status = "DEGRADED"
        elif rm_meta > 0.2:
            status = "CRITICAL"
        else:
            status = "DEAD"
        self.last_rm_meta = rm_meta
        self.status = status
        return {
            "rm_meta": round(rm_meta, 3),
            "status": status,
            "sensors": {
                "m1_proc": m1,
                "m2_hb": m2,
                "m3_data": m3,
                "m4_cp": m4,
                "m5_coh": m5
            },
            "timestamp": time.time(),
            "iso": datetime.now().isoformat()
        }

    def emit_outputs(self, report):
        """Persist the report to STATUS_JSON and (best-effort) to Hazelcast."""
        # Local JSON state
        try:
            with open(STATUS_JSON, 'w') as f:
                json.dump(report, f, indent=2)
        except Exception as e:
            logger.error(f"Failed to write state JSON: {e}")
        # HZ Push (best effort) — only when the control plane looks alive.
        hz = self._lazy_get_hz()
        if hz and report['sensors']['m4_cp'] > 0.5:
            try:
                meta_map = hz.get_map("DOLPHIN_META_HEALTH").blocking()
                meta_map.put("latest", json.dumps(report))
            except Exception:
                pass  # Silent fail if HZ is struggling
        logger.info(f"RM_META: {report['rm_meta']} | STATUS: {report['status']} | HB: {report['sensors']['m2_hb']}")

    def attempt_restart(self, report):
        """Platform-independent service restart logic.

        Only acts in the DEAD band (rm_meta <= 0.2). Restarts the core
        infrastructure when the control plane is down, and re-triggers the
        scan-bridge Prefect deployment when a critical process is missing.
        """
        if report['rm_meta'] > 0.2:
            return  # Don't auto-restart unless DEAD or deeply CRITICAL

        logger.critical("DEAD STATE detected. Attempting component restarts.")
        # Service-unit mapping used below: hazelcast -> "hazelcast",
        # prefect / scan-bridge -> "dolphin-prefect-worker". A fuller
        # implementation would map each m-sensor to its own unit name.

        def restart_svc(name):
            # systemd on Linux, rc.d 'service' on *BSD; anything else is logged.
            if self.platform == "linux":
                cmd = ["systemctl", "restart", name]
            elif "bsd" in self.platform:
                cmd = ["service", name, "restart"]
            else:
                logger.error(f"Unsupported platform for restart: {self.platform}")
                return
            try:
                subprocess.run(cmd, check=True)
                logger.info(f"Executed restart for {name}")
            except Exception as e:
                logger.error(f"Failed to restart {name}: {e}")

        # If M4 (Control Plane) is failed, try restarting primary infrastructure
        if report['sensors']['m4_cp'] <= 0.2:
            restart_svc("hazelcast")
            restart_svc("dolphin-prefect-worker")  # This will restart all Prefect flows
        # If scan-bridge-flow is not running, trigger a new deployment run
        if report['sensors']['m1_proc'] < 1.0:
            logger.warning("Scan bridge or critical process missing - triggering Prefect deployment")
            self._trigger_scan_bridge_deploy()

    def _trigger_scan_bridge_deploy(self):
        """Trigger scan-bridge-flow deployment via Prefect API."""
        try:
            env = os.environ.copy()
            env["PREFECT_API_URL"] = "http://localhost:4200/api"
            cmd = [
                "/home/dolphin/siloqy_env/bin/prefect",
                "deployment", "run",
                "scan-bridge-flow/scan-bridge"
            ]
            subprocess.run(cmd, env=env, check=True, capture_output=True, timeout=30)
            logger.info("Triggered scan-bridge-flow deployment run")
        except Exception as e:
            logger.error(f"Failed to trigger scan-bridge deployment: {e}")

    def run(self):
        """Main loop: sample, publish, maybe restart, sleep CHECK_INTERVAL."""
        while True:
            try:
                report = self.compute_rm_meta()
                self.emit_outputs(report)
                self.attempt_restart(report)
                # systemd Watchdog Notify (Linux only)
                if self.platform == "linux" and 'NOTIFY_SOCKET' in os.environ:
                    # In a real impl, we'd use sd_notify. Here we can use subprocess
                    # or a library. For now, we skip but note its place.
                    pass
            except Exception as e:
                logger.error(f"Error in MHD loop: {e}")
            time.sleep(CHECK_INTERVAL)
# Entry point: run the daemon until Ctrl-C, then exit cleanly with status 0.
if __name__ == "__main__":
    daemon = MetaHealthDaemon()
    try:
        daemon.run()
    except KeyboardInterrupt:
        logger.info("MHD stopped by user.")
        sys.exit(0)