360 lines
12 KiB
Python
360 lines
12 KiB
Python
|
|
import os
|
||
|
|
import sys
|
||
|
|
import time
|
||
|
|
import json
|
||
|
|
import socket
|
||
|
|
import logging
|
||
|
|
import platform
|
||
|
|
import subprocess
|
||
|
|
from datetime import datetime
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
# Try to import psutil for process and system metrics.
# PSUTIL_AVAILABLE gates the M1 process-integrity sensor: when False,
# m1_process_integrity() returns 1.0 (assume OK) rather than failing.
try:
    import psutil
    PSUTIL_AVAILABLE = True
except ImportError:
    PSUTIL_AVAILABLE = False

# External dependencies (optional/lazy-loaded).
# HZ_CLIENT_AVAILABLE gates all Hazelcast-backed sensors (M2, M5) and the
# best-effort publish in emit_outputs().
# NOTE(review): Portable is imported but never referenced in this file —
# possibly kept for serialization side effects; confirm before removing.
try:
    from hazelcast import HazelcastClient
    from hazelcast.serialization.api import Portable
    HZ_CLIENT_AVAILABLE = True
except ImportError:
    HZ_CLIENT_AVAILABLE = False
|
||
|
|
|
||
|
|
# --- CONFIGURATION (Canonical Paths from SYSTEM_FILE_MAP) ---
# Primary (Windows dev) root; fall back to the production mount when absent.
PROJECT_ROOT = Path("C:/Users/Lenovo/Documents/- DOLPHIN NG HD HCM TSF Predict")
if not PROJECT_ROOT.exists():
    # Fallback for Linux/Production path if running in a different env
    PROJECT_ROOT = Path("/mnt/dolphinng5_predict")

# Derived output locations for logs, machine-readable status, and the
# order-book feature cache watched by the M3 sensor.
LOG_DIR = PROJECT_ROOT / "run_logs"
LOG_FILE = LOG_DIR / "meta_health.log"
STATUS_JSON = LOG_DIR / "meta_health.json"
OB_CACHE_FILE = PROJECT_ROOT / "ob_cache" / "latest_ob_features.json"

CHECK_INTERVAL = 2.0  # seconds
WATCHDOG_TIMEOUT = 10.0  # seconds for systemd/watchdogd

# Monitor Targets: substrings matched against process names/cmdlines (M1).
CRITICAL_PROCESSES = [
    "system_watchdog_service.py",
    "acb_processor_service.py",
    "obf_prefect_flow.py",
    "scan-bridge-flow",  # NEW: Scan bridge under Prefect management
    "prefect",           # Catching the prefect worker/server
    "hazelcast",         # Catching the HZ process
]

# TCP ports probed by the M4 control-plane sensor.
PORTS = {
    "hazelcast": 5701,
    "prefect_api": 4200,
    "prefect_ui": 8080,  # Management Center for HZ is usually 8080 too, check bible vs spec
}

# --- LOGGING SETUP ---
# Dual sink: persistent file under LOG_DIR plus stderr for interactive runs.
LOG_DIR.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[logging.FileHandler(LOG_FILE), logging.StreamHandler()],
)
logger = logging.getLogger("MetaHealth")
|
||
|
|
|
||
|
|
class MetaHealthDaemon:
    """Meta Health Daemon (MHD).

    Periodically computes a composite health score, Rm_meta, as the product
    of five sensors (process integrity, heartbeat freshness, data freshness,
    control-plane reachability, safety-state coherence), publishes the report
    to a local JSON file and to Hazelcast, and attempts service restarts when
    the system is effectively dead.
    """

    def __init__(self):
        # platform.system() lower-cased, e.g. 'linux', 'freebsd', 'windows'.
        self.platform = platform.system().lower()
        self.start_time = time.time()
        self.hz_client = None          # lazily created by _lazy_get_hz()
        self.last_rm_meta = 1.0        # most recent composite score
        self.status = "INITIALIZING"   # GREEN / DEGRADED / CRITICAL / DEAD

        logger.info(f"MHD starting on {self.platform}. Process ID: {os.getpid()}")
        if not PSUTIL_AVAILABLE:
            logger.warning("psutil NOT found. Process checks will be limited.")

    def _lazy_get_hz(self):
        """Best-effort Hazelcast connection.

        Returns the cached client, a newly connected one, or None when the
        hazelcast package is absent or the cluster is unreachable.
        """
        if not HZ_CLIENT_AVAILABLE:
            return None
        if self.hz_client is not None:
            return self.hz_client

        try:
            # Short timeout to avoid blocking the main loop.
            self.hz_client = HazelcastClient(
                cluster_name="dolphin",
                cluster_members=["127.0.0.1:5701"],
                connect_timeout=0.5,
                connection_retry_limit=1
            )
            logger.info("MHD connected to Hazelcast cluster 'dolphin'")
            return self.hz_client
        except Exception:
            self.hz_client = None
            return None

    # --- SENSORS ---

    def m1_process_integrity(self):
        """Check if critical processes are running.

        Returns 1.0 when every CRITICAL_PROCESSES entry matches some running
        process (by name or cmdline substring), else 0.0.
        """
        if not PSUTIL_AVAILABLE:
            return 1.0  # Cannot check, assume OK for Rm math

        # Snapshot the process table ONCE; the original scanned it per target
        # (O(targets * processes)). psutil sets unreadable attributes to None
        # (ad_value), so guard before .lower() — the original crashed with an
        # uncaught AttributeError on a None name, aborting the whole cycle.
        running = []
        for p in psutil.process_iter(['name', 'cmdline']):
            try:
                name = (p.info.get('name') or '').lower()
                cmdline = ' '.join(p.info.get('cmdline') or ()).lower()
                running.append((name, cmdline))
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue

        missing = [
            target for target in CRITICAL_PROCESSES
            if not any(target in name or target in cmd for name, cmd in running)
        ]

        if missing:
            logger.warning(f"M1 MISSING: {missing}")
            return 0.0  # Total failure of a critical component
        return 1.0

    def m2_heartbeat_freshness(self):
        """Check HZ heartbeats (SILOQY-style).

        Reads DOLPHIN_HEARTBEAT["nautilus_flow_heartbeat"] and scores its age:
        <=10s -> 1.0, <=30s -> 0.5, older/missing/unreachable -> 0.0.
        """
        hz = self._lazy_get_hz()
        if not hz:
            return 0.0  # If we can't connect, heartbeats are effectively missing

        try:
            hb_map = hz.get_map("DOLPHIN_HEARTBEAT").blocking()
            latest = hb_map.get("nautilus_flow_heartbeat")
            if not latest:
                return 0.0

            # Expecting JSON string or dict.
            data = json.loads(latest) if isinstance(latest, str) else latest

            # NOTE(review): assumes 'ts' is epoch SECONDS — confirm the
            # producer does not publish milliseconds.
            ts = data.get("ts", 0)
            age = time.time() - ts

            if age > 30: return 0.0
            if age > 10: return 0.5
            return 1.0
        except Exception as e:
            logger.debug(f"M2 HZ Read Error: {e}")
            return 0.0

    def m3_data_freshness(self):
        """Check file-based cache freshness via OB_CACHE_FILE mtime.

        <=5s -> 1.0, <=10s -> 0.3, older/missing/unreadable -> 0.0.
        """
        if not OB_CACHE_FILE.exists():
            return 0.0

        try:
            mtime = OB_CACHE_FILE.stat().st_mtime
            age = time.time() - mtime

            if age > 10: return 0.0
            if age > 5: return 0.3
            return 1.0
        except Exception:
            return 0.0

    def m4_control_plane(self):
        """Check if the Hazelcast and Prefect API ports are listening.

        Both down -> 0.2, one down -> 0.6, both up -> 1.0.
        """
        def check_port(port):
            # connect_ex returns 0 on success; short timeout keeps the loop fast.
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.settimeout(0.5)
                return s.connect_ex(('127.0.0.1', port)) == 0

        hz_up = check_port(PORTS["hazelcast"])
        prefect_up = check_port(PORTS["prefect_api"])

        if not hz_up and not prefect_up:
            return 0.2
        if not hz_up or not prefect_up:
            return 0.6
        return 1.0

    def m5_health_coherence(self):
        """Check if internal DOLPHIN_SAFETY is updating correctly.

        Validates freshness (<=60s), Rm range [0,1], and a known posture.
        Any violation or read failure scores 0.0.
        """
        hz = self._lazy_get_hz()
        if not hz:
            return 0.0

        try:
            safety_map = hz.get_map("DOLPHIN_SAFETY").blocking()
            latest = safety_map.get("latest")
            if not latest:
                return 0.0

            data = json.loads(latest) if isinstance(latest, str) else latest

            rm = data.get("Rm", -1)
            posture = data.get("posture", "UNKNOWN")
            ts = data.get("ts", 0)

            # Coherence checks.
            age = time.time() - ts
            valid_postures = ["APEX", "STALKER", "TURTLE", "HIBERNATE"]

            if age > 60: return 0.0  # Safety system is dead
            if not (0.0 <= rm <= 1.0): return 0.0  # Garbage Rm
            if posture not in valid_postures: return 0.0  # Corrupt posture

            return 1.0
        except Exception:
            return 0.0

    # --- ENGINE ---

    def compute_rm_meta(self):
        """Run all sensors and build the health report dict.

        Rm_meta is the PRODUCT of the five sensors, so any single 0.0
        sensor drives the composite to DEAD. Updates self.last_rm_meta
        and self.status as a side effect.
        """
        m1 = self.m1_process_integrity()
        m2 = self.m2_heartbeat_freshness()
        m3 = self.m3_data_freshness()
        m4 = self.m4_control_plane()
        m5 = self.m5_health_coherence()

        rm_meta = m1 * m2 * m3 * m4 * m5

        # State mapping.
        if rm_meta > 0.8: status = "GREEN"
        elif rm_meta > 0.5: status = "DEGRADED"
        elif rm_meta > 0.2: status = "CRITICAL"
        else: status = "DEAD"

        self.last_rm_meta = rm_meta
        self.status = status

        return {
            "rm_meta": round(rm_meta, 3),
            "status": status,
            "sensors": {
                "m1_proc": m1,
                "m2_hb": m2,
                "m3_data": m3,
                "m4_cp": m4,
                "m5_coh": m5
            },
            "timestamp": time.time(),
            "iso": datetime.now().isoformat()
        }

    def emit_outputs(self, report):
        """Persist the report locally and push it to Hazelcast (best effort)."""
        # Local JSON state.
        try:
            with open(STATUS_JSON, 'w') as f:
                json.dump(report, f, indent=2)
        except Exception as e:
            logger.error(f"Failed to write state JSON: {e}")

        # HZ Push (best effort) — only if the control plane looks alive.
        hz = self._lazy_get_hz()
        if hz and report['sensors']['m4_cp'] > 0.5:
            try:
                meta_map = hz.get_map("DOLPHIN_META_HEALTH").blocking()
                meta_map.put("latest", json.dumps(report))
            except Exception:
                pass  # Silent fail if HZ is struggling

        logger.info(f"RM_META: {report['rm_meta']} | STATUS: {report['status']} | HB: {report['sensors']['m2_hb']}")

    def attempt_restart(self, report):
        """Platform-independent service restart logic.

        Only fires when rm_meta <= 0.2 (DEAD / deeply CRITICAL). Restarts the
        infrastructure services on control-plane failure and re-triggers the
        scan-bridge Prefect deployment when a critical process is missing.
        """
        if report['rm_meta'] > 0.2:
            return  # Don't auto-restart unless DEAD or deeply CRITICAL

        logger.critical("DEAD STATE detected. Attempting component restarts.")

        def restart_svc(name):
            # systemd on Linux, rc.d 'service' on the BSDs.
            if self.platform == "linux":
                cmd = ["systemctl", "restart", name]
            elif "bsd" in self.platform:
                cmd = ["service", name, "restart"]
            else:
                logger.error(f"Unsupported platform for restart: {self.platform}")
                return

            try:
                subprocess.run(cmd, check=True)
                logger.info(f"Executed restart for {name}")
            except Exception as e:
                logger.error(f"Failed to restart {name}: {e}")

        # If M4 (Control Plane) is failed, try restarting primary infrastructure.
        if report['sensors']['m4_cp'] <= 0.2:
            restart_svc("hazelcast")
            restart_svc("dolphin-prefect-worker")  # This will restart all Prefect flows

        # If scan-bridge-flow is not running, trigger a new deployment run.
        if report['sensors']['m1_proc'] < 1.0:
            logger.warning("Scan bridge or critical process missing - triggering Prefect deployment")
            self._trigger_scan_bridge_deploy()

    def _trigger_scan_bridge_deploy(self):
        """Trigger scan-bridge-flow deployment via Prefect API."""
        try:
            env = os.environ.copy()
            env["PREFECT_API_URL"] = "http://localhost:4200/api"
            cmd = [
                "/home/dolphin/siloqy_env/bin/prefect",
                "deployment", "run",
                "scan-bridge-flow/scan-bridge"
            ]
            subprocess.run(cmd, env=env, check=True, capture_output=True, timeout=30)
            logger.info("Triggered scan-bridge-flow deployment run")
        except Exception as e:
            logger.error(f"Failed to trigger scan-bridge deployment: {e}")

    def _sd_notify_watchdog(self):
        """Send WATCHDOG=1 to systemd's NOTIFY_SOCKET (best effort).

        Implements the minimal sd_notify datagram protocol so the unit's
        WatchdogSec (see WATCHDOG_TIMEOUT) is actually serviced; the original
        left this as a 'pass' placeholder.
        """
        addr = os.environ.get('NOTIFY_SOCKET')
        if not addr:
            return
        # A leading '@' denotes a Linux abstract-namespace socket.
        if addr.startswith('@'):
            addr = '\0' + addr[1:]
        try:
            with socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) as s:
                s.sendto(b"WATCHDOG=1", addr)
        except OSError as e:
            logger.debug(f"sd_notify failed: {e}")

    def run(self):
        """Main loop: score, publish, self-heal, pet the watchdog, sleep."""
        while True:
            try:
                report = self.compute_rm_meta()
                self.emit_outputs(report)
                self.attempt_restart(report)

                # systemd Watchdog Notify (Linux only).
                if self.platform == "linux" and 'NOTIFY_SOCKET' in os.environ:
                    self._sd_notify_watchdog()

            except Exception as e:
                logger.error(f"Error in MHD loop: {e}")

            time.sleep(CHECK_INTERVAL)
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Run the daemon loop forever; a Ctrl-C is a clean, logged shutdown.
    mhd = MetaHealthDaemon()
    try:
        mhd.run()
    except KeyboardInterrupt:
        logger.info("MHD stopped by user.")
        sys.exit(0)
|