initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
359
prod/meta_health_daemon.py
Executable file
359
prod/meta_health_daemon.py
Executable file
@@ -0,0 +1,359 @@
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import socket
|
||||
import logging
|
||||
import platform
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Try to import psutil for process and system metrics.
# When unavailable, the M1 process check degrades to a no-op (returns 1.0).
try:
    import psutil
    PSUTIL_AVAILABLE = True
except ImportError:
    PSUTIL_AVAILABLE = False

# External dependencies (optional/lazy-loaded).
# Without the Hazelcast client, the M2/M5 sensors score 0.0 (unreachable).
# NOTE(review): Portable is imported but not referenced in this module -
# confirm whether it is needed before removing.
try:
    from hazelcast import HazelcastClient
    from hazelcast.serialization.api import Portable
    HZ_CLIENT_AVAILABLE = True
except ImportError:
    HZ_CLIENT_AVAILABLE = False
|
||||
|
||||
# --- CONFIGURATION (Canonical Paths from SYSTEM_FILE_MAP) ---
PROJECT_ROOT = Path("C:/Users/Lenovo/Documents/- DOLPHIN NG HD HCM TSF Predict")
if not PROJECT_ROOT.exists():
    # Fallback for Linux/Production path if running in a different env
    PROJECT_ROOT = Path("/mnt/dolphinng5_predict")

# Derived locations: human-readable log, machine-readable status snapshot,
# and the order-book feature cache whose mtime feeds the M3 freshness sensor.
LOG_DIR = PROJECT_ROOT / "run_logs"
LOG_FILE = LOG_DIR / "meta_health.log"
STATUS_JSON = LOG_DIR / "meta_health.json"
OB_CACHE_FILE = PROJECT_ROOT / "ob_cache" / "latest_ob_features.json"

CHECK_INTERVAL = 2.0  # seconds between main-loop health evaluations
# NOTE(review): WATCHDOG_TIMEOUT is not referenced in this module - confirm
# it is consumed by the systemd/watchdogd unit configuration.
WATCHDOG_TIMEOUT = 10.0  # seconds for systemd/watchdogd

# Monitor Targets
# Matched as case-insensitive substrings against process names and cmdline
# arguments by the M1 sensor; any missing entry scores M1 as 0.0.
CRITICAL_PROCESSES = [
    "system_watchdog_service.py",
    "acb_processor_service.py",
    "obf_prefect_flow.py",
    "scan-bridge-flow",  # NEW: Scan bridge under Prefect management
    "prefect",  # Catching the prefect worker/server
    "hazelcast"  # Catching the HZ process
]

# TCP ports probed (loopback connect) by the M4 control-plane sensor.
PORTS = {
    "hazelcast": 5701,
    "prefect_api": 4200,
    "prefect_ui": 8080  # Management Center for HZ is usually 8080 too, check bible vs spec
}

# --- LOGGING SETUP ---
# Log to both the file under run_logs and stderr/stdout (StreamHandler).
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger("MetaHealth")
|
||||
|
||||
class MetaHealthDaemon:
    """Meta-health daemon (MHD).

    Periodically computes a composite health score ``rm_meta`` as the
    product of five sensor scores, each in [0, 1]:

      M1 process integrity   - all CRITICAL_PROCESSES are running
      M2 heartbeat freshness - Hazelcast DOLPHIN_HEARTBEAT entry is fresh
      M3 data freshness      - OB feature cache file is fresh
      M4 control plane       - Hazelcast / Prefect API ports are listening
      M5 health coherence    - DOLPHIN_SAFETY map contents are sane

    Each cycle, the report is written to STATUS_JSON, pushed to Hazelcast
    (best effort), and drives automatic service restarts when the system
    scores at or below 0.2 (DEAD / deeply CRITICAL).
    """

    def __init__(self):
        # Lowercased platform.system(): e.g. 'linux', 'freebsd', 'windows'.
        self.platform = platform.system().lower()
        self.start_time = time.time()
        self.hz_client = None          # lazily connected Hazelcast client
        self.last_rm_meta = 1.0        # most recent composite score
        self.status = "INITIALIZING"   # GREEN / DEGRADED / CRITICAL / DEAD

        logger.info(f"MHD starting on {self.platform}. Process ID: {os.getpid()}")
        if not PSUTIL_AVAILABLE:
            logger.warning("psutil NOT found. Process checks will be limited.")

    def _lazy_get_hz(self):
        """Best-effort Hazelcast connection.

        Returns the cached client, a freshly connected one, or None when the
        client library is missing or the cluster is unreachable.
        """
        if not HZ_CLIENT_AVAILABLE:
            return None
        if self.hz_client is not None:
            return self.hz_client

        try:
            # Short timeout to avoid blocking the main loop.
            # NOTE(review): verify these keyword names against the installed
            # hazelcast-python-client version; recent releases spell them
            # cluster_connect_timeout / connection-retry config instead. An
            # unexpected kwarg would be swallowed by the except below and
            # permanently disable the HZ-backed sensors.
            self.hz_client = HazelcastClient(
                cluster_name="dolphin",
                cluster_members=["127.0.0.1:5701"],
                connect_timeout=0.5,
                connection_retry_limit=1
            )
            logger.info("MHD connected to Hazelcast cluster 'dolphin'")
            return self.hz_client
        except Exception:
            self.hz_client = None
            return None

    # --- SENSORS ---

    def m1_process_integrity(self):
        """M1: 1.0 if every CRITICAL_PROCESSES entry matches a live process
        (by name or any cmdline argument), else 0.0."""
        if not PSUTIL_AVAILABLE:
            return 1.0  # Cannot check, assume OK for Rm math

        missing = []
        for proc_name in CRITICAL_PROCESSES:
            found = False
            for p in psutil.process_iter(['name', 'cmdline']):
                try:
                    # process_iter(attrs=...) yields None attr values on
                    # AccessDenied/Zombie; guard before .lower() so we do
                    # not raise an uncaught AttributeError here.
                    name = (p.info.get('name') or "").lower()
                    if proc_name in name:
                        found = True
                        break
                    cmdline = p.info.get('cmdline') or []
                    if any(proc_name in arg.lower() for arg in cmdline):
                        found = True
                        break
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    continue
            if not found:
                missing.append(proc_name)

        if missing:
            logger.warning(f"M1 MISSING: {missing}")
            return 0.0  # Total failure of a critical component
        return 1.0

    def m2_heartbeat_freshness(self):
        """M2: freshness of the Nautilus flow heartbeat in Hazelcast.

        Returns 1.0 (<=10s old), 0.5 (10-30s), 0.0 (older, missing, or
        cluster unreachable).
        """
        hz = self._lazy_get_hz()
        if not hz:
            return 0.0  # If we can't connect, heartbeats are effectively missing

        try:
            hb_map = hz.get_map("DOLPHIN_HEARTBEAT").blocking()
            latest = hb_map.get("nautilus_flow_heartbeat")
            if not latest:
                return 0.0

            # Entry may be stored as a JSON string or already as a dict.
            data = json.loads(latest) if isinstance(latest, str) else latest

            # NOTE(review): assumes "ts" is epoch seconds - confirm against
            # the heartbeat producer (epoch-ms would always score 1.0).
            age = time.time() - data.get("ts", 0)

            if age > 30:
                return 0.0
            if age > 10:
                return 0.5
            return 1.0
        except Exception as e:
            logger.debug(f"M2 HZ Read Error: {e}")
            return 0.0

    def m3_data_freshness(self):
        """M3: freshness of the file-based OB feature cache (mtime).

        Returns 1.0 (<=5s old), 0.3 (5-10s), 0.0 (older, missing, or
        unreadable).
        """
        if not OB_CACHE_FILE.exists():
            return 0.0

        try:
            age = time.time() - OB_CACHE_FILE.stat().st_mtime

            if age > 10:
                return 0.0
            if age > 5:
                return 0.3
            return 1.0
        except Exception:
            return 0.0

    def m4_control_plane(self):
        """M4: TCP reachability of the control-plane ports on loopback.

        Returns 1.0 (HZ and Prefect API both up), 0.6 (exactly one up),
        0.2 (both down).
        """
        def check_port(port):
            # connect_ex() returns 0 on success; the short timeout keeps
            # the main loop responsive when a port is filtered.
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.settimeout(0.5)
                return s.connect_ex(('127.0.0.1', port)) == 0

        hz_up = check_port(PORTS["hazelcast"])
        prefect_up = check_port(PORTS["prefect_api"])

        if not hz_up and not prefect_up:
            return 0.2
        if not hz_up or not prefect_up:
            return 0.6
        return 1.0

    def m5_health_coherence(self):
        """M5: 1.0 when the internal DOLPHIN_SAFETY map is fresh and sane
        (Rm in [0,1], posture in the known set), else 0.0."""
        hz = self._lazy_get_hz()
        if not hz:
            return 0.0

        try:
            safety_map = hz.get_map("DOLPHIN_SAFETY").blocking()
            latest = safety_map.get("latest")
            if not latest:
                return 0.0

            data = json.loads(latest) if isinstance(latest, str) else latest

            rm = data.get("Rm", -1)
            posture = data.get("posture", "UNKNOWN")
            ts = data.get("ts", 0)

            # Coherence checks
            age = time.time() - ts
            valid_postures = ["APEX", "STALKER", "TURTLE", "HIBERNATE"]

            if age > 60: return 0.0  # Safety system is dead
            if not (0.0 <= rm <= 1.0): return 0.0  # Garbage Rm
            if posture not in valid_postures: return 0.0  # Corrupt posture

            return 1.0
        except Exception:
            return 0.0

    # --- ENGINE ---

    def compute_rm_meta(self):
        """Run all five sensors, multiply them into rm_meta, map the score
        to a status string, and return the full report dict."""
        m1 = self.m1_process_integrity()
        m2 = self.m2_heartbeat_freshness()
        m3 = self.m3_data_freshness()
        m4 = self.m4_control_plane()
        m5 = self.m5_health_coherence()

        # Multiplicative: any single dead sensor (0.0) zeroes the composite.
        rm_meta = m1 * m2 * m3 * m4 * m5

        # State mapping
        if rm_meta > 0.8: status = "GREEN"
        elif rm_meta > 0.5: status = "DEGRADED"
        elif rm_meta > 0.2: status = "CRITICAL"
        else: status = "DEAD"

        self.last_rm_meta = rm_meta
        self.status = status

        return {
            "rm_meta": round(rm_meta, 3),
            "status": status,
            "sensors": {
                "m1_proc": m1,
                "m2_hb": m2,
                "m3_data": m3,
                "m4_cp": m4,
                "m5_coh": m5
            },
            "timestamp": time.time(),
            "iso": datetime.now().isoformat()
        }

    def emit_outputs(self, report):
        """Persist the report locally (STATUS_JSON), push it to Hazelcast
        when the control plane looks healthy, and log a one-line summary."""
        # Local JSON state
        try:
            with open(STATUS_JSON, 'w') as f:
                json.dump(report, f, indent=2)
        except Exception as e:
            logger.error(f"Failed to write state JSON: {e}")

        # HZ Push (best effort) - skip when M4 says the control plane is down.
        hz = self._lazy_get_hz()
        if hz and report['sensors']['m4_cp'] > 0.5:
            try:
                meta_map = hz.get_map("DOLPHIN_META_HEALTH").blocking()
                meta_map.put("latest", json.dumps(report))
            except Exception:
                pass  # Silent fail if HZ is struggling

        logger.info(f"RM_META: {report['rm_meta']} | STATUS: {report['status']} | HB: {report['sensors']['m2_hb']}")

    def attempt_restart(self, report):
        """Platform-independent service restart logic.

        Only acts when rm_meta <= 0.2 (DEAD or deeply CRITICAL): restarts
        the control-plane services when M4 failed, and re-triggers the
        scan-bridge Prefect deployment when M1 flagged a missing process.
        """
        if report['rm_meta'] > 0.2:
            return  # Don't auto-restart unless DEAD or deeply CRITICAL

        logger.critical("DEAD STATE detected. Attempting component restarts.")

        def restart_svc(name):
            # systemd on Linux, rc.d 'service' on *BSD; anything else is
            # logged and skipped.
            if self.platform == "linux":
                cmd = ["systemctl", "restart", name]
            elif "bsd" in self.platform:
                cmd = ["service", name, "restart"]
            else:
                logger.error(f"Unsupported platform for restart: {self.platform}")
                return

            try:
                subprocess.run(cmd, check=True)
                logger.info(f"Executed restart for {name}")
            except Exception as e:
                logger.error(f"Failed to restart {name}: {e}")

        # If M4 (Control Plane) is failed, try restarting primary infrastructure
        if report['sensors']['m4_cp'] <= 0.2:
            restart_svc("hazelcast")
            restart_svc("dolphin-prefect-worker")  # This will restart all Prefect flows

        # If scan-bridge-flow is not running, trigger a new deployment run
        if report['sensors']['m1_proc'] < 1.0:
            logger.warning("Scan bridge or critical process missing - triggering Prefect deployment")
            self._trigger_scan_bridge_deploy()

    def _trigger_scan_bridge_deploy(self):
        """Trigger scan-bridge-flow deployment via Prefect API."""
        try:
            env = os.environ.copy()
            env["PREFECT_API_URL"] = "http://localhost:4200/api"
            cmd = [
                "/home/dolphin/siloqy_env/bin/prefect",
                "deployment", "run",
                "scan-bridge-flow/scan-bridge"
            ]
            # Bounded by timeout so a hung Prefect CLI cannot stall the loop.
            subprocess.run(cmd, env=env, check=True, capture_output=True, timeout=30)
            logger.info("Triggered scan-bridge-flow deployment run")
        except Exception as e:
            logger.error(f"Failed to trigger scan-bridge deployment: {e}")

    def run(self):
        """Main loop: score, emit, maybe restart, sleep CHECK_INTERVAL.

        Runs forever; any per-cycle exception is logged and the loop
        continues. Exit via KeyboardInterrupt (handled by the caller).
        """
        while True:
            try:
                report = self.compute_rm_meta()
                self.emit_outputs(report)
                self.attempt_restart(report)

                # systemd Watchdog Notify (Linux only)
                if self.platform == "linux" and 'NOTIFY_SOCKET' in os.environ:
                    # In a real impl, we'd use sd_notify. Here we can use subprocess
                    # or a library. For now, we skip but note its place.
                    pass

            except Exception as e:
                logger.error(f"Error in MHD loop: {e}")

            time.sleep(CHECK_INTERVAL)
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: run the daemon loop until Ctrl-C, then exit cleanly.
    mhd = MetaHealthDaemon()
    try:
        mhd.run()
    except KeyboardInterrupt:
        logger.info("MHD stopped by user.")
        sys.exit(0)
|
||||
Reference in New Issue
Block a user