initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree

Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
hjnormey
2026-04-21 16:58:38 +02:00
commit 01c19662cb
643 changed files with 260241 additions and 0 deletions

359
prod/meta_health_daemon.py Executable file
View File

@@ -0,0 +1,359 @@
import os
import sys
import time
import json
import socket
import logging
import platform
import subprocess
from datetime import datetime
from pathlib import Path
# Try to import psutil for process and system metrics
try:
    import psutil
    PSUTIL_AVAILABLE = True   # enables the M1 process-integrity sensor
except ImportError:
    PSUTIL_AVAILABLE = False  # M1 degrades to "assume OK" (returns 1.0)
# External dependencies (optional/lazy-loaded)
try:
    from hazelcast import HazelcastClient
    from hazelcast.serialization.api import Portable
    HZ_CLIENT_AVAILABLE = True   # enables Hazelcast-backed sensors (M2/M5)
except ImportError:
    HZ_CLIENT_AVAILABLE = False  # M2/M5 will report 0.0 (no cluster access)
# --- CONFIGURATION (Canonical Paths from SYSTEM_FILE_MAP) ---
# Primary (Windows dev) project root; falls back to the production mount.
PROJECT_ROOT = Path("C:/Users/Lenovo/Documents/- DOLPHIN NG HD HCM TSF Predict")
if not PROJECT_ROOT.exists():
    # Fallback for Linux/Production path if running in a different env
    PROJECT_ROOT = Path("/mnt/dolphinng5_predict")
LOG_DIR = PROJECT_ROOT / "run_logs"          # created at startup if missing
LOG_FILE = LOG_DIR / "meta_health.log"       # daemon's own log file
STATUS_JSON = LOG_DIR / "meta_health.json"   # latest report, rewritten each cycle
# Order-book feature cache watched by the M3 data-freshness sensor.
OB_CACHE_FILE = PROJECT_ROOT / "ob_cache" / "latest_ob_features.json"
CHECK_INTERVAL = 2.0  # seconds
WATCHDOG_TIMEOUT = 10.0  # seconds for systemd/watchdogd
# Monitor Targets
# Substrings matched against process names/cmdlines by the M1 sensor.
CRITICAL_PROCESSES = [
    "system_watchdog_service.py",
    "acb_processor_service.py",
    "obf_prefect_flow.py",
    "scan-bridge-flow",  # NEW: Scan bridge under Prefect management
    "prefect",  # Catching the prefect worker/server
    "hazelcast"  # Catching the HZ process
]
# TCP ports probed by the M4 control-plane sensor (only hazelcast and
# prefect_api are actually checked; prefect_ui is informational).
PORTS = {
    "hazelcast": 5701,
    "prefect_api": 4200,
    "prefect_ui": 8080  # Management Center for HZ is usually 8080 too, check bible vs spec
}
# --- LOGGING SETUP ---
# Ensure the log directory exists before attaching the file handler.
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),  # persistent daemon log
        logging.StreamHandler()         # mirrored to console for interactive runs
    ]
)
logger = logging.getLogger("MetaHealth")
class MetaHealthDaemon:
    """Meta Health Daemon (MHD).

    Each cycle it samples five independent sensors (M1..M5), multiplies
    them into a composite score ``rm_meta`` in [0, 1], maps the score to a
    status (GREEN / DEGRADED / CRITICAL / DEAD), publishes the report
    (local JSON file plus a best-effort Hazelcast map entry), and attempts
    service restarts when the system is judged DEAD.
    """

    def __init__(self):
        self.platform = platform.system().lower()  # 'linux' or 'freebsd'
        self.start_time = time.time()
        self.hz_client = None         # lazily created, cached Hazelcast client
        self.last_rm_meta = 1.0       # most recent composite score
        self.status = "INITIALIZING"  # last computed status string
        logger.info(f"MHD starting on {self.platform}. Process ID: {os.getpid()}")
        if not PSUTIL_AVAILABLE:
            logger.warning("psutil NOT found. Process checks will be limited.")

    def _lazy_get_hz(self):
        """Best-effort Hazelcast connection.

        Returns the cached client when one exists, otherwise makes a single
        short-timeout connection attempt. Returns None when the hazelcast
        package is missing or the cluster is unreachable.
        """
        if not HZ_CLIENT_AVAILABLE:
            return None
        if self.hz_client is not None:
            return self.hz_client
        try:
            # Short timeout to avoid blocking the main loop
            self.hz_client = HazelcastClient(
                cluster_name="dolphin",
                cluster_members=["127.0.0.1:5701"],
                connect_timeout=0.5,
                connection_retry_limit=1
            )
            logger.info("MHD connected to Hazelcast cluster 'dolphin'")
            return self.hz_client
        except Exception:
            self.hz_client = None
            return None

    # --- SENSORS ---
    def m1_process_integrity(self):
        """M1: 1.0 when every CRITICAL_PROCESSES entry matches a running
        process (by name or cmdline substring), 0.0 when any is missing."""
        if not PSUTIL_AVAILABLE:
            return 1.0  # Cannot check, assume OK for Rm math
        missing = []
        for proc_name in CRITICAL_PROCESSES:
            found = False
            for p in psutil.process_iter(['name', 'cmdline']):
                try:
                    # BUGFIX: p.info['name'] can be None (e.g. kernel threads),
                    # so guard before .lower(); previously that raised
                    # AttributeError, which the except clause did not catch.
                    name = p.info['name']
                    if name and proc_name in name.lower():
                        found = True
                        break
                    cmdline = p.info['cmdline']
                    if cmdline and any(proc_name in arg.lower() for arg in cmdline):
                        found = True
                        break
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    continue
            if not found:
                missing.append(proc_name)
        if missing:
            logger.warning(f"M1 MISSING: {missing}")
            return 0.0  # Total failure of a critical component
        return 1.0

    def m2_heartbeat_freshness(self):
        """M2: freshness of the Nautilus flow heartbeat in Hazelcast.

        1.0 when under 10s old, 0.5 when 10-30s, 0.0 when older, missing,
        malformed, or the cluster is unreachable.
        """
        hz = self._lazy_get_hz()
        if not hz:
            return 0.0  # If we can't connect, heartbeats are effectively missing
        try:
            hb_map = hz.get_map("DOLPHIN_HEARTBEAT").blocking()
            latest = hb_map.get("nautilus_flow_heartbeat")
            if not latest:
                return 0.0
            # Expecting JSON string or dict
            data = json.loads(latest) if isinstance(latest, str) else latest
            # NOTE(review): assumes "ts" is epoch seconds — confirm producer
            age = time.time() - data.get("ts", 0)
            if age > 30:
                return 0.0
            if age > 10:
                return 0.5
            return 1.0
        except Exception as e:
            logger.debug(f"M2 HZ Read Error: {e}")
            return 0.0

    def m3_data_freshness(self):
        """M3: mtime-based freshness of the OB feature cache file.

        1.0 when under 5s old, 0.3 when 5-10s, 0.0 when older or missing.
        """
        if not OB_CACHE_FILE.exists():
            return 0.0
        try:
            age = time.time() - OB_CACHE_FILE.stat().st_mtime
            if age > 10:
                return 0.0
            if age > 5:
                return 0.3
            return 1.0
        except Exception:
            return 0.0

    def m4_control_plane(self):
        """M4: TCP-connect probe of the Hazelcast and Prefect API ports.

        1.0 when both listen, 0.6 when one does, 0.2 when neither does.
        """
        def check_port(port):
            # connect_ex returns 0 on success; short timeout keeps loop snappy
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.settimeout(0.5)
                return s.connect_ex(('127.0.0.1', port)) == 0

        hz_up = check_port(PORTS["hazelcast"])
        prefect_up = check_port(PORTS["prefect_api"])
        if not hz_up and not prefect_up:
            return 0.2
        if not hz_up or not prefect_up:
            return 0.6
        return 1.0

    def m5_health_coherence(self):
        """M5: sanity-check DOLPHIN_SAFETY's latest entry (fresh timestamp,
        Rm within [0, 1], posture in the known set). 1.0 or 0.0."""
        hz = self._lazy_get_hz()
        if not hz:
            return 0.0
        try:
            safety_map = hz.get_map("DOLPHIN_SAFETY").blocking()
            latest = safety_map.get("latest")
            if not latest:
                return 0.0
            data = json.loads(latest) if isinstance(latest, str) else latest
            rm = data.get("Rm", -1)
            posture = data.get("posture", "UNKNOWN")
            ts = data.get("ts", 0)
            # Coherence checks
            age = time.time() - ts
            valid_postures = ["APEX", "STALKER", "TURTLE", "HIBERNATE"]
            if age > 60:
                return 0.0  # Safety system is dead
            if not (0.0 <= rm <= 1.0):
                return 0.0  # Garbage Rm
            if posture not in valid_postures:
                return 0.0  # Corrupt posture
            return 1.0
        except Exception:
            return 0.0

    # --- ENGINE ---
    def compute_rm_meta(self):
        """Sample all five sensors, derive the composite score and status,
        cache both on the instance, and return the full report dict."""
        m1 = self.m1_process_integrity()
        m2 = self.m2_heartbeat_freshness()
        m3 = self.m3_data_freshness()
        m4 = self.m4_control_plane()
        m5 = self.m5_health_coherence()
        # Multiplicative: any single dead sensor (0.0) drives the score to 0.
        rm_meta = m1 * m2 * m3 * m4 * m5
        # State mapping
        if rm_meta > 0.8:
            status = "GREEN"
        elif rm_meta > 0.5:
            status = "DEGRADED"
        elif rm_meta > 0.2:
            status = "CRITICAL"
        else:
            status = "DEAD"
        self.last_rm_meta = rm_meta
        self.status = status
        return {
            "rm_meta": round(rm_meta, 3),
            "status": status,
            "sensors": {
                "m1_proc": m1,
                "m2_hb": m2,
                "m3_data": m3,
                "m4_cp": m4,
                "m5_coh": m5
            },
            "timestamp": time.time(),
            "iso": datetime.now().isoformat()
        }

    def emit_outputs(self, report):
        """Persist the report to STATUS_JSON and push it to Hazelcast
        (best effort, only when the control plane looks alive)."""
        # Local JSON state
        try:
            with open(STATUS_JSON, 'w') as f:
                json.dump(report, f, indent=2)
        except Exception as e:
            logger.error(f"Failed to write state JSON: {e}")
        # HZ Push (best effort)
        hz = self._lazy_get_hz()
        if hz and report['sensors']['m4_cp'] > 0.5:
            try:
                meta_map = hz.get_map("DOLPHIN_META_HEALTH").blocking()
                meta_map.put("latest", json.dumps(report))
            except Exception:
                pass  # Silent fail if HZ is struggling
        logger.info(f"RM_META: {report['rm_meta']} | STATUS: {report['status']} | HB: {report['sensors']['m2_hb']}")

    def attempt_restart(self, report):
        """Platform-independent service restart logic; acts only when the
        composite score indicates a DEAD / deeply CRITICAL state."""
        if report['rm_meta'] > 0.2:
            return  # Don't auto-restart unless DEAD or deeply CRITICAL
        logger.critical("DEAD STATE detected. Attempting component restarts.")
        # (Removed a services_to_check dict that was assigned but never read.
        # Specific service names mapped to m-sensors would go there; for this
        # PoC we restart the primary infrastructure services directly.)

        def restart_svc(name):
            # systemd on Linux, rc.d 'service' on *BSD.
            if self.platform == "linux":
                cmd = ["systemctl", "restart", name]
            elif "bsd" in self.platform:
                cmd = ["service", name, "restart"]
            else:
                logger.error(f"Unsupported platform for restart: {self.platform}")
                return
            try:
                subprocess.run(cmd, check=True)
                logger.info(f"Executed restart for {name}")
            except Exception as e:
                logger.error(f"Failed to restart {name}: {e}")

        # If M4 (Control Plane) is failed, try restarting primary infrastructure
        if report['sensors']['m4_cp'] <= 0.2:
            restart_svc("hazelcast")
            restart_svc("dolphin-prefect-worker")  # This will restart all Prefect flows
        # If scan-bridge-flow is not running, trigger a new deployment run
        if report['sensors']['m1_proc'] < 1.0:
            logger.warning("Scan bridge or critical process missing - triggering Prefect deployment")
            self._trigger_scan_bridge_deploy()

    def _trigger_scan_bridge_deploy(self):
        """Trigger scan-bridge-flow deployment via Prefect API."""
        try:
            env = os.environ.copy()
            env["PREFECT_API_URL"] = "http://localhost:4200/api"
            cmd = [
                "/home/dolphin/siloqy_env/bin/prefect",
                "deployment", "run",
                "scan-bridge-flow/scan-bridge"
            ]
            subprocess.run(cmd, env=env, check=True, capture_output=True, timeout=30)
            logger.info("Triggered scan-bridge-flow deployment run")
        except Exception as e:
            logger.error(f"Failed to trigger scan-bridge deployment: {e}")

    def run(self):
        """Main loop: sample -> emit -> (maybe) restart, every CHECK_INTERVAL
        seconds. Loop errors are logged and never fatal."""
        while True:
            try:
                report = self.compute_rm_meta()
                self.emit_outputs(report)
                self.attempt_restart(report)
                # systemd Watchdog Notify (Linux only)
                if self.platform == "linux" and 'NOTIFY_SOCKET' in os.environ:
                    # In a real impl, we'd use sd_notify. Here we can use subprocess
                    # or a library. For now, we skip but note its place.
                    pass
            except Exception as e:
                logger.error(f"Error in MHD loop: {e}")
            time.sleep(CHECK_INTERVAL)
# Entry point: run the daemon forever; Ctrl+C exits cleanly with code 0.
if __name__ == "__main__":
    daemon = MetaHealthDaemon()
    try:
        daemon.run()
    except KeyboardInterrupt:
        logger.info("MHD stopped by user.")
        sys.exit(0)