initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)
Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
450
prod/meta_health_daemon_v2.py
Executable file
450
prod/meta_health_daemon_v2.py
Executable file
@@ -0,0 +1,450 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DOLPHIN Meta Health Service (MHS) v2
|
||||
=====================================
|
||||
Enhanced monitoring for all subsystems:
|
||||
- Process integrity (M1)
|
||||
- Heartbeat freshness (M2) - Hz heartbeats from all services
|
||||
- Data freshness (M3) - Per-subsystem Hz key timestamps
|
||||
- Control plane (M4) - Ports + Hz connectivity
|
||||
- Data coherence (M5) - Integrity checks, posture validity
|
||||
|
||||
Monitored Subsystems:
|
||||
1. Scan Bridge → Hz DOLPHIN_FEATURES["latest_eigen_scan"]
|
||||
2. OBF → Hz DOLPHIN_FEATURES_SHARD_*
|
||||
3. ExtF → Hz DOLPHIN_FEATURES["exf_latest"]
|
||||
4. EsoF → Hz DOLPHIN_FEATURES["esof_latest"]
|
||||
5. Nautilus Trader → Hz DOLPHIN_PNL_BLUE, DOLPHIN_STATE_BLUE
|
||||
6. System Watchdog → Hz DOLPHIN_SAFETY
|
||||
|
||||
Outputs:
|
||||
- Local JSON: /mnt/dolphinng5_predict/run_logs/meta_health.json
|
||||
- Hz: DOLPHIN_META_HEALTH["latest"]
|
||||
- Logs: /mnt/dolphinng5_predict/run_logs/meta_health.log
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import socket
|
||||
import logging
|
||||
import platform
|
||||
import subprocess
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, asdict
|
||||
|
||||
# Optional deps — both are probed at import time so every sensor can
# degrade gracefully instead of crashing when a dependency is absent.
try:
    import psutil  # process inspection for sensor M1
    PSUTIL_AVAILABLE = True
except ImportError:
    PSUTIL_AVAILABLE = False

try:
    from hazelcast import HazelcastClient  # Hz access for M2/M3/M5 + output push
    HZ_CLIENT_AVAILABLE = True
except ImportError:
    HZ_CLIENT_AVAILABLE = False

# --- CONFIGURATION ---
# Deployment root; all local outputs live under run_logs/.
PROJECT_ROOT = Path("/mnt/dolphinng5_predict")
LOG_DIR = PROJECT_ROOT / "run_logs"
LOG_FILE = LOG_DIR / "meta_health.log"
STATUS_JSON = LOG_DIR / "meta_health.json"

CHECK_INTERVAL = 5.0  # seconds between monitoring passes
DATA_STALE_THRESHOLD = 30.0  # seconds before data considered stale
DATA_DEAD_THRESHOLD = 120.0  # seconds before data considered dead

# Critical processes to monitor:
# (service_name, [substring patterns matched case-insensitively against
#  process name or full command line])
CRITICAL_PROCESSES = [
    ("scan_bridge", ["scan_bridge"]),  # scan_bridge_prefect_flow or direct
    ("nautilus_trader", ["nautilus_event_trader"]),
    ("extf", ["exf_prefect_final"]),
    ("obf", ["obf_prefect_flow"]),
    ("esof", ["esof_prefect_flow"]),
    ("hazelcast", ["hazelcast", "HzMember"]),
]

# Hz keys to monitor for freshness:
# source name -> (map name, entry key, timestamp field inside the value)
HZ_DATA_SOURCES = {
    "scan": ("DOLPHIN_FEATURES", "latest_eigen_scan", "bridge_ts"),
    "obf": ("DOLPHIN_FEATURES", "ob_features_latest", "_pushed_at"),
    "extf": ("DOLPHIN_FEATURES", "exf_latest", "_pushed_at"),
    "esof": ("DOLPHIN_FEATURES", "esof_latest", "_pushed_at"),
    "safety": ("DOLPHIN_SAFETY", "latest", "ts"),
    "state": ("DOLPHIN_STATE_BLUE", "latest_nautilus", "updated_at"),
}

# Localhost TCP ports probed by the M4 control-plane sensor.
PORTS = {
    "hazelcast": 5701,
    "prefect_api": 4200,
}

# --- LOGGING ---
# Logs go both to run_logs/meta_health.log and to stdout.
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger("MHS")
|
||||
|
||||
|
||||
@dataclass
class HealthReport:
    """One health snapshot, emitted every CHECK_INTERVAL seconds."""
    rm_meta: float            # product of m1..m5 sensor scores, in [0, 1]
    status: str               # "GREEN" | "DEGRADED" | "CRITICAL" | "DEAD"
    m1_proc: float            # process-integrity score (all-or-nothing)
    m2_heartbeat: float       # Hz heartbeat freshness score
    m3_data_freshness: float  # average per-source data freshness score
    m4_control_plane: float   # Hz/Prefect port reachability score
    m5_coherence: float       # payload integrity / posture validity score
    subsystem_health: dict    # {"processes": {...}, "data_sources": {...}} detail
    timestamp: str            # UTC ISO-8601 time the report was built
|
||||
|
||||
|
||||
class MetaHealthService:
|
||||
def __init__(self):
|
||||
self.platform = platform.system().lower()
|
||||
self.hz_client = None
|
||||
self.last_report = None
|
||||
|
||||
logger.info(f"MHS v2 starting. PID: {os.getpid()}")
|
||||
if not PSUTIL_AVAILABLE:
|
||||
logger.warning("psutil not available - process checks limited")
|
||||
if not HZ_CLIENT_AVAILABLE:
|
||||
logger.error("Hazelcast not available - critical failure")
|
||||
|
||||
def _get_hz(self):
|
||||
"""Lazy Hz connection with retry."""
|
||||
if not HZ_CLIENT_AVAILABLE:
|
||||
return None
|
||||
if self.hz_client:
|
||||
return self.hz_client
|
||||
try:
|
||||
self.hz_client = HazelcastClient(
|
||||
cluster_name="dolphin",
|
||||
cluster_members=["127.0.0.1:5701"],
|
||||
connect_timeout=1.0,
|
||||
connection_retry_limit=1
|
||||
)
|
||||
return self.hz_client
|
||||
except Exception as e:
|
||||
logger.debug(f"Hz connection failed: {e}")
|
||||
return None
|
||||
|
||||
# --- SENSOR M1: Process Integrity ---
|
||||
def m1_process_integrity(self):
|
||||
"""Check critical processes are running."""
|
||||
if not PSUTIL_AVAILABLE:
|
||||
return 1.0, {}
|
||||
|
||||
results = {}
|
||||
all_ok = True
|
||||
|
||||
for service_name, patterns in CRITICAL_PROCESSES:
|
||||
found = False
|
||||
for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
|
||||
try:
|
||||
cmdline = ' '.join(proc.info['cmdline'] or [])
|
||||
for pattern in patterns:
|
||||
if pattern.lower() in proc.info['name'].lower() or pattern.lower() in cmdline.lower():
|
||||
found = True
|
||||
break
|
||||
if found:
|
||||
break
|
||||
except (psutil.NoSuchProcess, psutil.AccessDenied):
|
||||
continue
|
||||
|
||||
results[service_name] = found
|
||||
if not found:
|
||||
all_ok = False
|
||||
logger.warning(f"M1: {service_name} not running")
|
||||
|
||||
return 1.0 if all_ok else 0.0, results
|
||||
|
||||
# --- SENSOR M2: Hz Heartbeat Freshness ---
|
||||
def m2_heartbeat_freshness(self):
|
||||
"""Check Hz heartbeats from services."""
|
||||
hz = self._get_hz()
|
||||
if not hz:
|
||||
return 0.0
|
||||
|
||||
try:
|
||||
hb_map = hz.get_map("DOLPHIN_HEARTBEAT").blocking()
|
||||
latest = hb_map.get("nautilus_flow_heartbeat")
|
||||
if not latest:
|
||||
return 0.5 # No heartbeat but Hz is up
|
||||
|
||||
data = json.loads(latest) if isinstance(latest, str) else latest
|
||||
ts = data.get("ts", 0)
|
||||
age = time.time() - ts
|
||||
|
||||
if age > 60:
|
||||
return 0.0
|
||||
elif age > 30:
|
||||
return 0.5
|
||||
return 1.0
|
||||
except Exception as e:
|
||||
logger.debug(f"M2 error: {e}")
|
||||
return 0.5
|
||||
|
||||
# --- SENSOR M3: Data Freshness ---
|
||||
def m3_data_freshness(self):
|
||||
"""Check all Hz data sources are fresh."""
|
||||
hz = self._get_hz()
|
||||
if not hz:
|
||||
return 0.0, {}
|
||||
|
||||
results = {}
|
||||
scores = []
|
||||
|
||||
for name, (map_name, key, ts_field) in HZ_DATA_SOURCES.items():
|
||||
try:
|
||||
map_obj = hz.get_map(map_name).blocking()
|
||||
data_raw = map_obj.get(key)
|
||||
|
||||
if not data_raw:
|
||||
results[name] = {"status": "missing", "score": 0.0}
|
||||
scores.append(0.0)
|
||||
continue
|
||||
|
||||
data = json.loads(data_raw) if isinstance(data_raw, str) else data_raw
|
||||
ts_str = data.get(ts_field) if isinstance(data, dict) else None
|
||||
|
||||
if not ts_str:
|
||||
results[name] = {"status": "no_timestamp", "score": 0.5}
|
||||
scores.append(0.5)
|
||||
continue
|
||||
|
||||
# Parse timestamp
|
||||
try:
|
||||
if isinstance(ts_str, (int, float)):
|
||||
ts = ts_str
|
||||
else:
|
||||
# Try ISO format
|
||||
ts = datetime.fromisoformat(ts_str.replace('Z', '+00:00')).timestamp()
|
||||
except:
|
||||
results[name] = {"status": "bad_timestamp", "score": 0.5}
|
||||
scores.append(0.5)
|
||||
continue
|
||||
|
||||
age = time.time() - ts
|
||||
|
||||
if age > DATA_DEAD_THRESHOLD:
|
||||
score = 0.0
|
||||
status = "dead"
|
||||
elif age > DATA_STALE_THRESHOLD:
|
||||
score = 0.5
|
||||
status = "stale"
|
||||
else:
|
||||
score = 1.0
|
||||
status = "fresh"
|
||||
|
||||
results[name] = {"status": status, "age_s": round(age, 1), "score": score}
|
||||
scores.append(score)
|
||||
|
||||
if status == "dead":
|
||||
logger.warning(f"M3: {name} data dead ({age:.0f}s old)")
|
||||
elif status == "stale":
|
||||
logger.debug(f"M3: {name} data stale ({age:.0f}s old)")
|
||||
|
||||
except Exception as e:
|
||||
logger.debug(f"M3 error for {name}: {e}")
|
||||
results[name] = {"status": "error", "score": 0.0}
|
||||
scores.append(0.0)
|
||||
|
||||
# Average score
|
||||
avg_score = sum(scores) / len(scores) if scores else 0.0
|
||||
return avg_score, results
|
||||
|
||||
# --- SENSOR M4: Control Plane ---
|
||||
def m4_control_plane(self):
|
||||
"""Check Hz and Prefect ports."""
|
||||
def check_port(port):
|
||||
try:
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||
s.settimeout(1.0)
|
||||
return s.connect_ex(('127.0.0.1', port)) == 0
|
||||
except:
|
||||
return False
|
||||
|
||||
hz_up = check_port(PORTS["hazelcast"])
|
||||
prefect_up = check_port(PORTS["prefect_api"])
|
||||
|
||||
if hz_up and prefect_up:
|
||||
return 1.0
|
||||
elif hz_up or prefect_up:
|
||||
return 0.5
|
||||
return 0.0
|
||||
|
||||
# --- SENSOR M5: Data Coherence ---
|
||||
def m5_coherence(self):
|
||||
"""Check data integrity and posture validity."""
|
||||
hz = self._get_hz()
|
||||
if not hz:
|
||||
return 0.0
|
||||
|
||||
checks = []
|
||||
|
||||
# Check DOLPHIN_SAFETY coherence
|
||||
try:
|
||||
safety_map = hz.get_map("DOLPHIN_SAFETY").blocking()
|
||||
safety_raw = safety_map.get("latest")
|
||||
if safety_raw:
|
||||
safety = json.loads(safety_raw) if isinstance(safety_raw, str) else safety_raw
|
||||
rm = safety.get("Rm", -1)
|
||||
posture = safety.get("posture", "UNKNOWN")
|
||||
valid_postures = ["APEX", "STALKER", "TURTLE", "HIBERNATE"]
|
||||
|
||||
if 0.0 <= rm <= 1.0 and posture in valid_postures:
|
||||
checks.append(1.0)
|
||||
else:
|
||||
checks.append(0.0)
|
||||
logger.warning(f"M5: Invalid safety data - Rm={rm}, posture={posture}")
|
||||
else:
|
||||
checks.append(0.5)
|
||||
except Exception as e:
|
||||
logger.debug(f"M5 safety error: {e}")
|
||||
checks.append(0.0)
|
||||
|
||||
# Check scan data integrity (basic JSON parse test)
|
||||
try:
|
||||
features_map = hz.get_map("DOLPHIN_FEATURES").blocking()
|
||||
scan_raw = features_map.get("latest_eigen_scan")
|
||||
if scan_raw:
|
||||
scan = json.loads(scan_raw)
|
||||
# Basic sanity checks
|
||||
has_scan_num = "scan_number" in scan
|
||||
has_timestamp = "timestamp" in scan or "bridge_ts" in scan
|
||||
checks.append(1.0 if (has_scan_num and has_timestamp) else 0.5)
|
||||
else:
|
||||
checks.append(0.5)
|
||||
except Exception as e:
|
||||
logger.debug(f"M5 scan error: {e}")
|
||||
checks.append(0.0)
|
||||
|
||||
return sum(checks) / len(checks) if checks else 0.0
|
||||
|
||||
# --- MAIN ENGINE ---
|
||||
def compute_health(self):
|
||||
"""Compute overall health score."""
|
||||
m1_proc, proc_details = self.m1_process_integrity()
|
||||
m2_hb = self.m2_heartbeat_freshness()
|
||||
m3_data, data_details = self.m3_data_freshness()
|
||||
m4_cp = self.m4_control_plane()
|
||||
m5_coh = self.m5_coherence()
|
||||
|
||||
# Compute Rm_meta (product of all sensors)
|
||||
rm_meta = m1_proc * m2_hb * m3_data * m4_cp * m5_coh
|
||||
|
||||
# Status mapping
|
||||
if rm_meta > 0.8:
|
||||
status = "GREEN"
|
||||
elif rm_meta > 0.5:
|
||||
status = "DEGRADED"
|
||||
elif rm_meta > 0.2:
|
||||
status = "CRITICAL"
|
||||
else:
|
||||
status = "DEAD"
|
||||
|
||||
report = HealthReport(
|
||||
rm_meta=round(rm_meta, 3),
|
||||
status=status,
|
||||
m1_proc=round(m1_proc, 2),
|
||||
m2_heartbeat=round(m2_hb, 2),
|
||||
m3_data_freshness=round(m3_data, 2),
|
||||
m4_control_plane=round(m4_cp, 2),
|
||||
m5_coherence=round(m5_coh, 2),
|
||||
subsystem_health={
|
||||
"processes": proc_details,
|
||||
"data_sources": data_details
|
||||
},
|
||||
timestamp=datetime.now(timezone.utc).isoformat()
|
||||
)
|
||||
|
||||
return report
|
||||
|
||||
def emit_outputs(self, report: HealthReport):
|
||||
"""Write health report to all outputs."""
|
||||
report_dict = asdict(report)
|
||||
|
||||
# Local JSON
|
||||
try:
|
||||
with open(STATUS_JSON, 'w') as f:
|
||||
json.dump(report_dict, f, indent=2)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to write status JSON: {e}")
|
||||
|
||||
# Hz push
|
||||
hz = self._get_hz()
|
||||
if hz and report.m4_control_plane > 0.5:
|
||||
try:
|
||||
meta_map = hz.get_map("DOLPHIN_META_HEALTH").blocking()
|
||||
meta_map.put("latest", json.dumps(report_dict))
|
||||
except Exception as e:
|
||||
logger.debug(f"Hz push failed: {e}")
|
||||
|
||||
# Log summary
|
||||
logger.info(
|
||||
f"RM_META={report.rm_meta} [{report.status}] "
|
||||
f"M1={report.m1_proc} M2={report.m2_heartbeat} M3={report.m3_data_freshness} "
|
||||
f"M4={report.m4_control_plane} M5={report.m5_coherence}"
|
||||
)
|
||||
|
||||
def attempt_recovery(self, report: HealthReport):
|
||||
"""Attempt to recover from degraded states."""
|
||||
if report.status in ["GREEN", "DEGRADED"]:
|
||||
return
|
||||
|
||||
logger.critical(f"RECOVERY: System status is {report.status}")
|
||||
|
||||
# Restart services based on failures
|
||||
services_to_restart = []
|
||||
|
||||
if report.m4_control_plane < 0.5:
|
||||
services_to_restart.extend(["hazelcast"])
|
||||
|
||||
if report.m1_proc < 1.0:
|
||||
# Check which processes are missing
|
||||
for service, running in report.subsystem_health.get("processes", {}).items():
|
||||
if not running:
|
||||
if service == "scan_bridge":
|
||||
services_to_restart.append("dolphin-scan-bridge")
|
||||
elif service == "nautilus_trader":
|
||||
services_to_restart.append("dolphin-nautilus-trader")
|
||||
|
||||
for svc in services_to_restart:
|
||||
try:
|
||||
subprocess.run(["systemctl", "restart", svc], check=True, timeout=30)
|
||||
logger.info(f"RECOVERY: Restarted {svc}")
|
||||
except Exception as e:
|
||||
logger.error(f"RECOVERY: Failed to restart {svc}: {e}")
|
||||
|
||||
def run(self):
|
||||
"""Main monitoring loop."""
|
||||
while True:
|
||||
try:
|
||||
report = self.compute_health()
|
||||
self.emit_outputs(report)
|
||||
self.attempt_recovery(report)
|
||||
self.last_report = report
|
||||
except Exception as e:
|
||||
logger.error(f"Error in main loop: {e}")
|
||||
|
||||
time.sleep(CHECK_INTERVAL)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: run the monitor until the operator interrupts it.
    service = MetaHealthService()
    try:
        service.run()
    except KeyboardInterrupt:
        logger.info("MHS stopped by user")
        sys.exit(0)
|
||||
Reference in New Issue
Block a user