#!/usr/bin/env python3
"""
DOLPHIN Meta Health Service (MHS) v2
=====================================
Enhanced monitoring for all subsystems:
- Process integrity (M1)
- Heartbeat freshness (M2) - Hz heartbeats from all services
- Data freshness (M3) - Per-subsystem Hz key timestamps
- Control plane (M4) - Ports + Hz connectivity
- Data coherence (M5) - Integrity checks, posture validity

Monitored Subsystems:
1. Scan Bridge → Hz DOLPHIN_FEATURES["latest_eigen_scan"]
2. OBF → Hz DOLPHIN_FEATURES_SHARD_*
3. ExtF → Hz DOLPHIN_FEATURES["exf_latest"]
4. EsoF → Hz DOLPHIN_FEATURES["esof_latest"]
5. Nautilus Trader → Hz DOLPHIN_PNL_BLUE, DOLPHIN_STATE_BLUE
6. System Watchdog → Hz DOLPHIN_SAFETY

Outputs:
- Local JSON: /mnt/dolphinng5_predict/run_logs/meta_health.json
- Hz: DOLPHIN_META_HEALTH["latest"]
- Logs: /mnt/dolphinng5_predict/run_logs/meta_health.log
"""
import os
import sys
import time
import json
import socket
import logging
import platform
import subprocess
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from pathlib import Path

# Optional dependencies: record availability in a flag rather than failing
# hard, so the service can still run in a degraded mode.
try:
    import psutil
except ImportError:
    PSUTIL_AVAILABLE = False
else:
    PSUTIL_AVAILABLE = True

try:
    from hazelcast import HazelcastClient
except ImportError:
    HZ_CLIENT_AVAILABLE = False
else:
    HZ_CLIENT_AVAILABLE = True
# --- CONFIGURATION ---
# All artifacts live under the project mount; LOG_DIR must be creatable.
PROJECT_ROOT = Path("/mnt/dolphinng5_predict")
LOG_DIR = PROJECT_ROOT / "run_logs"
LOG_FILE = LOG_DIR / "meta_health.log"
STATUS_JSON = LOG_DIR / "meta_health.json"

CHECK_INTERVAL = 5.0  # seconds between monitoring cycles
DATA_STALE_THRESHOLD = 30.0  # seconds before data considered stale
DATA_DEAD_THRESHOLD = 120.0  # seconds before data considered dead

# Critical processes to monitor.
# Each entry: (service_name, [patterns]) — patterns are matched
# case-insensitively as substrings of the process name or cmdline
# (see MetaHealthService.m1_process_integrity).
CRITICAL_PROCESSES = [
    ("scan_bridge", ["scan_bridge"]),  # scan_bridge_prefect_flow or direct
    ("nautilus_trader", ["nautilus_event_trader"]),
    ("extf", ["exf_prefect_final"]),
    ("obf", ["obf_prefect_flow"]),
    ("esof", ["esof_prefect_flow"]),
    ("hazelcast", ["hazelcast", "HzMember"]),
]

# Hz keys to monitor for freshness.
# Each entry: source_name -> (map_name, key, timestamp_field); the
# timestamp field may be an epoch number or an ISO-8601 string
# (see MetaHealthService.m3_data_freshness).
HZ_DATA_SOURCES = {
    "scan": ("DOLPHIN_FEATURES", "latest_eigen_scan", "bridge_ts"),
    "obf": ("DOLPHIN_FEATURES", "ob_features_latest", "_pushed_at"),
    "extf": ("DOLPHIN_FEATURES", "exf_latest", "_pushed_at"),
    "esof": ("DOLPHIN_FEATURES", "esof_latest", "_pushed_at"),
    "safety": ("DOLPHIN_SAFETY", "latest", "ts"),
    "state": ("DOLPHIN_STATE_BLUE", "latest_nautilus", "updated_at"),
}

# Local TCP ports probed by the M4 control-plane sensor.
PORTS = {
    "hazelcast": 5701,
    "prefect_api": 4200,
}
# --- LOGGING ---
# Ensure the log directory exists before the FileHandler opens its file.
os.makedirs(LOG_DIR, exist_ok=True)

# Mirror every record to the service log file and to the console.
_log_handlers = [
    logging.FileHandler(LOG_FILE),
    logging.StreamHandler(),
]
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
    handlers=_log_handlers,
)
logger = logging.getLogger("MHS")
@dataclass
class HealthReport:
    """One health snapshot: per-sensor scores plus the combined Rm_meta."""

    # Combined score: product of m1..m5 (rounded to 3 places by compute_health).
    rm_meta: float
    # One of "GREEN" / "DEGRADED" / "CRITICAL" / "DEAD".
    status: str
    # M1: process integrity (1.0 = every critical process found, else 0.0).
    m1_proc: float
    # M2: heartbeat freshness (1.0 fresh, 0.5 stale/unknown, 0.0 dead).
    m2_heartbeat: float
    # M3: average freshness score across HZ_DATA_SOURCES.
    m3_data_freshness: float
    # M4: control-plane ports (1.0 both up, 0.5 one up, 0.0 neither).
    m4_control_plane: float
    # M5: data coherence / posture validity score.
    m5_coherence: float
    # Nested details: {"processes": {name: bool}, "data_sources": {name: {...}}}.
    subsystem_health: dict
    # UTC ISO-8601 timestamp of when the report was computed.
    timestamp: str
class MetaHealthService:
    """Meta Health Service: polls sensors M1-M5, combines them into one
    Rm_meta score (product of sensor scores), publishes the report to a
    local JSON file and Hazelcast, and restarts failed services.
    """

    def __init__(self):
        # Lowercase OS name ("linux", "windows", ...); kept for callers.
        self.platform = platform.system().lower()
        self.hz_client = None    # lazily created by _get_hz()
        self.last_report = None  # most recent HealthReport, set by run()

        logger.info(f"MHS v2 starting. PID: {os.getpid()}")
        if not PSUTIL_AVAILABLE:
            logger.warning("psutil not available - process checks limited")
        if not HZ_CLIENT_AVAILABLE:
            logger.error("Hazelcast not available - critical failure")

    def _get_hz(self):
        """Lazy Hz connection with retry.

        Returns the cached HazelcastClient, creating it on first use.
        Returns None when the client library is missing or the connection
        attempt fails (a subsequent call retries).
        """
        if not HZ_CLIENT_AVAILABLE:
            return None
        if self.hz_client:
            return self.hz_client
        try:
            # NOTE(review): keyword names kept from the original code —
            # confirm they match the installed hazelcast-python-client
            # version's configuration keys.
            self.hz_client = HazelcastClient(
                cluster_name="dolphin",
                cluster_members=["127.0.0.1:5701"],
                connect_timeout=1.0,
                connection_retry_limit=1
            )
            return self.hz_client
        except Exception as e:
            logger.debug(f"Hz connection failed: {e}")
            return None

    # --- SENSOR M1: Process Integrity ---
    def m1_process_integrity(self):
        """Check critical processes are running.

        Returns:
            (score, details): score is 1.0 only when every service in
            CRITICAL_PROCESSES matched a live process, else 0.0; details
            maps service name -> bool(found).
        """
        if not PSUTIL_AVAILABLE:
            # Without psutil we cannot inspect processes; don't punish the
            # overall score for a local tooling gap (warned at startup).
            return 1.0, {}

        results = {}
        all_ok = True

        for service_name, patterns in CRITICAL_PROCESSES:
            found = False
            for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
                try:
                    cmdline = ' '.join(proc.info['cmdline'] or [])
                    # FIX: proc.info['name'] can be None (restricted/zombie
                    # entries); the original called .lower() on it directly,
                    # raising an uncaught AttributeError.
                    proc_name = (proc.info['name'] or '').lower()
                    for pattern in patterns:
                        if pattern.lower() in proc_name or pattern.lower() in cmdline.lower():
                            found = True
                            break
                    if found:
                        break
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    continue

            results[service_name] = found
            if not found:
                all_ok = False
                logger.warning(f"M1: {service_name} not running")

        return 1.0 if all_ok else 0.0, results

    # --- SENSOR M2: Hz Heartbeat Freshness ---
    def m2_heartbeat_freshness(self):
        """Check Hz heartbeats from services.

        Returns 1.0 for a heartbeat younger than 30s, 0.5 when it is
        30-60s old or absent/unreadable while Hz is up, 0.0 when it is
        older than 60s or Hz is unreachable.
        """
        hz = self._get_hz()
        if not hz:
            return 0.0

        try:
            hb_map = hz.get_map("DOLPHIN_HEARTBEAT").blocking()
            latest = hb_map.get("nautilus_flow_heartbeat")
            if not latest:
                return 0.5  # No heartbeat but Hz is up

            data = json.loads(latest) if isinstance(latest, str) else latest
            ts = data.get("ts", 0)
            age = time.time() - ts

            if age > 60:
                return 0.0
            elif age > 30:
                return 0.5
            return 1.0
        except Exception as e:
            logger.debug(f"M2 error: {e}")
            return 0.5

    # --- SENSOR M3: Data Freshness ---
    def m3_data_freshness(self):
        """Check all Hz data sources are fresh.

        Returns:
            (avg_score, details): per-source scores averaged over
            HZ_DATA_SOURCES; details maps source name ->
            {"status", optional "age_s", "score"}.
        """
        hz = self._get_hz()
        if not hz:
            return 0.0, {}

        results = {}
        scores = []

        for name, (map_name, key, ts_field) in HZ_DATA_SOURCES.items():
            try:
                map_obj = hz.get_map(map_name).blocking()
                data_raw = map_obj.get(key)

                if not data_raw:
                    results[name] = {"status": "missing", "score": 0.0}
                    scores.append(0.0)
                    continue

                data = json.loads(data_raw) if isinstance(data_raw, str) else data_raw
                ts_str = data.get(ts_field) if isinstance(data, dict) else None

                if not ts_str:
                    results[name] = {"status": "no_timestamp", "score": 0.5}
                    scores.append(0.5)
                    continue

                # Parse timestamp: epoch number or ISO-8601 string.
                try:
                    if isinstance(ts_str, (int, float)):
                        ts = ts_str
                    else:
                        ts = datetime.fromisoformat(ts_str.replace('Z', '+00:00')).timestamp()
                except (ValueError, TypeError, AttributeError):
                    # FIX: was a bare `except:`, which would also swallow
                    # KeyboardInterrupt/SystemExit.
                    results[name] = {"status": "bad_timestamp", "score": 0.5}
                    scores.append(0.5)
                    continue

                age = time.time() - ts

                if age > DATA_DEAD_THRESHOLD:
                    score = 0.0
                    status = "dead"
                elif age > DATA_STALE_THRESHOLD:
                    score = 0.5
                    status = "stale"
                else:
                    score = 1.0
                    status = "fresh"

                results[name] = {"status": status, "age_s": round(age, 1), "score": score}
                scores.append(score)

                if status == "dead":
                    logger.warning(f"M3: {name} data dead ({age:.0f}s old)")
                elif status == "stale":
                    logger.debug(f"M3: {name} data stale ({age:.0f}s old)")

            except Exception as e:
                logger.debug(f"M3 error for {name}: {e}")
                results[name] = {"status": "error", "score": 0.0}
                scores.append(0.0)

        # Average score across all monitored sources.
        avg_score = sum(scores) / len(scores) if scores else 0.0
        return avg_score, results

    # --- SENSOR M4: Control Plane ---
    def m4_control_plane(self):
        """Check Hz and Prefect ports.

        Returns 1.0 when both local ports accept a TCP connection,
        0.5 when exactly one does, 0.0 when neither does.
        """
        def check_port(port):
            # True when 127.0.0.1:port accepts a TCP connection within 1s.
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.settimeout(1.0)
                    return s.connect_ex(('127.0.0.1', port)) == 0
            except OSError:
                # FIX: was a bare `except:`; socket operations raise OSError.
                return False

        hz_up = check_port(PORTS["hazelcast"])
        prefect_up = check_port(PORTS["prefect_api"])

        if hz_up and prefect_up:
            return 1.0
        elif hz_up or prefect_up:
            return 0.5
        return 0.0

    # --- SENSOR M5: Data Coherence ---
    def m5_coherence(self):
        """Check data integrity and posture validity.

        Returns the average of two checks: DOLPHIN_SAFETY coherence
        (Rm in [0,1] and a known posture) and scan payload sanity.
        """
        hz = self._get_hz()
        if not hz:
            return 0.0

        checks = []

        # Check DOLPHIN_SAFETY coherence
        try:
            safety_map = hz.get_map("DOLPHIN_SAFETY").blocking()
            safety_raw = safety_map.get("latest")
            if safety_raw:
                safety = json.loads(safety_raw) if isinstance(safety_raw, str) else safety_raw
                rm = safety.get("Rm", -1)
                posture = safety.get("posture", "UNKNOWN")
                valid_postures = ["APEX", "STALKER", "TURTLE", "HIBERNATE"]

                if 0.0 <= rm <= 1.0 and posture in valid_postures:
                    checks.append(1.0)
                else:
                    checks.append(0.0)
                    logger.warning(f"M5: Invalid safety data - Rm={rm}, posture={posture}")
            else:
                checks.append(0.5)  # key absent: unknown, not broken
        except Exception as e:
            logger.debug(f"M5 safety error: {e}")
            checks.append(0.0)

        # Check scan data integrity (basic JSON parse test)
        try:
            features_map = hz.get_map("DOLPHIN_FEATURES").blocking()
            scan_raw = features_map.get("latest_eigen_scan")
            if scan_raw:
                # FIX: only json.loads() when the value is a string, matching
                # every other sensor; the original parsed unconditionally and
                # scored 0.0 on an already-deserialized payload.
                scan = json.loads(scan_raw) if isinstance(scan_raw, str) else scan_raw
                # Basic sanity checks
                has_scan_num = "scan_number" in scan
                has_timestamp = "timestamp" in scan or "bridge_ts" in scan
                checks.append(1.0 if (has_scan_num and has_timestamp) else 0.5)
            else:
                checks.append(0.5)
        except Exception as e:
            logger.debug(f"M5 scan error: {e}")
            checks.append(0.0)

        return sum(checks) / len(checks) if checks else 0.0

    # --- MAIN ENGINE ---
    def compute_health(self):
        """Run all five sensors and return a populated HealthReport."""
        m1_proc, proc_details = self.m1_process_integrity()
        m2_hb = self.m2_heartbeat_freshness()
        m3_data, data_details = self.m3_data_freshness()
        m4_cp = self.m4_control_plane()
        m5_coh = self.m5_coherence()

        # Rm_meta is the product of all sensors: a single dead sensor
        # drives the overall score to zero.
        rm_meta = m1_proc * m2_hb * m3_data * m4_cp * m5_coh

        # Status mapping
        if rm_meta > 0.8:
            status = "GREEN"
        elif rm_meta > 0.5:
            status = "DEGRADED"
        elif rm_meta > 0.2:
            status = "CRITICAL"
        else:
            status = "DEAD"

        report = HealthReport(
            rm_meta=round(rm_meta, 3),
            status=status,
            m1_proc=round(m1_proc, 2),
            m2_heartbeat=round(m2_hb, 2),
            m3_data_freshness=round(m3_data, 2),
            m4_control_plane=round(m4_cp, 2),
            m5_coherence=round(m5_coh, 2),
            subsystem_health={
                "processes": proc_details,
                "data_sources": data_details
            },
            timestamp=datetime.now(timezone.utc).isoformat()
        )

        return report

    def emit_outputs(self, report: HealthReport):
        """Write health report to local JSON, Hz, and the log (best-effort)."""
        report_dict = asdict(report)

        # Local JSON
        try:
            with open(STATUS_JSON, 'w') as f:
                json.dump(report_dict, f, indent=2)
        except Exception as e:
            logger.error(f"Failed to write status JSON: {e}")

        # Hz push — skipped when the control plane looks down to avoid
        # a pointless connection attempt.
        hz = self._get_hz()
        if hz and report.m4_control_plane > 0.5:
            try:
                meta_map = hz.get_map("DOLPHIN_META_HEALTH").blocking()
                meta_map.put("latest", json.dumps(report_dict))
            except Exception as e:
                logger.debug(f"Hz push failed: {e}")

        # Log summary
        logger.info(
            f"RM_META={report.rm_meta} [{report.status}] "
            f"M1={report.m1_proc} M2={report.m2_heartbeat} M3={report.m3_data_freshness} "
            f"M4={report.m4_control_plane} M5={report.m5_coherence}"
        )

    def attempt_recovery(self, report: HealthReport):
        """Attempt to recover from degraded states.

        Only acts on CRITICAL/DEAD status. Restarts are best-effort via
        systemctl; failures are logged, never raised.
        """
        if report.status in ["GREEN", "DEGRADED"]:
            return

        logger.critical(f"RECOVERY: System status is {report.status}")

        # Restart services based on failures
        services_to_restart = []

        if report.m4_control_plane < 0.5:
            services_to_restart.extend(["hazelcast"])

        if report.m1_proc < 1.0:
            # Check which processes are missing; only services with a known
            # systemd unit are restartable here.
            for service, running in report.subsystem_health.get("processes", {}).items():
                if not running:
                    if service == "scan_bridge":
                        services_to_restart.append("dolphin-scan-bridge")
                    elif service == "nautilus_trader":
                        services_to_restart.append("dolphin-nautilus-trader")

        for svc in services_to_restart:
            try:
                subprocess.run(["systemctl", "restart", svc], check=True, timeout=30)
                logger.info(f"RECOVERY: Restarted {svc}")
            except Exception as e:
                logger.error(f"RECOVERY: Failed to restart {svc}: {e}")

    def run(self):
        """Main monitoring loop: check, publish, recover, sleep, repeat."""
        while True:
            try:
                report = self.compute_health()
                self.emit_outputs(report)
                self.attempt_recovery(report)
                self.last_report = report
            except Exception as e:
                # FIX: include the traceback so loop failures are diagnosable
                # (the original logger.error lost the stack).
                logger.exception(f"Error in main loop: {e}")

            time.sleep(CHECK_INTERVAL)
if __name__ == "__main__":
    # Run the monitoring loop until killed; Ctrl-C exits cleanly.
    service = MetaHealthService()
    try:
        service.run()
    except KeyboardInterrupt:
        logger.info("MHS stopped by user")
        sys.exit(0)