Files
DOLPHIN/prod/meta_health_daemon_v2.py

451 lines
15 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
DOLPHIN Meta Health Service (MHS) v2
=====================================

Enhanced monitoring for all subsystems:
- Process integrity (M1)
- Heartbeat freshness (M2) - Hazelcast heartbeats from all services
- Data freshness (M3) - per-subsystem Hazelcast key timestamps
- Control plane (M4) - ports + Hazelcast connectivity
- Data coherence (M5) - integrity checks, posture validity

Monitored subsystems (service -> Hazelcast location):
1. Scan Bridge      -> DOLPHIN_FEATURES["latest_eigen_scan"]
2. OBF              -> DOLPHIN_FEATURES_SHARD_*
3. ExtF             -> DOLPHIN_FEATURES["exf_latest"]
4. EsoF             -> DOLPHIN_FEATURES["esof_latest"]
5. Nautilus Trader  -> DOLPHIN_PNL_BLUE, DOLPHIN_STATE_BLUE
6. System Watchdog  -> DOLPHIN_SAFETY

Outputs:
- Local JSON: /mnt/dolphinng5_predict/run_logs/meta_health.json
- Hazelcast:  DOLPHIN_META_HEALTH["latest"]
- Logs:       /mnt/dolphinng5_predict/run_logs/meta_health.log
"""
import os
import sys
import time
import json
import socket
import logging
import platform
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from dataclasses import dataclass, asdict

# Optional dependencies: the service degrades gracefully when either is
# absent (process checks are skipped without psutil; most sensors report
# 0.0 without the Hazelcast client).
try:
    import psutil
    PSUTIL_AVAILABLE = True
except ImportError:
    PSUTIL_AVAILABLE = False

try:
    from hazelcast import HazelcastClient
    HZ_CLIENT_AVAILABLE = True
except ImportError:
    HZ_CLIENT_AVAILABLE = False
# --- CONFIGURATION ---
# All MHS artifacts (status JSON, log file) live under PROJECT_ROOT/run_logs.
PROJECT_ROOT = Path("/mnt/dolphinng5_predict")
LOG_DIR = PROJECT_ROOT / "run_logs"
LOG_FILE = LOG_DIR / "meta_health.log"
STATUS_JSON = LOG_DIR / "meta_health.json"
# Main-loop cadence and freshness thresholds used by sensor M3.
CHECK_INTERVAL = 5.0 # seconds
DATA_STALE_THRESHOLD = 30.0 # seconds before data considered stale
DATA_DEAD_THRESHOLD = 120.0 # seconds before data considered dead
# Critical processes to monitor
# Each entry is (service_name, [substring patterns]); sensor M1 marks the
# service as running when any pattern appears (case-insensitive) in a live
# process name or command line.
CRITICAL_PROCESSES = [
("scan_bridge", ["scan_bridge"]), # scan_bridge_prefect_flow or direct
("nautilus_trader", ["nautilus_event_trader"]),
("extf", ["exf_prefect_final"]),
("obf", ["obf_prefect_flow"]),
("esof", ["esof_prefect_flow"]),
("hazelcast", ["hazelcast", "HzMember"]),
]
# Hz keys to monitor for freshness
# name -> (map_name, key, timestamp_field); sensor M3 reads each map entry
# and ages the named timestamp field against the thresholds above.
HZ_DATA_SOURCES = {
"scan": ("DOLPHIN_FEATURES", "latest_eigen_scan", "bridge_ts"),
"obf": ("DOLPHIN_FEATURES", "ob_features_latest", "_pushed_at"),
"extf": ("DOLPHIN_FEATURES", "exf_latest", "_pushed_at"),
"esof": ("DOLPHIN_FEATURES", "esof_latest", "_pushed_at"),
"safety": ("DOLPHIN_SAFETY", "latest", "ts"),
"state": ("DOLPHIN_STATE_BLUE", "latest_nautilus", "updated_at"),
}
# Localhost TCP ports probed by sensor M4 (control plane).
PORTS = {
"hazelcast": 5701,
"prefect_api": 4200,
}
# --- LOGGING ---
# Dual sink: the MHS log file plus stderr (console) for interactive runs.
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] %(message)s',
handlers=[
logging.FileHandler(LOG_FILE),
logging.StreamHandler()
]
)
logger = logging.getLogger("MHS")
@dataclass
class HealthReport:
    """Snapshot of one MHS check cycle across all five sensors."""
    rm_meta: float            # product of the five sensor scores, in [0, 1]
    status: str               # GREEN / DEGRADED / CRITICAL / DEAD
    m1_proc: float            # process-integrity score
    m2_heartbeat: float       # heartbeat-freshness score
    m3_data_freshness: float  # averaged data-source freshness score
    m4_control_plane: float   # port/connectivity score
    m5_coherence: float       # data-coherence score
    subsystem_health: dict    # per-process and per-data-source detail maps
    timestamp: str            # UTC ISO-8601 generation time
class MetaHealthService:
    """Meta Health Service engine.

    Polls five sensors (M1..M5), fuses them into a single Rm_meta score
    (product of sensor scores), publishes the report to a local JSON file
    and Hazelcast, and attempts systemd-based recovery on CRITICAL/DEAD.
    """

    def __init__(self):
        self.platform = platform.system().lower()
        self.hz_client = None   # lazily-created Hazelcast client (see _get_hz)
        self.last_report = None # most recent HealthReport emitted by run()
        logger.info(f"MHS v2 starting. PID: {os.getpid()}")
        if not PSUTIL_AVAILABLE:
            logger.warning("psutil not available - process checks limited")
        if not HZ_CLIENT_AVAILABLE:
            logger.error("Hazelcast not available - critical failure")

    def _get_hz(self):
        """Lazy Hz connection with retry.

        Returns the cached HazelcastClient, or None when the client library
        is missing or the cluster is unreachable. A failed attempt is retried
        on the next call since the cache is only set on success.
        """
        if not HZ_CLIENT_AVAILABLE:
            return None
        if self.hz_client:
            return self.hz_client
        try:
            self.hz_client = HazelcastClient(
                cluster_name="dolphin",
                cluster_members=["127.0.0.1:5701"],
                connect_timeout=1.0,
                connection_retry_limit=1
            )
            return self.hz_client
        except Exception as e:
            logger.debug(f"Hz connection failed: {e}")
            return None

    # --- SENSOR M1: Process Integrity ---
    def m1_process_integrity(self):
        """Check critical processes are running.

        Returns:
            (score, details): score is 1.0 only when every entry in
            CRITICAL_PROCESSES matches a live process name or cmdline,
            else 0.0; details maps service name -> bool found.
        """
        if not PSUTIL_AVAILABLE:
            # Cannot inspect processes without psutil; report healthy rather
            # than falsely alarm (the limitation is logged at startup).
            return 1.0, {}
        results = {}
        all_ok = True
        for service_name, patterns in CRITICAL_PROCESSES:
            found = False
            for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
                try:
                    cmdline = ' '.join(proc.info['cmdline'] or [])
                    for pattern in patterns:
                        if pattern.lower() in proc.info['name'].lower() or pattern.lower() in cmdline.lower():
                            found = True
                            break
                    if found:
                        break
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    # Processes may vanish mid-iteration; skip them.
                    continue
            results[service_name] = found
            if not found:
                all_ok = False
                logger.warning(f"M1: {service_name} not running")
        return 1.0 if all_ok else 0.0, results

    # --- SENSOR M2: Hz Heartbeat Freshness ---
    def m2_heartbeat_freshness(self):
        """Check Hz heartbeats from services.

        Scores the age of DOLPHIN_HEARTBEAT["nautilus_flow_heartbeat"]
        (epoch-seconds "ts" field): 1.0 fresh (<=30s), 0.5 stale (<=60s)
        or missing-but-Hz-reachable, 0.0 older than 60s or Hz down.
        """
        hz = self._get_hz()
        if not hz:
            return 0.0
        try:
            hb_map = hz.get_map("DOLPHIN_HEARTBEAT").blocking()
            latest = hb_map.get("nautilus_flow_heartbeat")
            if not latest:
                return 0.5  # No heartbeat but Hz is up
            data = json.loads(latest) if isinstance(latest, str) else latest
            ts = data.get("ts", 0)
            age = time.time() - ts
            if age > 60:
                return 0.0
            elif age > 30:
                return 0.5
            return 1.0
        except Exception as e:
            logger.debug(f"M2 error: {e}")
            return 0.5

    # --- SENSOR M3: Data Freshness ---
    def m3_data_freshness(self):
        """Check all Hz data sources are fresh.

        Returns:
            (avg_score, details) over HZ_DATA_SOURCES. Each source scores
            1.0 (fresh), 0.5 (stale, missing/unparsable timestamp) or
            0.0 (dead, missing entry, or read error).
        """
        hz = self._get_hz()
        if not hz:
            return 0.0, {}
        results = {}
        scores = []
        for name, (map_name, key, ts_field) in HZ_DATA_SOURCES.items():
            try:
                map_obj = hz.get_map(map_name).blocking()
                data_raw = map_obj.get(key)
                if not data_raw:
                    results[name] = {"status": "missing", "score": 0.0}
                    scores.append(0.0)
                    continue
                # Payload may arrive as a JSON string or already deserialized.
                data = json.loads(data_raw) if isinstance(data_raw, str) else data_raw
                ts_str = data.get(ts_field) if isinstance(data, dict) else None
                if not ts_str:
                    results[name] = {"status": "no_timestamp", "score": 0.5}
                    scores.append(0.5)
                    continue
                # Parse timestamp: epoch seconds or ISO-8601 string.
                try:
                    if isinstance(ts_str, (int, float)):
                        ts = ts_str
                    else:
                        # Try ISO format ('Z' normalized for fromisoformat)
                        ts = datetime.fromisoformat(ts_str.replace('Z', '+00:00')).timestamp()
                except (ValueError, TypeError, AttributeError):
                    # FIX: was a bare `except:` - narrowed to parse-shaped
                    # failures so real faults (KeyboardInterrupt, etc.) are
                    # not silently swallowed.
                    results[name] = {"status": "bad_timestamp", "score": 0.5}
                    scores.append(0.5)
                    continue
                age = time.time() - ts
                if age > DATA_DEAD_THRESHOLD:
                    score = 0.0
                    status = "dead"
                elif age > DATA_STALE_THRESHOLD:
                    score = 0.5
                    status = "stale"
                else:
                    score = 1.0
                    status = "fresh"
                results[name] = {"status": status, "age_s": round(age, 1), "score": score}
                scores.append(score)
                if status == "dead":
                    logger.warning(f"M3: {name} data dead ({age:.0f}s old)")
                elif status == "stale":
                    logger.debug(f"M3: {name} data stale ({age:.0f}s old)")
            except Exception as e:
                logger.debug(f"M3 error for {name}: {e}")
                results[name] = {"status": "error", "score": 0.0}
                scores.append(0.0)
        # Average score across all configured sources
        avg_score = sum(scores) / len(scores) if scores else 0.0
        return avg_score, results

    # --- SENSOR M4: Control Plane ---
    def m4_control_plane(self):
        """Check Hz and Prefect ports.

        Returns 1.0 when both localhost ports accept a TCP connection,
        0.5 when exactly one does, 0.0 when neither does.
        """
        def check_port(port):
            # One-shot localhost TCP probe with a 1s timeout.
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.settimeout(1.0)
                    return s.connect_ex(('127.0.0.1', port)) == 0
            except OSError:
                # FIX: was a bare `except:` - socket failures are OSError
                # subclasses; anything else should propagate.
                return False
        hz_up = check_port(PORTS["hazelcast"])
        prefect_up = check_port(PORTS["prefect_api"])
        if hz_up and prefect_up:
            return 1.0
        elif hz_up or prefect_up:
            return 0.5
        return 0.0

    # --- SENSOR M5: Data Coherence ---
    def m5_coherence(self):
        """Check data integrity and posture validity.

        Averages two checks: DOLPHIN_SAFETY["latest"] carries Rm in [0, 1]
        and a known posture, and DOLPHIN_FEATURES["latest_eigen_scan"]
        parses with its identifying fields present.
        """
        hz = self._get_hz()
        if not hz:
            return 0.0
        checks = []
        # Check DOLPHIN_SAFETY coherence
        try:
            safety_map = hz.get_map("DOLPHIN_SAFETY").blocking()
            safety_raw = safety_map.get("latest")
            if safety_raw:
                safety = json.loads(safety_raw) if isinstance(safety_raw, str) else safety_raw
                rm = safety.get("Rm", -1)
                posture = safety.get("posture", "UNKNOWN")
                valid_postures = ["APEX", "STALKER", "TURTLE", "HIBERNATE"]
                if 0.0 <= rm <= 1.0 and posture in valid_postures:
                    checks.append(1.0)
                else:
                    checks.append(0.0)
                    logger.warning(f"M5: Invalid safety data - Rm={rm}, posture={posture}")
            else:
                checks.append(0.5)  # absent is suspicious, not proven broken
        except Exception as e:
            logger.debug(f"M5 safety error: {e}")
            checks.append(0.0)
        # Check scan data integrity (basic JSON parse test)
        try:
            features_map = hz.get_map("DOLPHIN_FEATURES").blocking()
            scan_raw = features_map.get("latest_eigen_scan")
            if scan_raw:
                # FIX: tolerate already-deserialized payloads, matching every
                # other Hz read in this class (previously json.loads() was
                # called unconditionally, raising TypeError on dict payloads
                # and scoring an otherwise-healthy source 0.0).
                scan = json.loads(scan_raw) if isinstance(scan_raw, str) else scan_raw
                # Basic sanity checks
                has_scan_num = "scan_number" in scan
                has_timestamp = "timestamp" in scan or "bridge_ts" in scan
                checks.append(1.0 if (has_scan_num and has_timestamp) else 0.5)
            else:
                checks.append(0.5)
        except Exception as e:
            logger.debug(f"M5 scan error: {e}")
            checks.append(0.0)
        return sum(checks) / len(checks) if checks else 0.0

    # --- MAIN ENGINE ---
    def compute_health(self):
        """Compute overall health score.

        Runs all five sensors, multiplies their scores into rm_meta (any
        dead sensor zeroes the total), maps it to a status band and returns
        a populated HealthReport.
        """
        m1_proc, proc_details = self.m1_process_integrity()
        m2_hb = self.m2_heartbeat_freshness()
        m3_data, data_details = self.m3_data_freshness()
        m4_cp = self.m4_control_plane()
        m5_coh = self.m5_coherence()
        # Compute Rm_meta (product of all sensors)
        rm_meta = m1_proc * m2_hb * m3_data * m4_cp * m5_coh
        # Status mapping
        if rm_meta > 0.8:
            status = "GREEN"
        elif rm_meta > 0.5:
            status = "DEGRADED"
        elif rm_meta > 0.2:
            status = "CRITICAL"
        else:
            status = "DEAD"
        report = HealthReport(
            rm_meta=round(rm_meta, 3),
            status=status,
            m1_proc=round(m1_proc, 2),
            m2_heartbeat=round(m2_hb, 2),
            m3_data_freshness=round(m3_data, 2),
            m4_control_plane=round(m4_cp, 2),
            m5_coherence=round(m5_coh, 2),
            subsystem_health={
                "processes": proc_details,
                "data_sources": data_details
            },
            timestamp=datetime.now(timezone.utc).isoformat()
        )
        return report

    def emit_outputs(self, report: HealthReport):
        """Write health report to all outputs.

        Best-effort fan-out: local JSON file, Hazelcast map (only when the
        control plane looks reachable), and a one-line log summary. Output
        failures are logged, never raised.
        """
        report_dict = asdict(report)
        # Local JSON
        try:
            with open(STATUS_JSON, 'w') as f:
                json.dump(report_dict, f, indent=2)
        except Exception as e:
            logger.error(f"Failed to write status JSON: {e}")
        # Hz push (skip when M4 says the control plane is down)
        hz = self._get_hz()
        if hz and report.m4_control_plane > 0.5:
            try:
                meta_map = hz.get_map("DOLPHIN_META_HEALTH").blocking()
                meta_map.put("latest", json.dumps(report_dict))
            except Exception as e:
                logger.debug(f"Hz push failed: {e}")
        # Log summary
        logger.info(
            f"RM_META={report.rm_meta} [{report.status}] "
            f"M1={report.m1_proc} M2={report.m2_heartbeat} M3={report.m3_data_freshness} "
            f"M4={report.m4_control_plane} M5={report.m5_coherence}"
        )

    def attempt_recovery(self, report: HealthReport):
        """Attempt to recover from degraded states.

        Only acts on CRITICAL/DEAD. Restarts hazelcast when the control
        plane is down, and the scan-bridge / nautilus-trader units when
        their processes are missing, via systemctl (30s timeout each).
        """
        if report.status in ["GREEN", "DEGRADED"]:
            return
        logger.critical(f"RECOVERY: System status is {report.status}")
        # Restart services based on failures
        services_to_restart = []
        if report.m4_control_plane < 0.5:
            services_to_restart.extend(["hazelcast"])
        if report.m1_proc < 1.0:
            # Check which processes are missing
            for service, running in report.subsystem_health.get("processes", {}).items():
                if not running:
                    if service == "scan_bridge":
                        services_to_restart.append("dolphin-scan-bridge")
                    elif service == "nautilus_trader":
                        services_to_restart.append("dolphin-nautilus-trader")
        for svc in services_to_restart:
            try:
                subprocess.run(["systemctl", "restart", svc], check=True, timeout=30)
                logger.info(f"RECOVERY: Restarted {svc}")
            except Exception as e:
                logger.error(f"RECOVERY: Failed to restart {svc}: {e}")

    def run(self):
        """Main monitoring loop.

        Runs forever: compute -> emit -> recover every CHECK_INTERVAL
        seconds. Any per-cycle exception is logged and the loop continues.
        """
        while True:
            try:
                report = self.compute_health()
                self.emit_outputs(report)
                self.attempt_recovery(report)
                self.last_report = report
            except Exception as e:
                logger.error(f"Error in main loop: {e}")
            time.sleep(CHECK_INTERVAL)
if __name__ == "__main__":
mhs = MetaHealthService()
try:
mhs.run()
except KeyboardInterrupt:
logger.info("MHS stopped by user")
sys.exit(0)