927 lines
40 KiB
Python
927 lines
40 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
DOLPHIN Meta Health Service v3
|
|||
|
|
===============================
|
|||
|
|
Watchdog-of-watchdogs: monitors all Dolphin subsystems, emits Rm_meta,
|
|||
|
|
and performs supervised recovery via supervisorctl (NOT systemctl).
|
|||
|
|
|
|||
|
|
CRITICAL fixes vs v1/v2:
|
|||
|
|
FIX-1 Product formula replaced with weighted sum — one absent optional
|
|||
|
|
service no longer collapses rm_meta to 0 and triggers restarts.
|
|||
|
|
FIX-2 Recovery uses supervisorctl exclusively — systemctl calls are gone.
|
|||
|
|
systemctl was the "weird stopping bug": it killed supervisord-managed
|
|||
|
|
processes, causing supervisord to fight against MHS.
|
|||
|
|
FIX-3 Process patterns updated to current supervisord program names.
|
|||
|
|
FIX-4 HZ keys updated to current system (exf_latest, acb_boost, etc.).
|
|||
|
|
FIX-5 Trader services (nautilus_trader, scan_bridge) are NEVER auto-restarted
|
|||
|
|
— they may be intentionally stopped. Only dolphin_data infra is recovered.
|
|||
|
|
FIX-6 Recovery is rate-limited: max one restart per service per 10 min.
|
|||
|
|
FIX-7 EsoF removed (never deployed on this system).
|
|||
|
|
FIX-8 obf_universe added as a monitored data service.
|
|||
|
|
|
|||
|
|
Outputs:
|
|||
|
|
- /mnt/dolphinng5_predict/run_logs/meta_health.json
|
|||
|
|
- Hazelcast DOLPHIN_META_HEALTH["latest"]
|
|||
|
|
- stdout/file log
|
|||
|
|
|
|||
|
|
Services monitored (supervisord program names):
|
|||
|
|
dolphin_data:exf_fetcher — ExF live indicators → HZ exf_latest
|
|||
|
|
dolphin_data:acb_processor — ACB boost writer → HZ acb_boost
|
|||
|
|
dolphin_data:obf_universe — L2 universe health → HZ obf_universe_latest
|
|||
|
|
dolphin:nautilus_trader — Execution engine (informational only)
|
|||
|
|
dolphin:scan_bridge — Arrow→HZ bridge (informational only)
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import json
|
|||
|
|
import logging
|
|||
|
|
import os
|
|||
|
|
import subprocess
|
|||
|
|
import sys
|
|||
|
|
import threading
|
|||
|
|
import time
|
|||
|
|
import urllib.request
|
|||
|
|
from dataclasses import asdict, dataclass
|
|||
|
|
from datetime import datetime, timezone
|
|||
|
|
from pathlib import Path
|
|||
|
|
from typing import Dict, Optional, Tuple
|
|||
|
|
|
|||
|
|
# Optional deps
|
|||
|
|
try:
|
|||
|
|
import psutil
|
|||
|
|
PSUTIL_AVAILABLE = True
|
|||
|
|
except ImportError:
|
|||
|
|
PSUTIL_AVAILABLE = False
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
from hazelcast import HazelcastClient
|
|||
|
|
HZ_CLIENT_AVAILABLE = True
|
|||
|
|
except ImportError:
|
|||
|
|
HZ_CLIENT_AVAILABLE = False
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / 'nautilus_dolphin'))
|
|||
|
|
from nautilus_dolphin.nautilus.survival_stack import SurvivalStack
|
|||
|
|
SURVIVAL_STACK_AVAILABLE = True
|
|||
|
|
except ImportError:
|
|||
|
|
SURVIVAL_STACK_AVAILABLE = False
|
|||
|
|
|
|||
|
|
from dolphin_exit_handler import install_exit_handler
|
|||
|
|
install_exit_handler("meta_health")
|
|||
|
|
|
|||
|
|
from hz_warmup import hz_warmup, read_lifecycle_state
|
|||
|
|
|
|||
|
|
# ── Configuration ──────────────────────────────────────────────────────────────
PROJECT_ROOT = Path("/mnt/dolphinng5_predict")
LOG_DIR = PROJECT_ROOT / "run_logs"
LOG_FILE = LOG_DIR / "meta_health.log"
STATUS_JSON = LOG_DIR / "meta_health.json"   # written every tick by _emit()
SUPERVISORD_CONF = PROJECT_ROOT / "prod" / "supervisor" / "dolphin-supervisord.conf"

CHECK_INTERVAL_S = 10.0               # main loop cadence
DATA_STALE_S = 30.0                   # warn if HZ key older than this
DATA_DEAD_S = 120.0                   # score=0 if HZ key older than this
RECOVERY_COOLDOWN_CRITICAL_S = 10.0   # critical data infra: retry every 10s
RECOVERY_COOLDOWN_DEFAULT_S = 300.0   # non-critical / informational services

# ── Container watchdog (fast-tick background thread) ──────────────────────────
# Probes HTTP health endpoints every 0.5s; triggers docker restart after N
# consecutive failures. Runs independently of the 10s main loop.
CONTAINER_WATCHDOGS = [
    # NOTE: dolphin-hazelcast REMOVED from watchdog (2026-04-07).
    # HZ is RAM-only volatile — restarting it wipes ALL state and triggers cascading
    # failures (Cat1=0 → HIBERNATE → trade halt). Docker autoheal handles container
    # health. MHS must NEVER restart HZ. See project_hz_volatile_state.md.
    {
        "name": "dolphin-prefect",
        "url": "http://127.0.0.1:4200/api/health",
        # Prefect's health endpoint returns the literal body b"true" when healthy.
        "check": lambda body: body.strip() == b"true",
        "threshold": 4,        # 4 × 0.5s = ~2s detection → ~37s worst-case total
        "cooldown": 60,        # seconds between restart attempts
        "restart_timeout": 45, # seconds allowed for `docker restart` to complete
    },
    {
        "name": "dolphin-hazelcast-mc",
        "url": "http://127.0.0.1:8080/",
        # Management Center: any non-empty response body counts as healthy.
        "check": lambda body: len(body) > 0,
        "threshold": 20,       # 20 × 0.5s = 10s — MC is non-critical
        "cooldown": 120,
        "restart_timeout": 60,
    },
]
CONTAINER_POLL_S = 0.5  # fast-tick interval

# ── Service registry ───────────────────────────────────────────────────────────
# Maps supervisord program name → list of cmdline patterns to detect process.
# "critical_data": True  → auto-restart via supervisorctl on failure.
# "critical_data": False → informational only, never auto-restarted.
SERVICES = {
    "dolphin_data:exf_fetcher": {
        "patterns": ["exf_fetcher_flow"],
        "critical_data": True,
    },
    "dolphin_data:acb_processor": {
        "patterns": ["acb_processor_service"],
        "critical_data": True,
    },
    "dolphin_data:obf_universe": {
        "patterns": ["obf_universe_service"],
        "critical_data": True,
    },
    "dolphin:nautilus_trader": {
        "patterns": ["nautilus_event_trader"],
        "critical_data": False,  # may be intentionally stopped
    },
    "dolphin:scan_bridge": {
        "patterns": ["scan_bridge_service"],
        "critical_data": False,  # informational
    },
}

# ── HZ data keys to check freshness ───────────────────────────────────────────
# (map_name, key, ts_field_or_None)
# ts_field=None → key presence only, no freshness score
HZ_DATA_SOURCES = {
    "exf_latest": ("DOLPHIN_FEATURES", "exf_latest", "_pushed_at"),
    "acb_boost": ("DOLPHIN_FEATURES", "acb_boost", None),
    "latest_eigen_scan": ("DOLPHIN_FEATURES", "latest_eigen_scan", "timestamp"),
    "obf_universe": ("DOLPHIN_FEATURES", "obf_universe_latest", "_snapshot_utc"),
}

# TCP ports probed by M4 (control-plane liveness check).
HZ_PORTS = {"hazelcast": 5701, "prefect_api": 4200}

# ── Sensor weights (sum = 1.0) ─────────────────────────────────────────────────
# FIX-1: weighted sum replaces product. No single sensor can zero-out rm_meta.
SENSOR_WEIGHTS = {
    "m4_control_plane": 0.33,   # HZ must be up — highest weight
    "m1_data_infra": 0.33,      # data pipeline processes must run
    "m3_data_freshness": 0.19,  # HZ data must be fresh
    "m5_coherence": 0.09,       # coherence / sanity checks
    "m6_test_integrity": 0.06,  # continuous test suite gate (NEW)
    # m1_trader and m2_heartbeat are emitted but NOT included in rm_meta
    # because they may be intentionally down during non-trading hours
}

# M6 config — continuous test suite gate thresholds.
_TEST_RESULTS_PATH = Path("/mnt/dolphinng5_predict/run_logs/test_results_latest.json")
M6_STALE_SECONDS = 900   # 15 min — if last run older than this, penalise
M6_DEAD_SECONDS = 3600   # 60 min — if no run in 1h, treat as 0
|
|||
|
|
# ── Logging ────────────────────────────────────────────────────────────────────
# Log to both the meta_health.log file and stdout (supervisord captures stdout).
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s — %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger("MHSv3")
|
|||
|
|
|
|||
|
|
|
|||
|
|
@dataclass
class HealthReport:
    """One snapshot of overall system health, emitted each main-loop tick.

    Serialized via dataclasses.asdict to meta_health.json and pushed to the
    Hazelcast DOLPHIN_META_HEALTH map under key "latest".
    """

    rm_meta: float            # weighted-sum meta score in [0, 1] (see SENSOR_WEIGHTS)
    status: str               # GREEN / DEGRADED / CRITICAL / DEAD
    m4_control_plane: float   # HZ + Prefect TCP port liveness
    m1_data_infra: float      # critical data services
    m1_trader: float          # informational — not in rm_meta
    m2_heartbeat: float       # informational — not in rm_meta
    m3_data_freshness: float  # HZ key freshness
    m5_coherence: float       # data sanity checks
    m6_test_integrity: float  # continuous test gate (NEW)
    service_status: dict      # per-supervisord-program state
    hz_key_status: dict       # per-HZ-key freshness
    timestamp: str            # report creation time (ISO string)
|
|||
|
|
|
|||
|
|
|
|||
|
|
class MetaHealthServiceV3:
|
|||
|
|
    def __init__(self):
        """Initialise client/recovery state and start the watchdog thread."""
        # Lazily-connected Hazelcast client (see _get_hz).
        self._hz_client: Optional[object] = None
        self._recovery_timestamps: Dict[str, float] = {}  # FIX-6 rate limit
        # Per-container fail streaks and last-restart timestamps for watchdog thread
        self._cw_fail_streaks: Dict[str, int] = {c["name"]: 0 for c in CONTAINER_WATCHDOGS}
        self._cw_last_restart: Dict[str, float] = {c["name"]: 0.0 for c in CONTAINER_WATCHDOGS}
        # SurvivalStack — computes posture and writes to DOLPHIN_SAFETY
        _fast_recovery = os.environ.get('DOLPHIN_FAST_RECOVERY', '').lower() in ('1', 'true', 'yes')
        self._survival_stack = SurvivalStack(fast_recovery=_fast_recovery) if SURVIVAL_STACK_AVAILABLE else None
        if _fast_recovery:
            logger.warning("POSTURE_DEBUG: fast_recovery=True — bounded recovery BYPASSED")
        logger.info("MHSv3 starting. PID=%d psutil=%s hz=%s survival_stack=%s",
                    os.getpid(), PSUTIL_AVAILABLE, HZ_CLIENT_AVAILABLE, SURVIVAL_STACK_AVAILABLE)
        # Start fast-tick container watchdog as daemon thread
        t = threading.Thread(target=self._container_watchdog, daemon=True,
                             name="container-watchdog")
        t.start()
        logger.info("Container watchdog thread started (poll=%.1fs)", CONTAINER_POLL_S)
|
|||
|
|
|
|||
|
|
# ── Container watchdog (fast-tick background thread) ──────────────────────
|
|||
|
|
|
|||
|
|
    def _container_watchdog(self):
        """Runs forever in a daemon thread. Polls HTTP health endpoints every
        CONTAINER_POLL_S and triggers docker restart after N consecutive failures."""
        # Grace period on startup — let containers stabilize before watchdog activates
        # HZ restart takes ~15s and MHS itself needs ~10s to connect. 120s is safe.
        time.sleep(120)
        logger.info("Container watchdog armed after 120s grace period")
        while True:
            # Don't heal anything while HZ is in STARTING state
            try:
                hz = self._get_hz()
                if hz and read_lifecycle_state(hz) == "STARTING":
                    time.sleep(CONTAINER_POLL_S)
                    continue
            except Exception:
                pass  # lifecycle check is best-effort; fall through to probing

            t0 = time.monotonic()
            for cfg in CONTAINER_WATCHDOGS:
                name = cfg["name"]
                try:
                    # Short timeout: a slow endpoint counts as a failed probe.
                    with urllib.request.urlopen(cfg["url"], timeout=0.3) as resp:
                        body = resp.read()
                    healthy = cfg["check"](body)
                except Exception:
                    healthy = False

                if healthy:
                    self._cw_fail_streaks[name] = 0
                    # Reset last_restart when healthy: next failure heals without cooldown wait.
                    # Safe because confirmation of health proves prior restart completed.
                    self._cw_last_restart[name] = 0.0
                else:
                    self._cw_fail_streaks[name] += 1
                    streak = self._cw_fail_streaks[name]
                    if streak >= cfg["threshold"]:
                        self._docker_heal(name, cfg)

            # Keep the tick cadence steady regardless of how long probing took.
            elapsed = time.monotonic() - t0
            time.sleep(max(0.01, CONTAINER_POLL_S - elapsed))
|
|||
|
|
|
|||
|
|
def _docker_heal(self, name: str, cfg: dict):
|
|||
|
|
"""Issue docker restart for a container if cooldown allows."""
|
|||
|
|
now = time.time()
|
|||
|
|
last = self._cw_last_restart.get(name, 0.0)
|
|||
|
|
if now - last < cfg["cooldown"]:
|
|||
|
|
return
|
|||
|
|
self._cw_last_restart[name] = now
|
|||
|
|
self._cw_fail_streaks[name] = 0
|
|||
|
|
logger.critical("CONTAINER-HEAL: restarting %s (streak=%d)",
|
|||
|
|
name, self._cw_fail_streaks.get(name, 0))
|
|||
|
|
try:
|
|||
|
|
subprocess.run(
|
|||
|
|
["docker", "restart", name],
|
|||
|
|
timeout=cfg["restart_timeout"], check=True,
|
|||
|
|
capture_output=True,
|
|||
|
|
)
|
|||
|
|
logger.info("CONTAINER-HEAL: %s restarted successfully", name)
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.error("CONTAINER-HEAL: docker restart %s failed: %s", name, e)
|
|||
|
|
|
|||
|
|
# ── HZ connection ─────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
    def _get_hz(self):
        """Return a connected Hazelcast client, or None when unavailable.

        Reuses the cached client while its lifecycle service reports running;
        otherwise attempts a fresh connection (2s timeout). On a fresh connect,
        runs the ClickHouse warm-up when HZ lifecycle looks freshly restarted.
        """
        if not HZ_CLIENT_AVAILABLE:
            return None
        if self._hz_client:
            try:
                if self._hz_client.lifecycle_service.is_running():
                    return self._hz_client
            except Exception:
                pass  # cached client is stale — fall through and reconnect
        try:
            self._hz_client = HazelcastClient(
                cluster_name="dolphin",
                cluster_members=["127.0.0.1:5701"],
                connection_timeout=2.0,
            )
            # Warm-up from CH if HZ maps are empty (fresh restart)
            try:
                lifecycle = read_lifecycle_state(self._hz_client)
                if lifecycle in ("UNKNOWN", "STARTING"):
                    logger.info("HZ lifecycle=%s — running CH warm-up", lifecycle)
                    hz_warmup(self._hz_client)
            except Exception as e:
                logger.warning("HZ warm-up failed (non-fatal): %s", e)
            return self._hz_client
        except Exception as e:
            logger.debug("HZ connect failed: %s", e)
            self._hz_client = None
            return None
|
|||
|
|
|
|||
|
|
# ── M4: Control Plane ────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
def _m4_control_plane(self) -> float:
|
|||
|
|
"""Check HZ and Prefect ports are listening."""
|
|||
|
|
import socket
|
|||
|
|
scores = []
|
|||
|
|
for name, port in HZ_PORTS.items():
|
|||
|
|
try:
|
|||
|
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|||
|
|
s.settimeout(1.0)
|
|||
|
|
up = s.connect_ex(("127.0.0.1", port)) == 0
|
|||
|
|
except Exception:
|
|||
|
|
up = False
|
|||
|
|
scores.append(1.0 if up else 0.0)
|
|||
|
|
if not up:
|
|||
|
|
logger.debug("M4: %s port %d DOWN", name, port)
|
|||
|
|
# HZ (index 0) is more important than Prefect (index 1)
|
|||
|
|
hz_score = scores[0]
|
|||
|
|
prefect_score = scores[1] if len(scores) > 1 else 0.0
|
|||
|
|
return hz_score * 0.8 + prefect_score * 0.2
|
|||
|
|
|
|||
|
|
# ── M1: Process / supervisord integrity ───────────────────────────────────
|
|||
|
|
|
|||
|
|
def _check_supervisord_status(self) -> Dict[str, str]:
|
|||
|
|
"""Ask supervisorctl for the status of all managed programs.
|
|||
|
|
Returns dict: program_name → 'RUNNING' | 'STOPPED' | 'UNKNOWN'
|
|||
|
|
"""
|
|||
|
|
try:
|
|||
|
|
result = subprocess.run(
|
|||
|
|
["supervisorctl", "-c", str(SUPERVISORD_CONF), "status"],
|
|||
|
|
capture_output=True, text=True, timeout=5,
|
|||
|
|
)
|
|||
|
|
statuses = {}
|
|||
|
|
for line in result.stdout.splitlines():
|
|||
|
|
parts = line.split()
|
|||
|
|
if len(parts) >= 2:
|
|||
|
|
statuses[parts[0]] = parts[1]
|
|||
|
|
return statuses
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.debug("supervisorctl status failed: %s", e)
|
|||
|
|
return {}
|
|||
|
|
|
|||
|
|
    def _m1_process_integrity(self) -> Tuple[float, float, dict]:
        """
        Returns (m1_data_infra, m1_trader, per_service_status_dict).
        Uses supervisorctl status first; falls back to psutil cmdline scan.

        Each score is the fraction of that group's programs currently RUNNING
        (1.0 when a group is empty). Data-infra (critical_data=True) and trader
        (critical_data=False) services are scored separately so an intentionally
        stopped trader cannot drag down the data-infra score.
        """
        sv_status = self._check_supervisord_status()
        service_results = {}

        for prog, cfg in SERVICES.items():
            # Try supervisorctl result first
            if prog in sv_status:
                running = sv_status[prog] == "RUNNING"
            elif PSUTIL_AVAILABLE:
                # Fallback: cmdline scan
                running = False
                for proc in psutil.process_iter(["name", "cmdline"]):
                    try:
                        cmdline = " ".join(proc.info["cmdline"] or [])
                        if any(p in cmdline for p in cfg["patterns"]):
                            running = True
                            break
                    except (psutil.NoSuchProcess, psutil.AccessDenied):
                        # Process vanished or is inaccessible mid-scan — skip it.
                        continue
            else:
                running = True  # Cannot check, assume OK

            service_results[prog] = "RUNNING" if running else "STOPPED"
            if not running:
                logger.warning("M1: %s is STOPPED", prog)

        # Score data infra (critical_data=True) separately from trader services
        data_progs = [p for p, c in SERVICES.items() if c["critical_data"]]
        trader_progs = [p for p, c in SERVICES.items() if not c["critical_data"]]

        def score(progs):
            # Fraction of the group's programs that are RUNNING; empty group = healthy.
            if not progs:
                return 1.0
            n_ok = sum(1 for p in progs if service_results.get(p) == "RUNNING")
            return n_ok / len(progs)

        return score(data_progs), score(trader_progs), service_results
|
|||
|
|
|
|||
|
|
# ── M2: Heartbeat freshness ───────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
def _m2_heartbeat_freshness(self) -> float:
|
|||
|
|
"""Check for a live nautilus heartbeat in HZ (informational)."""
|
|||
|
|
hz = self._get_hz()
|
|||
|
|
if not hz:
|
|||
|
|
return 0.0
|
|||
|
|
try:
|
|||
|
|
hb_map = hz.get_map("DOLPHIN_HEARTBEAT").blocking()
|
|||
|
|
raw = hb_map.get("nautilus_flow_heartbeat")
|
|||
|
|
if not raw:
|
|||
|
|
return 0.5 # HZ up but no heartbeat — trader may be stopped
|
|||
|
|
data = json.loads(raw) if isinstance(raw, str) else raw
|
|||
|
|
age = time.time() - data.get("ts", 0)
|
|||
|
|
if age > 60:
|
|||
|
|
return 0.0
|
|||
|
|
if age > 30:
|
|||
|
|
return 0.5
|
|||
|
|
return 1.0
|
|||
|
|
except Exception:
|
|||
|
|
return 0.5
|
|||
|
|
|
|||
|
|
# ── M3: Data freshness ───────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
    def _m3_data_freshness(self) -> Tuple[float, dict]:
        """Check all critical HZ keys are present and fresh.

        Returns (average_score, per_key_results). Per-key scoring:
        fresh (<= DATA_STALE_S) → 1.0, stale → 0.5, dead (> DATA_DEAD_S) → 0.0,
        present-but-no-timestamp → 0.7, missing or read/parse error → 0.0.
        Keys configured with ts_field=None are presence-only (1.0 when present).
        """
        hz = self._get_hz()
        if not hz:
            return 0.0, {}

        results = {}
        scores = []

        try:
            features = hz.get_map("DOLPHIN_FEATURES").blocking()
        except Exception as e:
            logger.debug("M3: DOLPHIN_FEATURES map error: %s", e)
            return 0.0, {}

        for name, (map_name, key, ts_field) in HZ_DATA_SOURCES.items():
            try:
                # All current sources live in DOLPHIN_FEATURES; the else branch
                # supports additional maps without re-fetching the features map.
                if map_name == "DOLPHIN_FEATURES":
                    raw = features.get(key)
                else:
                    raw = hz.get_map(map_name).blocking().get(key)

                if raw is None:
                    results[name] = {"status": "missing", "score": 0.0}
                    scores.append(0.0)
                    logger.warning("M3: %s missing from HZ", name)
                    continue

                if ts_field is None:
                    # Presence-only check
                    results[name] = {"status": "present", "score": 1.0}
                    scores.append(1.0)
                    continue

                data = json.loads(raw) if isinstance(raw, str) else raw
                ts_val = data.get(ts_field) if isinstance(data, dict) else None

                if ts_val is None:
                    # Key exists but carries no timestamp field — partial credit.
                    results[name] = {"status": "no_ts", "score": 0.7}
                    scores.append(0.7)
                    continue

                # Parse timestamp: epoch seconds or ISO-8601 (Z suffix accepted).
                if isinstance(ts_val, (int, float)):
                    ts = float(ts_val)
                else:
                    ts = datetime.fromisoformat(
                        str(ts_val).replace("Z", "+00:00")
                    ).timestamp()

                age = time.time() - ts
                if age > DATA_DEAD_S:
                    score = 0.0
                    status = f"dead ({age:.0f}s)"
                    logger.warning("M3: %s DEAD — %s ago", name, f"{age:.0f}s")
                elif age > DATA_STALE_S:
                    score = 0.5
                    status = f"stale ({age:.0f}s)"
                else:
                    score = 1.0
                    status = f"fresh ({age:.1f}s)"

                results[name] = {"status": status, "age_s": round(age, 1), "score": score}
                scores.append(score)

            except Exception as e:
                # Malformed JSON / unparseable timestamp / HZ read error.
                logger.debug("M3 error for %s: %s", name, e)
                results[name] = {"status": "error", "score": 0.0}
                scores.append(0.0)

        avg = sum(scores) / len(scores) if scores else 0.0
        return avg, results
|
|||
|
|
|
|||
|
|
# ── M5: Coherence ────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
    def _m5_coherence(self) -> float:
        """Sanity checks on HZ data integrity.

        Averages three checks read from DOLPHIN_FEATURES: exf_latest
        ACB-readiness, acb_boost value range, and obf universe size.
        Returns 0.0 when HZ is unreachable or the map read/parse fails.
        """
        hz = self._get_hz()
        if not hz:
            return 0.0
        checks = []
        try:
            features = hz.get_map("DOLPHIN_FEATURES").blocking()

            # exf_latest: _acb_ready must be True and ok_count > 20
            exf_raw = features.get("exf_latest")
            if exf_raw:
                exf = json.loads(exf_raw)
                acb_ready = exf.get("_acb_ready", False)
                ok_count = exf.get("_ok_count", 0)
                checks.append(1.0 if (acb_ready and ok_count >= 20) else 0.5)
            else:
                checks.append(0.0)

            # acb_boost: boost must be in [1.0, 2.5]
            acb_raw = features.get("acb_boost")
            if acb_raw:
                acb = json.loads(acb_raw)
                boost = acb.get("boost", 0)
                checks.append(1.0 if 1.0 <= boost <= 2.5 else 0.0)
            else:
                checks.append(0.0)

            # obf_universe: must have >= 200 assets
            uni_raw = features.get("obf_universe_latest")
            if uni_raw:
                uni = json.loads(uni_raw)
                n = uni.get("_n_assets", 0)
                checks.append(1.0 if n >= 200 else 0.5)
            else:
                checks.append(0.5)  # optional — not fatal

        except Exception as e:
            logger.debug("M5 error: %s", e)
            return 0.0

        return sum(checks) / len(checks) if checks else 0.0
|
|||
|
|
|
|||
|
|
# ── M6: Continuous test integrity ────────────────────────────────────────
|
|||
|
|
|
|||
|
|
def _m6_test_integrity(self) -> float:
|
|||
|
|
"""
|
|||
|
|
Reads run_logs/test_results_latest.json and scores the test suite health.
|
|||
|
|
|
|||
|
|
Scoring:
|
|||
|
|
- Each category in {data_integrity, finance_fuzz, signal_fill,
|
|||
|
|
degradation, actor} contributes equally.
|
|||
|
|
- PASS → 1.0 per category
|
|||
|
|
- FAIL → 0.0 per category
|
|||
|
|
- N/A → 0.8 (not yet automated — not penalised heavily)
|
|||
|
|
- Missing → 0.5 (unknown state)
|
|||
|
|
|
|||
|
|
Age penalty:
|
|||
|
|
- Last run > M6_STALE_SECONDS (15 min): score × 0.7
|
|||
|
|
- Last run > M6_DEAD_SECONDS (60 min): score = 0.0
|
|||
|
|
"""
|
|||
|
|
try:
|
|||
|
|
if not _TEST_RESULTS_PATH.exists():
|
|||
|
|
return 0.5 # file never written — unknown
|
|||
|
|
|
|||
|
|
raw = json.loads(_TEST_RESULTS_PATH.read_text())
|
|||
|
|
|
|||
|
|
# Age check
|
|||
|
|
run_at_str = raw.get("_run_at")
|
|||
|
|
if run_at_str:
|
|||
|
|
try:
|
|||
|
|
run_at = datetime.fromisoformat(run_at_str.replace("Z", "+00:00"))
|
|||
|
|
age_s = (datetime.now(timezone.utc) - run_at).total_seconds()
|
|||
|
|
if age_s > M6_DEAD_SECONDS:
|
|||
|
|
logger.warning("M6: test results stale %.0f min > dead threshold", age_s / 60)
|
|||
|
|
return 0.0
|
|||
|
|
age_penalty = 0.7 if age_s > M6_STALE_SECONDS else 1.0
|
|||
|
|
except Exception:
|
|||
|
|
age_penalty = 1.0
|
|||
|
|
else:
|
|||
|
|
age_penalty = 1.0
|
|||
|
|
|
|||
|
|
cats = ["data_integrity", "finance_fuzz", "signal_fill", "degradation", "actor"]
|
|||
|
|
cat_scores = []
|
|||
|
|
for cat in cats:
|
|||
|
|
info = raw.get(cat, {})
|
|||
|
|
status = (info.get("status") or "MISSING") if info else "MISSING"
|
|||
|
|
if status == "PASS":
|
|||
|
|
cat_scores.append(1.0)
|
|||
|
|
elif status == "FAIL":
|
|||
|
|
cat_scores.append(0.0)
|
|||
|
|
elif status == "N/A":
|
|||
|
|
cat_scores.append(0.8)
|
|||
|
|
else:
|
|||
|
|
cat_scores.append(0.5)
|
|||
|
|
|
|||
|
|
score = sum(cat_scores) / len(cat_scores) * age_penalty
|
|||
|
|
logger.debug(
|
|||
|
|
"M6: cats=%s age_penalty=%.2f score=%.3f",
|
|||
|
|
[f"{c}={s:.1f}" for c, s in zip(cats, cat_scores)],
|
|||
|
|
age_penalty, score,
|
|||
|
|
)
|
|||
|
|
return round(score, 3)
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.debug("M6 error: %s", e)
|
|||
|
|
return 0.5 # unknown — don't penalise if file unreadable
|
|||
|
|
|
|||
|
|
# ── Rm_meta ──────────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
def _compute_rm_meta(self, m4, m1_data, m3, m5, m6) -> Tuple[float, str]:
|
|||
|
|
"""
|
|||
|
|
FIX-1: Weighted sum — no single sensor can zero rm_meta.
|
|||
|
|
Trader heartbeat (m2) and trader process (m1_trader) are excluded
|
|||
|
|
because they may be intentionally stopped.
|
|||
|
|
m6 = continuous test suite integrity gate (6% weight).
|
|||
|
|
"""
|
|||
|
|
w = SENSOR_WEIGHTS
|
|||
|
|
tot = sum(w.values())
|
|||
|
|
rm = (
|
|||
|
|
w["m4_control_plane"] * m4 +
|
|||
|
|
w["m1_data_infra"] * m1_data +
|
|||
|
|
w["m3_data_freshness"] * m3 +
|
|||
|
|
w["m5_coherence"] * m5 +
|
|||
|
|
w["m6_test_integrity"] * m6
|
|||
|
|
) / tot
|
|||
|
|
|
|||
|
|
if rm > 0.85: status = "GREEN"
|
|||
|
|
elif rm > 0.6: status = "DEGRADED"
|
|||
|
|
elif rm > 0.3: status = "CRITICAL"
|
|||
|
|
else: status = "DEAD"
|
|||
|
|
|
|||
|
|
return round(rm, 3), status
|
|||
|
|
|
|||
|
|
# ── Recovery ─────────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
def _restart_via_supervisorctl(self, program: str):
|
|||
|
|
"""FIX-2: Restart via supervisorctl, never systemctl.
|
|||
|
|
FIX-7: Runs in a daemon thread so a slow/hung restart never blocks
|
|||
|
|
the main check loop — other services keep being monitored.
|
|||
|
|
"""
|
|||
|
|
import threading
|
|||
|
|
now = time.time()
|
|||
|
|
last = self._recovery_timestamps.get(program, 0.0)
|
|||
|
|
# Choose cooldown: critical data infra gets 60s, others 300s
|
|||
|
|
cfg = SERVICES.get(program, {})
|
|||
|
|
cooldown = (RECOVERY_COOLDOWN_CRITICAL_S if cfg.get("critical_data")
|
|||
|
|
else RECOVERY_COOLDOWN_DEFAULT_S)
|
|||
|
|
if now - last < cooldown:
|
|||
|
|
logger.debug("RECOVERY: %s cooldown active (%.0fs remaining)",
|
|||
|
|
program, cooldown - (now - last))
|
|||
|
|
return
|
|||
|
|
|
|||
|
|
# Mark timestamp NOW (before thread runs) so concurrent calls don't
|
|||
|
|
# also trigger a restart while the thread is in flight.
|
|||
|
|
self._recovery_timestamps[program] = now
|
|||
|
|
logger.warning("RECOVERY: restarting %s via supervisorctl (async)", program)
|
|||
|
|
|
|||
|
|
def _do_restart():
|
|||
|
|
try:
|
|||
|
|
result = subprocess.run(
|
|||
|
|
["supervisorctl", "-c", str(SUPERVISORD_CONF), "restart", program],
|
|||
|
|
capture_output=True, text=True, timeout=30,
|
|||
|
|
)
|
|||
|
|
logger.info("RECOVERY: %s → %s", program, result.stdout.strip() or "ok")
|
|||
|
|
except Exception as e:
|
|||
|
|
logger.error("RECOVERY: failed to restart %s: %s", program, e)
|
|||
|
|
|
|||
|
|
threading.Thread(target=_do_restart, daemon=True,
|
|||
|
|
name=f"recovery-{program.replace(':', '-')}").start()
|
|||
|
|
|
|||
|
|
def _attempt_recovery(self, report: HealthReport):
|
|||
|
|
"""
|
|||
|
|
FIX-5: Only auto-restart critical data infra services.
|
|||
|
|
Trader services are NEVER auto-restarted (may be intentionally stopped).
|
|||
|
|
FIX-6: Each service has its own 10-min restart cooldown.
|
|||
|
|
"""
|
|||
|
|
if report.status == "GREEN":
|
|||
|
|
return
|
|||
|
|
|
|||
|
|
# Only recover data infra, and only when CRITICAL or DEAD
|
|||
|
|
if report.status not in ("CRITICAL", "DEAD"):
|
|||
|
|
return
|
|||
|
|
|
|||
|
|
for prog, state in report.service_status.items():
|
|||
|
|
cfg = SERVICES.get(prog, {})
|
|||
|
|
if cfg.get("critical_data") and state == "STOPPED":
|
|||
|
|
self._restart_via_supervisorctl(prog)
|
|||
|
|
|
|||
|
|
# ── SurvivalStack (posture control) ──────────────────────────────────────
|
|||
|
|
|
|||
|
|
    def _process_survival_stack(self, rm_meta: float, m1_data: float):
        """Compute posture via SurvivalStack and write to DOLPHIN_SAFETY.

        Integrated from system_watchdog_service.py (MIG3) — no duplicate process needed.

        Gathers SurvivalStack inputs directly from HZ (cluster size, heartbeat
        age, MC-Forewarner status, BTCUSDT order-book depth, DVOL spike state,
        engine drawdown), computes rm, updates posture, and writes the result
        back to HZ. Posture *transitions* are additionally recorded to
        ClickHouse on a best-effort basis. Every failure path is swallowed —
        this method must never crash the main loop.

        NOTE(review): the rm_meta and m1_data parameters are not referenced in
        the body; all inputs are re-derived from HZ.
        """
        if not self._survival_stack:
            return
        hz = self._get_hz()
        if not hz:
            return
        try:
            hz_nodes = len(hz.cluster_service.get_members())

            # Heartbeat age — read from DOLPHIN_HEARTBEAT
            try:
                hb_raw = hz.get_map("DOLPHIN_HEARTBEAT").blocking().get("nautilus_flow_heartbeat")
                hb_data = json.loads(hb_raw) if hb_raw else {}
                heartbeat_age_s = (datetime.now(timezone.utc).timestamp() - hb_data.get("ts", 0))
            except Exception:
                heartbeat_age_s = 999.0  # unreadable heartbeat — treat as very old

            now_ts = datetime.now(timezone.utc).timestamp()
            features_map = hz.get_map("DOLPHIN_FEATURES").blocking()

            # MC-Forewarner — absent = not deployed on this instance, treat as neutral
            try:
                mc_raw = features_map.get("mc_forewarner_latest")
                if mc_raw is None:
                    mc_status, mc_age_h = "GREEN", 0.0
                else:
                    mc_state = json.loads(mc_raw)
                    mc_status = mc_state.get("status", "GREEN")
                    mc_ts = mc_state.get("timestamp")
                    mc_age_h = ((now_ts - datetime.fromisoformat(mc_ts).timestamp()) / 3600.0
                                if mc_ts else 0.0)
            except Exception:
                mc_status, mc_age_h = "GREEN", 0.0

            # OBF (order-book features) — obf_universe IS deployed; absence/staleness = real failure
            try:
                ob_raw = features_map.get("asset_BTCUSDT_ob")
                if ob_raw is None:
                    # obf_universe should be writing — absent means subsystem failed
                    ob_stale, ob_depth, ob_fill = True, 0.5, 0.5
                else:
                    ob_state = json.loads(ob_raw)
                    ob_ts = ob_state.get("timestamp", 0)
                    ob_age_s = now_ts - ob_ts if ob_ts else 999.0
                    if ob_age_s > 60.0:
                        ob_stale, ob_depth, ob_fill = True, 0.5, 0.5
                    else:
                        # Data is live — measure DEPTH (liquidity), NOT balance (balance is a signal, not a risk)
                        # Deep book with skew is fillable; thin book is risky regardless of balance
                        bid_top = (ob_state.get("bid_notional") or [0])[0]
                        ask_top = (ob_state.get("ask_notional") or [0])[0]
                        total = bid_top + ask_top
                        # Normalize total top-of-book notional: $200k = 1.0 (well above typical min order size)
                        depth = min(1.0, total / 200_000.0) if total > 0 else 0.0
                        ob_stale = False
                        ob_depth = max(0.3, depth)  # floor at 0.3 so live-but-thin books aren't scored as dead
                        ob_fill = ob_depth
                        logger.debug("OBF live: age=%.1fs total_notional=%.0f depth=%.3f",
                                     ob_age_s, total, ob_depth)
            except Exception as ob_exc:
                logger.debug("OBF parse error: %s", ob_exc)
                ob_stale, ob_depth, ob_fill = True, 0.5, 0.5

            # ExYF (external factors / dvol) — exf_fetcher IS deployed; absence/staleness = real failure
            # Cat4 models dvol spike risk: spike → 0.3, no spike (t=999) → ≈1.0
            DVOL_SPIKE_THRESHOLD = 70.0  # BTC DVOL > 70 = elevated vol regime
            EXF_STALE_S = 120.0          # exf_latest older than 2 min = stale
            DVOL_STALE_S = 300.0         # dvol_btc staleness > 5 min = unknown
            try:
                exf_raw = features_map.get("exf_latest")
                if exf_raw is None:
                    dvol_spike, t_since_spike_min = True, 0.0
                else:
                    exf = json.loads(exf_raw)
                    pushed_at = exf.get("_pushed_at")
                    exf_age_s = (now_ts - datetime.fromisoformat(pushed_at).timestamp()
                                 if pushed_at else 999.0)
                    if exf_age_s > EXF_STALE_S:
                        # ExF feed stale — treat as unknown risk
                        dvol_spike, t_since_spike_min = True, 0.0
                    else:
                        dvol_btc = exf.get("dvol_btc")
                        dvol_staleness = (exf.get("_staleness_s") or {}).get("dvol_btc", 999.0)
                        if dvol_btc is None or dvol_staleness > DVOL_STALE_S:
                            # dvol specifically not fresh
                            dvol_spike, t_since_spike_min = True, 0.0
                        else:
                            dvol_spike = float(dvol_btc) > DVOL_SPIKE_THRESHOLD
                            t_since_spike_min = 0.0 if dvol_spike else 999.0
            except Exception:
                dvol_spike, t_since_spike_min = True, 0.0

            # Drawdown from engine snapshot
            try:
                snap_raw = hz.get_map("DOLPHIN_STATE_BLUE").blocking().get("engine_snapshot")
                snap = json.loads(snap_raw) if snap_raw else {}
                drawdown = snap.get("drawdown", 0.0)
            except Exception:
                drawdown = 0.0

            rm, breakdown = self._survival_stack.compute_rm(
                hz_nodes=hz_nodes,
                heartbeat_age_s=heartbeat_age_s,
                mc_status=mc_status,
                mc_staleness_hours=mc_age_h,
                ob_depth_quality=ob_depth,
                ob_fill_prob=ob_fill,
                ob_stale=ob_stale,
                dvol_spike=dvol_spike,
                t_since_spike_min=t_since_spike_min,
                drawdown=drawdown,
            )
            self._survival_stack.update_posture(rm)
            self._survival_stack.write_to_hz(rm, breakdown, hz)

            # Best-effort ClickHouse audit trail of posture transitions.
            try:
                from ch_writer import ch_put, ts_us as _ts
                # Attribute name differs across SurvivalStack versions — try both.
                curr_posture = (getattr(self._survival_stack, 'posture', None) or
                                getattr(self._survival_stack, 'current_posture', None) or '')
                if curr_posture:
                    prev = getattr(self, '_ch_prev_posture', '')
                    if curr_posture != prev:
                        ch_put("posture_events", {
                            "ts": _ts(),
                            "posture": curr_posture,
                            "rm": float(rm),
                            "prev_posture": prev,
                            "trigger": str(breakdown)[:200],
                            "scan_uuid": "",
                        })
                        self._ch_prev_posture = curr_posture
            except Exception:
                pass  # ch_writer is optional — never fail the health loop over it

        except Exception as e:
            logger.debug("SurvivalStack processing error: %s", e)
|
|||
|
|
|
|||
|
|
# ── Emit ─────────────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
def _emit(self, report: HealthReport):
    """Publish a HealthReport to every sink: local JSON, Hazelcast, ClickHouse, log.

    Sinks are independent and best-effort: a failure in any one must not
    block the others or crash the watchdog, so each is wrapped separately.
    FIX vs prior version: failures are no longer silently swallowed with
    bare ``pass`` — they are logged (debug level for optional sinks) so
    outages are diagnosable, while the never-raise contract is preserved.

    Args:
        report: fully-populated HealthReport for this check cycle.
    """
    d = asdict(report)

    # Local JSON snapshot (STATUS_JSON path, consumed by dashboards/CLI).
    try:
        STATUS_JSON.write_text(json.dumps(d, indent=2))
    except Exception as e:
        logger.error("Failed to write status JSON: %s", e)

    # Hazelcast push: DOLPHIN_META_HEALTH["latest"].
    hz = self._get_hz()
    if hz:
        try:
            hz.get_map("DOLPHIN_META_HEALTH").blocking().put("latest", json.dumps(d))
        except Exception as e:
            # Best-effort: HZ may be down; local JSON + log still carry the report.
            logger.debug("HZ push of meta health failed: %s", e)

    # ClickHouse row — ch_writer is an optional module, imported lazily so
    # its absence never breaks the health loop.
    try:
        from ch_writer import ch_put, ts_us as _ts
        ch_put("meta_health", {
            "ts": _ts(),
            "status": report.status,
            "rm_meta": report.rm_meta,
            "m1_data_infra": report.m1_data_infra,
            "m1_trader": report.m1_trader,
            "m2_heartbeat": report.m2_heartbeat,
            "m3_data_freshness": report.m3_data_freshness,
            "m4_control_plane": report.m4_control_plane,
            "m5_coherence": report.m5_coherence,
            "m6_test_integrity": report.m6_test_integrity,
        })
    except Exception as e:
        # Missing module or CH outage — log and continue.
        logger.debug("ClickHouse meta_health write skipped: %s", e)

    # One-line operational summary for the service log.
    logger.info(
        "RM_META=%.3f [%s] M4=%.2f M1_data=%.2f M3=%.2f M5=%.2f M6=%.2f "
        "M1_trader=%.2f M2_hb=%.2f",
        report.rm_meta, report.status,
        report.m4_control_plane, report.m1_data_infra,
        report.m3_data_freshness, report.m5_coherence, report.m6_test_integrity,
        report.m1_trader, report.m2_heartbeat,
    )
|
|||
|
|
|
|||
|
|
# ── Main loop ─────────────────────────────────────────────────────────────
|
|||
|
|
|
|||
|
|
def run(self):
    """Main monitoring loop: measure all subsystems, emit, recover, repeat.

    Runs forever at CHECK_INTERVAL_S cadence, timed with a monotonic clock
    so wall-clock jumps cannot distort the schedule. A minimum 1s sleep
    prevents busy-spinning if a cycle overruns the interval.

    FIX: cycle errors are logged with ``logger.exception`` so the full
    traceback is preserved (``logger.error(...: %s, e)`` previously threw
    the traceback away — the one thing a watchdog most needs when its own
    checks fail). The loop itself never dies to a transient fault.
    """
    logger.info("MHSv3 running (interval=%.0fs)", CHECK_INTERVAL_S)
    while True:
        t0 = time.monotonic()
        try:
            # Gather the metric families M1..M6.
            m4 = self._m4_control_plane()
            m1_data, m1_trader, svc_s = self._m1_process_integrity()
            m2 = self._m2_heartbeat_freshness()
            m3, hz_keys = self._m3_data_freshness()
            m5 = self._m5_coherence()
            m6 = self._m6_test_integrity()

            # Aggregate into rm_meta + overall status. Note: m1_trader and
            # m2 are informational only and deliberately excluded here.
            rm, status = self._compute_rm_meta(m4, m1_data, m3, m5, m6)

            report = HealthReport(
                rm_meta = rm,
                status = status,
                m4_control_plane = round(m4, 3),
                m1_data_infra = round(m1_data, 3),
                m1_trader = round(m1_trader, 3),
                m2_heartbeat = round(m2, 3),
                m3_data_freshness = round(m3, 3),
                m5_coherence = round(m5, 3),
                m6_test_integrity = round(m6, 3),
                service_status = svc_s,
                hz_key_status = hz_keys,
                timestamp = datetime.now(timezone.utc).isoformat(),
            )
            self._emit(report)
            self._attempt_recovery(report)
            self._process_survival_stack(rm, m1_data)

        except Exception:
            # Keep the watchdog alive; record the full traceback.
            logger.exception("Main loop error")

        # Sleep out the remainder of the interval (minimum 1s).
        elapsed = time.monotonic() - t0
        time.sleep(max(1.0, CHECK_INTERVAL_S - elapsed))
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    import signal

    # Construct the service first, then install graceful-shutdown handling.
    svc = MetaHealthServiceV3()

    def _handle_term(signum, _frame):
        # supervisord stops programs with SIGTERM — log it and exit cleanly.
        logger.info("MHSv3 received signal %d — shutting down", signum)
        sys.exit(0)

    signal.signal(signal.SIGTERM, _handle_term)

    # SIGINT is intentionally NOT registered: an interactive Ctrl-C raises
    # KeyboardInterrupt, which is handled below.
    try:
        svc.run()
    except KeyboardInterrupt:
        logger.info("MHSv3 stopped by user")
        sys.exit(0)
|