Files
DOLPHIN/prod/tests/test_mhs_v3.py

1776 lines
77 KiB
Python
Raw Normal View History

"""
DOLPHIN Meta Health Service v3 Comprehensive Test Suite
==========================================================
Unit, integration, and E2E kill/revive tests.
Test classes:
TestSupervisordStatusParsing unit: parse supervisorctl output variants
TestM1ProcessIntegrity unit: scoring logic with mocked sv_status
TestM3DataFreshnessScoring unit: freshness thresholds and scoring
TestRmMetaFormula unit: weighted sum, thresholds, edge cases
TestRecoveryGating unit: cooldown, thread isolation, only-STOPPED rule
TestRecoveryNeverKillsRunning unit: RUNNING services are NEVER restarted
TestM4ControlPlane unit: port checks with mocked socket
TestM5Coherence unit: data integrity checks
TestLiveIntegration integration: live HZ + supervisord status
TestKillAndRevive E2E: stop critical services, verify MHS detects
and revives within 60s (critical cooldown window)
Run:
cd /mnt/dolphinng5_predict
source /home/dolphin/siloqy_env/bin/activate
python -m pytest prod/tests/test_mhs_v3.py -v --tb=short
"""
import json
import subprocess
import sys
import time
import threading
import unittest
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict
from unittest.mock import MagicMock, patch, call
sys.path.insert(0, str(Path(__file__).parent.parent))
# ── Import the module under test ───────────────────────────────────────────────
from meta_health_service_v3 import (
MetaHealthServiceV3,
SERVICES,
HZ_DATA_SOURCES,
SENSOR_WEIGHTS,
CHECK_INTERVAL_S,
DATA_STALE_S,
DATA_DEAD_S,
RECOVERY_COOLDOWN_CRITICAL_S,
RECOVERY_COOLDOWN_DEFAULT_S,
SUPERVISORD_CONF,
)
CONF_PATH = str(SUPERVISORD_CONF)
# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
def _fresh_svc() -> MetaHealthServiceV3:
    """Build a brand-new MHS instance; its HZ client connects lazily."""
    svc = MetaHealthServiceV3()
    return svc
def _hz_available() -> bool:
import socket
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.settimeout(1.0)
return s.connect_ex(("127.0.0.1", 5701)) == 0
except Exception:
return False
def _supervisord_running() -> bool:
try:
r = subprocess.run(
["supervisorctl", "-c", CONF_PATH, "status"],
capture_output=True, text=True, timeout=5,
)
return "exf_fetcher" in r.stdout
except Exception:
return False
def _supervisorctl(cmd: str, prog: str) -> bool:
"""Run supervisorctl command, return True on success."""
try:
r = subprocess.run(
["supervisorctl", "-c", CONF_PATH, cmd, prog],
capture_output=True, text=True, timeout=15,
)
return r.returncode == 0
except Exception:
return False
def _get_supervisord_state(prog: str) -> str:
"""Return supervisord state for a program: RUNNING / STOPPED / FATAL / UNKNOWN."""
try:
r = subprocess.run(
["supervisorctl", "-c", CONF_PATH, "status", prog],
capture_output=True, text=True, timeout=5,
)
parts = r.stdout.strip().split()
return parts[1] if len(parts) >= 2 else "UNKNOWN"
except Exception:
return "UNKNOWN"
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
# ─────────────────────────────────────────────────────────────────────────────
# Unit tests
# ─────────────────────────────────────────────────────────────────────────────
class TestSupervisordStatusParsing(unittest.TestCase):
    """Unit: _check_supervisord_status parses all supervisorctl output variants."""

    def setUp(self):
        self.svc = _fresh_svc()

    def _mock_run(self, stdout: str, returncode: int = 0):
        # Stand-in for subprocess.CompletedProcess exposing only the
        # attributes the parser reads: stdout and returncode.
        mock = MagicMock()
        mock.stdout = stdout
        mock.returncode = returncode
        return mock

    def test_all_running(self):
        # Full status listing: every known service RUNNING.
        stdout = (
            "dolphin_data:exf_fetcher RUNNING pid 100, uptime 1:00:00\n"
            "dolphin_data:acb_processor RUNNING pid 101, uptime 0:30:00\n"
            "dolphin_data:obf_universe RUNNING pid 102, uptime 0:20:00\n"
            "dolphin_data:meta_health RUNNING pid 103, uptime 0:10:00\n"
            "dolphin:nautilus_trader RUNNING pid 200, uptime 2:00:00\n"
            "dolphin:scan_bridge RUNNING pid 201, uptime 1:50:00\n"
        )
        with patch("subprocess.run", return_value=self._mock_run(stdout)):
            result = self.svc._check_supervisord_status()
        for prog in SERVICES:
            self.assertIn(prog, result, f"{prog} not in parsed result")
            self.assertEqual(result[prog], "RUNNING", f"{prog} should be RUNNING")

    def test_one_stopped(self):
        stdout = (
            "dolphin_data:exf_fetcher STOPPED Not started\n"
            "dolphin_data:acb_processor RUNNING pid 101\n"
            "dolphin_data:obf_universe RUNNING pid 102\n"
        )
        with patch("subprocess.run", return_value=self._mock_run(stdout)):
            result = self.svc._check_supervisord_status()
        self.assertEqual(result.get("dolphin_data:exf_fetcher"), "STOPPED")
        self.assertEqual(result.get("dolphin_data:acb_processor"), "RUNNING")

    def test_starting_state(self):
        # Trailing whitespace after the state column must not break parsing.
        stdout = "dolphin_data:exf_fetcher STARTING \n"
        with patch("subprocess.run", return_value=self._mock_run(stdout)):
            result = self.svc._check_supervisord_status()
        self.assertEqual(result.get("dolphin_data:exf_fetcher"), "STARTING")

    def test_fatal_state(self):
        stdout = "dolphin_data:exf_fetcher FATAL Exited too quickly\n"
        with patch("subprocess.run", return_value=self._mock_run(stdout)):
            result = self.svc._check_supervisord_status()
        self.assertEqual(result.get("dolphin_data:exf_fetcher"), "FATAL")

    def test_subprocess_failure_returns_empty(self):
        # The parser must degrade to an empty dict when supervisorctl itself fails.
        with patch("subprocess.run", side_effect=Exception("timeout")):
            result = self.svc._check_supervisord_status()
        self.assertIsInstance(result, dict)
        self.assertEqual(len(result), 0)

    def test_exit_code_3_still_parsed(self):
        """supervisorctl exits 3 when some services are STOPPED — output still valid."""
        stdout = (
            "dolphin_data:exf_fetcher RUNNING pid 100\n"
            "dolphin:clean_arch_trader STOPPED Not started\n"
        )
        with patch("subprocess.run", return_value=self._mock_run(stdout, returncode=3)):
            result = self.svc._check_supervisord_status()
        self.assertEqual(result.get("dolphin_data:exf_fetcher"), "RUNNING")

    def test_empty_output_returns_empty(self):
        with patch("subprocess.run", return_value=self._mock_run("")):
            result = self.svc._check_supervisord_status()
        self.assertEqual(result, {})
class TestM1ProcessIntegrity(unittest.TestCase):
    """Unit: M1 scoring with various supervisorctl states."""

    def setUp(self):
        self.svc = _fresh_svc()

    def _run_m1_with_statuses(self, sv: Dict[str, str]):
        # Feed a canned {program: state} map into the M1 sensor.
        with patch.object(self.svc, "_check_supervisord_status", return_value=sv):
            return self.svc._m1_process_integrity()

    def test_all_running_score_1(self):
        sv = {p: "RUNNING" for p in SERVICES}
        m1_data, m1_trader, svc_s = self._run_m1_with_statuses(sv)
        self.assertEqual(m1_data, 1.0)
        self.assertEqual(m1_trader, 1.0)

    def test_one_critical_stopped_reduces_data_score(self):
        sv = {p: "RUNNING" for p in SERVICES}
        sv["dolphin_data:exf_fetcher"] = "STOPPED"
        m1_data, m1_trader, _ = self._run_m1_with_statuses(sv)
        # Expected score: fraction of critical-data services still running.
        critical = [p for p, c in SERVICES.items() if c["critical_data"]]
        expected = (len(critical) - 1) / len(critical)
        self.assertAlmostEqual(m1_data, expected, places=3)
        self.assertEqual(m1_trader, 1.0)  # trader score unaffected

    def test_trader_stopped_does_not_affect_data_score(self):
        sv = {p: "RUNNING" for p in SERVICES}
        sv["dolphin:nautilus_trader"] = "STOPPED"
        m1_data, m1_trader, _ = self._run_m1_with_statuses(sv)
        self.assertEqual(m1_data, 1.0)  # data infra unaffected
        trader_progs = [p for p, c in SERVICES.items() if not c["critical_data"]]
        expected_trader = (len(trader_progs) - 1) / len(trader_progs)
        self.assertAlmostEqual(m1_trader, expected_trader, places=3)

    def test_all_critical_stopped_score_0(self):
        sv = {p: "RUNNING" for p in SERVICES}
        for p, c in SERVICES.items():
            if c["critical_data"]:
                sv[p] = "STOPPED"
        m1_data, _, _ = self._run_m1_with_statuses(sv)
        self.assertEqual(m1_data, 0.0)

    def test_fatal_treated_same_as_stopped(self):
        sv = {p: "RUNNING" for p in SERVICES}
        sv["dolphin_data:acb_processor"] = "FATAL"
        m1_data, _, svc_s = self._run_m1_with_statuses(sv)
        self.assertLess(m1_data, 1.0)
        # M1 normalizes FATAL to STOPPED in the per-service status map.
        self.assertEqual(svc_s["dolphin_data:acb_processor"], "STOPPED")

    def test_service_dict_contains_all_services(self):
        sv = {p: "RUNNING" for p in SERVICES}
        _, _, svc_s = self._run_m1_with_statuses(sv)
        for prog in SERVICES:
            self.assertIn(prog, svc_s)

    def test_empty_supervisord_output_falls_back_to_psutil(self):
        """If supervisorctl returns nothing, psutil fallback is used."""
        with patch.object(self.svc, "_check_supervisord_status", return_value={}):
            with patch("psutil.process_iter") as mock_pi:
                # One live python3 process whose cmdline matches exf_fetcher.
                mock_proc = MagicMock()
                mock_proc.info = {
                    "name": "python3",
                    "cmdline": ["/usr/bin/python3", "exf_fetcher_flow.py"],
                }
                mock_pi.return_value = [mock_proc]
                m1_data, _, svc_s = self.svc._m1_process_integrity()
        self.assertEqual(svc_s["dolphin_data:exf_fetcher"], "RUNNING")
class TestM3DataFreshnessScoring(unittest.TestCase):
    """Unit: freshness thresholds applied correctly.

    Each test mocks the HZ features map so _m3_data_freshness sees exactly
    one key with a controlled age, then checks the resulting score:
      fresh (age < DATA_STALE_S)       -> 1.0
      stale (age >= DATA_STALE_S)      -> 0.5
      dead  (age >= DATA_DEAD_S)       -> 0.0
      key missing                      -> 0.0
      key present, ts field absent     -> 0.7 (presence-only credit)
    """

    def setUp(self):
        self.svc = _fresh_svc()

    def _make_hz_with_age(self, age_s: float, ts_field: str) -> str:
        """Return a JSON blob whose `ts_field` timestamp is `age_s` seconds old."""
        # FIX: dropped the unused `key` parameter the original signature carried.
        ts = time.time() - age_s
        if ts_field == "_pushed_at":
            ts_val = ts  # numeric unix epoch
        else:
            ts_val = datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
        return json.dumps({ts_field: ts_val, "dummy": 1})

    def test_fresh_scores_1(self):
        raw = self._make_hz_with_age(5.0, "_pushed_at")
        score, _ = self._score_one("exf_latest", raw, "_pushed_at")
        self.assertEqual(score, 1.0)

    def test_stale_scores_half(self):
        raw = self._make_hz_with_age(DATA_STALE_S + 5, "_pushed_at")
        score, _ = self._score_one("exf_latest", raw, "_pushed_at")
        self.assertEqual(score, 0.5)

    def test_dead_scores_zero(self):
        raw = self._make_hz_with_age(DATA_DEAD_S + 1, "_pushed_at")
        score, _ = self._score_one("exf_latest", raw, "_pushed_at")
        self.assertEqual(score, 0.0)

    def test_missing_key_scores_zero(self):
        score, _ = self._score_one("exf_latest", None, "_pushed_at")
        self.assertEqual(score, 0.0)

    def test_no_ts_field_scores_point7(self):
        """Key present but no timestamp field → 0.7 (presence-only credit)."""
        raw = json.dumps({"other_key": 123})
        score, _ = self._score_one("exf_latest", raw, "_pushed_at")
        self.assertAlmostEqual(score, 0.7, places=3)

    def test_iso_timestamp_parsed_correctly(self):
        """ISO format timestamps (obf_universe) are parsed and aged correctly."""
        ts_val = datetime.fromtimestamp(time.time() - 10, tz=timezone.utc).isoformat()
        raw = json.dumps({"_snapshot_utc": ts_val})
        score, _ = self._score_one("obf_universe", raw, "_snapshot_utc")
        self.assertEqual(score, 1.0)

    def test_presence_only_key_scores_1_when_present(self):
        """acb_boost has ts_field=None → presence-only, score=1.0."""
        raw = json.dumps({"boost": 1.3, "signals": 1.0})
        with patch.object(self.svc, "_get_hz") as mock_hz:
            features_map = MagicMock()
            features_map.get.side_effect = lambda k: raw if k == "acb_boost" else None
            mock_hz.return_value.get_map.return_value.blocking.return_value = features_map
            m3, results = self.svc._m3_data_freshness()
        self.assertEqual(results.get("acb_boost", {}).get("score"), 1.0)

    def _score_one(self, name: str, raw_value, ts_field: str):
        """Run M3 with HZ mocked so only `name`'s registered HZ key serves `raw_value`."""
        with patch.object(self.svc, "_get_hz") as mock_hz:
            features_map = MagicMock()
            # HZ_DATA_SOURCES[name][1] is the HZ map key registered for `name`.
            features_map.get.side_effect = lambda k: (
                raw_value if k == HZ_DATA_SOURCES.get(name, (None, k, None))[1] else None
            )
            mock_hz.return_value.get_map.return_value.blocking.return_value = features_map
            m3, results = self.svc._m3_data_freshness()
        entry = results.get(name, {})
        return entry.get("score", None), entry
class TestRmMetaFormula(unittest.TestCase):
    """Unit: weighted sum behaves correctly."""

    def setUp(self):
        self.svc = _fresh_svc()

    def _rm(self, m4, m1, m3, m5):
        # Convenience wrapper around the rm_meta computation.
        rm, status = self.svc._compute_rm_meta(m4, m1, m3, m5)
        return rm, status

    def test_all_ones_is_green(self):
        rm, status = self._rm(1.0, 1.0, 1.0, 1.0)
        self.assertEqual(status, "GREEN")
        self.assertAlmostEqual(rm, 1.0, places=2)

    def test_all_zeros_is_dead(self):
        rm, status = self._rm(0.0, 0.0, 0.0, 0.0)
        self.assertEqual(status, "DEAD")
        self.assertAlmostEqual(rm, 0.0, places=2)

    def test_hz_down_alone_is_still_degraded(self):
        """M4=0 (HZ down) but all processes running and data fresh → DEGRADED, not DEAD."""
        rm, status = self._rm(0.0, 1.0, 1.0, 1.0)
        # M4 weight=0.35; remaining = 0.65 → DEGRADED (>0.6)
        self.assertIn(status, ("DEGRADED", "GREEN"))
        self.assertGreater(rm, 0.3)

    def test_one_data_service_down_still_not_dead(self):
        """One of three critical data services stopped → M1_data=0.667."""
        m1_data = 2/3
        rm, status = self._rm(1.0, m1_data, 1.0, 1.0)
        self.assertNotEqual(status, "DEAD")
        self.assertGreater(rm, 0.6)

    def test_all_data_services_down_critical_or_dead(self):
        rm, status = self._rm(1.0, 0.0, 0.0, 0.0)
        self.assertIn(status, ("CRITICAL", "DEAD"))

    def test_hz_and_data_both_down_is_critical(self):
        rm, status = self._rm(0.0, 0.0, 1.0, 1.0)
        self.assertIn(status, ("CRITICAL", "DEAD"))

    def test_product_formula_would_have_been_wrong(self):
        """Prove that the old product formula collapses when M3=0."""
        product = 1.0 * 1.0 * 0.0 * 1.0  # M3=0 → product=0
        self.assertEqual(product, 0.0)
        # But weighted sum is fine:
        rm, _ = self._rm(1.0, 1.0, 0.0, 1.0)
        self.assertGreater(rm, 0.5)

    def test_status_thresholds(self):
        # NOTE(review): `expected` is unpacked from each case but never compared
        # to `status` — the assertion only checks membership in the valid status
        # set. Confirm whether assertEqual(status, expected) was intended.
        cases = [
            (1.0, 1.0, 1.0, 1.0, "GREEN"),
            (1.0, 0.5, 1.0, 1.0, "GREEN"),
            (0.5, 0.5, 0.5, 0.5, "DEGRADED"),
            (0.0, 0.5, 0.0, 0.5, "CRITICAL"),
            (0.0, 0.0, 0.0, 0.0, "DEAD"),
        ]
        for m4, m1, m3, m5, expected in cases:
            _, status = self._rm(m4, m1, m3, m5)
            self.assertIn(status, ("GREEN", "DEGRADED", "CRITICAL", "DEAD"),
                          f"Invalid status for m4={m4} m1={m1} m3={m3} m5={m5}")

    def test_weights_sum_to_1(self):
        self.assertAlmostEqual(sum(SENSOR_WEIGHTS.values()), 1.0, places=6)

    def test_m2_and_trader_excluded_from_rm(self):
        """rm_meta must be independent of M2 and M1_trader."""
        rm1, _ = self._rm(1.0, 1.0, 1.0, 1.0)
        # Even if M2=0 or M1_trader=0, rm_meta shouldn't change
        # (they aren't inputs to _compute_rm_meta)
        rm2, _ = self._rm(1.0, 1.0, 1.0, 1.0)
        self.assertAlmostEqual(rm1, rm2, places=6)
class TestRecoveryGating(unittest.TestCase):
    """Unit: cooldown, thread isolation, only-STOPPED gating."""

    def setUp(self):
        self.svc = _fresh_svc()

    def test_cooldown_prevents_double_restart(self):
        """Two calls within cooldown window → only one restart fires."""
        # FIX: removed an unused `calls = []` local left over from an earlier draft.
        with patch("subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(stdout="ok", returncode=0)
            self.svc._restart_via_supervisorctl("dolphin_data:exf_fetcher")
            time.sleep(0.1)
            # Second call lands inside the cooldown window → must be gated off.
            self.svc._restart_via_supervisorctl("dolphin_data:exf_fetcher")
            time.sleep(0.5)  # let thread finish
            self.assertEqual(mock_run.call_count, 1)

    def test_critical_service_cooldown_is_10s(self):
        """Critical services have RECOVERY_COOLDOWN_CRITICAL_S=10 cooldown."""
        self.assertEqual(RECOVERY_COOLDOWN_CRITICAL_S, 10.0)
        self.assertLess(RECOVERY_COOLDOWN_CRITICAL_S, 60.0,
                        "Critical cooldown must be < 60 seconds")

    def test_cooldown_expires_allows_second_restart(self):
        """After cooldown, a second restart is allowed."""
        # Backdate the last-recovery timestamp past the cooldown window.
        self.svc._recovery_timestamps["dolphin_data:exf_fetcher"] = (
            time.time() - RECOVERY_COOLDOWN_CRITICAL_S - 1
        )
        with patch("subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(stdout="ok", returncode=0)
            self.svc._restart_via_supervisorctl("dolphin_data:exf_fetcher")
            time.sleep(0.5)
            self.assertEqual(mock_run.call_count, 1)

    def test_restart_runs_in_separate_thread(self):
        """Recovery must not block the calling thread."""
        main_thread = threading.current_thread()
        restart_thread_name = None

        def fake_run(*args, **kwargs):
            # Record which thread actually executes supervisorctl.
            nonlocal restart_thread_name
            restart_thread_name = threading.current_thread().name
            time.sleep(0.1)
            return MagicMock(stdout="ok", returncode=0)

        with patch("subprocess.run", side_effect=fake_run):
            t0 = time.monotonic()
            self.svc._restart_via_supervisorctl("dolphin_data:exf_fetcher")
            elapsed = time.monotonic() - t0
            self.assertLess(elapsed, 0.05,  # main thread was not blocked
                            "Recovery blocked the calling thread")
            time.sleep(0.3)
        self.assertIsNotNone(restart_thread_name)
        self.assertNotEqual(restart_thread_name, main_thread.name)
        self.assertIn("recovery", restart_thread_name)

    def test_different_services_have_independent_cooldowns(self):
        """Cooldown on exf_fetcher must not block acb_processor restart."""
        self.svc._recovery_timestamps["dolphin_data:exf_fetcher"] = time.time()
        with patch("subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(stdout="ok", returncode=0)
            self.svc._restart_via_supervisorctl("dolphin_data:exf_fetcher")  # blocked by cooldown
            self.svc._restart_via_supervisorctl("dolphin_data:acb_processor")  # should fire
            time.sleep(0.3)
            called_progs = [str(c) for c in mock_run.call_args_list]
            self.assertEqual(mock_run.call_count, 1)
            self.assertTrue(any("acb_processor" in c for c in called_progs))
class TestRecoveryNeverKillsRunning(unittest.TestCase):
    """Unit: RUNNING services are NEVER touched by recovery."""

    def setUp(self):
        self.svc = _fresh_svc()

    def _make_report(self, status, service_states):
        """Build a HealthReport with the given status and per-service state map.

        Sensor scores are pessimistic dummies — recovery gating must key off
        `status` and `service_status` only.
        """
        # FIX: removed an unused `from dataclasses import fields` import.
        from meta_health_service_v3 import HealthReport
        return HealthReport(
            rm_meta=0.0,
            status=status,
            m4_control_plane=0.0,
            m1_data_infra=0.0,
            m1_trader=1.0,
            m2_heartbeat=0.5,
            m3_data_freshness=0.0,
            m5_coherence=0.0,
            service_status=service_states,
            hz_key_status={},
            timestamp=_now_iso(),
        )

    def test_running_service_never_restarted_even_in_dead_state(self):
        """Status=DEAD but all services RUNNING → no restart attempt."""
        states = {p: "RUNNING" for p in SERVICES}
        report = self._make_report("DEAD", states)
        with patch.object(self.svc, "_restart_via_supervisorctl") as mock_restart:
            self.svc._attempt_recovery(report)
            mock_restart.assert_not_called()

    def test_green_status_never_triggers_recovery(self):
        states = {p: "STOPPED" for p in SERVICES}  # all stopped but GREEN
        report = self._make_report("GREEN", states)
        with patch.object(self.svc, "_restart_via_supervisorctl") as mock_restart:
            self.svc._attempt_recovery(report)
            mock_restart.assert_not_called()

    def test_degraded_status_never_triggers_recovery(self):
        states = {"dolphin_data:exf_fetcher": "STOPPED"}
        report = self._make_report("DEGRADED", states)
        with patch.object(self.svc, "_restart_via_supervisorctl") as mock_restart:
            self.svc._attempt_recovery(report)
            mock_restart.assert_not_called()

    def test_trader_stopped_never_restarted(self):
        """Trader is informational — never auto-restarted regardless of status."""
        states = {p: "RUNNING" for p in SERVICES}
        states["dolphin:nautilus_trader"] = "STOPPED"
        states["dolphin:scan_bridge"] = "STOPPED"
        report = self._make_report("CRITICAL", states)
        with patch.object(self.svc, "_restart_via_supervisorctl") as mock_restart:
            self.svc._attempt_recovery(report)
            # trader and scan_bridge must never be restarted
            for c in mock_restart.call_args_list:
                prog = c[0][0]
                self.assertNotIn("nautilus_trader", prog)
                self.assertNotIn("scan_bridge", prog)

    def test_critical_stopped_service_triggers_restart(self):
        """critical_data=True service that is STOPPED + status CRITICAL → restart fires."""
        states = {p: "RUNNING" for p in SERVICES}
        states["dolphin_data:exf_fetcher"] = "STOPPED"
        report = self._make_report("CRITICAL", states)
        with patch.object(self.svc, "_restart_via_supervisorctl") as mock_restart:
            self.svc._attempt_recovery(report)
            mock_restart.assert_called_once_with("dolphin_data:exf_fetcher")

    def test_dead_with_stopped_data_service_triggers_restart(self):
        states = {p: "RUNNING" for p in SERVICES}
        states["dolphin_data:obf_universe"] = "STOPPED"
        report = self._make_report("DEAD", states)
        with patch.object(self.svc, "_restart_via_supervisorctl") as mock_restart:
            self.svc._attempt_recovery(report)
            mock_restart.assert_called_once_with("dolphin_data:obf_universe")
class TestM4ControlPlane(unittest.TestCase):
    """Unit: port checks."""

    def setUp(self):
        self.svc = _fresh_svc()

    def _mock_socket(self, ports_up: set):
        """Return a patcher that replaces socket.socket with a fake whose
        connect_ex succeeds (0) only for ports in `ports_up`."""
        # FIX: removed dead code (`import socket as _sock; orig = _sock.socket`)
        # — the saved original socket class was never used.
        class FakeSocket:
            # Minimal context-manager socket; 1 mimics ECONNREFUSED-style failure.
            def __init__(self, *a, **kw): pass
            def __enter__(self): return self
            def __exit__(self, *a): pass
            def settimeout(self, t): pass
            def connect_ex(self, addr):
                return 0 if addr[1] in ports_up else 1
        return patch("socket.socket", FakeSocket)

    def test_both_up_scores_1(self):
        with self._mock_socket({5701, 4200}):
            score = self.svc._m4_control_plane()
        self.assertAlmostEqual(score, 1.0, places=2)

    def test_hz_up_prefect_down_scores_high(self):
        with self._mock_socket({5701}):
            score = self.svc._m4_control_plane()
        # HZ weight=0.8, Prefect weight=0.2 → 0.8
        self.assertAlmostEqual(score, 0.8, places=2)

    def test_both_down_scores_zero(self):
        with self._mock_socket(set()):
            score = self.svc._m4_control_plane()
        self.assertAlmostEqual(score, 0.0, places=2)

    def test_hz_down_prefect_up_scores_point2(self):
        with self._mock_socket({4200}):
            score = self.svc._m4_control_plane()
        self.assertAlmostEqual(score, 0.2, places=2)
class TestM5Coherence(unittest.TestCase):
    """Unit: coherence checks on HZ data."""

    def setUp(self):
        self.svc = _fresh_svc()

    def _mock_features(self, exf=None, acb=None, uni=None):
        # Fake HZ client whose features map serves the three coherence keys.
        # NOTE(review): `if exf` etc. are truthiness tests, so an empty dict
        # would be served as None; all callers pass non-empty dicts or None.
        mock_hz = MagicMock()
        features = MagicMock()
        def _get(k):
            if k == "exf_latest":
                return json.dumps(exf) if exf else None
            if k == "acb_boost":
                return json.dumps(acb) if acb else None
            if k == "obf_universe_latest":
                return json.dumps(uni) if uni else None
            return None
        features.get.side_effect = _get
        mock_hz.get_map.return_value.blocking.return_value = features
        return mock_hz

    def test_all_coherent_scores_1(self):
        exf = {"_acb_ready": True, "_ok_count": 30}
        acb = {"boost": 1.35}
        uni = {"_n_assets": 500}
        with patch.object(self.svc, "_get_hz",
                          return_value=self._mock_features(exf, acb, uni)):
            score = self.svc._m5_coherence()
        self.assertAlmostEqual(score, 1.0, places=2)

    def test_acb_not_ready_scores_half(self):
        exf = {"_acb_ready": False, "_ok_count": 30}
        acb = {"boost": 1.35}
        uni = {"_n_assets": 500}
        with patch.object(self.svc, "_get_hz",
                          return_value=self._mock_features(exf, acb, uni)):
            score = self.svc._m5_coherence()
        self.assertLess(score, 1.0)

    def test_boost_out_of_range_scores_low(self):
        exf = {"_acb_ready": True, "_ok_count": 30}
        acb = {"boost": 0.5}  # invalid: must be [1.0, 2.5]
        uni = {"_n_assets": 500}
        with patch.object(self.svc, "_get_hz",
                          return_value=self._mock_features(exf, acb, uni)):
            score = self.svc._m5_coherence()
        self.assertLess(score, 1.0)

    def test_boost_exactly_1_is_valid(self):
        exf = {"_acb_ready": True, "_ok_count": 30}
        acb = {"boost": 1.0}
        uni = {"_n_assets": 500}
        with patch.object(self.svc, "_get_hz",
                          return_value=self._mock_features(exf, acb, uni)):
            score = self.svc._m5_coherence()
        # boost=1.0 is valid → acb check passes
        self.assertGreater(score, 0.5)

    def test_obf_universe_below_200_scores_half(self):
        exf = {"_acb_ready": True, "_ok_count": 30}
        acb = {"boost": 1.35}
        uni = {"_n_assets": 50}  # too few assets
        with patch.object(self.svc, "_get_hz",
                          return_value=self._mock_features(exf, acb, uni)):
            score = self.svc._m5_coherence()
        # obf check gives 0.5, others give 1.0 → avg < 1.0
        self.assertLess(score, 1.0)

    def test_no_hz_scores_zero(self):
        with patch.object(self.svc, "_get_hz", return_value=None):
            score = self.svc._m5_coherence()
        self.assertEqual(score, 0.0)

    def test_exf_missing_scores_low(self):
        with patch.object(self.svc, "_get_hz",
                          return_value=self._mock_features(None, {"boost": 1.3}, {"_n_assets": 500})):
            score = self.svc._m5_coherence()
        self.assertLess(score, 1.0)
# ─────────────────────────────────────────────────────────────────────────────
# Integration tests (require live HZ + supervisord)
# ─────────────────────────────────────────────────────────────────────────────
@unittest.skipUnless(_hz_available(), "HZ not reachable")
@unittest.skipUnless(_supervisord_running(), "supervisord not running")
class TestLiveIntegration(unittest.TestCase):
    """Integration: real HZ and supervisord."""

    def setUp(self):
        self.svc = _fresh_svc()

    def test_hz_connects(self):
        hz = self.svc._get_hz()
        self.assertIsNotNone(hz, "HZ client should connect")

    def test_m4_control_plane_live_is_1(self):
        score = self.svc._m4_control_plane()
        self.assertGreaterEqual(score, 0.8,
                                f"M4={score:.2f}: HZ or Prefect port down")

    def test_all_data_services_running(self):
        sv = self.svc._check_supervisord_status()
        for prog, cfg in SERVICES.items():
            if cfg["critical_data"]:
                state = sv.get(prog, "UNKNOWN")
                self.assertEqual(state, "RUNNING",
                                 f"Critical data service {prog} is {state}")

    def test_all_hz_keys_present(self):
        _, results = self.svc._m3_data_freshness()
        for key, info in results.items():
            self.assertNotEqual(info.get("status"), "missing",
                                f"HZ key {key} is missing")

    def test_exf_latest_is_fresh(self):
        _, results = self.svc._m3_data_freshness()
        exf = results.get("exf_latest", {})
        age = exf.get("age_s", 9999)
        self.assertLess(age, DATA_DEAD_S, f"exf_latest is dead ({age:.0f}s old)")

    def test_obf_universe_has_many_assets(self):
        # FIX: removed an unused `score = self.svc._m5_coherence()` call —
        # its result was never asserted here (coverage lives in the next test).
        _, results = self.svc._m3_data_freshness()
        # If obf_universe is present, its freshness score must be non-zero.
        obf = results.get("obf_universe", {})
        if obf.get("status") != "missing":
            self.assertNotEqual(obf.get("score", 0.0), 0.0)

    def test_acb_boost_is_plausible(self):
        score = self.svc._m5_coherence()
        self.assertGreater(score, 0.0, "M5 coherence is 0 — check HZ data integrity")

    def test_full_health_check_is_green(self):
        # One full sensor sweep against the live system.
        m4 = self.svc._m4_control_plane()
        m1_data, m1_trader, svc_s = self.svc._m1_process_integrity()
        m3, _ = self.svc._m3_data_freshness()
        m5 = self.svc._m5_coherence()
        rm, status = self.svc._compute_rm_meta(m4, m1_data, m3, m5)
        self.assertIn(status, ("GREEN", "DEGRADED"),
                      f"System is {status} (rm={rm:.3f}). Sensors: "
                      f"M4={m4:.2f} M1={m1_data:.2f} M3={m3:.2f} M5={m5:.2f}")
        self.assertGreater(rm, 0.6, f"rm_meta={rm:.3f} too low for production")

    def test_status_json_written(self):
        # FIX: merged the two separate local imports of meta_health_service_v3
        # names into one statement.
        from meta_health_service_v3 import STATUS_JSON, HealthReport
        # Run one cycle
        m4 = self.svc._m4_control_plane()
        m1_data, m1_trader, svc_s = self.svc._m1_process_integrity()
        m2 = self.svc._m2_heartbeat_freshness()
        m3, hz_keys = self.svc._m3_data_freshness()
        m5 = self.svc._m5_coherence()
        rm, status = self.svc._compute_rm_meta(m4, m1_data, m3, m5)
        report = HealthReport(
            rm_meta=rm, status=status,
            m4_control_plane=m4, m1_data_infra=m1_data,
            m1_trader=m1_trader, m2_heartbeat=m2,
            m3_data_freshness=m3, m5_coherence=m5,
            service_status=svc_s, hz_key_status=hz_keys,
            timestamp=_now_iso(),
        )
        self.svc._emit(report)
        self.assertTrue(STATUS_JSON.exists(), "Status JSON was not written")
        # Retry read: the live MHS daemon writes to the same file concurrently
        # and may leave it momentarily empty between truncate and write.
        raw = ""
        for _ in range(5):
            raw = STATUS_JSON.read_text().strip()
            if raw:
                break
            time.sleep(0.2)
        self.assertTrue(raw, "Status JSON is empty after retries")
        data = json.loads(raw)
        self.assertIn("rm_meta", data)
        self.assertIn("status", data)
        self.assertIn("service_status", data)

    def test_mhs_meta_health_hz_key_written(self):
        """MHS should push its own report to DOLPHIN_META_HEALTH."""
        hz = self.svc._get_hz()
        if hz is None:
            self.skipTest("HZ not connected")
        meta_map = hz.get_map("DOLPHIN_META_HEALTH").blocking()
        raw = meta_map.get("latest")
        # May not exist yet if MHS only just started
        if raw:
            data = json.loads(raw)
            self.assertIn("rm_meta", data)
            self.assertIn("status", data)
# ─────────────────────────────────────────────────────────────────────────────
# E2E kill and revive tests
# ─────────────────────────────────────────────────────────────────────────────
@unittest.skipUnless(_hz_available(), "HZ not reachable")
@unittest.skipUnless(_supervisord_running(), "supervisord not running")
class TestKillAndRevive(unittest.TestCase):
"""
E2E: Stop critical data services via supervisorctl, verify MHS:
1. Detects STOPPED within one check cycle (15s)
2. Calls supervisorctl restart (via real supervisorctl or mock)
3. Service comes back RUNNING within RECOVERY_COOLDOWN_CRITICAL_S (10s) + startup
Note: rm_meta may stay GREEN if only one-of-three data services is stopped
(weighted sum design). Detection of STOPPED state is the key assertion.
Uses supervisorctl stop (not kill -9) so supervisord autorestart does NOT
fire MHS is the sole recovery mechanism in this scenario.
"""
DETECT_TIMEOUT_S = 20  # MHS must detect STOPPED within this many seconds
REVIVE_TIMEOUT_S = 30  # cooldown=10s + supervisord startsecs≤10s + margin

def _wait_for_state(self, prog: str, target: str, timeout: float) -> bool:
    """Poll supervisord (every 2s) until `prog` reports `target`; True on success."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if _get_supervisord_state(prog) == target:
            return True
        time.sleep(2)
    return False
def _run_one_check(self, svc: MetaHealthServiceV3):
    """Run one full sensor sweep; return (rm_meta, status, per-service states)."""
    control = svc._m4_control_plane()
    data_score, trader_score, states = svc._m1_process_integrity()
    freshness, _hz_keys = svc._m3_data_freshness()
    coherence = svc._m5_coherence()
    rm_value, rm_status = svc._compute_rm_meta(control, data_score, freshness, coherence)
    return rm_value, rm_status, states
def _kill_and_revive(self, prog: str, label: str):
    """Full kill → detect → restart → revive cycle for one critical data service."""
    svc = _fresh_svc()
    # ── Pre-condition: service must be RUNNING ────────────────────────────
    initial_state = _get_supervisord_state(prog)
    if initial_state != "RUNNING":
        self.skipTest(f"{label} not RUNNING (state={initial_state}) — skipping kill test")
    try:
        # ── Kill it ───────────────────────────────────────────────────────
        ok = _supervisorctl("stop", prog)
        self.assertTrue(ok, f"supervisorctl stop {prog} failed")
        stopped = self._wait_for_state(prog, "STOPPED", 10)
        self.assertTrue(stopped, f"{label} did not reach STOPPED state in 10s")
        # ── MHS detects STOPPED ──────────────────────────────────────────
        rm, status, svc_s = self._run_one_check(svc)
        self.assertEqual(svc_s.get(prog), "STOPPED",
                         f"MHS did not detect {label} as STOPPED (got {svc_s.get(prog)})")
        # Note: rm_meta may stay GREEN if only one data service is stopped
        # (weighted sum; GREEN threshold not breached at 2/3 data infra).
        # The key assertion is that MHS correctly sees the service as STOPPED.
        # ── MHS recovery fires ───────────────────────────────────────────
        restart_called = threading.Event()
        original_fn = svc._restart_via_supervisorctl
        def _spy_restart(p):
            # Flag when recovery targets our program, then delegate for real.
            if p == prog:
                restart_called.set()
            original_fn(p)  # call through to actually restart
        with patch.object(svc, "_restart_via_supervisorctl", side_effect=_spy_restart):
            # Feed a synthetic CRITICAL report so the recovery gate opens.
            svc._attempt_recovery(
                __import__("meta_health_service_v3").HealthReport(
                    rm_meta=rm, status="CRITICAL",
                    m4_control_plane=1.0, m1_data_infra=0.67,
                    m1_trader=1.0, m2_heartbeat=0.5,
                    m3_data_freshness=1.0, m5_coherence=1.0,
                    service_status=svc_s, hz_key_status={},
                    timestamp=_now_iso(),
                )
            )
            restart_called.wait(timeout=5)
        self.assertTrue(restart_called.is_set(),
                        f"MHS did not call restart for {label}")
        # ── Service comes back ───────────────────────────────────────────
        revived = self._wait_for_state(prog, "RUNNING", self.REVIVE_TIMEOUT_S)
        self.assertTrue(revived,
                        f"{label} did not revive within {self.REVIVE_TIMEOUT_S}s")
        # ── rm_meta recovers to GREEN ─────────────────────────────────────
        svc2 = _fresh_svc()
        time.sleep(5)  # allow service to settle and push to HZ
        rm2, status2, _ = self._run_one_check(svc2)
        self.assertIn(status2, ("GREEN", "DEGRADED"),
                      f"rm_meta did not recover after {label} revival "
                      f"(rm={rm2:.3f} [{status2}])")
    finally:
        # Always ensure service is running after test
        if _get_supervisord_state(prog) != "RUNNING":
            _supervisorctl("start", prog)
            self._wait_for_state(prog, "RUNNING", 30)
def test_kill_and_revive_exf_fetcher(self):
    # E2E: stop ExF via supervisorctl; MHS must detect and restart it.
    self._kill_and_revive("dolphin_data:exf_fetcher", "ExF Fetcher")

def test_kill_and_revive_acb_processor(self):
    # E2E: stop ACB via supervisorctl; MHS must detect and restart it.
    self._kill_and_revive("dolphin_data:acb_processor", "ACB Processor")

def test_kill_and_revive_obf_universe(self):
    # E2E: stop OBF via supervisorctl; MHS must detect and restart it.
    self._kill_and_revive("dolphin_data:obf_universe", "OBF Universe")
def test_detection_within_check_interval(self):
    """MHS must detect a stopped service within CHECK_INTERVAL_S seconds."""
    # Configuration assertion, not behavior: guards against a slow poll loop.
    self.assertLessEqual(CHECK_INTERVAL_S, 15.0,
                         f"CHECK_INTERVAL_S={CHECK_INTERVAL_S} too slow for detection")

def test_critical_revive_faster_than_10s(self):
    """Critical data services must have cooldown ≤ 10s."""
    # Configuration assertion on the recovery cooldown constant.
    self.assertLessEqual(RECOVERY_COOLDOWN_CRITICAL_S, 10.0,
                         "Critical service cooldown must be ≤ 10s")
def test_double_kill_resistance(self):
    """
    Kill the same service twice within the cooldown window.

    MHS cooldown must prevent a supervisorctl restart storm. We count
    actual subprocess.run("supervisorctl restart ...") calls — not
    method invocations — so the counter only increments when the
    cooldown gate allows the call through to the daemon thread.
    """
    prog = "dolphin_data:acb_processor"
    if _get_supervisord_state(prog) != "RUNNING":
        self.skipTest(f"{prog} not RUNNING")
    svc = _fresh_svc()
    # Mutable cell so the closure below can increment it.
    restart_subprocess_count = [0]
    _original_run = subprocess.run

    def _counting_run(args, **kwargs):
        # Count only genuine "supervisorctl ... restart" invocations,
        # then call through so the restart actually happens.
        if (isinstance(args, list)
                and "supervisorctl" in str(args)
                and "restart" in args):
            restart_subprocess_count[0] += 1
        return _original_run(args, **kwargs)

    try:
        _supervisorctl("stop", prog)
        time.sleep(3)
        import meta_health_service_v3 as _mod
        with patch.object(_mod.subprocess, "run", side_effect=_counting_run):
            svc._restart_via_supervisorctl(prog)  # fires daemon thread
            time.sleep(5)  # let thread complete
            svc._restart_via_supervisorctl(prog)  # must be blocked by cooldown
            time.sleep(2)
        self.assertEqual(
            restart_subprocess_count[0], 1,
            "Cooldown failed: supervisorctl restart invoked more than once "
            f"within the {RECOVERY_COOLDOWN_CRITICAL_S}s window",
        )
    finally:
        # Best-effort cleanup: wait for the MHS-triggered restart to land;
        # if it never did, start the service ourselves and wait again.
        self._wait_for_state(prog, "RUNNING", 30)
        if _get_supervisord_state(prog) != "RUNNING":
            _supervisorctl("start", prog)
        self._wait_for_state(prog, "RUNNING", 30)
def test_m3_drops_when_exf_stopped(self):
    """
    When ExF is stopped, exf_latest in HZ goes stale.
    M3 score must drop from 1.0 toward 0.0 within DATA_DEAD_S seconds.
    We wait up to DATA_STALE_S for a fresh baseline before starting,
    so prior kill/revive tests don't leave stale data that confuses the delta.
    """
    prog = "dolphin_data:exf_fetcher"
    if _get_supervisord_state(prog) != "RUNNING":
        self.skipTest(f"{prog} not RUNNING")
    # Wait for exf_latest to be fresh (age < DATA_STALE_S) before baseline
    deadline = time.time() + DATA_STALE_S + 15
    svc_pre = _fresh_svc()
    while time.time() < deadline:
        _, details = svc_pre._m3_data_freshness()
        exf_age = details.get("exf_latest", {}).get("age_s", 9999)
        if exf_age < DATA_STALE_S:
            break
        time.sleep(5)
    else:
        self.skipTest(
            f"exf_latest not fresh after {DATA_STALE_S+15}s wait "
            f"(age={exf_age:.0f}s) — prior tests may have left HZ stale"
        )
    m3_pre, _ = svc_pre._m3_data_freshness()
    # Pause meta_health so it doesn't heal exf before data goes stale
    mh_prog = "dolphin_data:meta_health"
    mh_was_running = _get_supervisord_state(mh_prog) == "RUNNING"
    if mh_was_running:
        _supervisorctl("stop", mh_prog)
    try:
        _supervisorctl("stop", prog)
        self._wait_for_state(prog, "STOPPED", 10)
        # Poll until exf_latest actually goes stale — don't rely on a fixed sleep
        # since exf may push one final batch during graceful shutdown (stopwaitsecs=15)
        svc_post = _fresh_svc()
        exf_post = {}
        stale_deadline = time.time() + DATA_STALE_S + 25  # generous: 55s max
        while time.time() < stale_deadline:
            m3_post, results = svc_post._m3_data_freshness()
            exf_post = results.get("exf_latest", {})
            if exf_post.get("score", 1.0) < 1.0:
                break
            time.sleep(3)
        # FIX: the failure message previously said DATA_STALE_S+5 although the
        # polling window above is DATA_STALE_S+25 — keep the two in sync.
        self.assertLessEqual(exf_post.get("score", 1.0), 0.5,
                             f"exf_latest score should be stale after {DATA_STALE_S+25}s "
                             f"without exf_fetcher: got {exf_post}")
        self.assertLess(m3_post, m3_pre,
                        f"M3 did not drop after killing exf_fetcher "
                        f"(pre={m3_pre:.2f}, post={m3_post:.2f})")
    finally:
        # Always restore exf_fetcher and (if we paused it) meta_health.
        _supervisorctl("start", prog)
        self._wait_for_state(prog, "RUNNING", 45)
        if mh_was_running:
            _supervisorctl("start", mh_prog)
def test_no_systemd_units_active_for_managed_services(self):
    """
    Verify no conflicting systemd units are active for supervisord-managed services.
    This was the root cause of the original 'random killer' bug.
    """
    # Units that historically double-managed the same processes.
    conflict_units = (
        "meta_health_daemon.service",
        "dolphin-nautilus-trader.service",
        "dolphin-scan-bridge.service",
        "dolphin-ng.service",
    )
    for unit in conflict_units:
        probe = subprocess.run(
            ["systemctl", "is-active", unit],
            capture_output=True,
            text=True,
        )
        self.assertNotEqual(
            probe.stdout.strip(),
            "active",
            f"Conflicting systemd unit {unit} is still active! "
            f"This will fight supervisord and kill services.",
        )
def test_no_duplicate_trader_processes(self):
    """
    Nautilus trader must run as exactly ONE process.

    Transient duplicates can appear briefly during supervisord autorestart
    races when adjacent kill/revive tests fire rapidly. We allow one retry
    after a 15s grace window — if duplicates persist that long, it's a real bug.
    """
    import psutil

    def _scan():
        # Collect all live (non-zombie) processes running the trader entrypoint.
        found = []
        for p in psutil.process_iter(["cmdline", "status"]):
            try:
                if (p.info["cmdline"]
                        and any("nautilus_event_trader" in a
                                for a in p.info["cmdline"])
                        and p.info["status"] not in ("zombie",)):
                    found.append(p)
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue
        return found

    time.sleep(8)  # let rapid-cooldown MHS restarts settle
    procs = _scan()
    if len(procs) > 1:
        # One retry — transient autorestart race should resolve within 15s
        time.sleep(15)
        procs = _scan()
    # FIX: the original message concatenated the PID list directly into the
    # next sentence ("[...]possible systemd...") — add the missing separator.
    self.assertLessEqual(len(procs), 1,
                         f"Trader running {len(procs)} times after 23s grace! PIDs: "
                         f"{[p.pid for p in procs]} — "
                         f"possible systemd/supervisord dual-management regression")
# ─────────────────────────────────────────────────────────────────────────────
# Regression / invariant tests
# ─────────────────────────────────────────────────────────────────────────────
class TestServiceRegistry(unittest.TestCase):
    """Sanity checks on the SERVICES registry itself."""

    def test_all_critical_data_services_present(self):
        # Every critical data service must be registered and flagged as such.
        required = (
            "dolphin_data:exf_fetcher",
            "dolphin_data:acb_processor",
            "dolphin_data:obf_universe",
        )
        for prog in required:
            self.assertIn(prog, SERVICES, f"{prog} missing from SERVICES registry")
            self.assertTrue(SERVICES[prog]["critical_data"],
                            f"{prog} should have critical_data=True")

    def test_trader_services_not_critical(self):
        # Trader-side programs must never be auto-restarted by MHS.
        for prog in ("dolphin:nautilus_trader", "dolphin:scan_bridge"):
            if prog not in SERVICES:
                continue
            self.assertFalse(SERVICES[prog]["critical_data"],
                             f"{prog} must NOT be critical_data (never auto-restart trader)")

    def test_hz_data_sources_match_expected_keys(self):
        wanted = {"exf_latest", "acb_boost", "latest_eigen_scan", "obf_universe"}
        self.assertEqual(set(HZ_DATA_SOURCES.keys()), wanted)

    def test_exf_latest_ts_field(self):
        # exf_latest carries its push time in the "_pushed_at" field.
        self.assertEqual(HZ_DATA_SOURCES["exf_latest"][2], "_pushed_at")

    def test_acb_boost_presence_only(self):
        self.assertIsNone(HZ_DATA_SOURCES["acb_boost"][2],
                          "acb_boost has no reliable timestamp — should be presence-only")

    def test_critical_cooldown_less_than_10s(self):
        self.assertLessEqual(RECOVERY_COOLDOWN_CRITICAL_S, 10.0)

    def test_check_interval_reasonable(self):
        # Only one bound can be violated at a time, so assertion order is moot.
        self.assertGreaterEqual(CHECK_INTERVAL_S, 2.0,
                                "Check interval too aggressive — excessive HZ load")
        self.assertLessEqual(CHECK_INTERVAL_S, 15.0,
                             "Check interval too slow — services may go undetected")
# ─────────────────────────────────────────────────────────────────────────────
# Race condition tests
# ─────────────────────────────────────────────────────────────────────────────
class TestRaceConditions(unittest.TestCase):
    """
    Multi-threaded stress tests that expose cooldown and recovery races.
    All tests use mocked subprocess so no real services are touched.

    (Cleanup: two dead bindings of the unmocked subprocess.run were removed —
    they were captured but never used.)
    """

    def setUp(self):
        self.svc = _fresh_svc()

    def test_concurrent_restarts_same_service_only_one_fires(self):
        """
        10 threads all call _restart_via_supervisorctl simultaneously.
        Cooldown must guarantee exactly ONE subprocess.run call.
        """
        prog = "dolphin_data:exf_fetcher"
        call_count = [0]  # mutable cell shared with the closure below
        barrier = threading.Barrier(10)
        import meta_health_service_v3 as _mod

        def _counting_run(args, **kwargs):
            if isinstance(args, list) and "restart" in args:
                call_count[0] += 1
            return MagicMock(stdout="ok", returncode=0)

        with patch.object(_mod.subprocess, "run", side_effect=_counting_run):
            threads = []
            for _ in range(10):
                def _worker():
                    barrier.wait()  # all start at the same instant
                    self.svc._restart_via_supervisorctl(prog)
                t = threading.Thread(target=_worker, daemon=True)
                threads.append(t)
                t.start()
            for t in threads:
                t.join(timeout=5)
            time.sleep(1)  # let daemon threads flush
        self.assertEqual(call_count[0], 1,
                         f"Expected 1 restart, got {call_count[0]} — cooldown not thread-safe")

    def test_concurrent_restarts_different_services_all_fire(self):
        """
        Each service has its own cooldown bucket.
        3 services called simultaneously → 3 restarts.
        """
        progs = [
            "dolphin_data:exf_fetcher",
            "dolphin_data:acb_processor",
            "dolphin_data:obf_universe",
        ]
        fired = set()
        lock = threading.Lock()
        import meta_health_service_v3 as _mod

        def _counting_run(args, **kwargs):
            if isinstance(args, list) and "restart" in args:
                # The program name is the last element of the supervisorctl cmd
                with lock:
                    fired.add(args[-1])
            return MagicMock(stdout="ok", returncode=0)

        with patch.object(_mod.subprocess, "run", side_effect=_counting_run):
            threads = [
                threading.Thread(
                    target=self.svc._restart_via_supervisorctl,
                    args=(p,), daemon=True
                )
                for p in progs
            ]
            for t in threads:
                t.start()
            for t in threads:
                t.join(timeout=5)
            time.sleep(1)
        self.assertEqual(fired, set(progs),
                         f"Not all services fired a restart: fired={fired}")

    def test_cooldown_timestamp_set_before_thread_completes(self):
        """
        The cooldown timestamp must be committed BEFORE the daemon thread
        runs, so a second call arriving while the first thread is still
        executing is also blocked.
        """
        prog = "dolphin_data:exf_fetcher"
        slow_start = threading.Event()
        import meta_health_service_v3 as _mod

        def _slow_run(args, **kwargs):
            slow_start.set()
            time.sleep(2)  # simulate slow supervisorctl
            return MagicMock(stdout="ok", returncode=0)

        with patch.object(_mod.subprocess, "run", side_effect=_slow_run):
            self.svc._restart_via_supervisorctl(prog)  # fires async thread
            slow_start.wait(timeout=3)  # thread is in subprocess.run
            # Timestamp must already be set even though thread hasn't finished
            self.assertIn(prog, self.svc._recovery_timestamps,
                          "Timestamp not set before thread completed")
            call_count = [0]

            def _count(args, **kwargs):
                call_count[0] += 1
                return MagicMock(returncode=0)

            # Second call while thread still running — must be blocked
            with patch.object(_mod.subprocess, "run", side_effect=_count):
                self.svc._restart_via_supervisorctl(prog)
                time.sleep(0.5)
            self.assertEqual(call_count[0], 0,
                             "Second restart fired while first thread still running")

    def test_check_loop_not_blocked_by_slow_supervisorctl(self):
        """
        _restart_via_supervisorctl must return immediately even if
        supervisorctl hangs for 30s. The check loop (CHECK_INTERVAL_S=10s)
        must not be blocked.
        """
        import meta_health_service_v3 as _mod
        prog = "dolphin_data:acb_processor"
        thread_started = threading.Event()

        def _hanging_run(args, **kwargs):
            thread_started.set()
            time.sleep(30)  # simulate completely hung supervisorctl
            return MagicMock(returncode=0)

        with patch.object(_mod.subprocess, "run", side_effect=_hanging_run):
            t0 = time.time()
            self.svc._restart_via_supervisorctl(prog)
            elapsed = time.time() - t0
            self.assertLess(elapsed, 1.0,
                            f"_restart_via_supervisorctl blocked for {elapsed:.2f}s — must be async")
            thread_started.wait(timeout=3)  # daemon thread did start

    def test_recovery_timestamps_dict_survives_concurrent_writes(self):
        """
        Concurrent writes to _recovery_timestamps from multiple threads
        must not corrupt the dict (no KeyError, no lost entries).
        """
        progs = list(SERVICES.keys())
        import meta_health_service_v3 as _mod

        def _noop_run(args, **kwargs):
            return MagicMock(returncode=0)

        errors = []

        def _worker(p):
            try:
                with patch.object(_mod.subprocess, "run", side_effect=_noop_run):
                    for _ in range(20):
                        # Drop any prior timestamp so every call is "allowed"
                        self.svc._recovery_timestamps.pop(p, None)
                        self.svc._restart_via_supervisorctl(p)
                        time.sleep(0.001)
            except Exception as e:
                errors.append(e)

        threads = [threading.Thread(target=_worker, args=(p,), daemon=True) for p in progs]
        for t in threads:
            t.start()
        for t in threads:
            t.join(timeout=10)
        time.sleep(0.5)
        self.assertEqual(errors, [], f"Concurrent dict access raised: {errors}")
# ─────────────────────────────────────────────────────────────────────────────
# Edge case tests
# ─────────────────────────────────────────────────────────────────────────────
class TestEdgeCases(unittest.TestCase):
    """Boundary values, malformed data, and unusual-but-valid states."""

    def setUp(self):
        self.svc = _fresh_svc()

    # ── _compute_rm_meta edge cases ──────────────────────────────────────────
    def test_rm_meta_exactly_at_green_threshold(self):
        # All sensors perfect → rm_meta must be exactly 1.0 and GREEN.
        rm, status = self.svc._compute_rm_meta(
            m4=1.0, m1_data=1.0, m3=1.0, m5=1.0
        )
        self.assertEqual(status, "GREEN")
        self.assertAlmostEqual(rm, 1.0, places=6)

    def test_rm_meta_exactly_at_degraded_boundary(self):
        # GREEN iff rm >= 0.85. With m1_data=0 the best achievable is
        # 0.35*m4 + 0.20*m3 + 0.10*m5 = 0.35 + 0.20 + 0.10 = 0.65 → not GREEN.
        rm, status = self.svc._compute_rm_meta(
            m4=1.0, m1_data=0.0, m3=1.0, m5=1.0
        )
        self.assertIn(status, ("CRITICAL", "DEGRADED"),
                      f"rm={rm:.3f} with m1_data=0 should not be GREEN")

    def test_rm_meta_all_sensors_zero(self):
        rm, status = self.svc._compute_rm_meta(
            m4=0.0, m1_data=0.0, m3=0.0, m5=0.0
        )
        self.assertAlmostEqual(rm, 0.0, places=6)
        self.assertEqual(status, "DEAD")

    def test_rm_meta_nan_sensor_does_not_propagate(self):
        """NaN in one sensor must not corrupt rm_meta (treat as 0)."""
        import math
        # Manually build the weighted sum as MHS does, but inject nan
        sensors = {"m4_control_plane": float("nan"), "m1_data_infra": 1.0,
                   "m3_data_freshness": 1.0, "m5_coherence": 1.0}
        rm = sum(
            (v if not math.isnan(v) else 0.0) * SENSOR_WEIGHTS[k]
            for k, v in sensors.items()
        )
        self.assertFalse(math.isnan(rm), "NaN propagated into rm_meta")
        self.assertGreater(rm, 0.0)

    def test_rm_meta_clamped_to_0_1(self):
        """rm_meta result should never exceed 1.0 or go below 0.0."""
        rm, _ = self.svc._compute_rm_meta(1.0, 1.0, 1.0, 1.0)
        self.assertLessEqual(rm, 1.0)
        self.assertGreaterEqual(rm, 0.0)

    # ── M3 freshness edge cases ──────────────────────────────────────────────
    def test_m3_timestamp_just_past_stale_boundary(self):
        """A key 1s past DATA_STALE_S should score 0.5 (stale, not fresh).
        The boundary is strict (>), so exactly DATA_STALE_S still scores 1.0."""
        svc = _fresh_svc()
        hz_mock = MagicMock()
        stale_ts = datetime.now(timezone.utc).timestamp() - (DATA_STALE_S + 1.0)
        hz_mock.get_map.return_value.blocking.return_value.get.return_value = (
            json.dumps({"_pushed_at": stale_ts})
        )
        svc._hz_client = hz_mock
        m3, details = svc._m3_data_freshness()
        exf_score = details.get("exf_latest", {}).get("score", 1.0)
        self.assertLess(exf_score, 1.0,
                        f"Score 1s past stale boundary should be < 1.0 (got {exf_score})")
        self.assertAlmostEqual(exf_score, 0.5, places=5,
                               msg=f"Stale key should score 0.5 (got {exf_score})")

    def test_m3_timestamp_in_the_future(self):
        """A timestamp 5s in the future (clock skew) must not crash and score > 0."""
        svc = _fresh_svc()
        hz_mock = MagicMock()
        future_ts = datetime.now(timezone.utc).timestamp() + 5.0
        hz_mock.get_map.return_value.blocking.return_value.get.return_value = (
            json.dumps({"_pushed_at": future_ts})
        )
        svc._hz_client = hz_mock
        m3, _ = svc._m3_data_freshness()
        self.assertGreater(m3, 0.0, "Future timestamp should not score 0")

    def test_m3_timestamp_iso_string_with_tz(self):
        # ISO-8601 "timestamp" fields must be accepted alongside epoch floats.
        svc = _fresh_svc()
        hz_mock = MagicMock()
        iso = datetime.now(timezone.utc).isoformat()
        hz_mock.get_map.return_value.blocking.return_value.get.return_value = (
            json.dumps({"timestamp": iso})
        )
        svc._hz_client = hz_mock
        m3, _ = svc._m3_data_freshness()
        self.assertGreater(m3, 0.5)

    def test_m3_garbage_json_in_hz_scores_zero_not_crash(self):
        # Corrupted HZ payloads must degrade the score, never raise.
        svc = _fresh_svc()
        hz_mock = MagicMock()
        hz_mock.get_map.return_value.blocking.return_value.get.return_value = (
            "NOT_VALID_JSON{{{"
        )
        svc._hz_client = hz_mock
        try:
            m3, _ = svc._m3_data_freshness()
            # Should not raise; score for corrupted key should be 0 or 0.5
            self.assertGreaterEqual(m3, 0.0)
        except Exception as exc:
            self.fail(f"_m3_data_freshness crashed on garbage JSON: {exc}")

    def test_m3_empty_dict_in_hz_scores_low(self):
        svc = _fresh_svc()
        hz_mock = MagicMock()
        hz_mock.get_map.return_value.blocking.return_value.get.return_value = "{}"
        svc._hz_client = hz_mock
        m3, details = svc._m3_data_freshness()
        # Missing ts field → presence-only logic kicks in
        self.assertGreaterEqual(m3, 0.0)

    # ── _attempt_recovery edge cases ─────────────────────────────────────────
    def test_attempt_recovery_ignores_non_critical_stopped(self):
        """trader/scan_bridge stopped must NOT trigger a restart."""
        import meta_health_service_v3 as _mod
        svc = _fresh_svc()
        call_log = []

        def _spy(p):
            call_log.append(p)

        with patch.object(svc, "_restart_via_supervisorctl", side_effect=_spy):
            svc._attempt_recovery(
                _mod.HealthReport(
                    rm_meta=0.1, status="DEAD",
                    m4_control_plane=0.0, m1_data_infra=0.0,
                    m1_trader=0.0, m2_heartbeat=0.0,
                    m3_data_freshness=0.0, m5_coherence=0.0,
                    service_status={
                        "dolphin:nautilus_trader": "STOPPED",
                        "dolphin:scan_bridge": "STOPPED",
                        # critical services still running
                        "dolphin_data:exf_fetcher": "RUNNING",
                        "dolphin_data:acb_processor": "RUNNING",
                        "dolphin_data:obf_universe": "RUNNING",
                    },
                    hz_key_status={}, timestamp=_now_iso(),
                )
            )
        self.assertEqual(call_log, [],
                         f"Non-critical stopped services triggered restart: {call_log}")

    def test_attempt_recovery_noop_when_degraded(self):
        """DEGRADED status must never trigger any restart."""
        import meta_health_service_v3 as _mod
        svc = _fresh_svc()
        call_log = []

        def _spy(p):
            call_log.append(p)

        with patch.object(svc, "_restart_via_supervisorctl", side_effect=_spy):
            svc._attempt_recovery(
                _mod.HealthReport(
                    rm_meta=0.70, status="DEGRADED",
                    m4_control_plane=0.8, m1_data_infra=0.67,
                    m1_trader=1.0, m2_heartbeat=1.0,
                    m3_data_freshness=0.9, m5_coherence=0.8,
                    service_status={p: "RUNNING" for p in SERVICES},
                    hz_key_status={}, timestamp=_now_iso(),
                )
            )
        self.assertEqual(call_log, [],
                         f"DEGRADED status should never trigger restart: {call_log}")

    def test_supervisorctl_parse_extra_whitespace(self):
        """Lines with extra spaces/tabs must parse correctly."""
        svc = _fresh_svc()
        output = (
            "dolphin_data:exf_fetcher RUNNING pid 12345, uptime 0:01:00\n"
            "dolphin_data:acb_processor\t\tSTOPPED\n"
            "dolphin:nautilus_trader FATAL Exited too quickly\n"
        )
        with patch("subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                stdout=output, returncode=0
            )
            result = svc._check_supervisord_status()
        self.assertEqual(result.get("dolphin_data:exf_fetcher"), "RUNNING")
        self.assertEqual(result.get("dolphin_data:acb_processor"), "STOPPED")
        self.assertEqual(result.get("dolphin:nautilus_trader"), "FATAL")

    def test_supervisorctl_timeout_returns_empty_not_crash(self):
        import subprocess as sp
        svc = _fresh_svc()
        with patch("subprocess.run", side_effect=sp.TimeoutExpired(cmd=[], timeout=5)):
            result = svc._check_supervisord_status()
        self.assertEqual(result, {}, "TimeoutExpired must return empty dict")

    def test_cooldown_very_old_timestamp_allows_restart(self):
        """A timestamp from a week ago should not block recovery."""
        svc = _fresh_svc()
        prog = "dolphin_data:exf_fetcher"
        svc._recovery_timestamps[prog] = time.time() - 7 * 86400  # one week ago
        import meta_health_service_v3 as _mod
        fired = [False]

        def _noop(args, **kwargs):
            fired[0] = True
            return MagicMock(returncode=0)

        with patch.object(_mod.subprocess, "run", side_effect=_noop):
            svc._restart_via_supervisorctl(prog)
            time.sleep(0.5)
        self.assertTrue(fired[0], "Old timestamp should not block restart")

    def test_recovery_timestamps_initially_empty(self):
        svc = _fresh_svc()
        self.assertEqual(svc._recovery_timestamps, {},
                         "Fresh MHS should have no prior cooldown timestamps")

    def test_m4_hz_port_down_scores_at_most_0_2(self):
        """If HZ port is down, m4 ≤ 0.2 (only Prefect weight remains)."""
        svc = _fresh_svc()
        import socket as _sock
        # FIX: removed an unused `_always_refused` helper — the patch below
        # models the refused connection via connect_ex returning non-zero.
        with patch.object(_sock.socket, "connect_ex", return_value=1):
            m4 = svc._m4_control_plane()
        self.assertLessEqual(m4, 0.2 + 1e-6,
                             f"HZ down should give m4 ≤ 0.2, got {m4:.3f}")
# ─────────────────────────────────────────────────────────────────────────────
# Hypothesis-based property tests
# ─────────────────────────────────────────────────────────────────────────────
from hypothesis import given, assume, settings as hyp_settings, HealthCheck
from hypothesis import strategies as st

# Strategy for one sensor reading: any finite float in [0.0, 1.0].
_sensor_st = st.floats(min_value=0.0, max_value=1.0, allow_nan=False, allow_infinity=False)
class TestHypothesisProperties(unittest.TestCase):
    """
    Property-based tests using Hypothesis.
    Verify invariants that must hold for ALL valid sensor combinations.
    """

    def _svc(self):
        # Fresh instance per example — no state bleeds between Hypothesis runs.
        return _fresh_svc()

    @given(m4=_sensor_st, m1=_sensor_st, m3=_sensor_st, m5=_sensor_st)
    @hyp_settings(max_examples=500, suppress_health_check=[HealthCheck.too_slow])
    def test_rm_meta_always_in_0_1(self, m4, m1, m3, m5):
        """rm_meta must always be in [0, 1] for any valid sensor inputs."""
        rm, _ = self._svc()._compute_rm_meta(m4, m1, m3, m5)
        self.assertGreaterEqual(rm, 0.0 - 1e-9)
        self.assertLessEqual(rm, 1.0 + 1e-9)

    @given(m4=_sensor_st, m1=_sensor_st, m3=_sensor_st, m5=_sensor_st)
    @hyp_settings(max_examples=500, suppress_health_check=[HealthCheck.too_slow])
    def test_status_always_valid_string(self, m4, m1, m3, m5):
        """Status label must always be one of the four valid strings."""
        _, status = self._svc()._compute_rm_meta(m4, m1, m3, m5)
        self.assertIn(status, {"GREEN", "DEGRADED", "CRITICAL", "DEAD"})

    @given(m4=_sensor_st, m1=_sensor_st, m3=_sensor_st, m5=_sensor_st)
    @hyp_settings(max_examples=300, suppress_health_check=[HealthCheck.too_slow])
    def test_rm_meta_monotone_in_each_sensor(self, m4, m1, m3, m5):
        """Increasing any sensor by 0.1 must not decrease rm_meta."""
        svc = self._svc()
        rm_base, _ = svc._compute_rm_meta(m4, m1, m3, m5)
        delta = 0.1
        rm_m4, _ = svc._compute_rm_meta(min(m4 + delta, 1.0), m1, m3, m5)
        rm_m1, _ = svc._compute_rm_meta(m4, min(m1 + delta, 1.0), m3, m5)
        rm_m3, _ = svc._compute_rm_meta(m4, m1, min(m3 + delta, 1.0), m5)
        rm_m5, _ = svc._compute_rm_meta(m4, m1, m3, min(m5 + delta, 1.0))
        self.assertGreaterEqual(rm_m4, rm_base - 1e-9)
        self.assertGreaterEqual(rm_m1, rm_base - 1e-9)
        self.assertGreaterEqual(rm_m3, rm_base - 1e-9)
        self.assertGreaterEqual(rm_m5, rm_base - 1e-9)

    @given(m4=_sensor_st, m1=_sensor_st, m3=_sensor_st, m5=_sensor_st)
    @hyp_settings(max_examples=300, suppress_health_check=[HealthCheck.too_slow])
    def test_green_requires_both_heavy_sensors_high(self, m4, m1, m3, m5):
        """
        If BOTH of the two highest-weighted sensors (m4=0.35, m1=0.35) are
        below 0.5, the system cannot be GREEN.
        Math: max rm with m4<0.5, m1<0.5 is
        0.35*0.499 + 0.35*0.499 + 0.20*1.0 + 0.10*1.0 ≈ 0.649 < 0.85 (GREEN)
        So NOT GREEN is guaranteed.
        Note: one low sensor alone (e.g. m3=0.375, m4=m1=m5=1.0) can still be
        GREEN since the other sensors compensate — that is a correct system behaviour.
        """
        assume(m4 < 0.5 and m1 < 0.5)
        _, status = self._svc()._compute_rm_meta(m4, m1, m3, m5)
        self.assertNotEqual(status, "GREEN",
                            f"GREEN with both heavy sensors low: m4={m4:.3f} m1={m1:.3f}")

    @given(age_s=st.floats(min_value=0.0, max_value=300.0, allow_nan=False))
    @hyp_settings(max_examples=300, suppress_health_check=[HealthCheck.too_slow],
                  deadline=None)
    def test_freshness_score_decreases_with_age(self, age_s):
        """
        Freshness score must be non-increasing as age increases.
        Specifically: age < DATA_STALE_S → score=1.0,
        DATA_STALE_S < age < DATA_DEAD_S → score=0.5,
        age > DATA_DEAD_S → score=0.0.
        We exclude a ±1s margin around each boundary because the small gap
        between computing `ts` and the `time.time()` call inside the method
        creates timing jitter — values exactly at the boundary are unreliable.
        """
        # Skip the ±1s zone around each threshold to avoid timing jitter
        assume(abs(age_s - DATA_STALE_S) > 1.0)
        assume(abs(age_s - DATA_DEAD_S) > 1.0)
        svc = _fresh_svc()
        hz = MagicMock()
        now_ts = datetime.now(timezone.utc).timestamp()
        ts = now_ts - age_s
        hz.get_map.return_value.blocking.return_value.get.return_value = (
            json.dumps({"_pushed_at": ts})
        )
        svc._hz_client = hz
        _, details = svc._m3_data_freshness()
        score = details.get("exf_latest", {}).get("score", 0.0)
        # Boundary is strict: age > DATA_STALE_S → stale, age > DATA_DEAD_S → dead
        if age_s < DATA_STALE_S:
            self.assertAlmostEqual(score, 1.0, places=5,
                                   msg=f"Fresh key (age={age_s:.1f}s) should score 1.0, got {score}")
        elif age_s > DATA_DEAD_S:
            self.assertAlmostEqual(score, 0.0, places=5,
                                   msg=f"Dead key (age={age_s:.1f}s) should score 0.0, got {score}")
        else:
            self.assertGreater(score, 0.0,
                               msg=f"Stale key (age={age_s:.1f}s) should score > 0, got {score}")
            self.assertLess(score, 1.0,
                            msg=f"Stale key (age={age_s:.1f}s) should score < 1.0, got {score}")

    @given(
        statuses=st.dictionaries(
            st.sampled_from(list(SERVICES.keys())),
            st.sampled_from(["RUNNING", "STOPPED", "FATAL", "STARTING"]),
            min_size=1,
        )
    )
    @hyp_settings(max_examples=300, suppress_health_check=[HealthCheck.too_slow])
    def test_m1_score_always_in_0_1(self, statuses):
        """M1 scores must always be in [0, 1] regardless of service states."""
        svc = _fresh_svc()
        with patch.object(svc, "_check_supervisord_status", return_value=statuses):
            m1_data, m1_trader, _ = svc._m1_process_integrity()
        self.assertGreaterEqual(m1_data, 0.0)
        self.assertLessEqual(m1_data, 1.0)
        self.assertGreaterEqual(m1_trader, 0.0)
        self.assertLessEqual(m1_trader, 1.0)

    @given(
        statuses=st.dictionaries(
            st.sampled_from(list(SERVICES.keys())),
            st.just("RUNNING"),
            min_size=len(SERVICES),
            max_size=len(SERVICES),
        )
    )
    @hyp_settings(max_examples=50, suppress_health_check=[HealthCheck.too_slow])
    def test_all_running_always_scores_1(self, statuses):
        """All services RUNNING must always give m1_data=1.0 and m1_trader=1.0."""
        svc = _fresh_svc()
        with patch.object(svc, "_check_supervisord_status", return_value=statuses):
            m1_data, m1_trader, _ = svc._m1_process_integrity()
        self.assertAlmostEqual(m1_data, 1.0, places=6)
        self.assertAlmostEqual(m1_trader, 1.0, places=6)

    @given(n_healthy=st.integers(min_value=0, max_value=540))
    @hyp_settings(max_examples=200, suppress_health_check=[HealthCheck.too_slow])
    def test_m5_obf_score_monotone_in_healthy_count(self, n_healthy):
        """
        M5 OBF coverage score must be non-decreasing with n_healthy_assets.
        """
        svc_lo = _fresh_svc()
        svc_hi = _fresh_svc()
        n_hi = min(n_healthy + 10, 540)

        def _make_hz_return(n):
            return json.dumps({
                "exf_latest": {"feature_count": 50},
                "acb_boost": {"boost": 1.05},
                "obf_universe_latest": {"n_healthy": n},
            })

        # FIX: removed an unused top-level `hz = MagicMock()` — only the
        # per-instance hz_lo / hz_hi mocks below are actually wired in.
        hz_lo = MagicMock()
        hz_lo.get_map.return_value.blocking.return_value.get.side_effect = (
            lambda k: _make_hz_return(n_healthy) if k == "obf_universe_latest"
            else json.dumps({"boost": 1.05, "feature_count": 50})
        )
        hz_hi = MagicMock()
        hz_hi.get_map.return_value.blocking.return_value.get.side_effect = (
            lambda k: _make_hz_return(n_hi) if k == "obf_universe_latest"
            else json.dumps({"boost": 1.05, "feature_count": 50})
        )
        svc_lo._hz_client = hz_lo
        svc_hi._hz_client = hz_hi
        m5_lo = svc_lo._m5_coherence()
        m5_hi = svc_hi._m5_coherence()
        self.assertLessEqual(m5_lo, m5_hi + 1e-9,
                             f"m5 should not decrease when n_healthy goes {n_healthy}→{n_hi}: "
                             f"m5_lo={m5_lo:.3f} m5_hi={m5_hi:.3f}")

    @given(
        cooldown=st.floats(min_value=0.1, max_value=RECOVERY_COOLDOWN_CRITICAL_S * 0.99),
    )
    @hyp_settings(max_examples=100, suppress_health_check=[HealthCheck.too_slow], deadline=None)
    def test_restart_blocked_within_cooldown_window(self, cooldown):
        """
        Any timestamp within the cooldown window must block a second restart.
        """
        import meta_health_service_v3 as _mod
        svc = _fresh_svc()
        prog = "dolphin_data:exf_fetcher"
        # Set timestamp so it's `cooldown` seconds in the past (within window)
        svc._recovery_timestamps[prog] = time.time() - cooldown
        fired = [False]

        def _noop(args, **kwargs):
            fired[0] = True
            return MagicMock(returncode=0)

        with patch.object(_mod.subprocess, "run", side_effect=_noop):
            svc._restart_via_supervisorctl(prog)
            time.sleep(0.2)
        self.assertFalse(fired[0],
                         f"Restart fired with {cooldown:.2f}s elapsed (cooldown={RECOVERY_COOLDOWN_CRITICAL_S}s)")

    @given(
        elapsed=st.floats(
            min_value=RECOVERY_COOLDOWN_CRITICAL_S + 0.1, max_value=3600.0,
            allow_nan=False, allow_infinity=False
        )
    )
    @hyp_settings(max_examples=100, suppress_health_check=[HealthCheck.too_slow], deadline=None)
    def test_restart_allowed_after_cooldown_expires(self, elapsed):
        """Any timestamp beyond the cooldown window must allow a restart."""
        import meta_health_service_v3 as _mod
        svc = _fresh_svc()
        prog = "dolphin_data:exf_fetcher"
        svc._recovery_timestamps[prog] = time.time() - elapsed
        fired = [False]

        def _noop(args, **kwargs):
            fired[0] = True
            return MagicMock(returncode=0)

        with patch.object(_mod.subprocess, "run", side_effect=_noop):
            svc._restart_via_supervisorctl(prog)
            time.sleep(0.5)
        self.assertTrue(fired[0],
                        f"Restart blocked with {elapsed:.1f}s elapsed (cooldown={RECOVERY_COOLDOWN_CRITICAL_S}s)")
# ─────────────────────────────────────────────────────────────────────────────
# Allow running this suite directly (outside pytest) with verbose output.
if __name__ == "__main__":
    unittest.main(verbosity=2)