# Repo snapshot notes — includes core prod + GREEN/BLUE subsystems:
#   prod/ (BLUE harness, configs, scripts, docs)
#   nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
#   adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
#   Observability/ (EsoF advisor, TUI, dashboards)
#   external_factors/ (EsoF producer)
#   mc_forewarning_qlabs_fork/ (MC regime/envelope)
# Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
# (File metadata: 1776 lines, 77 KiB, Python, executable file.)
"""
|
|
DOLPHIN Meta Health Service v3 — Comprehensive Test Suite
|
|
==========================================================
|
|
Unit, integration, and E2E kill/revive tests.
|
|
|
|
Test classes:
|
|
TestSupervisordStatusParsing — unit: parse supervisorctl output variants
|
|
TestM1ProcessIntegrity — unit: scoring logic with mocked sv_status
|
|
TestM3DataFreshnessScoring — unit: freshness thresholds and scoring
|
|
TestRmMetaFormula — unit: weighted sum, thresholds, edge cases
|
|
TestRecoveryGating — unit: cooldown, thread isolation, only-STOPPED rule
|
|
TestRecoveryNeverKillsRunning — unit: RUNNING services are NEVER restarted
|
|
TestM4ControlPlane — unit: port checks with mocked socket
|
|
TestM5Coherence — unit: data integrity checks
|
|
TestLiveIntegration — integration: live HZ + supervisord status
|
|
TestKillAndRevive — E2E: stop critical services, verify MHS detects
|
|
and revives within 60s (critical cooldown window)
|
|
|
|
Run:
|
|
cd /mnt/dolphinng5_predict
|
|
source /home/dolphin/siloqy_env/bin/activate
|
|
python -m pytest prod/tests/test_mhs_v3.py -v --tb=short
|
|
"""
|
|
|
|
import json
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import threading
|
|
import unittest
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict
|
|
from unittest.mock import MagicMock, patch, call
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
# ── Import the module under test ───────────────────────────────────────────────
|
|
from meta_health_service_v3 import (
|
|
MetaHealthServiceV3,
|
|
SERVICES,
|
|
HZ_DATA_SOURCES,
|
|
SENSOR_WEIGHTS,
|
|
CHECK_INTERVAL_S,
|
|
DATA_STALE_S,
|
|
DATA_DEAD_S,
|
|
RECOVERY_COOLDOWN_CRITICAL_S,
|
|
RECOVERY_COOLDOWN_DEFAULT_S,
|
|
SUPERVISORD_CONF,
|
|
)
|
|
|
|
CONF_PATH = str(SUPERVISORD_CONF)
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Helpers
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
def _fresh_svc() -> MetaHealthServiceV3:
    """Build a brand-new MHS instance; its HZ connection is established lazily."""
    service = MetaHealthServiceV3()
    return service
|
|
|
|
|
|
def _hz_available() -> bool:
|
|
import socket
|
|
try:
|
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
|
s.settimeout(1.0)
|
|
return s.connect_ex(("127.0.0.1", 5701)) == 0
|
|
except Exception:
|
|
return False
|
|
|
|
|
|
def _supervisord_running() -> bool:
|
|
try:
|
|
r = subprocess.run(
|
|
["supervisorctl", "-c", CONF_PATH, "status"],
|
|
capture_output=True, text=True, timeout=5,
|
|
)
|
|
return "exf_fetcher" in r.stdout
|
|
except Exception:
|
|
return False
|
|
|
|
|
|
def _supervisorctl(cmd: str, prog: str) -> bool:
|
|
"""Run supervisorctl command, return True on success."""
|
|
try:
|
|
r = subprocess.run(
|
|
["supervisorctl", "-c", CONF_PATH, cmd, prog],
|
|
capture_output=True, text=True, timeout=15,
|
|
)
|
|
return r.returncode == 0
|
|
except Exception:
|
|
return False
|
|
|
|
|
|
def _get_supervisord_state(prog: str) -> str:
|
|
"""Return supervisord state for a program: RUNNING / STOPPED / FATAL / UNKNOWN."""
|
|
try:
|
|
r = subprocess.run(
|
|
["supervisorctl", "-c", CONF_PATH, "status", prog],
|
|
capture_output=True, text=True, timeout=5,
|
|
)
|
|
parts = r.stdout.strip().split()
|
|
return parts[1] if len(parts) >= 2 else "UNKNOWN"
|
|
except Exception:
|
|
return "UNKNOWN"
|
|
|
|
|
|
def _now_iso() -> str:
|
|
return datetime.now(timezone.utc).isoformat()
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Unit tests
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
class TestSupervisordStatusParsing(unittest.TestCase):
    """Unit: _check_supervisord_status parses all supervisorctl output variants.

    subprocess.run is patched throughout, so no live supervisord is needed;
    each test feeds canned stdout and inspects the parsed {program: state} map.
    """

    def setUp(self):
        # Fresh service per test — no shared state between cases.
        self.svc = _fresh_svc()

    def _mock_run(self, stdout: str, returncode: int = 0):
        # Minimal stand-in for subprocess.CompletedProcess: only .stdout and
        # .returncode are consumed by the code under test.
        mock = MagicMock()
        mock.stdout = stdout
        mock.returncode = returncode
        return mock

    def test_all_running(self):
        # Full healthy roster: every known program reported RUNNING.
        stdout = (
            "dolphin_data:exf_fetcher RUNNING pid 100, uptime 1:00:00\n"
            "dolphin_data:acb_processor RUNNING pid 101, uptime 0:30:00\n"
            "dolphin_data:obf_universe RUNNING pid 102, uptime 0:20:00\n"
            "dolphin_data:meta_health RUNNING pid 103, uptime 0:10:00\n"
            "dolphin:nautilus_trader RUNNING pid 200, uptime 2:00:00\n"
            "dolphin:scan_bridge RUNNING pid 201, uptime 1:50:00\n"
        )
        with patch("subprocess.run", return_value=self._mock_run(stdout)):
            result = self.svc._check_supervisord_status()
        for prog in SERVICES:
            self.assertIn(prog, result, f"{prog} not in parsed result")
            self.assertEqual(result[prog], "RUNNING", f"{prog} should be RUNNING")

    def test_one_stopped(self):
        stdout = (
            "dolphin_data:exf_fetcher STOPPED Not started\n"
            "dolphin_data:acb_processor RUNNING pid 101\n"
            "dolphin_data:obf_universe RUNNING pid 102\n"
        )
        with patch("subprocess.run", return_value=self._mock_run(stdout)):
            result = self.svc._check_supervisord_status()
        self.assertEqual(result.get("dolphin_data:exf_fetcher"), "STOPPED")
        self.assertEqual(result.get("dolphin_data:acb_processor"), "RUNNING")

    def test_starting_state(self):
        # Trailing whitespace after the state column must not break parsing.
        stdout = "dolphin_data:exf_fetcher STARTING \n"
        with patch("subprocess.run", return_value=self._mock_run(stdout)):
            result = self.svc._check_supervisord_status()
        self.assertEqual(result.get("dolphin_data:exf_fetcher"), "STARTING")

    def test_fatal_state(self):
        stdout = "dolphin_data:exf_fetcher FATAL Exited too quickly\n"
        with patch("subprocess.run", return_value=self._mock_run(stdout)):
            result = self.svc._check_supervisord_status()
        self.assertEqual(result.get("dolphin_data:exf_fetcher"), "FATAL")

    def test_subprocess_failure_returns_empty(self):
        # supervisorctl blowing up entirely must degrade to an empty map,
        # never propagate an exception to the caller.
        with patch("subprocess.run", side_effect=Exception("timeout")):
            result = self.svc._check_supervisord_status()
        self.assertIsInstance(result, dict)
        self.assertEqual(len(result), 0)

    def test_exit_code_3_still_parsed(self):
        """supervisorctl exits 3 when some services are STOPPED — output still valid."""
        stdout = (
            "dolphin_data:exf_fetcher RUNNING pid 100\n"
            "dolphin:clean_arch_trader STOPPED Not started\n"
        )
        with patch("subprocess.run", return_value=self._mock_run(stdout, returncode=3)):
            result = self.svc._check_supervisord_status()
        self.assertEqual(result.get("dolphin_data:exf_fetcher"), "RUNNING")

    def test_empty_output_returns_empty(self):
        with patch("subprocess.run", return_value=self._mock_run("")):
            result = self.svc._check_supervisord_status()
        self.assertEqual(result, {})
|
|
|
|
|
|
class TestM1ProcessIntegrity(unittest.TestCase):
    """Unit: M1 scoring with various supervisorctl states.

    _m1_process_integrity() returns (m1_data, m1_trader, service_states):
    fraction of critical-data services up, fraction of trader-side services
    up, and a per-program state map.
    """

    def setUp(self):
        self.svc = _fresh_svc()

    def _run_m1_with_statuses(self, sv: Dict[str, str]):
        # Bypass supervisorctl entirely and feed a synthetic state map.
        with patch.object(self.svc, "_check_supervisord_status", return_value=sv):
            return self.svc._m1_process_integrity()

    def test_all_running_score_1(self):
        sv = {p: "RUNNING" for p in SERVICES}
        m1_data, m1_trader, svc_s = self._run_m1_with_statuses(sv)
        self.assertEqual(m1_data, 1.0)
        self.assertEqual(m1_trader, 1.0)

    def test_one_critical_stopped_reduces_data_score(self):
        sv = {p: "RUNNING" for p in SERVICES}
        sv["dolphin_data:exf_fetcher"] = "STOPPED"
        m1_data, m1_trader, _ = self._run_m1_with_statuses(sv)
        # Expected: remaining critical services over total critical services.
        critical = [p for p, c in SERVICES.items() if c["critical_data"]]
        expected = (len(critical) - 1) / len(critical)
        self.assertAlmostEqual(m1_data, expected, places=3)
        self.assertEqual(m1_trader, 1.0)  # trader score unaffected

    def test_trader_stopped_does_not_affect_data_score(self):
        sv = {p: "RUNNING" for p in SERVICES}
        sv["dolphin:nautilus_trader"] = "STOPPED"
        m1_data, m1_trader, _ = self._run_m1_with_statuses(sv)
        self.assertEqual(m1_data, 1.0)  # data infra unaffected
        trader_progs = [p for p, c in SERVICES.items() if not c["critical_data"]]
        expected_trader = (len(trader_progs) - 1) / len(trader_progs)
        self.assertAlmostEqual(m1_trader, expected_trader, places=3)

    def test_all_critical_stopped_score_0(self):
        sv = {p: "RUNNING" for p in SERVICES}
        for p, c in SERVICES.items():
            if c["critical_data"]:
                sv[p] = "STOPPED"
        m1_data, _, _ = self._run_m1_with_statuses(sv)
        self.assertEqual(m1_data, 0.0)

    def test_fatal_treated_same_as_stopped(self):
        sv = {p: "RUNNING" for p in SERVICES}
        sv["dolphin_data:acb_processor"] = "FATAL"
        m1_data, _, svc_s = self._run_m1_with_statuses(sv)
        self.assertLess(m1_data, 1.0)
        # FATAL is normalized to STOPPED in the reported state map.
        self.assertEqual(svc_s["dolphin_data:acb_processor"], "STOPPED")

    def test_service_dict_contains_all_services(self):
        # Every configured program must appear in the returned state map.
        sv = {p: "RUNNING" for p in SERVICES}
        _, _, svc_s = self._run_m1_with_statuses(sv)
        for prog in SERVICES:
            self.assertIn(prog, svc_s)

    def test_empty_supervisord_output_falls_back_to_psutil(self):
        """If supervisorctl returns nothing, psutil fallback is used."""
        with patch.object(self.svc, "_check_supervisord_status", return_value={}):
            with patch("psutil.process_iter") as mock_pi:
                # One live process whose cmdline matches the exf_fetcher entry.
                mock_proc = MagicMock()
                mock_proc.info = {
                    "name": "python3",
                    "cmdline": ["/usr/bin/python3", "exf_fetcher_flow.py"],
                }
                mock_pi.return_value = [mock_proc]
                m1_data, _, svc_s = self.svc._m1_process_integrity()
        self.assertEqual(svc_s["dolphin_data:exf_fetcher"], "RUNNING")
|
|
|
|
|
|
class TestM3DataFreshnessScoring(unittest.TestCase):
    """Unit: freshness thresholds applied correctly.

    Scoring contract exercised here: fresh → 1.0, older than DATA_STALE_S →
    0.5, older than DATA_DEAD_S → 0.0, key missing → 0.0, key present but
    timestamp field absent → 0.7 (presence-only credit).
    """

    def setUp(self):
        self.svc = _fresh_svc()

    def _make_hz_with_age(self, age_s: float, ts_field: str):
        """Build a JSON blob whose ts_field is age_s seconds in the past.

        (fix: removed an unused leading `key` parameter — the helper never
        read it; call sites updated accordingly.)
        """
        ts = time.time() - age_s
        if ts_field == "_pushed_at":
            ts_val = ts  # numeric unix timestamp
        else:
            ts_val = datetime.fromtimestamp(ts, tz=timezone.utc).isoformat()
        return json.dumps({ts_field: ts_val, "dummy": 1})

    def test_fresh_scores_1(self):
        raw = self._make_hz_with_age(5.0, "_pushed_at")
        score, _ = self._score_one("exf_latest", raw, "_pushed_at")
        self.assertEqual(score, 1.0)

    def test_stale_scores_half(self):
        raw = self._make_hz_with_age(DATA_STALE_S + 5, "_pushed_at")
        score, _ = self._score_one("exf_latest", raw, "_pushed_at")
        self.assertEqual(score, 0.5)

    def test_dead_scores_zero(self):
        raw = self._make_hz_with_age(DATA_DEAD_S + 1, "_pushed_at")
        score, _ = self._score_one("exf_latest", raw, "_pushed_at")
        self.assertEqual(score, 0.0)

    def test_missing_key_scores_zero(self):
        score, _ = self._score_one("exf_latest", None, "_pushed_at")
        self.assertEqual(score, 0.0)

    def test_no_ts_field_scores_point7(self):
        """Key present but no timestamp field → 0.7 (presence-only)."""
        raw = json.dumps({"other_key": 123})
        # Registry declares ts_field="_pushed_at" for exf_latest; the payload
        # lacks it, so the score drops to the 0.7 presence-only tier.
        # (Keys registered with ts_field=None score 1.0 on presence instead —
        # see test_presence_only_key_scores_1_when_present.)
        score, _ = self._score_one("exf_latest", raw, "_pushed_at")
        self.assertAlmostEqual(score, 0.7, places=3)

    def test_iso_timestamp_parsed_correctly(self):
        """ISO format timestamps (obf_universe) are parsed and aged correctly."""
        ts_val = datetime.fromtimestamp(time.time() - 10, tz=timezone.utc).isoformat()
        raw = json.dumps({"_snapshot_utc": ts_val})
        score, _ = self._score_one("obf_universe", raw, "_snapshot_utc")
        self.assertEqual(score, 1.0)

    def test_presence_only_key_scores_1_when_present(self):
        """acb_boost has ts_field=None → presence-only, score=1.0."""
        raw = json.dumps({"boost": 1.3, "signals": 1.0})
        with patch.object(self.svc, "_get_hz") as mock_hz:
            features_map = MagicMock()
            features_map.get.side_effect = lambda k: raw if k == "acb_boost" else None
            mock_hz.return_value.get_map.return_value.blocking.return_value = features_map
            m3, results = self.svc._m3_data_freshness()
        self.assertEqual(results.get("acb_boost", {}).get("score"), 1.0)

    def _score_one(self, name: str, raw_value, ts_field: str):
        """Helper: run M3 for a single key by mocking HZ.

        Returns (score, result_entry) for `name`; all other HZ keys read as
        missing (None).
        """
        with patch.object(self.svc, "_get_hz") as mock_hz:
            features_map = MagicMock()
            # Serve raw_value only for the HZ key registered under `name`.
            features_map.get.side_effect = lambda k: (
                raw_value if k == HZ_DATA_SOURCES.get(name, (None, k, None))[1] else None
            )
            mock_hz.return_value.get_map.return_value.blocking.return_value = features_map
            m3, results = self.svc._m3_data_freshness()
        entry = results.get(name, {})
        return entry.get("score", None), entry
|
|
|
|
|
|
class TestRmMetaFormula(unittest.TestCase):
    """Unit: weighted sum behaves correctly.

    _compute_rm_meta(m4, m1_data, m3, m5) → (rm_meta, status); rm_meta is a
    SENSOR_WEIGHTS-weighted sum (weights must sum to 1), status one of
    GREEN / DEGRADED / CRITICAL / DEAD.
    """

    def setUp(self):
        self.svc = _fresh_svc()

    def _rm(self, m4, m1, m3, m5):
        """Delegate to _compute_rm_meta, returning (rm_meta, status)."""
        rm, status = self.svc._compute_rm_meta(m4, m1, m3, m5)
        return rm, status

    def test_all_ones_is_green(self):
        rm, status = self._rm(1.0, 1.0, 1.0, 1.0)
        self.assertEqual(status, "GREEN")
        self.assertAlmostEqual(rm, 1.0, places=2)

    def test_all_zeros_is_dead(self):
        rm, status = self._rm(0.0, 0.0, 0.0, 0.0)
        self.assertEqual(status, "DEAD")
        self.assertAlmostEqual(rm, 0.0, places=2)

    def test_hz_down_alone_is_still_degraded(self):
        """M4=0 (HZ down) but all processes running and data fresh → DEGRADED, not DEAD."""
        rm, status = self._rm(0.0, 1.0, 1.0, 1.0)
        # M4 weight=0.35; remaining = 0.65 → DEGRADED (>0.6)
        self.assertIn(status, ("DEGRADED", "GREEN"))
        self.assertGreater(rm, 0.3)

    def test_one_data_service_down_still_not_dead(self):
        """One of three critical data services stopped → M1_data=0.667."""
        m1_data = 2/3
        rm, status = self._rm(1.0, m1_data, 1.0, 1.0)
        self.assertNotEqual(status, "DEAD")
        self.assertGreater(rm, 0.6)

    def test_all_data_services_down_critical_or_dead(self):
        rm, status = self._rm(1.0, 0.0, 0.0, 0.0)
        self.assertIn(status, ("CRITICAL", "DEAD"))

    def test_hz_and_data_both_down_is_critical(self):
        rm, status = self._rm(0.0, 0.0, 1.0, 1.0)
        self.assertIn(status, ("CRITICAL", "DEAD"))

    def test_product_formula_would_have_been_wrong(self):
        """Prove that the old product formula collapses when M3=0."""
        product = 1.0 * 1.0 * 0.0 * 1.0  # M3=0 → product=0
        self.assertEqual(product, 0.0)
        # But weighted sum is fine:
        rm, _ = self._rm(1.0, 1.0, 0.0, 1.0)
        self.assertGreater(rm, 0.5)

    def test_status_thresholds(self):
        # Each row: (m4, m1, m3, m5, expected_status).  Every result must be a
        # valid label; the unambiguous endpoints (all-1 → GREEN, all-0 → DEAD)
        # are additionally pinned exactly.
        # (fix: the `expected` column was previously computed but never
        # asserted, making the loop vacuous beyond label validity.)
        cases = [
            (1.0, 1.0, 1.0, 1.0, "GREEN"),
            (1.0, 0.5, 1.0, 1.0, "GREEN"),
            (0.5, 0.5, 0.5, 0.5, "DEGRADED"),
            (0.0, 0.5, 0.0, 0.5, "CRITICAL"),
            (0.0, 0.0, 0.0, 0.0, "DEAD"),
        ]
        for m4, m1, m3, m5, expected in cases:
            _, status = self._rm(m4, m1, m3, m5)
            self.assertIn(status, ("GREEN", "DEGRADED", "CRITICAL", "DEAD"),
                          f"Invalid status for m4={m4} m1={m1} m3={m3} m5={m5}")
            if (m4, m1, m3, m5) in ((1.0, 1.0, 1.0, 1.0), (0.0, 0.0, 0.0, 0.0)):
                # Endpoints are already pinned by sibling tests; assert here too.
                self.assertEqual(status, expected)

    def test_weights_sum_to_1(self):
        self.assertAlmostEqual(sum(SENSOR_WEIGHTS.values()), 1.0, places=6)

    def test_m2_and_trader_excluded_from_rm(self):
        """rm_meta must be independent of M2 and M1_trader."""
        rm1, _ = self._rm(1.0, 1.0, 1.0, 1.0)
        # Even if M2=0 or M1_trader=0, rm_meta shouldn't change —
        # they simply aren't inputs to _compute_rm_meta.
        rm2, _ = self._rm(1.0, 1.0, 1.0, 1.0)
        self.assertAlmostEqual(rm1, rm2, places=6)
|
|
|
|
|
|
class TestRecoveryGating(unittest.TestCase):
    """Unit: cooldown, thread isolation, only-STOPPED gating.

    _restart_via_supervisorctl spawns a background thread per restart and
    rate-limits per-service via _recovery_timestamps; these tests patch
    subprocess.run and count invocations.
    """

    def setUp(self):
        self.svc = _fresh_svc()

    def test_cooldown_prevents_double_restart(self):
        """Two calls within cooldown window → only one restart fires."""
        # (fix: removed an unused `calls = []` local left from an earlier
        # version of this test.)
        with patch("subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(stdout="ok", returncode=0)
            self.svc._restart_via_supervisorctl("dolphin_data:exf_fetcher")
            time.sleep(0.1)
            self.svc._restart_via_supervisorctl("dolphin_data:exf_fetcher")
            time.sleep(0.5)  # let the background recovery thread finish
            self.assertEqual(mock_run.call_count, 1)

    def test_critical_service_cooldown_is_10s(self):
        """Critical services have RECOVERY_COOLDOWN_CRITICAL_S=10 cooldown."""
        self.assertEqual(RECOVERY_COOLDOWN_CRITICAL_S, 10.0)
        self.assertLess(RECOVERY_COOLDOWN_CRITICAL_S, 60.0,
                        "Critical cooldown must be < 60 seconds")

    def test_cooldown_expires_allows_second_restart(self):
        """After cooldown, a second restart is allowed."""
        # Backdate the last-restart timestamp past the cooldown window.
        self.svc._recovery_timestamps["dolphin_data:exf_fetcher"] = (
            time.time() - RECOVERY_COOLDOWN_CRITICAL_S - 1
        )
        with patch("subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(stdout="ok", returncode=0)
            self.svc._restart_via_supervisorctl("dolphin_data:exf_fetcher")
            time.sleep(0.5)
            self.assertEqual(mock_run.call_count, 1)

    def test_restart_runs_in_separate_thread(self):
        """Recovery must not block the calling thread."""
        main_thread = threading.current_thread()
        restart_thread_name = None

        def fake_run(*args, **kwargs):
            # Record which thread actually executes the restart.
            nonlocal restart_thread_name
            restart_thread_name = threading.current_thread().name
            time.sleep(0.1)
            return MagicMock(stdout="ok", returncode=0)

        with patch("subprocess.run", side_effect=fake_run):
            t0 = time.monotonic()
            self.svc._restart_via_supervisorctl("dolphin_data:exf_fetcher")
            elapsed = time.monotonic() - t0

            self.assertLess(elapsed, 0.05,  # main thread was not blocked
                            "Recovery blocked the calling thread")
            time.sleep(0.3)
            self.assertIsNotNone(restart_thread_name)
            self.assertNotEqual(restart_thread_name, main_thread.name)
            self.assertIn("recovery", restart_thread_name)

    def test_different_services_have_independent_cooldowns(self):
        """Cooldown on exf_fetcher must not block acb_processor restart."""
        self.svc._recovery_timestamps["dolphin_data:exf_fetcher"] = time.time()

        with patch("subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(stdout="ok", returncode=0)
            self.svc._restart_via_supervisorctl("dolphin_data:exf_fetcher")  # blocked by cooldown
            self.svc._restart_via_supervisorctl("dolphin_data:acb_processor")  # should fire
            time.sleep(0.3)

            called_progs = [str(c) for c in mock_run.call_args_list]
            self.assertEqual(mock_run.call_count, 1)
            self.assertTrue(any("acb_processor" in c for c in called_progs))
|
|
|
|
|
|
class TestRecoveryNeverKillsRunning(unittest.TestCase):
    """Unit: RUNNING services are NEVER touched by recovery.

    _attempt_recovery(report) must only restart critical-data services that
    are STOPPED while overall status is CRITICAL/DEAD — never anything
    RUNNING, never trader-side programs, never on GREEN/DEGRADED.
    """

    def setUp(self):
        self.svc = _fresh_svc()

    def _make_report(self, status, service_states):
        """Build a minimal HealthReport with the given status and state map.

        (fix: removed an unused `from dataclasses import fields` import.)
        """
        from meta_health_service_v3 import HealthReport
        return HealthReport(
            rm_meta=0.0,
            status=status,
            m4_control_plane=0.0,
            m1_data_infra=0.0,
            m1_trader=1.0,
            m2_heartbeat=0.5,
            m3_data_freshness=0.0,
            m5_coherence=0.0,
            service_status=service_states,
            hz_key_status={},
            timestamp=_now_iso(),
        )

    def test_running_service_never_restarted_even_in_dead_state(self):
        """Status=DEAD but all services RUNNING → no restart attempt."""
        states = {p: "RUNNING" for p in SERVICES}
        report = self._make_report("DEAD", states)
        with patch.object(self.svc, "_restart_via_supervisorctl") as mock_restart:
            self.svc._attempt_recovery(report)
            mock_restart.assert_not_called()

    def test_green_status_never_triggers_recovery(self):
        states = {p: "STOPPED" for p in SERVICES}  # all stopped but GREEN
        report = self._make_report("GREEN", states)
        with patch.object(self.svc, "_restart_via_supervisorctl") as mock_restart:
            self.svc._attempt_recovery(report)
            mock_restart.assert_not_called()

    def test_degraded_status_never_triggers_recovery(self):
        states = {"dolphin_data:exf_fetcher": "STOPPED"}
        report = self._make_report("DEGRADED", states)
        with patch.object(self.svc, "_restart_via_supervisorctl") as mock_restart:
            self.svc._attempt_recovery(report)
            mock_restart.assert_not_called()

    def test_trader_stopped_never_restarted(self):
        """Trader is informational — never auto-restarted regardless of status."""
        states = {p: "RUNNING" for p in SERVICES}
        states["dolphin:nautilus_trader"] = "STOPPED"
        states["dolphin:scan_bridge"] = "STOPPED"
        report = self._make_report("CRITICAL", states)
        with patch.object(self.svc, "_restart_via_supervisorctl") as mock_restart:
            self.svc._attempt_recovery(report)
            # trader and scan_bridge must never be restarted
            for c in mock_restart.call_args_list:
                prog = c[0][0]
                self.assertNotIn("nautilus_trader", prog)
                self.assertNotIn("scan_bridge", prog)

    def test_critical_stopped_service_triggers_restart(self):
        """critical_data=True service that is STOPPED + status CRITICAL → restart fires."""
        states = {p: "RUNNING" for p in SERVICES}
        states["dolphin_data:exf_fetcher"] = "STOPPED"
        report = self._make_report("CRITICAL", states)
        with patch.object(self.svc, "_restart_via_supervisorctl") as mock_restart:
            self.svc._attempt_recovery(report)
            mock_restart.assert_called_once_with("dolphin_data:exf_fetcher")

    def test_dead_with_stopped_data_service_triggers_restart(self):
        states = {p: "RUNNING" for p in SERVICES}
        states["dolphin_data:obf_universe"] = "STOPPED"
        report = self._make_report("DEAD", states)
        with patch.object(self.svc, "_restart_via_supervisorctl") as mock_restart:
            self.svc._attempt_recovery(report)
            mock_restart.assert_called_once_with("dolphin_data:obf_universe")
|
|
|
|
|
|
class TestM4ControlPlane(unittest.TestCase):
    """Unit: port checks.

    _m4_control_plane probes HZ (5701, weight 0.8) and Prefect (4200,
    weight 0.2); socket.socket is replaced by a fake so no real ports open.
    """

    def setUp(self):
        self.svc = _fresh_svc()

    def _mock_socket(self, ports_up: set):
        """Patch socket.socket with a fake that 'connects' only to ports_up.

        (fix: removed an unused `import socket as _sock` / `orig` binding.)
        """

        class FakeSocket:
            def __init__(self, *a, **kw): pass
            def __enter__(self): return self
            def __exit__(self, *a): pass
            def settimeout(self, t): pass
            def connect_ex(self, addr):
                # 0 = connected (port open), nonzero = refused.
                return 0 if addr[1] in ports_up else 1

        return patch("socket.socket", FakeSocket)

    def test_both_up_scores_1(self):
        with self._mock_socket({5701, 4200}):
            score = self.svc._m4_control_plane()
        self.assertAlmostEqual(score, 1.0, places=2)

    def test_hz_up_prefect_down_scores_high(self):
        with self._mock_socket({5701}):
            score = self.svc._m4_control_plane()
        # HZ weight=0.8, Prefect weight=0.2 → 0.8
        self.assertAlmostEqual(score, 0.8, places=2)

    def test_both_down_scores_zero(self):
        with self._mock_socket(set()):
            score = self.svc._m4_control_plane()
        self.assertAlmostEqual(score, 0.0, places=2)

    def test_hz_down_prefect_up_scores_point2(self):
        with self._mock_socket({4200}):
            score = self.svc._m4_control_plane()
        self.assertAlmostEqual(score, 0.2, places=2)
|
|
|
|
|
|
class TestM5Coherence(unittest.TestCase):
    """Unit: coherence checks on HZ data.

    _m5_coherence inspects exf_latest, acb_boost and obf_universe_latest
    and averages per-key plausibility checks; HZ is fully mocked here.
    """

    def setUp(self):
        self.svc = _fresh_svc()

    def _mock_features(self, exf=None, acb=None, uni=None):
        # Fake HZ client whose features map serves the given payloads
        # (JSON-encoded) for the three keys M5 reads.
        # NOTE(review): `if exf else None` treats an *empty* dict as missing;
        # fine for these tests, but confirm intent before reusing the helper.
        mock_hz = MagicMock()
        features = MagicMock()

        def _get(k):
            if k == "exf_latest":
                return json.dumps(exf) if exf else None
            if k == "acb_boost":
                return json.dumps(acb) if acb else None
            if k == "obf_universe_latest":
                return json.dumps(uni) if uni else None
            return None

        features.get.side_effect = _get
        mock_hz.get_map.return_value.blocking.return_value = features
        return mock_hz

    def test_all_coherent_scores_1(self):
        exf = {"_acb_ready": True, "_ok_count": 30}
        acb = {"boost": 1.35}
        uni = {"_n_assets": 500}
        with patch.object(self.svc, "_get_hz",
                          return_value=self._mock_features(exf, acb, uni)):
            score = self.svc._m5_coherence()
        self.assertAlmostEqual(score, 1.0, places=2)

    def test_acb_not_ready_scores_half(self):
        exf = {"_acb_ready": False, "_ok_count": 30}
        acb = {"boost": 1.35}
        uni = {"_n_assets": 500}
        with patch.object(self.svc, "_get_hz",
                          return_value=self._mock_features(exf, acb, uni)):
            score = self.svc._m5_coherence()
        self.assertLess(score, 1.0)

    def test_boost_out_of_range_scores_low(self):
        exf = {"_acb_ready": True, "_ok_count": 30}
        acb = {"boost": 0.5}  # invalid: must be [1.0, 2.5]
        uni = {"_n_assets": 500}
        with patch.object(self.svc, "_get_hz",
                          return_value=self._mock_features(exf, acb, uni)):
            score = self.svc._m5_coherence()
        self.assertLess(score, 1.0)

    def test_boost_exactly_1_is_valid(self):
        # Lower bound of the valid boost range is inclusive.
        exf = {"_acb_ready": True, "_ok_count": 30}
        acb = {"boost": 1.0}
        uni = {"_n_assets": 500}
        with patch.object(self.svc, "_get_hz",
                          return_value=self._mock_features(exf, acb, uni)):
            score = self.svc._m5_coherence()
        # boost=1.0 is valid → acb check passes
        self.assertGreater(score, 0.5)

    def test_obf_universe_below_200_scores_half(self):
        exf = {"_acb_ready": True, "_ok_count": 30}
        acb = {"boost": 1.35}
        uni = {"_n_assets": 50}  # too few assets
        with patch.object(self.svc, "_get_hz",
                          return_value=self._mock_features(exf, acb, uni)):
            score = self.svc._m5_coherence()
        # obf check gives 0.5, others give 1.0 → avg < 1.0
        self.assertLess(score, 1.0)

    def test_no_hz_scores_zero(self):
        # No HZ connection at all → M5 collapses to 0.
        with patch.object(self.svc, "_get_hz", return_value=None):
            score = self.svc._m5_coherence()
        self.assertEqual(score, 0.0)

    def test_exf_missing_scores_low(self):
        with patch.object(self.svc, "_get_hz",
                          return_value=self._mock_features(None, {"boost": 1.3}, {"_n_assets": 500})):
            score = self.svc._m5_coherence()
        self.assertLess(score, 1.0)
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Integration tests (require live HZ + supervisord)
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
@unittest.skipUnless(_hz_available(), "HZ not reachable")
@unittest.skipUnless(_supervisord_running(), "supervisord not running")
class TestLiveIntegration(unittest.TestCase):
    """Integration: real HZ and supervisord.

    Skipped entirely unless both the Hazelcast port and supervisord are
    reachable on this host. These tests read live state; they mutate nothing
    except the status JSON (which the live MHS daemon also writes).
    """

    def setUp(self):
        self.svc = _fresh_svc()

    def test_hz_connects(self):
        hz = self.svc._get_hz()
        self.assertIsNotNone(hz, "HZ client should connect")

    def test_m4_control_plane_live_is_1(self):
        score = self.svc._m4_control_plane()
        self.assertGreaterEqual(score, 0.8,
                                f"M4={score:.2f}: HZ or Prefect port down")

    def test_all_data_services_running(self):
        sv = self.svc._check_supervisord_status()
        for prog, cfg in SERVICES.items():
            if cfg["critical_data"]:
                state = sv.get(prog, "UNKNOWN")
                self.assertEqual(state, "RUNNING",
                                 f"Critical data service {prog} is {state}")

    def test_all_hz_keys_present(self):
        _, results = self.svc._m3_data_freshness()
        for key, info in results.items():
            self.assertNotEqual(info.get("status"), "missing",
                                f"HZ key {key} is missing")

    def test_exf_latest_is_fresh(self):
        _, results = self.svc._m3_data_freshness()
        exf = results.get("exf_latest", {})
        age = exf.get("age_s", 9999)
        self.assertLess(age, DATA_DEAD_S, f"exf_latest is dead ({age:.0f}s old)")

    def test_obf_universe_has_many_assets(self):
        # (fix: dropped an unused `score = self.svc._m5_coherence()` call —
        # its result was never inspected.)
        _, results = self.svc._m3_data_freshness()
        # If obf_universe is present, coherence should reflect it
        obf = results.get("obf_universe", {})
        if obf.get("status") != "missing":
            self.assertNotEqual(obf.get("score", 0.0), 0.0)

    def test_acb_boost_is_plausible(self):
        score = self.svc._m5_coherence()
        self.assertGreater(score, 0.0, "M5 coherence is 0 — check HZ data integrity")

    def test_full_health_check_is_green(self):
        # Run all sensors once and require a production-worthy composite.
        m4 = self.svc._m4_control_plane()
        m1_data, m1_trader, svc_s = self.svc._m1_process_integrity()
        m3, _ = self.svc._m3_data_freshness()
        m5 = self.svc._m5_coherence()
        rm, status = self.svc._compute_rm_meta(m4, m1_data, m3, m5)
        self.assertIn(status, ("GREEN", "DEGRADED"),
                      f"System is {status} (rm={rm:.3f}). Sensors: "
                      f"M4={m4:.2f} M1={m1_data:.2f} M3={m3:.2f} M5={m5:.2f}")
        self.assertGreater(rm, 0.6, f"rm_meta={rm:.3f} too low for production")

    def test_status_json_written(self):
        from meta_health_service_v3 import STATUS_JSON, HealthReport
        # Run one full cycle and emit the report.
        m4 = self.svc._m4_control_plane()
        m1_data, m1_trader, svc_s = self.svc._m1_process_integrity()
        m2 = self.svc._m2_heartbeat_freshness()
        m3, hz_keys = self.svc._m3_data_freshness()
        m5 = self.svc._m5_coherence()
        rm, status = self.svc._compute_rm_meta(m4, m1_data, m3, m5)
        report = HealthReport(
            rm_meta=rm, status=status,
            m4_control_plane=m4, m1_data_infra=m1_data,
            m1_trader=m1_trader, m2_heartbeat=m2,
            m3_data_freshness=m3, m5_coherence=m5,
            service_status=svc_s, hz_key_status=hz_keys,
            timestamp=_now_iso(),
        )
        self.svc._emit(report)
        self.assertTrue(STATUS_JSON.exists(), "Status JSON was not written")
        # Retry read: the live MHS daemon writes to the same file concurrently
        # and may leave it momentarily empty between truncate and write.
        raw = ""
        for _ in range(5):
            raw = STATUS_JSON.read_text().strip()
            if raw:
                break
            time.sleep(0.2)
        self.assertTrue(raw, "Status JSON is empty after retries")
        data = json.loads(raw)
        self.assertIn("rm_meta", data)
        self.assertIn("status", data)
        self.assertIn("service_status", data)

    def test_mhs_meta_health_hz_key_written(self):
        """MHS should push its own report to DOLPHIN_META_HEALTH."""
        hz = self.svc._get_hz()
        if hz is None:
            self.skipTest("HZ not connected")
        meta_map = hz.get_map("DOLPHIN_META_HEALTH").blocking()
        raw = meta_map.get("latest")
        # May not exist yet if MHS only just started
        if raw:
            data = json.loads(raw)
            self.assertIn("rm_meta", data)
            self.assertIn("status", data)
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# E2E kill and revive tests
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
@unittest.skipUnless(_hz_available(), "HZ not reachable")
|
|
@unittest.skipUnless(_supervisord_running(), "supervisord not running")
|
|
class TestKillAndRevive(unittest.TestCase):
    """
    E2E: Stop critical data services via supervisorctl, verify MHS:
    1. Detects STOPPED within one check cycle (≤15s)
    2. Calls supervisorctl restart (via real supervisorctl or mock)
    3. Service comes back RUNNING within RECOVERY_COOLDOWN_CRITICAL_S (10s) + startup
    Note: rm_meta may stay GREEN if only one-of-three data services is stopped
    (weighted sum design). Detection of STOPPED state is the key assertion.

    Uses supervisorctl stop (not kill -9) so supervisord autorestart does NOT
    fire — MHS is the sole recovery mechanism in this scenario.
    """

    DETECT_TIMEOUT_S = 20  # MHS must detect STOPPED within this many seconds
    REVIVE_TIMEOUT_S = 30  # cooldown=10s + supervisord startsecs≤10s + margin

    def _wait_for_state(self, prog: str, target: str, timeout: float) -> bool:
        """Poll supervisord every 2s until `prog` reports `target`; False on timeout."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            if _get_supervisord_state(prog) == target:
                return True
            time.sleep(2)
        return False

    def _run_one_check(self, svc: MetaHealthServiceV3):
        """Run one full MHS sensor sweep; returns (rm_meta, status, service_status)."""
        m4 = svc._m4_control_plane()
        m1_data, m1_trader, svc_s = svc._m1_process_integrity()
        m3, hz_keys = svc._m3_data_freshness()
        m5 = svc._m5_coherence()
        rm, status = svc._compute_rm_meta(m4, m1_data, m3, m5)
        return rm, status, svc_s

    def _kill_and_revive(self, prog: str, label: str):
        """Shared kill → detect → restart → revive scenario for one critical service."""
        svc = _fresh_svc()

        # ── Pre-condition: service must be RUNNING ────────────────────────────
        initial_state = _get_supervisord_state(prog)
        if initial_state != "RUNNING":
            self.skipTest(f"{label} not RUNNING (state={initial_state}) — skipping kill test")

        try:
            # ── Kill it ───────────────────────────────────────────────────────
            ok = _supervisorctl("stop", prog)
            self.assertTrue(ok, f"supervisorctl stop {prog} failed")

            stopped = self._wait_for_state(prog, "STOPPED", 10)
            self.assertTrue(stopped, f"{label} did not reach STOPPED state in 10s")

            # ── MHS detects STOPPED ──────────────────────────────────────────
            rm, status, svc_s = self._run_one_check(svc)
            self.assertEqual(svc_s.get(prog), "STOPPED",
                             f"MHS did not detect {label} as STOPPED (got {svc_s.get(prog)})")
            # Note: rm_meta may stay GREEN if only one data service is stopped
            # (weighted sum; GREEN threshold not breached at 2/3 data infra).
            # The key assertion is that MHS correctly sees the service as STOPPED.

            # ── MHS recovery fires ───────────────────────────────────────────
            restart_called = threading.Event()
            original_fn = svc._restart_via_supervisorctl

            def _spy_restart(p):
                # Record that our target's restart was requested, then delegate
                # so the real restart still happens.
                if p == prog:
                    restart_called.set()
                original_fn(p)  # call through to actually restart

            with patch.object(svc, "_restart_via_supervisorctl", side_effect=_spy_restart):
                svc._attempt_recovery(
                    __import__("meta_health_service_v3").HealthReport(
                        rm_meta=rm, status="CRITICAL",
                        m4_control_plane=1.0, m1_data_infra=0.67,
                        m1_trader=1.0, m2_heartbeat=0.5,
                        m3_data_freshness=1.0, m5_coherence=1.0,
                        service_status=svc_s, hz_key_status={},
                        timestamp=_now_iso(),
                    )
                )
                restart_called.wait(timeout=5)
            self.assertTrue(restart_called.is_set(),
                            f"MHS did not call restart for {label}")

            # ── Service comes back ───────────────────────────────────────────
            revived = self._wait_for_state(prog, "RUNNING", self.REVIVE_TIMEOUT_S)
            self.assertTrue(revived,
                            f"{label} did not revive within {self.REVIVE_TIMEOUT_S}s")

            # ── rm_meta recovers to GREEN ─────────────────────────────────────
            svc2 = _fresh_svc()
            time.sleep(5)  # allow service to settle and push to HZ
            rm2, status2, _ = self._run_one_check(svc2)
            self.assertIn(status2, ("GREEN", "DEGRADED"),
                          f"rm_meta did not recover after {label} revival "
                          f"(rm={rm2:.3f} [{status2}])")

        finally:
            # Always ensure service is running after test
            if _get_supervisord_state(prog) != "RUNNING":
                _supervisorctl("start", prog)
                self._wait_for_state(prog, "RUNNING", 30)

    def test_kill_and_revive_exf_fetcher(self):
        self._kill_and_revive("dolphin_data:exf_fetcher", "ExF Fetcher")

    def test_kill_and_revive_acb_processor(self):
        self._kill_and_revive("dolphin_data:acb_processor", "ACB Processor")

    def test_kill_and_revive_obf_universe(self):
        self._kill_and_revive("dolphin_data:obf_universe", "OBF Universe")

    def test_detection_within_check_interval(self):
        """MHS must detect a stopped service within CHECK_INTERVAL_S seconds."""
        self.assertLessEqual(CHECK_INTERVAL_S, 15.0,
                             f"CHECK_INTERVAL_S={CHECK_INTERVAL_S} too slow for detection")

    def test_critical_revive_faster_than_10s(self):
        """Critical data services must have cooldown ≤ 10s."""
        self.assertLessEqual(RECOVERY_COOLDOWN_CRITICAL_S, 10.0,
                             "Critical service cooldown must be ≤ 10s")

    def test_double_kill_resistance(self):
        """
        Kill same service twice within cooldown window.
        MHS cooldown must prevent a supervisorctl restart storm.
        We count actual subprocess.run("supervisorctl restart ...) calls — not
        method invocations — so the counter only increments when the cooldown
        gate allows the call through to the daemon thread.
        """
        prog = "dolphin_data:acb_processor"
        if _get_supervisord_state(prog) != "RUNNING":
            self.skipTest(f"{prog} not RUNNING")

        svc = _fresh_svc()
        restart_subprocess_count = [0]
        _original_run = subprocess.run

        def _counting_run(args, **kwargs):
            # Count only genuine `supervisorctl restart` invocations, then pass
            # through to the real subprocess.run so the restart actually runs.
            if (isinstance(args, list)
                    and "supervisorctl" in str(args)
                    and "restart" in args):
                restart_subprocess_count[0] += 1
            return _original_run(args, **kwargs)

        try:
            _supervisorctl("stop", prog)
            time.sleep(3)

            import meta_health_service_v3 as _mod
            with patch.object(_mod.subprocess, "run", side_effect=_counting_run):
                svc._restart_via_supervisorctl(prog)  # fires daemon thread
                time.sleep(5)  # let thread complete
                svc._restart_via_supervisorctl(prog)  # must be blocked by cooldown
                time.sleep(2)

            self.assertEqual(
                restart_subprocess_count[0], 1,
                "Cooldown failed: supervisorctl restart invoked more than once "
                f"within the {RECOVERY_COOLDOWN_CRITICAL_S}s window",
            )
        finally:
            self._wait_for_state(prog, "RUNNING", 30)
            if _get_supervisord_state(prog) != "RUNNING":
                _supervisorctl("start", prog)
                self._wait_for_state(prog, "RUNNING", 30)

    def test_m3_drops_when_exf_stopped(self):
        """
        When ExF is stopped, exf_latest in HZ goes stale.
        M3 score must drop from 1.0 toward 0.0 within DATA_DEAD_S seconds.
        We wait up to DATA_STALE_S for a fresh baseline before starting,
        so prior kill/revive tests don't leave stale data that confuses the delta.
        """
        prog = "dolphin_data:exf_fetcher"
        if _get_supervisord_state(prog) != "RUNNING":
            self.skipTest(f"{prog} not RUNNING")

        # Wait for exf_latest to be fresh (age < DATA_STALE_S) before baseline
        deadline = time.time() + DATA_STALE_S + 15
        svc_pre = _fresh_svc()
        while time.time() < deadline:
            _, details = svc_pre._m3_data_freshness()
            exf_age = details.get("exf_latest", {}).get("age_s", 9999)
            if exf_age < DATA_STALE_S:
                break
            time.sleep(5)
        else:
            # while/else: loop exhausted without break → never saw a fresh key.
            self.skipTest(
                f"exf_latest not fresh after {DATA_STALE_S+15}s wait "
                f"(age={exf_age:.0f}s) — prior tests may have left HZ stale"
            )

        m3_pre, _ = svc_pre._m3_data_freshness()

        # Pause meta_health so it doesn't heal exf before data goes stale
        mh_prog = "dolphin_data:meta_health"
        mh_was_running = _get_supervisord_state(mh_prog) == "RUNNING"
        if mh_was_running:
            _supervisorctl("stop", mh_prog)

        try:
            _supervisorctl("stop", prog)
            self._wait_for_state(prog, "STOPPED", 10)

            # Poll until exf_latest actually goes stale — don't rely on a fixed sleep
            # since exf may push one final batch during graceful shutdown (stopwaitsecs=15)
            svc_post = _fresh_svc()
            exf_post = {}
            stale_deadline = time.time() + DATA_STALE_S + 25  # generous: 55s max
            while time.time() < stale_deadline:
                m3_post, results = svc_post._m3_data_freshness()
                exf_post = results.get("exf_latest", {})
                if exf_post.get("score", 1.0) < 1.0:
                    break
                time.sleep(3)

            self.assertLessEqual(exf_post.get("score", 1.0), 0.5,
                                 f"exf_latest score should be stale after {DATA_STALE_S+5}s "
                                 f"without exf_fetcher: got {exf_post}")
            self.assertLess(m3_post, m3_pre,
                            f"M3 did not drop after killing exf_fetcher "
                            f"(pre={m3_pre:.2f}, post={m3_post:.2f})")
        finally:
            _supervisorctl("start", prog)
            self._wait_for_state(prog, "RUNNING", 45)
            if mh_was_running:
                _supervisorctl("start", mh_prog)

    def test_no_systemd_units_active_for_managed_services(self):
        """
        Verify no conflicting systemd units are active for supervisord-managed services.
        This was the root cause of the original 'random killer' bug.
        """
        conflict_units = [
            "meta_health_daemon.service",
            "dolphin-nautilus-trader.service",
            "dolphin-scan-bridge.service",
            "dolphin-ng.service",
        ]
        for unit in conflict_units:
            r = subprocess.run(
                ["systemctl", "is-active", unit],
                capture_output=True, text=True,
            )
            state = r.stdout.strip()
            self.assertNotEqual(state, "active",
                                f"Conflicting systemd unit {unit} is still active! "
                                f"This will fight supervisord and kill services.")

    def test_no_duplicate_trader_processes(self):
        """
        Nautilus trader must run as exactly ONE process.
        Transient duplicates can appear briefly during supervisord autorestart
        races when adjacent kill/revive tests fire rapidly. We allow one retry
        after a 15s grace window — if duplicates persist that long, it's a real bug.
        """
        import psutil

        def _scan():
            # Collect live (non-zombie) processes whose cmdline mentions the trader.
            found = []
            for p in psutil.process_iter(["cmdline", "status"]):
                try:
                    if (p.info["cmdline"]
                            and any("nautilus_event_trader" in a
                                    for a in p.info["cmdline"])
                            and p.info["status"] not in ("zombie",)):
                        found.append(p)
                except (psutil.NoSuchProcess, psutil.AccessDenied):
                    continue
            return found

        time.sleep(8)  # let rapid-cooldown MHS restarts settle
        procs = _scan()
        if len(procs) > 1:
            # One retry — transient autorestart race should resolve within 15s
            time.sleep(15)
            procs = _scan()

        self.assertLessEqual(len(procs), 1,
                             f"Trader running {len(procs)} times after 23s grace! PIDs: "
                             f"{[p.pid for p in procs]} — "
                             f"possible systemd/supervisord dual-management regression")
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Regression / invariant tests
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
class TestServiceRegistry(unittest.TestCase):
    """Sanity checks on the SERVICES registry itself."""

    def test_all_critical_data_services_present(self):
        required = (
            "dolphin_data:exf_fetcher",
            "dolphin_data:acb_processor",
            "dolphin_data:obf_universe",
        )
        for name in required:
            self.assertIn(name, SERVICES, f"{name} missing from SERVICES registry")
            self.assertTrue(SERVICES[name]["critical_data"],
                            f"{name} should have critical_data=True")

    def test_trader_services_not_critical(self):
        # Trader-side programs must never be auto-restarted by MHS.
        for name in ("dolphin:nautilus_trader", "dolphin:scan_bridge"):
            if name not in SERVICES:
                continue
            self.assertFalse(SERVICES[name]["critical_data"],
                             f"{name} must NOT be critical_data (never auto-restart trader)")

    def test_hz_data_sources_match_expected_keys(self):
        self.assertEqual(
            set(HZ_DATA_SOURCES),
            {"exf_latest", "acb_boost", "latest_eigen_scan", "obf_universe"},
        )

    def test_exf_latest_ts_field(self):
        ts_field = HZ_DATA_SOURCES["exf_latest"][2]
        self.assertEqual(ts_field, "_pushed_at")

    def test_acb_boost_presence_only(self):
        self.assertIsNone(
            HZ_DATA_SOURCES["acb_boost"][2],
            "acb_boost has no reliable timestamp — should be presence-only",
        )

    def test_critical_cooldown_less_than_10s(self):
        self.assertLessEqual(RECOVERY_COOLDOWN_CRITICAL_S, 10.0)

    def test_check_interval_reasonable(self):
        self.assertLessEqual(CHECK_INTERVAL_S, 15.0,
                             "Check interval too slow — services may go undetected")
        self.assertGreaterEqual(CHECK_INTERVAL_S, 2.0,
                                "Check interval too aggressive — excessive HZ load")
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Race condition tests
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
class TestRaceConditions(unittest.TestCase):
    """
    Multi-threaded stress tests that expose cooldown and recovery races.
    All tests use mocked subprocess so no real services are touched.
    """

    def setUp(self):
        # Fresh service per test: cooldown timestamps must start empty.
        self.svc = _fresh_svc()

    def test_concurrent_restarts_same_service_only_one_fires(self):
        """
        10 threads all call _restart_via_supervisorctl simultaneously.
        Cooldown must guarantee exactly ONE subprocess.run call.
        """
        prog = "dolphin_data:exf_fetcher"
        call_count = [0]
        barrier = threading.Barrier(10)

        import meta_health_service_v3 as _mod

        def _counting_run(args, **kwargs):
            # Count only restart invocations; never touch the real supervisorctl.
            if isinstance(args, list) and "restart" in args:
                call_count[0] += 1
            return MagicMock(stdout="ok", returncode=0)

        with patch.object(_mod.subprocess, "run", side_effect=_counting_run):
            threads = []
            for _ in range(10):
                def _worker():
                    barrier.wait()  # all start at the same instant
                    self.svc._restart_via_supervisorctl(prog)
                t = threading.Thread(target=_worker, daemon=True)
                threads.append(t)
                t.start()
            for t in threads:
                t.join(timeout=5)
            time.sleep(1)  # let daemon threads flush

        self.assertEqual(call_count[0], 1,
                         f"Expected 1 restart, got {call_count[0]} — cooldown not thread-safe")

    def test_concurrent_restarts_different_services_all_fire(self):
        """
        Each service has its own cooldown bucket.
        3 services called simultaneously → 3 restarts.
        """
        progs = [
            "dolphin_data:exf_fetcher",
            "dolphin_data:acb_processor",
            "dolphin_data:obf_universe",
        ]
        fired = set()
        lock = threading.Lock()
        import meta_health_service_v3 as _mod

        def _counting_run(args, **kwargs):
            if isinstance(args, list) and "restart" in args:
                # The program name is the last element of the supervisorctl cmd
                with lock:
                    fired.add(args[-1])
            return MagicMock(stdout="ok", returncode=0)

        with patch.object(_mod.subprocess, "run", side_effect=_counting_run):
            threads = [
                threading.Thread(
                    target=self.svc._restart_via_supervisorctl,
                    args=(p,), daemon=True
                )
                for p in progs
            ]
            for t in threads:
                t.start()
            for t in threads:
                t.join(timeout=5)
            time.sleep(1)

        self.assertEqual(fired, set(progs),
                         f"Not all services fired a restart: fired={fired}")

    def test_cooldown_timestamp_set_before_thread_completes(self):
        """
        The cooldown timestamp must be committed BEFORE the daemon thread
        runs, so a second call arriving while the first thread is still
        executing is also blocked.
        """
        prog = "dolphin_data:exf_fetcher"
        slow_start = threading.Event()

        import meta_health_service_v3 as _mod

        def _slow_run(args, **kwargs):
            slow_start.set()
            time.sleep(2)  # simulate slow supervisorctl
            return MagicMock(stdout="ok", returncode=0)

        with patch.object(_mod.subprocess, "run", side_effect=_slow_run):
            self.svc._restart_via_supervisorctl(prog)  # fires async thread
            slow_start.wait(timeout=3)  # thread is in subprocess.run

            # Timestamp must already be set even though thread hasn't finished
            self.assertIn(prog, self.svc._recovery_timestamps,
                          "Timestamp not set before thread completed")

        call_count = [0]

        def _count(args, **kwargs):
            call_count[0] += 1
            return MagicMock(returncode=0)

        # Second call while thread still running — must be blocked
        with patch.object(_mod.subprocess, "run", side_effect=_count):
            self.svc._restart_via_supervisorctl(prog)
            time.sleep(0.5)
        self.assertEqual(call_count[0], 0,
                         "Second restart fired while first thread still running")

    def test_check_loop_not_blocked_by_slow_supervisorctl(self):
        """
        _restart_via_supervisorctl must return immediately even if
        supervisorctl hangs for 30s. The check loop (CHECK_INTERVAL_S=10s)
        must not be blocked.
        """
        import meta_health_service_v3 as _mod
        prog = "dolphin_data:acb_processor"
        thread_started = threading.Event()

        def _hanging_run(args, **kwargs):
            thread_started.set()
            time.sleep(30)  # simulate completely hung supervisorctl
            return MagicMock(returncode=0)

        with patch.object(_mod.subprocess, "run", side_effect=_hanging_run):
            t0 = time.time()
            self.svc._restart_via_supervisorctl(prog)
            elapsed = time.time() - t0

            self.assertLess(elapsed, 1.0,
                            f"_restart_via_supervisorctl blocked for {elapsed:.2f}s — must be async")
            thread_started.wait(timeout=3)  # daemon thread did start

    def test_recovery_timestamps_dict_survives_concurrent_writes(self):
        """
        Concurrent writes to _recovery_timestamps from multiple threads
        must not corrupt the dict (no KeyError, no lost entries).
        """
        progs = list(SERVICES.keys())
        import meta_health_service_v3 as _mod

        def _noop_run(args, **kwargs):
            return MagicMock(returncode=0)

        errors = []

        def _worker(p):
            try:
                with patch.object(_mod.subprocess, "run", side_effect=_noop_run):
                    for _ in range(20):
                        # backdate so every call is "allowed"
                        self.svc._recovery_timestamps.pop(p, None)
                        self.svc._restart_via_supervisorctl(p)
                        time.sleep(0.001)
            except Exception as e:
                errors.append(e)

        threads = [threading.Thread(target=_worker, args=(p,), daemon=True) for p in progs]
        for t in threads:
            t.start()
        for t in threads:
            t.join(timeout=10)
        time.sleep(0.5)

        self.assertEqual(errors, [], f"Concurrent dict access raised: {errors}")
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Edge case tests
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
class TestEdgeCases(unittest.TestCase):
    """Boundary values, malformed data, and unusual-but-valid states."""

    def setUp(self):
        # Fresh service per test so cooldown/client state never leaks between cases.
        self.svc = _fresh_svc()

    # ── _compute_rm_meta edge cases ──────────────────────────────────────────

    def test_rm_meta_exactly_at_green_threshold(self):
        """All sensors perfect → rm_meta exactly 1.0 and status GREEN."""
        rm, status = self.svc._compute_rm_meta(
            m4=1.0, m1_data=1.0, m3=1.0, m5=1.0
        )
        self.assertEqual(status, "GREEN")
        self.assertAlmostEqual(rm, 1.0, places=6)

    def test_rm_meta_exactly_at_degraded_boundary(self):
        # Find exact boundary value: GREEN iff rm >= 0.85
        # Set m1=0 gives: 0.35*m4 + 0.20*m3 + 0.10*m5 = 0.35+0.20+0.10 = 0.65 → CRITICAL
        rm, status = self.svc._compute_rm_meta(
            m4=1.0, m1_data=0.0, m3=1.0, m5=1.0
        )
        self.assertIn(status, ("CRITICAL", "DEGRADED"),
                      f"rm={rm:.3f} with m1_data=0 should not be GREEN")

    def test_rm_meta_all_sensors_zero(self):
        """Every sensor at zero → rm_meta 0.0 and status DEAD."""
        rm, status = self.svc._compute_rm_meta(
            m4=0.0, m1_data=0.0, m3=0.0, m5=0.0
        )
        self.assertAlmostEqual(rm, 0.0, places=6)
        self.assertEqual(status, "DEAD")

    def test_rm_meta_nan_sensor_does_not_propagate(self):
        """NaN in one sensor must not corrupt rm_meta (treat as 0)."""
        import math
        # Manually build the weighted sum as MHS does, but inject nan
        sensors = {"m4_control_plane": float("nan"), "m1_data_infra": 1.0,
                   "m3_data_freshness": 1.0, "m5_coherence": 1.0}
        rm = sum(
            (v if not math.isnan(v) else 0.0) * SENSOR_WEIGHTS[k]
            for k, v in sensors.items()
        )
        self.assertFalse(math.isnan(rm), "NaN propagated into rm_meta")
        self.assertGreater(rm, 0.0)

    def test_rm_meta_clamped_to_0_1(self):
        """rm_meta result should never exceed 1.0 or go below 0.0."""
        rm, _ = self.svc._compute_rm_meta(1.0, 1.0, 1.0, 1.0)
        self.assertLessEqual(rm, 1.0)
        self.assertGreaterEqual(rm, 0.0)

    # ── M3 freshness edge cases ──────────────────────────────────────────────

    def test_m3_timestamp_just_past_stale_boundary(self):
        """A key 1s past DATA_STALE_S should score 0.5 (stale, not fresh).
        The boundary is strict (>), so exactly DATA_STALE_S still scores 1.0."""
        svc = _fresh_svc()
        hz_mock = MagicMock()
        stale_ts = datetime.now(timezone.utc).timestamp() - (DATA_STALE_S + 1.0)
        hz_mock.get_map.return_value.blocking.return_value.get.return_value = (
            json.dumps({"_pushed_at": stale_ts})
        )
        svc._hz_client = hz_mock
        m3, details = svc._m3_data_freshness()
        exf_score = details.get("exf_latest", {}).get("score", 1.0)
        self.assertLess(exf_score, 1.0,
                        f"Score 1s past stale boundary should be < 1.0 (got {exf_score})")
        self.assertAlmostEqual(exf_score, 0.5, places=5,
                               msg=f"Stale key should score 0.5 (got {exf_score})")

    def test_m3_timestamp_in_the_future(self):
        """A timestamp 5s in the future (clock skew) must not crash and score > 0."""
        svc = _fresh_svc()
        hz_mock = MagicMock()
        future_ts = datetime.now(timezone.utc).timestamp() + 5.0
        hz_mock.get_map.return_value.blocking.return_value.get.return_value = (
            json.dumps({"_pushed_at": future_ts})
        )
        svc._hz_client = hz_mock
        m3, _ = svc._m3_data_freshness()
        self.assertGreater(m3, 0.0, "Future timestamp should not score 0")

    def test_m3_timestamp_iso_string_with_tz(self):
        # ISO-8601 timestamps with a timezone offset must also parse as fresh.
        svc = _fresh_svc()
        hz_mock = MagicMock()
        iso = datetime.now(timezone.utc).isoformat()
        hz_mock.get_map.return_value.blocking.return_value.get.return_value = (
            json.dumps({"timestamp": iso})
        )
        svc._hz_client = hz_mock
        m3, _ = svc._m3_data_freshness()
        self.assertGreater(m3, 0.5)

    def test_m3_garbage_json_in_hz_scores_zero_not_crash(self):
        # Corrupted HZ payloads must degrade the score, never raise.
        svc = _fresh_svc()
        hz_mock = MagicMock()
        hz_mock.get_map.return_value.blocking.return_value.get.return_value = (
            "NOT_VALID_JSON{{{"
        )
        svc._hz_client = hz_mock
        try:
            m3, _ = svc._m3_data_freshness()
            # Should not raise; score for corrupted key should be 0 or 0.5
            self.assertGreaterEqual(m3, 0.0)
        except Exception as exc:
            self.fail(f"_m3_data_freshness crashed on garbage JSON: {exc}")

    def test_m3_empty_dict_in_hz_scores_low(self):
        svc = _fresh_svc()
        hz_mock = MagicMock()
        hz_mock.get_map.return_value.blocking.return_value.get.return_value = "{}"
        svc._hz_client = hz_mock
        m3, details = svc._m3_data_freshness()
        # Missing ts field → presence-only logic kicks in
        self.assertGreaterEqual(m3, 0.0)

    # ── _attempt_recovery edge cases ─────────────────────────────────────────

    def test_attempt_recovery_ignores_non_critical_stopped(self):
        """trader/scan_bridge stopped must NOT trigger a restart."""
        import meta_health_service_v3 as _mod
        svc = _fresh_svc()
        call_log = []

        def _spy(p):
            call_log.append(p)

        with patch.object(svc, "_restart_via_supervisorctl", side_effect=_spy):
            svc._attempt_recovery(
                _mod.HealthReport(
                    rm_meta=0.1, status="DEAD",
                    m4_control_plane=0.0, m1_data_infra=0.0,
                    m1_trader=0.0, m2_heartbeat=0.0,
                    m3_data_freshness=0.0, m5_coherence=0.0,
                    service_status={
                        "dolphin:nautilus_trader": "STOPPED",
                        "dolphin:scan_bridge": "STOPPED",
                        # critical services still running
                        "dolphin_data:exf_fetcher": "RUNNING",
                        "dolphin_data:acb_processor": "RUNNING",
                        "dolphin_data:obf_universe": "RUNNING",
                    },
                    hz_key_status={}, timestamp=_now_iso(),
                )
            )

        self.assertEqual(call_log, [],
                         f"Non-critical stopped services triggered restart: {call_log}")

    def test_attempt_recovery_noop_when_degraded(self):
        """DEGRADED status must never trigger any restart."""
        import meta_health_service_v3 as _mod
        svc = _fresh_svc()
        call_log = []

        def _spy(p):
            call_log.append(p)

        with patch.object(svc, "_restart_via_supervisorctl", side_effect=_spy):
            svc._attempt_recovery(
                _mod.HealthReport(
                    rm_meta=0.70, status="DEGRADED",
                    m4_control_plane=0.8, m1_data_infra=0.67,
                    m1_trader=1.0, m2_heartbeat=1.0,
                    m3_data_freshness=0.9, m5_coherence=0.8,
                    service_status={p: "RUNNING" for p in SERVICES},
                    hz_key_status={}, timestamp=_now_iso(),
                )
            )

        self.assertEqual(call_log, [],
                         f"DEGRADED status should never trigger restart: {call_log}")

    def test_supervisorctl_parse_extra_whitespace(self):
        """Lines with extra spaces/tabs must parse correctly."""
        svc = _fresh_svc()
        output = (
            "dolphin_data:exf_fetcher RUNNING pid 12345, uptime 0:01:00\n"
            "dolphin_data:acb_processor\t\tSTOPPED\n"
            "dolphin:nautilus_trader FATAL Exited too quickly\n"
        )
        with patch("subprocess.run") as mock_run:
            mock_run.return_value = MagicMock(
                stdout=output, returncode=0
            )
            result = svc._check_supervisord_status()
        self.assertEqual(result.get("dolphin_data:exf_fetcher"), "RUNNING")
        self.assertEqual(result.get("dolphin_data:acb_processor"), "STOPPED")
        self.assertEqual(result.get("dolphin:nautilus_trader"), "FATAL")

    def test_supervisorctl_timeout_returns_empty_not_crash(self):
        import subprocess as sp
        svc = _fresh_svc()
        with patch("subprocess.run", side_effect=sp.TimeoutExpired(cmd=[], timeout=5)):
            result = svc._check_supervisord_status()
        self.assertEqual(result, {}, "TimeoutExpired must return empty dict")

    def test_cooldown_very_old_timestamp_allows_restart(self):
        """A timestamp from a week ago should not block recovery."""
        svc = _fresh_svc()
        prog = "dolphin_data:exf_fetcher"
        svc._recovery_timestamps[prog] = time.time() - 7 * 86400  # one week ago
        import meta_health_service_v3 as _mod
        fired = [False]

        def _noop(args, **kwargs):
            fired[0] = True
            return MagicMock(returncode=0)

        with patch.object(_mod.subprocess, "run", side_effect=_noop):
            svc._restart_via_supervisorctl(prog)
            time.sleep(0.5)
        self.assertTrue(fired[0], "Old timestamp should not block restart")

    def test_recovery_timestamps_initially_empty(self):
        svc = _fresh_svc()
        self.assertEqual(svc._recovery_timestamps, {},
                         "Fresh MHS should have no prior cooldown timestamps")

    def test_m4_hz_port_down_scores_at_most_0_2(self):
        """If HZ port is down, m4 ≤ 0.2 (only Prefect weight remains)."""
        svc = _fresh_svc()
        import socket as _sock
        # connect_ex returning non-zero means "connection refused" for every port.
        with patch.object(_sock.socket, "connect_ex", return_value=1):
            m4 = svc._m4_control_plane()
        self.assertLessEqual(m4, 0.2 + 1e-6,
                             f"HZ down should give m4 ≤ 0.2, got {m4:.3f}")
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Hypothesis-based property tests
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
from hypothesis import given, assume, settings as hyp_settings, HealthCheck
from hypothesis import strategies as st

# Strategy for a single health-sensor value: any finite float in [0.0, 1.0].
# NaN/inf excluded — the NaN path is covered separately in TestEdgeCases.
_sensor_st = st.floats(min_value=0.0, max_value=1.0, allow_nan=False, allow_infinity=False)
|
|
|
|
|
|
class TestHypothesisProperties(unittest.TestCase):
|
|
"""
|
|
Property-based tests using Hypothesis.
|
|
Verify invariants that must hold for ALL valid sensor combinations.
|
|
"""
|
|
|
|
    def _svc(self):
        # Fresh service instance per Hypothesis example — avoids state
        # (cooldown timestamps, cached HZ client) leaking between examples.
        return _fresh_svc()
|
|
|
@given(m4=_sensor_st, m1=_sensor_st, m3=_sensor_st, m5=_sensor_st)
|
|
@hyp_settings(max_examples=500, suppress_health_check=[HealthCheck.too_slow])
|
|
    def test_rm_meta_always_in_0_1(self, m4, m1, m3, m5):
        """rm_meta must always be in [0, 1] for any valid sensor inputs."""
        rm, _ = self._svc()._compute_rm_meta(m4, m1, m3, m5)
        # 1e-9 tolerance absorbs float rounding in the weighted sum.
        self.assertGreaterEqual(rm, 0.0 - 1e-9)
        self.assertLessEqual(rm, 1.0 + 1e-9)
|
|
|
|
@given(m4=_sensor_st, m1=_sensor_st, m3=_sensor_st, m5=_sensor_st)
|
|
@hyp_settings(max_examples=500, suppress_health_check=[HealthCheck.too_slow])
|
|
    def test_status_always_valid_string(self, m4, m1, m3, m5):
        """Status label must always be one of the four valid strings."""
        # Any other label would break downstream consumers keyed on these values.
        _, status = self._svc()._compute_rm_meta(m4, m1, m3, m5)
        self.assertIn(status, {"GREEN", "DEGRADED", "CRITICAL", "DEAD"})
|
|
|
|
@given(m4=_sensor_st, m1=_sensor_st, m3=_sensor_st, m5=_sensor_st)
|
|
@hyp_settings(max_examples=300, suppress_health_check=[HealthCheck.too_slow])
|
|
def test_rm_meta_monotone_in_each_sensor(self, m4, m1, m3, m5):
|
|
"""Increasing any sensor by 0.1 must not decrease rm_meta."""
|
|
svc = self._svc()
|
|
rm_base, _ = svc._compute_rm_meta(m4, m1, m3, m5)
|
|
delta = 0.1
|
|
rm_m4, _ = svc._compute_rm_meta(min(m4 + delta, 1.0), m1, m3, m5)
|
|
rm_m1, _ = svc._compute_rm_meta(m4, min(m1 + delta, 1.0), m3, m5)
|
|
rm_m3, _ = svc._compute_rm_meta(m4, m1, min(m3 + delta, 1.0), m5)
|
|
rm_m5, _ = svc._compute_rm_meta(m4, m1, m3, min(m5 + delta, 1.0))
|
|
self.assertGreaterEqual(rm_m4, rm_base - 1e-9)
|
|
self.assertGreaterEqual(rm_m1, rm_base - 1e-9)
|
|
self.assertGreaterEqual(rm_m3, rm_base - 1e-9)
|
|
self.assertGreaterEqual(rm_m5, rm_base - 1e-9)
|
|
|
|
@given(m4=_sensor_st, m1=_sensor_st, m3=_sensor_st, m5=_sensor_st)
|
|
@hyp_settings(max_examples=300, suppress_health_check=[HealthCheck.too_slow])
|
|
    def test_green_requires_both_heavy_sensors_high(self, m4, m1, m3, m5):
        """
        If BOTH of the two highest-weighted sensors (m4=0.35, m1=0.35) are
        below 0.5, the system cannot be GREEN.

        Math: max rm with m4<0.5, m1<0.5 is
        0.35*0.499 + 0.35*0.499 + 0.20*1.0 + 0.10*1.0 ≈ 0.649 < 0.85 (GREEN)
        So NOT GREEN is guaranteed.

        Note: one low sensor alone (e.g. m3=0.375, m4=m1=m5=1.0) can still be
        GREEN since the other sensors compensate — that is a correct system behaviour.
        """
        # assume() discards examples outside the precondition instead of failing.
        assume(m4 < 0.5 and m1 < 0.5)
        _, status = self._svc()._compute_rm_meta(m4, m1, m3, m5)
        self.assertNotEqual(status, "GREEN",
                            f"GREEN with both heavy sensors low: m4={m4:.3f} m1={m1:.3f}")
|
|
|
|
@given(age_s=st.floats(min_value=0.0, max_value=300.0, allow_nan=False))
|
|
@hyp_settings(max_examples=300, suppress_health_check=[HealthCheck.too_slow],
|
|
deadline=None)
|
|
    def test_freshness_score_decreases_with_age(self, age_s):
        """
        Freshness score must be non-increasing as age increases.
        Specifically: age < DATA_STALE_S → score=1.0,
        DATA_STALE_S < age < DATA_DEAD_S → score=0.5,
        age > DATA_DEAD_S → score=0.0.

        We exclude a ±1s margin around each boundary because the small gap
        between computing `ts` and the `time.time()` call inside the method
        creates timing jitter — values exactly at the boundary are unreliable.
        """
        # Skip the ±1s zone around each threshold to avoid timing jitter
        assume(abs(age_s - DATA_STALE_S) > 1.0)
        assume(abs(age_s - DATA_DEAD_S) > 1.0)

        svc = _fresh_svc()
        hz = MagicMock()
        now_ts = datetime.now(timezone.utc).timestamp()
        ts = now_ts - age_s
        # Mocked HZ call chain: get_map(...).blocking().get(...) → JSON payload
        hz.get_map.return_value.blocking.return_value.get.return_value = (
            json.dumps({"_pushed_at": ts})
        )
        svc._hz_client = hz
        _, details = svc._m3_data_freshness()
        score = details.get("exf_latest", {}).get("score", 0.0)

        # Boundary is strict: age > DATA_STALE_S → stale, age > DATA_DEAD_S → dead
        if age_s < DATA_STALE_S:
            self.assertAlmostEqual(score, 1.0, places=5,
                                   msg=f"Fresh key (age={age_s:.1f}s) should score 1.0, got {score}")
        elif age_s > DATA_DEAD_S:
            self.assertAlmostEqual(score, 0.0, places=5,
                                   msg=f"Dead key (age={age_s:.1f}s) should score 0.0, got {score}")
        else:
            self.assertGreater(score, 0.0,
                               msg=f"Stale key (age={age_s:.1f}s) should score > 0, got {score}")
            self.assertLess(score, 1.0,
                            msg=f"Stale key (age={age_s:.1f}s) should score < 1.0, got {score}")
|
|
|
|
@given(
|
|
statuses=st.dictionaries(
|
|
st.sampled_from(list(SERVICES.keys())),
|
|
st.sampled_from(["RUNNING", "STOPPED", "FATAL", "STARTING"]),
|
|
min_size=1,
|
|
)
|
|
)
|
|
@hyp_settings(max_examples=300, suppress_health_check=[HealthCheck.too_slow])
|
|
def test_m1_score_always_in_0_1(self, statuses):
|
|
"""M1 scores must always be in [0, 1] regardless of service states."""
|
|
svc = _fresh_svc()
|
|
with patch.object(svc, "_check_supervisord_status", return_value=statuses):
|
|
m1_data, m1_trader, _ = svc._m1_process_integrity()
|
|
self.assertGreaterEqual(m1_data, 0.0)
|
|
self.assertLessEqual(m1_data, 1.0)
|
|
self.assertGreaterEqual(m1_trader, 0.0)
|
|
self.assertLessEqual(m1_trader, 1.0)
|
|
|
|
@given(
|
|
statuses=st.dictionaries(
|
|
st.sampled_from(list(SERVICES.keys())),
|
|
st.just("RUNNING"),
|
|
min_size=len(SERVICES),
|
|
max_size=len(SERVICES),
|
|
)
|
|
)
|
|
@hyp_settings(max_examples=50, suppress_health_check=[HealthCheck.too_slow])
|
|
def test_all_running_always_scores_1(self, statuses):
|
|
"""All services RUNNING must always give m1_data=1.0 and m1_trader=1.0."""
|
|
svc = _fresh_svc()
|
|
with patch.object(svc, "_check_supervisord_status", return_value=statuses):
|
|
m1_data, m1_trader, _ = svc._m1_process_integrity()
|
|
self.assertAlmostEqual(m1_data, 1.0, places=6)
|
|
self.assertAlmostEqual(m1_trader, 1.0, places=6)
|
|
|
|
@given(n_healthy=st.integers(min_value=0, max_value=540))
|
|
@hyp_settings(max_examples=200, suppress_health_check=[HealthCheck.too_slow])
|
|
def test_m5_obf_score_monotone_in_healthy_count(self, n_healthy):
|
|
"""
|
|
M5 OBF coverage score must be non-decreasing with n_healthy_assets.
|
|
"""
|
|
svc_lo = _fresh_svc()
|
|
svc_hi = _fresh_svc()
|
|
n_hi = min(n_healthy + 10, 540)
|
|
|
|
hz = MagicMock()
|
|
def _make_hz_return(n):
|
|
return json.dumps({
|
|
"exf_latest": {"feature_count": 50},
|
|
"acb_boost": {"boost": 1.05},
|
|
"obf_universe_latest": {"n_healthy": n},
|
|
})
|
|
|
|
hz_lo = MagicMock()
|
|
hz_lo.get_map.return_value.blocking.return_value.get.side_effect = (
|
|
lambda k: _make_hz_return(n_healthy) if k == "obf_universe_latest"
|
|
else json.dumps({"boost": 1.05, "feature_count": 50})
|
|
)
|
|
hz_hi = MagicMock()
|
|
hz_hi.get_map.return_value.blocking.return_value.get.side_effect = (
|
|
lambda k: _make_hz_return(n_hi) if k == "obf_universe_latest"
|
|
else json.dumps({"boost": 1.05, "feature_count": 50})
|
|
)
|
|
|
|
svc_lo._hz_client = hz_lo
|
|
svc_hi._hz_client = hz_hi
|
|
m5_lo = svc_lo._m5_coherence()
|
|
m5_hi = svc_hi._m5_coherence()
|
|
self.assertLessEqual(m5_lo, m5_hi + 1e-9,
|
|
f"m5 should not decrease when n_healthy goes {n_healthy}→{n_hi}: "
|
|
f"m5_lo={m5_lo:.3f} m5_hi={m5_hi:.3f}")
|
|
|
|
@given(
|
|
cooldown=st.floats(min_value=0.1, max_value=RECOVERY_COOLDOWN_CRITICAL_S * 0.99),
|
|
)
|
|
@hyp_settings(max_examples=100, suppress_health_check=[HealthCheck.too_slow], deadline=None)
|
|
def test_restart_blocked_within_cooldown_window(self, cooldown):
|
|
"""
|
|
Any timestamp within the cooldown window must block a second restart.
|
|
"""
|
|
import meta_health_service_v3 as _mod
|
|
svc = _fresh_svc()
|
|
prog = "dolphin_data:exf_fetcher"
|
|
|
|
# Set timestamp so it's `cooldown` seconds in the past (within window)
|
|
svc._recovery_timestamps[prog] = time.time() - cooldown
|
|
|
|
fired = [False]
|
|
def _noop(args, **kwargs):
|
|
fired[0] = True
|
|
return MagicMock(returncode=0)
|
|
|
|
with patch.object(_mod.subprocess, "run", side_effect=_noop):
|
|
svc._restart_via_supervisorctl(prog)
|
|
time.sleep(0.2)
|
|
|
|
self.assertFalse(fired[0],
|
|
f"Restart fired with {cooldown:.2f}s elapsed (cooldown={RECOVERY_COOLDOWN_CRITICAL_S}s)")
|
|
|
|
@given(
|
|
elapsed=st.floats(
|
|
min_value=RECOVERY_COOLDOWN_CRITICAL_S + 0.1, max_value=3600.0,
|
|
allow_nan=False, allow_infinity=False
|
|
)
|
|
)
|
|
@hyp_settings(max_examples=100, suppress_health_check=[HealthCheck.too_slow], deadline=None)
|
|
def test_restart_allowed_after_cooldown_expires(self, elapsed):
|
|
"""Any timestamp beyond the cooldown window must allow a restart."""
|
|
import meta_health_service_v3 as _mod
|
|
svc = _fresh_svc()
|
|
prog = "dolphin_data:exf_fetcher"
|
|
svc._recovery_timestamps[prog] = time.time() - elapsed
|
|
|
|
fired = [False]
|
|
def _noop(args, **kwargs):
|
|
fired[0] = True
|
|
return MagicMock(returncode=0)
|
|
|
|
with patch.object(_mod.subprocess, "run", side_effect=_noop):
|
|
svc._restart_via_supervisorctl(prog)
|
|
time.sleep(0.5)
|
|
|
|
self.assertTrue(fired[0],
|
|
f"Restart blocked with {elapsed:.1f}s elapsed (cooldown={RECOVERY_COOLDOWN_CRITICAL_S}s)")
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Direct-execution fallback; pytest is the usual entry point (see module docstring).
    unittest.main(verbosity=2)
|