#!/usr/bin/env python3 """ Dolphin Supervisord Full Restart ================================= Performs a clean, full supervisord restart — the only way to fix broken stdout log pipes after individual process restarts. Usage: python3 prod/ops/supervisord_restart.py [--stop-only] [--start-only] What it does: 1. Snapshot current HZ state (capital, posture) for safety 2. Gracefully stop all supervised programs (SIGTERM → wait) 3. Shutdown supervisord itself 4. Wait for PID file to disappear (confirms clean exit) 5. Relaunch supervisord as daemon 6. Wait for all expected programs to reach RUNNING 7. Verify HZ state is intact (capital preserved) 8. Print final status report Safety: - Never touches HZ data, CH data, or trade logs - Verifies capital checkpoint survives restart - Aborts if supervisord doesn't come up within timeout """ import sys import os import time import subprocess import json import argparse from pathlib import Path # ── Config ──────────────────────────────────────────────────────────────────── SUPERVISORD_BIN = "/usr/local/bin/supervisord" SUPERVISORCTL = "/usr/local/bin/supervisorctl" # or same as above via args CONFIG = "/mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf" PIDFILE = "/mnt/dolphinng5_predict/prod/supervisor/run/supervisord.pid" LOGDIR = Path("/mnt/dolphinng5_predict/prod/supervisor/logs") # Programs that must reach RUNNING state before we declare success EXPECTED_RUNNING = [ "dolphin:nautilus_trader", "dolphin:scan_bridge", "dolphin_data:acb_processor", "dolphin_data:exf_fetcher", "dolphin_data:meta_health", "dolphin_data:obf_universe", "dolphin_data:system_stats", ] STOP_TIMEOUT_S = 30 # max seconds to wait for clean stop START_TIMEOUT_S = 60 # max seconds to wait for all programs RUNNING # ── Helpers ─────────────────────────────────────────────────────────────────── def log(msg: str): ts = time.strftime("%H:%M:%S") print(f"[{ts}] {msg}", flush=True) def ctl(*args) -> tuple[int, str, str]: """Run supervisorctl with our config. Returns (rc, stdout, stderr).""" cmd = [SUPERVISORCTL, "-c", CONFIG] + list(args) r = subprocess.run(cmd, capture_output=True, text=True) return r.returncode, r.stdout.strip(), r.stderr.strip() def supervisord_pid() -> int | None: try: pid = int(Path(PIDFILE).read_text().strip()) os.kill(pid, 0) # check alive return pid except (FileNotFoundError, ValueError, ProcessLookupError, PermissionError): return None def parse_status(output: str) -> dict[str, str]: """Parse supervisorctl status output → {name: state}.""" states = {} for line in output.splitlines(): parts = line.split() if len(parts) >= 2: states[parts[0]] = parts[1] return states def hz_capital() -> float | None: """Read capital_checkpoint from HZ. Returns None on any failure.""" try: sys.path.insert(0, "/mnt/dolphinng5_predict") import hazelcast hz = hazelcast.HazelcastClient( cluster_name="dolphin", cluster_members=["localhost:5701"], connection_timeout=3.0, ) raw = hz.get_map("DOLPHIN_STATE_BLUE").blocking().get("capital_checkpoint") hz.shutdown() return json.loads(raw)["capital"] if raw else None except Exception as e: return None # ── Main phases ─────────────────────────────────────────────────────────────── def phase_snapshot(): log("=== Phase 0: Pre-restart HZ snapshot ===") cap = hz_capital() if cap is not None: log(f" Capital checkpoint: ${cap:,.2f}") else: log(" WARNING: Could not read HZ capital (will verify post-start)") return cap def phase_stop(): log("=== Phase 1: Stopping all programs ===") pid = supervisord_pid() if pid is None: log(" Supervisord not running — nothing to stop") return # Stop all supervised programs gracefully log(f" supervisorctl stop all (supervisord PID={pid})") rc, out, err = ctl("stop", "all") log(f" {out or err or 'ok'}") # Wait for all to stop deadline = time.time() + STOP_TIMEOUT_S while time.time() < deadline: rc, out, _ = ctl("status") states = parse_status(out) running = [n for n, s in states.items() if s == "RUNNING"] if not running: break log(f" Still running: {running}") time.sleep(2) log("=== Phase 2: Shutting down supervisord ===") rc, out, err = ctl("shutdown") log(f" {out or err or 'ok'}") # Wait for PID to disappear deadline = time.time() + STOP_TIMEOUT_S while time.time() < deadline: if supervisord_pid() is None: break time.sleep(1) if supervisord_pid() is None: log(" Supervisord stopped cleanly.") else: log(" WARNING: Supervisord PID still alive — may need manual kill") def phase_start(): log("=== Phase 3: Launching supervisord ===") if supervisord_pid() is not None: log(f" Already running (PID={supervisord_pid()}) — skipping launch") return cmd = [SUPERVISORD_BIN, "-c", CONFIG] log(f" {' '.join(cmd)}") r = subprocess.run(cmd, capture_output=True, text=True) if r.returncode != 0: log(f" ERROR launching supervisord: {r.stderr}") sys.exit(1) # Wait for PID file deadline = time.time() + 10 while time.time() < deadline: if supervisord_pid() is not None: break time.sleep(0.5) pid = supervisord_pid() if pid: log(f" Supervisord started (PID={pid})") else: log(" ERROR: supervisord did not start") sys.exit(1) def phase_wait_running(): log(f"=== Phase 4: Waiting for programs to reach RUNNING (timeout={START_TIMEOUT_S}s) ===") deadline = time.time() + START_TIMEOUT_S last_states = {} while time.time() < deadline: rc, out, _ = ctl("status") states = parse_status(out) not_running = [n for n in EXPECTED_RUNNING if states.get(n) != "RUNNING"] if not not_running: log(" All expected programs RUNNING.") break if states != last_states: for name, state in sorted(states.items()): marker = "✓" if state == "RUNNING" else "⏳" if state in ("STARTING", "BACKOFF") else "✗" log(f" {marker} {name:<40} {state}") last_states = states time.sleep(3) else: log(" WARNING: Timeout waiting for programs. Final state:") rc, out, _ = ctl("status") print(out) def phase_verify(pre_capital: float | None): log("=== Phase 5: Post-restart verification ===") # Status rc, out, _ = ctl("status") states = parse_status(out) all_ok = True for name, state in sorted(states.items()): if name in ("dolphin:clean_arch_trader", "dolphin:paper_portfolio"): continue # expected STOPPED marker = "✓" if state == "RUNNING" else "✗" log(f" {marker} {name:<40} {state}") if state not in ("RUNNING", "STOPPED"): all_ok = False # HZ capital cap = hz_capital() if cap is not None: match = "✓" if (pre_capital is None or abs(cap - pre_capital) < 0.01) else "⚠ CHANGED" log(f" Capital: ${cap:,.2f} {match}") else: log(" WARNING: Could not verify HZ capital post-start") if all_ok: log("=== Restart COMPLETE — all services nominal ===") else: log("=== Restart done — some services need attention (see above) ===") # ── Entry point ─────────────────────────────────────────────────────────────── def main(): parser = argparse.ArgumentParser(description="Dolphin supervisord full restart") parser.add_argument("--stop-only", action="store_true", help="Only stop, don't relaunch") parser.add_argument("--start-only", action="store_true", help="Only start, don't stop first") args = parser.parse_args() log("Dolphin Supervisord Restart") log(f" Config : {CONFIG}") log(f" PID now: {supervisord_pid()}") pre_cap = phase_snapshot() if not args.start_only: phase_stop() if not args.stop_only: time.sleep(1) # brief pause before relaunch phase_start() phase_wait_running() phase_verify(pre_cap) if __name__ == "__main__": main()