# Source: DOLPHIN/prod/ops/supervisord_restart.py (258 lines, 8.9 KiB)
# NOTE(review): the lines above were file-listing UI residue ("Files", "Raw
# Normal View History", ...) that made the module unparseable; commented out.
#!/usr/bin/env python3
"""
Dolphin Supervisord Full Restart
=================================
Performs a clean, full supervisord restart — the only way to fix broken
stdout log pipes after individual process restarts.
Usage:
python3 prod/ops/supervisord_restart.py [--stop-only] [--start-only]
What it does:
1. Snapshot current HZ state (capital, posture) for safety
2. Gracefully stop all supervised programs (SIGTERM wait)
3. Shutdown supervisord itself
4. Wait for PID file to disappear (confirms clean exit)
5. Relaunch supervisord as daemon
6. Wait for all expected programs to reach RUNNING
7. Verify HZ state is intact (capital preserved)
8. Print final status report
Safety:
- Never touches HZ data, CH data, or trade logs
- Verifies capital checkpoint survives restart
- Aborts if supervisord doesn't come up within timeout
"""
import sys
import os
import time
import subprocess
import json
import argparse
from pathlib import Path
# ── Config ────────────────────────────────────────────────────────────────────
# Absolute paths to the supervisor binaries and this deployment's config,
# pidfile, and log directory.
SUPERVISORD_BIN: str = "/usr/local/bin/supervisord"
SUPERVISORCTL: str = "/usr/local/bin/supervisorctl" # or same as above via args
CONFIG: str = "/mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf"
PIDFILE: str = "/mnt/dolphinng5_predict/prod/supervisor/run/supervisord.pid"
LOGDIR: Path = Path("/mnt/dolphinng5_predict/prod/supervisor/logs")
# Programs that must reach RUNNING state before we declare success.
# Names are "group:program" exactly as printed by `supervisorctl status`.
EXPECTED_RUNNING: list[str] = [
    "dolphin:nautilus_trader",
    "dolphin:scan_bridge",
    "dolphin_data:acb_processor",
    "dolphin_data:exf_fetcher",
    "dolphin_data:meta_health",
    "dolphin_data:obf_universe",
    "dolphin_data:system_stats",
]
STOP_TIMEOUT_S: int = 30 # max seconds to wait for clean stop
START_TIMEOUT_S: int = 60 # max seconds to wait for all programs RUNNING
# ── Helpers ───────────────────────────────────────────────────────────────────
def log(msg: str):
    """Print *msg* prefixed with an HH:MM:SS timestamp, flushed immediately."""
    stamp = time.strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}", flush=True)
def ctl(*args) -> tuple[int, str, str]:
    """Run supervisorctl with our config. Returns (rc, stdout, stderr)."""
    proc = subprocess.run(
        [SUPERVISORCTL, "-c", CONFIG, *args],
        capture_output=True,
        text=True,
    )
    return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
def supervisord_pid() -> int | None:
    """Return the live supervisord PID from PIDFILE, or None.

    Reads the pidfile and probes the process with signal 0.

    Fix: the original caught PermissionError from ``os.kill`` and returned
    None — but EPERM means the process EXISTS (owned by another user), so
    that case must report the PID as alive. A PermissionError while reading
    the pidfile itself, however, still means "can't tell" → None.
    """
    try:
        pid = int(Path(PIDFILE).read_text().strip())
    except (FileNotFoundError, PermissionError, ValueError, OSError):
        # Missing/unreadable/garbled pidfile → treat as not running.
        return None
    try:
        os.kill(pid, 0)  # signal 0: existence probe, sends nothing
        return pid
    except PermissionError:
        return pid  # process exists but we lack permission to signal it
    except ProcessLookupError:
        return None  # stale pidfile: no such process
def parse_status(output: str) -> dict[str, str]:
    """Parse `supervisorctl status` output into {program_name: state}.

    Lines with fewer than two whitespace-separated fields are skipped;
    duplicate names keep the last occurrence.
    """
    return {
        fields[0]: fields[1]
        for fields in (row.split() for row in output.splitlines())
        if len(fields) >= 2
    }
def hz_capital() -> float | None:
    """Read capital_checkpoint from HZ. Returns None on any failure.

    Best-effort by design: this runs during restart windows when the
    cluster may legitimately be unreachable, so every error maps to None.

    Fixes: avoid inserting a duplicate sys.path entry on every call;
    always shut the client down (finally) even if the map read raises;
    drop the unused exception binding.
    """
    try:
        root = "/mnt/dolphinng5_predict"
        if root not in sys.path:
            sys.path.insert(0, root)
        import hazelcast  # project-local dependency; import deferred on purpose
        hz = hazelcast.HazelcastClient(
            cluster_name="dolphin",
            cluster_members=["localhost:5701"],
            connection_timeout=3.0,
        )
        try:
            raw = hz.get_map("DOLPHIN_STATE_BLUE").blocking().get("capital_checkpoint")
        finally:
            hz.shutdown()  # release the client even if the read failed
        return json.loads(raw)["capital"] if raw else None
    except Exception:
        return None
# ── Main phases ───────────────────────────────────────────────────────────────
def phase_snapshot():
    """Phase 0: record the pre-restart HZ capital checkpoint.

    Returns the capital as a float, or None if HZ was unreadable
    (verification then happens post-start only).
    """
    log("=== Phase 0: Pre-restart HZ snapshot ===")
    cap = hz_capital()
    if cap is None:
        log(" WARNING: Could not read HZ capital (will verify post-start)")
    else:
        log(f" Capital checkpoint: ${cap:,.2f}")
    return cap
def phase_stop():
    """Phases 1–2: stop all supervised programs, then supervisord itself.

    Both waits are bounded by STOP_TIMEOUT_S; a supervisord that refuses
    to die is only warned about, never force-killed.
    """
    log("=== Phase 1: Stopping all programs ===")
    pid = supervisord_pid()
    if pid is None:
        log(" Supervisord not running — nothing to stop")
        return
    # Ask supervisord to SIGTERM every program group.
    log(f" supervisorctl stop all (supervisord PID={pid})")
    rc, out, err = ctl("stop", "all")
    log(f" {out or err or 'ok'}")
    # Poll until nothing reports RUNNING (or we give up).
    deadline = time.time() + STOP_TIMEOUT_S
    while time.time() < deadline:
        _, status_out, _ = ctl("status")
        still_up = [name for name, st in parse_status(status_out).items() if st == "RUNNING"]
        if not still_up:
            break
        log(f" Still running: {still_up}")
        time.sleep(2)
    log("=== Phase 2: Shutting down supervisord ===")
    rc, out, err = ctl("shutdown")
    log(f" {out or err or 'ok'}")
    # The pidfile vanishing (or the PID dying) confirms a clean exit.
    deadline = time.time() + STOP_TIMEOUT_S
    while supervisord_pid() is not None and time.time() < deadline:
        time.sleep(1)
    if supervisord_pid() is None:
        log(" Supervisord stopped cleanly.")
    else:
        log(" WARNING: Supervisord PID still alive — may need manual kill")
def phase_start():
    """Phase 3: relaunch supervisord as a daemon and wait for its pidfile.

    Exits the script with status 1 if the launch command fails or the
    PID never appears within 10 seconds.
    """
    log("=== Phase 3: Launching supervisord ===")
    if supervisord_pid() is not None:
        log(f" Already running (PID={supervisord_pid()}) — skipping launch")
        return
    cmd = [SUPERVISORD_BIN, "-c", CONFIG]
    log(f" {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        log(f" ERROR launching supervisord: {result.stderr}")
        sys.exit(1)
    # supervisord daemonizes; poll briefly for the pidfile to appear.
    deadline = time.time() + 10
    while supervisord_pid() is None and time.time() < deadline:
        time.sleep(0.5)
    pid = supervisord_pid()
    if not pid:
        log(" ERROR: supervisord did not start")
        sys.exit(1)
    log(f" Supervisord started (PID={pid})")
def phase_wait_running():
    """Phase 4: poll until every EXPECTED_RUNNING program reports RUNNING.

    Logs a marked status table whenever the observed states change. On
    timeout (START_TIMEOUT_S) it dumps the raw `supervisorctl status`
    output and returns — the caller decides what to do next.
    """
    log(f"=== Phase 4: Waiting for programs to reach RUNNING (timeout={START_TIMEOUT_S}s) ===")
    deadline = time.time() + START_TIMEOUT_S
    last_states = {}
    while time.time() < deadline:
        rc, out, _ = ctl("status")
        states = parse_status(out)
        not_running = [n for n in EXPECTED_RUNNING if states.get(n) != "RUNNING"]
        if not not_running:
            log(" All expected programs RUNNING.")
            break
        if states != last_states:
            for name, state in sorted(states.items()):
                # NOTE(review): the original per-state marker glyphs were lost
                # (all three branches had degenerated to ""); restored as
                # ✓ running / … starting / ✗ failed — confirm against git history.
                marker = "✓" if state == "RUNNING" else "…" if state in ("STARTING", "BACKOFF") else "✗"
                log(f" {marker} {name:<40} {state}")
            last_states = states
        time.sleep(3)
    else:
        # while/else: runs only when the loop exhausts without `break` (timeout).
        log(" WARNING: Timeout waiting for programs. Final state:")
        rc, out, _ = ctl("status")
        print(out)
def phase_verify(pre_capital: float | None):
    """Phase 5: final report — program states plus HZ capital comparison.

    pre_capital: the capital read in phase 0, or None if it was
    unreadable then (in which case any post-start value is accepted).
    """
    log("=== Phase 5: Post-restart verification ===")
    # Status table: everything must be RUNNING (or deliberately STOPPED).
    rc, out, _ = ctl("status")
    states = parse_status(out)
    all_ok = True
    for name, state in sorted(states.items()):
        if name in ("dolphin:clean_arch_trader", "dolphin:paper_portfolio"):
            continue # expected STOPPED
        # NOTE(review): ok/fail marker glyphs were lost in the original
        # (both branches had degenerated to ""); restored as ✓ / ✗ —
        # confirm against git history.
        marker = "✓" if state == "RUNNING" else "✗"
        log(f" {marker} {name:<40} {state}")
        if state not in ("RUNNING", "STOPPED"):
            all_ok = False
    # HZ capital: flag a change only when both readings exist and differ
    # by a cent or more.
    cap = hz_capital()
    if cap is not None:
        match = "✓" if (pre_capital is None or abs(cap - pre_capital) < 0.01) else "⚠ CHANGED"
        log(f" Capital: ${cap:,.2f} {match}")
    else:
        log(" WARNING: Could not verify HZ capital post-start")
    if all_ok:
        log("=== Restart COMPLETE — all services nominal ===")
    else:
        log("=== Restart done — some services need attention (see above) ===")
# ── Entry point ───────────────────────────────────────────────────────────────
def main():
    """CLI entry point: parse flags, then run the restart phases in order."""
    parser = argparse.ArgumentParser(description="Dolphin supervisord full restart")
    parser.add_argument("--stop-only", action="store_true", help="Only stop, don't relaunch")
    parser.add_argument("--start-only", action="store_true", help="Only start, don't stop first")
    args = parser.parse_args()

    log("Dolphin Supervisord Restart")
    log(f" Config : {CONFIG}")
    log(f" PID now: {supervisord_pid()}")

    pre_cap = phase_snapshot()
    if not args.start_only:
        phase_stop()
    if args.stop_only:
        return
    time.sleep(1) # brief pause before relaunch
    phase_start()
    phase_wait_running()
    phase_verify(pre_cap)


if __name__ == "__main__":
    main()