Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
258 lines · 8.9 KiB · Python · Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Dolphin Supervisord Full Restart
|
|
=================================
|
|
Performs a clean, full supervisord restart — the only way to fix broken
|
|
stdout log pipes after individual process restarts.
|
|
|
|
Usage:
|
|
python3 prod/ops/supervisord_restart.py [--stop-only] [--start-only]
|
|
|
|
What it does:
|
|
1. Snapshot current HZ state (capital, posture) for safety
|
|
2. Gracefully stop all supervised programs (SIGTERM → wait)
|
|
3. Shutdown supervisord itself
|
|
4. Wait for PID file to disappear (confirms clean exit)
|
|
5. Relaunch supervisord as daemon
|
|
6. Wait for all expected programs to reach RUNNING
|
|
7. Verify HZ state is intact (capital preserved)
|
|
8. Print final status report
|
|
|
|
Safety:
|
|
- Never touches HZ data, CH data, or trade logs
|
|
- Verifies capital checkpoint survives restart
|
|
- Aborts if supervisord doesn't come up within timeout
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
import time
|
|
import subprocess
|
|
import json
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
# ── Config ────────────────────────────────────────────────────────────────────

# Absolute paths to the supervisor binaries and this deployment's config tree.
SUPERVISORD_BIN = "/usr/local/bin/supervisord"
SUPERVISORCTL = "/usr/local/bin/supervisorctl"  # or same as above via args
CONFIG = "/mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf"
# PID file is written on daemon start and removed on clean shutdown — its
# absence is how phase_stop confirms a clean exit.
PIDFILE = "/mnt/dolphinng5_predict/prod/supervisor/run/supervisord.pid"
LOGDIR = Path("/mnt/dolphinng5_predict/prod/supervisor/logs")

# Programs that must reach RUNNING state before we declare success.
# Names are "<group>:<program>" exactly as reported by `supervisorctl status`.
EXPECTED_RUNNING = [
    "dolphin:nautilus_trader",
    "dolphin:scan_bridge",
    "dolphin_data:acb_processor",
    "dolphin_data:exf_fetcher",
    "dolphin_data:meta_health",
    "dolphin_data:obf_universe",
    "dolphin_data:system_stats",
]

STOP_TIMEOUT_S = 30   # max seconds to wait for clean stop
START_TIMEOUT_S = 60  # max seconds to wait for all programs RUNNING
|
|
# ── Helpers ───────────────────────────────────────────────────────────────────
|
|
|
|
def log(msg: str):
    """Emit *msg* to stdout with a HH:MM:SS prefix, flushing immediately."""
    print(f"[{time.strftime('%H:%M:%S')}] {msg}", flush=True)
|
def ctl(*args) -> tuple[int, str, str]:
    """Invoke supervisorctl against our config. Returns (rc, stdout, stderr)."""
    proc = subprocess.run(
        [SUPERVISORCTL, "-c", CONFIG, *args],
        capture_output=True,
        text=True,
    )
    return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
|
def supervisord_pid() -> int | None:
|
|
try:
|
|
pid = int(Path(PIDFILE).read_text().strip())
|
|
os.kill(pid, 0) # check alive
|
|
return pid
|
|
except (FileNotFoundError, ValueError, ProcessLookupError, PermissionError):
|
|
return None
|
|
|
|
def parse_status(output: str) -> dict[str, str]:
    """Turn `supervisorctl status` text into a {program_name: state} mapping.

    Lines with fewer than two whitespace-separated fields (blank lines,
    stray text) are ignored; anything after the state column is discarded.
    """
    rows = (line.split() for line in output.splitlines())
    return {cols[0]: cols[1] for cols in rows if len(cols) >= 2}
|
def hz_capital() -> float | None:
|
|
"""Read capital_checkpoint from HZ. Returns None on any failure."""
|
|
try:
|
|
sys.path.insert(0, "/mnt/dolphinng5_predict")
|
|
import hazelcast
|
|
hz = hazelcast.HazelcastClient(
|
|
cluster_name="dolphin",
|
|
cluster_members=["localhost:5701"],
|
|
connection_timeout=3.0,
|
|
)
|
|
raw = hz.get_map("DOLPHIN_STATE_BLUE").blocking().get("capital_checkpoint")
|
|
hz.shutdown()
|
|
return json.loads(raw)["capital"] if raw else None
|
|
except Exception as e:
|
|
return None
|
|
|
|
# ── Main phases ───────────────────────────────────────────────────────────────
|
|
|
|
def phase_snapshot():
    """Phase 0: record the HZ capital checkpoint before touching anything.

    Returns the capital value (or None if unreadable) so phase_verify can
    confirm it survived the restart.
    """
    log("=== Phase 0: Pre-restart HZ snapshot ===")
    capital = hz_capital()
    if capital is None:
        log(" WARNING: Could not read HZ capital (will verify post-start)")
    else:
        log(f" Capital checkpoint: ${capital:,.2f}")
    return capital
|
def phase_stop():
    """Phases 1–2: stop all supervised programs, then supervisord itself.

    No-op when supervisord is not running.  Each wait is bounded by
    STOP_TIMEOUT_S; on timeout we log a warning and continue rather than
    abort — phase_start will refuse to double-launch a live daemon anyway.
    """
    log("=== Phase 1: Stopping all programs ===")
    pid = supervisord_pid()
    if pid is None:
        log(" Supervisord not running — nothing to stop")
        return

    # Stop all supervised programs gracefully (supervisord sends SIGTERM).
    log(f" supervisorctl stop all (supervisord PID={pid})")
    rc, out, err = ctl("stop", "all")
    log(f" {out or err or 'ok'}")

    # Poll until no program reports RUNNING, or give up after STOP_TIMEOUT_S.
    deadline = time.time() + STOP_TIMEOUT_S
    while time.time() < deadline:
        rc, out, _ = ctl("status")
        states = parse_status(out)
        running = [n for n, s in states.items() if s == "RUNNING"]
        if not running:
            break
        log(f" Still running: {running}")
        time.sleep(2)

    log("=== Phase 2: Shutting down supervisord ===")
    rc, out, err = ctl("shutdown")
    log(f" {out or err or 'ok'}")

    # Wait for the PID file to disappear — supervisord removes it on clean
    # exit, so its absence confirms the daemon is fully down.
    deadline = time.time() + STOP_TIMEOUT_S
    while time.time() < deadline:
        if supervisord_pid() is None:
            break
        time.sleep(1)

    if supervisord_pid() is None:
        log(" Supervisord stopped cleanly.")
    else:
        log(" WARNING: Supervisord PID still alive — may need manual kill")
|
def phase_start():
    """Phase 3: launch supervisord as a daemon and wait for its PID file.

    Skips the launch when a live supervisord is already found.  Exits the
    whole script with status 1 if the launch command fails or the PID file
    never appears within 10 seconds.
    """
    log("=== Phase 3: Launching supervisord ===")
    if supervisord_pid() is not None:
        log(f" Already running (PID={supervisord_pid()}) — skipping launch")
        return

    cmd = [SUPERVISORD_BIN, "-c", CONFIG]
    log(f" {' '.join(cmd)}")
    r = subprocess.run(cmd, capture_output=True, text=True)
    if r.returncode != 0:
        log(f" ERROR launching supervisord: {r.stderr}")
        sys.exit(1)

    # Wait for the PID file: supervisord daemonizes, so the launch command
    # returns before the daemon has written its PID.
    deadline = time.time() + 10
    while time.time() < deadline:
        if supervisord_pid() is not None:
            break
        time.sleep(0.5)

    pid = supervisord_pid()
    if pid:
        log(f" Supervisord started (PID={pid})")
    else:
        log(" ERROR: supervisord did not start")
        sys.exit(1)
|
def phase_wait_running():
    """Phase 4: poll until every EXPECTED_RUNNING program reports RUNNING.

    Re-prints the per-program status table only when the set of states
    changes.  On timeout (START_TIMEOUT_S) it logs a warning with the raw
    status and returns normally — phase_verify will flag anything still
    unhealthy.
    """
    log(f"=== Phase 4: Waiting for programs to reach RUNNING (timeout={START_TIMEOUT_S}s) ===")
    deadline = time.time() + START_TIMEOUT_S
    last_states = {}

    while time.time() < deadline:
        rc, out, _ = ctl("status")
        states = parse_status(out)

        not_running = [n for n in EXPECTED_RUNNING if states.get(n) != "RUNNING"]
        if not not_running:
            log(" All expected programs RUNNING.")
            break

        # Only re-print the table when something actually changed,
        # to keep the log readable during long startups.
        if states != last_states:
            for name, state in sorted(states.items()):
                marker = "✓" if state == "RUNNING" else "⏳" if state in ("STARTING", "BACKOFF") else "✗"
                log(f" {marker} {name:<40} {state}")
            last_states = states

        time.sleep(3)
    else:
        # while/else: this branch runs only when the loop expired WITHOUT
        # hitting `break`, i.e. the timeout was reached.
        log(" WARNING: Timeout waiting for programs. Final state:")
        rc, out, _ = ctl("status")
        print(out)
|
def phase_verify(pre_capital: float | None):
    """Phase 5: final health report — program states plus HZ capital check.

    Args:
        pre_capital: capital read before the restart (None if it could not
            be read); used to confirm the checkpoint survived intact.

    Logs a ✓/✗ table of program states, compares the post-restart capital
    against *pre_capital*, and prints an overall verdict.
    """
    log("=== Phase 5: Post-restart verification ===")

    # Program states
    rc, out, _ = ctl("status")
    states = parse_status(out)
    all_ok = True
    for name, state in sorted(states.items()):
        if name in ("dolphin:clean_arch_trader", "dolphin:paper_portfolio"):
            continue  # expected STOPPED
        marker = "✓" if state == "RUNNING" else "✗"
        log(f" {marker} {name:<40} {state}")
        if state not in ("RUNNING", "STOPPED"):
            all_ok = False
        elif state == "STOPPED" and name in EXPECTED_RUNNING:
            # Fix: STOPPED is fine for optional programs, but an
            # EXPECTED_RUNNING program sitting in STOPPED means the restart
            # failed to bring it back — previously this still printed
            # "all services nominal" despite the ✗ marker above.
            all_ok = False

    # HZ capital
    cap = hz_capital()
    if cap is not None:
        match = "✓" if (pre_capital is None or abs(cap - pre_capital) < 0.01) else "⚠ CHANGED"
        log(f" Capital: ${cap:,.2f} {match}")
    else:
        log(" WARNING: Could not verify HZ capital post-start")

    if all_ok:
        log("=== Restart COMPLETE — all services nominal ===")
    else:
        log("=== Restart done — some services need attention (see above) ===")
|
# ── Entry point ───────────────────────────────────────────────────────────────
|
|
|
|
def main():
    """CLI entry point: parse flags, then run the restart phases in order."""
    ap = argparse.ArgumentParser(description="Dolphin supervisord full restart")
    ap.add_argument("--stop-only", action="store_true", help="Only stop, don't relaunch")
    ap.add_argument("--start-only", action="store_true", help="Only start, don't stop first")
    opts = ap.parse_args()

    log("Dolphin Supervisord Restart")
    log(f" Config : {CONFIG}")
    log(f" PID now: {supervisord_pid()}")

    pre_cap = phase_snapshot()

    if not opts.start_only:
        phase_stop()

    # Guard clause instead of wrapping the remainder in an if-block.
    if opts.stop_only:
        return

    time.sleep(1)  # brief pause before relaunch
    phase_start()
    phase_wait_running()
    phase_verify(pre_cap)
|
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()