initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
257
prod/ops/supervisord_restart.py
Executable file
257
prod/ops/supervisord_restart.py
Executable file
@@ -0,0 +1,257 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dolphin Supervisord Full Restart
|
||||
=================================
|
||||
Performs a clean, full supervisord restart — the only way to fix broken
|
||||
stdout log pipes after individual process restarts.
|
||||
|
||||
Usage:
|
||||
python3 prod/ops/supervisord_restart.py [--stop-only] [--start-only]
|
||||
|
||||
What it does:
|
||||
1. Snapshot current HZ state (capital, posture) for safety
|
||||
2. Gracefully stop all supervised programs (SIGTERM → wait)
|
||||
3. Shutdown supervisord itself
|
||||
4. Wait for PID file to disappear (confirms clean exit)
|
||||
5. Relaunch supervisord as daemon
|
||||
6. Wait for all expected programs to reach RUNNING
|
||||
7. Verify HZ state is intact (capital preserved)
|
||||
8. Print final status report
|
||||
|
||||
Safety:
|
||||
- Never touches HZ data, CH data, or trade logs
|
||||
- Verifies capital checkpoint survives restart
|
||||
- Aborts if supervisord doesn't come up within timeout
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import subprocess
|
||||
import json
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────────

# Absolute paths to the supervisor binaries and this deployment's
# config / PID-file locations.
SUPERVISORD_BIN = "/usr/local/bin/supervisord"
SUPERVISORCTL = "/usr/local/bin/supervisorctl" # or same as above via args
CONFIG = "/mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf"
PIDFILE = "/mnt/dolphinng5_predict/prod/supervisor/run/supervisord.pid"
# NOTE(review): LOGDIR is not referenced anywhere in this script —
# presumably kept for operator convenience; confirm before removing.
LOGDIR = Path("/mnt/dolphinng5_predict/prod/supervisor/logs")

# Programs that must reach RUNNING state before we declare success
EXPECTED_RUNNING = [
    "dolphin:nautilus_trader",
    "dolphin:scan_bridge",
    "dolphin_data:acb_processor",
    "dolphin_data:exf_fetcher",
    "dolphin_data:meta_health",
    "dolphin_data:obf_universe",
    "dolphin_data:system_stats",
]

# Poll-loop timeouts, in seconds.
STOP_TIMEOUT_S = 30 # max seconds to wait for clean stop
START_TIMEOUT_S = 60 # max seconds to wait for all programs RUNNING
|
||||
|
||||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def log(msg: str):
    """Print *msg* prefixed with an HH:MM:SS timestamp, flushing immediately."""
    stamp = time.strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}", flush=True)
|
||||
|
||||
def ctl(*args) -> tuple[int, str, str]:
    """Run supervisorctl with our config. Returns (rc, stdout, stderr)."""
    proc = subprocess.run(
        [SUPERVISORCTL, "-c", CONFIG, *args],
        capture_output=True,
        text=True,
    )
    return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
|
||||
|
||||
def supervisord_pid() -> int | None:
    """Return the live supervisord PID from PIDFILE, or None if not running.

    Sends signal 0 as a liveness probe (no effect on the target process).
    Any failure — missing/garbled PID file, dead process, or permission
    denied on the probe — is treated as "not running".
    """
    try:
        candidate = int(Path(PIDFILE).read_text().strip())
        os.kill(candidate, 0)  # signal 0: existence check only
    except (FileNotFoundError, ValueError, ProcessLookupError, PermissionError):
        return None
    return candidate
|
||||
|
||||
def parse_status(output: str) -> dict[str, str]:
    """Parse supervisorctl status output → {name: state}.

    Lines with fewer than two whitespace-separated fields are ignored.
    """
    return {
        fields[0]: fields[1]
        for fields in (line.split() for line in output.splitlines())
        if len(fields) >= 2
    }
|
||||
|
||||
def hz_capital() -> float | None:
    """Read capital_checkpoint from HZ. Returns None on any failure.

    Connects briefly to the local Hazelcast cluster, reads the
    ``capital_checkpoint`` entry from the DOLPHIN_STATE_BLUE map, and
    returns its ``capital`` field. Deliberately best-effort: any failure
    (client unavailable, timeout, missing/garbled entry) yields None,
    which callers treat as "capital unknown".
    """
    try:
        # Make the project tree importable, without stacking duplicate
        # sys.path entries on repeated calls (this runs pre- and post-restart).
        project_root = "/mnt/dolphinng5_predict"
        if project_root not in sys.path:
            sys.path.insert(0, project_root)
        import hazelcast

        hz = hazelcast.HazelcastClient(
            cluster_name="dolphin",
            cluster_members=["localhost:5701"],
            connection_timeout=3.0,
        )
        try:
            raw = hz.get_map("DOLPHIN_STATE_BLUE").blocking().get("capital_checkpoint")
        finally:
            # Always release the client, even if the map read raises —
            # otherwise a failed read leaks a live HZ connection.
            hz.shutdown()
        return json.loads(raw)["capital"] if raw else None
    except Exception:
        return None
|
||||
|
||||
# ── Main phases ───────────────────────────────────────────────────────────────
|
||||
|
||||
def phase_snapshot():
    """Phase 0: record the HZ capital checkpoint before touching anything.

    Returns the capital (float) or None if it could not be read; the value
    is later handed to phase_verify for a before/after comparison.
    """
    log("=== Phase 0: Pre-restart HZ snapshot ===")
    capital = hz_capital()
    if capital is None:
        log(" WARNING: Could not read HZ capital (will verify post-start)")
    else:
        log(f" Capital checkpoint: ${capital:,.2f}")
    return capital
|
||||
|
||||
def phase_stop():
    """Phases 1–2: gracefully stop all programs, then shut supervisord down.

    No-op when supervisord isn't running. Both wait loops are bounded by
    STOP_TIMEOUT_S; a lingering PID after shutdown is logged, not fatal.
    """
    log("=== Phase 1: Stopping all programs ===")
    pid = supervisord_pid()
    if pid is None:
        log(" Supervisord not running — nothing to stop")
        return

    # Ask supervisord to SIGTERM every supervised program.
    log(f" supervisorctl stop all (supervisord PID={pid})")
    rc, out, err = ctl("stop", "all")
    log(f" {out or err or 'ok'}")

    # Poll until nothing reports RUNNING (or the stop timeout elapses).
    stop_deadline = time.time() + STOP_TIMEOUT_S
    while time.time() < stop_deadline:
        _, status_out, _ = ctl("status")
        still_up = [
            name
            for name, state in parse_status(status_out).items()
            if state == "RUNNING"
        ]
        if not still_up:
            break
        log(f" Still running: {still_up}")
        time.sleep(2)

    log("=== Phase 2: Shutting down supervisord ===")
    rc, out, err = ctl("shutdown")
    log(f" {out or err or 'ok'}")

    # A vanished PID confirms a clean supervisord exit.
    pid_deadline = time.time() + STOP_TIMEOUT_S
    while time.time() < pid_deadline:
        if supervisord_pid() is None:
            break
        time.sleep(1)

    if supervisord_pid() is None:
        log(" Supervisord stopped cleanly.")
    else:
        log(" WARNING: Supervisord PID still alive — may need manual kill")
|
||||
|
||||
def phase_start():
    """Phase 3: launch supervisord as a daemon and wait for its PID file.

    Skips the launch if supervisord is already up. Exits the process with
    status 1 if the launch command fails or no PID appears within 10s.
    """
    log("=== Phase 3: Launching supervisord ===")
    if supervisord_pid() is not None:
        log(f" Already running (PID={supervisord_pid()}) — skipping launch")
        return

    launch_cmd = [SUPERVISORD_BIN, "-c", CONFIG]
    log(f" {' '.join(launch_cmd)}")
    result = subprocess.run(launch_cmd, capture_output=True, text=True)
    if result.returncode != 0:
        log(f" ERROR launching supervisord: {result.stderr}")
        sys.exit(1)

    # supervisord daemonizes; give it up to 10s to write its PID file.
    deadline = time.time() + 10
    while time.time() < deadline:
        if supervisord_pid() is not None:
            break
        time.sleep(0.5)

    pid = supervisord_pid()
    if not pid:
        log(" ERROR: supervisord did not start")
        sys.exit(1)
    log(f" Supervisord started (PID={pid})")
|
||||
|
||||
def phase_wait_running():
    """Phase 4: poll until every EXPECTED_RUNNING program reports RUNNING.

    Re-prints the status table only when it changes, to keep logs compact.
    On timeout (START_TIMEOUT_S) logs a warning plus the raw final status.
    """
    log(f"=== Phase 4: Waiting for programs to reach RUNNING (timeout={START_TIMEOUT_S}s) ===")
    deadline = time.time() + START_TIMEOUT_S
    previous = {}

    while time.time() < deadline:
        _, out, _ = ctl("status")
        current = parse_status(out)

        pending = [name for name in EXPECTED_RUNNING if current.get(name) != "RUNNING"]
        if not pending:
            log(" All expected programs RUNNING.")
            break

        if current != previous:
            for prog, state in sorted(current.items()):
                if state == "RUNNING":
                    mark = "✓"
                elif state in ("STARTING", "BACKOFF"):
                    mark = "⏳"
                else:
                    mark = "✗"
                log(f" {mark} {prog:<40} {state}")
            previous = current

        time.sleep(3)
    else:
        # while/else: we ran the clock out without ever breaking.
        log(" WARNING: Timeout waiting for programs. Final state:")
        _, out, _ = ctl("status")
        print(out)
|
||||
|
||||
def phase_verify(pre_capital: float | None):
    """Phase 5: report final program states and confirm HZ capital survived.

    RUNNING and STOPPED are acceptable states; anything else (FATAL,
    BACKOFF, ...) flips the overall verdict. Capital is compared to the
    pre-restart snapshot to the cent when both readings are available.
    """
    log("=== Phase 5: Post-restart verification ===")

    # Program states.
    _, out, _ = ctl("status")
    healthy = True
    for prog, state in sorted(parse_status(out).items()):
        if prog in ("dolphin:clean_arch_trader", "dolphin:paper_portfolio"):
            continue # expected STOPPED
        mark = "✓" if state == "RUNNING" else "✗"
        log(f" {mark} {prog:<40} {state}")
        if state not in ("RUNNING", "STOPPED"):
            healthy = False

    # HZ capital.
    capital = hz_capital()
    if capital is None:
        log(" WARNING: Could not verify HZ capital post-start")
    else:
        unchanged = pre_capital is None or abs(capital - pre_capital) < 0.01
        match = "✓" if unchanged else "⚠ CHANGED"
        log(f" Capital: ${capital:,.2f} {match}")

    if healthy:
        log("=== Restart COMPLETE — all services nominal ===")
    else:
        log("=== Restart done — some services need attention (see above) ===")
|
||||
|
||||
# ── Entry point ───────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """CLI entry point: parse flags, then run the restart phases in order.

    --stop-only  skips the relaunch/wait/verify phases.
    --start-only skips the stop phase (launch into whatever is running).
    """
    parser = argparse.ArgumentParser(description="Dolphin supervisord full restart")
    parser.add_argument("--stop-only", action="store_true", help="Only stop, don't relaunch")
    parser.add_argument("--start-only", action="store_true", help="Only start, don't stop first")
    opts = parser.parse_args()

    log("Dolphin Supervisord Restart")
    log(f" Config : {CONFIG}")
    log(f" PID now: {supervisord_pid()}")

    pre_cap = phase_snapshot()

    if not opts.start_only:
        phase_stop()

    if not opts.stop_only:
        time.sleep(1) # brief pause before relaunch
        phase_start()
        phase_wait_running()
        phase_verify(pre_cap)
|
||||
|
||||
# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user