From 6fc9eab927df2f8d9215adb6dc0b5050acbed997 Mon Sep 17 00:00:00 2001 From: Codex Date: Mon, 8 Jun 2026 14:22:22 +0200 Subject: [PATCH] OPS: supervisord systemd watchdog + controlled-bringup startup script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds dolphin-supervisord.service (installed + enabled) and dolphin_startup_check.sh: - ExecStartPre waits for HZ (CRITICAL/blocks), CH+Prefect (WARN/degraded-ok) - Logs to /tmp/dolphin_logs/startup.log + run_logs/dolphin_startup_YYYY-MM-DD.log - Writes machine-readable /tmp/dolphin_logs/startup_status.json on every start - nautilus_trader remains autostart=false — BLUE must be started manually SYSTEM BIBLE bumped to v7.1; §16.10 updated, §16.14 added. Co-Authored-By: Claude Sonnet 4.6 --- prod/docs/SYSTEM_BIBLE.md | 83 ++++++++- prod/supervisor/dolphin_startup_check.sh | 220 +++++++++++++++++++++++ 2 files changed, 297 insertions(+), 6 deletions(-) create mode 100644 prod/supervisor/dolphin_startup_check.sh diff --git a/prod/docs/SYSTEM_BIBLE.md b/prod/docs/SYSTEM_BIBLE.md index cf32919..721a53f 100644 --- a/prod/docs/SYSTEM_BIBLE.md +++ b/prod/docs/SYSTEM_BIBLE.md @@ -1,7 +1,7 @@ # DOLPHIN-NAUTILUS SYSTEM BIBLE ## Doctrinal Reference — As Running 2026-04-05 -**Version**: v7.0 — PINK DITAv2 Fee Accounting Fix + Orphan Prevention (2026-06-08) +**Version**: v7.1 — supervisord systemd watchdog + controlled-bringup startup (2026-06-08) **Previous version**: v6.0 — NG8 Linux Scanner + TUI v3 Live Observability + Test Footer CI (2026-04-05) **Previous version**: v4.1 — Multi-Speed Event-Driven Architecture (2026-03-25) **CI gate (Nautilus)**: 46/46 tests green @@ -1362,12 +1362,24 @@ supervisorctl -c /mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.con ### 16.10 Daemon Start Sequence -**IMPORTANT**: supervisord has NO systemd unit — it is NOT auto-started on reboot. -After any reboot or OOM kill, supervisord must be started manually (step 2 below). 
+**`supervisord` is now auto-started on reboot via `dolphin-supervisord.service` (systemd).** +See §16.14 for watchdog details. Manual start is still the correct method for the initial +activation or when you need to bypass the watchdog. ```bash -# 1. Verify Hazelcast/Prefect are running (systemd-managed, survive reboots) -systemctl status dolphin-prefect-worker +# --- Automatic (post-reboot, systemd handles it) --- +# dolphin-supervisord.service runs dolphin_startup_check.sh as ExecStartPre: +# • Waits for dolphin-hazelcast container healthy (CRITICAL — blocks if not up) +# • Waits for dolphin-clickhouse (WARN — ch_writer spools on failure) +# • Waits for dolphin-prefect (WARN — exf_fetcher degrades) +# • Writes /tmp/dolphin_logs/startup_status.json + /tmp/dolphin_logs/startup.log +# Then starts supervisord. dolphin_data group (OBF, ACB, MHS, exf, maras, esof) +# auto-starts. nautilus_trader (BLUE) does NOT auto-start — must be manual. + +# --- Manual (if you need to start/restart supervisord yourself) --- + +# 1. Verify Hazelcast/ClickHouse containers are running +docker ps --filter name=dolphin-hazelcast --filter name=dolphin-clickhouse # 2. Start supervisord (MUST export DOLPHIN_LOG_ROOT — used by logfile= directives) mkdir -p /tmp/dolphin_logs/supervisor /tmp/dolphin_logs/trader @@ -1378,7 +1390,7 @@ DOLPHIN_LOG_ROOT=/tmp/dolphin_logs supervisord \ # 3. Verify data pipeline is up supervisorctl -c /mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf status -# 4. Start BLUE (manual — autostart=false; only start after verifying BingX position state) +# 4. 
Start BLUE (manual — autostart=false; only start after verifying exchange position state) supervisorctl -c /mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf \ start dolphin:nautilus_trader @@ -1441,6 +1453,65 @@ supervisorctl -c /mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.con start dolphin:nautilus_trader ``` +### 16.14 supervisord Systemd Watchdog + +**Unit:** `/etc/systemd/system/dolphin-supervisord.service` +**Pre-start script:** `/mnt/dolphinng5_predict/prod/supervisor/dolphin_startup_check.sh` + +#### What it does + +| Phase | Action | +|---|---| +| **ExecStartPre 1** | `mkdir -p /tmp/dolphin_logs/supervisor /tmp/dolphin_logs/trader` | +| **ExecStartPre 2** | Runs `dolphin_startup_check.sh` — checks HZ (CRITICAL), CH (WARN), Prefect (WARN) | +| **ExecStart** | `supervisord -c dolphin-supervisord.conf` | +| **ExecStop** | `supervisorctl shutdown` | +| **Restart** | `on-failure`, 30 s delay — only fires on crash/OOM-kill, never on clean stop | +| **TimeoutStartSec** | 300 s — enough for bringup-check loops (HZ=90 s, CH=60 s, Prefect=45 s) | + +#### Startup check logic (dolphin_startup_check.sh) + +1. Checks Docker daemon. If unavailable, skips container checks, falls back to TCP. +2. **Hazelcast** — container must reach `running` + `healthy` within 90 s, then TCP:5701 + confirmed. If HZ is not up: **script exits 1 → systemd refuses to start supervisord**. +3. **ClickHouse** — container running + TCP:8123 within 60 s. Failure = WARN only. +4. **Prefect** — container running + TCP:4200 within 45 s. Failure = WARN only. +5. Writes `/tmp/dolphin_logs/startup.log` (same dir as all supervisor logs). +6. Writes `/mnt/dolphinng5_predict/run_logs/dolphin_startup_YYYY-MM-DD.log` (persistent, one file per UTC date). +7. Writes `/tmp/dolphin_logs/startup_status.json` (machine-readable, for BLUE to check). + +#### Key safety rule + +`nautilus_trader` is `autostart=false` in supervisord.conf. 
**The systemd watchdog will +NOT auto-start BLUE.** BLUE must always be started manually after verifying exchange +position state. The watchdog only manages supervisord and the data pipeline. + +#### Management commands + +```bash +# Status +systemctl status dolphin-supervisord.service + +# Start (e.g. first boot or after manual stop) +systemctl start dolphin-supervisord.service +# Startup log: /tmp/dolphin_logs/startup.log (also journalctl -u dolphin-supervisord) + +# Stop (does NOT restart — clean exit) +systemctl stop dolphin-supervisord.service + +# Disable watchdog (maintenance) +systemctl disable dolphin-supervisord.service + +# Re-enable +systemctl enable dolphin-supervisord.service + +# View startup check log +cat /tmp/dolphin_logs/startup.log +journalctl -u dolphin-supervisord.service -n 100 +``` + +--- + ### 16.11 Monitoring Endpoints | Service | URL / Command | diff --git a/prod/supervisor/dolphin_startup_check.sh b/prod/supervisor/dolphin_startup_check.sh new file mode 100644 index 0000000..0c8487e --- /dev/null +++ b/prod/supervisor/dolphin_startup_check.sh @@ -0,0 +1,220 @@ +#!/bin/bash +# ============================================================================= +# Dolphin controlled startup dependency check +# Runs as ExecStartPre for dolphin-supervisord.service +# +# Checks (in order, with per-step timeouts): +# 1. Docker daemon responsive +# 2. dolphin-hazelcast container healthy (CRITICAL — blocks on failure) +# 3. dolphin-clickhouse container up (WARN — ch_writer spools to disk) +# 4. dolphin-prefect container up (WARN — exf_fetcher degrades) +# 5. 
Port-level TCP smoke-test for each +# +# Logs every step with timestamps to: +# /tmp/dolphin_logs/startup.log (transient, same dir as BLUE logs) +# /mnt/dolphinng5_predict/run_logs/dolphin_startup_YYYY-MM-DD.log (persistent) +# +# Exit codes: +# 0 = all critical deps up → supervisord may start +# 1 = critical dep (Hazelcast) timed out → systemd will not start supervisord +# ============================================================================= + +set -euo pipefail + +LOGDIR="/tmp/dolphin_logs" +SUPDIR="${LOGDIR}/supervisor" +TRADEDIR="${LOGDIR}/trader" +RUNLOG="/mnt/dolphinng5_predict/run_logs" +STARTLOG="${LOGDIR}/startup.log" +DATE=$(date -u '+%Y-%m-%d') +PERSISTENT_LOG="${RUNLOG}/dolphin_startup_${DATE}.log" + +# ── helpers ───────────────────────────────────────────────────────────────── + +ts() { date -u '+%Y-%m-%dT%H:%M:%SZ'; } + +log() { + local msg="[$(ts)] $*" + echo "$msg" # → journald (captured by systemd) + echo "$msg" >> "$STARTLOG" 2>/dev/null + echo "$msg" >> "$PERSISTENT_LOG" 2>/dev/null || true +} + +log_section() { + log "────────────────────────────────────────" + log "$*" + log "────────────────────────────────────────" +} + +tcp_ok() { + # bash TCP pseudo-device — no nc required + timeout 2 bash -c "echo >/dev/tcp/${1}/${2}" 2>/dev/null +} + +wait_tcp() { + local label="$1" host="$2" port="$3" timeout_s="${4:-60}" + local elapsed=0 + log " Waiting for TCP ${host}:${port} (${label}, timeout=${timeout_s}s)..." + while ! tcp_ok "$host" "$port"; do + if (( elapsed >= timeout_s )); then + log " TIMEOUT: ${label} not reachable on ${host}:${port} after ${elapsed}s" + return 1 + fi + sleep 2; (( elapsed += 2 )) + done + log " OK: ${label} TCP:${port} ready (${elapsed}s)" + return 0 +} + +wait_docker_container() { + local label="$1" name="$2" timeout_s="${3:-90}" require_healthy="${4:-0}" + local elapsed=0 + log " Waiting for container '${name}' (timeout=${timeout_s}s)..." 
+ while true; do + local status + status=$(docker inspect --format '{{.State.Status}}' "$name" 2>/dev/null || echo "missing") + local health + health=$(docker inspect --format '{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' "$name" 2>/dev/null || echo "none") + + if [[ "$status" == "running" ]]; then + if (( require_healthy )) && [[ "$health" != "none" ]] && [[ "$health" != "healthy" ]]; then + : # keep waiting + else + log " OK: ${label} status=${status} health=${health} (${elapsed}s)" + return 0 + fi + fi + + if (( elapsed >= timeout_s )); then + log " TIMEOUT: ${label} container '${name}' status=${status} health=${health} after ${elapsed}s" + return 1 + fi + sleep 3; (( elapsed += 3 )) + done +} + +# ── create dirs ────────────────────────────────────────────────────────────── + +mkdir -p "$SUPDIR" "$TRADEDIR" +# Persistent log dir might not be writable at early boot — ignore failure +mkdir -p "$RUNLOG" 2>/dev/null || true + +# ── begin ──────────────────────────────────────────────────────────────────── + +log_section "=== DOLPHIN STARTUP CHECK BEGIN ===" +log "Host: $(hostname) Kernel: $(uname -r)" +log "Startup log: ${STARTLOG}" +log "Persistent: ${PERSISTENT_LOG}" + +OVERALL_STATUS="OK" + +# ── 1. Docker daemon ───────────────────────────────────────────────────────── + +log_section "1/4 Docker daemon" +if ! docker info >/dev/null 2>&1; then + log " WARN: Docker daemon not responding — container checks skipped" + SKIP_CONTAINERS=1 +else + log " OK: Docker daemon responsive" + SKIP_CONTAINERS=0 +fi + +# ── 2. Hazelcast (CRITICAL) ────────────────────────────────────────────────── + +log_section "2/4 Hazelcast (CRITICAL)" +HZ_OK=0 +if (( SKIP_CONTAINERS )); then + log " Docker unavailable — falling back to TCP check only" +else + if wait_docker_container "dolphin-hazelcast" "dolphin-hazelcast" 90 1; then + HZ_OK=1 + else + log " Container check failed — trying TCP port 5701 directly..." + fi +fi + +if (( ! 
HZ_OK )); then + if wait_tcp "Hazelcast" "localhost" 5701 30; then + HZ_OK=1 + fi +fi + +if (( ! HZ_OK )); then + log " FATAL: Hazelcast not available after all checks — aborting" + log " ACTION: Check 'docker ps' and 'docker logs dolphin-hazelcast'" + log_section "=== STARTUP ABORTED (Hazelcast unavailable) ===" + exit 1 +fi + +# Extra 3s settle time for HZ to finish leader election +sleep 3 +log " HZ settle: 3s" + +# ── 3. ClickHouse (WARN) ───────────────────────────────────────────────────── + +log_section "3/4 ClickHouse (WARN — ch_writer spools on failure)" +CH_OK=0 +if (( ! SKIP_CONTAINERS )); then + if wait_docker_container "dolphin-clickhouse" "dolphin-clickhouse" 60 0; then + # Port-level confirm + if wait_tcp "ClickHouse HTTP" "localhost" 8123 20; then + CH_OK=1 + fi + fi +fi +if (( ! CH_OK )); then + if wait_tcp "ClickHouse HTTP" "localhost" 8123 20; then + CH_OK=1 + fi +fi +if (( ! CH_OK )); then + log " WARN: ClickHouse unreachable — ch_writer will spool; continuing" + OVERALL_STATUS="DEGRADED" +fi + +# ── 4. Prefect (WARN) ──────────────────────────────────────────────────────── + +log_section "4/4 Prefect (WARN — exf_fetcher degrades gracefully)" +PREFECT_OK=0 +if (( ! SKIP_CONTAINERS )); then + if wait_docker_container "dolphin-prefect" "dolphin-prefect" 45 0; then + if wait_tcp "Prefect API" "localhost" 4200 20; then + PREFECT_OK=1 + fi + fi +fi +if (( ! PREFECT_OK )); then + if wait_tcp "Prefect API" "localhost" 4200 15; then + PREFECT_OK=1 + fi +fi +if (( ! 
PREFECT_OK )); then + log " WARN: Prefect unreachable — exf_fetcher may degrade; continuing" + OVERALL_STATUS="DEGRADED" +fi + +# ── summary ────────────────────────────────────────────────────────────────── + +log_section "=== STARTUP CHECK COMPLETE ===" +log " Hazelcast : $(( HZ_OK )) (1=OK 0=FAIL)" +log " ClickHouse : $(( CH_OK )) (1=OK 0=WARN)" +log " Prefect : $(( PREFECT_OK )) (1=OK 0=WARN)" +log " Overall : ${OVERALL_STATUS}" +log " → Starting supervisord now" +log " Supervisord conf: /mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf" +log " Log dir: ${SUPDIR}" +log "========================================" + +# Write a machine-readable status file BLUE can read on startup +cat > "${LOGDIR}/startup_status.json" <