From 6fc9eab927df2f8d9215adb6dc0b5050acbed997 Mon Sep 17 00:00:00 2001
From: Codex <codex@localhost>
Date: Mon, 8 Jun 2026 14:22:22 +0200
Subject: [PATCH] OPS: supervisord systemd watchdog + controlled-bringup
 startup script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds dolphin-supervisord.service (installed + enabled) and dolphin_startup_check.sh:
- ExecStartPre waits for HZ (CRITICAL/blocks), CH+Prefect (WARN/degraded-ok)
- Logs to /tmp/dolphin_logs/startup.log + run_logs/dolphin_startup_<date>.log
- Writes machine-readable /tmp/dolphin_logs/startup_status.json on every start
- nautilus_trader remains autostart=false — BLUE must be started manually
SYSTEM BIBLE bumped to v7.1; §16.10 updated, §16.14 added.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 prod/docs/SYSTEM_BIBLE.md                |  83 ++++++++-
 prod/supervisor/dolphin_startup_check.sh | 220 +++++++++++++++++++++++
 2 files changed, 297 insertions(+), 6 deletions(-)
 create mode 100644 prod/supervisor/dolphin_startup_check.sh
diff --git a/prod/docs/SYSTEM_BIBLE.md b/prod/docs/SYSTEM_BIBLE.md
index cf32919..721a53f 100644
--- a/prod/docs/SYSTEM_BIBLE.md
+++ b/prod/docs/SYSTEM_BIBLE.md
@@ -1,7 +1,7 @@
 # DOLPHIN-NAUTILUS SYSTEM BIBLE
 ## Doctrinal Reference — As Running 2026-04-05
 
-**Version**: v7.0 — PINK DITAv2 Fee Accounting Fix + Orphan Prevention (2026-06-08)
+**Version**: v7.1 — supervisord systemd watchdog + controlled-bringup startup (2026-06-08)
 **Previous version**: v6.0 — NG8 Linux Scanner + TUI v3 Live Observability + Test Footer CI (2026-04-05)
 **Previous version**: v4.1 — Multi-Speed Event-Driven Architecture (2026-03-25)
 **CI gate (Nautilus)**: 46/46 tests green
@@ -1362,12 +1362,24 @@ supervisorctl -c /mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.con
 
 ### 16.10 Daemon Start Sequence
 
-**IMPORTANT**: supervisord has NO systemd unit — it is NOT auto-started on reboot.
-After any reboot or OOM kill, supervisord must be started manually (step 2 below).
+**`supervisord` is now auto-started on reboot via `dolphin-supervisord.service` (systemd).**
+See §16.14 for watchdog details. Manual start is still the correct method for the initial
+activation or when you need to bypass the watchdog.
 
 ```bash
-# 1. Verify Hazelcast/Prefect are running (systemd-managed, survive reboots)
-systemctl status dolphin-prefect-worker
+# --- Automatic (post-reboot, systemd handles it) ---
+# dolphin-supervisord.service runs dolphin_startup_check.sh as ExecStartPre:
+#   • Waits for dolphin-hazelcast container healthy (CRITICAL — blocks if not up)
+#   • Waits for dolphin-clickhouse (WARN — ch_writer spools on failure)
+#   • Waits for dolphin-prefect    (WARN — exf_fetcher degrades)
+#   • Writes /tmp/dolphin_logs/startup_status.json + /tmp/dolphin_logs/startup.log
+# Then starts supervisord. dolphin_data group (OBF, ACB, MHS, exf, maras, esof)
+# auto-starts. nautilus_trader (BLUE) does NOT auto-start — must be manual.
+
+# --- Manual (if you need to start/restart supervisord yourself) ---
+
+# 1. Verify Hazelcast/ClickHouse containers are running
+docker ps --filter name=dolphin-hazelcast --filter name=dolphin-clickhouse
 
 # 2. Start supervisord (MUST export DOLPHIN_LOG_ROOT — used by logfile= directives)
 mkdir -p /tmp/dolphin_logs/supervisor /tmp/dolphin_logs/trader
@@ -1378,7 +1390,7 @@ DOLPHIN_LOG_ROOT=/tmp/dolphin_logs supervisord \
 # 3. Verify data pipeline is up
 supervisorctl -c /mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf status
 
-# 4. Start BLUE (manual — autostart=false; only start after verifying BingX position state)
+# 4. Start BLUE (manual — autostart=false; only start after verifying exchange position state)
 supervisorctl -c /mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf \
     start dolphin:nautilus_trader
 
@@ -1441,6 +1453,65 @@ supervisorctl -c /mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.con
     start dolphin:nautilus_trader
 ```
 
+### 16.14 supervisord Systemd Watchdog
+
+**Unit:** `/etc/systemd/system/dolphin-supervisord.service`
+**Pre-start script:** `/mnt/dolphinng5_predict/prod/supervisor/dolphin_startup_check.sh`
+
+#### What it does
+
+| Phase | Action |
+|---|---|
+| **ExecStartPre 1** | `mkdir -p /tmp/dolphin_logs/supervisor /tmp/dolphin_logs/trader` |
+| **ExecStartPre 2** | Runs `dolphin_startup_check.sh` — checks HZ (CRITICAL), CH (WARN), Prefect (WARN) |
+| **ExecStart** | `supervisord -c dolphin-supervisord.conf` |
+| **ExecStop** | `supervisorctl shutdown` |
+| **Restart** | `on-failure`, 30 s delay — only fires on crash/OOM-kill, never on clean stop |
+| **TimeoutStartSec** | 300 s — enough for bringup-check loops (HZ=90 s, CH=60 s, Prefect=45 s) |
+
+#### Startup check logic (dolphin_startup_check.sh)
+
+1. Checks Docker daemon. If unavailable, skips container checks, falls back to TCP.
+2. **Hazelcast** — container must reach `running` + `healthy` within 90 s, then TCP:5701
+   confirmed. If HZ is not up: **script exits 1 → systemd refuses to start supervisord**.
+3. **ClickHouse** — container running within 60 s, then TCP:8123 within 20 s. Failure = WARN only.
+4. **Prefect** — container running within 45 s, then TCP:4200 within 20 s. Failure = WARN only.
+5. Writes `/tmp/dolphin_logs/startup.log` (same dir as all supervisor logs).
+6. Writes `/mnt/dolphinng5_predict/run_logs/dolphin_startup_<date>.log` (persistent).
+7. Writes `/tmp/dolphin_logs/startup_status.json` (machine-readable, for BLUE to check).
+
+#### Key safety rule
+
+`nautilus_trader` is `autostart=false` in supervisord.conf. **The systemd watchdog will
+NOT auto-start BLUE.** BLUE must always be started manually after verifying exchange
+position state. The watchdog only manages supervisord and the data pipeline.
+
+#### Management commands
+
+```bash
+# Status
+systemctl status dolphin-supervisord.service
+
+# Start (e.g. first boot or after manual stop)
+systemctl start dolphin-supervisord.service
+# Startup log: /tmp/dolphin_logs/startup.log  (also journalctl -u dolphin-supervisord)
+
+# Stop (does NOT restart — clean exit)
+systemctl stop dolphin-supervisord.service
+
+# Disable watchdog (maintenance)
+systemctl disable dolphin-supervisord.service
+
+# Re-enable
+systemctl enable dolphin-supervisord.service
+
+# View startup check log
+cat /tmp/dolphin_logs/startup.log
+journalctl -u dolphin-supervisord.service -n 100
+```
+
+---
+
 ### 16.11 Monitoring Endpoints
 
 | Service | URL / Command |
diff --git a/prod/supervisor/dolphin_startup_check.sh b/prod/supervisor/dolphin_startup_check.sh
new file mode 100644
index 0000000..0c8487e
--- /dev/null
+++ b/prod/supervisor/dolphin_startup_check.sh
@@ -0,0 +1,220 @@
+#!/bin/bash
+# =============================================================================
+# Dolphin controlled startup dependency check
+# Runs as ExecStartPre for dolphin-supervisord.service
+#
+# Checks (in order, with per-step timeouts):
+#   1. Docker daemon responsive
+#   2. dolphin-hazelcast container healthy (CRITICAL — blocks on failure)
+#   3. dolphin-clickhouse container up   (WARN  — ch_writer spools to disk)
+#   4. dolphin-prefect container up      (WARN  — exf_fetcher degrades)
+#   5. Port-level TCP smoke-test for each
+#
+# Logs every step with timestamps to:
+#   /tmp/dolphin_logs/startup.log          (transient, same dir as BLUE logs)
+#   /mnt/dolphinng5_predict/run_logs/dolphin_startup_<date>.log  (persistent)
+#
+# Exit codes:
+#   0  = all critical deps up → supervisord may start
+#   1  = critical dep (Hazelcast) timed out → systemd will not start supervisord
+# =============================================================================
+
+set -o errexit -o nounset -o pipefail
+# Paths: LOGDIR holds the transient logs, RUNLOG receives the persistent per-day copy.
+LOGDIR=/tmp/dolphin_logs
+RUNLOG=/mnt/dolphinng5_predict/run_logs
+SUPDIR="$LOGDIR/supervisor"
+TRADEDIR="$LOGDIR/trader"
+STARTLOG="$LOGDIR/startup.log"       # transient log written by this script
+DATE="$(date -u +%F)"                # UTC day, YYYY-MM-DD
+PERSISTENT_LOG="$RUNLOG/dolphin_startup_$DATE.log"
+
+# ── helpers ─────────────────────────────────────────────────────────────────
+
+ts() { date -u '+%FT%TZ'; }  # UTC timestamp, YYYY-MM-DDTHH:MM:SSZ
+
+log() {  # Timestamped line → stdout (journald) plus a best-effort append to both log files.
+    local msg="[$(ts)] $*" f
+    echo "$msg"
+    # 2>/dev/null precedes >> so a failed open is silenced too; || true keeps set -e quiet.
+    for f in "$STARTLOG" "$PERSISTENT_LOG"; do echo "$msg" 2>/dev/null >> "$f" || true; done
+}
+
+log_section() {  # Log "$*" as a banner: rule line, the text, rule line.
+    local rule="────────────────────────────────────────" part
+    # three separate log calls, so every banner line carries its own timestamp
+    for part in "$rule" "$*" "$rule"; do log "$part"; done
+}
+
+tcp_ok() {  # tcp_ok <host> <port>: status 0 if a TCP connect succeeds within 2 s
+    local host="$1" port="$2"   # uses bash's /dev/tcp pseudo-device — no nc required
+    timeout 2 bash -c "echo >/dev/tcp/${host}/${port}" 2>/dev/null
+}
+
+wait_tcp() {  # wait_tcp <label> <host> <port> [timeout_s=60] → 0 once reachable, 1 on timeout
+    local label="$1" host="$2" port="$3" timeout_s="${4:-60}"
+    local start=$SECONDS elapsed=0   # wall clock, so time spent inside probes counts too
+    log "  Waiting for TCP ${host}:${port} (${label}, timeout=${timeout_s}s)..."
+    while ! tcp_ok "$host" "$port"; do
+        if (( (elapsed = SECONDS - start) >= timeout_s )); then
+            log "  TIMEOUT: ${label} not reachable on ${host}:${port} after ${elapsed}s"
+            return 1
+        fi
+        sleep 2   # poll interval
+    done
+    log "  OK: ${label} TCP:${port} ready ($(( SECONDS - start ))s)"
+    return 0
+}
+
+wait_docker_container() {
+    # wait_docker_container <label> <name> [timeout_s=90] [require_healthy=0]
+    # Polls every 3 s; returns 0 once the container is running (and healthy, when required
+    # and a healthcheck exists), 1 after timeout_s seconds of wall-clock time.
+    local label="$1" name="$2" timeout_s="${3:-90}" require_healthy="${4:-0}"
+    local start=$SECONDS elapsed=0 snapshot status health
+    local fmt='{{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}'
+    log "  Waiting for container '${name}' (timeout=${timeout_s}s)..."
+    while true; do
+        # One inspect per poll, so status and health come from the same snapshot.
+        snapshot=$(docker inspect --format "$fmt" "$name" 2>/dev/null) || snapshot="missing none"
+        read -r status health <<< "$snapshot" || true
+        elapsed=$(( SECONDS - start ))
+        # health=none means no healthcheck is defined; such a container passes on status alone.
+        if [[ "$status" == "running" ]] &&
+           { (( ! require_healthy )) || [[ "$health" == "none" || "$health" == "healthy" ]]; }; then
+            log "  OK: ${label} status=${status} health=${health} (${elapsed}s)"
+            return 0
+        fi
+        if (( elapsed >= timeout_s )); then
+            log "  TIMEOUT: ${label} container '${name}' status=${status} health=${health} after ${elapsed}s"
+            return 1
+        fi
+        sleep 3
+    done
+}
+
+# ── create dirs ──────────────────────────────────────────────────────────────
+
+mkdir -p -- "$SUPDIR" "$TRADEDIR"   # required: a failure here aborts the check (set -e)
+# The persistent log dir is best-effort — it may not be writable at early boot.
+mkdir -p -- "$RUNLOG" 2>/dev/null || :
+
+# ── begin ────────────────────────────────────────────────────────────────────
+
+OVERALL_STATUS="OK"   # set to DEGRADED later if a WARN-level dependency is unreachable
+
+log_section "=== DOLPHIN STARTUP CHECK BEGIN ==="
+log "Host: $(hostname)  Kernel: $(uname -r)"
+log "Startup log: ${STARTLOG}"
+log "Persistent:  ${PERSISTENT_LOG}"
+
+# ── 1. Docker daemon ─────────────────────────────────────────────────────────
+
+log_section "1/4 Docker daemon"
+if docker info >/dev/null 2>&1; then
+    SKIP_CONTAINERS=0   # later steps may run their container checks
+    log "  OK: Docker daemon responsive"
+else
+    SKIP_CONTAINERS=1   # later steps rely on TCP probes only
+    log "  WARN: Docker daemon not responding — container checks skipped"
+fi
+
+# ── 2. Hazelcast (CRITICAL) ──────────────────────────────────────────────────
+
+log_section "2/4 Hazelcast (CRITICAL)"
+# Hazelcast must be reachable before supervisord may start. Container state alone is
+# not proof: wait_docker_container accepts a running container that has no healthcheck
+# (health=none). TCP:5701 is therefore always probed — as confirmation after a good
+# container check, and as the only check when Docker is down or the container wait fails.
+HZ_OK=0
+if (( SKIP_CONTAINERS )); then
+    log "  Docker unavailable — falling back to TCP check only"
+elif ! wait_docker_container "dolphin-hazelcast" "dolphin-hazelcast" 90 1; then
+    log "  Container check failed — trying TCP port 5701 directly..."
+fi
+# Port-level confirm (runs on every path)
+if wait_tcp "Hazelcast" "localhost" 5701 30; then
+    HZ_OK=1
+fi
+
+if (( ! HZ_OK )); then
+    log "  FATAL: Hazelcast not available after all checks — aborting"
+    log "  ACTION: Check 'docker ps' and 'docker logs dolphin-hazelcast'"
+    log_section "=== STARTUP ABORTED (Hazelcast unavailable) ==="
+    # Remove any status file from an earlier run so nothing reads a stale "OK".
+    rm -f -- "${LOGDIR}/startup_status.json" || true
+    exit 1   # non-zero exit: systemd will not start supervisord
+fi
+
+# Extra 3s settle time for HZ to finish leader election
+sleep 3
+log "  HZ settle: 3s"
+
+# ── 3. ClickHouse (WARN) ─────────────────────────────────────────────────────
+
+log_section "3/4 ClickHouse (WARN — ch_writer spools on failure)"
+# Non-critical dependency: an unreachable ClickHouse only sets OVERALL_STATUS=DEGRADED.
+# Preferred path: container running (timeout 60 s, no health requirement), then a
+# TCP:8123 probe (timeout 20 s). If that path does not succeed for any reason — Docker
+# skipped, container wait timed out, or the port probe itself timed out — one further
+# 20 s TCP probe is attempted before giving up.
+CH_OK=0
+if (( ! SKIP_CONTAINERS )) \
+        && wait_docker_container "dolphin-clickhouse" "dolphin-clickhouse" 60 0 \
+        && wait_tcp "ClickHouse HTTP" "localhost" 8123 20; then
+    CH_OK=1
+elif wait_tcp "ClickHouse HTTP" "localhost" 8123 20; then
+    CH_OK=1
+fi
+
+if (( CH_OK == 0 )); then
+    log "  WARN: ClickHouse unreachable — ch_writer will spool; continuing"
+    OVERALL_STATUS="DEGRADED"
+fi
+
+# ── 4. Prefect (WARN) ────────────────────────────────────────────────────────
+
+log_section "4/4 Prefect (WARN — exf_fetcher degrades gracefully)"
+# Non-critical dependency: an unreachable Prefect only sets OVERALL_STATUS=DEGRADED.
+# Preferred path: container running (timeout 45 s, no health requirement), then a
+# TCP:4200 probe (timeout 20 s). If that path does not succeed for any reason, one
+# further TCP probe (timeout 15 s) is attempted before giving up.
+PREFECT_OK=0
+if (( ! SKIP_CONTAINERS )) \
+        && wait_docker_container "dolphin-prefect" "dolphin-prefect" 45 0 \
+        && wait_tcp "Prefect API" "localhost" 4200 20; then
+    PREFECT_OK=1
+elif wait_tcp "Prefect API" "localhost" 4200 15; then
+    PREFECT_OK=1
+fi
+
+if (( PREFECT_OK == 0 )); then
+    log "  WARN: Prefect unreachable — exf_fetcher may degrade; continuing"
+    OVERALL_STATUS="DEGRADED"
+fi
+
+# ── summary ──────────────────────────────────────────────────────────────────
+
+log_section "=== STARTUP CHECK COMPLETE ==="
+log "  Hazelcast  : $(( HZ_OK ))     (1=OK 0=FAIL)"
+log "  ClickHouse : $(( CH_OK ))     (1=OK 0=WARN)"
+log "  Prefect    : $(( PREFECT_OK ))     (1=OK 0=WARN)"
+log "  Overall    : ${OVERALL_STATUS}"
+log "  → Starting supervisord now"
+log "  Supervisord conf: /mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf"
+log "  Log dir: ${SUPDIR}"
+log "========================================"
+
+# Machine-readable status for BLUE; temp file + rename so readers never see a partial file.
+cat > "${LOGDIR}/startup_status.json.tmp" <<EOF
+{
+  "ts": "$(ts)",
+  "overall": "${OVERALL_STATUS}",
+  "hazelcast_ok": $(( HZ_OK )),
+  "clickhouse_ok": $(( CH_OK )),
+  "prefect_ok": $(( PREFECT_OK )),
+  "log": "${STARTLOG}"
+}
+EOF
+mv -f -- "${LOGDIR}/startup_status.json.tmp" "${LOGDIR}/startup_status.json"
+exit 0