#!/bin/bash # ============================================================================= # Dolphin controlled startup dependency check # Runs as ExecStartPre for dolphin-supervisord.service # # Checks (in order, with per-step timeouts): # 1. Docker daemon responsive # 2. dolphin-hazelcast container healthy (CRITICAL — blocks on failure) # 3. dolphin-clickhouse container up (WARN — ch_writer spools to disk) # 4. dolphin-prefect container up (WARN — exf_fetcher degrades) # 5. Port-level TCP smoke-test for each # # Logs every step with timestamps to: # /tmp/dolphin_logs/startup.log (transient, same dir as BLUE logs) # /mnt/dolphinng5_predict/run_logs/dolphin_startup_.log (persistent) # # Exit codes: # 0 = all critical deps up → supervisord may start # 1 = critical dep (Hazelcast) timed out → systemd will not start supervisord # ============================================================================= set -euo pipefail LOGDIR="/tmp/dolphin_logs" SUPDIR="${LOGDIR}/supervisor" TRADEDIR="${LOGDIR}/trader" RUNLOG="/mnt/dolphinng5_predict/run_logs" STARTLOG="${LOGDIR}/startup.log" DATE=$(date -u '+%Y-%m-%d') PERSISTENT_LOG="${RUNLOG}/dolphin_startup_${DATE}.log" # ── helpers ───────────────────────────────────────────────────────────────── ts() { date -u '+%Y-%m-%dT%H:%M:%SZ'; } log() { local msg="[$(ts)] $*" echo "$msg" # → journald (captured by systemd) echo "$msg" >> "$STARTLOG" 2>/dev/null echo "$msg" >> "$PERSISTENT_LOG" 2>/dev/null || true } log_section() { log "────────────────────────────────────────" log "$*" log "────────────────────────────────────────" } tcp_ok() { # bash TCP pseudo-device — no nc required timeout 2 bash -c "echo >/dev/tcp/${1}/${2}" 2>/dev/null } wait_tcp() { local label="$1" host="$2" port="$3" timeout_s="${4:-60}" local elapsed=0 log " Waiting for TCP ${host}:${port} (${label}, timeout=${timeout_s}s)..." while ! tcp_ok "$host" "$port"; do if (( elapsed >= timeout_s )); then log " TIMEOUT: ${label} not reachable on ${host}:${port} after ${elapsed}s" return 1 fi sleep 2; (( elapsed += 2 )) done log " OK: ${label} TCP:${port} ready (${elapsed}s)" return 0 } wait_docker_container() { local label="$1" name="$2" timeout_s="${3:-90}" require_healthy="${4:-0}" local elapsed=0 log " Waiting for container '${name}' (timeout=${timeout_s}s)..." while true; do local status status=$(docker inspect --format '{{.State.Status}}' "$name" 2>/dev/null || echo "missing") local health health=$(docker inspect --format '{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' "$name" 2>/dev/null || echo "none") if [[ "$status" == "running" ]]; then if (( require_healthy )) && [[ "$health" != "none" ]] && [[ "$health" != "healthy" ]]; then : # keep waiting else log " OK: ${label} status=${status} health=${health} (${elapsed}s)" return 0 fi fi if (( elapsed >= timeout_s )); then log " TIMEOUT: ${label} container '${name}' status=${status} health=${health} after ${elapsed}s" return 1 fi sleep 3; (( elapsed += 3 )) done } # ── create dirs ────────────────────────────────────────────────────────────── mkdir -p "$SUPDIR" "$TRADEDIR" # Persistent log dir might not be writable at early boot — ignore failure mkdir -p "$RUNLOG" 2>/dev/null || true # ── begin ──────────────────────────────────────────────────────────────────── log_section "=== DOLPHIN STARTUP CHECK BEGIN ===" log "Host: $(hostname) Kernel: $(uname -r)" log "Startup log: ${STARTLOG}" log "Persistent: ${PERSISTENT_LOG}" OVERALL_STATUS="OK" # ── 1. Docker daemon ───────────────────────────────────────────────────────── log_section "1/4 Docker daemon" if ! docker info >/dev/null 2>&1; then log " WARN: Docker daemon not responding — container checks skipped" SKIP_CONTAINERS=1 else log " OK: Docker daemon responsive" SKIP_CONTAINERS=0 fi # ── 2. Hazelcast (CRITICAL) ────────────────────────────────────────────────── log_section "2/4 Hazelcast (CRITICAL)" HZ_OK=0 if (( SKIP_CONTAINERS )); then log " Docker unavailable — falling back to TCP check only" else if wait_docker_container "dolphin-hazelcast" "dolphin-hazelcast" 90 1; then HZ_OK=1 else log " Container check failed — trying TCP port 5701 directly..." fi fi if (( ! HZ_OK )); then if wait_tcp "Hazelcast" "localhost" 5701 30; then HZ_OK=1 fi fi if (( ! HZ_OK )); then log " FATAL: Hazelcast not available after all checks — aborting" log " ACTION: Check 'docker ps' and 'docker logs dolphin-hazelcast'" log_section "=== STARTUP ABORTED (Hazelcast unavailable) ===" exit 1 fi # Extra 3s settle time for HZ to finish leader election sleep 3 log " HZ settle: 3s" # ── 3. ClickHouse (WARN) ───────────────────────────────────────────────────── log_section "3/4 ClickHouse (WARN — ch_writer spools on failure)" CH_OK=0 if (( ! SKIP_CONTAINERS )); then if wait_docker_container "dolphin-clickhouse" "dolphin-clickhouse" 60 0; then # Port-level confirm if wait_tcp "ClickHouse HTTP" "localhost" 8123 20; then CH_OK=1 fi fi fi if (( ! CH_OK )); then if wait_tcp "ClickHouse HTTP" "localhost" 8123 20; then CH_OK=1 fi fi if (( ! CH_OK )); then log " WARN: ClickHouse unreachable — ch_writer will spool; continuing" OVERALL_STATUS="DEGRADED" fi # ── 4. Prefect (WARN) ──────────────────────────────────────────────────────── log_section "4/4 Prefect (WARN — exf_fetcher degrades gracefully)" PREFECT_OK=0 if (( ! SKIP_CONTAINERS )); then if wait_docker_container "dolphin-prefect" "dolphin-prefect" 45 0; then if wait_tcp "Prefect API" "localhost" 4200 20; then PREFECT_OK=1 fi fi fi if (( ! PREFECT_OK )); then if wait_tcp "Prefect API" "localhost" 4200 15; then PREFECT_OK=1 fi fi if (( ! PREFECT_OK )); then log " WARN: Prefect unreachable — exf_fetcher may degrade; continuing" OVERALL_STATUS="DEGRADED" fi # ── summary ────────────────────────────────────────────────────────────────── log_section "=== STARTUP CHECK COMPLETE ===" log " Hazelcast : $(( HZ_OK )) (1=OK 0=FAIL)" log " ClickHouse : $(( CH_OK )) (1=OK 0=WARN)" log " Prefect : $(( PREFECT_OK )) (1=OK 0=WARN)" log " Overall : ${OVERALL_STATUS}" log " → Starting supervisord now" log " Supervisord conf: /mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf" log " Log dir: ${SUPDIR}" log "========================================" # Write a machine-readable status file BLUE can read on startup cat > "${LOGDIR}/startup_status.json" <