Files
siloqy/prod/supervisor/dolphin_startup_check.sh

221 lines
8.1 KiB
Bash
Raw Normal View History

#!/bin/bash
# =============================================================================
# Dolphin controlled startup dependency check
# Runs as ExecStartPre for dolphin-supervisord.service
#
# Checks (in order, with per-step timeouts):
# 1. Docker daemon responsive
# 2. dolphin-hazelcast container healthy (CRITICAL — blocks on failure)
# 3. dolphin-clickhouse container up (WARN — ch_writer spools to disk)
# 4. dolphin-prefect container up (WARN — exf_fetcher degrades)
# 5. Port-level TCP smoke-test for each
#
# Logs every step with timestamps to:
# /tmp/dolphin_logs/startup.log (transient, same dir as BLUE logs)
# /mnt/dolphinng5_predict/run_logs/dolphin_startup_<date>.log (persistent)
#
# Exit codes:
# 0 = all critical deps up → supervisord may start
# 1 = critical dep (Hazelcast) timed out → systemd will not start supervisord
# =============================================================================
# Strict mode: abort on unhandled errors, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Transient log tree (startup log plus per-component subdirectories).
LOGDIR=/tmp/dolphin_logs
SUPDIR="$LOGDIR/supervisor"
TRADEDIR="$LOGDIR/trader"

# Persistent run-log directory and the transient startup log.
RUNLOG=/mnt/dolphinng5_predict/run_logs
STARTLOG="$LOGDIR/startup.log"

# One persistent log file per UTC calendar day.
DATE="$(date -u +%F)"
PERSISTENT_LOG="$RUNLOG/dolphin_startup_$DATE.log"
# ── helpers ─────────────────────────────────────────────────────────────────
# Print the current UTC time as an ISO-8601 timestamp, e.g. 2024-01-31T12:34:56Z.
ts() {
  date -u '+%FT%TZ'
}
#######################################
# Emit one timestamped line to stdout and, best-effort, to both log files.
# Globals:   STARTLOG, PERSISTENT_LOG (read)
# Arguments: $* - message text (arguments are joined with spaces)
# Outputs:   "[<timestamp>] <message>" on stdout; the same line is appended
#            to STARTLOG and PERSISTENT_LOG when they are writable
# Returns:   0 — a log file that cannot be written is silently skipped
#######################################
log() {
  # Declaration and assignment stay fused on purpose: logging must never
  # abort the script under `set -e`, even if the timestamp helper fails.
  local msg="[$(ts)] $*"
  printf '%s\n' "$msg" # → journald (captured by systemd)
  # The stderr redirect sits on the enclosing group so that it is already in
  # place when the `>>` open is attempted; written after `>>` on the same
  # command it would not hide the "No such file or directory" message.
  { printf '%s\n' "$msg" >> "$STARTLOG"; } 2>/dev/null || true
  { printf '%s\n' "$msg" >> "$PERSISTENT_LOG"; } 2>/dev/null || true
}
# Log a section heading: the given text framed above and below by a rule line.
# Arguments: $* - heading text (arguments are joined with spaces)
log_section() {
  local rule="────────────────────────────────────────"
  local line
  for line in "$rule" "$*" "$rule"; do
    log "$line"
  done
}
#######################################
# Probe a TCP endpoint once using bash's /dev/tcp pseudo-device (no nc needed).
# Arguments: $1 - host, $2 - port
# Returns:   0 if the connection opened within 2 seconds, non-zero otherwise
#######################################
tcp_ok() {
  # Host and port are handed to the child shell as positional parameters
  # rather than spliced into the command string, so they are only ever used
  # as a redirection target and never re-parsed as shell code.
  timeout 2 bash -c 'echo >"/dev/tcp/$1/$2"' _ "$1" "$2" 2>/dev/null
}
#######################################
# Poll a TCP endpoint until it accepts a connection or the timeout expires.
# Arguments: $1 - label used in log messages
#            $2 - host
#            $3 - port
#            $4 - timeout in seconds (default 60)
# Outputs:   progress, OK and TIMEOUT lines via log()
# Returns:   0 once the port is reachable, 1 on timeout
#######################################
wait_tcp() {
  local label="$1" host="$2" port="$3" timeout_s="${4:-60}"
  # Measure wall-clock time with bash's SECONDS counter. Counting only the
  # sleeps would ignore the time each probe itself takes, so the real wait
  # could far exceed timeout_s when probes hang instead of being refused.
  local started=$SECONDS elapsed=0
  log " Waiting for TCP ${host}:${port} (${label}, timeout=${timeout_s}s)..."
  while ! tcp_ok "$host" "$port"; do
    elapsed=$(( SECONDS - started ))
    if (( elapsed >= timeout_s )); then
      log " TIMEOUT: ${label} not reachable on ${host}:${port} after ${elapsed}s"
      return 1
    fi
    sleep 2
  done
  elapsed=$(( SECONDS - started ))
  log " OK: ${label} TCP:${port} ready (${elapsed}s)"
  return 0
}
#######################################
# Poll `docker inspect` until a container is running (and, optionally, no
# longer failing its healthcheck) or the timeout expires.
# Arguments: $1 - label used in log messages
#            $2 - container name
#            $3 - timeout in seconds (default 90)
#            $4 - 1 to keep waiting while a healthcheck reports anything other
#                 than "healthy" (default 0). A container with no healthcheck
#                 (health=none) is accepted as soon as it is running.
# Outputs:   progress, OK and TIMEOUT lines via log()
# Returns:   0 when the container is ready, 1 on timeout
#######################################
wait_docker_container() {
  local label="$1" name="$2" timeout_s="${3:-90}" require_healthy="${4:-0}"
  # Wall-clock accounting: time spent inside docker counts toward the timeout.
  local started=$SECONDS elapsed=0
  local snapshot status health
  # A single inspect call returns status and health together, so both values
  # describe the same instant and each poll forks docker only once.
  local -r fmt='{{.State.Status}} {{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}'
  log " Waiting for container '${name}' (timeout=${timeout_s}s)..."
  while true; do
    snapshot=$(docker inspect --format "$fmt" "$name" 2>/dev/null) || snapshot=""
    read -r status health _ <<< "$snapshot" || true
    # Inspect failure (or empty output) is reported as missing/none.
    status=${status:-missing}
    health=${health:-none}
    elapsed=$(( SECONDS - started ))
    if [[ "$status" == "running" ]]; then
      if (( require_healthy )) && [[ "$health" != "none" && "$health" != "healthy" ]]; then
        : # healthcheck present but not passing yet — keep waiting
      else
        log " OK: ${label} status=${status} health=${health} (${elapsed}s)"
        return 0
      fi
    fi
    if (( elapsed >= timeout_s )); then
      log " TIMEOUT: ${label} container '${name}' status=${status} health=${health} after ${elapsed}s"
      return 1
    fi
    sleep 3
  done
}
# ── create dirs ──────────────────────────────────────────────────────────────
# Transient log tree; under `set -e` a failure here aborts the startup check.
mkdir -p -- "$SUPDIR" "$TRADEDIR"
# Persistent log dir might not be writable at early boot — best effort only.
if ! mkdir -p -- "$RUNLOG" 2>/dev/null; then
  : # ignored on purpose
fi
# ── begin ────────────────────────────────────────────────────────────────────
# Start from a clean status; the ClickHouse and Prefect sections downgrade it
# to DEGRADED when their dependency is unreachable.
OVERALL_STATUS="OK"
log_section "=== DOLPHIN STARTUP CHECK BEGIN ==="
# Record where this run happens and where its logs go.
log "Host: $(hostname) Kernel: $(uname -r)"
log "Startup log: $STARTLOG"
log "Persistent: $PERSISTENT_LOG"
# ── 1. Docker daemon ─────────────────────────────────────────────────────────
log_section "1/4 Docker daemon"
# Bound the daemon probe so this step has a timeout like the others.
# NOTE(review): the docker CLI is assumed not to give up on its own when the
# daemon accepts the connection but never answers — confirm; the 15s budget
# is a judgement call and may need tuning for slow boots.
DOCKER_INFO_TIMEOUT_S=15
if timeout "$DOCKER_INFO_TIMEOUT_S" docker info >/dev/null 2>&1; then
  log " OK: Docker daemon responsive"
  SKIP_CONTAINERS=0
else
  # Covers a missing docker binary, a refused connection and a probe timeout.
  log " WARN: Docker daemon not responding — container checks skipped"
  SKIP_CONTAINERS=1
fi
# ── 2. Hazelcast (CRITICAL) ──────────────────────────────────────────────────
log_section "2/4 Hazelcast (CRITICAL)"
HZ_OK=0
# Preferred signal: the container is running and not failing its healthcheck.
if (( SKIP_CONTAINERS )); then
  log " Docker unavailable — falling back to TCP check only"
elif wait_docker_container "dolphin-hazelcast" "dolphin-hazelcast" 90 1; then
  HZ_OK=1
else
  log " Container check failed — trying TCP port 5701 directly..."
fi
# Fallback signal: the member port answers on localhost.
if (( HZ_OK == 0 )) && wait_tcp "Hazelcast" "localhost" 5701 30; then
  HZ_OK=1
fi
# Hazelcast is the only hard dependency: without it the script exits 1.
if (( HZ_OK == 0 )); then
  log " FATAL: Hazelcast not available after all checks — aborting"
  log " ACTION: Check 'docker ps' and 'docker logs dolphin-hazelcast'"
  log_section "=== STARTUP ABORTED (Hazelcast unavailable) ==="
  exit 1
fi
# Extra 3s settle time for HZ to finish leader election
sleep 3
log " HZ settle: 3s"
# ── 3. ClickHouse (WARN) ─────────────────────────────────────────────────────
log_section "3/4 ClickHouse (WARN — ch_writer spools on failure)"
CH_OK=0
# Try container-then-port first; if any part of that chain fails (or Docker is
# unavailable) fall back to probing the HTTP port on its own.
# NOTE(review): when the container is up but the port probe times out, the
# fallback probes the same port for another 20s — confirm that is intended.
if (( SKIP_CONTAINERS == 0 )) \
  && wait_docker_container "dolphin-clickhouse" "dolphin-clickhouse" 60 0 \
  && wait_tcp "ClickHouse HTTP" "localhost" 8123 20; then
  CH_OK=1
elif wait_tcp "ClickHouse HTTP" "localhost" 8123 20; then
  CH_OK=1
else
  # Non-fatal: record the degradation and carry on.
  log " WARN: ClickHouse unreachable — ch_writer will spool; continuing"
  OVERALL_STATUS="DEGRADED"
fi
# ── 4. Prefect (WARN) ────────────────────────────────────────────────────────
log_section "4/4 Prefect (WARN — exf_fetcher degrades gracefully)"
PREFECT_OK=0
# Try container-then-port first; if any part of that chain fails (or Docker is
# unavailable) fall back to probing the API port on its own.
# NOTE(review): when the container is up but the port probe times out, the
# fallback probes the same port for another 15s — confirm that is intended.
if (( SKIP_CONTAINERS == 0 )) \
  && wait_docker_container "dolphin-prefect" "dolphin-prefect" 45 0 \
  && wait_tcp "Prefect API" "localhost" 4200 20; then
  PREFECT_OK=1
elif wait_tcp "Prefect API" "localhost" 4200 15; then
  PREFECT_OK=1
else
  # Non-fatal: record the degradation and carry on.
  log " WARN: Prefect unreachable — exf_fetcher may degrade; continuing"
  OVERALL_STATUS="DEGRADED"
fi
# ── summary ──────────────────────────────────────────────────────────────────
log_section "=== STARTUP CHECK COMPLETE ==="
# Human-readable recap of every dependency flag.
log " Hazelcast : ${HZ_OK} (1=OK 0=FAIL)"
log " ClickHouse : ${CH_OK} (1=OK 0=WARN)"
log " Prefect : ${PREFECT_OK} (1=OK 0=WARN)"
log " Overall : ${OVERALL_STATUS}"
log " → Starting supervisord now"
log " Supervisord conf: /mnt/dolphinng5_predict/prod/supervisor/dolphin-supervisord.conf"
log " Log dir: ${SUPDIR}"
log "========================================"
# Write a machine-readable status file BLUE can read on startup.
# The *_ok fields are emitted as bare JSON numbers, the rest as strings.
{
  printf '{\n'
  printf '"ts": "%s",\n' "$(ts)"
  printf '"overall": "%s",\n' "${OVERALL_STATUS}"
  printf '"hazelcast_ok": %d,\n' "${HZ_OK}"
  printf '"clickhouse_ok": %d,\n' "${CH_OK}"
  printf '"prefect_ok": %d,\n' "${PREFECT_OK}"
  printf '"log": "%s"\n' "${STARTLOG}"
  printf '}\n'
} > "${LOGDIR}/startup_status.json"
# Reaching this point means Hazelcast is up, so supervisord may start.
exit 0