# DOLPHIN/prod/docker-compose.yml

# Service definitions for the DOLPHIN prod Compose stack.
# Containers labelled `autoheal=true` are watched by the `autoheal` service
# below; containers without that label are left alone.
services:
  # Watchdog container. It is given the Docker socket and configured (via
  # AUTOHEAL_CONTAINER_LABEL) to act on containers carrying the `autoheal`
  # label.
  autoheal:
    image: willfarrell/autoheal:latest
    container_name: dolphin-autoheal
    restart: always
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    environment:
      - AUTOHEAL_CONTAINER_LABEL=autoheal
      - AUTOHEAL_INTERVAL=10  # poll every 10s (MHS fires first at ~1s)
      - AUTOHEAL_START_PERIOD=30  # grace on container cold start
      - AUTOHEAL_DEFAULT_STOP_TIMEOUT=10

  # Hazelcast member for the `dolphin` cluster, JVM heap capped at 2g.
  hazelcast:
    image: hazelcast/hazelcast:5.3
    container_name: dolphin-hazelcast
    # NOTE: autoheal REMOVED (2026-04-07). HZ is RAM-only volatile — restarting
    # wipes ALL state and causes cascading failures. Better to leave it running
    # even if temporarily unhealthy than to restart and lose everything.
    ports:
      - "5701:5701"
    environment:
      - JAVA_OPTS=-Xmx2g
      - HZ_CLUSTERNAME=dolphin
    volumes:
      - hz_data:/opt/hazelcast/data
    restart: unless-stopped
    init: true
    healthcheck:
      # TCP-connect probe against the member port using bash's /dev/tcp.
      # The status still gates `depends_on` for hazelcast-mc even though
      # autoheal does not act on this container.
      test: ["CMD-SHELL", "timeout 5 bash -c '</dev/tcp/localhost/5701' || exit 1"]
      interval: 15s
      timeout: 5s
      retries: 5
      start_period: 90s

  # Hazelcast Management Center, pointed at the `dolphin` cluster through the
  # `hazelcast` service name. Starts only once hazelcast reports healthy.
  hazelcast-mc:
    image: hazelcast/management-center:5.3
    container_name: dolphin-hazelcast-mc
    labels:
      - autoheal=true
    ports:
      - "8080:8080"
    environment:
      - MC_DEFAULT_CLUSTER=dolphin
      - MC_DEFAULT_CLUSTER_MEMBERS=hazelcast:5701
    depends_on:
      hazelcast:
        condition: service_healthy
    restart: unless-stopped
    healthcheck:
      # NOTE(review): this probe needs `curl` inside the image — confirm it
      # is present, otherwise the container can never become healthy.
      test: ["CMD-SHELL", "curl -sf http://localhost:8080/ > /dev/null || exit 1"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 60s

  # Prefect server (API + UI) listening on all interfaces, port 4200.
  prefect-server:
    image: prefecthq/prefect:3-latest
    container_name: dolphin-prefect
    labels:
      - autoheal=true
    ports:
      - "4200:4200"
    command: prefect server start --host 0.0.0.0
    environment:
      # CRITICAL: These must match the Tailscale FQDN for external access
      - PREFECT_UI_URL=http://dolphin.taile8ad92.ts.net:4200
      - PREFECT_API_URL=http://dolphin.taile8ad92.ts.net:4200/api
      - PREFECT_SERVER_API_HOST=0.0.0.0
      # NOTE(review): wildcard CORS origin — confirm this is acceptable for
      # every network that can reach port 4200.
      - PREFECT_SERVER_CORS_ALLOWED_ORIGINS=*
      - PYTHONUNBUFFERED=1
      - PREFECT_LOGGING_TO_API_BATCH_INTERVAL=0.3
    volumes:
      - prefect_data:/root/.prefect
    restart: unless-stopped
    healthcheck:
      # Passes when the health endpoint's response body contains `true`.
      # NOTE(review): needs `curl` inside the image — confirm it is present.
      test: ["CMD-SHELL", "curl -sf http://localhost:4200/api/health | grep -q true || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 45s

  # ClickHouse server with read-only config/user overrides mounted from
  # ./clickhouse and a raised open-file limit.
  clickhouse:
    image: clickhouse/clickhouse-server:24.3-alpine
    container_name: dolphin-clickhouse
    restart: unless-stopped
    labels:
      - autoheal=true
    ports:
      - "8123:8123"
      - "9000:9000"
    volumes:
      - prod_ch_data:/var/lib/clickhouse
      - ./clickhouse/config.xml:/etc/clickhouse-server/config.d/dolphin.xml:ro
      - ./clickhouse/users.xml:/etc/clickhouse-server/users.d/dolphin.xml:ro
    networks:
      - prod_default
    ulimits:
      nofile:
        soft: 262144
        hard: 262144
    healthcheck:
      # HTTP ping on the 8123 interface.
      test: ["CMD-SHELL", "wget -q -O /dev/null http://127.0.0.1:8123/ping || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  # OpenTelemetry Collector (contrib build) exposing ports 4317 and 4318.
  # Starts only once clickhouse reports healthy.
  otelcol:
    image: otel/opentelemetry-collector-contrib:0.121.0
    container_name: dolphin-otelcol
    restart: unless-stopped
    labels:
      - autoheal=true
    ports:
      - "4317:4317"
      - "4318:4318"
    volumes:
      - ./otelcol/config.yaml:/etc/otelcol-contrib/config.yaml:ro
    networks:
      - prod_default
    depends_on:
      clickhouse:
        condition: service_healthy
    healthcheck:
      # Expects `Server available` in the response from port 13133.
      # NOTE(review): presumably relies on the health_check extension being
      # enabled in ./otelcol/config.yaml — TODO confirm.
      # NOTE(review): the upstream contrib image is believed to ship without
      # a shell or wget; if so this CMD-SHELL probe cannot run, the container
      # stays unhealthy, and autoheal (label above) would restart it
      # repeatedly. Verify with `docker inspect` on the running container.
      test: ["CMD-SHELL", "wget -q -O - http://localhost:13133/ | grep -q 'Server available' && echo ok || exit 1"]
      interval: 15s
      timeout: 5s
      retries: 3
      start_period: 20s
# `prod_default` is declared external: Compose attaches to an existing Docker
# network of that name instead of creating one, so it must exist before `up`.
# Only clickhouse and otelcol join it explicitly.
# NOTE(review): services without a `networks:` key use the project's implicit
# default network — confirm that is the intended split.
networks:
  prod_default:
    external: true
# Named volumes. `hz_data` and `prefect_data` use default settings and are
# managed by this Compose project. `prod_ch_data` is external: it must already
# exist and Compose will neither create nor remove it.
volumes:
  hz_data:
  prefect_data:
  prod_ch_data:
    external: true