# Docker Compose stack for the "dolphin" deployment.
#
# Services:
#   autoheal        - watchdog with access to the Docker socket; services opt
#                     in by carrying the `autoheal=true` label.
#   hazelcast       - Hazelcast member (deliberately NOT autoheal-managed).
#   prefect-server  - Prefect server/UI on port 4200.
#   clickhouse      - ClickHouse server (HTTP 8123, native 9000).
#   otelcol         - OpenTelemetry Collector (OTLP gRPC 4317, HTTP 4318).
#
# `clickhouse` and `otelcol` are attached to the pre-existing external network
# `prod_default`; the other services use the project's default network.

services:
  autoheal:
    image: willfarrell/autoheal:latest
    container_name: dolphin-autoheal
    restart: always
    volumes:
      # The Docker socket gives this container control over the Docker daemon.
      - /var/run/docker.sock:/var/run/docker.sock
    environment:
      # Must match the label key set on the services below (`autoheal=true`).
      - AUTOHEAL_CONTAINER_LABEL=autoheal
      - AUTOHEAL_INTERVAL=10  # poll every 10s (MHS fires first at ~1s)
      - AUTOHEAL_START_PERIOD=30  # grace on container cold start
      - AUTOHEAL_DEFAULT_STOP_TIMEOUT=10

  hazelcast:
    image: hazelcast/hazelcast:5.3
    container_name: dolphin-hazelcast
    # NOTE: autoheal REMOVED (2026-04-07). HZ is RAM-only volatile — restarting
    # wipes ALL state and causes cascading failures. Better to leave it running
    # even if temporarily unhealthy than to restart and lose everything.
    ports:
      - "5701:5701"
    environment:
      - JAVA_OPTS=-Xmx2g
      - HZ_CLUSTERNAME=dolphin
    volumes:
      - hz_data:/opt/hazelcast/data
    restart: unless-stopped
    init: true
    healthcheck:
      # Plain TCP connect to the member port using bash's /dev/tcp redirection.
      # The previous command string contained an unterminated single quote, so
      # the shell could never parse it and the check always failed; this is a
      # reconstruction of the intended probe.
      # NOTE(review): assumes `bash` and `timeout` exist in the image (the
      # earlier command already invoked both) — confirm with
      # `docker inspect --format '{{json .State.Health}}' dolphin-hazelcast`.
      # This status is informational only: the service carries no autoheal
      # label, so an unhealthy result does not trigger a restart.
      test: ["CMD-SHELL", "timeout 5 bash -c '</dev/tcp/127.0.0.1/5701' 2>/dev/null || exit 1"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 60s

  prefect-server:
    image: prefecthq/prefect:3-latest
    container_name: dolphin-prefect
    labels:
      - autoheal=true
    ports:
      - "4200:4200"
    command: prefect server start --host 0.0.0.0
    environment:
      # CRITICAL: These must match the Tailscale FQDN for external access
      - PREFECT_UI_URL=http://dolphin.taile8ad92.ts.net:4200
      - PREFECT_API_URL=http://dolphin.taile8ad92.ts.net:4200/api
      - PREFECT_SERVER_API_HOST=0.0.0.0
      # NOTE(review): wildcard CORS origin — acceptable only if the API is not
      # reachable from untrusted networks; confirm.
      - PREFECT_SERVER_CORS_ALLOWED_ORIGINS=*
      - PYTHONUNBUFFERED=1
      - PREFECT_LOGGING_TO_API_BATCH_INTERVAL=0.3
    volumes:
      - prefect_data:/root/.prefect
    restart: unless-stopped
    healthcheck:
      # Healthy when the API health endpoint answers with a body containing
      # "true".
      # NOTE(review): relies on `curl` being present in the image — confirm.
      test: ["CMD-SHELL", "curl -sf http://localhost:4200/api/health | grep -q true || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 45s

  clickhouse:
    image: clickhouse/clickhouse-server:24.3-alpine
    container_name: dolphin-clickhouse
    restart: unless-stopped
    labels:
      - autoheal=true
    ports:
      - "8123:8123"
      - "9000:9000"
    volumes:
      - prod_ch_data:/var/lib/clickhouse
      # Local overrides are mounted read-only into the server's drop-in
      # config.d / users.d directories.
      - ./clickhouse/config.xml:/etc/clickhouse-server/config.d/dolphin.xml:ro
      - ./clickhouse/users.xml:/etc/clickhouse-server/users.d/dolphin.xml:ro
    networks:
      - prod_default
    ulimits:
      nofile:
        soft: 262144
        hard: 262144
    healthcheck:
      # Healthy when the HTTP interface answers /ping successfully.
      test: ["CMD-SHELL", "wget -q -O /dev/null http://127.0.0.1:8123/ping || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  otelcol:
    image: otel/opentelemetry-collector-contrib:0.121.0
    container_name: dolphin-otelcol
    restart: unless-stopped
    labels:
      - autoheal=true
    ports:
      - "4317:4317"
      - "4318:4318"
    volumes:
      - ./otelcol/config.yaml:/etc/otelcol-contrib/config.yaml:ro
    networks:
      - prod_default
    depends_on:
      # Do not start the collector until ClickHouse passes its healthcheck.
      clickhouse:
        condition: service_healthy
    healthcheck:
      # Healthy when the endpoint on port 13133 returns a body containing
      # "Server available".
      # NOTE(review): the upstream contrib collector image is, to my
      # knowledge, built without a shell or wget. If that holds for this tag,
      # a CMD-SHELL check can never succeed and the autoheal label above would
      # cause a restart loop — verify with
      # `docker inspect --format '{{json .State.Health}}' dolphin-otelcol`.
      test: ["CMD-SHELL", "wget -q -O - http://localhost:13133/ | grep -q 'Server available' && echo ok || exit 1"]
      interval: 15s
      timeout: 5s
      retries: 3
      start_period: 20s

networks:
  # Pre-existing network; Compose will not create or remove it.
  prod_default:
    external: true

volumes:
  hz_data:
  prefect_data:
  # Pre-existing volume; Compose will not create or remove it.
  prod_ch_data:
    external: true