services:

  # Watchdog sidecar. It reaches the Docker daemon through the bind-mounted
  # socket and watches containers that carry the `autoheal` label (see
  # AUTOHEAL_CONTAINER_LABEL below).
  autoheal:
    container_name: dolphin-autoheal
    image: willfarrell/autoheal:latest
    restart: always
    environment:
      # Only containers with this label are considered.
      AUTOHEAL_CONTAINER_LABEL: "autoheal"
      # Poll every 10s (MHS fires first at ~1s).
      AUTOHEAL_INTERVAL: "10"
      # Grace period on container cold start.
      AUTOHEAL_START_PERIOD: "30"
      AUTOHEAL_DEFAULT_STOP_TIMEOUT: "10"
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock

  # Hazelcast member for the `dolphin` cluster, published on host port 5701.
  #
  # NOTE: autoheal REMOVED (2026-04-07). Hazelcast is RAM-only and volatile:
  # a restart wipes ALL state and causes cascading failures. A temporarily
  # unhealthy member is therefore left running rather than restarted, which
  # is why this service deliberately carries no `autoheal` label.
  hazelcast:
    container_name: dolphin-hazelcast
    image: hazelcast/hazelcast:5.3
    init: true
    restart: unless-stopped
    environment:
      JAVA_OPTS: "-Xmx2g"
      HZ_CLUSTERNAME: "dolphin"
    ports:
      - "5701:5701"
    volumes:
      - hz_data:/opt/hazelcast/data
    healthcheck:
      # Healthy once a TCP connection to localhost:5701 can be opened within 5s.
      test:
        - CMD-SHELL
        - "timeout 5 bash -c '</dev/tcp/localhost/5701' || exit 1"
      start_period: 90s
      interval: 15s
      timeout: 5s
      retries: 5

  # Hazelcast Management Center UI on host port 8080, pre-configured to
  # connect to the `dolphin` cluster via the `hazelcast` service. Starts only
  # after the Hazelcast member reports healthy.
  hazelcast-mc:
    container_name: dolphin-hazelcast-mc
    image: hazelcast/management-center:5.3
    restart: unless-stopped
    depends_on:
      hazelcast:
        condition: service_healthy
    labels:
      autoheal: "true"
    environment:
      MC_DEFAULT_CLUSTER: "dolphin"
      MC_DEFAULT_CLUSTER_MEMBERS: "hazelcast:5701"
    ports:
      - "8080:8080"
    healthcheck:
      # Healthy when the UI root answers with a successful HTTP status.
      test:
        - CMD-SHELL
        - "curl -sf http://localhost:8080/ > /dev/null || exit 1"
      start_period: 60s
      interval: 30s
      timeout: 5s
      retries: 3

  # Prefect server (API + UI) listening on all interfaces, published on host
  # port 4200. Server state lives in the `prefect_data` volume.
  prefect-server:
    container_name: dolphin-prefect
    image: prefecthq/prefect:3-latest
    restart: unless-stopped
    command: prefect server start --host 0.0.0.0
    labels:
      autoheal: "true"
    environment:
      # CRITICAL: These must match the Tailscale FQDN for external access.
      PREFECT_UI_URL: "http://dolphin.taile8ad92.ts.net:4200"
      PREFECT_API_URL: "http://dolphin.taile8ad92.ts.net:4200/api"
      PREFECT_SERVER_API_HOST: "0.0.0.0"
      # Quoted: a bare * would be parsed as a YAML alias.
      PREFECT_SERVER_CORS_ALLOWED_ORIGINS: "*"
      PYTHONUNBUFFERED: "1"
      PREFECT_LOGGING_TO_API_BATCH_INTERVAL: "0.3"
    ports:
      - "4200:4200"
    volumes:
      - prefect_data:/root/.prefect
    healthcheck:
      # Healthy when /api/health responds and the body contains "true".
      test:
        - CMD-SHELL
        - "curl -sf http://localhost:4200/api/health | grep -q true || exit 1"
      start_period: 45s
      interval: 10s
      timeout: 5s
      retries: 3

  # ClickHouse server attached to the external `prod_default` network.
  # Publishes host ports 8123 (the HTTP endpoint the healthcheck pings) and
  # 9000. Data lives in the external `prod_ch_data` volume; the server and
  # user overrides are mounted read-only from ./clickhouse/.
  clickhouse:
    container_name: dolphin-clickhouse
    image: clickhouse/clickhouse-server:24.3-alpine
    restart: unless-stopped
    labels:
      autoheal: "true"
    networks:
      - prod_default
    ports:
      - "8123:8123"
      - "9000:9000"
    ulimits:
      # Raise the open-file limit (soft and hard) to 262144.
      nofile:
        soft: 262144
        hard: 262144
    volumes:
      - prod_ch_data:/var/lib/clickhouse
      - ./clickhouse/config.xml:/etc/clickhouse-server/config.d/dolphin.xml:ro
      - ./clickhouse/users.xml:/etc/clickhouse-server/users.d/dolphin.xml:ro
    healthcheck:
      # Healthy when the HTTP /ping endpoint can be fetched.
      test:
        - CMD-SHELL
        - "wget -q -O /dev/null http://127.0.0.1:8123/ping || exit 1"
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 5

  # OpenTelemetry Collector (contrib distribution) on the external
  # `prod_default` network. Publishes host ports 4317 and 4318 (conventionally
  # OTLP gRPC / OTLP HTTP; the actual receivers are defined in the mounted
  # ./otelcol/config.yaml). Starts only after ClickHouse reports healthy.
  otelcol:
    image: otel/opentelemetry-collector-contrib:0.121.0
    container_name: dolphin-otelcol
    restart: unless-stopped
    # NOTE: deliberately NO `autoheal` label and NO in-container healthcheck.
    # The official collector image is built FROM scratch: it contains only the
    # collector binary, with no /bin/sh, wget, grep or echo. A CMD-SHELL probe
    # can therefore never succeed, the container is marked unhealthy once the
    # retries are exhausted, and autoheal would restart a perfectly working
    # collector in a loop. Crash recovery is still covered by
    # `restart: unless-stopped`. To monitor liveness, probe the collector's
    # health_check extension (port 13133, if enabled in the config) from the
    # host or from another container that has an HTTP client.
    ports:
      - "4317:4317"
      - "4318:4318"
    volumes:
      - ./otelcol/config.yaml:/etc/otelcol-contrib/config.yaml:ro
    networks:
      - prod_default
    depends_on:
      clickhouse:
        condition: service_healthy
    healthcheck:
      # Explicitly disabled; see the NOTE above.
      disable: true

# Networks referenced by services in this file.
networks:
  # Pre-existing network used by the clickhouse and otelcol services.
  # `external: true` means Compose does not create or remove it; it must
  # already exist before `docker compose up`.
  prod_default:
    external: true

# Named volumes referenced by services in this file.
volumes:
  # Mounted at /opt/hazelcast/data in the hazelcast service.
  hz_data:
  # Mounted at /root/.prefect in the prefect-server service.
  prefect_data:
  # ClickHouse data directory (/var/lib/clickhouse). `external: true` means
  # Compose does not create or remove it; it must already exist.
  prod_ch_data:
    external: true