initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
141
prod/docker-compose.yml
Executable file
141
prod/docker-compose.yml
Executable file
@@ -0,0 +1,141 @@
|
||||
# Container stack: a health watchdog (autoheal), an in-memory data grid
# (Hazelcast + Management Center), a Prefect server, and a telemetry
# pipeline (ClickHouse + OpenTelemetry Collector).
services:

  # Watchdog. Only containers carrying the label named in
  # AUTOHEAL_CONTAINER_LABEL (i.e. `autoheal=true` below) are in its scope.
  # It needs the Docker socket to act on sibling containers.
  # NOTE(review): `latest` is a floating tag — consider pinning a version.
  autoheal:
    image: willfarrell/autoheal:latest
    container_name: dolphin-autoheal
    restart: always
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    environment:
      - AUTOHEAL_CONTAINER_LABEL=autoheal
      - AUTOHEAL_INTERVAL=10  # poll every 10s (MHS fires first at ~1s)
      - AUTOHEAL_START_PERIOD=30  # grace on container cold start
      - AUTOHEAL_DEFAULT_STOP_TIMEOUT=10

  # Hazelcast cluster member, cluster name `dolphin`, JVM heap capped at 2g.
  hazelcast:
    image: hazelcast/hazelcast:5.3
    container_name: dolphin-hazelcast
    # NOTE: autoheal REMOVED (2026-04-07). HZ is RAM-only volatile — restarting
    # wipes ALL state and causes cascading failures. Better to leave it running
    # even if temporarily unhealthy than to restart and lose everything.
    ports:
      - "5701:5701"
    environment:
      - JAVA_OPTS=-Xmx2g
      - HZ_CLUSTERNAME=dolphin
    volumes:
      - hz_data:/opt/hazelcast/data
    restart: unless-stopped
    init: true
    healthcheck:
      # Healthy when a TCP connection to the member port can be opened
      # within 5 seconds. The status still gates hazelcast-mc start-up even
      # though this service deliberately carries no autoheal label.
      test: ["CMD-SHELL", "timeout 5 bash -c '</dev/tcp/localhost/5701' || exit 1"]
      interval: 15s
      timeout: 5s
      retries: 5
      start_period: 90s

  # Hazelcast Management Center UI, pointed at the `dolphin` cluster.
  hazelcast-mc:
    image: hazelcast/management-center:5.3
    container_name: dolphin-hazelcast-mc
    labels:
      - autoheal=true
    ports:
      - "8080:8080"
    environment:
      - MC_DEFAULT_CLUSTER=dolphin
      - MC_DEFAULT_CLUSTER_MEMBERS=hazelcast:5701
    depends_on:
      # Start only after the hazelcast healthcheck has passed.
      hazelcast:
        condition: service_healthy
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "curl -sf http://localhost:8080/ > /dev/null || exit 1"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 60s

  # Prefect server (API + UI) listening on all interfaces, port 4200.
  # NOTE(review): `3-latest` is a floating tag — consider pinning a version.
  prefect-server:
    image: prefecthq/prefect:3-latest
    container_name: dolphin-prefect
    labels:
      - autoheal=true
    ports:
      - "4200:4200"
    command: prefect server start --host 0.0.0.0
    environment:
      # CRITICAL: These must match the Tailscale FQDN for external access
      - PREFECT_UI_URL=http://dolphin.taile8ad92.ts.net:4200
      - PREFECT_API_URL=http://dolphin.taile8ad92.ts.net:4200/api
      - PREFECT_SERVER_API_HOST=0.0.0.0
      # NOTE(review): wildcard CORS origin — confirm this is acceptable for
      # the networks this port is reachable from, or narrow it to the UI URL.
      - PREFECT_SERVER_CORS_ALLOWED_ORIGINS=*
      - PYTHONUNBUFFERED=1
      - PREFECT_LOGGING_TO_API_BATCH_INTERVAL=0.3
    volumes:
      - prefect_data:/root/.prefect
    restart: unless-stopped
    healthcheck:
      # Healthy when the API health endpoint body contains `true`.
      test: ["CMD-SHELL", "curl -sf http://localhost:4200/api/health | grep -q true || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 45s

  # ClickHouse server; data lives in the external `prod_ch_data` volume and
  # the two XML files are mounted read-only as config/users overrides.
  clickhouse:
    image: clickhouse/clickhouse-server:24.3-alpine
    container_name: dolphin-clickhouse
    restart: unless-stopped
    labels:
      - autoheal=true
    ports:
      - "8123:8123"
      - "9000:9000"
    volumes:
      - prod_ch_data:/var/lib/clickhouse
      - ./clickhouse/config.xml:/etc/clickhouse-server/config.d/dolphin.xml:ro
      - ./clickhouse/users.xml:/etc/clickhouse-server/users.d/dolphin.xml:ro
    networks:
      - prod_default
    ulimits:
      nofile:
        soft: 262144
        hard: 262144
    healthcheck:
      test: ["CMD-SHELL", "wget -q -O /dev/null http://127.0.0.1:8123/ping || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s

  # OpenTelemetry Collector (contrib build); started only once ClickHouse
  # is healthy. Its pipeline is defined in ./otelcol/config.yaml.
  otelcol:
    image: otel/opentelemetry-collector-contrib:0.121.0
    container_name: dolphin-otelcol
    restart: unless-stopped
    labels:
      - autoheal=true
    ports:
      - "4317:4317"
      - "4318:4318"
    volumes:
      - ./otelcol/config.yaml:/etc/otelcol-contrib/config.yaml:ro
    networks:
      - prod_default
    depends_on:
      clickhouse:
        condition: service_healthy
    healthcheck:
      # NOTE(review): a CMD-SHELL probe needs a shell, wget and grep inside
      # the image — confirm this image ships them. If it does not, the probe
      # can never pass, and with `autoheal=true` that would mean a restart
      # loop. Also presumes the collector config enables a health endpoint
      # on port 13133 — verify in ./otelcol/config.yaml.
      test: ["CMD-SHELL", "wget -q -O - http://localhost:13133/ | grep -q 'Server available' && echo ok || exit 1"]
      interval: 15s
      timeout: 5s
      retries: 3
      start_period: 20s
# `prod_default` is declared external: Compose will not create it, so it
# must already exist before this stack is brought up. Only clickhouse and
# otelcol attach to it explicitly; the other services list no network.
# NOTE(review): if the Compose project name is `prod`, the implicit default
# network would also be called `prod_default` — confirm that is intended.
networks:
  prod_default:
    external: true
# Named volumes. `hz_data` and `prefect_data` use default settings and are
# managed by this project; `prod_ch_data` is external, so it must already
# exist and is not created by this file.
volumes:
  hz_data:
  prefect_data:
  prod_ch_data:
    external: true
Reference in New Issue
Block a user