initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree

Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
hjnormey
2026-04-21 16:58:38 +02:00
commit 01c19662cb
643 changed files with 260241 additions and 0 deletions

170
prod/dolphin_exit_handler.py Executable file
View File

@@ -0,0 +1,170 @@
"""Dolphin Graceful Exit Handler.

Registers signal handlers (SIGTERM, SIGINT, SIGHUP) and atexit callbacks
to log service lifecycle events to all channels: file log, ClickHouse,
and Hazelcast.

Usage in any service:

    from dolphin_exit_handler import install_exit_handler
    install_exit_handler("meta_health")  # call once at startup

The hook mechanism uses stdlib only (atexit, signal). Industry-standard
pattern — no third-party dependency needed for this. The ClickHouse and
Hazelcast writers are imported lazily and are skipped when unavailable.
"""
import atexit
import logging
import os
import signal
import sys
import time
import traceback
from datetime import datetime, timezone
# Module logger; lifecycle messages are emitted under the "dolphin.exit" name.
log = logging.getLogger("dolphin.exit")
# State for dedup — avoid double-logging on atexit + signal
_exit_logged: bool = False
# Service name reported in lifecycle events; set by install_exit_handler().
_service_name: str = "unknown"
# Callables invoked with the exit event dict as the last step of _log_exit().
_on_exit_callbacks: list = []
def _ts_iso():
    """Return the current UTC time as an ISO 8601 string with a UTC offset."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.isoformat()
def _log_exit(reason: str, exit_code: int = 0, signal_num: int = None):
    """Log a service exit event to every channel, at most once per process.

    Channels, in order: the module logger, ClickHouse (via ``ch_writer``),
    the Hazelcast map ``DOLPHIN_SERVICE_LIFECYCLE``, and finally any
    callbacks registered through ``install_exit_handler``. Exceptions raised
    by the ClickHouse, Hazelcast and callback steps are caught and logged;
    they never propagate to the caller.

    Args:
        reason: Short description of why the service is exiting, e.g.
            "SIGTERM", "NORMAL_EXIT" or "CRASH:<type>:<message>".
        exit_code: Exit status recorded with the event.
        signal_num: Number of the signal that triggered the exit, or None
            when the exit was not signal-driven (written as 0 to ClickHouse).
    """
    global _exit_logged
    if _exit_logged:
        # The signal handler, excepthook and atexit hook can all fire for a
        # single shutdown; only the first caller records the event.
        return
    _exit_logged = True

    pid = os.getpid()
    event = {
        "service": _service_name,
        "reason": reason,
        "exit_code": exit_code,
        "signal": signal_num,
        "pid": pid,
        "ts": _ts_iso(),
    }

    # 1. File log (always works)
    log.warning("SERVICE_EXIT: %s reason=%s exit_code=%d signal=%s pid=%d",
                _service_name, reason, exit_code, signal_num, pid)

    # 2. ClickHouse (best-effort)
    try:
        from ch_writer import ch_put

        ch_put("service_lifecycle", {
            "ts": int(time.time() * 1e6),  # microseconds since the epoch
            "service": _service_name,
            "event": "EXIT",
            "reason": reason,
            "exit_code": exit_code,
            "signal_num": signal_num or 0,
            "pid": pid,
        })
        # Give the CH writer thread a moment to flush
        time.sleep(0.3)
    except Exception:
        log.debug("SERVICE_EXIT: ClickHouse write skipped/failed", exc_info=True)

    # 3. Hazelcast (best-effort)
    try:
        import json
        import hazelcast

        client = hazelcast.HazelcastClient(
            cluster_name="dolphin",
            cluster_members=["localhost:5701"],
            connection_timeout=2.0,
        )
        try:
            lifecycle_map = client.get_map("DOLPHIN_SERVICE_LIFECYCLE").blocking()
            lifecycle_map.put(_service_name, json.dumps(event), ttl=3600)
        finally:
            # Always release the client, even when get_map/put fails;
            # otherwise it is leaked while the process is shutting down.
            client.shutdown()
    except Exception:
        log.debug("SERVICE_EXIT: Hazelcast write skipped/failed", exc_info=True)

    # 4. Run registered callbacks; one failing callback must not block the rest
    for cb in _on_exit_callbacks:
        try:
            cb(event)
        except Exception:
            log.warning("SERVICE_EXIT: on_exit callback %r failed", cb,
                        exc_info=True)
def _signal_handler(signum, frame):
    """Handle SIGTERM, SIGINT and SIGHUP: log the exit, then terminate.

    Args:
        signum: Number of the signal that was delivered.
        frame: Current stack frame supplied by the signal machinery (unused).

    Raises:
        SystemExit: Always, with status ``128 + signum``.
    """
    # Build the lookup only from signals this platform defines; SIGHUP does
    # not exist everywhere, and referencing it directly would raise
    # AttributeError while handling an unrelated signal.
    known_names = {
        getattr(signal, name): name
        for name in ("SIGTERM", "SIGINT", "SIGHUP")
        if hasattr(signal, name)
    }
    sig_name = known_names.get(signum, f"SIG{signum}")
    exit_status = 128 + signum
    _log_exit(reason=sig_name, exit_code=exit_status, signal_num=signum)
    sys.exit(exit_status)
def _atexit_handler():
    """Record the exit event when the interpreter shuts down.

    Reports a crash when ``sys.exc_info()`` shows an exception other than
    ``SystemExit``; otherwise reports a normal exit, using the ``SystemExit``
    code when one is visible and 0 when it is not.
    """
    # NOTE(review): sys.exc_info() only reports an exception that is currently
    # being handled; confirm whether it is ever populated while atexit
    # callbacks run. If not, this always logs NORMAL_EXIT with exit_code=0.
    exc_type, exc_val, _tb = sys.exc_info()

    crashed = exc_type is not None and exc_type is not SystemExit
    if crashed:
        _log_exit(reason=f"CRASH:{exc_type.__name__}:{exc_val}", exit_code=1)
        return

    code = 0
    if exc_type is SystemExit and exc_val is not None:
        # A missing or falsy code (None, 0, "") is recorded as 0.
        code = getattr(exc_val, 'code', 0) or 0
    _log_exit(reason="NORMAL_EXIT", exit_code=code)
def _excepthook(exc_type, exc_val, exc_tb):
    """Log an unhandled exception as a crash, then defer to the default hook.

    Args:
        exc_type: Class of the unhandled exception.
        exc_val: The exception instance.
        exc_tb: Traceback associated with the exception.
    """
    crash_reason = f"CRASH:{exc_type.__name__}:{exc_val}"
    formatted_tb = "".join(traceback.format_exception(exc_type, exc_val, exc_tb))
    log.error("Unhandled exception in %s:\n%s", _service_name, formatted_tb)
    _log_exit(reason=crash_reason, exit_code=1)
    # Hand over to the interpreter's original hook so the traceback is still
    # reported the usual way.
    sys.__excepthook__(exc_type, exc_val, exc_tb)
def install_exit_handler(service_name: str, on_exit=None):
    """Install exit handler for a Dolphin service.

    Sets the service name used in lifecycle events, installs handlers for
    SIGTERM, SIGINT and SIGHUP (where the platform defines them), registers
    the atexit hook, replaces ``sys.excepthook``, and records a START event
    in the log and, best-effort, in ClickHouse.

    Args:
        service_name: e.g. "meta_health", "nautilus_trader", "exf_fetcher"
        on_exit: optional callback(event_dict) for service-specific cleanup;
            it is appended to the callbacks run at exit.
    """
    global _service_name
    _service_name = service_name
    if on_exit:
        _on_exit_callbacks.append(on_exit)

    # Register signal handlers. Signals are resolved by name so that a
    # platform lacking one of them (e.g. SIGHUP) does not abort installation.
    for sig_name in ("SIGTERM", "SIGINT", "SIGHUP"):
        sig = getattr(signal, sig_name, None)
        if sig is None:
            continue
        try:
            signal.signal(sig, _signal_handler)
        except (OSError, ValueError):
            # can't set handler in non-main thread
            log.debug("Could not install handler for %s", sig_name,
                      exc_info=True)

    # Register atexit
    atexit.register(_atexit_handler)

    # Global excepthook for unhandled exceptions
    sys.excepthook = _excepthook

    pid = os.getpid()
    log.info("SERVICE_START: %s pid=%d", service_name, pid)

    # Log startup to CH (best-effort: a missing or failing writer must not
    # prevent the service from starting)
    try:
        from ch_writer import ch_put

        ch_put("service_lifecycle", {
            "ts": int(time.time() * 1e6),  # microseconds since the epoch
            "service": service_name,
            "event": "START",
            "reason": "NORMAL_START",
            "exit_code": 0,
            "signal_num": 0,
            "pid": pid,
        })
    except Exception:
        log.debug("SERVICE_START: ClickHouse write skipped/failed",
                  exc_info=True)