initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
170
prod/dolphin_exit_handler.py
Executable file
170
prod/dolphin_exit_handler.py
Executable file
@@ -0,0 +1,170 @@
|
||||
"""Dolphin Graceful Exit Handler.
|
||||
|
||||
Registers signal handlers (SIGTERM, SIGINT, SIGHUP) and atexit callbacks
|
||||
to log service lifecycle events to all channels: file log, ClickHouse,
|
||||
and Hazelcast.
|
||||
|
||||
Usage in any service:
|
||||
from dolphin_exit_handler import install_exit_handler
|
||||
install_exit_handler("meta_health") # call once at startup
|
||||
|
||||
Uses stdlib only (atexit, signal). Industry-standard pattern —
|
||||
no third-party dependency needed for this.
|
||||
"""
|
||||
import atexit
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# Module logger; SERVICE_START / SERVICE_EXIT messages are emitted through it.
log = logging.getLogger("dolphin.exit")
|
||||
|
||||
# State for dedup — avoid double-logging on atexit + signal.
# _log_exit() sets the flag on its first call and returns early afterwards.
|
||||
_exit_logged = False
|
||||
# Service name reported in lifecycle events; overwritten by install_exit_handler().
_service_name = "unknown"
|
||||
# Callbacks registered via install_exit_handler(on_exit=...); each is called
# with the exit event dict at the end of _log_exit().
_on_exit_callbacks = []
|
||||
|
||||
|
||||
def _ts_iso():
    """Return the current UTC time as an ISO-8601 string (offset ``+00:00``)."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.isoformat()
|
||||
|
||||
|
||||
def _log_exit(reason: str, exit_code: int = 0, signal_num: int = None):
    """Log the service exit event to all channels, at most once per process.

    Fire-and-forget: a failure in the ClickHouse write, the Hazelcast write
    or a registered callback is caught and reported at DEBUG level only, so
    this function never raises into the signal/atexit/excepthook caller.

    Args:
        reason: Short exit reason, e.g. "SIGTERM", "NORMAL_EXIT" or
            "CRASH:<ExcType>:<message>".
        exit_code: Process exit status recorded with the event.
        signal_num: Number of the signal that triggered the exit, or None
            when the exit was not signal-driven (stored as 0 in ClickHouse).
    """
    global _exit_logged
    # Dedup: a signal-driven exit reaches here twice (signal handler, then
    # the atexit callback); only the first call does any work.
    if _exit_logged:
        return
    _exit_logged = True

    event = {
        "service": _service_name,
        "reason": reason,
        "exit_code": exit_code,
        "signal": signal_num,
        "pid": os.getpid(),
        "ts": _ts_iso(),
    }

    # 1. File log (no external service involved)
    log.warning("SERVICE_EXIT: %s reason=%s exit_code=%d signal=%s pid=%d",
                _service_name, reason, exit_code, signal_num, os.getpid())

    # 2. ClickHouse (best-effort)
    try:
        from ch_writer import ch_put
        ch_put("service_lifecycle", {
            "ts": int(time.time() * 1e6),  # epoch time in microseconds
            "service": _service_name,
            "event": "EXIT",
            "reason": reason,
            "exit_code": exit_code,
            "signal_num": signal_num or 0,
            "pid": os.getpid(),
        })
        # Give the CH writer thread a moment to flush
        time.sleep(0.3)
    except Exception:
        # Best-effort channel: keep going, but leave a trace for debugging.
        log.debug("SERVICE_EXIT: ClickHouse write failed", exc_info=True)

    # 3. Hazelcast (best-effort)
    try:
        import json
        import hazelcast
        # NOTE(review): the constructor may block while retrying the initial
        # cluster connection if Hazelcast is down; consider bounding it
        # (e.g. cluster_connect_timeout) — confirm against the client
        # version in use.
        client = hazelcast.HazelcastClient(
            cluster_name="dolphin",
            cluster_members=["localhost:5701"],
            connection_timeout=2.0,
        )
        try:
            lifecycle_map = client.get_map("DOLPHIN_SERVICE_LIFECYCLE").blocking()
            lifecycle_map.put(_service_name, json.dumps(event), ttl=3600)
        finally:
            # Always release the client, even when get_map()/put() fails,
            # so its background threads cannot outlive the exit path.
            client.shutdown()
    except Exception:
        log.debug("SERVICE_EXIT: Hazelcast write failed", exc_info=True)

    # 4. Run registered callbacks; one failing callback must not stop the rest
    for cb in _on_exit_callbacks:
        try:
            cb(event)
        except Exception:
            log.debug("SERVICE_EXIT: on_exit callback %r failed", cb,
                      exc_info=True)
|
||||
|
||||
|
||||
def _signal_handler(signum, frame):
    """Handle a termination signal (installed for SIGTERM, SIGINT, SIGHUP).

    Logs the exit through _log_exit() and then terminates the process via
    sys.exit() with the conventional ``128 + signum`` status.

    Args:
        signum: Number of the received signal.
        frame: Current stack frame supplied by the signal machinery (unused).

    Raises:
        SystemExit: always, carrying exit status ``128 + signum``.
    """
    # Resolve the name through the stdlib enum instead of a hand-written
    # table: building a table that mentions signal.SIGHUP would itself raise
    # AttributeError on platforms where that signal is not defined.
    try:
        sig_name = signal.Signals(signum).name
    except ValueError:
        # Number not known to the signal module: same fallback as before.
        sig_name = f"SIG{signum}"

    exit_code = 128 + signum
    _log_exit(reason=sig_name, exit_code=exit_code, signal_num=signum)
    sys.exit(exit_code)
|
||||
|
||||
|
||||
def _atexit_handler():
    """atexit callback: report how the interpreter is shutting down.

    If an exception other than SystemExit is visible through
    sys.exc_info(), a ``CRASH:<type>:<value>`` exit with code 1 is logged.
    Otherwise ``NORMAL_EXIT`` is logged, using the SystemExit code when one
    is visible and 0 when not.

    NOTE(review): atexit callbacks appear to run outside any ``except``
    block, so sys.exc_info() is presumably always empty here and this would
    always log NORMAL_EXIT with code 0 (crashes being reported by
    _excepthook instead) — confirm before relying on the crash branch.
    """
    err_cls, err, _tb = sys.exc_info()

    crashed = err_cls is not None and err_cls is not SystemExit
    if crashed:
        _log_exit(reason=f"CRASH:{err_cls.__name__}:{err}", exit_code=1)
        return

    status = 0
    if err_cls is SystemExit and err is not None:
        # A missing or falsy code (None, 0, "") is reported as 0.
        status = getattr(err, 'code', 0) or 0
    _log_exit(reason="NORMAL_EXIT", exit_code=status)
|
||||
|
||||
|
||||
def _excepthook(exc_type, exc_val, exc_tb):
    """sys.excepthook replacement: record a crash, then defer to the default.

    Writes the formatted traceback to the module logger, logs a
    ``CRASH:<type>:<value>`` exit event with exit code 1, and finally calls
    sys.__excepthook__ with the same arguments.

    Args:
        exc_type: Class of the unhandled exception.
        exc_val: The unhandled exception instance.
        exc_tb: Traceback object of the unhandled exception.
    """
    crash_reason = f"CRASH:{exc_type.__name__}:{exc_val}"

    tb_lines = traceback.format_exception(exc_type, exc_val, exc_tb)
    tb_text = "".join(tb_lines)
    log.error("Unhandled exception in %s:\n%s", _service_name, tb_text)

    _log_exit(reason=crash_reason, exit_code=1)

    # Hand over to the interpreter's original hook.
    sys.__excepthook__(exc_type, exc_val, exc_tb)
|
||||
|
||||
|
||||
def install_exit_handler(service_name: str, on_exit=None):
    """Install exit handler for a Dolphin service.

    Records the service name, registers the signal handlers, the atexit
    callback and the global excepthook, then logs a START event to the
    module logger and (best-effort) to ClickHouse.

    Args:
        service_name: e.g. "meta_health", "nautilus_trader", "exf_fetcher"
        on_exit: optional callback(event_dict) for service-specific cleanup
    """
    global _service_name
    _service_name = service_name

    if on_exit:
        _on_exit_callbacks.append(on_exit)

    # Register signal handlers. Signals are looked up by name so a platform
    # that lacks one of them (SIGHUP is not defined everywhere) skips it
    # instead of raising AttributeError before the atexit/excepthook
    # registration below has happened.
    for sig_attr in ("SIGTERM", "SIGINT", "SIGHUP"):
        sig = getattr(signal, sig_attr, None)
        if sig is None:
            continue
        try:
            signal.signal(sig, _signal_handler)
        except (OSError, ValueError):
            # can't set handler in non-main thread
            log.debug("Could not install handler for %s", sig_attr,
                      exc_info=True)

    # Register atexit
    atexit.register(_atexit_handler)

    # Global excepthook for unhandled exceptions
    sys.excepthook = _excepthook

    log.info("SERVICE_START: %s pid=%d", service_name, os.getpid())

    # Log startup to CH (best-effort: a failure must not block service start)
    try:
        from ch_writer import ch_put
        ch_put("service_lifecycle", {
            "ts": int(time.time() * 1e6),  # epoch time in microseconds
            "service": service_name,
            "event": "START",
            "reason": "NORMAL_START",
            "exit_code": 0,
            "signal_num": 0,
            "pid": os.getpid(),
        })
    except Exception:
        log.debug("SERVICE_START: ClickHouse write failed", exc_info=True)
|
||||
Reference in New Issue
Block a user