"""Dolphin Graceful Exit Handler. Registers signal handlers (SIGTERM, SIGINT, SIGHUP) and atexit callbacks to log service lifecycle events to all channels: file log, ClickHouse, and Hazelcast. Usage in any service: from dolphin_exit_handler import install_exit_handler install_exit_handler("meta_health") # call once at startup Uses stdlib only (atexit, signal). Industry-standard pattern — no third-party dependency needed for this. """ import atexit import logging import os import signal import sys import time import traceback from datetime import datetime, timezone log = logging.getLogger("dolphin.exit") # State for dedup — avoid double-logging on atexit + signal _exit_logged = False _service_name = "unknown" _on_exit_callbacks = [] def _ts_iso(): return datetime.now(timezone.utc).isoformat() def _log_exit(reason: str, exit_code: int = 0, signal_num: int = None): """Log exit event to all channels. Fire-and-forget — never raises.""" global _exit_logged if _exit_logged: return _exit_logged = True event = { "service": _service_name, "reason": reason, "exit_code": exit_code, "signal": signal_num, "pid": os.getpid(), "ts": _ts_iso(), } # 1. File log (always works) log.warning("SERVICE_EXIT: %s reason=%s exit_code=%d signal=%s pid=%d", _service_name, reason, exit_code, signal_num, os.getpid()) # 2. ClickHouse (best-effort) try: from ch_writer import ch_put ch_put("service_lifecycle", { "ts": int(time.time() * 1e6), "service": _service_name, "event": "EXIT", "reason": reason, "exit_code": exit_code, "signal_num": signal_num or 0, "pid": os.getpid(), }) # Give the CH writer thread a moment to flush time.sleep(0.3) except Exception: pass # 3. Hazelcast (best-effort) try: import json import hazelcast client = hazelcast.HazelcastClient( cluster_name="dolphin", cluster_members=["localhost:5701"], connection_timeout=2.0, ) m = client.get_map("DOLPHIN_SERVICE_LIFECYCLE").blocking() m.put(_service_name, json.dumps(event), ttl=3600) client.shutdown() except Exception: pass # 4. Run registered callbacks for cb in _on_exit_callbacks: try: cb(event) except Exception: pass def _signal_handler(signum, frame): """Handle SIGTERM, SIGINT, SIGHUP.""" sig_name = { signal.SIGTERM: "SIGTERM", signal.SIGINT: "SIGINT", signal.SIGHUP: "SIGHUP", }.get(signum, f"SIG{signum}") _log_exit(reason=sig_name, exit_code=128 + signum, signal_num=signum) sys.exit(128 + signum) def _atexit_handler(): """Catch normal exit and unhandled exceptions.""" exc_type, exc_val, exc_tb = sys.exc_info() if exc_type is not None and exc_type is not SystemExit: reason = f"CRASH:{exc_type.__name__}:{exc_val}" _log_exit(reason=reason, exit_code=1) else: code = 0 if exc_type is SystemExit and exc_val is not None: code = getattr(exc_val, 'code', 0) or 0 _log_exit(reason="NORMAL_EXIT", exit_code=code) def _excepthook(exc_type, exc_val, exc_tb): """Global exception hook — log crash before Python dies.""" reason = f"CRASH:{exc_type.__name__}:{exc_val}" log.error("Unhandled exception in %s:\n%s", _service_name, "".join(traceback.format_exception(exc_type, exc_val, exc_tb))) _log_exit(reason=reason, exit_code=1) sys.__excepthook__(exc_type, exc_val, exc_tb) def install_exit_handler(service_name: str, on_exit=None): """Install exit handler for a Dolphin service. Args: service_name: e.g. "meta_health", "nautilus_trader", "exf_fetcher" on_exit: optional callback(event_dict) for service-specific cleanup """ global _service_name _service_name = service_name if on_exit: _on_exit_callbacks.append(on_exit) # Register signal handlers for sig in (signal.SIGTERM, signal.SIGINT, signal.SIGHUP): try: signal.signal(sig, _signal_handler) except (OSError, ValueError): pass # can't set handler in non-main thread # Register atexit atexit.register(_atexit_handler) # Global excepthook for unhandled exceptions sys.excepthook = _excepthook log.info("SERVICE_START: %s pid=%d", service_name, os.getpid()) # Log startup to CH try: from ch_writer import ch_put ch_put("service_lifecycle", { "ts": int(time.time() * 1e6), "service": service_name, "event": "START", "reason": "NORMAL_START", "exit_code": 0, "signal_num": 0, "pid": os.getpid(), }) except Exception: pass