# Source: DOLPHIN/prod/dolphin_exit_handler.py (Python, 171 lines, 4.9 KiB)

"""Dolphin Graceful Exit Handler.

Registers signal handlers (SIGTERM, SIGINT, SIGHUP), an atexit callback and a
sys.excepthook to log service lifecycle events to all channels: file log,
ClickHouse, and Hazelcast.

Usage in any service:

    from dolphin_exit_handler import install_exit_handler
    install_exit_handler("meta_health")  # call once at startup

The hook mechanism uses the stdlib only (atexit, signal). This is an
industry-standard pattern — no third-party dependency is needed for it.
The ClickHouse and Hazelcast channels are best-effort: their modules are
imported lazily and any failure there is ignored.
"""
import atexit
import logging
import os
import signal
import sys
import time
import traceback
from datetime import datetime, timezone
# Module logger; every lifecycle message emitted by this file goes through it.
log = logging.getLogger("dolphin.exit")
# State for dedup — avoid double-logging on atexit + signal.
# Flipped to True by the first _log_exit() call; later calls return at once.
_exit_logged: bool = False
# Service name stamped on every lifecycle event; set by install_exit_handler().
_service_name: str = "unknown"
# Callables invoked with the exit-event dict at the end of _log_exit().
_on_exit_callbacks: list = []
def _ts_iso():
    """Return the current time in UTC as a timezone-aware ISO-8601 string."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.isoformat()
def _push_exit_to_clickhouse(event):
    """Write an EXIT row to the ``service_lifecycle`` table. Never raises."""
    try:
        from ch_writer import ch_put
        ch_put("service_lifecycle", {
            "ts": int(time.time() * 1e6),  # microseconds since the epoch
            "service": event["service"],
            "event": "EXIT",
            "reason": event["reason"],
            "exit_code": event["exit_code"],
            "signal_num": event["signal"] or 0,  # None (no signal) -> 0
            "pid": event["pid"],
        })
        # Give the CH writer thread a moment to flush
        # (presumably ch_put only queues the row — confirm against ch_writer).
        time.sleep(0.3)
    except Exception:
        # Best-effort channel: a missing module or failed write must not
        # block shutdown, but leave a trace for debugging.
        log.debug("SERVICE_EXIT: ClickHouse write failed", exc_info=True)


def _push_exit_to_hazelcast(event):
    """Publish the exit event to the lifecycle Hazelcast map. Never raises."""
    try:
        import json
        import hazelcast
        client = hazelcast.HazelcastClient(
            cluster_name="dolphin",
            cluster_members=["localhost:5701"],
            connection_timeout=2.0,
            # Bound the overall connect/retry phase as well, so an
            # unreachable cluster cannot stall process exit.
            # NOTE(review): the client's default for this may be much longer
            # than connection_timeout — confirm for the installed version.
            cluster_connect_timeout=2.0,
        )
        try:
            lifecycle_map = client.get_map("DOLPHIN_SERVICE_LIFECYCLE").blocking()
            # Keyed by service name; the entry expires after one hour.
            lifecycle_map.put(event["service"], json.dumps(event), ttl=3600)
        finally:
            # Always release the client, even when the put fails.
            client.shutdown()
    except Exception:
        log.debug("SERVICE_EXIT: Hazelcast write failed", exc_info=True)


def _log_exit(reason: str, exit_code: int = 0, signal_num: int = None):
    """Log exit event to all channels. Fire-and-forget — never raises.

    Only the first call does anything; later calls return immediately so an
    exit seen by both a signal handler and atexit is reported once.

    Args:
        reason: short exit reason, e.g. "SIGTERM", "NORMAL_EXIT", "CRASH:...".
        exit_code: exit status to record.
        signal_num: number of the signal that caused the exit, or None.
    """
    global _exit_logged
    if _exit_logged:
        return
    _exit_logged = True

    pid = os.getpid()
    event = {
        "service": _service_name,
        "reason": reason,
        "exit_code": exit_code,
        "signal": signal_num,
        "pid": pid,
        "ts": _ts_iso(),
    }

    # 1. File log (always works)
    log.warning("SERVICE_EXIT: %s reason=%s exit_code=%d signal=%s pid=%d",
                _service_name, reason, exit_code, signal_num, pid)

    # 2. ClickHouse (best-effort)
    _push_exit_to_clickhouse(event)

    # 3. Hazelcast (best-effort)
    _push_exit_to_hazelcast(event)

    # 4. Run registered callbacks; one failing callback must not stop the rest.
    for cb in _on_exit_callbacks:
        try:
            cb(event)
        except Exception:
            log.debug("SERVICE_EXIT: on_exit callback %r failed", cb,
                      exc_info=True)
def _signal_handler(signum, frame):
    """Handle SIGTERM, SIGINT, SIGHUP: log the exit, then terminate.

    Args:
        signum: number of the delivered signal.
        frame: interrupted stack frame (unused; part of the handler signature).

    Raises:
        SystemExit: always, with code ``128 + signum``.
    """
    # Look the name up in the Signals enum rather than a hand-built table:
    # a table keyed by signal.SIGHUP raises AttributeError on platforms that
    # do not define that signal, and the enum also names any other signal.
    try:
        sig_name = signal.Signals(signum).name
    except ValueError:
        # Number not known to this platform's enum.
        sig_name = f"SIG{signum}"
    exit_code = 128 + signum
    _log_exit(reason=sig_name, exit_code=exit_code, signal_num=signum)
    sys.exit(exit_code)
def _atexit_handler():
    """atexit callback: report the process exit if nothing has logged it yet.

    An in-flight exception other than SystemExit is reported as a crash with
    exit code 1. Otherwise the exit is reported as NORMAL_EXIT, using the
    SystemExit code when one is visible.

    NOTE(review): atexit callbacks normally run after the terminating
    exception has been cleared, so sys.exc_info() is expected to be empty
    here and this would report NORMAL_EXIT with exit_code 0 — confirm whether
    the crash / SystemExit branches are reachable in practice.
    """
    exc_type, exc_val, _ = sys.exc_info()

    if exc_type is not None and exc_type is not SystemExit:
        _log_exit(reason=f"CRASH:{exc_type.__name__}:{exc_val}", exit_code=1)
        return

    code = 0
    if exc_type is SystemExit and exc_val is not None:
        raw_code = getattr(exc_val, 'code', 0)
        if isinstance(raw_code, int):
            code = raw_code
        elif raw_code is not None:
            # sys.exit("message") carries a non-int code; the interpreter
            # exits with status 1 in that case, so record 1 rather than
            # forwarding the message object as an exit code.
            code = 1
    _log_exit(reason="NORMAL_EXIT", exit_code=code)
def _excepthook(exc_type, exc_val, exc_tb):
    """Global exception hook — log crash before Python dies.

    Writes the formatted traceback to the module logger, records a CRASH exit
    event with exit code 1, then delegates to the interpreter's original hook.
    """
    reason = f"CRASH:{exc_type.__name__}:{exc_val}"
    exc_triple = (exc_type, exc_val, exc_tb)
    tb_lines = traceback.format_exception(*exc_triple)
    log.error("Unhandled exception in %s:\n%s", _service_name, "".join(tb_lines))
    _log_exit(reason, 1)
    # Still print the traceback the way Python would have without this hook.
    sys.__excepthook__(*exc_triple)
def install_exit_handler(service_name: str, on_exit=None):
    """Install exit handler for a Dolphin service.

    Records the service name, hooks SIGTERM/SIGINT/SIGHUP, registers the
    atexit callback, replaces ``sys.excepthook`` and logs a START event to
    the file log and (best-effort) ClickHouse. Intended to be called once at
    startup.

    Args:
        service_name: e.g. "meta_health", "nautilus_trader", "exf_fetcher"
        on_exit: optional callback(event_dict) for service-specific cleanup
    """
    global _service_name
    _service_name = service_name
    if on_exit is not None:
        _on_exit_callbacks.append(on_exit)

    # Register signal handlers. Resolve each signal by name: not every
    # platform defines all three (SIGHUP is missing on Windows), and a direct
    # attribute reference would raise AttributeError there.
    for sig_attr in ("SIGTERM", "SIGINT", "SIGHUP"):
        sig = getattr(signal, sig_attr, None)
        if sig is None:
            continue
        try:
            signal.signal(sig, _signal_handler)
        except (OSError, ValueError) as exc:
            # can't set handler in non-main thread
            log.warning("SERVICE_START: %s could not install %s handler: %s",
                        service_name, sig_attr, exc)

    # Register atexit
    atexit.register(_atexit_handler)

    # Global excepthook for unhandled exceptions
    sys.excepthook = _excepthook

    pid = os.getpid()
    log.info("SERVICE_START: %s pid=%d", service_name, pid)

    # Log startup to CH (best-effort: never let this break service startup)
    try:
        from ch_writer import ch_put
        ch_put("service_lifecycle", {
            "ts": int(time.time() * 1e6),  # microseconds since the epoch
            "service": service_name,
            "event": "START",
            "reason": "NORMAL_START",
            "exit_code": 0,
            "signal_num": 0,
            "pid": pid,
        })
    except Exception:
        log.debug("SERVICE_START: ClickHouse write failed", exc_info=True)