171 lines
4.9 KiB
Python
171 lines
4.9 KiB
Python
|
|
"""Dolphin Graceful Exit Handler.
|
||
|
|
|
||
|
|
Registers signal handlers (SIGTERM, SIGINT, SIGHUP) and atexit callbacks
|
||
|
|
to log service lifecycle events to all channels: file log, ClickHouse,
|
||
|
|
and Hazelcast.
|
||
|
|
|
||
|
|
Usage in any service:
|
||
|
|
from dolphin_exit_handler import install_exit_handler
|
||
|
|
install_exit_handler("meta_health") # call once at startup
|
||
|
|
|
||
|
|
Uses stdlib only (atexit, signal). Industry-standard pattern —
|
||
|
|
no third-party dependency needed for this.
|
||
|
|
"""
|
||
|
|
import atexit
|
||
|
|
import logging
|
||
|
|
import os
|
||
|
|
import signal
|
||
|
|
import sys
|
||
|
|
import time
|
||
|
|
import traceback
|
||
|
|
from datetime import datetime, timezone
|
||
|
|
|
||
|
|
# Dedicated logger name so lifecycle records can be filtered/routed on their own.
log = logging.getLogger("dolphin.exit")

# --- Module-level state shared by the handlers in this file ---------------
# Set once the exit event has been emitted; guards against double-logging
# when both a signal handler and the atexit callback fire.
_exit_logged: bool = False
# Service label attached to every event; overwritten by install_exit_handler().
_service_name: str = "unknown"
# Service-specific cleanup hooks; each one is called with the exit event dict.
_on_exit_callbacks: list = []
|
||
|
|
|
||
|
|
|
||
|
|
def _ts_iso():
    """Return the current time in UTC as a timezone-aware ISO-8601 string."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.isoformat()
|
||
|
|
|
||
|
|
|
||
|
|
def _log_exit(reason: str, exit_code: int = 0, signal_num: int = None):
    """Emit one service-exit event to every channel (best-effort).

    Channels, in order: the ``dolphin.exit`` logger, ClickHouse through
    ``ch_writer.ch_put``, the Hazelcast map ``DOLPHIN_SERVICE_LIFECYCLE``,
    and finally every callback registered via ``install_exit_handler``.

    Only the first call per process does anything; later calls return
    immediately so that a signal handler followed by the atexit callback
    does not log the exit twice.

    Failures in the ClickHouse, Hazelcast and callback steps are caught
    (``Exception`` only) and reported at DEBUG level instead of propagating.

    Args:
        reason: Short label for why the process is exiting
            (e.g. ``"SIGTERM"``, ``"NORMAL_EXIT"``, ``"CRASH:..."``).
        exit_code: Process exit status to record.
        signal_num: Number of the terminating signal, or ``None`` when the
            exit was not caused by a signal (stored as ``0`` in ClickHouse).
    """
    global _exit_logged
    if _exit_logged:
        return
    # Flip the flag before doing any work so a re-entrant call is a no-op.
    _exit_logged = True

    pid = os.getpid()
    event = {
        "service": _service_name,
        "reason": reason,
        "exit_code": exit_code,
        "signal": signal_num,
        "pid": pid,
        "ts": _ts_iso(),
    }

    # 1. File log (always works).
    # exit_code uses %s rather than %d so a non-integer code cannot break
    # the formatting of the one record that is always expected to land.
    log.warning("SERVICE_EXIT: %s reason=%s exit_code=%s signal=%s pid=%d",
                _service_name, reason, exit_code, signal_num, pid)

    # 2. ClickHouse (best-effort)
    try:
        from ch_writer import ch_put
        ch_put("service_lifecycle", {
            "ts": int(time.time() * 1e6),  # microseconds since the epoch
            "service": _service_name,
            "event": "EXIT",
            "reason": reason,
            "exit_code": exit_code,
            "signal_num": signal_num or 0,
            "pid": pid,
        })
        # Give the CH writer thread a moment to flush
        time.sleep(0.3)
    except Exception:
        log.debug("SERVICE_EXIT: ClickHouse write failed", exc_info=True)

    # 3. Hazelcast (best-effort)
    try:
        import json
        import hazelcast
        client = hazelcast.HazelcastClient(
            cluster_name="dolphin",
            cluster_members=["localhost:5701"],
            connection_timeout=2.0,
            # Bound the overall connect/retry phase so an unreachable
            # cluster cannot stall process exit.
            cluster_connect_timeout=5.0,
        )
        try:
            lifecycle_map = client.get_map("DOLPHIN_SERVICE_LIFECYCLE").blocking()
            lifecycle_map.put(_service_name, json.dumps(event), ttl=3600)
        finally:
            # Always release the client, even when the put fails.
            client.shutdown()
    except Exception:
        log.debug("SERVICE_EXIT: Hazelcast write failed", exc_info=True)

    # 4. Run registered callbacks.
    # Iterate over a snapshot so a callback that mutates the registry does
    # not disturb the loop; one failing callback must not block the others.
    for cb in list(_on_exit_callbacks):
        try:
            cb(event)
        except Exception:
            log.debug("SERVICE_EXIT: on_exit callback %r failed", cb,
                      exc_info=True)
|
||
|
|
|
||
|
|
|
||
|
|
def _signal_handler(signum, frame):
    """Handle SIGTERM, SIGINT and SIGHUP: log the exit, then terminate.

    The process exits through ``sys.exit(128 + signum)``, and the same
    value is recorded as the exit code of the logged event.

    Args:
        signum: Number of the signal that was delivered.
        frame: Current stack frame supplied by the ``signal`` module (unused).

    Raises:
        SystemExit: always, with code ``128 + signum``.
    """
    # Look the constants up with getattr: SIGHUP is not defined on every
    # platform, and an AttributeError here would abort the handler before
    # anything is logged.
    known_names = {}
    for name in ("SIGTERM", "SIGINT", "SIGHUP"):
        sig = getattr(signal, name, None)
        if sig is not None:
            known_names[sig] = name
    sig_name = known_names.get(signum, f"SIG{signum}")

    status = 128 + signum
    _log_exit(reason=sig_name, exit_code=status, signal_num=signum)
    sys.exit(status)
|
||
|
|
|
||
|
|
|
||
|
|
def _atexit_handler():
    """atexit callback: log the exit unless it has already been logged.

    If an exception other than ``SystemExit`` is being handled when this
    runs, the exit is logged as ``CRASH:<type>:<value>`` with exit code 1.
    Otherwise it is logged as ``NORMAL_EXIT`` with the ``SystemExit`` code
    when one is available, else 0.

    NOTE(review): ``sys.exc_info()`` only reports an exception that is
    currently being handled. Confirm whether it is ever populated while
    atexit callbacks run during interpreter shutdown; if it is not, this
    always takes the ``NORMAL_EXIT`` / code 0 path and crashes are only
    captured by the ``sys.excepthook`` replacement.
    """
    exc_type, exc_val, _ = sys.exc_info()

    # issubclass (not identity) so SystemExit subclasses count as exits,
    # not crashes.
    if exc_type is not None and not issubclass(exc_type, SystemExit):
        _log_exit(reason=f"CRASH:{exc_type.__name__}:{exc_val}", exit_code=1)
        return

    code = 0
    if exc_type is not None and exc_val is not None:
        raw_code = getattr(exc_val, "code", 0)
        if raw_code is None:
            code = 0
        elif isinstance(raw_code, int):
            code = int(raw_code)
        else:
            # sys.exit(<non-int object>) prints the object and exits with
            # status 1, so record 1 rather than the object itself.
            code = 1
    _log_exit(reason="NORMAL_EXIT", exit_code=code)
|
||
|
|
|
||
|
|
|
||
|
|
def _excepthook(exc_type, exc_val, exc_tb):
    """Replacement for ``sys.excepthook``: record the crash, then defer.

    Logs the full traceback at ERROR level, emits the exit event with
    reason ``CRASH:<type>:<value>`` and exit code 1, and finally hands the
    exception to the interpreter's original hook (``sys.__excepthook__``).

    Args:
        exc_type: Class of the unhandled exception.
        exc_val: The exception instance.
        exc_tb: Traceback object associated with the exception.
    """
    crash_reason = f"CRASH:{exc_type.__name__}:{exc_val}"

    tb_lines = traceback.format_exception(exc_type, exc_val, exc_tb)
    tb_text = "".join(tb_lines)
    log.error("Unhandled exception in %s:\n%s", _service_name, tb_text)

    _log_exit(reason=crash_reason, exit_code=1)

    # Keep the default behaviour (traceback on stderr) after our logging.
    sys.__excepthook__(exc_type, exc_val, exc_tb)
|
||
|
|
|
||
|
|
|
||
|
|
def install_exit_handler(service_name: str, on_exit=None):
    """Install exit handler for a Dolphin service.

    Stores the service name, registers the signal handlers, the atexit
    callback and the global excepthook, then logs a ``SERVICE_START``
    record to the file log and (best-effort) to ClickHouse.

    Args:
        service_name: e.g. "meta_health", "nautilus_trader", "exf_fetcher"
        on_exit: optional callback(event_dict) for service-specific cleanup
    """
    global _service_name
    _service_name = service_name

    if on_exit:
        _on_exit_callbacks.append(on_exit)

    # Register signal handlers.
    # Resolve each constant with getattr: SIGHUP is not defined on every
    # platform, and a bare attribute access would raise AttributeError
    # before any handler is installed.
    for sig_attr in ("SIGTERM", "SIGINT", "SIGHUP"):
        sig = getattr(signal, sig_attr, None)
        if sig is None:
            continue
        try:
            signal.signal(sig, _signal_handler)
        except (OSError, ValueError):
            # can't set handler in non-main thread
            log.debug("SERVICE_START: could not install handler for %s",
                      sig_attr, exc_info=True)

    # Register atexit
    atexit.register(_atexit_handler)

    # Global excepthook for unhandled exceptions
    sys.excepthook = _excepthook

    pid = os.getpid()
    log.info("SERVICE_START: %s pid=%d", service_name, pid)

    # Log startup to CH (best-effort: a missing or failing ch_writer must
    # not prevent the service from starting).
    try:
        from ch_writer import ch_put
        ch_put("service_lifecycle", {
            "ts": int(time.time() * 1e6),  # microseconds since the epoch
            "service": service_name,
            "event": "START",
            "reason": "NORMAL_START",
            "exit_code": 0,
            "signal_num": 0,
            "pid": pid,
        })
    except Exception:
        log.debug("SERVICE_START: ClickHouse write failed", exc_info=True)
|