Files
DOLPHIN/prod/services/service_base.py

332 lines
10 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Dolphin Service Base Class - Boilerplate for reliable userland services
Features:
- Automatic retries with exponential backoff
- Structured logging to journal
- Periodic background health checks (override health_check() to customize)
- Graceful shutdown on signals
- Systemd notify support (Type=notify)
- Memory/CPU monitoring
"""
import abc
import asyncio
import inspect
import json
import logging
import os
import signal
import sys
import time
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from functools import wraps
from typing import Any, Callable, Optional
# Optional third-party dependencies. Each one is probed at import time and a
# module-level flag records whether it can be used, so the rest of the module
# can degrade gracefully instead of failing to import.
try:
    from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
except ImportError:
    TENACITY_AVAILABLE = False
else:
    TENACITY_AVAILABLE = True

try:
    from pystemd.daemon import notify, Notification
except ImportError:
    SYSTEMD_AVAILABLE = False

    def notify(*args, **kwargs):
        """No-op stand-in used when the systemd bindings cannot be imported."""
        return None
else:
    SYSTEMD_AVAILABLE = True
# Configure logging for systemd journal
class JournalHandler(logging.Handler):
    """Log handler that prints one JSON object per record to stdout.

    Each line carries ``timestamp``, ``level``, ``logger``, ``message``,
    ``source`` and ``service``. ``source`` and ``service`` are read from
    attributes of the record (e.g. supplied through ``extra=``) and fall
    back to ``'unknown'`` when the record does not carry them.
    """

    def emit(self, record):
        """Serialize *record* to JSON and write it to stdout.

        Any failure is routed to ``handleError`` so that logging never
        raises into the caller.
        """
        try:
            # datetime.utcnow() is deprecated; take an aware UTC "now" and
            # drop the tzinfo so the emitted string keeps the exact naive
            # ISO format existing log consumers already parse.
            now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
            msg = {
                'timestamp': now_utc.isoformat(),
                'level': record.levelname,
                'logger': record.name,
                'message': self.format(record),
                'source': getattr(record, 'source', 'unknown'),
                'service': getattr(record, 'service', 'unknown'),
            }
            # default=str: 'source'/'service' come from caller-supplied
            # extras; stringify anything that is not JSON-native rather than
            # dropping the whole log line.
            print(json.dumps(msg, default=str), flush=True)
        except Exception:
            self.handleError(record)
def get_logger(name: str) -> logging.Logger:
    """Return the logger called *name*, wiring it for JSON output on first use.

    A logger that already has at least one handler is returned untouched.
    Otherwise a ``JournalHandler`` with a bare ``%(message)s`` formatter is
    attached and the logger level is set to INFO.
    """
    log = logging.getLogger(name)
    if log.handlers:
        # Already configured (by an earlier call or by the application).
        return log

    journal = JournalHandler()
    journal.setFormatter(logging.Formatter('%(message)s'))
    log.addHandler(journal)
    log.setLevel(logging.INFO)
    return log
@dataclass
class ServiceHealth:
    """Snapshot of a service's health at one point in time."""

    # One of 'starting', 'healthy', 'degraded', 'unhealthy'
    status: str
    # time.time() value taken when the snapshot was produced
    last_check: float
    # Seconds elapsed since the service object was created
    uptime: float
    # Resident set size of the process, in MiB
    memory_mb: float
    # CPU utilisation of the process, in percent
    cpu_percent: float
    # Number of errors counted so far
    error_count: int
    # Human-readable explanation of the current status
    message: str

    def to_json(self) -> str:
        """Serialize every field of the snapshot into a JSON object string."""
        payload = asdict(self)
        return json.dumps(payload)
class ServiceBase(abc.ABC):
    """
    Base class for reliable Dolphin services.

    A subclass implements ``run_cycle``. ``run`` then starts an event loop,
    calls ``run_cycle`` repeatedly, runs a periodic health check next to it
    and stops both when SIGTERM/SIGINT is received.

    Usage:
        class MyService(ServiceBase):
            def __init__(self):
                super().__init__("my-service", check_interval=30)

            async def run_cycle(self):
                # Your service logic here
                pass

        if __name__ == '__main__':
            service = MyService()
            service.run()
    """

    def __init__(
        self,
        name: str,
        check_interval: float = 30.0,
        max_retries: int = 3,
        notify_systemd: bool = True
    ):
        """
        Args:
            name: Service name; used in log messages and in the logger name
                ``dolphin.<name>``.
            check_interval: Seconds between two health checks.
            max_retries: Default attempt limit for ``retry_with_backoff``.
            notify_systemd: Send READY/WATCHDOG notifications. Forced to
                False when the systemd bindings are not importable.
        """
        self.name = name
        self.check_interval = check_interval
        self.max_retries = max_retries
        self.notify_systemd = notify_systemd and SYSTEMD_AVAILABLE

        # Context stamped onto every record of this logger. Attributes set on
        # the logger object itself never reach the handler, so a filter copies
        # them onto each record instead.
        self._log_context = {'service': name}
        self.logger = get_logger(f'dolphin.{name}')
        self.logger.service = name  # kept for code that reads it off the logger
        self.logger.addFilter(self._inject_log_context)

        self._shutdown_event = asyncio.Event()
        self._shutdown_task = None
        self._process = None  # cached psutil.Process, created lazily
        self._start_time = time.time()
        self._health = ServiceHealth(
            status='starting',
            last_check=time.time(),
            uptime=0.0,
            memory_mb=0.0,
            cpu_percent=0.0,
            error_count=0,
            message='Initializing'
        )
        self._tasks = []
        # Signal handlers are installed by run(), once the event loop that
        # will actually execute the service is running.

    def _inject_log_context(self, record) -> bool:
        """Logging filter: copy the service context onto *record*.

        Values already present on the record (e.g. passed via ``extra=``)
        are left alone. Always returns True so no record is dropped.
        """
        for key, value in self._log_context.items():
            record.__dict__.setdefault(key, value)
        return True

    def _setup_signals(self):
        """Install SIGTERM/SIGINT handlers on the running event loop.

        Must be called from inside the running loop. Platforms or threads
        that cannot register loop signal handlers only produce a warning.
        """
        loop = asyncio.get_running_loop()
        for sig in (signal.SIGTERM, signal.SIGINT):
            try:
                loop.add_signal_handler(sig, self._on_signal)
            except (NotImplementedError, RuntimeError, ValueError) as exc:
                self.logger.warning(
                    f"{self.name}: Cannot install handler for {sig!r}: {exc}"
                )

    def _on_signal(self):
        """Signal callback: start the graceful shutdown coroutine."""
        # Keep a reference so the task cannot be garbage-collected mid-flight.
        self._shutdown_task = asyncio.create_task(self._shutdown())

    async def _shutdown(self):
        """Graceful shutdown: set the shutdown event and cancel all tasks."""
        self.logger.warning(f"{self.name}: Shutdown signal received")
        self._shutdown_event.set()
        # Cancel all tasks
        for task in self._tasks:
            if not task.done():
                task.cancel()
        # Give tasks time to cleanup
        await asyncio.sleep(0.5)

    def _get_process(self):
        """Return a cached ``psutil.Process`` for this process, or None.

        None is returned when psutil is not installed. The object is cached
        because ``cpu_percent()`` measures usage since the previous call on
        the *same* object; a fresh object would always report 0.0.
        """
        process = getattr(self, '_process', None)
        if process is None:
            try:
                import psutil
            except ImportError:
                return None
            process = self._process = psutil.Process()
        return process

    def _update_health(self, status: str, message: str = ''):
        """Replace the health snapshot with a fresh one.

        Memory and CPU figures are best-effort: they stay at 0.0 when psutil
        is unavailable or sampling fails, so this method never raises because
        of resource monitoring.

        Args:
            status: New status string ('healthy', 'degraded', 'unhealthy').
            message: Human-readable explanation for the status.
        """
        memory_mb = 0.0
        cpu_percent = 0.0
        try:
            process = self._get_process()
            if process is not None:
                memory_mb = process.memory_info().rss / 1024 / 1024
                cpu_percent = process.cpu_percent()
        except Exception as exc:
            self.logger.debug(f"Resource sampling failed: {exc}")

        self._health = ServiceHealth(
            status=status,
            last_check=time.time(),
            uptime=time.time() - self._start_time,
            memory_mb=memory_mb,
            cpu_percent=cpu_percent,
            error_count=self._health.error_count,
            message=message
        )

    def _log_extra(self, **kwargs):
        """Add extra context to logs.

        The key/value pairs are stamped onto every subsequent record of this
        service's logger. They are also set as attributes on the logger
        object, as before.
        """
        self._log_context.update(kwargs)
        for key, value in kwargs.items():
            setattr(self.logger, key, value)

    def retry_with_backoff(self, func: Callable, **kwargs):
        """Wrap *func* with retry logic (exponential backoff, 4s to 60s).

        Args:
            func: Callable to wrap.
            **kwargs: ``max_retries`` overrides the instance default.

        Returns:
            The wrapped callable, or *func* unchanged when tenacity is not
            installed.
        """
        if not TENACITY_AVAILABLE:
            return func
        retry_kwargs = {
            'stop': stop_after_attempt(kwargs.get('max_retries', self.max_retries)),
            'wait': wait_exponential(multiplier=1, min=4, max=60),
            'retry': retry_if_exception_type((Exception,)),
            'before_sleep': lambda retry_state: self.logger.warning(
                f"Retry {retry_state.attempt_number}: {retry_state.outcome.exception()}"
            )
        }
        return retry(**retry_kwargs)(func)

    @abc.abstractmethod
    async def run_cycle(self):
        """
        Main service logic - implement this!

        Called repeatedly in the main loop.
        Should be non-blocking or use asyncio.
        """
        pass

    async def health_check(self) -> bool:
        """
        Optional: Implement custom health check.

        Return True if healthy, False otherwise.
        """
        return True

    async def _health_loop(self):
        """Background health check loop; runs until shutdown is requested."""
        while not self._shutdown_event.is_set():
            try:
                healthy = await self.health_check()
                if healthy:
                    self._update_health('healthy', 'Service operating normally')
                else:
                    self._update_health('degraded', 'Health check failed')
                # Notify systemd we're still alive
                if self.notify_systemd:
                    notify(Notification.WATCHDOG)
            except Exception as e:
                self._health.error_count += 1
                self._update_health('unhealthy', str(e))
                self.logger.error(f"Health check error: {e}")

            # Sleep for check_interval, but wake up at once on shutdown.
            try:
                await asyncio.wait_for(
                    self._shutdown_event.wait(),
                    timeout=self.check_interval
                )
            except asyncio.TimeoutError:
                pass  # Normal - continue loop

    async def _main_loop(self):
        """Main service loop: call ``run_cycle`` until shutdown is requested."""
        self.logger.info(f"{self.name}: Starting main loop")
        while not self._shutdown_event.is_set():
            try:
                await self.run_cycle()
                # Yield to the event loop once per cycle: a run_cycle that
                # never suspends would otherwise starve the health loop and
                # the signal handlers.
                await asyncio.sleep(0)
            except asyncio.CancelledError:
                break
            except Exception as e:
                self._health.error_count += 1
                self.logger.error(f"Cycle error: {e}", exc_info=True)
                # Brief pause before retry
                await asyncio.sleep(1)

    async def _serve(self):
        """Start the background tasks and wait for the shutdown event."""
        # Re-create the event inside the running loop: on Python < 3.10 an
        # Event is bound to whichever loop was current when it was built.
        self._shutdown_event = asyncio.Event()
        self._setup_signals()

        # Notify systemd we're ready
        if self.notify_systemd:
            notify(Notification.READY)
            self.logger.info("Notified systemd: READY")

        self._tasks.append(asyncio.create_task(self._health_loop()))
        self._tasks.append(asyncio.create_task(self._main_loop()))

        try:
            # Run until shutdown
            await self._shutdown_event.wait()
        finally:
            self.logger.info(f"{self.name}: Service stopping")
            for task in self._tasks:
                if not task.done():
                    task.cancel()
            # Wait for cleanup
            if self._tasks:
                await asyncio.gather(*self._tasks, return_exceptions=True)
            # Finished tasks belong to this loop; drop them so run() can be
            # called again with a fresh loop.
            self._tasks.clear()

    def run(self):
        """Run the service (blocking).

        Starts a new event loop, runs the health and main loops in it and
        returns once shutdown has been requested and both loops are done.
        """
        self.logger.info(f"{self.name}: Service starting")
        try:
            # Tasks can only be created while a loop is running, so all
            # startup work happens inside the coroutine.
            asyncio.run(self._serve())
        except KeyboardInterrupt:
            pass
        self.logger.info(f"{self.name}: Service stopped")
def run_scheduled(
    func: Callable,
    interval_seconds: float,
    name: str = 'scheduled-task'
):
    """
    Run a function on a schedule (simple alternative to full service).

    Blocks the calling thread and invokes *func* forever, aiming for one
    start every ``interval_seconds``. Returns only when interrupted with
    Ctrl-C (KeyboardInterrupt).

    Args:
        func: Zero-argument callable. It may be a plain function or anything
            that returns an awaitable when called (``async def`` function,
            ``functools.partial`` of one, callable object); awaitables are
            awaited before the run counts as finished.
        interval_seconds: Target period between two starts. When a run
            takes longer than this, the next one starts immediately.
        name: Label used in the logger name and in log messages.

    Usage:
        def my_task():
            print("Running...")

        run_scheduled(my_task, interval_seconds=60, name='my-task')
    """
    logger = get_logger(f'dolphin.scheduled.{name}')
    logger.info(f"Starting scheduled task: {name} (interval: {interval_seconds}s)")

    async def loop():
        while True:
            try:
                # Monotonic clock: wall-clock adjustments must not distort
                # the measured run time and hence the sleep duration.
                start = time.monotonic()
                result = func()
                # Inspect the result rather than the callable so wrapped
                # coroutine functions are awaited as well.
                if inspect.isawaitable(result):
                    await result
                elapsed = time.monotonic() - start
                logger.info(f"Task completed in {elapsed:.2f}s")
                # Sleep remaining time
                sleep_time = max(0, interval_seconds - elapsed)
                await asyncio.sleep(sleep_time)
            except Exception as e:
                # A failed run is logged and retried after a full interval.
                logger.error(f"Task error: {e}", exc_info=True)
                await asyncio.sleep(interval_seconds)

    try:
        asyncio.run(loop())
    except KeyboardInterrupt:
        logger.info("Stopped by user")
# Names exported by ``from <module> import *``.
__all__ = [
    # Service framework
    'ServiceBase',
    'ServiceHealth',
    # Logging helpers
    'get_logger',
    'JournalHandler',
    # Lightweight scheduler
    'run_scheduled',
    # Optional-dependency integration
    'notify',
    'SYSTEMD_AVAILABLE',
    'TENACITY_AVAILABLE',
]