332 lines
10 KiB
Python
332 lines
10 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Dolphin Service Base Class - Boilerplate for reliable userland services

Features:
- Automatic retries with exponential backoff
- Structured logging to journal
- Health check endpoints
- Graceful shutdown on signals
- Systemd notify support (Type=notify)
- Memory/CPU monitoring
|
||
|
|
"""
|
||
|
|
import abc
|
||
|
|
import asyncio
|
||
|
|
import logging
|
||
|
|
import signal
|
||
|
|
import sys
|
||
|
|
import os
|
||
|
|
import time
|
||
|
|
import json
|
||
|
|
from typing import Optional, Callable, Any
|
||
|
|
from dataclasses import dataclass, asdict
|
||
|
|
from datetime import datetime
|
||
|
|
from functools import wraps
|
||
|
|
|
||
|
|
# Optional imports - graceful degradation if not available
|
||
|
|
# tenacity supplies the retry/backoff primitives; record whether it imported.
try:
    from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
except ImportError:
    TENACITY_AVAILABLE = False
else:
    TENACITY_AVAILABLE = True

# pystemd supplies sd_notify support; without it ``notify`` becomes a no-op
# stub so call sites do not need to special-case the missing package.
try:
    from pystemd.daemon import notify, Notification
except ImportError:
    SYSTEMD_AVAILABLE = False

    def notify(*args, **kwargs):
        """Stand-in used when pystemd is missing: accept anything, do nothing."""
        return None
else:
    SYSTEMD_AVAILABLE = True
|
||
|
|
|
||
|
|
# Configure logging for systemd journal
|
||
|
|
class JournalHandler(logging.Handler):
    """Log handler that prints each record to stdout as one JSON object.

    Every line carries ``timestamp`` (naive UTC, ISO 8601), ``level``,
    ``logger``, ``message`` (the formatted record) plus ``source`` and
    ``service``, which are read from the record and default to
    ``'unknown'`` when the record does not carry them.
    """

    def emit(self, record):
        """Serialize *record* to JSON and print it, flushing immediately.

        Any failure while building or writing the line is routed to
        ``handleError`` instead of propagating into the logging caller.
        """
        # Function-scope import: the module header only imports ``datetime``.
        from datetime import timezone

        try:
            # datetime.utcnow() is deprecated since Python 3.12. Take an
            # aware UTC "now" and drop the tzinfo so the emitted string keeps
            # the exact naive-UTC format existing log consumers already parse.
            now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
            msg = {
                'timestamp': now_utc.isoformat(),
                'level': record.levelname,
                'logger': record.name,
                'message': self.format(record),
                'source': getattr(record, 'source', 'unknown'),
                'service': getattr(record, 'service', 'unknown'),
            }
            # ``source``/``service`` arrive via ``extra=`` and may be arbitrary
            # objects; stringify them rather than losing the whole log line
            # to a serialization TypeError.
            print(json.dumps(msg, default=str), flush=True)
        except Exception:
            self.handleError(record)
|
||
|
|
|
||
|
|
def get_logger(name: str) -> logging.Logger:
    """Return the logger called *name*, wiring up JSON output on first use.

    A logger that already has at least one handler is returned untouched.
    Otherwise a ``JournalHandler`` with a bare ``%(message)s`` formatter is
    attached and the logger level is set to INFO.
    """
    log = logging.getLogger(name)
    if log.handlers:
        # Already configured (by an earlier call or by the application).
        return log

    journal = JournalHandler()
    journal.setFormatter(logging.Formatter('%(message)s'))
    log.addHandler(journal)
    log.setLevel(logging.INFO)
    return log
|
||
|
|
|
||
|
|
@dataclass
class ServiceHealth:
    """Point-in-time health snapshot of a service."""

    # One of 'healthy', 'degraded', 'unhealthy'
    status: str
    # time.time() value at which this snapshot was taken
    last_check: float
    # Seconds elapsed since the service started
    uptime: float
    # Memory usage in MiB
    memory_mb: float
    # CPU usage as a percentage
    cpu_percent: float
    # Number of errors counted so far
    error_count: int
    # Free-form status detail
    message: str

    def to_json(self) -> str:
        """Serialize every field of the snapshot into a JSON object string."""
        snapshot = asdict(self)
        return json.dumps(snapshot)
|
||
|
|
|
||
|
|
class ServiceBase(abc.ABC):
    """
    Base class for reliable Dolphin services.

    Subclasses implement ``run_cycle()``. ``run()`` then drives two
    concurrent loops until the shutdown event is set (SIGTERM/SIGINT set it
    through ``_shutdown()``):

    - the main loop, which awaits ``run_cycle()`` over and over, and
    - the health loop, which calls ``health_check()`` every
      ``check_interval`` seconds, refreshes the ``ServiceHealth`` snapshot
      and pings the systemd watchdog when notifications are enabled.

    Usage:
        class MyService(ServiceBase):
            def __init__(self):
                super().__init__("my-service", check_interval=30)

            async def run_cycle(self):
                # Your service logic here
                pass

        if __name__ == '__main__':
            service = MyService()
            service.run()
    """

    def __init__(
        self,
        name: str,
        check_interval: float = 30.0,
        max_retries: int = 3,
        notify_systemd: bool = True
    ):
        """Store configuration and initial state; no event loop is touched.

        Args:
            name: Service name, used in log messages and as the logger
                suffix (``dolphin.<name>``).
            check_interval: Seconds between two health checks.
            max_retries: Default attempt limit for ``retry_with_backoff``.
            notify_systemd: Send READY/WATCHDOG notifications; forced off
                when systemd support could not be imported.
        """
        self.name = name
        self.check_interval = check_interval
        self.max_retries = max_retries
        self.notify_systemd = notify_systemd and SYSTEMD_AVAILABLE

        self.logger = get_logger(f'dolphin.{name}')
        # Kept for code that reads ``logger.service`` directly. Attributes on
        # a Logger are never copied onto LogRecords, so the filter below is
        # what actually makes the service name show up in the log output.
        self.logger.service = name
        self._log_context = {'service': name}
        self.logger.addFilter(self._inject_log_context)

        self._shutdown_event = asyncio.Event()
        self._shutdown_task = None
        self._start_time = time.time()
        self._health = ServiceHealth(
            status='starting',
            last_check=time.time(),
            uptime=0.0,
            memory_mb=0.0,
            cpu_percent=0.0,
            error_count=0,
            message='Initializing'
        )
        self._tasks = []
        # Lazily created psutil.Process handle (see _resource_probe).
        self._process = None

        # Signal handlers are installed by run(), on the loop that actually
        # executes the service. Registering them here needed an implicit
        # event loop, which asyncio no longer creates outside a running loop.

    def _inject_log_context(self, record) -> bool:
        """Logger filter: copy the service's log context onto *record*.

        Attributes the record already has (built-in fields or a per-call
        ``extra=``) are left alone. Always returns True, so no record is
        ever dropped by this filter.
        """
        for key, value in self._log_context.items():
            if not hasattr(record, key):
                setattr(record, key, value)
        return True

    def _setup_signals(self):
        """Route SIGTERM/SIGINT to a graceful shutdown.

        Must be called while the event loop is running (``run()`` does so).
        """
        loop = asyncio.get_running_loop()
        for sig in (signal.SIGTERM, signal.SIGINT):
            try:
                loop.add_signal_handler(sig, self._request_shutdown)
            except (NotImplementedError, RuntimeError) as exc:
                # Not available on this platform or outside the main thread;
                # Ctrl+C still stops run() through KeyboardInterrupt.
                self.logger.warning(
                    f"{self.name}: Cannot install handler for signal {sig}: {exc}"
                )

    def _request_shutdown(self):
        """Signal callback: start ``_shutdown()`` as a task on the loop."""
        # Keep a reference: the loop only holds weak references to tasks, so
        # an unreferenced shutdown task could be collected before it ends.
        self._shutdown_task = asyncio.create_task(self._shutdown())

    async def _shutdown(self):
        """Graceful shutdown: set the shutdown event and cancel the loops."""
        self.logger.warning(f"{self.name}: Shutdown signal received")
        self._shutdown_event.set()

        # Cancel all tasks
        for task in self._tasks:
            if not task.done():
                task.cancel()

        # Give tasks time to cleanup
        await asyncio.sleep(0.5)

    def _resource_probe(self):
        """Return a cached ``psutil.Process`` for this process, or None.

        None means psutil is not installed. The handle is cached because
        ``Process.cpu_percent()`` measures usage since the previous call on
        the same object; a fresh object per call always reports 0.0.
        """
        if self._process is None:
            try:
                import psutil
            except ImportError:
                return None
            self._process = psutil.Process()
        return self._process

    def _update_health(self, status: str, message: str = ''):
        """Replace the health snapshot with a fresh one.

        Memory and CPU figures come from psutil when it is installed and
        stay at 0.0 otherwise. The accumulated ``error_count`` is carried
        over from the previous snapshot.
        """
        memory_mb = 0.0
        cpu_percent = 0.0
        process = self._resource_probe()
        if process is not None:
            memory_mb = process.memory_info().rss / 1024 / 1024
            cpu_percent = process.cpu_percent()

        now = time.time()
        self._health = ServiceHealth(
            status=status,
            last_check=now,
            uptime=now - self._start_time,
            memory_mb=memory_mb,
            cpu_percent=cpu_percent,
            error_count=self._health.error_count,
            message=message
        )

    def _log_extra(self, **kwargs):
        """Add extra context to logs.

        Each keyword is attached to every later record logged through
        ``self.logger`` (unless the record already has that attribute) and,
        as before, also set as an attribute on the logger object itself.
        """
        self._log_context.update(kwargs)
        for key, value in kwargs.items():
            setattr(self.logger, key, value)

    def retry_with_backoff(self, func: Callable, **kwargs):
        """Decorator/wrapper for retry logic.

        Args:
            func: Callable to wrap.
            **kwargs: ``max_retries`` overrides the attempt limit configured
                on the service.

        Returns:
            *func* wrapped by tenacity so that any ``Exception`` triggers a
            retry with exponential waits (``min=4``, ``max=60``) and a
            warning log before each wait. Without tenacity, *func* is
            returned unchanged.
        """
        if not TENACITY_AVAILABLE:
            return func

        retry_kwargs = {
            'stop': stop_after_attempt(kwargs.get('max_retries', self.max_retries)),
            'wait': wait_exponential(multiplier=1, min=4, max=60),
            'retry': retry_if_exception_type((Exception,)),
            'before_sleep': lambda retry_state: self.logger.warning(
                f"Retry {retry_state.attempt_number}: {retry_state.outcome.exception()}"
            )
        }

        return retry(**retry_kwargs)(func)

    @abc.abstractmethod
    async def run_cycle(self):
        """
        Main service logic - implement this!
        Called repeatedly in the main loop.
        Should be non-blocking or use asyncio.
        """
        pass

    async def health_check(self) -> bool:
        """
        Optional: Implement custom health check
        Return True if healthy, False otherwise
        """
        return True

    async def _health_loop(self):
        """Background health check loop.

        Runs ``health_check()`` every ``check_interval`` seconds until the
        shutdown event is set, updating the snapshot and pinging the
        systemd watchdog when notifications are enabled.
        """
        while not self._shutdown_event.is_set():
            try:
                healthy = await self.health_check()
                if healthy:
                    self._update_health('healthy', 'Service operating normally')
                else:
                    self._update_health('degraded', 'Health check failed')

                # Notify systemd we're still alive
                if self.notify_systemd:
                    notify(Notification.WATCHDOG)

            except Exception as e:
                self._health.error_count += 1
                self._update_health('unhealthy', str(e))
                self.logger.error(f"Health check error: {e}")

            # Sleep for check_interval, but wake at once on shutdown.
            try:
                await asyncio.wait_for(
                    self._shutdown_event.wait(),
                    timeout=self.check_interval
                )
            except asyncio.TimeoutError:
                pass  # Normal - continue loop

    async def _main_loop(self):
        """Main service loop: await ``run_cycle()`` until shutdown."""
        self.logger.info(f"{self.name}: Starting main loop")

        while not self._shutdown_event.is_set():
            try:
                await self.run_cycle()
            except asyncio.CancelledError:
                break
            except Exception as e:
                self._health.error_count += 1
                self.logger.error(f"Cycle error: {e}", exc_info=True)
                # Brief pause before retry
                await asyncio.sleep(1)
            else:
                # A run_cycle() that never suspends would otherwise keep the
                # event loop to itself and starve the health loop and the
                # signal handlers; yield once per cycle.
                await asyncio.sleep(0)

    async def _serve(self):
        """Run both loops inside the event loop until shutdown is requested."""
        # Re-create the event on the running loop: before Python 3.10 an
        # asyncio.Event bound itself to whichever loop was current when it
        # was constructed. A shutdown requested before run() is preserved.
        already_requested = self._shutdown_event.is_set()
        self._shutdown_event = asyncio.Event()
        if already_requested:
            self._shutdown_event.set()

        self._setup_signals()

        # Notify systemd we're ready
        if self.notify_systemd:
            notify(Notification.READY)
            self.logger.info("Notified systemd: READY")

        # Start health check loop, then main loop
        self._tasks.append(asyncio.create_task(self._health_loop()))
        self._tasks.append(asyncio.create_task(self._main_loop()))

        try:
            # Run until shutdown
            await self._shutdown_event.wait()
        finally:
            self.logger.info(f"{self.name}: Service stopping")
            # Cleanup
            for task in self._tasks:
                if not task.done():
                    task.cancel()

            # Wait for cleanup
            if self._tasks:
                await asyncio.gather(*self._tasks, return_exceptions=True)

    def run(self):
        """Run the service (blocking).

        Starts a fresh event loop and returns once the shutdown event has
        been set and both loops have finished. Must not be called from code
        that is already running inside an event loop.
        """
        self.logger.info(f"{self.name}: Service starting")

        try:
            asyncio.run(self._serve())
        except KeyboardInterrupt:
            # Only reached where the SIGINT handler could not be installed.
            pass

        self.logger.info(f"{self.name}: Service stopped")
|
||
|
|
|
||
|
|
def run_scheduled(
    func: Callable,
    interval_seconds: float,
    name: str = 'scheduled-task'
):
    """
    Run a function on a schedule (simple alternative to full service)

    Blocks the calling thread and calls *func* repeatedly until interrupted
    with Ctrl+C. After a successful call the loop sleeps for whatever is
    left of ``interval_seconds``; after a failing call the error is logged
    and the loop sleeps for the full interval.

    Args:
        func: Zero-argument callable. It may be a plain function, a
            coroutine function, or any callable returning an awaitable;
            awaitable results are awaited.
        interval_seconds: Target time between the starts of two runs.
        name: Label used in the logger name and the start-up message.

    Usage:
        def my_task():
            print("Running...")

        run_scheduled(my_task, interval_seconds=60, name='my-task')
    """
    # Function-scope import: ``inspect`` is not among the module imports.
    import inspect

    logger = get_logger(f'dolphin.scheduled.{name}')
    logger.info(f"Starting scheduled task: {name} (interval: {interval_seconds}s)")

    async def loop():
        while True:
            # Monotonic clock: a wall-clock adjustment during the run must
            # not distort the elapsed time or the remaining sleep.
            started = time.monotonic()
            try:
                # Inspect the result rather than the callable, so partials,
                # lambdas and callable objects that hand back a coroutine
                # are awaited too instead of being silently dropped.
                outcome = func()
                if inspect.isawaitable(outcome):
                    await outcome
            except Exception as e:
                logger.error(f"Task error: {e}", exc_info=True)
                await asyncio.sleep(interval_seconds)
                continue

            elapsed = time.monotonic() - started
            logger.info(f"Task completed in {elapsed:.2f}s")

            # Sleep remaining time
            await asyncio.sleep(max(0, interval_seconds - elapsed))

    try:
        asyncio.run(loop())
    except KeyboardInterrupt:
        logger.info("Stopped by user")
|
||
|
|
|
||
|
|
# Names exported by ``from <module> import *``.
__all__ = [
    # Service framework
    'ServiceBase',
    'ServiceHealth',
    # Logging helpers
    'get_logger',
    'JournalHandler',
    # Lightweight scheduler
    'run_scheduled',
    # systemd notification hook and optional-dependency flags
    'notify',
    'SYSTEMD_AVAILABLE',
    'TENACITY_AVAILABLE',
]
|