initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
331
prod/services/service_base.py
Executable file
331
prod/services/service_base.py
Executable file
@@ -0,0 +1,331 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dolphin Service Base Class - Boilerplate for reliable userland services
|
||||
Features:
|
||||
- Automatic retries with exponential backoff
|
||||
- Structured logging to journal
|
||||
- Health check endpoints
|
||||
- Graceful shutdown on signals
|
||||
- Systemd notify support (Type=notify)
|
||||
- Memory/CPU monitoring
|
||||
"""
|
||||
import abc
|
||||
import asyncio
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
from typing import Optional, Callable, Any
|
||||
from dataclasses import dataclass, asdict
|
||||
from datetime import datetime
|
||||
from functools import wraps
|
||||
|
||||
# Optional third-party integrations. Each import is attempted once at module
# load; a boolean flag records the outcome so the rest of the module can
# degrade gracefully instead of failing on a missing package.
try:
    from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
except ImportError:
    TENACITY_AVAILABLE = False
else:
    TENACITY_AVAILABLE = True

# NOTE(review): confirm that `pystemd.daemon` really exports `Notification`;
# if it does not, this import always fails and SYSTEMD_AVAILABLE is always
# False, silently disabling READY/WATCHDOG notifications.
try:
    from pystemd.daemon import notify, Notification
except ImportError:
    SYSTEMD_AVAILABLE = False

    def notify(*args, **kwargs):
        """No-op stand-in used when systemd notification support is absent."""
        return None
else:
    SYSTEMD_AVAILABLE = True
|
||||
|
||||
# Configure logging for systemd journal
|
||||
class JournalHandler(logging.Handler):
    """Logging handler that prints each record to stdout as one JSON object."""

    def emit(self, record):
        """Serialise *record* to a single JSON line and print it, flushed.

        The ``source`` and ``service`` fields are read from attributes of the
        record itself and fall back to ``'unknown'`` when absent. Any failure
        while building or printing the line is routed to ``handleError``.
        """
        try:
            payload = {}
            payload['timestamp'] = datetime.utcnow().isoformat()
            payload['level'] = record.levelname
            payload['logger'] = record.name
            payload['message'] = self.format(record)
            for optional_field in ('source', 'service'):
                payload[optional_field] = getattr(record, optional_field, 'unknown')
            line = json.dumps(payload)
            print(line, flush=True)
        except Exception:
            self.handleError(record)
|
||||
|
||||
def get_logger(name: str) -> logging.Logger:
    """Return the logger called *name*, wiring up JSON output on first use.

    A logger that already has at least one handler is returned untouched, so
    repeated calls with the same name never attach duplicate handlers.
    """
    log = logging.getLogger(name)
    if log.handlers:
        # Already configured (by an earlier call or by the application).
        return log

    journal = JournalHandler()
    journal.setFormatter(logging.Formatter('%(message)s'))
    log.addHandler(journal)
    log.setLevel(logging.INFO)
    return log
|
||||
|
||||
@dataclass
class ServiceHealth:
    """Point-in-time health snapshot of a service."""

    # Coarse state label, e.g. 'healthy', 'degraded', 'unhealthy'.
    status: str
    # When this snapshot was taken.
    last_check: float
    # How long the service has been running.
    uptime: float
    # Memory usage in megabytes.
    memory_mb: float
    # CPU usage as a percentage.
    cpu_percent: float
    # Number of errors counted so far.
    error_count: int
    # Human-readable detail accompanying the status.
    message: str

    def to_json(self) -> str:
        """Return every field of this snapshot as a JSON object string."""
        snapshot = asdict(self)
        return json.dumps(snapshot)
|
||||
|
||||
class ServiceBase(abc.ABC):
    """
    Base class for reliable Dolphin services.

    Subclasses implement ``run_cycle`` (and optionally ``health_check``).
    ``run`` drives both from one asyncio event loop until a shutdown is
    requested, normally via SIGTERM or SIGINT.

    Usage:
        class MyService(ServiceBase):
            def __init__(self):
                super().__init__("my-service", check_interval=30)

            async def run_cycle(self):
                # Your service logic here
                pass

        if __name__ == '__main__':
            service = MyService()
            service.run()
    """

    def __init__(
        self,
        name: str,
        check_interval: float = 30.0,
        max_retries: int = 3,
        notify_systemd: bool = True
    ):
        """
        Args:
            name: Service name; used in log messages and as the logger suffix.
            check_interval: Seconds between two health checks.
            max_retries: Default attempt limit used by ``retry_with_backoff``.
            notify_systemd: Send READY/WATCHDOG notifications. Forced off
                when systemd notification support could not be imported.
        """
        self.name = name
        self.check_interval = check_interval
        self.max_retries = max_retries
        self.notify_systemd = notify_systemd and SYSTEMD_AVAILABLE

        self.logger = get_logger(f'dolphin.{name}')
        # Kept for backward compatibility with code reading `logger.service`.
        self.logger.service = name
        # The journal handler reads `service`/`source` from the LogRecord, not
        # from the Logger, so a filter copies the context onto every record.
        self._log_context = {'service': name}
        self.logger.addFilter(self._inject_log_context)

        self._shutdown_event = asyncio.Event()
        self._start_time = time.time()
        self._health = ServiceHealth(
            status='starting',
            last_check=time.time(),
            uptime=0.0,
            memory_mb=0.0,
            cpu_percent=0.0,
            error_count=0,
            message='Initializing'
        )
        self._tasks = []
        # Strong reference to the in-flight shutdown task so it cannot be
        # garbage-collected before it finishes.
        self._shutdown_task = None
        # Cached psutil.Process; see _get_process().
        self._process = None

        # Signal handlers are installed by run(), once the event loop that
        # will actually serve them is running.

    def _inject_log_context(self, record) -> bool:
        """Logging filter: copy service context onto *record*; never drops it."""
        for key, value in self._log_context.items():
            # Do not clobber standard LogRecord fields or per-call `extra`.
            if not hasattr(record, key):
                setattr(record, key, value)
        return True

    def _setup_signals(self):
        """Install graceful-shutdown handlers on the running event loop.

        Must be called from inside the running loop. Where the loop cannot
        register signal handlers, a warning is logged and the service relies
        on ``KeyboardInterrupt`` handling in ``run`` instead.
        """
        loop = asyncio.get_running_loop()
        for sig in (signal.SIGTERM, signal.SIGINT):
            try:
                loop.add_signal_handler(sig, self._request_shutdown)
            except (NotImplementedError, RuntimeError) as e:
                self.logger.warning(
                    f"{self.name}: Cannot install handler for {sig!r}: {e}"
                )

    def _request_shutdown(self):
        """Signal callback: start ``_shutdown`` once, keeping a reference."""
        if self._shutdown_task is None or self._shutdown_task.done():
            self._shutdown_task = asyncio.get_running_loop().create_task(
                self._shutdown()
            )

    async def _shutdown(self):
        """Graceful shutdown: flag the event, cancel tasks, allow cleanup."""
        self.logger.warning(f"{self.name}: Shutdown signal received")
        self._shutdown_event.set()

        # Cancel all tasks
        for task in self._tasks:
            if not task.done():
                task.cancel()

        # Give tasks time to cleanup
        await asyncio.sleep(0.5)

    def _get_process(self):
        """Return a cached ``psutil.Process`` for this process, or None.

        The instance is cached because ``cpu_percent()`` measures usage since
        the previous call on the *same* object; a fresh object per call would
        always report 0.0. Returns None when psutil is not installed.
        """
        if self._process is None:
            try:
                import psutil
            except ImportError:
                return None
            self._process = psutil.Process()
        return self._process

    def _update_health(self, status: str, message: str = ''):
        """Replace the health snapshot, preserving the running error count.

        Memory and CPU figures are reported as 0.0 when psutil is missing.
        """
        memory_mb = 0.0
        cpu_percent = 0.0
        process = self._get_process()
        if process is not None:
            memory_mb = process.memory_info().rss / 1024 / 1024
            cpu_percent = process.cpu_percent()

        self._health = ServiceHealth(
            status=status,
            last_check=time.time(),
            uptime=time.time() - self._start_time,
            memory_mb=memory_mb,
            cpu_percent=cpu_percent,
            error_count=self._health.error_count,
            message=message
        )

    def _log_extra(self, **kwargs):
        """Add extra context to logs.

        Each key is attached to every subsequent record emitted through this
        service's logger (unless the record already has that attribute).
        """
        for key, value in kwargs.items():
            # Original behaviour, kept for backward compatibility.
            setattr(self.logger, key, value)
            self._log_context[key] = value

    def retry_with_backoff(self, func: Callable, **kwargs):
        """Wrap *func* so failures are retried with exponential backoff.

        Args:
            func: Callable to wrap.
            **kwargs: ``max_retries`` overrides the instance default.

        Returns:
            The wrapped callable, or *func* unchanged when tenacity is not
            installed (no retries are performed in that case).
        """
        if not TENACITY_AVAILABLE:
            return func

        retry_kwargs = {
            'stop': stop_after_attempt(kwargs.get('max_retries', self.max_retries)),
            'wait': wait_exponential(multiplier=1, min=4, max=60),
            'retry': retry_if_exception_type((Exception,)),
            'before_sleep': lambda retry_state: self.logger.warning(
                f"Retry {retry_state.attempt_number}: {retry_state.outcome.exception()}"
            )
        }

        return retry(**retry_kwargs)(func)

    @abc.abstractmethod
    async def run_cycle(self):
        """
        Main service logic - implement this!
        Called repeatedly in the main loop.
        Should be non-blocking or use asyncio.
        """
        pass

    async def health_check(self) -> bool:
        """
        Optional: Implement custom health check
        Return True if healthy, False otherwise
        """
        return True

    async def _health_loop(self):
        """Background health check loop; runs every ``check_interval`` seconds."""
        while not self._shutdown_event.is_set():
            try:
                healthy = await self.health_check()
                if healthy:
                    self._update_health('healthy', 'Service operating normally')
                else:
                    self._update_health('degraded', 'Health check failed')

                # Notify systemd we're still alive
                if self.notify_systemd:
                    notify(Notification.WATCHDOG)

            except Exception as e:
                self._health.error_count += 1
                self._update_health('unhealthy', str(e))
                self.logger.error(f"Health check error: {e}")

            # Sleep for the interval, but wake immediately on shutdown.
            try:
                await asyncio.wait_for(
                    self._shutdown_event.wait(),
                    timeout=self.check_interval
                )
            except asyncio.TimeoutError:
                pass  # Normal - continue loop

    async def _main_loop(self):
        """Main service loop: call ``run_cycle`` until shutdown is requested."""
        self.logger.info(f"{self.name}: Starting main loop")

        while not self._shutdown_event.is_set():
            try:
                await self.run_cycle()
                # Always yield once per cycle: a run_cycle that never suspends
                # would otherwise starve the health loop and signal handling.
                await asyncio.sleep(0)
            except asyncio.CancelledError:
                break
            except Exception as e:
                self._health.error_count += 1
                self.logger.error(f"Cycle error: {e}", exc_info=True)
                # Brief pause before retry
                await asyncio.sleep(1)

    async def _serve(self):
        """Start the background tasks and wait for shutdown inside the loop."""
        # Before Python 3.10 asyncio primitives bind to a loop at construction
        # time; rebuild the event here so it belongs to the running loop,
        # carrying over a shutdown requested before run() was called.
        already_requested = self._shutdown_event.is_set()
        self._shutdown_event = asyncio.Event()
        if already_requested:
            self._shutdown_event.set()

        # Setup signal handlers
        self._setup_signals()

        # Notify systemd we're ready
        if self.notify_systemd:
            notify(Notification.READY)
            self.logger.info("Notified systemd: READY")

        # Start health check loop
        health_task = asyncio.create_task(self._health_loop())
        self._tasks.append(health_task)

        # Start main loop
        main_task = asyncio.create_task(self._main_loop())
        self._tasks.append(main_task)

        try:
            # Run until shutdown
            await self._shutdown_event.wait()
        finally:
            self.logger.info(f"{self.name}: Service stopping")
            # Cleanup
            for task in self._tasks:
                if not task.done():
                    task.cancel()

            # Wait for cleanup
            if self._tasks:
                await asyncio.gather(*self._tasks, return_exceptions=True)

    def run(self):
        """Run the service (blocking) until shutdown is requested."""
        self.logger.info(f"{self.name}: Service starting")

        try:
            # Tasks can only be created on a running loop, so all startup
            # happens inside _serve() rather than here.
            asyncio.run(self._serve())
        except KeyboardInterrupt:
            pass
        finally:
            self.logger.info(f"{self.name}: Service stopped")
|
||||
|
||||
def run_scheduled(
    func: Callable,
    interval_seconds: float,
    name: str = 'scheduled-task'
):
    """
    Run a function on a schedule (simple alternative to full service)

    Blocks the calling thread until interrupted with Ctrl-C.

    Args:
        func: Zero-argument callable. It may be a plain function, a coroutine
            function, or any callable whose return value is awaitable (for
            example a ``functools.partial`` around a coroutine function); an
            awaitable result is awaited before the run counts as finished.
        interval_seconds: Target time between the start of two runs. A run
            that takes longer is followed immediately by the next one. After
            a failed run the full interval is waited.
        name: Label used in the logger name and the start-up message.

    Usage:
        def my_task():
            print("Running...")

        run_scheduled(my_task, interval_seconds=60, name='my-task')
    """
    # Local import keeps this function self-contained.
    import inspect

    logger = get_logger(f'dolphin.scheduled.{name}')
    logger.info(f"Starting scheduled task: {name} (interval: {interval_seconds}s)")

    async def loop():
        while True:
            try:
                # Monotonic clock: elapsed time is immune to wall-clock jumps.
                start = time.monotonic()
                result = func()
                # Detect awaitables from the result rather than from the
                # callable, so wrapped coroutine functions are awaited too.
                if inspect.isawaitable(result):
                    await result
                elapsed = time.monotonic() - start
                logger.info(f"Task completed in {elapsed:.2f}s")

                # Sleep remaining time
                sleep_time = max(0, interval_seconds - elapsed)
                await asyncio.sleep(sleep_time)

            except Exception as e:
                # Boundary handler: one failing run must not stop the schedule.
                logger.error(f"Task error: {e}", exc_info=True)
                await asyncio.sleep(interval_seconds)

    try:
        asyncio.run(loop())
    except KeyboardInterrupt:
        logger.info("Stopped by user")
||||
|
||||
# Public API of this module, in the order names are exported by `import *`.
__all__ = [
    # Service building blocks
    'ServiceBase', 'ServiceHealth',
    # Logging helpers
    'get_logger', 'JournalHandler',
    # Lightweight scheduler
    'run_scheduled',
    # systemd integration and optional-dependency flags
    'notify', 'SYSTEMD_AVAILABLE', 'TENACITY_AVAILABLE',
]
|
||||
Reference in New Issue
Block a user