DOLPHIN/prod/services/service_base.py

#!/usr/bin/env python3
"""
Dolphin Service Base Class - Boilerplate for reliable userland services
Features:
- Automatic retries with exponential backoff
- Structured logging to journal
- Health check endpoints
- Graceful shutdown on signals
- Systemd notify support (Type=notify)
- Memory/CPU monitoring
"""
import abc
import asyncio
import inspect
import json
import logging
import os
import signal
import sys
import time
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from functools import wraps
from typing import Any, Callable, Optional

# Optional imports - graceful degradation if not available
try:
    from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
    TENACITY_AVAILABLE = True
except ImportError:
    TENACITY_AVAILABLE = False
    
try:
    # NOTE(review): the call style used in this file, notify(Notification.READY),
    # presumes the imported module exports both names. Confirm that
    # pystemd.daemon really provides `Notification`; if it does not, this import
    # always raises ImportError and systemd notification is silently disabled.
    from pystemd.daemon import notify, Notification
    SYSTEMD_AVAILABLE = True
except ImportError:
    # Degrade gracefully: callers can still call notify() unconditionally.
    # `Notification` is NOT defined on this path, so any use of it must be
    # guarded by SYSTEMD_AVAILABLE.
    SYSTEMD_AVAILABLE = False
    def notify(*args, **kwargs):
        """No-op stand-in used when the systemd binding cannot be imported."""
        pass

# Configure logging for systemd journal
class JournalHandler(logging.Handler):
    """Log handler that prints one JSON object per record to stdout.

    The output is intended for capture by the systemd journal. Each object
    carries the keys ``timestamp`` (naive UTC, ISO 8601), ``level``,
    ``logger``, ``message``, ``source`` and ``service``.

    ``source`` and ``service`` are taken from the record when present (for
    example via ``extra={...}``). Otherwise they fall back to an attribute of
    the same name set on the logger that produced the record, and finally to
    ``'unknown'``.
    """

    def emit(self, record):
        """Serialize *record* to JSON and print it, flushing immediately.

        Any failure while formatting or printing is routed to
        ``logging.Handler.handleError`` instead of propagating to the caller.
        """
        try:
            # Context attached to the emitting logger itself (e.g.
            # ``logger.service = name``) never reaches the LogRecord, so look
            # it up on the logger as a fallback.
            owner = logging.getLogger(record.name)
            # datetime.utcnow() is deprecated; take an aware UTC "now" and drop
            # the tzinfo so the emitted string keeps its original naive format.
            stamp = datetime.now(timezone.utc).replace(tzinfo=None)
            msg = {
                'timestamp': stamp.isoformat(),
                'level': record.levelname,
                'logger': record.name,
                'message': self.format(record),
                'source': getattr(
                    record, 'source', getattr(owner, 'source', 'unknown')
                ),
                'service': getattr(
                    record, 'service', getattr(owner, 'service', 'unknown')
                ),
            }
            print(json.dumps(msg), flush=True)
        except Exception:
            self.handleError(record)

def get_logger(name: str) -> logging.Logger:
    """Return the logger called *name*, configuring it on first use.

    A logger that already has at least one handler is returned untouched.
    Otherwise a ``JournalHandler`` with a bare ``%(message)s`` formatter is
    attached and the logger level is set to INFO.
    """
    log = logging.getLogger(name)
    if log.handlers:
        # Already configured (by us or by someone else) - do not add a second handler.
        return log

    journal = JournalHandler()
    journal.setFormatter(logging.Formatter('%(message)s'))
    log.addHandler(journal)
    log.setLevel(logging.INFO)
    return log

@dataclass
class ServiceHealth:
    """Snapshot of a service's health as recorded by the health loop."""
    # One of 'healthy', 'degraded', 'unhealthy' ('starting' is used before the first check).
    status: str
    # Wall-clock time (time.time()) at which this snapshot was taken.
    last_check: float
    # Seconds elapsed since the service object was created.
    uptime: float
    # Resident set size of the process, in MiB.
    memory_mb: float
    # Process CPU utilisation as reported by psutil.
    cpu_percent: float
    # Running count of cycle / health-check errors.
    error_count: int
    # Human-readable detail accompanying the status.
    message: str

    def to_json(self) -> str:
        """Return this snapshot as a JSON object string, one key per field."""
        snapshot = asdict(self)
        return json.dumps(snapshot)

class ServiceBase(abc.ABC):
    """
    Base class for reliable Dolphin services

    A subclass implements ``run_cycle`` (and optionally ``health_check``).
    ``run`` then drives two concurrent loops on an event loop owned by the
    instance: the main loop, which calls ``run_cycle`` repeatedly, and the
    health loop, which refreshes ``self._health`` every ``check_interval``
    seconds and pings the systemd watchdog when notification is enabled.
    SIGTERM and SIGINT request a graceful shutdown.

    Usage:
        class MyService(ServiceBase):
            def __init__(self):
                super().__init__("my-service", check_interval=30)

            async def run_cycle(self):
                # Your service logic here
                pass

        if __name__ == '__main__':
            service = MyService()
            service.run()
    """

    # Cached psutil.Process handle, created lazily by _get_process().
    _process = None
    # True once psutil was found to be missing, so the warning is logged once.
    _psutil_unavailable = False
    # Task running _shutdown() after a signal; awaited by run() during cleanup.
    _shutdown_task = None

    def __init__(
        self,
        name: str,
        check_interval: float = 30.0,
        max_retries: int = 3,
        notify_systemd: bool = True
    ):
        """Prepare the service; nothing runs until ``run`` is called.

        Args:
            name: Service name, used in log messages and as the logger suffix
                (``dolphin.<name>``).
            check_interval: Seconds between health checks.
            max_retries: Default attempt limit for ``retry_with_backoff``.
            notify_systemd: Send READY/WATCHDOG notifications. Forced off when
                the systemd binding could not be imported.
        """
        self.name = name
        self.check_interval = check_interval
        self.max_retries = max_retries
        self.notify_systemd = notify_systemd and SYSTEMD_AVAILABLE

        self.logger = get_logger(f'dolphin.{name}')
        self.logger.service = name

        # The service owns its event loop. asyncio.get_event_loop() is not
        # used because calling it without a running loop is deprecated and
        # raises on recent Python versions. The loop is installed as the
        # current one *before* the Event is created so that, on Python
        # versions that bind primitives at construction, the Event belongs to
        # the loop run() will drive.
        self._loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self._loop)

        self._shutdown_event = asyncio.Event()
        self._shutdown_task = None
        self._start_time = time.time()
        self._health = ServiceHealth(
            status='starting',
            last_check=time.time(),
            uptime=0.0,
            memory_mb=0.0,
            cpu_percent=0.0,
            error_count=0,
            message='Initializing'
        )
        self._tasks = []

        # Setup signal handlers
        self._setup_signals()

    def _setup_signals(self):
        """Register SIGTERM/SIGINT handlers that start a graceful shutdown.

        Loop-level signal handlers are unavailable on some platforms and
        outside the main thread. In that case a warning is logged and the
        service remains usable (Ctrl-C is still caught by ``run``).
        """
        for sig in (signal.SIGTERM, signal.SIGINT):
            try:
                self._loop.add_signal_handler(sig, self._on_signal)
            except (NotImplementedError, RuntimeError, ValueError) as exc:
                self.logger.warning(
                    f"{self.name}: Cannot install handler for {sig!r}: {exc}"
                )

    def _on_signal(self):
        """Signal callback: schedule ``_shutdown`` once and keep a reference.

        The reference prevents the task from being garbage collected and lets
        ``run`` await it; repeated signals do not spawn additional tasks.
        """
        if self._shutdown_task is None:
            self._shutdown_task = self._loop.create_task(self._shutdown())

    async def _shutdown(self):
        """Graceful shutdown: set the shutdown event and cancel running tasks."""
        self.logger.warning(f"{self.name}: Shutdown signal received")
        self._shutdown_event.set()

        # Cancel all tasks
        for task in self._tasks:
            if not task.done():
                task.cancel()

        # Give tasks time to cleanup
        await asyncio.sleep(0.5)

    def _get_process(self):
        """Return a cached ``psutil.Process`` for this process, or None.

        The handle is reused across calls because ``Process.cpu_percent()``
        measures utilisation since the previous call on the *same* object; a
        fresh object per call would always report 0.0. When psutil is not
        installed, None is returned and a warning is logged once.
        """
        if self._process is None and not self._psutil_unavailable:
            try:
                import psutil
            except ImportError:
                self._psutil_unavailable = True
                self.logger.warning(
                    f"{self.name}: psutil not installed; memory/CPU metrics disabled"
                )
            else:
                self._process = psutil.Process()
        return self._process

    def _update_health(self, status: str, message: str = ''):
        """Replace ``self._health`` with a fresh snapshot.

        Args:
            status: New status string ('healthy', 'degraded', 'unhealthy').
            message: Human-readable detail for the snapshot.

        The error count is carried over from the previous snapshot. Memory
        and CPU figures are 0.0 when psutil is unavailable.
        """
        memory_mb = 0.0
        cpu_percent = 0.0
        process = self._get_process()
        if process is not None:
            memory_mb = process.memory_info().rss / 1024 / 1024
            cpu_percent = process.cpu_percent()

        now = time.time()
        self._health = ServiceHealth(
            status=status,
            last_check=now,
            uptime=now - self._start_time,
            memory_mb=memory_mb,
            cpu_percent=cpu_percent,
            error_count=self._health.error_count,
            message=message
        )

    def _log_extra(self, **kwargs):
        """Attach each keyword argument as an attribute on the service logger."""
        # NOTE(review): keys that collide with real Logger attributes
        # (e.g. 'name', 'level') would overwrite them - callers should avoid those.
        for key, value in kwargs.items():
            setattr(self.logger, key, value)

    def retry_with_backoff(self, func: Callable, **kwargs):
        """Wrap *func* so that it is retried with exponential backoff.

        Args:
            func: Callable to wrap.
            **kwargs: ``max_retries`` overrides the instance default attempt
                limit; other keys are ignored.

        Returns:
            The wrapped callable, or *func* unchanged when tenacity is not
            installed. Any ``Exception`` triggers a retry; waits grow
            exponentially between 4 and 60 seconds and each retry is logged
            as a warning.
        """
        if not TENACITY_AVAILABLE:
            return func

        retry_kwargs = {
            'stop': stop_after_attempt(kwargs.get('max_retries', self.max_retries)),
            'wait': wait_exponential(multiplier=1, min=4, max=60),
            'retry': retry_if_exception_type((Exception,)),
            'before_sleep': lambda retry_state: self.logger.warning(
                f"Retry {retry_state.attempt_number}: {retry_state.outcome.exception()}"
            )
        }

        return retry(**retry_kwargs)(func)

    @abc.abstractmethod
    async def run_cycle(self):
        """
        Main service logic - implement this!
        Called repeatedly in the main loop.
        Should be non-blocking or use asyncio.
        """
        pass

    async def health_check(self) -> bool:
        """
        Optional: Implement custom health check
        Return True if healthy, False otherwise
        """
        return True

    async def _health_loop(self):
        """Background loop: run ``health_check`` every ``check_interval`` seconds.

        Updates ``self._health`` with the outcome and, when enabled, sends a
        systemd WATCHDOG ping. The wait between checks ends early when
        shutdown is requested.
        """
        while not self._shutdown_event.is_set():
            try:
                healthy = await self.health_check()
                if healthy:
                    self._update_health('healthy', 'Service operating normally')
                else:
                    self._update_health('degraded', 'Health check failed')

                # Notify systemd we're still alive
                if self.notify_systemd:
                    notify(Notification.WATCHDOG)

            except Exception as e:
                self._health.error_count += 1
                self._update_health('unhealthy', str(e))
                self.logger.error(f"Health check error: {e}")

            try:
                await asyncio.wait_for(
                    self._shutdown_event.wait(),
                    timeout=self.check_interval
                )
            except asyncio.TimeoutError:
                pass  # Normal - continue loop

    async def _main_loop(self):
        """Main service loop: call ``run_cycle`` until shutdown is requested.

        An exception from a cycle is logged and counted, followed by a
        one-second pause. Cancellation ends the loop quietly.
        """
        self.logger.info(f"{self.name}: Starting main loop")

        while not self._shutdown_event.is_set():
            try:
                await self.run_cycle()
                # A run_cycle that never actually suspends would otherwise
                # spin here forever without returning control to the event
                # loop, starving the health loop and the signal handlers.
                await asyncio.sleep(0)
            except asyncio.CancelledError:
                break
            except Exception as e:
                self._health.error_count += 1
                self.logger.error(f"Cycle error: {e}", exc_info=True)
                # Brief pause before retry
                await asyncio.sleep(1)

    def run(self):
        """Run the service (blocking) until shutdown is requested.

        Sends READY to systemd when enabled, starts the health and main
        loops on the instance's event loop and blocks until the shutdown
        event is set or Ctrl-C is received. Remaining tasks are then
        cancelled and awaited, and the loop is closed.
        """
        self.logger.info(f"{self.name}: Service starting")

        # Notify systemd we're ready
        if self.notify_systemd:
            notify(Notification.READY)
            self.logger.info("Notified systemd: READY")

        loop = self._loop

        # asyncio.create_task() needs a *running* loop and raises RuntimeError
        # from synchronous code, so the tasks are scheduled on the owned loop.
        self._tasks.append(loop.create_task(self._health_loop()))
        self._tasks.append(loop.create_task(self._main_loop()))

        try:
            # Run until shutdown
            loop.run_until_complete(self._shutdown_event.wait())
        except KeyboardInterrupt:
            pass
        finally:
            self.logger.info(f"{self.name}: Service stopping")
            try:
                # Make sure every waiter on the event wakes up, including the
                # wait() left pending if we got here through KeyboardInterrupt.
                self._shutdown_event.set()

                # Cleanup
                for task in self._tasks:
                    if not task.done():
                        task.cancel()

                # Wait for cleanup (and for the signal-triggered shutdown
                # task, which is awaited rather than cancelled).
                pending = list(self._tasks)
                if self._shutdown_task is not None:
                    pending.append(self._shutdown_task)
                if pending:
                    loop.run_until_complete(
                        asyncio.gather(*pending, return_exceptions=True)
                    )
            finally:
                loop.close()

        self.logger.info(f"{self.name}: Service stopped")

def run_scheduled(
    func: Callable,
    interval_seconds: float,
    name: str = 'scheduled-task'
):
    """
    Run a function on a schedule (simple alternative to full service)

    Blocks the calling thread, invoking *func* roughly every
    ``interval_seconds`` until interrupted with Ctrl-C.

    Args:
        func: Zero-argument callable. It may be a plain function or anything
            that returns an awaitable (``async def`` function, partial of
            one, object with an async ``__call__``); awaitables are awaited.
            A plain function runs inline and blocks the loop while it runs.
        interval_seconds: Target period between starts. After a successful
            run the loop sleeps for the remainder of the period (not at all
            if the run took longer). After a failed run it sleeps the full
            period.
        name: Label used for the logger (``dolphin.scheduled.<name>``) and
            the start-up message.

    Usage:
        def my_task():
            print("Running...")

        run_scheduled(my_task, interval_seconds=60, name='my-task')
    """
    logger = get_logger(f'dolphin.scheduled.{name}')
    logger.info(f"Starting scheduled task: {name} (interval: {interval_seconds}s)")

    async def loop():
        while True:
            try:
                # Monotonic clock: wall-clock adjustments (NTP, DST, manual
                # changes) must not distort the measured run time.
                start = time.monotonic()
                result = func()
                # Await whatever awaitable the call produced rather than
                # classifying func up front; asyncio.iscoroutinefunction is
                # deprecated and misses async callables that are not
                # functions.
                if inspect.isawaitable(result):
                    await result
                elapsed = time.monotonic() - start
                logger.info(f"Task completed in {elapsed:.2f}s")

                # Sleep remaining time
                sleep_time = max(0, interval_seconds - elapsed)
                await asyncio.sleep(sleep_time)

            except Exception as e:
                logger.error(f"Task error: {e}", exc_info=True)
                await asyncio.sleep(interval_seconds)

    try:
        asyncio.run(loop())
    except KeyboardInterrupt:
        logger.info("Stopped by user")

# Names exported by `from ... import *`. `notify` is either the imported
# systemd binding or the no-op fallback defined above, and the two
# *_AVAILABLE flags report which optional dependencies were importable.
__all__ = [
    'ServiceBase',
    'ServiceHealth',
    'get_logger',
    'JournalHandler',
    'run_scheduled',
    'notify',
    'SYSTEMD_AVAILABLE',
    'TENACITY_AVAILABLE',
]
initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore. 2026-04-21 16:58:38 +02:00			`#!/usr/bin/env python3`
			`"""`
			`Dolphin Service Base Class - Boilerplate for reliable userland services`
			`Features:`
			`- Automatic retries with exponential backoff`
			`- Structured logging to journal`
			`- Health check endpoints`
			`- Graceful shutdown on signals`
			`- Systemd notify support (Type=notify)`
			`- Memory/CPU monitoring`
			`"""`
			`import abc`
			`import asyncio`
			`import logging`
			`import signal`
			`import sys`
			`import os`
			`import time`
			`import json`
			`from typing import Optional, Callable, Any`
			`from dataclasses import dataclass, asdict`
			`from datetime import datetime`
			`from functools import wraps`

			`# Optional imports - graceful degradation if not available`
			`try:`
			`from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type`
			`TENACITY_AVAILABLE = True`
			`except ImportError:`
			`TENACITY_AVAILABLE = False`

			`try:`
			`from pystemd.daemon import notify, Notification`
			`SYSTEMD_AVAILABLE = True`
			`except ImportError:`
			`SYSTEMD_AVAILABLE = False`
			`def notify(*args, **kwargs):`
			`pass`

			`# Configure logging for systemd journal`
			`class JournalHandler(logging.Handler):`
			`"""Log handler that outputs JSON for systemd journal"""`
			`def emit(self, record):`
			`try:`
			`msg = {`
			`'timestamp': datetime.utcnow().isoformat(),`
			`'level': record.levelname,`
			`'logger': record.name,`
			`'message': self.format(record),`
			`'source': getattr(record, 'source', 'unknown'),`
			`'service': getattr(record, 'service', 'unknown'),`
			`}`
			`print(json.dumps(msg), flush=True)`
			`except Exception:`
			`self.handleError(record)`

			`def get_logger(name: str) -> logging.Logger:`
			`"""Get configured logger for services"""`
			`logger = logging.getLogger(name)`
			`if not logger.handlers:`
			`handler = JournalHandler()`
			`handler.setFormatter(logging.Formatter('%(message)s'))`
			`logger.addHandler(handler)`
			`logger.setLevel(logging.INFO)`
			`return logger`

			`@dataclass`
			`class ServiceHealth:`
			`"""Health check status"""`
			`status: str # 'healthy', 'degraded', 'unhealthy'`
			`last_check: float`
			`uptime: float`
			`memory_mb: float`
			`cpu_percent: float`
			`error_count: int`
			`message: str`

			`def to_json(self) -> str:`
			`return json.dumps(asdict(self))`

			`class ServiceBase(abc.ABC):`
			`"""`
			`Base class for reliable Dolphin services`

			`Usage:`
			`class MyService(ServiceBase):`
			`def __init__(self):`
			`super().__init__("my-service", check_interval=30)`

			`async def run_cycle(self):`
			`# Your service logic here`
			`pass`

			`if __name__ == '__main__':`
			`service = MyService()`
			`service.run()`
			`"""`

			`def __init__(`
			`self,`
			`name: str,`
			`check_interval: float = 30.0,`
			`max_retries: int = 3,`
			`notify_systemd: bool = True`
			`):`
			`self.name = name`
			`self.check_interval = check_interval`
			`self.max_retries = max_retries`
			`self.notify_systemd = notify_systemd and SYSTEMD_AVAILABLE`

			`self.logger = get_logger(f'dolphin.{name}')`
			`self.logger.service = name`

			`self._shutdown_event = asyncio.Event()`
			`self._start_time = time.time()`
			`self._health = ServiceHealth(`
			`status='starting',`
			`last_check=time.time(),`
			`uptime=0.0,`
			`memory_mb=0.0,`
			`cpu_percent=0.0,`
			`error_count=0,`
			`message='Initializing'`
			`)`
			`self._tasks = []`

			`# Setup signal handlers`
			`self._setup_signals()`

			`def _setup_signals(self):`
			`"""Setup graceful shutdown handlers"""`
			`for sig in (signal.SIGTERM, signal.SIGINT):`
			`asyncio.get_event_loop().add_signal_handler(`
			`sig, lambda: asyncio.create_task(self._shutdown())`
			`)`

			`async def _shutdown(self):`
			`"""Graceful shutdown"""`
			`self.logger.warning(f"{self.name}: Shutdown signal received")`
			`self._shutdown_event.set()`

			`# Cancel all tasks`
			`for task in self._tasks:`
			`if not task.done():`
			`task.cancel()`

			`# Give tasks time to cleanup`
			`await asyncio.sleep(0.5)`

			`def _update_health(self, status: str, message: str = ''):`
			`"""Update health status"""`
			`import psutil`
			`process = psutil.Process()`

			`self._health = ServiceHealth(`
			`status=status,`
			`last_check=time.time(),`
			`uptime=time.time() - self._start_time,`
			`memory_mb=process.memory_info().rss / 1024 / 1024,`
			`cpu_percent=process.cpu_percent(),`
			`error_count=self._health.error_count,`
			`message=message`
			`)`

			`def _log_extra(self, **kwargs):`
			`"""Add extra context to logs"""`
			`for key, value in kwargs.items():`
			`setattr(self.logger, key, value)`

			`def retry_with_backoff(self, func: Callable, **kwargs):`
			`"""Decorator/wrapper for retry logic"""`
			`if not TENACITY_AVAILABLE:`
			`return func`

			`retry_kwargs = {`
			`'stop': stop_after_attempt(kwargs.get('max_retries', self.max_retries)),`
			`'wait': wait_exponential(multiplier=1, min=4, max=60),`
			`'retry': retry_if_exception_type((Exception,)),`
			`'before_sleep': lambda retry_state: self.logger.warning(`
			`f"Retry {retry_state.attempt_number}: {retry_state.outcome.exception()}"`
			`)`
			`}`

			`return retry(**retry_kwargs)(func)`

			`@abc.abstractmethod`
			`async def run_cycle(self):`
			`"""`
			`Main service logic - implement this!`
			`Called repeatedly in the main loop.`
			`Should be non-blocking or use asyncio.`
			`"""`
			`pass`

			`async def health_check(self) -> bool:`
			`"""`
			`Optional: Implement custom health check`
			`Return True if healthy, False otherwise`
			`"""`
			`return True`

			`async def _health_loop(self):`
			`"""Background health check loop"""`
			`while not self._shutdown_event.is_set():`
			`try:`
			`healthy = await self.health_check()`
			`if healthy:`
			`self._update_health('healthy', 'Service operating normally')`
			`else:`
			`self._update_health('degraded', 'Health check failed')`

			`# Notify systemd we're still alive`
			`if self.notify_systemd:`
			`notify(Notification.WATCHDOG)`

			`except Exception as e:`
			`self._health.error_count += 1`
			`self._update_health('unhealthy', str(e))`
			`self.logger.error(f"Health check error: {e}")`

			`try:`
			`await asyncio.wait_for(`
			`self._shutdown_event.wait(),`
			`timeout=self.check_interval`
			`)`
			`except asyncio.TimeoutError:`
			`pass # Normal - continue loop`

			`async def _main_loop(self):`
			`"""Main service loop"""`
			`self.logger.info(f"{self.name}: Starting main loop")`

			`while not self._shutdown_event.is_set():`
			`try:`
			`await self.run_cycle()`
			`except asyncio.CancelledError:`
			`break`
			`except Exception as e:`
			`self._health.error_count += 1`
			`self.logger.error(f"Cycle error: {e}", exc_info=True)`
			`# Brief pause before retry`
			`await asyncio.sleep(1)`

			`def run(self):`
			`"""Run the service (blocking)"""`
			`self.logger.info(f"{self.name}: Service starting")`

			`# Notify systemd we're ready`
			`if self.notify_systemd:`
			`notify(Notification.READY)`
			`self.logger.info("Notified systemd: READY")`

			`# Start health check loop`
			`health_task = asyncio.create_task(self._health_loop())`
			`self._tasks.append(health_task)`

			`# Start main loop`
			`main_task = asyncio.create_task(self._main_loop())`
			`self._tasks.append(main_task)`

			`try:`
			`# Run until shutdown`
			`asyncio.get_event_loop().run_until_complete(self._shutdown_event.wait())`
			`except KeyboardInterrupt:`
			`pass`
			`finally:`
			`self.logger.info(f"{self.name}: Service stopping")`
			`# Cleanup`
			`for task in self._tasks:`
			`if not task.done():`
			`task.cancel()`

			`# Wait for cleanup`
			`if self._tasks:`
			`asyncio.get_event_loop().run_until_complete(`
			`asyncio.gather(*self._tasks, return_exceptions=True)`
			`)`

			`self.logger.info(f"{self.name}: Service stopped")`

			`def run_scheduled(`
			`func: Callable,`
			`interval_seconds: float,`
			`name: str = 'scheduled-task'`
			`):`
			`"""`
			`Run a function on a schedule (simple alternative to full service)`

			`Usage:`
			`def my_task():`
			`print("Running...")`

			`run_scheduled(my_task, interval_seconds=60, name='my-task')`
			`"""`
			`logger = get_logger(f'dolphin.scheduled.{name}')`
			`logger.info(f"Starting scheduled task: {name} (interval: {interval_seconds}s)")`

			`async def loop():`
			`while True:`
			`try:`
			`start = time.time()`
			`if asyncio.iscoroutinefunction(func):`
			`await func()`
			`else:`
			`func()`
			`elapsed = time.time() - start`
			`logger.info(f"Task completed in {elapsed:.2f}s")`

			`# Sleep remaining time`
			`sleep_time = max(0, interval_seconds - elapsed)`
			`await asyncio.sleep(sleep_time)`

			`except Exception as e:`
			`logger.error(f"Task error: {e}", exc_info=True)`
			`await asyncio.sleep(interval_seconds)`

			`try:`
			`asyncio.run(loop())`
			`except KeyboardInterrupt:`
			`logger.info("Stopped by user")`

			`__all__ = [`
			`'ServiceBase',`
			`'ServiceHealth',`
			`'get_logger',`
			`'JournalHandler',`
			`'run_scheduled',`
			`'notify',`
			`'SYSTEMD_AVAILABLE',`
			`'TENACITY_AVAILABLE',`
			`]`