initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree

Includes core prod + GREEN/BLUE subsystems: prod/ (BLUE harness, configs, scripts, docs); nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved); adaptive_exit/ (AEM engine + models/bucket_assignments.pkl); Observability/ (EsoF advisor, TUI, dashboards); external_factors/ (EsoF producer); mc_forewarning_qlabs_fork/ (MC regime/envelope). Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
2026-04-21 16:58:38 +02:00
commit 01c19662cb
643 changed files with 260241 additions and 0 deletions
--- a/prod/services/service_base.py
+++ b/prod/services/service_base.py
@@ -0,0 +1,331 @@
+#!/usr/bin/env python3
+"""
+Dolphin Service Base Class - Boilerplate for reliable userland services
+Features:
+- Automatic retries with exponential backoff
+- Structured logging to journal
+- Health check endpoints
+- Graceful shutdown on signals
+- Systemd notify support (Type=notify)
+- Memory/CPU monitoring
+"""
+import abc
+import asyncio
+import logging
+import signal
+import sys
+import os
+import time
+import json
+from typing import Optional, Callable, Any
+from dataclasses import dataclass, asdict
+from datetime import datetime
+from functools import wraps
+
+# Optional imports - graceful degradation if not available
+try:
+    from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
+    TENACITY_AVAILABLE = True
+except ImportError:
+    TENACITY_AVAILABLE = False
+    
+try:
+    from pystemd.daemon import notify, Notification
+    SYSTEMD_AVAILABLE = True
+except ImportError:
+    SYSTEMD_AVAILABLE = False
+    def notify(*args, **kwargs):
+        pass
+
+# Configure logging for systemd journal
+class JournalHandler(logging.Handler):
+    """
+    Log handler that outputs JSON for systemd journal.
+
+    Each record is printed to stdout as a single JSON object with the keys
+    'timestamp', 'level', 'logger', 'message', 'source' and 'service'.
+    'source' and 'service' are read from attributes of the record and fall
+    back to 'unknown' when the record does not carry them.
+    """
+    def emit(self, record):
+        """Serialize *record* to one JSON line; failures go to handleError()."""
+        # Local import: the module header only imports `datetime` itself.
+        from datetime import timezone
+        try:
+            # datetime.utcnow() is deprecated since Python 3.12. Take an aware
+            # UTC "now" and drop tzinfo so the emitted string keeps the same
+            # naive ISO-8601 format as before.
+            stamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
+            msg = {
+                'timestamp': stamp,
+                'level': record.levelname,
+                'logger': record.name,
+                'message': self.format(record),
+                'source': getattr(record, 'source', 'unknown'),
+                'service': getattr(record, 'service', 'unknown'),
+            }
+            print(json.dumps(msg), flush=True)
+        except Exception:
+            # Standard logging contract: a handler must not raise into the
+            # caller; delegate to logging's own error reporting.
+            self.handleError(record)
+
+def get_logger(name: str) -> logging.Logger:
+    """
+    Return the logger called *name*, wiring it up on first use.
+
+    A logger that already has at least one handler is returned untouched.
+    Otherwise a JournalHandler with a message-only formatter is attached and
+    the logger level is set to INFO.
+    """
+    log = logging.getLogger(name)
+    if log.handlers:
+        # Already configured (by an earlier call or by someone else).
+        return log
+
+    journal = JournalHandler()
+    journal.setFormatter(logging.Formatter('%(message)s'))
+    log.addHandler(journal)
+    log.setLevel(logging.INFO)
+    return log
+
+@dataclass
+class ServiceHealth:
+    """Snapshot of a service's health, serializable to JSON."""
+    # 'healthy', 'degraded' or 'unhealthy'; ServiceBase also uses 'starting'
+    # for the initial snapshot.
+    status: str
+    # time.time() value at which this snapshot was taken
+    last_check: float
+    # seconds since the service object was created
+    uptime: float
+    memory_mb: float
+    cpu_percent: float
+    # number of errors counted so far
+    error_count: int
+    # human-readable detail for the current status
+    message: str
+
+    def to_json(self) -> str:
+        """Return all fields as a JSON object string."""
+        fields_as_dict = asdict(self)
+        return json.dumps(fields_as_dict)
+
+class ServiceBase(abc.ABC):
+    """
+    Base class for reliable Dolphin services
+
+    Runs two background tasks on one asyncio event loop: a main loop that
+    repeatedly awaits ``run_cycle()`` and a health loop that calls
+    ``health_check()`` every ``check_interval`` seconds and, when systemd
+    notification is enabled, sends the watchdog keep-alive. SIGTERM and
+    SIGINT request a graceful shutdown.
+
+    Usage:
+        class MyService(ServiceBase):
+            def __init__(self):
+                super().__init__("my-service", check_interval=30)
+
+            async def run_cycle(self):
+                # Your service logic here
+                pass
+
+        if __name__ == '__main__':
+            service = MyService()
+            service.run()
+    """
+
+    def __init__(
+        self,
+        name: str,
+        check_interval: float = 30.0,
+        max_retries: int = 3,
+        notify_systemd: bool = True
+    ):
+        """
+        Args:
+            name: Service name; used in log messages and as the logger suffix.
+            check_interval: Seconds between two health checks.
+            max_retries: Default attempt limit used by retry_with_backoff().
+            notify_systemd: Send READY/WATCHDOG notifications. Forced to
+                False when pystemd could not be imported.
+        """
+        self.name = name
+        self.check_interval = check_interval
+        self.max_retries = max_retries
+        self.notify_systemd = notify_systemd and SYSTEMD_AVAILABLE
+
+        self.logger = get_logger(f'dolphin.{name}')
+        # JournalHandler reads 'service'/'source' from the log *record*, not
+        # from the Logger object, so context is injected through a filter.
+        self._log_context = {}
+        self.logger.addFilter(self._inject_log_context)
+        self._log_extra(service=name)
+
+        # The loop is resolved once and reused everywhere. It must be the
+        # current loop before the Event is built, because older Python
+        # versions bind an Event to the current loop at construction time.
+        self._loop = self._acquire_event_loop()
+        self._shutdown_event = asyncio.Event()
+        self._start_time = time.time()
+        self._health = ServiceHealth(
+            status='starting',
+            last_check=time.time(),
+            uptime=0.0,
+            memory_mb=0.0,
+            cpu_percent=0.0,
+            error_count=0,
+            message='Initializing'
+        )
+        self._tasks = []
+        # psutil.Process handle, created lazily and reused between samples.
+        self._process = None
+
+        # Setup signal handlers
+        self._setup_signals()
+
+    @staticmethod
+    def _acquire_event_loop():
+        """Return the thread's current event loop, creating one if needed."""
+        import warnings
+        try:
+            with warnings.catch_warnings():
+                # Some Python versions warn when get_event_loop() has to
+                # create the loop implicitly; newer ones raise RuntimeError.
+                warnings.simplefilter('ignore', DeprecationWarning)
+                loop = asyncio.get_event_loop()
+        except RuntimeError:
+            loop = None
+        if loop is None or loop.is_closed():
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+        return loop
+
+    def _inject_log_context(self, record):
+        """Logging filter: copy stored context fields onto *record*."""
+        for key, value in self._log_context.items():
+            # setdefault keeps built-in record fields and per-call `extra`
+            # values from being overwritten.
+            record.__dict__.setdefault(key, value)
+        return True
+
+    def _setup_signals(self):
+        """Setup graceful shutdown handlers"""
+        for sig in (signal.SIGTERM, signal.SIGINT):
+            try:
+                self._loop.add_signal_handler(sig, self._request_shutdown)
+            except (NotImplementedError, RuntimeError) as e:
+                # The loop cannot install signal handlers here (unsupported
+                # loop/platform, or not the main thread). run() still stops
+                # on KeyboardInterrupt.
+                self.logger.warning(
+                    f"{self.name}: Cannot install handler for {sig!r}: {e}"
+                )
+
+    def _request_shutdown(self):
+        """Flag shutdown; run() then cancels and awaits the background tasks."""
+        if not self._shutdown_event.is_set():
+            self.logger.warning(f"{self.name}: Shutdown signal received")
+            self._shutdown_event.set()
+
+    async def _shutdown(self):
+        """Graceful shutdown"""
+        self._request_shutdown()
+
+        # Cancel all tasks, except the one executing this coroutine so that
+        # a service can await _shutdown() from inside run_cycle().
+        current = asyncio.current_task()
+        for task in self._tasks:
+            if task is not current and not task.done():
+                task.cancel()
+
+        # Give tasks time to cleanup
+        await asyncio.sleep(0.5)
+
+    def _sample_resources(self):
+        """Return (memory_mb, cpu_percent); (0.0, 0.0) when psutil is missing."""
+        if self._process is None:
+            try:
+                import psutil
+            except ImportError:
+                # psutil is optional: health reporting degrades to zeros.
+                return 0.0, 0.0
+            self._process = psutil.Process()
+        # The handle is reused on purpose: cpu_percent() measures against the
+        # previous call on the same Process object, so a fresh object per
+        # sample would always report 0.0.
+        memory_mb = self._process.memory_info().rss / 1024 / 1024
+        return memory_mb, self._process.cpu_percent()
+
+    def _update_health(self, status: str, message: str = ''):
+        """Replace the health snapshot, keeping the running error count."""
+        memory_mb, cpu_percent = self._sample_resources()
+        now = time.time()
+        self._health = ServiceHealth(
+            status=status,
+            last_check=now,
+            uptime=now - self._start_time,
+            memory_mb=memory_mb,
+            cpu_percent=cpu_percent,
+            error_count=self._health.error_count,
+            message=message
+        )
+
+    def _log_extra(self, **kwargs):
+        """Add extra context to logs"""
+        for key, value in kwargs.items():
+            self._log_context[key] = value
+            # Kept for backward compatibility with code that reads the
+            # value back from the logger object.
+            setattr(self.logger, key, value)
+
+    def retry_with_backoff(self, func: Callable, **kwargs):
+        """
+        Decorator/wrapper for retry logic
+
+        Args:
+            func: Callable to wrap.
+            **kwargs: ``max_retries`` overrides the instance default.
+
+        Returns:
+            *func* wrapped with tenacity's ``retry`` (retry on any Exception,
+            exponential wait between 4 and 60 seconds, warning logged before
+            each sleep), or *func* unchanged when tenacity is not installed.
+        """
+        if not TENACITY_AVAILABLE:
+            return func
+
+        retry_kwargs = {
+            'stop': stop_after_attempt(kwargs.get('max_retries', self.max_retries)),
+            'wait': wait_exponential(multiplier=1, min=4, max=60),
+            'retry': retry_if_exception_type((Exception,)),
+            'before_sleep': lambda retry_state: self.logger.warning(
+                f"Retry {retry_state.attempt_number}: {retry_state.outcome.exception()}"
+            )
+        }
+
+        return retry(**retry_kwargs)(func)
+
+    @abc.abstractmethod
+    async def run_cycle(self):
+        """
+        Main service logic - implement this!
+        Called repeatedly in the main loop.
+        Should be non-blocking or use asyncio.
+        """
+        pass
+
+    async def health_check(self) -> bool:
+        """
+        Optional: Implement custom health check
+        Return True if healthy, False otherwise
+        """
+        return True
+
+    async def _health_loop(self):
+        """Background health check loop"""
+        while not self._shutdown_event.is_set():
+            try:
+                healthy = await self.health_check()
+                if healthy:
+                    self._update_health('healthy', 'Service operating normally')
+                else:
+                    self._update_health('degraded', 'Health check failed')
+
+                # Notify systemd we're still alive (skipped when the health
+                # check itself raised)
+                if self.notify_systemd:
+                    notify(Notification.WATCHDOG)
+
+            except Exception as e:
+                self._health.error_count += 1
+                self._update_health('unhealthy', str(e))
+                self.logger.error(f"Health check error: {e}")
+
+            # Sleep for check_interval, waking early if shutdown is requested
+            try:
+                await asyncio.wait_for(
+                    self._shutdown_event.wait(),
+                    timeout=self.check_interval
+                )
+            except asyncio.TimeoutError:
+                pass  # Normal - continue loop
+
+    async def _main_loop(self):
+        """Main service loop"""
+        self.logger.info(f"{self.name}: Starting main loop")
+
+        while not self._shutdown_event.is_set():
+            try:
+                await self.run_cycle()
+                # Always yield once per cycle: a run_cycle() that returns
+                # without suspending would otherwise monopolize the event
+                # loop and block signal handling and health checks.
+                await asyncio.sleep(0)
+            except asyncio.CancelledError:
+                break
+            except Exception as e:
+                self._health.error_count += 1
+                self.logger.error(f"Cycle error: {e}", exc_info=True)
+                # Brief pause before retry
+                await asyncio.sleep(1)
+
+    def run(self):
+        """Run the service (blocking)"""
+        self.logger.info(f"{self.name}: Service starting")
+
+        # Notify systemd we're ready
+        if self.notify_systemd:
+            notify(Notification.READY)
+            self.logger.info("Notified systemd: READY")
+
+        # asyncio.create_task() requires a running loop, which does not exist
+        # yet in this synchronous method; schedule on the owned loop instead.
+        loop = self._loop
+
+        # Start health check loop
+        health_task = loop.create_task(self._health_loop())
+        self._tasks.append(health_task)
+
+        # Start main loop
+        main_task = loop.create_task(self._main_loop())
+        self._tasks.append(main_task)
+
+        try:
+            # Run until shutdown
+            loop.run_until_complete(self._shutdown_event.wait())
+        except KeyboardInterrupt:
+            pass
+        finally:
+            self.logger.info(f"{self.name}: Service stopping")
+            # Cleanup
+            for task in self._tasks:
+                if not task.done():
+                    task.cancel()
+
+            # Wait for cleanup
+            if self._tasks:
+                loop.run_until_complete(
+                    asyncio.gather(*self._tasks, return_exceptions=True)
+                )
+
+        self.logger.info(f"{self.name}: Service stopped")
+
+def run_scheduled(
+    func: Callable,
+    interval_seconds: float,
+    name: str = 'scheduled-task'
+):
+    """
+    Run a function on a schedule (simple alternative to full service)
+
+    Blocks until interrupted with Ctrl-C. After a successful run the loop
+    sleeps for the remainder of the interval; after a failed run the error is
+    logged and the loop sleeps for the full interval.
+
+    Args:
+        func: Zero-argument callable. It may be a plain function, a coroutine
+            function, or any callable returning an awaitable.
+        interval_seconds: Target time between the starts of two runs.
+        name: Suffix for the logger name and label in the start message.
+
+    Usage:
+        def my_task():
+            print("Running...")
+
+        run_scheduled(my_task, interval_seconds=60, name='my-task')
+    """
+    # Local import: `inspect` is not imported at module level.
+    import inspect
+
+    logger = get_logger(f'dolphin.scheduled.{name}')
+    logger.info(f"Starting scheduled task: {name} (interval: {interval_seconds}s)")
+
+    async def loop():
+        while True:
+            # Monotonic clock: elapsed time must not jump when the wall
+            # clock is adjusted.
+            start = time.monotonic()
+            try:
+                result = func()
+                # Await whatever awaitable came back. Checking the result
+                # rather than the callable also covers lambdas and callable
+                # objects that return a coroutine.
+                if inspect.isawaitable(result):
+                    await result
+            except Exception as e:
+                logger.error(f"Task error: {e}", exc_info=True)
+                await asyncio.sleep(interval_seconds)
+                continue
+
+            elapsed = time.monotonic() - start
+            logger.info(f"Task completed in {elapsed:.2f}s")
+
+            # Sleep remaining time
+            await asyncio.sleep(max(0, interval_seconds - elapsed))
+
+    try:
+        asyncio.run(loop())
+    except KeyboardInterrupt:
+        logger.info("Stopped by user")
+
+# Public API of this module. `notify` is pystemd's function when pystemd is
+# importable, otherwise the no-op stub defined in the import fallback above.
+__all__ = [
+    'ServiceBase',
+    'ServiceHealth',
+    'get_logger',
+    'JournalHandler',
+    'run_scheduled',
+    'notify',
+    'SYSTEMD_AVAILABLE',
+    'TENACITY_AVAILABLE',
+]