#!/usr/bin/env python3
"""
Dolphin Service Supervisor
==========================
A SINGLE userland service that manages MULTIPLE service-like components.

Architecture:
- One systemd service: dolphin-supervisor.service
- Internally manages: ExF, OB, Watchdog, MC, etc.
- Each component is a Python thread/async task
- Centralized health, logging, restart
"""

import asyncio
import threading
import signal
import sys
import time
import json
import traceback
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Callable
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor
import logging

# Optional systemd notify
try:
    from pystemd.daemon import notify, Notification
    SYSTEMD_AVAILABLE = True
except ImportError:
    SYSTEMD_AVAILABLE = False

    def notify(*args, **kwargs):
        """No-op stand-in used when pystemd is not installed."""
        pass

# Optional tenacity for retries
try:
    from tenacity import retry, stop_after_attempt, wait_exponential
    TENACITY_AVAILABLE = True
except ImportError:
    TENACITY_AVAILABLE = False


# =============================================================================
# STRUCTURED LOGGING
# =============================================================================

class JSONFormatter(logging.Formatter):
    """Render each log record as a single-line JSON object."""

    def format(self, record: logging.LogRecord) -> str:
        """Return *record* serialized as JSON.

        The ``component`` field is taken from the record itself when present,
        otherwise from a ``component`` attribute on the logger that emitted
        the record (set by ``ServiceComponent`` and ``DolphinSupervisor``),
        otherwise it defaults to ``'supervisor'``.
        """
        component = getattr(record, 'component', None)
        if component is None:
            # The classes below tag their *logger* (not each record) with the
            # component name, so look the emitting logger up by name.
            component = getattr(
                logging.getLogger(record.name), 'component', 'supervisor'
            )

        log_data = {
            # Naive-UTC ISO string, same shape the deprecated utcnow() gave.
            'timestamp': datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
            'level': record.levelname,
            'component': component,
            'message': record.getMessage(),
            'source': record.name,
        }

        extra_data = getattr(record, 'extra_data', None)
        if extra_data:
            log_data.update(extra_data)

        # default=str is deliberate: a log line must never be lost because an
        # extra value is not JSON-serializable.
        return json.dumps(log_data, default=str)


def get_logger(name: str) -> logging.Logger:
    """Return the logger called *name*, attaching a JSON stream handler once.

    The handler and the INFO level are only installed the first time the
    logger is requested (i.e. while it has no handlers).
    """
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(JSONFormatter())
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger


# =============================================================================
# COMPONENT BASE CLASS
# =============================================================================

@dataclass
class ComponentHealth:
    """Snapshot of one component's health as reported to the supervisor."""

    name: str
    status: str  # 'healthy', 'degraded', 'failed', 'stopped'
    last_run: float  # time.time() of the last successful cycle (0 if none)
    error_count: int  # failed attempts since the last successful cycle
    message: str
    uptime: float = 0.0  # seconds since the component loop last started


class ServiceComponent(ABC):
    """
    Base class for a service-like component.
    Runs in its own thread, managed by the supervisor.

    Subclasses implement ``run_cycle``; the base class calls it every
    ``interval`` seconds, retrying up to ``max_retries`` times per cycle.
    """

    def __init__(self, name: str, interval: float = 1.0, max_retries: int = 3):
        self.name = name
        self.interval = interval
        self.max_retries = max_retries
        self.logger = get_logger(f'component.{name}')
        # Read back by JSONFormatter to fill the 'component' log field.
        self.logger.component = name

        self._running = False
        # Set by stop(); lets interval/backoff waits end immediately.
        self._stop_event = threading.Event()
        self._thread: Optional[threading.Thread] = None
        self._error_count = 0
        self._last_run = 0
        self._start_time = 0

        self._health = ComponentHealth(
            name=name,
            status='stopped',
            last_run=0,
            error_count=0,
            message='Not started'
        )

    @abstractmethod
    def run_cycle(self):
        """Override this with your component's work"""
        pass

    def health_check(self) -> bool:
        """Override for custom health check"""
        return True

    def is_alive(self) -> bool:
        """Return True while the component's worker thread is running."""
        return self._thread is not None and self._thread.is_alive()

    def _execute_with_retry(self):
        """Execute run_cycle with retry logic.

        Makes at least one attempt and at most ``max_retries``. Between
        failed attempts it backs off exponentially (capped at 30 s); the
        backoff is cut short if ``stop()`` is called.

        Raises:
            Exception: the last ``run_cycle`` error, once attempts are
                exhausted or a stop was requested during backoff.
        """
        # max_retries <= 0 must still run the cycle once, not skip it.
        attempts = max(1, self.max_retries)
        for attempt in range(attempts):
            try:
                self.run_cycle()
            except Exception as e:
                self._error_count += 1
                self.logger.error(
                    f"Cycle failed (attempt {attempt + 1}): {e}",
                    extra={'extra_data': {'attempt': attempt + 1, 'error': str(e)}}
                )
                if attempt >= attempts - 1:
                    raise
                # Exponential backoff; wait() returns True if stop() fired.
                if self._stop_event.wait(min(2 ** attempt, 30)):
                    raise
            else:
                self._error_count = 0
                self._last_run = time.time()
                return

    def _loop(self):
        """Main component loop (runs in thread)"""
        self._start_time = time.time()
        self.logger.info(f"{self.name}: Component started")

        while self._running:
            try:
                self._execute_with_retry()
            except Exception as e:
                self._health.status = 'failed'
                self._health.message = f'Failed: {str(e)[:100]}'
                self.logger.error(f"{self.name}: Component failed: {e}")
                # Continue running (supervisor will restart if needed)
            else:
                self._health.status = 'healthy'
                self._health.message = 'Running normally'

            # Sleep until next cycle, waking early when stop() is called.
            self._stop_event.wait(self.interval)

        self._health.status = 'stopped'
        self.logger.info(f"{self.name}: Component stopped")

    def start(self):
        """Start the component in a new thread"""
        if self.is_alive():
            self.logger.warning(f"{self.name}: Already running")
            return

        # Mark as running *before* the thread starts so a stop() issued
        # right after start() cannot be overwritten by the new thread.
        self._stop_event.clear()
        self._running = True
        self._thread = threading.Thread(
            target=self._loop, name=f"component-{self.name}", daemon=True
        )
        try:
            self._thread.start()
        except Exception:
            self._running = False
            raise
        self.logger.info(f"{self.name}: Thread started")

    def stop(self, timeout: float = 5.0):
        """Stop the component gracefully.

        Signals the loop to exit and waits up to *timeout* seconds for the
        thread to finish; logs a warning if it is still alive afterwards.
        """
        self._running = False
        self._stop_event.set()
        worker = self._thread
        if worker and worker.is_alive():
            worker.join(timeout=timeout)
            if worker.is_alive():
                self.logger.warning(f"{self.name}: Thread did not stop gracefully")

    def get_health(self) -> ComponentHealth:
        """Get current health status.

        Refreshes and returns the component's single ``ComponentHealth``
        instance (the same object on every call).
        """
        self._health.last_run = self._last_run
        self._health.error_count = self._error_count
        if self._start_time:
            self._health.uptime = time.time() - self._start_time
        return self._health


# =============================================================================
# SUPERVISOR (SINGLE SERVICE)
# =============================================================================

class DolphinSupervisor:
    """
    SINGLE service that manages MULTIPLE userland components.

    Usage:
        supervisor = DolphinSupervisor()
        supervisor.register(ExFComponent())
        supervisor.register(OBComponent())
        supervisor.register(WatchdogComponent())
        supervisor.run()
    """

    # How long a restart waits for the old thread, and the pause before
    # starting the new one (seconds).
    _RESTART_STOP_TIMEOUT = 2.0
    _RESTART_DELAY = 1.0

    def __init__(self, health_check_interval: float = 10.0):
        self.logger = get_logger('supervisor')
        self.logger.component = 'supervisor'

        self.components: Dict[str, ServiceComponent] = {}
        self._running = False
        self._shutdown_event = threading.Event()
        self._health_check_interval = health_check_interval
        self._supervisor_thread: Optional[threading.Thread] = None
        # Components whose old thread outlived stop(); retried on later checks.
        self._restart_pending = set()

        # Signal handling
        self._setup_signals()

    def _setup_signals(self):
        """Setup graceful shutdown on SIGTERM / SIGINT.

        ``signal.signal`` raises ``ValueError`` outside the main thread; in
        that case a warning is logged and no handlers are installed.
        """
        def handler(signum, frame):
            self.logger.info(f"Received signal {signum}, shutting down...")
            self._shutdown_event.set()

        try:
            signal.signal(signal.SIGTERM, handler)
            signal.signal(signal.SIGINT, handler)
        except ValueError as e:
            self.logger.warning(f"Signal handlers not installed: {e}")

    def register(self, component: ServiceComponent):
        """Register a component to be managed"""
        self.components[component.name] = component
        self.logger.info(f"Registered component: {component.name}")

    def start_all(self):
        """Start all registered components"""
        self.logger.info(f"Starting {len(self.components)} components...")

        for name, component in self.components.items():
            try:
                component.start()
            except Exception as e:
                self.logger.error(f"Failed to start {name}: {e}")

        # Notify systemd we're ready
        if SYSTEMD_AVAILABLE:
            notify(Notification.READY)
            self.logger.info("Notified systemd: READY")

    def stop_all(self, timeout: float = 5.0):
        """Stop all components gracefully"""
        self.logger.info("Stopping all components...")

        for name, component in self.components.items():
            try:
                component.stop(timeout=timeout)
            except Exception as e:
                self.logger.error(f"Error stopping {name}: {e}")

    def _restart_component(self, name: str, component: ServiceComponent):
        """Stop and re-start one component, deferring if it will not stop.

        If the old thread is still alive after the stop timeout, the
        component is remembered and the restart is retried on the next
        health check. Nothing is started once shutdown has been requested.
        """
        self.logger.warning(f"{name}: Restarting failed component...")
        try:
            component.stop(timeout=self._RESTART_STOP_TIMEOUT)
            # Interruptible pause; True means shutdown began, so don't start.
            if self._shutdown_event.wait(self._RESTART_DELAY):
                return
            if component.is_alive():
                self._restart_pending.add(name)
                self.logger.error(
                    f"{name}: Previous thread still running, restart deferred"
                )
                return
            self._restart_pending.discard(name)
            component.start()
        except Exception as e:
            # Keep the monitor thread alive whatever a component does.
            self.logger.error(f"Failed to restart {name}: {e}")

    def _supervisor_loop(self):
        """Main supervisor loop - monitors components"""
        self.logger.info("Supervisor monitoring started")

        while not self._shutdown_event.is_set():
            # Check health of all components. Iterate over a snapshot so a
            # concurrent register() cannot break the iteration.
            health_report = {}
            for name, component in list(self.components.items()):
                health = component.get_health()
                health_report[name] = {
                    'status': health.status,
                    'uptime': health.uptime,
                    'errors': health.error_count,
                    'message': health.message
                }

                # Restart failed components (or retry a deferred restart)
                needs_restart = (
                    (health.status == 'failed' and component._running)
                    or name in self._restart_pending
                )
                if needs_restart:
                    self._restart_component(name, component)

            # Log health summary
            failed = sum(1 for h in health_report.values() if h['status'] == 'failed')
            if failed > 0:
                self.logger.error(f"Health check: {failed} components failed",
                                  extra={'extra_data': health_report})
            else:
                self.logger.debug("Health check: all components healthy",
                                  extra={'extra_data': health_report})

            # Notify systemd watchdog
            if SYSTEMD_AVAILABLE:
                notify(Notification.WATCHDOG)

            # Wait for next check
            self._shutdown_event.wait(self._health_check_interval)

        self.logger.info("Supervisor monitoring stopped")

    def get_status(self) -> Dict:
        """Get full status of supervisor and components"""
        components_status = {}
        for name, comp in self.components.items():
            health = comp.get_health()
            components_status[name] = {
                'status': health.status,
                'uptime': health.uptime,
                'errors': health.error_count,
                'message': health.message
            }

        return {
            'supervisor': {
                'running': self._running,
                'components_count': len(self.components)
            },
            'components': components_status
        }

    def run(self):
        """Run the supervisor (blocking)"""
        self.logger.info("=" * 60)
        self.logger.info("Dolphin Service Supervisor Starting")
        self.logger.info("=" * 60)

        self._running = True

        # Start all components
        self.start_all()

        # Start supervisor monitoring thread
        self._supervisor_thread = threading.Thread(
            target=self._supervisor_loop,
            name="supervisor-monitor"
        )
        self._supervisor_thread.start()

        # Wait for shutdown signal
        try:
            while not self._shutdown_event.is_set():
                self._shutdown_event.wait(1)
        except KeyboardInterrupt:
            pass
        finally:
            self._running = False
            # Always release the monitor loop, including on KeyboardInterrupt;
            # otherwise the non-daemon monitor thread keeps the process alive.
            self._shutdown_event.set()
            # Let the monitor finish first so it cannot restart a component
            # that stop_all() has just stopped.
            if self._supervisor_thread:
                self._supervisor_thread.join(timeout=5.0)
            self.stop_all()
            self.logger.info("Supervisor shutdown complete")


# =============================================================================
# EXAMPLE COMPONENTS
# =============================================================================

class ExFComponent(ServiceComponent):
    """External Factors - 0.5s aggressive oversampling"""

    def __init__(self):
        super().__init__(name='exf', interval=0.5, max_retries=3)
        self.indicators = {}

    def run_cycle(self):
        # Simulate fetching indicators
        self.indicators['basis'] = {'value': 0.01, 'timestamp': time.time()}
        self.indicators['spread'] = {'value': 0.02, 'timestamp': time.time()}
        # In real implementation: fetch from APIs, push to Hazelcast


class OBComponent(ServiceComponent):
    """Order Book Streamer - 500ms"""

    def __init__(self):
        super().__init__(name='ob', interval=0.5, max_retries=3)

    def run_cycle(self):
        # Simulate OB snapshot
        pass


class WatchdogComponent(ServiceComponent):
    """Survival Stack Watchdog - 10s"""

    def __init__(self):
        super().__init__(name='watchdog', interval=10.0, max_retries=5)
        self.posture = 'APEX'

    def run_cycle(self):
        # Check categories, compute posture
        pass


class MCComponent(ServiceComponent):
    """MC-Forewarner - 4h (but we check every 5 min if it's time)"""

    def __init__(self):
        super().__init__(name='mc', interval=300, max_retries=3)  # 5 min check
        self.last_run = 0

    def run_cycle(self):
        # Only actually run every 4 hours
        if time.time() - self.last_run > 14400:  # 4 hours
            self.logger.info("Running MC-Forewarner assessment")
            self.last_run = time.time()


# =============================================================================
# MAIN ENTRY POINT
# =============================================================================

def main():
    """Build a supervisor with the example components and run it (blocking)."""
    # Create supervisor
    supervisor = DolphinSupervisor(health_check_interval=10.0)

    # Register components
    supervisor.register(ExFComponent())
    supervisor.register(OBComponent())
    supervisor.register(WatchdogComponent())
    supervisor.register(MCComponent())

    # Run
    supervisor.run()


if __name__ == '__main__':
    main()