Files
DOLPHIN/prod/services/supervisor.py

412 lines
14 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Dolphin Service Supervisor
==========================
A SINGLE userland service that manages MULTIPLE service-like components.
Architecture:
- One systemd service: dolphin-supervisor.service
- Internally manages: ExF, OB, Watchdog, MC, etc.
- Each component is a Python thread/async task
- Centralized health, logging, restart
"""
import asyncio
import json
import logging
import signal
import sys
import threading
import time
import traceback
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Callable, Dict, List, Optional
# Optional systemd notify
# If the import fails, SYSTEMD_AVAILABLE is False and notify() becomes a no-op,
# so the rest of the module can call notify() without checking for the package.
# NOTE(review): the call style used further down, `notify(Notification.READY)`,
# looks like the cysystemd.daemon API. Confirm that pystemd.daemon really
# exports `Notification`; if it does not, this import always raises ImportError
# and the READY / WATCHDOG notifications are silently never sent.
try:
    from pystemd.daemon import notify, Notification
    SYSTEMD_AVAILABLE = True
except ImportError:
    SYSTEMD_AVAILABLE = False
    def notify(*args, **kwargs):
        # No-op stand-in: accepts any arguments and does nothing.
        pass
# Optional tenacity for retries
# NOTE(review): nothing in the visible part of this file uses retry,
# stop_after_attempt, wait_exponential or TENACITY_AVAILABLE; the component
# retry loop is hand-written. Confirm whether this import is still needed.
try:
    from tenacity import retry, stop_after_attempt, wait_exponential
    TENACITY_AVAILABLE = True
except ImportError:
    TENACITY_AVAILABLE = False
# =============================================================================
# STRUCTURED LOGGING
# =============================================================================
class JSONFormatter(logging.Formatter):
    """Render each log record as a single-line JSON object.

    Emitted keys:
        timestamp: ISO-8601 UTC time at which the record was created.
        level:     Record level name.
        component: ``record.component`` when set on the record; otherwise the
                   suffix of a ``component.<name>`` logger name; otherwise
                   'supervisor'.
        message:   The formatted log message.
        source:    The logger name.
        exception: Formatted traceback, only when the record carries exc_info.

    A dict passed as ``extra={'extra_data': {...}}`` is merged into the
    top-level object last, so it may override the keys above.
    """

    # Loggers named "component.<name>" belong to the component "<name>".
    _COMPONENT_PREFIX = 'component.'

    def format(self, record):
        """Return ``record`` serialised as a JSON string."""
        log_data = {
            # Use the record's own creation time (not "now" at format time) so
            # delayed or buffered handlers still report when the event happened.
            'timestamp': datetime.fromtimestamp(
                record.created, tz=timezone.utc
            ).isoformat(),
            'level': record.levelname,
            'component': self._component_for(record),
            'message': record.getMessage(),
            'source': record.name,
        }
        if record.exc_info:
            log_data['exception'] = self.formatException(record.exc_info)

        extra = getattr(record, 'extra_data', None)
        if isinstance(extra, dict):
            log_data.update(extra)
        elif extra is not None:
            # A non-dict payload cannot be merged; keep it under its own key
            # rather than raising inside the logging machinery.
            log_data['extra_data'] = extra

        # default=str: a log line must never be lost because a value in the
        # extra data is not JSON-serialisable.
        return json.dumps(log_data, default=str)

    def _component_for(self, record):
        """Resolve the component name to report for ``record``."""
        if hasattr(record, 'component'):
            return record.component
        prefix = self._COMPONENT_PREFIX
        if record.name.startswith(prefix) and len(record.name) > len(prefix):
            return record.name[len(prefix):]
        return 'supervisor'
def get_logger(name: str) -> logging.Logger:
    """Return the logger called ``name``, set to INFO level.

    The first time a given logger is requested (i.e. while it has no handlers)
    a stream handler using ``JSONFormatter`` is attached. Later calls reuse
    the existing handlers, so output is not duplicated.
    """
    log = logging.getLogger(name)
    needs_handler = len(log.handlers) == 0
    if needs_handler:
        json_stream = logging.StreamHandler()
        json_stream.setFormatter(JSONFormatter())
        log.addHandler(json_stream)
    # The level is (re)applied on every call, exactly like the handler check.
    log.setLevel(logging.INFO)
    return log
# =============================================================================
# COMPONENT BASE CLASS
# =============================================================================
@dataclass
class ComponentHealth:
    """Mutable health snapshot for one supervised component."""

    # Component identifier.
    name: str
    # One of: 'healthy', 'degraded', 'failed', 'stopped'.
    status: str
    # time.time() of the last successful cycle (0 until one succeeds).
    last_run: float
    # Failed attempts since the last successful cycle.
    error_count: int
    # Short human-readable description of the current state.
    message: str
    # Seconds since the component's loop last started.
    uptime: float = field(default=0.0)
class ServiceComponent(ABC):
    """
    Base class for a service-like component.

    A component calls ``run_cycle()`` repeatedly in its own daemon thread,
    pausing ``interval`` seconds between cycles. A failing cycle is retried
    up to ``max_retries`` times with exponential backoff before the component
    is marked 'failed'; the loop then keeps cycling.

    Both the pause between cycles and the retry backoff wait on a stop event,
    so ``stop()`` interrupts them immediately instead of waiting out a sleep.

    Args:
        name: Component identifier; also used in the logger and thread names.
        interval: Seconds to wait between cycles.
        max_retries: Attempts per cycle. Values below 1 are treated as 1 so
            that ``run_cycle`` is always attempted.
    """

    def __init__(self, name: str, interval: float = 1.0, max_retries: int = 3):
        self.name = name
        self.interval = interval
        self.max_retries = max_retries
        self.logger = get_logger(f'component.{name}')
        # NOTE: this tags the Logger object itself; it does not add a field
        # to the records the logger emits.
        self.logger.component = name
        # True from start() until stop(); read by the supervisor.
        self._running = False
        # Set by stop() to wake the loop. start() installs a fresh event so a
        # thread that is slow to exit can never be revived by a restart.
        self._stop_event = threading.Event()
        self._thread: Optional[threading.Thread] = None
        self._error_count = 0
        self._last_run = 0
        self._start_time = 0
        self._health = ComponentHealth(
            name=name, status='stopped',
            last_run=0, error_count=0, message='Not started'
        )

    @abstractmethod
    def run_cycle(self):
        """Override this with your component's work"""
        pass

    def health_check(self) -> bool:
        """Override for custom health check"""
        return True

    def _execute_with_retry(self, stop_event: Optional[threading.Event] = None):
        """Run one cycle, retrying failures with exponential backoff.

        Args:
            stop_event: Event that aborts the backoff wait when set. Defaults
                to the component's current stop event.

        Raises:
            Exception: The last error from ``run_cycle`` once all attempts are
                used up, or as soon as a stop is requested during backoff.
        """
        if stop_event is None:
            stop_event = self._stop_event
        attempts = max(1, self.max_retries)
        for attempt in range(attempts):
            try:
                self.run_cycle()
            except Exception as e:
                self._error_count += 1
                self.logger.error(
                    f"Cycle failed (attempt {attempt + 1}): {e}",
                    extra={'extra_data': {'attempt': attempt + 1, 'error': str(e)}}
                )
                if attempt >= attempts - 1:
                    raise
                # Exponential backoff (1s, 2s, 4s, ... capped at 30s).
                # wait() returns True when a stop was requested: give up the
                # remaining attempts so shutdown is not delayed.
                if stop_event.wait(min(2 ** attempt, 30)):
                    raise
            else:
                self._error_count = 0
                self._last_run = time.time()
                return

    def _loop(self, stop_event: Optional[threading.Event] = None):
        """Main component loop (runs in thread).

        Args:
            stop_event: The event owned by this run of the loop. Defaults to
                the component's current stop event.
        """
        if stop_event is None:
            stop_event = self._stop_event
        self._start_time = time.time()
        self.logger.info(f"{self.name}: Component started")
        while not stop_event.is_set():
            try:
                self._execute_with_retry(stop_event)
                self._health.status = 'healthy'
                self._health.message = 'Running normally'
            except Exception as e:
                self._health.status = 'failed'
                self._health.message = f'Failed: {str(e)[:100]}'
                self.logger.error(f"{self.name}: Component failed: {e}")
                # Continue running (supervisor will restart if needed)
            # Pause until the next cycle; returns early when stop() is called.
            stop_event.wait(self.interval)
        # A thread that has been replaced by a restart must not overwrite the
        # health status now owned by its replacement.
        if stop_event is self._stop_event:
            self._health.status = 'stopped'
        self.logger.info(f"{self.name}: Component stopped")

    def start(self):
        """Start the component in a new thread.

        Does nothing when the component is already running. If a previous
        thread was asked to stop but has not exited yet (for example it is
        stuck inside ``run_cycle``), a replacement thread is started anyway;
        the old thread exits on its own once its cycle returns.
        """
        if self._thread and self._thread.is_alive():
            if self._running:
                self.logger.warning(f"{self.name}: Already running")
                return
            self.logger.warning(
                f"{self.name}: Previous thread still finishing, starting replacement"
            )
        stop_event = threading.Event()
        self._stop_event = stop_event
        # Mark running before the thread exists so that a stop() issued
        # immediately after start() cannot be lost.
        self._running = True
        thread = threading.Thread(
            target=self._loop,
            args=(stop_event,),
            name=f"component-{self.name}",
        )
        thread.daemon = True
        self._thread = thread
        try:
            thread.start()
        except Exception:
            self._running = False
            raise
        self.logger.info(f"{self.name}: Thread started")

    def stop(self, timeout: float = 5.0):
        """Stop the component gracefully.

        Args:
            timeout: Maximum seconds to wait for the thread to exit. A warning
                is logged if it is still alive afterwards.
        """
        self._running = False
        self._stop_event.set()
        thread = self._thread
        # A thread cannot join itself (e.g. run_cycle calling self.stop()).
        if thread and thread.is_alive() and thread is not threading.current_thread():
            thread.join(timeout=timeout)
            if thread.is_alive():
                self.logger.warning(f"{self.name}: Thread did not stop gracefully")

    def get_health(self) -> "ComponentHealth":
        """Refresh and return the component's health record.

        The same mutable object is returned on every call.
        """
        self._health.last_run = self._last_run
        self._health.error_count = self._error_count
        if self._start_time:
            self._health.uptime = time.time() - self._start_time
        return self._health
# =============================================================================
# SUPERVISOR (SINGLE SERVICE)
# =============================================================================
class DolphinSupervisor:
    """
    SINGLE service that manages MULTIPLE userland components.

    Usage:
        supervisor = DolphinSupervisor()
        supervisor.register(ExFComponent())
        supervisor.register(OBComponent())
        supervisor.register(WatchdogComponent())
        supervisor.run()

    Args:
        health_check_interval: Seconds between health checks of the
            registered components.
    """

    def __init__(self, health_check_interval: float = 10.0):
        self.logger = get_logger('supervisor')
        self.logger.component = 'supervisor'
        self.components: Dict[str, "ServiceComponent"] = {}
        self._running = False
        self._shutdown_event = threading.Event()
        self._health_check_interval = health_check_interval
        self._supervisor_thread: Optional[threading.Thread] = None
        # Signal handling
        self._setup_signals()

    def _setup_signals(self):
        """Install SIGTERM/SIGINT handlers that request a graceful shutdown."""
        def handler(signum, frame):
            self.logger.info(f"Received signal {signum}, shutting down...")
            self._shutdown_event.set()

        try:
            signal.signal(signal.SIGTERM, handler)
            signal.signal(signal.SIGINT, handler)
        except ValueError as e:
            # signal.signal() may only be called from the main thread. A
            # supervisor built elsewhere still works; it just has to be shut
            # down by other means.
            self.logger.warning(f"Signal handlers not installed: {e}")

    def register(self, component: "ServiceComponent"):
        """Register a component to be managed.

        A component registered under an existing name replaces the earlier one.
        """
        if component.name in self.components:
            self.logger.warning(f"Replacing already registered component: {component.name}")
        self.components[component.name] = component
        self.logger.info(f"Registered component: {component.name}")

    def start_all(self):
        """Start all registered components, then report READY to systemd."""
        self.logger.info(f"Starting {len(self.components)} components...")
        for name, component in list(self.components.items()):
            try:
                component.start()
            except Exception as e:
                self.logger.error(f"Failed to start {name}: {e}")
        # Notify systemd we're ready
        if SYSTEMD_AVAILABLE:
            notify(Notification.READY)
            self.logger.info("Notified systemd: READY")

    def stop_all(self, timeout: float = 5.0):
        """Stop all components gracefully.

        Args:
            timeout: Maximum seconds to wait for each component.
        """
        self.logger.info("Stopping all components...")
        for name, component in list(self.components.items()):
            try:
                component.stop(timeout=timeout)
            except Exception as e:
                self.logger.error(f"Error stopping {name}: {e}")

    def _check_component(self, name, component) -> Dict:
        """Collect one component's health entry and restart it if it failed.

        Returns:
            Dict with 'status', 'uptime', 'errors' and 'message' keys,
            captured before any restart.
        """
        health = component.get_health()
        entry = {
            'status': health.status,
            'uptime': health.uptime,
            'errors': health.error_count,
            'message': health.message
        }
        # Restart failed components, but never once shutdown has begun.
        if (health.status == 'failed' and component._running
                and not self._shutdown_event.is_set()):
            self.logger.warning(f"{name}: Restarting failed component...")
            component.stop(timeout=2.0)
            # Interruptible 1s pause; skip the start if shutdown arrived.
            if not self._shutdown_event.wait(1):
                component.start()
        return entry

    def _supervisor_loop(self):
        """Main supervisor loop - monitors components"""
        self.logger.info("Supervisor monitoring started")
        while not self._shutdown_event.is_set():
            # Check health of all components. One misbehaving component must
            # not take the monitor thread (and the watchdog ping) down.
            health_report = {}
            for name, component in list(self.components.items()):
                try:
                    health_report[name] = self._check_component(name, component)
                except Exception as e:
                    self.logger.error(f"{name}: Health check raised: {e}")
                    health_report[name] = {
                        'status': 'failed',
                        'uptime': 0.0,
                        'errors': 0,
                        'message': f'Health check error: {str(e)[:100]}'
                    }
            # Log health summary
            failed = sum(1 for h in health_report.values() if h['status'] == 'failed')
            if failed > 0:
                self.logger.error(f"Health check: {failed} components failed",
                                  extra={'extra_data': health_report})
            else:
                self.logger.debug("Health check: all components healthy",
                                  extra={'extra_data': health_report})
            # Notify systemd watchdog
            if SYSTEMD_AVAILABLE:
                notify(Notification.WATCHDOG)
            # Wait for next check
            self._shutdown_event.wait(self._health_check_interval)
        self.logger.info("Supervisor monitoring stopped")

    def get_status(self) -> Dict:
        """Get full status of supervisor and components"""
        components = {}
        for name, comp in list(self.components.items()):
            # One health read per component keeps the four fields consistent.
            health = comp.get_health()
            components[name] = {
                'status': health.status,
                'uptime': health.uptime,
                'errors': health.error_count,
                'message': health.message
            }
        return {
            'supervisor': {
                'running': self._running,
                'components_count': len(self.components)
            },
            'components': components
        }

    def run(self):
        """Run the supervisor (blocking).

        Starts every component and the monitoring thread, then blocks until a
        shutdown is requested. On any exit path the monitor is stopped first
        and the components afterwards.
        """
        self.logger.info("=" * 60)
        self.logger.info("Dolphin Service Supervisor Starting")
        self.logger.info("=" * 60)
        self._running = True
        try:
            # Start all components
            self.start_all()
            # Start supervisor monitoring thread
            self._supervisor_thread = threading.Thread(
                target=self._supervisor_loop,
                name="supervisor-monitor"
            )
            self._supervisor_thread.start()
            # Wait for shutdown signal
            while not self._shutdown_event.is_set():
                self._shutdown_event.wait(1)
        except KeyboardInterrupt:
            pass
        finally:
            self._running = False
            # Always release the monitor thread: it is not a daemon thread
            # and would otherwise keep the process alive.
            self._shutdown_event.set()
            # Stop the monitor before the components so it cannot restart a
            # component while it is being shut down.
            if self._supervisor_thread:
                self._supervisor_thread.join(timeout=5.0)
            self.stop_all()
            self.logger.info("Supervisor shutdown complete")
# =============================================================================
# EXAMPLE COMPONENTS
# =============================================================================
class ExFComponent(ServiceComponent):
    """External Factors component, cycling every 0.5s (aggressive oversampling)."""

    def __init__(self):
        super().__init__('exf', interval=0.5, max_retries=3)
        # Latest reading per indicator name: {'value': ..., 'timestamp': ...}
        self.indicators = {}

    def run_cycle(self):
        """Store simulated 'basis' and 'spread' readings with a fresh timestamp."""
        # Placeholder values. In a real implementation: fetch from APIs,
        # push to Hazelcast.
        simulated = (('basis', 0.01), ('spread', 0.02))
        for indicator, reading in simulated:
            self.indicators[indicator] = {'value': reading, 'timestamp': time.time()}
class OBComponent(ServiceComponent):
    """Order Book Streamer component, cycling every 500ms."""

    def __init__(self):
        super().__init__('ob', interval=0.5, max_retries=3)

    def run_cycle(self):
        """Placeholder for taking an order-book snapshot; currently does nothing."""
        return None
class WatchdogComponent(ServiceComponent):
    """Survival Stack Watchdog component, cycling every 10s."""

    def __init__(self):
        super().__init__('watchdog', interval=10.0, max_retries=5)
        # Starting posture; nothing in this class changes it yet.
        self.posture = 'APEX'

    def run_cycle(self):
        """Placeholder for checking categories and computing the posture."""
        return None
class MCComponent(ServiceComponent):
    """MC-Forewarner - runs at most every 4h; the cycle checks every 5 minutes if it's time."""

    # Seconds between "is it time yet?" checks (the component's cycle interval).
    CHECK_INTERVAL_S = 300
    # Minimum seconds between two assessments.
    ASSESSMENT_INTERVAL_S = 4 * 60 * 60

    def __init__(self):
        super().__init__(name='mc', interval=self.CHECK_INTERVAL_S, max_retries=3)
        # time.time() of the last assessment; 0 makes the first cycle run one.
        self.last_run = 0

    def run_cycle(self):
        """Run the assessment when ASSESSMENT_INTERVAL_S has passed since the last one."""
        now = time.time()
        if now - self.last_run > self.ASSESSMENT_INTERVAL_S:
            self.logger.info("Running MC-Forewarner assessment")
            self.last_run = now
# =============================================================================
# MAIN ENTRY POINT
# =============================================================================
if __name__ == '__main__':
    # Build the supervisor, register one instance of each component in a
    # fixed order, then block in run() until shutdown.
    supervisor = DolphinSupervisor(health_check_interval=10.0)
    for component_cls in (ExFComponent, OBComponent, WatchdogComponent, MCComponent):
        supervisor.register(component_cls())
    supervisor.run()