412 lines
14 KiB
Python
412 lines
14 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
Dolphin Service Supervisor
|
||
|
|
==========================
|
||
|
|
A SINGLE userland service that manages MULTIPLE service-like components.
|
||
|
|
|
||
|
|
Architecture:
|
||
|
|
- One systemd service: dolphin-supervisor.service
|
||
|
|
- Internally manages: ExF, OB, Watchdog, MC, etc.
|
||
|
|
- Each component runs in its own Python thread
|
||
|
|
- Centralized health, logging, restart
|
||
|
|
"""
|
||
|
|
import asyncio
|
||
|
|
import threading
|
||
|
|
import signal
|
||
|
|
import sys
|
||
|
|
import time
|
||
|
|
import json
|
||
|
|
import traceback
|
||
|
|
from abc import ABC, abstractmethod
|
||
|
|
from dataclasses import dataclass, field
|
||
|
|
from typing import Dict, List, Optional, Callable
|
||
|
|
from datetime import datetime
|
||
|
|
from concurrent.futures import ThreadPoolExecutor
|
||
|
|
import logging
|
||
|
|
|
||
|
|
# Optional systemd notify
|
||
|
|
# pystemd is optional.  Without it SYSTEMD_AVAILABLE is False and notify()
# degrades to a stub that accepts any arguments and does nothing.
try:
    from pystemd.daemon import notify, Notification
except ImportError:
    SYSTEMD_AVAILABLE = False

    def notify(*args, **kwargs):
        """No-op stand-in used when pystemd cannot be imported."""
        return None
else:
    SYSTEMD_AVAILABLE = True
|
||
|
|
|
||
|
|
# Optional tenacity for retries
|
||
|
|
# tenacity is optional.  TENACITY_AVAILABLE records whether its retry helpers
# could be imported.
try:
    from tenacity import retry, stop_after_attempt, wait_exponential
except ImportError:
    TENACITY_AVAILABLE = False
else:
    TENACITY_AVAILABLE = True
|
||
|
|
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# STRUCTURED LOGGING
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
class JSONFormatter(logging.Formatter):
    """Render each log record as a single-line JSON object.

    Emitted keys: ``timestamp`` (naive UTC, ISO 8601), ``level``,
    ``component``, ``message`` and ``source`` (the logger name).  If the record
    carries an ``extra_data`` mapping (passed by callers as
    ``extra={'extra_data': {...}}``), its entries are merged on top and may
    override the standard keys.
    """

    def format(self, record):
        """Return *record* serialized as a JSON string.

        Args:
            record: The ``logging.LogRecord`` to serialize.

        Returns:
            A JSON document (``str``) describing the record.
        """
        # Function-scope import: the module itself only imports ``datetime``.
        from datetime import timezone

        # Stamp the moment the event was logged (record.created) rather than
        # the moment it happens to be formatted, and avoid the deprecated
        # datetime.utcnow().  tzinfo is stripped so the output keeps the
        # original naive-UTC layout.
        event_time = datetime.fromtimestamp(record.created, tz=timezone.utc)

        # Prefer an explicit per-record component; otherwise fall back to the
        # ``component`` attribute that callers set on the emitting logger.
        component = getattr(record, 'component', None)
        if component is None:
            owner = logging.getLogger(record.name)
            component = getattr(owner, 'component', 'supervisor')

        log_data = {
            'timestamp': event_time.replace(tzinfo=None).isoformat(),
            'level': record.levelname,
            'component': component,
            'message': record.getMessage(),
            'source': record.name,
        }

        extra = getattr(record, 'extra_data', None)
        if extra:
            log_data.update(extra)

        # default=str is deliberate: a value that JSON cannot encode (an
        # exception instance, a datetime, ...) must not cost us the log line.
        return json.dumps(log_data, default=str)
|
||
|
|
|
||
|
|
def get_logger(name: str) -> logging.Logger:
    """Return the logger called *name*, configuring JSON output on first use.

    A logger that already has at least one handler is returned untouched, so
    repeated calls never stack duplicate handlers.  Otherwise a stream handler
    formatted by ``JSONFormatter`` is attached and the level is set to INFO.

    Args:
        name: Logger name as understood by ``logging.getLogger``.

    Returns:
        The (possibly freshly configured) ``logging.Logger``.
    """
    log = logging.getLogger(name)
    if log.handlers:
        # Already configured by an earlier call (or by someone else).
        return log

    stream = logging.StreamHandler()
    stream.setFormatter(JSONFormatter())
    log.addHandler(stream)
    log.setLevel(logging.INFO)
    return log
|
||
|
|
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# COMPONENT BASE CLASS
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
@dataclass
class ComponentHealth:
    """Health snapshot of a single supervised component."""

    # Name of the component this snapshot belongs to.
    name: str
    # One of 'healthy', 'degraded', 'failed', 'stopped'.
    status: str
    # time.time() of the most recent successful cycle (0 before the first).
    last_run: float
    # Failed attempts counted since the last successful cycle.
    error_count: int
    # Human-readable explanation of the current status.
    message: str
    # Seconds elapsed since the component's loop was started.
    uptime: float = 0.0
|
||
|
|
|
||
|
|
|
||
|
|
class ServiceComponent(ABC):
    """Base class for a service-like component managed by the supervisor.

    Each component owns one daemon thread that calls ``run_cycle()`` and then
    waits ``interval`` seconds, until ``stop()`` is requested.  A failing cycle
    is retried with exponential backoff; once the retries are exhausted the
    component is marked ``'failed'`` but keeps looping.  Subclasses must
    implement ``run_cycle()`` and may override ``health_check()``.
    """

    def __init__(self, name: str, interval: float = 1.0, max_retries: int = 3):
        """Initialise the component without starting its thread.

        Args:
            name: Component identifier; also used in the logger and thread
                names.
            interval: Seconds to wait between two cycles.
            max_retries: Attempts per cycle before it counts as failed.
                Values below 1 are treated as 1.
        """
        self.name = name
        self.interval = interval
        self.max_retries = max_retries
        self.logger = get_logger(f'component.{name}')
        # Tag the logger object itself with the component name.
        self.logger.component = name

        self._running = False
        self._thread: Optional[threading.Thread] = None
        # Set by stop(); every wait in the worker blocks on this event so a
        # stop request wakes the thread immediately instead of after a sleep.
        self._stop_event = threading.Event()
        self._error_count = 0
        self._last_run = 0.0
        self._start_time = 0.0
        self._health = ComponentHealth(
            name=name, status='stopped',
            last_run=0, error_count=0, message='Not started'
        )

    @abstractmethod
    def run_cycle(self):
        """Perform one unit of work; raise an exception to signal failure."""

    def health_check(self) -> bool:
        """Return True when the component considers itself healthy.

        Consulted after every successful cycle; a falsy result (or an
        exception) marks the component ``'degraded'``.  The default always
        reports healthy.
        """
        return True

    def _execute_with_retry(self):
        """Run ``run_cycle()`` once, retrying failures with backoff.

        Raises:
            Exception: The last ``run_cycle()`` error once all attempts are
                used up, or as soon as a stop is requested during backoff.
        """
        # Always try at least once, even with max_retries <= 0; otherwise the
        # cycle would be skipped silently and the component reported healthy.
        attempts = max(1, self.max_retries)
        for attempt in range(attempts):
            try:
                self.run_cycle()
            except Exception as e:
                self._error_count += 1
                self.logger.error(
                    f"Cycle failed (attempt {attempt + 1}): {e}",
                    extra={'extra_data': {'attempt': attempt + 1, 'error': str(e)}}
                )
                if attempt >= attempts - 1:
                    raise
                # Exponential backoff capped at 30 s.  wait() returns True if
                # stop() fired meanwhile, in which case retrying is pointless.
                if self._stop_event.wait(min(2 ** attempt, 30)):
                    raise
            else:
                self._error_count = 0
                self._last_run = time.time()
                return

    def _record_success(self):
        """Update the health record after a cycle that did not raise."""
        try:
            healthy = self.health_check()
        except Exception as e:
            healthy = False
            self.logger.error(f"{self.name}: Health check raised: {e}")

        if healthy:
            self._health.status = 'healthy'
            self._health.message = 'Running normally'
        else:
            self._health.status = 'degraded'
            self._health.message = 'Health check failed'

    def _loop(self):
        """Worker-thread body: run cycles until a stop is requested."""
        self._running = True
        self._start_time = time.time()
        self.logger.info(f"{self.name}: Component started")

        while not self._stop_event.is_set():
            try:
                self._execute_with_retry()
            except Exception as e:
                # Keep looping; whoever monitors the health status decides
                # whether the component gets restarted.
                self._health.status = 'failed'
                self._health.message = f'Failed: {str(e)[:100]}'
                self.logger.error(f"{self.name}: Component failed: {e}")
            else:
                self._record_success()

            # Interruptible pause until the next cycle.
            self._stop_event.wait(self.interval)

        self._running = False
        self._health.status = 'stopped'
        self.logger.info(f"{self.name}: Component stopped")

    def start(self):
        """Start the worker thread; do nothing if it is already alive."""
        if self._thread and self._thread.is_alive():
            self.logger.warning(f"{self.name}: Already running")
            return

        # Re-arm after a previous stop() and flag the component as running
        # right away, so a stop() racing with thread start-up is not lost.
        self._stop_event.clear()
        self._running = True
        self._thread = threading.Thread(
            target=self._loop, name=f"component-{self.name}", daemon=True
        )
        self._thread.start()
        self.logger.info(f"{self.name}: Thread started")

    def stop(self, timeout: float = 5.0):
        """Request a stop and wait up to *timeout* seconds for the thread.

        Args:
            timeout: Maximum seconds to wait for the worker to exit.  A worker
                blocked inside ``run_cycle()`` may outlive the timeout; this
                is logged as a warning.
        """
        self._running = False
        self._stop_event.set()

        worker = self._thread
        if not worker or not worker.is_alive():
            return
        if worker is threading.current_thread():
            # Called from inside run_cycle(): a thread cannot join itself.
            return

        worker.join(timeout=timeout)
        if worker.is_alive():
            self.logger.warning(f"{self.name}: Thread did not stop gracefully")

    def get_health(self) -> "ComponentHealth":
        """Refresh and return this component's health record.

        Returns:
            The component's own (mutable, shared) ``ComponentHealth`` object,
            updated with the latest run time, error count and uptime.
        """
        self._health.last_run = self._last_run
        self._health.error_count = self._error_count
        if self._start_time:
            self._health.uptime = time.time() - self._start_time
        return self._health
|
||
|
|
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# SUPERVISOR (SINGLE SERVICE)
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
class DolphinSupervisor:
    """SINGLE service that manages MULTIPLE userland components.

    The supervisor starts every registered component, polls their health from
    a monitor thread, restarts components that report ``'failed'`` and shuts
    everything down on SIGTERM/SIGINT.

    Usage:
        supervisor = DolphinSupervisor()
        supervisor.register(ExFComponent())
        supervisor.register(OBComponent())
        supervisor.register(WatchdogComponent())
        supervisor.run()
    """

    def __init__(self, health_check_interval: float = 10.0):
        """Create an idle supervisor and install the signal handlers.

        Args:
            health_check_interval: Seconds between two health sweeps of the
                monitor thread.
        """
        self.logger = get_logger('supervisor')
        self.logger.component = 'supervisor'

        self.components: Dict[str, ServiceComponent] = {}
        self._running = False
        self._shutdown_event = threading.Event()
        self._health_check_interval = health_check_interval
        self._supervisor_thread: Optional[threading.Thread] = None

        # Signal handling
        self._setup_signals()

    def _setup_signals(self):
        """Route SIGTERM and SIGINT to a graceful shutdown request."""
        def handler(signum, frame):
            self.logger.info(f"Received signal {signum}, shutting down...")
            self._shutdown_event.set()

        try:
            signal.signal(signal.SIGTERM, handler)
            signal.signal(signal.SIGINT, handler)
        except ValueError as e:
            # signal.signal() raises ValueError outside the main thread; the
            # supervisor stays usable, it just cannot be stopped by signal.
            self.logger.warning(f"Signal handlers not installed: {e}")

    def register(self, component: "ServiceComponent"):
        """Register *component* under its ``name``.

        A component registered under an existing name replaces the previous
        one.  Registration does not start the component.
        """
        self.components[component.name] = component
        self.logger.info(f"Registered component: {component.name}")

    def start_all(self):
        """Start every registered component, then report READY to systemd.

        A component that fails to start is logged and skipped; the remaining
        components are still started.
        """
        self.logger.info(f"Starting {len(self.components)} components...")
        # Iterate over a snapshot so a concurrent register() cannot break it.
        for name, component in list(self.components.items()):
            try:
                component.start()
            except Exception as e:
                self.logger.error(f"Failed to start {name}: {e}")

        # Notify systemd we're ready
        if SYSTEMD_AVAILABLE:
            notify(Notification.READY)
            self.logger.info("Notified systemd: READY")

    def stop_all(self, timeout: float = 5.0):
        """Stop every registered component.

        Args:
            timeout: Seconds each component is given to stop; components are
                stopped one after another.
        """
        self.logger.info("Stopping all components...")
        for name, component in list(self.components.items()):
            try:
                component.stop(timeout=timeout)
            except Exception as e:
                self.logger.error(f"Error stopping {name}: {e}")

    def _restart_component(self, name: str, component) -> None:
        """Stop and start one failed component unless shutdown intervenes."""
        self.logger.warning(f"{name}: Restarting failed component...")
        component.stop(timeout=2.0)
        # Short pause before restarting; wait() returns True (and the restart
        # is abandoned) if shutdown is requested in the meantime.
        if self._shutdown_event.wait(1):
            return
        component.start()

    def _check_components(self) -> Dict[str, Dict]:
        """Collect a health report and restart failed components.

        Returns:
            Mapping of component name to its status, uptime, error count and
            message as observed before any restart.
        """
        health_report = {}
        for name, component in list(self.components.items()):
            # One misbehaving component must not take the monitor down.
            try:
                health = component.get_health()
                health_report[name] = {
                    'status': health.status,
                    'uptime': health.uptime,
                    'errors': health.error_count,
                    'message': health.message
                }

                needs_restart = health.status == 'failed' and component._running
                if needs_restart and not self._shutdown_event.is_set():
                    self._restart_component(name, component)
            except Exception as e:
                self.logger.error(f"{name}: Health check error: {e}")
        return health_report

    def _supervisor_loop(self):
        """Monitor-thread body: sweep component health until shutdown."""
        self.logger.info("Supervisor monitoring started")

        while not self._shutdown_event.is_set():
            health_report = self._check_components()

            # Log health summary
            failed = sum(1 for h in health_report.values() if h['status'] == 'failed')
            if failed > 0:
                self.logger.error(f"Health check: {failed} components failed",
                                  extra={'extra_data': health_report})
            else:
                self.logger.debug("Health check: all components healthy",
                                  extra={'extra_data': health_report})

            # Notify systemd watchdog
            if SYSTEMD_AVAILABLE:
                notify(Notification.WATCHDOG)

            # Wait for next check (returns early on shutdown)
            self._shutdown_event.wait(self._health_check_interval)

        self.logger.info("Supervisor monitoring stopped")

    def get_status(self) -> Dict:
        """Return the status of the supervisor and of every component.

        Returns:
            ``{'supervisor': {'running', 'components_count'},
            'components': {name: {'status', 'uptime', 'errors', 'message'}}}``.
        """
        components = {}
        for name, comp in list(self.components.items()):
            # Query once so all four fields come from the same refresh.
            health = comp.get_health()
            components[name] = {
                'status': health.status,
                'uptime': health.uptime,
                'errors': health.error_count,
                'message': health.message
            }

        return {
            'supervisor': {
                'running': self._running,
                'components_count': len(self.components)
            },
            'components': components
        }

    def run(self):
        """Run the supervisor until shutdown is requested (blocking)."""
        self.logger.info("=" * 60)
        self.logger.info("Dolphin Service Supervisor Starting")
        self.logger.info("=" * 60)

        self._running = True

        # Start all components
        self.start_all()

        # Start supervisor monitoring thread
        self._supervisor_thread = threading.Thread(
            target=self._supervisor_loop,
            name="supervisor-monitor"
        )
        self._supervisor_thread.start()

        # Wait for shutdown signal.  Polling with a timeout keeps the main
        # thread responsive to signals.
        try:
            while not self._shutdown_event.is_set():
                self._shutdown_event.wait(1)
        except KeyboardInterrupt:
            pass
        finally:
            self._running = False
            # We may get here without the event being set (KeyboardInterrupt
            # or an unexpected error); set it so the non-daemon monitor thread
            # exits instead of keeping the process alive.
            self._shutdown_event.set()
            # Stop the monitor first so it cannot restart a component that
            # stop_all() is shutting down.
            if self._supervisor_thread:
                self._supervisor_thread.join(timeout=5.0)
            self.stop_all()

        self.logger.info("Supervisor shutdown complete")
|
||
|
|
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# EXAMPLE COMPONENTS
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
class ExFComponent(ServiceComponent):
    """External Factors component: one cycle every 0.5 s, up to 3 attempts."""

    def __init__(self):
        super().__init__('exf', interval=0.5, max_retries=3)
        # Latest indicator readings, keyed by indicator name.
        self.indicators = {}

    def run_cycle(self):
        """Store fresh (currently simulated) ``basis`` and ``spread`` values."""
        # Placeholder constants; a real implementation would fetch from APIs
        # and push to Hazelcast.
        for key, value in (('basis', 0.01), ('spread', 0.02)):
            self.indicators[key] = {'value': value, 'timestamp': time.time()}
|
||
|
|
|
||
|
|
|
||
|
|
class OBComponent(ServiceComponent):
    """Order Book Streamer: one cycle every 500 ms, up to 3 attempts."""

    def __init__(self):
        super().__init__('ob', interval=0.5, max_retries=3)

    def run_cycle(self):
        """Placeholder for taking an order-book snapshot; does nothing yet."""
        return None
|
||
|
|
|
||
|
|
|
||
|
|
class WatchdogComponent(ServiceComponent):
    """Survival Stack Watchdog: one cycle every 10 s, up to 5 attempts."""

    def __init__(self):
        super().__init__('watchdog', interval=10.0, max_retries=5)
        # Current posture label; starts out as 'APEX'.
        self.posture = 'APEX'

    def run_cycle(self):
        """Placeholder for checking categories and computing the posture."""
        return None
|
||
|
|
|
||
|
|
|
||
|
|
class MCComponent(ServiceComponent):
    """MC-Forewarner: wakes every 5 minutes, runs at most once per 4 hours."""

    def __init__(self):
        # interval is the wake-up cadence (300 s), not the assessment cadence.
        super().__init__('mc', interval=300, max_retries=3)
        # Wall-clock time of the last assessment; 0 makes the first cycle run.
        self.last_run = 0

    def run_cycle(self):
        """Run the assessment if more than 4 hours passed since the last one."""
        elapsed = time.time() - self.last_run
        if elapsed <= 14400:  # 4 hours, in seconds
            return

        self.logger.info("Running MC-Forewarner assessment")
        self.last_run = time.time()
|
||
|
|
|
||
|
|
|
||
|
|
# =============================================================================
|
||
|
|
# MAIN ENTRY POINT
|
||
|
|
# =============================================================================
|
||
|
|
|
||
|
|
if __name__ == '__main__':
    # Build the supervisor with a 10-second health-check cadence.
    supervisor = DolphinSupervisor(health_check_interval=10.0)

    # Register one instance of each built-in component, in this order.
    for component_cls in (ExFComponent, OBComponent, WatchdogComponent, MCComponent):
        supervisor.register(component_cls())

    # Blocks until shutdown is requested.
    supervisor.run()
|