initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)
Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
411
prod/services/supervisor.py
Executable file
411
prod/services/supervisor.py
Executable file
@@ -0,0 +1,411 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dolphin Service Supervisor
|
||||
==========================
|
||||
A SINGLE userland service that manages MULTIPLE service-like components.
|
||||
|
||||
Architecture:
|
||||
- One systemd service: dolphin-supervisor.service
|
||||
- Internally manages: ExF, OB, Watchdog, MC, etc.
|
||||
- Each component is a Python thread/async task
|
||||
- Centralized health, logging, restart
|
||||
"""
|
||||
import asyncio
|
||||
import threading
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import traceback
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Optional, Callable
|
||||
from datetime import datetime
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import logging
|
||||
|
||||
# Optional systemd notify
|
||||
try:
    from pystemd.daemon import notify, Notification
except ImportError:
    # pystemd is optional: without it the supervisor still runs, it just
    # cannot report readiness / watchdog pings to systemd.
    SYSTEMD_AVAILABLE = False

    def notify(*args, **kwargs):
        """No-op stand-in used when pystemd is not installed."""
        return None
else:
    SYSTEMD_AVAILABLE = True
|
||||
|
||||
# Optional tenacity for retries
|
||||
try:
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
TENACITY_AVAILABLE = True
|
||||
except ImportError:
|
||||
TENACITY_AVAILABLE = False
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# STRUCTURED LOGGING
|
||||
# =============================================================================
|
||||
|
||||
class JSONFormatter(logging.Formatter):
    """Render each log record as one single-line JSON object.

    The object always carries ``timestamp`` (naive UTC, ISO-8601), ``level``,
    ``component`` (taken from ``record.component``, default ``'supervisor'``),
    ``message`` and ``source`` (the logger name).  If the record has an
    ``extra_data`` attribute (set via ``extra={'extra_data': {...}}``) its
    items are merged into the top level of the object.
    """

    def format(self, record):
        """Return *record* serialised as a JSON string.

        Formatting never raises because of the payload: values that are not
        JSON-serialisable are stringified, and an ``extra_data`` that is not a
        mapping is emitted under the ``'extra_data'`` key instead of being
        merged.  A formatter that raises would cause the record to be lost.
        """
        # Local import: the module only imports the ``datetime`` class.
        from datetime import timezone

        log_data = {
            # Same naive-UTC text as the deprecated datetime.utcnow().
            'timestamp': datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
            'level': record.levelname,
            'component': getattr(record, 'component', 'supervisor'),
            'message': record.getMessage(),
            'source': record.name,
        }

        extra_data = getattr(record, 'extra_data', None)
        if extra_data is not None:
            try:
                log_data.update(extra_data)
            except (TypeError, ValueError):
                # Not a mapping / iterable of pairs: keep it, but namespaced.
                log_data['extra_data'] = extra_data

        # default=str keeps logging alive when the payload contains sets,
        # exceptions, datetimes or other non-JSON types.
        return json.dumps(log_data, default=str)
|
||||
|
||||
def get_logger(name: str) -> logging.Logger:
    """Return the logger called *name*, wiring up JSON output on first use.

    A logger that already has at least one handler is returned untouched, so
    repeated calls never stack duplicate handlers.  Otherwise a stream handler
    using ``JSONFormatter`` is attached and the level is set to ``INFO``.
    """
    logger = logging.getLogger(name)
    if logger.handlers:
        # Already configured (by an earlier call or by the application).
        return logger

    stream = logging.StreamHandler()
    stream.setFormatter(JSONFormatter())
    logger.addHandler(stream)
    logger.setLevel(logging.INFO)
    return logger
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# COMPONENT BASE CLASS
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class ComponentHealth:
    """Snapshot of one component's health as reported to the supervisor."""

    # Component name this snapshot belongs to.
    name: str
    # One of: 'healthy', 'degraded', 'failed', 'stopped'
    status: str
    # time.time() of the last successful cycle (0 if none yet).
    last_run: float
    # Current error counter of the component.
    error_count: int
    # Short human-readable status text.
    message: str
    # Seconds since the component loop started.
    uptime: float = 0.0
|
||||
|
||||
|
||||
class _ComponentTagFilter(logging.Filter):
    """Logger filter that stamps records with the owning component's name."""

    def __init__(self, component: str):
        super().__init__()
        self.component = component

    def filter(self, record):
        # Respect a component passed explicitly through ``extra=``.
        if not hasattr(record, 'component'):
            record.component = self.component
        return True


class ServiceComponent(ABC):
    """
    Base class for a service-like component.

    Each component runs ``run_cycle()`` repeatedly in its own daemon thread,
    pausing ``interval`` seconds between cycles.  A cycle is attempted up to
    ``max_retries`` times (with exponential backoff) before the component is
    marked ``'failed'``.  Both the inter-cycle pause and the backoff wait are
    interruptible, so ``stop()`` takes effect promptly even for components
    with a long interval.

    Args:
        name: Component name; also used for the logger and the thread name.
        interval: Seconds to wait between cycles.
        max_retries: Attempts per cycle (values below 1 are treated as 1).
    """

    def __init__(self, name: str, interval: float = 1.0, max_retries: int = 3):
        self.name = name
        self.interval = interval
        self.max_retries = max_retries
        self.logger = get_logger(f'component.{name}')
        # Kept for backward compatibility; on its own this attribute is never
        # copied onto log records, hence the filter below.
        self.logger.component = name
        if not any(isinstance(f, _ComponentTagFilter) for f in self.logger.filters):
            self.logger.addFilter(_ComponentTagFilter(name))

        self._running = False
        # Set to request shutdown; doubles as an interruptible sleep.
        self._stop_event = threading.Event()
        self._thread: Optional[threading.Thread] = None
        self._error_count = 0
        self._last_run = 0
        self._start_time = 0
        self._health = ComponentHealth(
            name=name, status='stopped',
            last_run=0, error_count=0, message='Not started'
        )

    @abstractmethod
    def run_cycle(self):
        """Override this with your component's work"""
        pass

    def health_check(self) -> bool:
        """Override for custom health check"""
        return True

    def _execute_with_retry(self):
        """Run one cycle, retrying with exponential backoff on failure.

        On success the error counter is reset and ``_last_run`` is updated.

        Raises:
            Exception: whatever ``run_cycle`` raised on the final attempt, or
                on the current attempt if a stop was requested while backing
                off.
        """
        # Guard against max_retries <= 0, which would otherwise skip the
        # cycle entirely while still being reported as healthy.
        attempts = max(1, self.max_retries)
        for attempt in range(attempts):
            try:
                self.run_cycle()
            except Exception as e:
                self._error_count += 1
                self.logger.error(
                    f"Cycle failed (attempt {attempt + 1}): {e}",
                    extra={'extra_data': {'attempt': attempt + 1, 'error': str(e)}}
                )
                if attempt >= attempts - 1:
                    raise
                # Exponential backoff (capped at 30s); wait() returns True
                # if stop() was called meanwhile, in which case give up now.
                if self._stop_event.wait(min(2 ** attempt, 30)):
                    raise
            else:
                self._error_count = 0
                self._last_run = time.time()
                return

    def _loop(self):
        """Main component loop (runs in thread)"""
        if not self._stop_event.is_set():
            self._running = True
        self._start_time = time.time()
        self.logger.info(f"{self.name}: Component started")

        while not self._stop_event.is_set():
            try:
                self._execute_with_retry()
                self._health.status = 'healthy'
                self._health.message = 'Running normally'
            except Exception as e:
                self._health.status = 'failed'
                self._health.message = f'Failed: {str(e)[:100]}'
                self.logger.error(f"{self.name}: Component failed: {e}")
                # Continue running (supervisor will restart if needed)

            # Sleep until next cycle, waking immediately on stop().
            self._stop_event.wait(self.interval)

        self._running = False
        self._health.status = 'stopped'
        self.logger.info(f"{self.name}: Component stopped")

    def start(self):
        """Start the component in a new daemon thread (no-op if already alive)."""
        if self._thread and self._thread.is_alive():
            self.logger.warning(f"{self.name}: Already running")
            return

        # Arm the state *before* the thread exists so that a stop() issued
        # right after start() cannot be overwritten by the new thread.
        self._stop_event.clear()
        self._running = True

        self._thread = threading.Thread(target=self._loop, name=f"component-{self.name}")
        self._thread.daemon = True
        self._thread.start()
        self.logger.info(f"{self.name}: Thread started")

    def stop(self, timeout: float = 5.0):
        """Request shutdown and wait up to *timeout* seconds for the thread.

        A warning is logged if the thread is still alive afterwards (for
        example because ``run_cycle`` itself is blocking).
        """
        self._running = False
        self._stop_event.set()
        worker = self._thread
        # Joining the current thread would raise RuntimeError.
        if worker and worker.is_alive() and worker is not threading.current_thread():
            worker.join(timeout=timeout)
            if worker.is_alive():
                self.logger.warning(f"{self.name}: Thread did not stop gracefully")

    def get_health(self) -> "ComponentHealth":
        """Refresh and return the component's (shared, mutable) health record."""
        self._health.last_run = self._last_run
        self._health.error_count = self._error_count
        if self._start_time:
            self._health.uptime = time.time() - self._start_time
        return self._health
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# SUPERVISOR (SINGLE SERVICE)
|
||||
# =============================================================================
|
||||
|
||||
class DolphinSupervisor:
    """
    SINGLE service that manages MULTIPLE userland components.

    Components are started in their own threads; a monitor thread polls
    their health every ``health_check_interval`` seconds, restarts failed
    ones and pings the systemd watchdog when pystemd is available.

    Usage:
        supervisor = DolphinSupervisor()
        supervisor.register(ExFComponent())
        supervisor.register(OBComponent())
        supervisor.register(WatchdogComponent())
        supervisor.run()
    """

    def __init__(self, health_check_interval: float = 10.0):
        self.logger = get_logger('supervisor')
        self.logger.component = 'supervisor'

        self.components: Dict[str, "ServiceComponent"] = {}
        self._running = False
        self._shutdown_event = threading.Event()
        self._health_check_interval = health_check_interval
        self._supervisor_thread: Optional[threading.Thread] = None

        # Signal handling
        self._setup_signals()

    def _setup_signals(self):
        """Install SIGTERM/SIGINT handlers that request a graceful shutdown."""
        def handler(signum, frame):
            self.logger.info(f"Received signal {signum}, shutting down...")
            self._shutdown_event.set()

        try:
            signal.signal(signal.SIGTERM, handler)
            signal.signal(signal.SIGINT, handler)
        except ValueError:
            # signal.signal() may only be called from the main thread; a
            # supervisor built elsewhere must be stopped via its event.
            self.logger.warning("Signal handlers not installed (not in main thread)")

    def register(self, component: "ServiceComponent"):
        """Register a component to be managed (keyed by ``component.name``)."""
        self.components[component.name] = component
        self.logger.info(f"Registered component: {component.name}")

    def start_all(self):
        """Start all registered components, then report READY to systemd."""
        self.logger.info(f"Starting {len(self.components)} components...")
        for name, component in list(self.components.items()):
            try:
                component.start()
            except Exception as e:
                self.logger.error(f"Failed to start {name}: {e}")

        # Notify systemd we're ready
        if SYSTEMD_AVAILABLE:
            notify(Notification.READY)
            self.logger.info("Notified systemd: READY")

    def stop_all(self, timeout: float = 5.0):
        """Stop all components gracefully, allowing *timeout* seconds each."""
        self.logger.info("Stopping all components...")
        for name, component in list(self.components.items()):
            try:
                component.stop(timeout=timeout)
            except Exception as e:
                self.logger.error(f"Error stopping {name}: {e}")

    def _check_component(self, name, component):
        """Return the health summary for one component, restarting it if failed."""
        health = component.get_health()
        summary = {
            'status': health.status,
            'uptime': health.uptime,
            'errors': health.error_count,
            'message': health.message
        }

        # Restart failed components
        if health.status == 'failed' and component._running:
            self.logger.warning(f"{name}: Restarting failed component...")
            component.stop(timeout=2.0)
            # Brief pause before restarting; skip the restart if shutdown
            # was requested in the meantime.
            if not self._shutdown_event.wait(1):
                component.start()
        return summary

    def _supervisor_loop(self):
        """Main supervisor loop - monitors components"""
        self.logger.info("Supervisor monitoring started")

        while not self._shutdown_event.is_set():
            # Check health of all components.  Iterate over a snapshot so a
            # concurrent register() cannot break the iteration.
            health_report = {}
            for name, component in list(self.components.items()):
                try:
                    health_report[name] = self._check_component(name, component)
                except Exception as e:
                    # One misbehaving component must not kill the monitor
                    # thread (and with it the systemd watchdog pings).
                    self.logger.error(f"Health check error for {name}: {e}")

            # Log health summary
            failed = sum(1 for h in health_report.values() if h['status'] == 'failed')
            if failed > 0:
                self.logger.error(f"Health check: {failed} components failed",
                                  extra={'extra_data': health_report})
            else:
                self.logger.debug("Health check: all components healthy",
                                  extra={'extra_data': health_report})

            # Notify systemd watchdog
            if SYSTEMD_AVAILABLE:
                notify(Notification.WATCHDOG)

            # Wait for next check
            self._shutdown_event.wait(self._health_check_interval)

        self.logger.info("Supervisor monitoring stopped")

    def get_status(self) -> Dict:
        """Get full status of supervisor and components.

        Returns:
            ``{'supervisor': {'running', 'components_count'},
            'components': {name: {'status', 'uptime', 'errors', 'message'}}}``
        """
        components = {}
        for name, comp in list(self.components.items()):
            # One get_health() call per component: a single consistent read.
            health = comp.get_health()
            components[name] = {
                'status': health.status,
                'uptime': health.uptime,
                'errors': health.error_count,
                'message': health.message
            }
        return {
            'supervisor': {
                'running': self._running,
                'components_count': len(self.components)
            },
            'components': components
        }

    def run(self):
        """Run the supervisor (blocking) until a shutdown is requested."""
        self.logger.info("=" * 60)
        self.logger.info("Dolphin Service Supervisor Starting")
        self.logger.info("=" * 60)

        self._running = True

        # Start all components
        self.start_all()

        # Start supervisor monitoring thread
        self._supervisor_thread = threading.Thread(
            target=self._supervisor_loop,
            name="supervisor-monitor"
        )
        self._supervisor_thread.start()

        # Wait for shutdown signal
        try:
            while not self._shutdown_event.is_set():
                self._shutdown_event.wait(1)
        except KeyboardInterrupt:
            pass
        finally:
            self._running = False
            # Always release the monitor thread, whatever ended the wait;
            # otherwise the non-daemon monitor keeps the process alive.
            self._shutdown_event.set()
            self.stop_all()
            if self._supervisor_thread:
                self._supervisor_thread.join(timeout=5.0)

        self.logger.info("Supervisor shutdown complete")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# EXAMPLE COMPONENTS
|
||||
# =============================================================================
|
||||
|
||||
class ExFComponent(ServiceComponent):
    """External Factors component, cycled every 0.5s (example implementation)."""

    def __init__(self):
        super().__init__(name='exf', interval=0.5, max_retries=3)
        self.indicators = {}

    def run_cycle(self):
        """Refresh the simulated indicator values with a fresh timestamp."""
        # Placeholder data only.
        # In real implementation: fetch from APIs, push to Hazelcast
        for key, placeholder in (('basis', 0.01), ('spread', 0.02)):
            self.indicators[key] = {'value': placeholder, 'timestamp': time.time()}
|
||||
|
||||
|
||||
class OBComponent(ServiceComponent):
    """Order Book Streamer component, cycled every 500ms (example stub)."""

    def __init__(self):
        super().__init__(name='ob', interval=0.5, max_retries=3)

    def run_cycle(self):
        """Placeholder for taking an order-book snapshot; currently does nothing."""
        return None
|
||||
|
||||
|
||||
class WatchdogComponent(ServiceComponent):
    """Survival Stack Watchdog component, cycled every 10s (example stub)."""

    def __init__(self):
        super().__init__(name='watchdog', interval=10.0, max_retries=5)
        self.posture = 'APEX'

    def run_cycle(self):
        """Placeholder for checking categories and computing the posture."""
        return None
|
||||
|
||||
|
||||
class MCComponent(ServiceComponent):
    """MC-Forewarner - runs every 4h (polled every 5 min to see if it's due)."""

    # Minimum spacing between two assessments, in seconds (4 hours).
    RUN_EVERY_SECONDS = 14400

    def __init__(self):
        super().__init__(name='mc', interval=300, max_retries=3)  # 5 min check
        # time.time() of the last assessment; 0 makes the first cycle run.
        self.last_run = 0

    def run_cycle(self):
        """Run the assessment if more than ``RUN_EVERY_SECONDS`` have elapsed."""
        now = time.time()
        # Only actually run every 4 hours
        if now - self.last_run > self.RUN_EVERY_SECONDS:
            self.logger.info("Running MC-Forewarner assessment")
            self.last_run = now
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MAIN ENTRY POINT
|
||||
# =============================================================================
|
||||
|
||||
if __name__ == '__main__':
    # Build the supervisor, hand it one instance of every example
    # component (in this order), then block until shutdown.
    supervisor = DolphinSupervisor(health_check_interval=10.0)

    for component_cls in (ExFComponent, OBComponent, WatchdogComponent, MCComponent):
        supervisor.register(component_cls())

    supervisor.run()
|
||||
Reference in New Issue
Block a user