#!/usr/bin/env python3
"""
DOLPHIN All Services Prefect Daemon
====================================
Unified Prefect-managed daemon for all DOLPHIN algorithm services.

Manages:
- Scan Bridge Service (Arrow -> Hz)
- ACB Processor Service (ACB boost calculation)
- System Watchdog Service (Survival Stack)
- EXTF Service (External factors)
- OBF Service (Order book features)

Usage:
    prefect deployment build dolphin_services_prefect_daemon.py:dolphin_services_daemon \
        --name "dolphin-all-services" --pool dolphin-daemon-pool
"""

import sys
import time
import json
import signal
import subprocess
from datetime import datetime, timezone
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

sys.path.insert(0, '/mnt/dolphinng5_predict')
sys.path.insert(0, '/mnt/dolphinng5_predict/nautilus_dolphin')

from prefect import flow, task, get_run_logger

# NOTE(review): the emoji prefixes in the log messages below look mis-encoded
# (UTF-8 bytes decoded with a legacy code page). They are reproduced unchanged
# here -- confirm against the repository copy before repairing them.

# Service definitions.
# Each entry provides:
#   'script'   - path of the Python script launched for the service
#   'log'      - file that receives the child's stdout and stderr (append mode)
#   'hz_check' - (map name, key) probed in Hazelcast; a key of None means
#                "the map must simply be non-empty"
SERVICES = {
    'scan_bridge': {
        'script': '/mnt/dolphinng5_predict/prod/scan_bridge_service.py',
        'log': '/tmp/scan_bridge_service.log',
        'hz_check': ('DOLPHIN_FEATURES', 'latest_eigen_scan'),
    },
    'acb_processor': {
        'script': '/mnt/dolphinng5_predict/prod/acb_processor_service.py',
        'log': '/tmp/acb_processor_service.log',
        'hz_check': ('DOLPHIN_FEATURES', 'acb_boost'),
    },
    'system_watchdog': {
        'script': '/mnt/dolphinng5_predict/prod/system_watchdog_service.py',
        'log': '/tmp/system_watchdog_service.log',
        'hz_check': ('DOLPHIN_SAFETY', 'latest'),
    },
    'extf_service': {
        'script': '/mnt/dolphinng5_predict/prod/exf_prefect_final.py',
        'log': '/tmp/extf_service.log',
        'hz_check': ('DOLPHIN_FEATURES', 'exf_latest'),
    },
    'obf_service': {
        'script': '/mnt/dolphinng5_predict/prod/obf_prefect_flow.py',
        'log': '/tmp/obf_service.log',
        'hz_check': ('DOLPHIN_FEATURES_SHARD_00', None),  # Just check existence
    },
}


@dataclass
class ServiceStatus:
    """Snapshot of one service as produced by a health check."""

    name: str                    # key of the service in SERVICES
    pid: Optional[int]           # PID of a live instance, or None
    running: bool                # True when a live instance was found
    hz_healthy: bool             # result of the Hazelcast probe
    last_restart: Optional[str]  # ISO-8601 time of the last successful start
    restart_count: int           # number of successful start() calls


class ServiceManager:
    """Manages a single service process.

    A service counts as running either when this manager owns a live child
    process, or when ``pgrep -f <script>`` finds an instance launched by
    someone else.
    """

    def __init__(self, name: str, config: Dict):
        """Store the service definition; no process is started here.

        Args:
            name: Service name, used in log messages.
            config: Mapping with the keys 'script', 'log' and 'hz_check'.
        """
        self.name = name
        self.script = config['script']
        self.log_file = config['log']
        self.hz_check = config['hz_check']
        self.process: Optional[subprocess.Popen] = None
        self.restart_count = 0
        self.last_restart: Optional[datetime] = None

    def _owns_live_process(self) -> bool:
        """Return True when this manager launched a child that is still alive."""
        return self.process is not None and self.process.poll() is None

    def _find_pids(self) -> List[int]:
        """Return the PIDs reported by ``pgrep -f <script>`` (empty on any failure)."""
        try:
            result = subprocess.run(
                ['pgrep', '-f', self.script],
                capture_output=True,
                text=True,
            )
        except (OSError, subprocess.SubprocessError):
            # pgrep is missing or could not be executed: treat as "not found".
            return []
        if result.returncode != 0:
            return []
        return [int(token) for token in result.stdout.split() if token.isdigit()]

    def start(self) -> bool:
        """Start the service unless an instance is already running.

        Returns:
            True when the service is running after the call (either it was
            already running or the new child survived the 2 s startup grace
            period), False otherwise.
        """
        logger = get_run_logger()

        if self.is_running():
            logger.debug(f"{self.name}: Already running")
            return True

        logger.info(f"๐Ÿš€ Starting {self.name}...")

        try:
            # The child inherits its own copy of the descriptor, so the
            # parent's handle is closed as soon as Popen returns instead of
            # leaking one open file per (re)start.
            with open(self.log_file, 'a') as log_handle:
                self.process = subprocess.Popen(
                    [sys.executable, self.script],
                    stdout=log_handle,
                    stderr=subprocess.STDOUT,
                    cwd='/mnt/dolphinng5_predict/prod',
                )

            time.sleep(2)  # Wait for startup

            if self._owns_live_process():
                self.restart_count += 1
                self.last_restart = datetime.now(timezone.utc)
                logger.info(f"โœ… {self.name} started (PID: {self.process.pid})")
                return True

            logger.error(f"โŒ {self.name} failed to start")
            return False
        except Exception as e:
            logger.error(f"โŒ {self.name} error: {e}")
            return False

    def stop(self) -> None:
        """Stop the service if this manager owns its process.

        Sends SIGTERM, waits up to 5 s, then force-kills. An instance that is
        running but was not launched by this manager is left alone and a
        warning is logged, because there is no process handle to signal or
        wait on.
        """
        logger = get_run_logger()

        if not self.is_running():
            return

        if not self._owns_live_process():
            logger.warning(
                f"{self.name}: running instance was not started by this "
                f"manager; leaving it untouched"
            )
            self.process = None
            return

        logger.info(f"๐Ÿ›‘ Stopping {self.name}...")

        try:
            self.process.send_signal(signal.SIGTERM)
            self.process.wait(timeout=5)
            logger.info(f"โœ… {self.name} stopped")
        except subprocess.TimeoutExpired:
            logger.warning(f"โš ๏ธ {self.name} force killing...")
            self.process.kill()
            self.process.wait()
        except Exception as e:
            logger.error(f"Error stopping {self.name}: {e}")
        finally:
            self.process = None

    def is_running(self) -> bool:
        """Check if service is running.

        Returns:
            True when the owned child is alive or, failing that, when pgrep
            finds another instance of the script.
        """
        if self._owns_live_process():
            return True
        # No live child of our own: check if another instance is running.
        return bool(self._find_pids())

    def get_pid(self) -> Optional[int]:
        """Get the PID of a live instance, or None when none is found."""
        if self._owns_live_process():
            return self.process.pid
        # Try to find it from pgrep; the first reported PID wins.
        pids = self._find_pids()
        return pids[0] if pids else None


def check_hz_data(map_name: str, key: Optional[str]) -> Tuple[bool, str]:
    """Check if data exists in Hazelcast.

    Args:
        map_name: Name of the Hazelcast map to probe.
        key: Key expected in the map; when falsy the map only has to be
            non-empty.

    Returns:
        ``(healthy, message)``. Any exception (import failure, connection
        error, shutdown error) is reported as ``(False, str(exc))``.
    """
    try:
        import hazelcast

        client = hazelcast.HazelcastClient(
            cluster_name="dolphin",
            cluster_members=["127.0.0.1:5701"],
        )
        try:
            hz_map = client.get_map(map_name).blocking()

            if key:
                if hz_map.get(key):
                    outcome = (True, f"{key} exists")
                else:
                    outcome = (False, f"{key} missing")
            else:
                size = hz_map.size()
                if size > 0:
                    outcome = (True, f"{size} entries")
                else:
                    outcome = (False, "empty")
        finally:
            # Always release the client, including when a map call raised.
            client.shutdown()
        return outcome
    except Exception as e:
        return False, str(e)


# Global managers, keyed by service name. Empty until something registers
# ServiceManager instances in it; the tasks below reuse a registered manager
# when one exists so that process handles and restart counters are not lost.
managers: Dict[str, ServiceManager] = {}


def _manager_for(name: str) -> ServiceManager:
    """Return the registered manager for *name*, or a fresh untracked one."""
    tracked = managers.get(name)
    if tracked is not None:
        return tracked
    return ServiceManager(name, SERVICES[name])


@task(name="check-all-services")
def check_all_services_task() -> Dict[str, ServiceStatus]:
    """Check status of all services.

    Returns:
        Mapping of service name to its ServiceStatus. ``last_restart`` and
        ``restart_count`` come from the registered manager; they stay
        ``None`` / ``0`` when the service has no registered manager.
    """
    logger = get_run_logger()

    statuses = {}
    for name, config in SERVICES.items():
        manager = _manager_for(name)
        running = manager.is_running()
        pid = manager.get_pid()

        # Check Hz data
        map_name, key = config['hz_check']
        hz_healthy, hz_msg = check_hz_data(map_name, key)

        last_restart = (
            manager.last_restart.isoformat() if manager.last_restart else None
        )
        statuses[name] = ServiceStatus(
            name=name,
            pid=pid,
            running=running,
            hz_healthy=hz_healthy,
            last_restart=last_restart,
            restart_count=manager.restart_count,
        )

        status_icon = "โœ…" if running else "โŒ"
        hz_icon = "โœ…" if hz_healthy else "โŒ"
        logger.info(f"{status_icon} {name:20} (PID: {pid or 'N/A':>6}) | Hz: {hz_icon} {hz_msg}")

    return statuses


@task(name="restart-service")
def restart_service_task(name: str) -> bool:
    """Restart a specific service.

    Args:
        name: Key of the service in SERVICES.

    Returns:
        True when the service is running after the restart, False for an
        unknown name or a failed start.
    """
    logger = get_run_logger()

    if name not in SERVICES:
        logger.error(f"Unknown service: {name}")
        return False

    # Reuse the registered manager so the new process handle stays reachable
    # for a later stop(); a throwaway manager would drop it on return.
    manager = _manager_for(name)

    # Stop if running
    if manager.is_running():
        manager.stop()
        time.sleep(2)

    # Start
    if manager.start():
        logger.info(f"โœ… {name} restarted successfully")
        return True

    logger.error(f"โŒ {name} restart failed")
    return False
@flow(name="dolphin-services-daemon")
def dolphin_services_daemon():
    """
    Long-running flow that supervises every service listed in SERVICES.

    Registers one ServiceManager per service, starts whatever is not yet
    running, then performs a health check every 30 seconds and restarts
    services whose process is gone. On exit (interrupt or error) every
    registered manager is asked to stop its service.
    """
    global managers

    log = get_run_logger()
    rule = "=" * 70

    log.info(rule)
    log.info("๐Ÿฌ DOLPHIN ALL SERVICES DAEMON (Prefect)")
    log.info(rule)
    log.info("Managing services:")
    for service_name in SERVICES:
        log.info(f" - {service_name}")
    log.info(rule)

    # Register a manager for every configured service.
    for service_name, service_config in SERVICES.items():
        managers[service_name] = ServiceManager(service_name, service_config)

    # Bring up anything that is not running yet.
    log.info("\n๐Ÿš€ Initial service startup...")
    for service_name, mgr in managers.items():
        if mgr.is_running():
            log.info(f"โœ… {service_name} already running")
            continue
        mgr.start()

    # Periodic supervision.
    check_interval = 30  # seconds

    try:
        while True:
            time.sleep(check_interval)

            log.info("\n๐Ÿ“Š Health Check")
            log.info("-" * 70)

            statuses = check_all_services_task()

            for service_name, status in statuses.items():
                if status.running:
                    if not status.hz_healthy:
                        # Process is alive but the Hazelcast probe failed:
                        # only warn, no restart is triggered from here.
                        log.warning(f"โš ๏ธ {service_name} Hz data stale (process running)")
                    continue
                log.warning(f"๐Ÿ”„ {service_name} not running, restarting...")
                restart_service_task(service_name)

            # Summary line; counts rely on truthiness of the status fields.
            running_count = len([s for s in statuses.values() if s.running])
            healthy_count = len([s for s in statuses.values() if s.hz_healthy])
            total = len(SERVICES)
            log.info(f"\nSummary: {running_count}/{total} running, "
                     f"{healthy_count}/{total} Hz healthy")

    except KeyboardInterrupt:
        log.info("\n๐Ÿ›‘ Shutting down...")
    except Exception as e:
        log.error(f"โŒ Daemon error: {e}")
        raise
    finally:
        log.info("๐Ÿงน Stopping all services...")
        for mgr in managers.values():
            mgr.stop()
        log.info("โœ… All services stopped")


@flow(name="dolphin-services-status")
def quick_status_check() -> Dict:
    """One-shot status flow.

    Returns:
        A dict with 'services' (per-service 'running', 'pid' and
        'hz_healthy') and 'timestamp' (current UTC time, ISO-8601).
    """
    log = get_run_logger()

    log.info("๐Ÿฌ DOLPHIN Services Status")
    log.info("=" * 50)

    statuses = check_all_services_task()

    services = {}
    for service_name, status in statuses.items():
        services[service_name] = {
            'running': status.running,
            'pid': status.pid,
            'hz_healthy': status.hz_healthy,
        }

    return {
        'services': services,
        'timestamp': datetime.now(timezone.utc).isoformat(),
    }


if __name__ == "__main__":
    dolphin_services_daemon()