Files
DOLPHIN/prod/dolphin_services_prefect_daemon.py

359 lines
11 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
DOLPHIN All Services Prefect Daemon
====================================
Unified Prefect-managed daemon for all DOLPHIN algorithm services.
Manages:
- Scan Bridge Service (Arrow Hz)
- ACB Processor Service (ACB boost calculation)
- System Watchdog Service (Survival Stack)
- EXTF Service (External factors)
- OBF Service (Order book features)
Usage:
prefect deployment build dolphin_services_prefect_daemon.py:dolphin_services_daemon \
--name "dolphin-all-services" --pool dolphin-daemon-pool
"""
import sys
import time
import json
import signal
import subprocess
from datetime import datetime, timezone
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
sys.path.insert(0, '/mnt/dolphinng5_predict')
sys.path.insert(0, '/mnt/dolphinng5_predict/nautilus_dolphin')
from prefect import flow, task, get_run_logger
# Service definitions
# Every entry maps a service name to the settings used to supervise it:
#   'script'   - path of the Python script launched for the service
#   'log'      - file that receives the service's stdout and stderr
#   'hz_check' - (map name, key) probed in Hazelcast; when the key is None
#                only the size of the map is checked
SERVICES = {
    'scan_bridge': {
        'script': '/mnt/dolphinng5_predict/prod/scan_bridge_service.py',
        'log': '/tmp/scan_bridge_service.log',
        'hz_check': ('DOLPHIN_FEATURES', 'latest_eigen_scan'),
    },
    'acb_processor': {
        'script': '/mnt/dolphinng5_predict/prod/acb_processor_service.py',
        'log': '/tmp/acb_processor_service.log',
        'hz_check': ('DOLPHIN_FEATURES', 'acb_boost'),
    },
    'system_watchdog': {
        'script': '/mnt/dolphinng5_predict/prod/system_watchdog_service.py',
        'log': '/tmp/system_watchdog_service.log',
        'hz_check': ('DOLPHIN_SAFETY', 'latest'),
    },
    'extf_service': {
        'script': '/mnt/dolphinng5_predict/prod/exf_prefect_final.py',
        'log': '/tmp/extf_service.log',
        'hz_check': ('DOLPHIN_FEATURES', 'exf_latest'),
    },
    'obf_service': {
        'script': '/mnt/dolphinng5_predict/prod/obf_prefect_flow.py',
        'log': '/tmp/obf_service.log',
        # No key: the map only has to be non-empty.
        'hz_check': ('DOLPHIN_FEATURES_SHARD_00', None),
    },
}
@dataclass
class ServiceStatus:
    """Point-in-time status record for a single managed service."""

    # Service identifier.
    name: str
    # PID of the service process, or None when no process was found.
    pid: Optional[int]
    # Whether a process for the service was found running.
    running: bool
    # Whether the service's Hazelcast probe succeeded.
    hz_healthy: bool
    # Time of the most recent restart, if known.
    # NOTE(review): presumably an ISO-8601 string - confirm with producers.
    last_restart: Optional[str]
    # Number of restarts recorded for the service.
    restart_count: int
class ServiceManager:
    """Manages a single service process.

    A manager either owns the process it launched (``self.process`` holds the
    ``Popen`` handle) or, when it holds no handle, detects instances started
    elsewhere by matching the script path with ``pgrep -f``.
    """

    def __init__(self, name: str, config: Dict):
        """Store the service settings.

        Args:
            name: Service identifier used in log messages.
            config: Mapping with the keys ``'script'``, ``'log'`` and
                ``'hz_check'``.
        """
        self.name = name
        self.script = config['script']
        self.log_file = config['log']
        self.hz_check = config['hz_check']
        self.process: Optional[subprocess.Popen] = None
        self.restart_count = 0
        self.last_restart: Optional[datetime] = None

    def _pgrep_pids(self) -> List[int]:
        """Return the PIDs whose command line matches the script path."""
        try:
            result = subprocess.run(
                ['pgrep', '-f', self.script],
                capture_output=True,
                text=True,
            )
        except (OSError, subprocess.SubprocessError):
            # pgrep is missing or could not be executed: report nothing found.
            return []
        if result.returncode != 0:
            return []
        return [int(token) for token in result.stdout.split() if token.isdigit()]

    def start(self) -> bool:
        """Start the service.

        Returns:
            True when the service is already running or was launched and is
            still alive after the startup delay; False when the launch failed.
        """
        logger = get_run_logger()
        if self.is_running():
            logger.debug(f"{self.name}: Already running")
            return True
        logger.info(f"🚀 Starting {self.name}...")
        try:
            # The child receives its own copy of the descriptor, so the
            # parent's handle is closed as soon as Popen returns instead of
            # leaking one open file per launch.
            with open(self.log_file, 'a') as log_handle:
                self.process = subprocess.Popen(
                    [sys.executable, self.script],
                    stdout=log_handle,
                    stderr=subprocess.STDOUT,
                    cwd='/mnt/dolphinng5_predict/prod',
                )
            time.sleep(2)  # Wait for startup
            if self.is_running():
                self.restart_count += 1
                self.last_restart = datetime.now(timezone.utc)
                logger.info(f"{self.name} started (PID: {self.process.pid})")
                return True
            logger.error(f"{self.name} failed to start")
            return False
        except Exception as e:
            logger.error(f"{self.name} error: {e}")
            return False

    def stop(self) -> None:
        """Stop the service if this manager launched it.

        Sends SIGTERM, waits up to five seconds, then kills the process.
        An instance that is only detected through ``pgrep`` has no ``Popen``
        handle here; it is reported and left running.
        """
        logger = get_run_logger()
        if not self.is_running():
            return
        if self.process is None:
            # Previously this path crashed on ``None.send_signal`` and logged
            # a misleading error; state explicitly what is happening instead.
            logger.warning(
                f"{self.name}: running outside this manager "
                f"(PID: {self.get_pid() or 'N/A'}); not stopping it"
            )
            return
        logger.info(f"🛑 Stopping {self.name}...")
        try:
            self.process.send_signal(signal.SIGTERM)
            self.process.wait(timeout=5)
            logger.info(f"{self.name} stopped")
        except subprocess.TimeoutExpired:
            logger.warning(f"⚠️ {self.name} force killing...")
            self.process.kill()
            self.process.wait()
        except Exception as e:
            logger.error(f"Error stopping {self.name}: {e}")
        finally:
            self.process = None

    def is_running(self) -> bool:
        """Check if service is running.

        With a ``Popen`` handle the child's exit status is polled; without
        one, any process matching the script path counts as running.
        """
        if self.process is None:
            # Check if another instance is running
            return bool(self._pgrep_pids())
        return self.process.poll() is None

    def get_pid(self) -> Optional[int]:
        """Get process PID.

        Returns:
            The PID of the owned ``Popen`` handle (even if that process has
            exited), otherwise the first PID reported by ``pgrep``, or None
            when nothing matches.
        """
        if self.process is not None:
            return self.process.pid
        # Try to find from pgrep
        pids = self._pgrep_pids()
        return pids[0] if pids else None
def check_hz_data(map_name: str, key: Optional[str]) -> Tuple[bool, str]:
    """Check if data exists in Hazelcast.

    Connects to the ``dolphin`` cluster at ``127.0.0.1:5701`` with a
    short-lived client.

    Args:
        map_name: Name of the Hazelcast map to inspect.
        key: Entry to look up; when falsy, only the map size is checked.

    Returns:
        ``(healthy, message)``. ``healthy`` is True when the entry holds a
        truthy value (or, without a key, when the map is non-empty). Any
        error, including a missing ``hazelcast`` package, yields
        ``(False, <error text>)``; the function does not raise.
    """
    client = None
    try:
        import hazelcast
        client = hazelcast.HazelcastClient(
            cluster_name="dolphin",
            cluster_members=["127.0.0.1:5701"],
        )
        hz_map = client.get_map(map_name).blocking()
        if key:
            if hz_map.get(key):
                return True, f"{key} exists"
            return False, f"{key} missing"
        size = hz_map.size()
        if size > 0:
            return True, f"{size} entries"
        return False, "empty"
    except Exception as e:
        return False, str(e)
    finally:
        # Always release the client, including when a map call failed;
        # otherwise every failing health check leaves a client behind.
        if client is not None:
            try:
                client.shutdown()
            except Exception:
                # Best-effort cleanup: a shutdown failure must not replace
                # the result computed above.
                pass
@task(name="check-all-services")
def check_all_services_task() -> Dict[str, ServiceStatus]:
    """Check status of all services.

    For every entry in ``SERVICES`` a fresh ``ServiceManager`` is used to
    detect the process and its PID, and the entry's Hazelcast probe is run.
    One summary line per service is logged.

    Returns:
        Mapping of service name to its ``ServiceStatus``. ``last_restart``
        and ``restart_count`` are always ``None`` / ``0`` here because the
        managers are created on the spot.
    """
    logger = get_run_logger()
    statuses = {}
    for name, config in SERVICES.items():
        manager = ServiceManager(name, config)
        # Normalise to a real bool so the status record never carries the
        # raw process-lookup output.
        running = bool(manager.is_running())
        pid = manager.get_pid()
        # Check Hz data
        map_name, key = config['hz_check']
        hz_healthy, hz_msg = check_hz_data(map_name, key)
        statuses[name] = ServiceStatus(
            name=name,
            pid=pid,
            running=running,
            hz_healthy=hz_healthy,
            last_restart=None,
            restart_count=0,
        )
        # Both branches used to be empty strings, so the line showed no
        # state at all; use the markers the rest of the file logs with.
        status_icon = "✅" if running else "❌"
        hz_icon = "✅" if hz_healthy else "❌"
        logger.info(f"{status_icon} {name:20} (PID: {pid or 'N/A':>6}) | Hz: {hz_icon} {hz_msg}")
    return statuses
@task(name="restart-service")
def restart_service_task(name: str) -> bool:
    """Restart a specific service.

    Args:
        name: Key of the service in ``SERVICES``.

    Returns:
        True when the service is running after the restart, False when the
        name is unknown or the start failed.
    """
    logger = get_run_logger()
    if name not in SERVICES:
        logger.error(f"Unknown service: {name}")
        return False
    # Reuse the daemon's registered manager when there is one, so the new
    # process handle stays tracked and the daemon can stop the service on
    # shutdown. A throwaway manager would drop the handle immediately.
    manager = managers.get(name)
    if manager is None:
        manager = ServiceManager(name, SERVICES[name])
    # Stop if running
    if manager.is_running():
        manager.stop()
        time.sleep(2)
    # Start
    if manager.start():
        logger.info(f"{name} restarted successfully")
        return True
    logger.error(f"{name} restart failed")
    return False
# Global managers
# Registry of ServiceManager instances keyed by service name. It is filled by
# dolphin_services_daemon() at startup and walked there again on shutdown to
# stop every service.
managers: Dict[str, ServiceManager] = {}
@flow(name="dolphin-services-daemon")
def dolphin_services_daemon():
    """
    Supervise every service listed in ``SERVICES`` until interrupted.

    Registers one manager per service, starts whatever is not yet running,
    then checks all services every 30 seconds and restarts those whose
    process is gone. A missing Hazelcast value is only logged. On exit, for
    any reason, every registered manager is asked to stop its service.
    """
    global managers
    logger = get_run_logger()
    rule = "=" * 70

    logger.info(rule)
    logger.info("🐬 DOLPHIN ALL SERVICES DAEMON (Prefect)")
    logger.info(rule)
    logger.info("Managing services:")
    for name in SERVICES:
        logger.info(f" - {name}")
    logger.info(rule)

    # Initialize managers
    managers.update(
        (name, ServiceManager(name, config)) for name, config in SERVICES.items()
    )

    # Initial start of all services
    logger.info("\n🚀 Initial service startup...")
    for name, manager in managers.items():
        if manager.is_running():
            logger.info(f"{name} already running")
            continue
        manager.start()

    # Health check loop
    poll_seconds = 30
    try:
        while True:
            time.sleep(poll_seconds)
            logger.info("\n📊 Health Check")
            logger.info("-" * 70)
            statuses = check_all_services_task()

            # Restart dead services; a stale Hz value alone is only reported.
            for name, status in statuses.items():
                if status.running:
                    if not status.hz_healthy:
                        logger.warning(f"⚠️ {name} Hz data stale (process running)")
                    continue
                logger.warning(f"🔄 {name} not running, restarting...")
                restart_service_task(name)

            # Summary
            snapshot = list(statuses.values())
            running_count = len([s for s in snapshot if s.running])
            healthy_count = len([s for s in snapshot if s.hz_healthy])
            logger.info(f"\nSummary: {running_count}/{len(SERVICES)} running, "
                        f"{healthy_count}/{len(SERVICES)} Hz healthy")
    except KeyboardInterrupt:
        logger.info("\n🛑 Shutting down...")
    except Exception as e:
        logger.error(f"❌ Daemon error: {e}")
        raise
    finally:
        logger.info("🧹 Stopping all services...")
        for manager in managers.values():
            manager.stop()
        logger.info("✅ All services stopped")
@flow(name="dolphin-services-status")
def quick_status_check() -> Dict:
    """Run one status check and return it as a plain dictionary.

    Returns:
        A dict with ``'services'`` (per-service ``running``, ``pid`` and
        ``hz_healthy`` values) and ``'timestamp'`` (current UTC time in
        ISO format).
    """
    logger = get_run_logger()
    logger.info("🐬 DOLPHIN Services Status")
    logger.info("=" * 50)

    services = {}
    for name, s in check_all_services_task().items():
        services[name] = {
            'running': s.running,
            'pid': s.pid,
            'hz_healthy': s.hz_healthy,
        }
    return {
        'services': services,
        'timestamp': datetime.now(timezone.utc).isoformat(),
    }
# Run the daemon flow directly when this file is executed as a script.
if __name__ == "__main__":
    dolphin_services_daemon()