#!/usr/bin/env python3
"""
DOLPHIN All Services Prefect Daemon
====================================
Unified Prefect-managed daemon for all DOLPHIN algorithm services.

Manages:
- Scan Bridge Service (Arrow → Hz)
- ACB Processor Service (ACB boost calculation)
- System Watchdog Service (Survival Stack)
- EXTF Service (External factors)
- OBF Service (Order book features)

Usage:
    prefect deployment build dolphin_services_prefect_daemon.py:dolphin_services_daemon \
        --name "dolphin-all-services" --pool dolphin-daemon-pool
"""
|
import sys
|
||
|
|
import time
|
||
|
|
import json
|
||
|
|
import signal
|
||
|
|
import subprocess
|
||
|
|
from datetime import datetime, timezone
|
||
|
|
from dataclasses import dataclass
|
||
|
|
from typing import Dict, List, Optional, Tuple
|
||
|
|
|
||
|
|
sys.path.insert(0, '/mnt/dolphinng5_predict')
|
||
|
|
sys.path.insert(0, '/mnt/dolphinng5_predict/nautilus_dolphin')
|
||
|
|
|
||
|
|
from prefect import flow, task, get_run_logger
|
||
|
|
|
||
|
|
# Service definitions.
# Each entry maps a service name to:
#   'script'   — absolute path of the service entry-point run via the daemon's
#                Python interpreter,
#   'log'      — file that the child's stdout+stderr are appended to,
#   'hz_check' — (hazelcast_map_name, key) health probe; when key is None the
#                probe only checks that the map is non-empty.
SERVICES = {
    'scan_bridge': {
        'script': '/mnt/dolphinng5_predict/prod/scan_bridge_service.py',
        'log': '/tmp/scan_bridge_service.log',
        'hz_check': ('DOLPHIN_FEATURES', 'latest_eigen_scan'),
    },
    'acb_processor': {
        'script': '/mnt/dolphinng5_predict/prod/acb_processor_service.py',
        'log': '/tmp/acb_processor_service.log',
        'hz_check': ('DOLPHIN_FEATURES', 'acb_boost'),
    },
    'system_watchdog': {
        'script': '/mnt/dolphinng5_predict/prod/system_watchdog_service.py',
        'log': '/tmp/system_watchdog_service.log',
        'hz_check': ('DOLPHIN_SAFETY', 'latest'),
    },
    'extf_service': {
        # NOTE(review): script is named "exf_..." while the service/key say
        # "extf"/"exf" — presumably intentional, but confirm the path.
        'script': '/mnt/dolphinng5_predict/prod/exf_prefect_final.py',
        'log': '/tmp/extf_service.log',
        'hz_check': ('DOLPHIN_FEATURES', 'exf_latest'),
    },
    'obf_service': {
        'script': '/mnt/dolphinng5_predict/prod/obf_prefect_flow.py',
        'log': '/tmp/obf_service.log',
        'hz_check': ('DOLPHIN_FEATURES_SHARD_00', None),  # Just check existence
    },
}
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class ServiceStatus:
    """Point-in-time health snapshot of one managed service."""

    name: str                    # service key in SERVICES
    pid: Optional[int]           # OS process id, None if not found
    running: bool                # process alive (Popen handle or pgrep match)
    hz_healthy: bool             # Hazelcast probe from SERVICES[name]['hz_check'] passed
    last_restart: Optional[str]  # ISO timestamp of last restart, if tracked
    restart_count: int           # number of (re)starts observed for this snapshot
|
||
|
|
|
||
|
|
|
||
|
|
class ServiceManager:
    """Manages a single service subprocess.

    Wraps start/stop/liveness for one entry of ``SERVICES``. Liveness checks
    fall back to ``pgrep -f <script>`` so a manager constructed in a fresh
    process can still detect an instance launched elsewhere.
    """

    def __init__(self, name: str, config: Dict):
        self.name = name
        self.script = config['script']      # absolute path to the service script
        self.log_file = config['log']       # child stdout+stderr appended here
        self.hz_check = config['hz_check']  # (map_name, key) Hazelcast probe spec
        self.process: Optional[subprocess.Popen] = None
        self.restart_count = 0              # successful starts (includes the first)
        self.last_restart: Optional[datetime] = None

    def start(self) -> bool:
        """Start the service if not already running.

        Returns:
            True when the process is up (or was already running),
            False when spawning failed or the process died within 2 s.
        """
        logger = get_run_logger()

        if self.is_running():
            logger.debug(f"{self.name}: Already running")
            return True

        logger.info(f"🚀 Starting {self.name}...")

        try:
            # The child inherits the log fd during Popen, so the parent can
            # (and must) close its own copy — the original leaked one open
            # file handle per (re)start.
            with open(self.log_file, 'a') as log_fh:
                self.process = subprocess.Popen(
                    [sys.executable, self.script],
                    stdout=log_fh,
                    stderr=subprocess.STDOUT,
                    cwd='/mnt/dolphinng5_predict/prod'
                )

            time.sleep(2)  # Wait for startup; catches immediate crashes

            if self.is_running():
                self.restart_count += 1
                self.last_restart = datetime.now(timezone.utc)
                logger.info(f"✅ {self.name} started (PID: {self.process.pid})")
                return True
            else:
                logger.error(f"❌ {self.name} failed to start")
                return False

        except Exception as e:
            logger.error(f"❌ {self.name} error: {e}")
            return False

    def stop(self) -> None:
        """Stop the service: SIGTERM, then SIGKILL after a 5 s grace period."""
        logger = get_run_logger()

        if not self.is_running():
            return

        logger.info(f"🛑 Stopping {self.name}...")

        try:
            if self.process is None:
                # Running instance was detected via pgrep only (not spawned by
                # us), so there is no Popen handle. The original code hit an
                # AttributeError here and never stopped the process; signal it
                # by PID instead.
                import os
                pid = self.get_pid()
                if pid is not None:
                    os.kill(pid, signal.SIGTERM)
                    logger.info(f"✅ {self.name} stopped")
                return

            self.process.send_signal(signal.SIGTERM)
            self.process.wait(timeout=5)
            logger.info(f"✅ {self.name} stopped")
        except subprocess.TimeoutExpired:
            logger.warning(f"⚠️ {self.name} force killing...")
            self.process.kill()
            self.process.wait()  # reap to avoid a zombie
        except Exception as e:
            logger.error(f"Error stopping {self.name}: {e}")
        finally:
            self.process = None

    def is_running(self) -> bool:
        """Return True if the service process is alive.

        Uses the Popen handle when we spawned the process ourselves,
        otherwise falls back to a ``pgrep -f`` search for the script path.
        """
        if self.process is None:
            # Check if another instance is running
            try:
                result = subprocess.run(
                    ['pgrep', '-f', self.script],
                    capture_output=True,
                    text=True
                )
                # bool(...) so the annotation holds — the original expression
                # could evaluate to a (truthy) string instead of True.
                return bool(result.returncode == 0 and result.stdout.strip())
            except Exception:
                # pgrep unavailable or failed -> assume not running.
                return False

        return self.process.poll() is None

    def get_pid(self) -> Optional[int]:
        """Return the service PID, or None if it cannot be determined."""
        if self.process:
            return self.process.pid
        # Try to find from pgrep (first match wins).
        try:
            result = subprocess.run(
                ['pgrep', '-f', self.script],
                capture_output=True,
                text=True
            )
            if result.returncode == 0:
                return int(result.stdout.strip().split('\n')[0])
        except Exception:
            pass
        return None
|
||
|
|
|
||
|
|
|
||
|
|
def check_hz_data(map_name: str, key: Optional[str]) -> Tuple[bool, str]:
    """Check if expected data exists in Hazelcast.

    Args:
        map_name: Hazelcast map to inspect.
        key: Entry key to look up; when None only map non-emptiness is checked.

    Returns:
        (healthy, message) — healthy is False on any connection/lookup error,
        with the exception text as the message.
    """
    try:
        # Local import keeps the daemon importable without the client library.
        import hazelcast

        client = hazelcast.HazelcastClient(
            cluster_name="dolphin",
            cluster_members=["127.0.0.1:5701"],
        )
        try:
            hz_map = client.get_map(map_name).blocking()

            if key:
                data = hz_map.get(key)
                if data:
                    return True, f"{key} exists"
                else:
                    return False, f"{key} missing"
            else:
                size = hz_map.size()
                if size > 0:
                    return True, f"{size} entries"
                else:
                    return False, "empty"
        finally:
            # Always release the client — the original leaked the connection
            # whenever the map lookup raised.
            client.shutdown()

    except Exception as e:
        return False, str(e)
|
||
|
|
|
||
|
|
|
||
|
|
@task(name="check-all-services")
def check_all_services_task() -> Dict[str, ServiceStatus]:
    """Probe every configured service and report process + Hazelcast health."""
    logger = get_run_logger()

    statuses: Dict[str, ServiceStatus] = {}

    for svc_name, svc_cfg in SERVICES.items():
        mgr = ServiceManager(svc_name, svc_cfg)

        alive = mgr.is_running()
        proc_pid = mgr.get_pid()

        # Probe the Hazelcast data this service is responsible for producing.
        hz_map_name, hz_key = svc_cfg['hz_check']
        healthy, detail = check_hz_data(hz_map_name, hz_key)

        statuses[svc_name] = ServiceStatus(
            name=svc_name,
            pid=proc_pid,
            running=alive,
            hz_healthy=healthy,
            last_restart=None,
            restart_count=0,
        )

        run_mark = "✅" if alive else "❌"
        hz_mark = "✅" if healthy else "❌"
        logger.info(f"{run_mark} {svc_name:20} (PID: {proc_pid or 'N/A':>6}) | Hz: {hz_mark} {detail}")

    return statuses
|
||
|
|
|
||
|
|
|
||
|
|
@task(name="restart-service")
def restart_service_task(name: str) -> bool:
    """Restart one named service; return True when it comes back up."""
    logger = get_run_logger()

    if name not in SERVICES:
        logger.error(f"Unknown service: {name}")
        return False

    manager = ServiceManager(name, SERVICES[name])

    # Tear down any live instance before relaunching.
    if manager.is_running():
        manager.stop()
        time.sleep(2)

    started = manager.start()
    if started:
        logger.info(f"✅ {name} restarted successfully")
    else:
        logger.error(f"❌ {name} restart failed")
    return started
|
||
|
|
|
||
|
|
|
||
|
|
# Global managers
# Populated by dolphin_services_daemon(); maps service name -> its manager so
# the flow's finally-block can stop every process it started.
managers: Dict[str, ServiceManager] = {}
|
||
|
|
|
||
|
|
|
||
|
|
@flow(name="dolphin-services-daemon")
def dolphin_services_daemon():
    """
    Main daemon flow that manages all DOLPHIN services.
    Runs indefinitely, monitoring and restarting services as needed.
    """
    global managers

    logger = get_run_logger()
    logger.info("=" * 70)
    logger.info("🐬 DOLPHIN ALL SERVICES DAEMON (Prefect)")
    logger.info("=" * 70)
    logger.info("Managing services:")
    for name in SERVICES:
        logger.info(f"  - {name}")
    logger.info("=" * 70)

    # Initialize managers (kept in the module-level dict so the shutdown path
    # below can reach every process we spawned)
    for name, config in SERVICES.items():
        managers[name] = ServiceManager(name, config)

    # Initial start of all services
    logger.info("\n🚀 Initial service startup...")
    for name, manager in managers.items():
        if not manager.is_running():
            manager.start()
        else:
            logger.info(f"✅ {name} already running")

    # Health check loop
    check_interval = 30  # seconds

    try:
        while True:
            time.sleep(check_interval)

            logger.info("\n📊 Health Check")
            logger.info("-" * 70)

            statuses = check_all_services_task()

            # Check for issues and restart if needed
            # NOTE(review): restart_service_task builds a fresh ServiceManager,
            # so restart counts tracked in the global `managers` are not
            # updated by these restarts — confirm whether that is intended.
            for name, status in statuses.items():
                if not status.running:
                    logger.warning(f"🔄 {name} not running, restarting...")
                    restart_service_task(name)
                elif not status.hz_healthy:
                    logger.warning(f"⚠️ {name} Hz data stale (process running)")
                    # Don't restart immediately, wait for next cycle

            # Summary
            running_count = sum(1 for s in statuses.values() if s.running)
            healthy_count = sum(1 for s in statuses.values() if s.hz_healthy)

            logger.info(f"\nSummary: {running_count}/{len(SERVICES)} running, "
                        f"{healthy_count}/{len(SERVICES)} Hz healthy")

    except KeyboardInterrupt:
        logger.info("\n🛑 Shutting down...")
    except Exception as e:
        logger.error(f"❌ Daemon error: {e}")
        raise
    finally:
        # Best-effort teardown of every managed process, even on crash paths.
        logger.info("🧹 Stopping all services...")
        for manager in managers.values():
            manager.stop()
        logger.info("✅ All services stopped")
|
||
|
|
|
||
|
|
|
||
|
|
@flow(name="dolphin-services-status")
def quick_status_check() -> Dict:
    """One-shot status report of every managed service."""
    logger = get_run_logger()

    logger.info("🐬 DOLPHIN Services Status")
    logger.info("=" * 50)

    statuses = check_all_services_task()

    services_report = {}
    for svc_name, svc in statuses.items():
        services_report[svc_name] = {
            'running': svc.running,
            'pid': svc.pid,
            'hz_healthy': svc.hz_healthy,
        }

    return {
        'services': services_report,
        'timestamp': datetime.now(timezone.utc).isoformat(),
    }
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Run the daemon flow directly (outside a Prefect deployment/worker).
    dolphin_services_daemon()
|