#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
DOLPHIN Scan Bridge Prefect Daemon
|
||
|
|
===================================
|
||
|
|
Phase 2 Implementation: Prefect-managed long-running daemon for scan bridge.
|
||
|
|
|
||
|
|
This daemon supervises the scan bridge service, providing:
|
||
|
|
- Automatic restart on crash
|
||
|
|
- Health monitoring (data freshness)
|
||
|
|
- Centralized logging via Prefect
|
||
|
|
- Integration with DOLPHIN orchestration
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
# Deploy to Prefect
|
||
|
|
prefect deployment build scan_bridge_prefect_daemon.py:scan_bridge_daemon_flow \
|
||
|
|
--name "scan-bridge-daemon" --pool dolphin-daemon-pool
|
||
|
|
|
||
|
|
# Start worker
|
||
|
|
prefect worker start --pool dolphin-daemon-pool
|
||
|
|
"""
|
||
|
|
|
||
|
|
import sys
|
||
|
|
import time
|
||
|
|
import json
|
||
|
|
import signal
|
||
|
|
import subprocess
|
||
|
|
import threading
|
||
|
|
from datetime import datetime, timezone
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Optional, Dict, Any
|
||
|
|
|
||
|
|
# Add paths
|
||
|
|
sys.path.insert(0, '/mnt/dolphinng5_predict')
|
||
|
|
sys.path.insert(0, '/mnt/dolphinng5_predict/nautilus_dolphin')
|
||
|
|
|
||
|
|
# Prefect imports
|
||
|
|
from prefect import flow, task, get_run_logger
|
||
|
|
from prefect.runtime import flow_run
|
||
|
|
from prefect.states import Completed, Failed
|
||
|
|
|
||
|
|
# Hazelcast for health checks
|
||
|
|
try:
|
||
|
|
import hazelcast
|
||
|
|
HAZELCAST_AVAILABLE = True
|
||
|
|
except ImportError:
|
||
|
|
HAZELCAST_AVAILABLE = False
|
||
|
|
|
||
|
|
|
||
|
|
# Constants
# Path to the scan bridge service this daemon supervises.
DAEMON_SCRIPT = "/mnt/dolphinng5_predict/prod/scan_bridge_service.py"
HEALTH_CHECK_INTERVAL = 30  # seconds between health checks in the daemon loop
DATA_STALE_THRESHOLD = 60  # seconds (critical: data this old triggers a restart)
DATA_WARNING_THRESHOLD = 30  # seconds (warning only; no action taken)
RESTART_DELAY = 5  # seconds between restart attempts
MAX_RESTART_ATTEMPTS = 3  # give up restarting after this many consecutive tries
|
||
|
|
|
||
|
|
|
||
|
|
class ScanBridgeProcess:
    """Manages the scan bridge subprocess: start, stop, and log forwarding.

    Not safe for concurrent start()/stop() from multiple threads; it is
    intended to be driven by a single supervising Prefect flow.
    """

    def __init__(self):
        # Handle to the running subprocess (None when not started/stopped).
        self.process: Optional[subprocess.Popen] = None
        # UTC timestamp of the last successful start, for uptime reporting.
        self.start_time: Optional[datetime] = None
        # Consecutive restart attempts; reset to 0 on a successful start.
        self.restart_count = 0
        # Signals the log-monitor thread to stop forwarding output.
        self._stop_event = threading.Event()
        self._monitor_thread: Optional[threading.Thread] = None

    def start(self) -> bool:
        """Start the scan bridge subprocess.

        Returns:
            True if the process is still alive ~2s after launch (or was
            already running); False if it exited immediately or failed
            to spawn.
        """
        logger = get_run_logger()

        if self.process and self.process.poll() is None:
            logger.warning("Process already running")
            return True

        # BUGFIX: clear the stop flag so the log monitor started below keeps
        # running after a stop()/start() cycle (the restart path). Previously
        # the event stayed set and the new monitor thread exited on its first
        # line, silently dropping all bridge output after a restart.
        self._stop_event.clear()

        logger.info(f"🚀 Starting scan bridge (attempt {self.restart_count + 1})...")

        try:
            self.process = subprocess.Popen(
                [sys.executable, DAEMON_SCRIPT],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1,  # Line buffered
                cwd="/mnt/dolphinng5_predict/prod"
            )

            # Give the service a moment to fail fast on bad config/imports.
            time.sleep(2)

            if self.process.poll() is None:
                self.start_time = datetime.now(timezone.utc)
                self.restart_count = 0
                logger.info(f"✅ Scan bridge started (PID: {self.process.pid})")

                # Start log monitor thread
                self._start_log_monitor()
                return True
            else:
                logger.error(f"❌ Process exited immediately with code {self.process.poll()}")
                return False

        except Exception as e:
            logger.error(f"❌ Failed to start: {e}")
            return False

    def stop(self, timeout: int = 10) -> None:
        """Stop the scan bridge subprocess gracefully (SIGTERM, then SIGKILL).

        Args:
            timeout: seconds to wait for graceful termination before killing.
        """
        logger = get_run_logger()

        self._stop_event.set()

        if not self.process:
            return

        if self.process.poll() is not None:
            logger.debug("Process already stopped")
            return

        logger.info(f"🛑 Stopping scan bridge (PID: {self.process.pid})...")

        try:
            # Try graceful shutdown
            self.process.send_signal(signal.SIGTERM)

            # Wait for process to terminate
            try:
                self.process.wait(timeout=timeout)
                logger.info("✅ Process stopped gracefully")
            except subprocess.TimeoutExpired:
                logger.warning("⚠️ Process didn't stop in time, forcing...")
                self.process.kill()
                self.process.wait()
                logger.info("✅ Process killed")

        except Exception as e:
            logger.error(f"Error stopping process: {e}")
        finally:
            self.process = None
            if self._monitor_thread and self._monitor_thread.is_alive():
                self._monitor_thread.join(timeout=2)

    def is_running(self) -> bool:
        """Check if process is running."""
        return self.process is not None and self.process.poll() is None

    def get_exit_code(self) -> Optional[int]:
        """Get process exit code if terminated (None if running or never started)."""
        if self.process is None:
            return None
        return self.process.poll()

    def _start_log_monitor(self):
        """Start a daemon thread that forwards subprocess stdout to Prefect logs."""
        if not self.process:
            return

        # BUGFIX: capture the pipe locally. stop() sets self.process = None
        # while this thread may still be reading, which previously raised
        # AttributeError inside the monitor instead of exiting cleanly.
        stdout = self.process.stdout

        def monitor():
            logger = get_run_logger()
            try:
                for line in iter(stdout.readline, ''):
                    if self._stop_event.is_set():
                        break
                    line = line.strip()
                    if line:
                        # Forward to Prefect logs with prefix
                        logger.info(f"[Bridge] {line}")
            except Exception as e:
                logger.debug(f"Log monitor ended: {e}")

        self._monitor_thread = threading.Thread(target=monitor, daemon=True)
        self._monitor_thread.start()
|
||
|
|
|
||
|
|
|
||
|
|
def check_hazelcast_data_freshness() -> Dict[str, Any]:
    """Check freshness of the latest scan data stored in Hazelcast.

    Connects to the local "dolphin" cluster and inspects the
    'latest_eigen_scan' entry of the DOLPHIN_FEATURES map.

    Returns:
        A dict with at least "available". When data is present it also
        includes "scan_number", "asset_count", "data_age_sec", "is_fresh"
        and "is_warning". Never raises: connection/parse errors are
        reported in the "error" key instead.
    """
    if not HAZELCAST_AVAILABLE:
        return {"available": False, "error": "Hazelcast not installed"}

    client = None
    try:
        client = hazelcast.HazelcastClient(
            cluster_name="dolphin",
            cluster_members=["127.0.0.1:5701"],
        )

        features_map = client.get_map('DOLPHIN_FEATURES').blocking()
        val = features_map.get('latest_eigen_scan')

        if not val:
            return {
                "available": True,
                "has_data": False,
                "error": "No latest_eigen_scan in Hazelcast"
            }

        data = json.loads(val)
        mtime = data.get('file_mtime', 0)
        scan_number = data.get('scan_number', 0)
        asset_count = len(data.get('assets', []))

        # Age relative to when the bridge last wrote the scan file; a
        # missing mtime is treated as infinitely stale.
        age_sec = time.time() - mtime if mtime else float('inf')

        return {
            "available": True,
            "has_data": True,
            "scan_number": scan_number,
            "asset_count": asset_count,
            "data_age_sec": age_sec,
            "is_fresh": age_sec < DATA_STALE_THRESHOLD,
            "is_warning": age_sec >= DATA_WARNING_THRESHOLD,
        }

    except Exception as e:
        return {
            "available": True,
            "has_data": False,
            "error": str(e)
        }
    finally:
        # BUGFIX: always release the client. Previously shutdown() was only
        # called on the success paths, so any exception between connect and
        # shutdown leaked the Hazelcast connection (and its threads).
        if client is not None:
            try:
                client.shutdown()
            except Exception:
                pass
|
||
|
|
|
||
|
|
|
||
|
|
@task(name="health-check", retries=2, retry_delay_seconds=5)
def perform_health_check() -> Dict[str, Any]:
    """Run the full health check: process liveness, then data freshness.

    Returns:
        A report dict. "action_required" is None when healthy, otherwise
        one of "restart", "investigate", or "wait".
    """
    logger = get_run_logger()

    report: Dict[str, Any] = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "process_running": False,
        "hazelcast": {},
        "healthy": False,
        "action_required": None,
    }

    # Check 1: is the supervised subprocess alive?
    global bridge_process
    if not (bridge_process and bridge_process.is_running()):
        logger.error("❌ Health check: Process not running")
        report["action_required"] = "restart"
        return report

    report["process_running"] = True
    report["process_pid"] = bridge_process.process.pid
    started = bridge_process.start_time
    report["uptime_sec"] = (
        (datetime.now(timezone.utc) - started).total_seconds() if started else 0
    )

    # Check 2: freshness of the data the bridge pushes into Hazelcast.
    hz = check_hazelcast_data_freshness()
    report["hazelcast"] = hz

    if not hz.get("available"):
        logger.warning("⚠️ Hazelcast unavailable for health check")
        report["action_required"] = "investigate"
        return report

    if not hz.get("has_data"):
        logger.warning("⚠️ No data in Hazelcast yet")
        report["action_required"] = "wait"
        return report

    data_age = hz.get("data_age_sec", float('inf'))

    if data_age > DATA_STALE_THRESHOLD:
        logger.error(f"❌ Data stale: {data_age:.0f}s old (threshold: {DATA_STALE_THRESHOLD}s)")
        report["action_required"] = "restart"
        return report

    if data_age > DATA_WARNING_THRESHOLD:
        logger.warning(f"⚠️ Data warning: {data_age:.0f}s old")
    else:
        logger.info(f"✅ Healthy: data age {data_age:.0f}s, scan #{hz.get('scan_number')}")

    report["healthy"] = True
    return report
|
||
|
|
|
||
|
|
|
||
|
|
@task(name="restart-bridge")
def restart_bridge() -> bool:
    """Restart the scan bridge service.

    Stops the current subprocess (if any), waits RESTART_DELAY seconds,
    and starts a fresh one. Gives up once MAX_RESTART_ATTEMPTS consecutive
    attempts have been made (the counter is reset by a successful start).

    Returns:
        True if the bridge restarted successfully, False otherwise.
    """
    logger = get_run_logger()
    global bridge_process

    # BUGFIX: guard against being invoked before the daemon flow has
    # initialized the process manager — previously this raised an
    # AttributeError on None instead of reporting a failed restart.
    if bridge_process is None:
        logger.error("❌ Bridge process manager not initialized")
        return False

    bridge_process.restart_count += 1

    if bridge_process.restart_count > MAX_RESTART_ATTEMPTS:
        logger.error(f"❌ Max restart attempts ({MAX_RESTART_ATTEMPTS}) exceeded")
        return False

    logger.warning(f"🔄 Restarting bridge (attempt {bridge_process.restart_count}/{MAX_RESTART_ATTEMPTS})...")

    # Stop existing
    bridge_process.stop()
    time.sleep(RESTART_DELAY)

    # Start new
    if bridge_process.start():
        logger.info("✅ Bridge restarted successfully")
        return True
    else:
        logger.error("❌ Bridge restart failed")
        return False
|
||
|
|
|
||
|
|
|
||
|
|
# Global process manager, created by scan_bridge_daemon_flow() and shared
# with the health-check / restart tasks via module-level state.
bridge_process: Optional[ScanBridgeProcess] = None
|
||
|
|
|
||
|
|
|
||
|
|
@flow(
    name="scan-bridge-daemon",
    description="Long-running daemon that supervises the scan bridge service",
    log_prints=True,
)
def scan_bridge_daemon_flow():
    """
    Main daemon flow that runs indefinitely, managing the scan bridge.

    This flow:
    1. Starts the scan bridge subprocess
    2. Monitors health every 30 seconds
    3. Restarts on failure or stale data
    4. Logs all output to Prefect

    Raises:
        RuntimeError: if the initial start fails, a restart fails, or the
            health check fails max_consecutive_failures times in a row.
            The subprocess is always stopped on exit (see finally block).
    """
    global bridge_process

    logger = get_run_logger()
    logger.info("=" * 70)
    logger.info("🐬 DOLPHIN Scan Bridge Daemon (Prefect)")
    logger.info("=" * 70)
    logger.info(f"Health check interval: {HEALTH_CHECK_INTERVAL}s")
    logger.info(f"Data stale threshold: {DATA_STALE_THRESHOLD}s")
    logger.info(f"Script: {DAEMON_SCRIPT}")
    logger.info("=" * 70)

    # Initialize process manager (module-level so the tasks can see it)
    bridge_process = ScanBridgeProcess()

    # Start initial instance
    if not bridge_process.start():
        logger.error("❌ Failed to start scan bridge")
        raise RuntimeError("Initial start failed")

    # Unhealthy checks in a row; any healthy check resets the streak.
    consecutive_failures = 0
    max_consecutive_failures = 5

    try:
        while True:
            # Wait between health checks
            time.sleep(HEALTH_CHECK_INTERVAL)

            # Perform health check
            health = perform_health_check()

            if health["healthy"]:
                consecutive_failures = 0
                continue

            # Not healthy - determine action
            consecutive_failures += 1
            action = health.get("action_required")

            # Escalate: too many bad checks in a row fails the whole flow
            # so Prefect can alert / reschedule it.
            if consecutive_failures >= max_consecutive_failures:
                logger.error(f"❌ Too many consecutive failures ({consecutive_failures})")
                raise RuntimeError("Max failures exceeded")

            if action == "restart":
                if not restart_bridge():
                    logger.error("❌ Restart failed")
                    raise RuntimeError("Restart failed")

            elif action == "investigate":
                logger.warning("⚠️ Manual investigation required")
                # Don't restart, just wait and check again

            elif action == "wait":
                logger.info("⏳ Waiting for data...")
                # Normal for startup

    except KeyboardInterrupt:
        logger.info("\n🛑 Interrupted by user")
    except Exception as e:
        logger.error(f"❌ Daemon error: {e}")
        raise
    finally:
        # Always stop the subprocess on the way out so it isn't orphaned.
        logger.info("🧹 Cleaning up...")
        bridge_process.stop()
        logger.info("✅ Daemon stopped")
|
||
|
|
|
||
|
|
|
||
|
|
@flow(name="scan-bridge-health-check")
def quick_health_check() -> Dict[str, Any]:
    """
    Standalone health check flow for external monitoring.
    Can be scheduled independently for alerting.

    Returns:
        A dict with "status" in {"error", "no_data", "stale", "warning",
        "healthy"} plus the raw Hazelcast probe result.
    """
    logger = get_run_logger()

    # Probe Hazelcast for the latest scan payload.
    hz = check_hazelcast_data_freshness()

    if not hz.get("available"):
        logger.error("❌ Hazelcast unavailable")
        return {"status": "error", "hazelcast": hz}

    if not hz.get("has_data"):
        logger.error("❌ No scan data in Hazelcast")
        return {"status": "no_data", "hazelcast": hz}

    age_sec = hz.get("data_age_sec", 0)

    # Map data age onto a status, logging at matching severity.
    if age_sec > DATA_STALE_THRESHOLD:
        logger.error(f"❌ STALE DATA: {age_sec:.0f}s old")
        status = "stale"
    elif age_sec > DATA_WARNING_THRESHOLD:
        logger.warning(f"⚠️ Data warning: {age_sec:.0f}s old")
        status = "warning"
    else:
        logger.info(f"✅ Healthy: {age_sec:.0f}s old, scan #{hz.get('scan_number')}")
        status = "healthy"

    return {"status": status, "age_sec": age_sec, "hazelcast": hz}
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Run the daemon directly (without a Prefect deployment/worker);
    # blocks until interrupted or the failure limits are hit.
    scan_bridge_daemon_flow()
|