Files
DOLPHIN/prod/scan_bridge_prefect_daemon.py

424 lines
14 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
DOLPHIN Scan Bridge Prefect Daemon
===================================
Phase 2 Implementation: Prefect-managed long-running daemon for scan bridge.
This daemon supervises the scan bridge service, providing:
- Automatic restart on crash
- Health monitoring (data freshness)
- Centralized logging via Prefect
- Integration with DOLPHIN orchestration
Usage:
# Deploy to Prefect
prefect deployment build scan_bridge_prefect_daemon.py:scan_bridge_daemon_flow \
--name "scan-bridge-daemon" --pool dolphin-daemon-pool
# Start worker
prefect worker start --pool dolphin-daemon-pool
"""
import sys
import time
import json
import signal
import subprocess
import threading
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any
# Add paths
sys.path.insert(0, '/mnt/dolphinng5_predict')
sys.path.insert(0, '/mnt/dolphinng5_predict/nautilus_dolphin')
# Prefect imports
from prefect import flow, task, get_run_logger
from prefect.runtime import flow_run
from prefect.states import Completed, Failed
# Hazelcast for health checks
try:
import hazelcast
HAZELCAST_AVAILABLE = True
except ImportError:
HAZELCAST_AVAILABLE = False
# Constants
# Script launched as the supervised child process (run with sys.executable).
DAEMON_SCRIPT = "/mnt/dolphinng5_predict/prod/scan_bridge_service.py"
# Sleep between two health checks in the daemon flow loop, in seconds.
HEALTH_CHECK_INTERVAL = 30 # seconds
# Data older than this (seconds) is treated as stale and triggers a restart.
DATA_STALE_THRESHOLD = 60 # seconds (critical)
# Data older than this (seconds) only produces a warning log entry.
DATA_WARNING_THRESHOLD = 30 # seconds (warning)
# Pause between stopping the old process and starting the new one on restart.
RESTART_DELAY = 5 # seconds between restart attempts
# Upper bound compared against ScanBridgeProcess.restart_count in restart_bridge().
MAX_RESTART_ATTEMPTS = 3
class ScanBridgeProcess:
    """Manage the scan bridge subprocess.

    The bridge script (``DAEMON_SCRIPT``) is launched with the current Python
    interpreter. Its stdout and stderr are merged into one pipe that a
    background thread reads line by line and forwards to the run logger.
    """

    def __init__(self):
        # Handle of the child process; None before the first start and after stop().
        self.process: Optional[subprocess.Popen] = None
        # UTC timestamp of the most recent successful start.
        self.start_time: Optional[datetime] = None
        # Incremented by restart_bridge(); reset to 0 by every successful start().
        self.restart_count = 0
        # Set by stop() to ask the log monitor thread to exit.
        self._stop_event = threading.Event()
        self._monitor_thread: Optional[threading.Thread] = None

    def start(self) -> bool:
        """Start the scan bridge subprocess.

        Returns:
            True if the process is already running, or if it was launched and
            is still alive after a 2 second grace period. False if it exited
            immediately or could not be launched at all.
        """
        logger = get_run_logger()
        if self.process and self.process.poll() is None:
            logger.warning("Process already running")
            return True
        logger.info(f"🚀 Starting scan bridge (attempt {self.restart_count + 1})...")
        try:
            self.process = subprocess.Popen(
                [sys.executable, DAEMON_SCRIPT],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1,  # Line buffered
                cwd="/mnt/dolphinng5_predict/prod"
            )
            # Wait for startup so an immediate crash is detected here.
            time.sleep(2)
            if self.process.poll() is None:
                self.start_time = datetime.now(timezone.utc)
                # NOTE(review): this reset runs on every successful start, so the
                # MAX_RESTART_ATTEMPTS check in restart_bridge() never sees more
                # than the current attempt - confirm that this is intended.
                self.restart_count = 0
                logger.info(f"✅ Scan bridge started (PID: {self.process.pid})")
                # Hand the already-resolved logger to the monitor thread.
                self._start_log_monitor(logger)
                return True
            else:
                logger.error(f"❌ Process exited immediately with code {self.process.poll()}")
                return False
        except Exception as e:
            logger.error(f"❌ Failed to start: {e}")
            return False

    def stop(self, timeout: int = 10) -> None:
        """Stop the scan bridge subprocess gracefully.

        Sends SIGTERM and waits up to ``timeout`` seconds; if the process is
        still alive it is killed. Afterwards the log monitor thread is joined
        for at most 2 seconds.

        Args:
            timeout: Seconds to wait for a graceful exit before killing.
        """
        logger = get_run_logger()
        self._stop_event.set()
        if not self.process:
            return
        if self.process.poll() is not None:
            logger.debug("Process already stopped")
            return
        logger.info(f"🛑 Stopping scan bridge (PID: {self.process.pid})...")
        try:
            # Try graceful shutdown
            self.process.send_signal(signal.SIGTERM)
            # Wait for process to terminate
            try:
                self.process.wait(timeout=timeout)
                logger.info("✅ Process stopped gracefully")
            except subprocess.TimeoutExpired:
                logger.warning("⚠️ Process didn't stop in time, forcing...")
                self.process.kill()
                self.process.wait()
                logger.info("✅ Process killed")
        except Exception as e:
            logger.error(f"Error stopping process: {e}")
        finally:
            self.process = None
        if self._monitor_thread and self._monitor_thread.is_alive():
            self._monitor_thread.join(timeout=2)

    def is_running(self) -> bool:
        """Return True while a child process exists and has not exited."""
        return self.process is not None and self.process.poll() is None

    def get_exit_code(self) -> Optional[int]:
        """Return the child's exit code, or None if absent or still running."""
        if self.process is None:
            return None
        return self.process.poll()

    def _start_log_monitor(self, logger=None):
        """Start a daemon thread that forwards the child's output to the logs.

        Args:
            logger: Logger used by the thread. When omitted it is obtained via
                ``get_run_logger()`` in the calling thread.
        """
        if not self.process:
            return
        # Resolve the logger here, not inside the thread: get_run_logger()
        # needs an active Prefect run context, and a plain threading.Thread
        # does not inherit the caller's context by default.
        if logger is None:
            logger = get_run_logger()
        # Bind the pipe now so the thread never dereferences self.process,
        # which stop() resets to None concurrently.
        stream = self.process.stdout
        # stop() leaves the event set; without clearing it the monitor of a
        # restarted process would exit on its first line and the pipe would
        # no longer be drained.
        self._stop_event.clear()

        def monitor():
            try:
                for raw_line in iter(stream.readline, ''):
                    if self._stop_event.is_set():
                        break
                    text = raw_line.strip()
                    if text:
                        # Forward to Prefect logs with prefix
                        logger.info(f"[Bridge] {text}")
            except Exception as e:
                logger.debug(f"Log monitor ended: {e}")

        self._monitor_thread = threading.Thread(target=monitor, daemon=True)
        self._monitor_thread.start()
def check_hazelcast_data_freshness() -> Dict[str, Any]:
    """Check freshness of the latest scan stored in Hazelcast.

    Connects to the ``dolphin`` cluster at ``127.0.0.1:5701``, reads the
    ``latest_eigen_scan`` entry of the ``DOLPHIN_FEATURES`` map, parses it as
    JSON and derives the data age from its ``file_mtime`` field.

    Returns:
        A dict that always contains ``available``. Depending on the outcome:

        * Hazelcast package missing: ``available`` False plus ``error``.
        * Entry missing/empty, or any exception: ``available`` True,
          ``has_data`` False plus ``error``.
        * Entry found: ``has_data`` True, ``scan_number``, ``asset_count``,
          ``data_age_sec`` (``inf`` when ``file_mtime`` is missing or 0),
          ``is_fresh`` and ``is_warning``.
    """
    if not HAZELCAST_AVAILABLE:
        return {"available": False, "error": "Hazelcast not installed"}
    try:
        client = hazelcast.HazelcastClient(
            cluster_name="dolphin",
            cluster_members=["127.0.0.1:5701"],
        )
        try:
            features_map = client.get_map('DOLPHIN_FEATURES').blocking()
            val = features_map.get('latest_eigen_scan')
            if not val:
                return {
                    "available": True,
                    "has_data": False,
                    "error": "No latest_eigen_scan in Hazelcast"
                }
            data = json.loads(val)
            mtime = data.get('file_mtime', 0)
            scan_number = data.get('scan_number', 0)
            asset_count = len(data.get('assets', []))
            age_sec = time.time() - mtime if mtime else float('inf')
            return {
                "available": True,
                "has_data": True,
                "scan_number": scan_number,
                "asset_count": asset_count,
                "data_age_sec": age_sec,
                "is_fresh": age_sec < DATA_STALE_THRESHOLD,
                "is_warning": age_sec >= DATA_WARNING_THRESHOLD,
            }
        finally:
            # Always release the client, including when the lookup or the
            # JSON parsing fails; this function runs on every health check,
            # so a leaked client per failure would accumulate.
            client.shutdown()
    except Exception as e:
        return {
            "available": True,
            "has_data": False,
            "error": str(e)
        }
@task(name="health-check", retries=2, retry_delay_seconds=5)
def perform_health_check() -> Dict[str, Any]:
    """Run the two-stage health check of the supervised bridge.

    Stage 1 verifies that the child process is alive; stage 2 inspects the
    age of the latest scan in Hazelcast.

    Returns:
        A report dict with ``timestamp``, ``process_running``, ``hazelcast``,
        ``healthy`` and ``action_required`` (``"restart"``, ``"investigate"``,
        ``"wait"`` or None). ``process_pid`` and ``uptime_sec`` are added when
        the process is running.
    """
    log = get_run_logger()
    report = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "process_running": False,
        "hazelcast": {},
        "healthy": False,
        "action_required": None
    }

    # Stage 1: the child process must exist and still be alive.
    if not (bridge_process and bridge_process.is_running()):
        log.error("❌ Health check: Process not running")
        report["action_required"] = "restart"
        return report

    report["process_running"] = True
    report["process_pid"] = bridge_process.process.pid
    if bridge_process.start_time:
        elapsed = datetime.now(timezone.utc) - bridge_process.start_time
        report["uptime_sec"] = elapsed.total_seconds()
    else:
        report["uptime_sec"] = 0

    # Stage 2: freshness of the data published to Hazelcast.
    hz = check_hazelcast_data_freshness()
    report["hazelcast"] = hz

    if not hz.get("available"):
        log.warning("⚠️ Hazelcast unavailable for health check")
        report["action_required"] = "investigate"
        return report

    if not hz.get("has_data"):
        log.warning("⚠️ No data in Hazelcast yet")
        report["action_required"] = "wait"
        return report

    age_sec = hz.get("data_age_sec", float('inf'))
    if age_sec > DATA_STALE_THRESHOLD:
        log.error(f"❌ Data stale: {age_sec:.0f}s old (threshold: {DATA_STALE_THRESHOLD}s)")
        report["action_required"] = "restart"
        return report

    # Past the stale guard the check passes; ageing data only gets a warning.
    if age_sec > DATA_WARNING_THRESHOLD:
        log.warning(f"⚠️ Data warning: {age_sec:.0f}s old")
    else:
        log.info(f"✅ Healthy: data age {age_sec:.0f}s, scan #{hz.get('scan_number')}")

    report["healthy"] = True
    return report
@task(name="restart-bridge")
def restart_bridge() -> bool:
    """Stop the bridge, pause for ``RESTART_DELAY`` seconds and start it again.

    Returns:
        True when the new process came up. False when the attempt counter
        exceeded ``MAX_RESTART_ATTEMPTS`` or the start failed.
    """
    log = get_run_logger()

    bridge_process.restart_count += 1
    attempt = bridge_process.restart_count
    if attempt > MAX_RESTART_ATTEMPTS:
        log.error(f"❌ Max restart attempts ({MAX_RESTART_ATTEMPTS}) exceeded")
        return False

    log.warning(f"🔄 Restarting bridge (attempt {attempt}/{MAX_RESTART_ATTEMPTS})...")

    # Tear down the old instance and pause before bringing up a new one.
    bridge_process.stop()
    time.sleep(RESTART_DELAY)

    started = bridge_process.start()
    if not started:
        log.error("❌ Bridge restart failed")
        return False

    log.info("✅ Bridge restarted successfully")
    return True
# Global process manager
# Assigned by scan_bridge_daemon_flow(); read by perform_health_check() and
# restart_bridge(). Stays None until the daemon flow has been entered.
bridge_process: Optional[ScanBridgeProcess] = None
@flow(
    name="scan-bridge-daemon",
    description="Long-running daemon that supervises the scan bridge service",
    log_prints=True,
)
def scan_bridge_daemon_flow():
    """
    Supervise the scan bridge until a fatal condition occurs.

    The flow starts the bridge subprocess, then loops forever: sleep for
    HEALTH_CHECK_INTERVAL seconds, run the health check, and act on its
    ``action_required`` hint (restart, investigate or wait).

    Raises:
        RuntimeError: if the initial start fails, a restart fails, or five
            consecutive health checks come back unhealthy.
    """
    global bridge_process
    log = get_run_logger()

    banner = "=" * 70
    startup_lines = (
        banner,
        "🐬 DOLPHIN Scan Bridge Daemon (Prefect)",
        banner,
        f"Health check interval: {HEALTH_CHECK_INTERVAL}s",
        f"Data stale threshold: {DATA_STALE_THRESHOLD}s",
        f"Script: {DAEMON_SCRIPT}",
        banner,
    )
    for message in startup_lines:
        log.info(message)

    # Create the process manager and bring up the first instance.
    bridge_process = ScanBridgeProcess()
    if not bridge_process.start():
        log.error("❌ Failed to start scan bridge")
        raise RuntimeError("Initial start failed")

    failure_streak = 0
    failure_limit = 5
    try:
        while True:
            # Pause first, then evaluate health.
            time.sleep(HEALTH_CHECK_INTERVAL)
            health = perform_health_check()

            if health["healthy"]:
                failure_streak = 0
                continue

            # Unhealthy: count it and give up once the streak is too long.
            failure_streak += 1
            if failure_streak >= failure_limit:
                log.error(f"❌ Too many consecutive failures ({failure_streak})")
                raise RuntimeError("Max failures exceeded")

            action = health.get("action_required")
            if action == "restart":
                restarted = restart_bridge()
                if not restarted:
                    log.error("❌ Restart failed")
                    raise RuntimeError("Restart failed")
            elif action == "investigate":
                # No restart here; simply re-check on the next iteration.
                log.warning("⚠️ Manual investigation required")
            elif action == "wait":
                # Expected while the bridge has not published data yet.
                log.info("⏳ Waiting for data...")
    except KeyboardInterrupt:
        log.info("\n🛑 Interrupted by user")
    except Exception as e:
        log.error(f"❌ Daemon error: {e}")
        raise
    finally:
        log.info("🧹 Cleaning up...")
        bridge_process.stop()
        log.info("✅ Daemon stopped")
@flow(name="scan-bridge-health-check")
def quick_health_check() -> Dict[str, Any]:
    """
    Standalone freshness check intended for external monitoring.

    Only the Hazelcast data is inspected; the bridge process itself is not
    examined here.

    Returns:
        A dict with ``status`` (``"error"``, ``"no_data"``, ``"stale"``,
        ``"warning"`` or ``"healthy"``) and the raw check under ``hazelcast``;
        ``age_sec`` is included whenever data was found.
    """
    log = get_run_logger()
    hz = check_hazelcast_data_freshness()

    if not hz.get("available"):
        log.error("❌ Hazelcast unavailable")
        return {"status": "error", "hazelcast": hz}

    if not hz.get("has_data"):
        log.error("❌ No scan data in Hazelcast")
        return {"status": "no_data", "hazelcast": hz}

    age_sec = hz.get("data_age_sec", 0)
    if age_sec > DATA_STALE_THRESHOLD:
        log.error(f"❌ STALE DATA: {age_sec:.0f}s old")
        status = "stale"
    elif age_sec > DATA_WARNING_THRESHOLD:
        log.warning(f"⚠️ Data warning: {age_sec:.0f}s old")
        status = "warning"
    else:
        log.info(f"✅ Healthy: {age_sec:.0f}s old, scan #{hz.get('scan_number')}")
        status = "healthy"

    return {"status": status, "age_sec": age_sec, "hazelcast": hz}
if __name__ == "__main__":
    # Run the daemon flow directly when the file is executed as a script;
    # the call only returns once the flow's supervision loop has ended.
    scan_bridge_daemon_flow()