#!/usr/bin/env python3
"""
DOLPHIN Scan Bridge Prefect Daemon
===================================
Phase 2 Implementation: Prefect-managed long-running daemon for scan bridge.

This daemon supervises the scan bridge service, providing:
- Automatic restart on crash
- Health monitoring (data freshness)
- Centralized logging via Prefect
- Integration with DOLPHIN orchestration

Usage:
    # Deploy to Prefect
    prefect deployment build scan_bridge_prefect_daemon.py:scan_bridge_daemon_flow \
        --name "scan-bridge-daemon" --pool dolphin-daemon-pool

    # Start worker
    prefect worker start --pool dolphin-daemon-pool
"""

import sys
import time
import json
import signal
import subprocess
import threading
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any

# Add paths
sys.path.insert(0, '/mnt/dolphinng5_predict')
sys.path.insert(0, '/mnt/dolphinng5_predict/nautilus_dolphin')

# Prefect imports
from prefect import flow, task, get_run_logger
from prefect.runtime import flow_run
from prefect.states import Completed, Failed

# Hazelcast for health checks
try:
    import hazelcast
    HAZELCAST_AVAILABLE = True
except ImportError:
    HAZELCAST_AVAILABLE = False

# Constants
DAEMON_SCRIPT = "/mnt/dolphinng5_predict/prod/scan_bridge_service.py"
HEALTH_CHECK_INTERVAL = 30  # seconds
DATA_STALE_THRESHOLD = 60  # seconds (critical)
DATA_WARNING_THRESHOLD = 30  # seconds (warning)
RESTART_DELAY = 5  # seconds between restart attempts
MAX_RESTART_ATTEMPTS = 3


class ScanBridgeProcess:
    """Manages the scan bridge subprocess.

    Wraps a ``subprocess.Popen`` running ``DAEMON_SCRIPT`` and a background
    thread that forwards the child's combined stdout/stderr to the Prefect
    run logger.
    """

    def __init__(self):
        self.process: Optional[subprocess.Popen] = None
        self.start_time: Optional[datetime] = None
        self.restart_count = 0
        self._stop_event = threading.Event()
        self._monitor_thread: Optional[threading.Thread] = None

    def start(self) -> bool:
        """Start the scan bridge subprocess.

        Returns:
            True if the process is running (either already, or still alive
            two seconds after launch); False if it exited immediately or
            could not be launched.
        """
        logger = get_run_logger()

        if self.process and self.process.poll() is None:
            logger.warning("Process already running")
            return True

        logger.info(f"🚀 Starting scan bridge (attempt {self.restart_count + 1})...")

        # A previous stop() leaves the event set; without clearing it the new
        # log monitor would exit on its first line and the child's stdout
        # pipe would never be drained again.
        self._stop_event.clear()

        try:
            self.process = subprocess.Popen(
                [sys.executable, DAEMON_SCRIPT],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1,  # Line buffered
                cwd="/mnt/dolphinng5_predict/prod"
            )

            # Wait for startup
            time.sleep(2)

            if self.process.poll() is None:
                self.start_time = datetime.now(timezone.utc)
                self.restart_count = 0
                logger.info(f"✅ Scan bridge started (PID: {self.process.pid})")

                # Start log monitor thread
                self._start_log_monitor()
                return True
            else:
                logger.error(f"❌ Process exited immediately with code {self.process.poll()}")
                return False

        except Exception as e:
            logger.error(f"❌ Failed to start: {e}")
            return False

    def stop(self, timeout: int = 10) -> None:
        """Stop the scan bridge subprocess gracefully.

        Sends SIGTERM and waits up to ``timeout`` seconds; if the process is
        still alive it is killed. Errors while stopping are logged, not
        raised.

        Args:
            timeout: Seconds to wait for a graceful exit before killing.
        """
        logger = get_run_logger()

        self._stop_event.set()

        if not self.process:
            return

        if self.process.poll() is not None:
            logger.debug("Process already stopped")
            return

        logger.info(f"🛑 Stopping scan bridge (PID: {self.process.pid})...")

        try:
            # Try graceful shutdown
            self.process.send_signal(signal.SIGTERM)

            # Wait for process to terminate
            try:
                self.process.wait(timeout=timeout)
                logger.info("✅ Process stopped gracefully")
            except subprocess.TimeoutExpired:
                logger.warning("⚠️ Process didn't stop in time, forcing...")
                self.process.kill()
                self.process.wait()
                logger.info("✅ Process killed")

        except Exception as e:
            logger.error(f"Error stopping process: {e}")
        finally:
            self.process = None
            # Give the log monitor a moment to hit EOF on the closed pipe.
            if self._monitor_thread and self._monitor_thread.is_alive():
                self._monitor_thread.join(timeout=2)

    def is_running(self) -> bool:
        """Check if process is running."""
        return self.process is not None and self.process.poll() is None

    def get_exit_code(self) -> Optional[int]:
        """Get process exit code if terminated.

        Returns:
            The exit code, or None if there is no process or it is still
            running.
        """
        if self.process is None:
            return None
        return self.process.poll()

    def _start_log_monitor(self):
        """Start thread to monitor and forward logs."""
        if not self.process:
            return

        # Resolve the logger here, in the caller's thread: get_run_logger()
        # depends on the active Prefect run context, which is not available
        # inside a freshly spawned thread. Looking it up in the thread would
        # raise, kill the monitor, and leave the child's stdout undrained.
        logger = get_run_logger()
        # Bind the stream now so a concurrent stop() setting self.process to
        # None cannot break the reader.
        stream = self.process.stdout

        def monitor():
            try:
                for line in iter(stream.readline, ''):
                    if self._stop_event.is_set():
                        break
                    line = line.strip()
                    if line:
                        # Forward to Prefect logs with prefix
                        logger.info(f"[Bridge] {line}")
            except Exception as e:
                logger.debug(f"Log monitor ended: {e}")

        self._monitor_thread = threading.Thread(target=monitor, daemon=True)
        self._monitor_thread.start()


def check_hazelcast_data_freshness() -> Dict[str, Any]:
    """Check freshness of data in Hazelcast.

    Reads ``latest_eigen_scan`` from the ``DOLPHIN_FEATURES`` map, parses it
    as JSON and derives the data age from its ``file_mtime`` field.

    Returns:
        A dict with ``available`` and, when Hazelcast is installed,
        ``has_data``. On success it also carries ``scan_number``,
        ``asset_count``, ``data_age_sec``, ``is_fresh`` and ``is_warning``;
        otherwise an ``error`` message.
    """
    if not HAZELCAST_AVAILABLE:
        return {"available": False, "error": "Hazelcast not installed"}

    client = None
    try:
        client = hazelcast.HazelcastClient(
            cluster_name="dolphin",
            cluster_members=["127.0.0.1:5701"],
        )

        features_map = client.get_map('DOLPHIN_FEATURES').blocking()
        val = features_map.get('latest_eigen_scan')

        if not val:
            return {
                "available": True,
                "has_data": False,
                "error": "No latest_eigen_scan in Hazelcast"
            }

        data = json.loads(val)
        mtime = data.get('file_mtime', 0)
        scan_number = data.get('scan_number', 0)
        asset_count = len(data.get('assets', []))

        # A missing/zero mtime is treated as infinitely old.
        age_sec = time.time() - mtime if mtime else float('inf')

        return {
            "available": True,
            "has_data": True,
            "scan_number": scan_number,
            "asset_count": asset_count,
            "data_age_sec": age_sec,
            "is_fresh": age_sec < DATA_STALE_THRESHOLD,
            "is_warning": age_sec >= DATA_WARNING_THRESHOLD,
        }

    except Exception as e:
        # NOTE(review): connection failures are also reported with
        # "available": True here, so callers see them as "no data" rather
        # than "unavailable" - confirm that is intended.
        return {
            "available": True,
            "has_data": False,
            "error": str(e)
        }
    finally:
        # Always release the client; this runs every health check, so a
        # client left open on the error path would accumulate.
        if client is not None:
            try:
                client.shutdown()
            except Exception:
                # Best-effort cleanup: a failed shutdown must not replace
                # the result already being returned.
                pass


@task(name="health-check", retries=2, retry_delay_seconds=5)
def perform_health_check() -> Dict[str, Any]:
    """Perform comprehensive health check.

    Checks that the bridge subprocess is running, then that Hazelcast holds
    sufficiently recent scan data.

    Returns:
        A dict with ``timestamp``, ``process_running``, ``hazelcast``,
        ``healthy`` and ``action_required`` (``"restart"``,
        ``"investigate"``, ``"wait"`` or None).
    """
    logger = get_run_logger()

    result = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "process_running": False,
        "hazelcast": {},
        "healthy": False,
        "action_required": None
    }

    # Check 1: Process status
    global bridge_process
    if bridge_process and bridge_process.is_running():
        result["process_running"] = True
        result["process_pid"] = bridge_process.process.pid
        result["uptime_sec"] = (
            (datetime.now(timezone.utc) - bridge_process.start_time).total_seconds()
            if bridge_process.start_time else 0
        )
    else:
        logger.error("❌ Health check: Process not running")
        result["action_required"] = "restart"
        return result

    # Check 2: Hazelcast data freshness
    hz_status = check_hazelcast_data_freshness()
    result["hazelcast"] = hz_status

    if not hz_status.get("available"):
        logger.warning("⚠️ Hazelcast unavailable for health check")
        result["action_required"] = "investigate"
        return result

    if not hz_status.get("has_data"):
        logger.warning("⚠️ No data in Hazelcast yet")
        result["action_required"] = "wait"
        return result

    age = hz_status.get("data_age_sec", float('inf'))

    if age > DATA_STALE_THRESHOLD:
        logger.error(f"❌ Data stale: {age:.0f}s old (threshold: {DATA_STALE_THRESHOLD}s)")
        result["action_required"] = "restart"
        return result
    elif age > DATA_WARNING_THRESHOLD:
        logger.warning(f"⚠️ Data warning: {age:.0f}s old")
    else:
        logger.info(f"✅ Healthy: data age {age:.0f}s, scan #{hz_status.get('scan_number')}")

    # Data in the warning band is logged but still counts as healthy; only
    # stale data triggers a restart.
    result["healthy"] = True
    return result


@task(name="restart-bridge")
def restart_bridge() -> bool:
    """Restart the scan bridge service.

    Returns:
        True if the bridge was stopped and started again successfully,
        False otherwise.
    """
    logger = get_run_logger()

    global bridge_process

    if bridge_process is None:
        # Only the daemon flow creates the manager; nothing to restart here.
        logger.error("❌ Bridge process manager not initialized")
        return False

    bridge_process.restart_count += 1

    # NOTE(review): start() resets restart_count to 0 on success and the
    # daemon flow aborts on a failed restart, so this limit does not appear
    # reachable with the current control flow.
    if bridge_process.restart_count > MAX_RESTART_ATTEMPTS:
        logger.error(f"❌ Max restart attempts ({MAX_RESTART_ATTEMPTS}) exceeded")
        return False

    logger.warning(f"🔄 Restarting bridge (attempt {bridge_process.restart_count}/{MAX_RESTART_ATTEMPTS})...")

    # Stop existing
    bridge_process.stop()
    time.sleep(RESTART_DELAY)

    # Start new
    if bridge_process.start():
        logger.info("✅ Bridge restarted successfully")
        return True
    else:
        logger.error("❌ Bridge restart failed")
        return False


# Global process manager
bridge_process: Optional[ScanBridgeProcess] = None


@flow(
    name="scan-bridge-daemon",
    description="Long-running daemon that supervises the scan bridge service",
    log_prints=True,
)
def scan_bridge_daemon_flow():
    """
    Main daemon flow that runs indefinitely, managing the scan bridge.

    This flow:
    1. Starts the scan bridge subprocess
    2. Monitors health every 30 seconds
    3. Restarts on failure or stale data
    4. Logs all output to Prefect

    Raises:
        RuntimeError: If the initial start fails, a restart fails, or too
            many consecutive health checks are unhealthy.
    """
    global bridge_process

    logger = get_run_logger()
    logger.info("=" * 70)
    logger.info("🐬 DOLPHIN Scan Bridge Daemon (Prefect)")
    logger.info("=" * 70)
    logger.info(f"Health check interval: {HEALTH_CHECK_INTERVAL}s")
    logger.info(f"Data stale threshold: {DATA_STALE_THRESHOLD}s")
    logger.info(f"Script: {DAEMON_SCRIPT}")
    logger.info("=" * 70)

    # Initialize process manager
    bridge_process = ScanBridgeProcess()

    # Start initial instance
    if not bridge_process.start():
        logger.error("❌ Failed to start scan bridge")
        raise RuntimeError("Initial start failed")

    consecutive_failures = 0
    max_consecutive_failures = 5

    try:
        while True:
            # Wait between health checks
            time.sleep(HEALTH_CHECK_INTERVAL)

            # Perform health check
            health = perform_health_check()

            if health["healthy"]:
                consecutive_failures = 0
                continue

            # Not healthy - determine action
            consecutive_failures += 1
            action = health.get("action_required")

            if consecutive_failures >= max_consecutive_failures:
                logger.error(f"❌ Too many consecutive failures ({consecutive_failures})")
                raise RuntimeError("Max failures exceeded")

            if action == "restart":
                if not restart_bridge():
                    logger.error("❌ Restart failed")
                    raise RuntimeError("Restart failed")

            elif action == "investigate":
                logger.warning("⚠️ Manual investigation required")
                # Don't restart, just wait and check again

            elif action == "wait":
                logger.info("⏳ Waiting for data...")
                # Normal for startup

    except KeyboardInterrupt:
        logger.info("\n🛑 Interrupted by user")
    except Exception as e:
        logger.error(f"❌ Daemon error: {e}")
        raise
    finally:
        logger.info("🧹 Cleaning up...")
        bridge_process.stop()
        logger.info("✅ Daemon stopped")


@flow(name="scan-bridge-health-check")
def quick_health_check() -> Dict[str, Any]:
    """
    Standalone health check flow for external monitoring.
    Can be scheduled independently for alerting.

    Returns:
        A dict with ``status`` (``"error"``, ``"no_data"``, ``"stale"``,
        ``"warning"`` or ``"healthy"``), the raw ``hazelcast`` result and,
        when data exists, ``age_sec``.
    """
    logger = get_run_logger()

    # Check if bridge is running
    result = check_hazelcast_data_freshness()

    if not result.get("available"):
        logger.error("❌ Hazelcast unavailable")
        return {"status": "error", "hazelcast": result}

    if not result.get("has_data"):
        logger.error("❌ No scan data in Hazelcast")
        return {"status": "no_data", "hazelcast": result}

    age = result.get("data_age_sec", 0)

    if age > DATA_STALE_THRESHOLD:
        logger.error(f"❌ STALE DATA: {age:.0f}s old")
        return {"status": "stale", "age_sec": age, "hazelcast": result}
    elif age > DATA_WARNING_THRESHOLD:
        logger.warning(f"⚠️ Data warning: {age:.0f}s old")
        return {"status": "warning", "age_sec": age, "hazelcast": result}
    else:
        logger.info(f"✅ Healthy: {age:.0f}s old, scan #{result.get('scan_number')}")
        return {"status": "healthy", "age_sec": age, "hazelcast": result}


if __name__ == "__main__":
    # Run the daemon
    scan_bridge_daemon_flow()