#!/usr/bin/env python3
"""
DOLPHIN Scan Bridge - Prefect Managed Service
==============================================

Long-running flow that continuously watches for Arrow scan files and
pushes them to Hazelcast. Self-healing via Prefect.

Usage:
    prefect deploy scan_bridge_prefect_flow.py:scan_bridge_flow \
        --name scan-bridge --pool dolphin-services
    prefect worker start --pool dolphin-services
"""

import os
import sys
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional

sys.path.insert(0, '/mnt/dolphinng5_predict')
sys.path.insert(0, '/mnt/dolphinng5_predict/prod')

from prefect import flow, task, get_run_logger
from prefect.runtime import flow_run

import pyarrow as pa
import pyarrow.ipc as ipc
import hazelcast

# Configuration
ARROW_BASE_DIR = Path('/mnt/ng6_data/arrow_scans')
# NOTE: evaluated once at import time, so this goes stale after midnight.
# The polling loop recomputes the per-day directory on every poll (via
# _current_arrow_dir); this constant is kept for startup logging and
# backward compatibility only.
ARROW_DIR = ARROW_BASE_DIR / datetime.now().strftime('%Y-%m-%d')
HZ_CLUSTER = "dolphin"
HZ_HOST = "127.0.0.1:5701"
POLL_INTERVAL = 5.0        # seconds between polls when idle
HEALTH_LOG_INTERVAL = 60   # log status every 60 iterations (~5 min)


class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts numpy scalars and arrays to native types.

    json.dumps only calls ``default`` for objects it cannot serialize
    itself, so plain lists/tuples never reach this method; only the
    numpy conversions are needed here.
    """

    def default(self, obj):
        # Numpy scalars AND arrays both expose tolist(); on a 0-d scalar
        # it returns the plain Python value, on an array a nested list.
        # (obj.item() alone would raise ValueError on arrays of size > 1.)
        if hasattr(obj, 'tolist'):
            return obj.tolist()
        if hasattr(obj, 'item'):  # defensive: scalar-like without tolist()
            return obj.item()
        return super().default(obj)


def _current_arrow_dir() -> Path:
    """Return today's per-day Arrow scan directory (recomputed per call)."""
    return ARROW_BASE_DIR / datetime.now().strftime('%Y-%m-%d')


def load_scan_file(filepath: Path) -> dict:
    """Load an Arrow scan file and return its first row as a dict.

    Columns whose name ends in ``_json`` are JSON-decoded and stored
    under the name with that suffix stripped.

    Args:
        filepath: Path to an Arrow IPC file with at least one row.

    Returns:
        Mapping of column name -> first-row value.

    Raises:
        IndexError: if the table contains no rows (caught by the
            flow's per-file error handler).
    """
    with pa.memory_map(str(filepath), 'r') as source:
        table = ipc.open_file(source).read_all()

    result = {}
    for col in table.column_names:
        val = table.column(col)[0].as_py()
        if col.endswith('_json') and val:
            # removesuffix (not replace) so interior '_json' substrings
            # in a column name are left intact.
            result[col.removesuffix('_json')] = json.loads(val)
        else:
            result[col] = val
    return result


def get_latest_arrow_file() -> Optional[Path]:
    """Find the most recently modified .arrow file in today's directory.

    Returns None when the directory does not exist or holds no .arrow
    files. Tolerates files vanishing between listing and stat (the
    scan writer may rotate/delete files concurrently).
    """
    arrow_dir = _current_arrow_dir()
    if not arrow_dir.exists():
        return None

    latest_file: Optional[Path] = None
    latest_mtime = 0.0
    try:
        with os.scandir(arrow_dir) as it:
            for entry in it:
                if entry.name.endswith('.arrow') and entry.is_file():
                    try:
                        mtime = entry.stat().st_mtime
                    except FileNotFoundError:
                        # Deleted between scandir() and stat(); skip it.
                        continue
                    if mtime > latest_mtime:
                        latest_mtime = mtime
                        latest_file = Path(entry.path)
    except FileNotFoundError:
        # Directory removed mid-scan (e.g. day rollover cleanup).
        return None
    return latest_file


@task(name="push-to-hazelcast", retries=3, retry_delay_seconds=5)
def push_scan_to_hz(scan_data: dict, filepath: Path) -> bool:
    """Push scan data to the Hazelcast DOLPHIN_FEATURES map.

    A fresh client is created per push so Prefect's task retries get a
    clean connection each attempt; the client is always shut down.

    Args:
        scan_data: Parsed scan payload (mutated in place to add
            bridge metadata before serialization).
        filepath: Source .arrow file, used for the mtime stamp.

    Returns:
        True on success (exceptions propagate for Prefect to retry).
    """
    client = hazelcast.HazelcastClient(
        cluster_name=HZ_CLUSTER,
        cluster_members=[HZ_HOST],
    )
    try:
        features_map = client.get_map('DOLPHIN_FEATURES').blocking()

        # Add bridge metadata so consumers can detect staleness/source.
        scan_data['bridge_ts'] = datetime.now(timezone.utc).isoformat()
        scan_data['bridge_source'] = 'scan_bridge_prefect'
        scan_data['file_mtime'] = filepath.stat().st_mtime

        # Push to Hz under a single well-known key (latest wins).
        features_map.put("latest_eigen_scan",
                         json.dumps(scan_data, cls=NumpyEncoder))
        return True
    finally:
        client.shutdown()


@task(name="health-check")
def check_hz_connection() -> bool:
    """Return True iff a Hazelcast client can connect to the cluster."""
    try:
        client = hazelcast.HazelcastClient(
            cluster_name=HZ_CLUSTER,
            cluster_members=[HZ_HOST],
        )
        client.shutdown()
        return True
    except Exception:
        # Broad catch is intentional: any connection failure means
        # "unhealthy"; the caller decides whether to abort.
        return False


@flow(
    name="scan-bridge-flow",
    description="Continuously watch Arrow files and push to Hazelcast",
    log_prints=True,
    task_runner=None,  # Use default sequential runner
)
def scan_bridge_flow():
    """
    Main scan bridge flow - runs indefinitely.

    - Watches /mnt/ng6_data/arrow_scans/ for new .arrow files
    - Pushes parsed data to Hazelcast DOLPHIN_FEATURES
    - Handles idle periods (no new scans)
    - Self-healing via Prefect retries/restarts

    Raises:
        RuntimeError: if Hazelcast is unreachable at startup.
    """
    logger = get_run_logger()
    logger.info("=" * 70)
    logger.info("🐬 DOLPHIN Scan Bridge - Prefect Managed")
    logger.info("=" * 70)
    logger.info(f"Arrow directory: {ARROW_DIR}")
    logger.info(f"Hazelcast: {HZ_HOST} (cluster: {HZ_CLUSTER})")
    logger.info(f"Poll interval: {POLL_INTERVAL}s")
    logger.info("=" * 70)

    # Fail fast if Hazelcast is down; Prefect will restart the flow.
    if not check_hz_connection():
        logger.error("❌ Cannot connect to Hazelcast - exiting")
        raise RuntimeError("Hazelcast connection failed")
    logger.info("✅ Connected to Hazelcast")

    last_file_mtime = 0.0   # mtime of the last file successfully pushed
    iterations = 0
    scans_pushed = 0

    try:
        while True:
            iterations += 1

            # Find latest file in today's directory (recomputed per poll,
            # so the bridge follows the day rollover automatically).
            latest_file = get_latest_arrow_file()
            if not latest_file:
                if iterations % HEALTH_LOG_INTERVAL == 0:
                    logger.info(f"⏳ No arrow files yet (iteration {iterations})")
                time.sleep(POLL_INTERVAL)
                continue

            # Check if the file is new since the last successful push.
            try:
                mtime = latest_file.stat().st_mtime
            except FileNotFoundError:
                # File vanished between discovery and stat; re-poll.
                time.sleep(POLL_INTERVAL)
                continue
            if mtime <= last_file_mtime:
                if iterations % HEALTH_LOG_INTERVAL == 0:
                    logger.info(f"⏳ Idle - waiting for new scans (pushed: {scans_pushed})")
                time.sleep(POLL_INTERVAL)
                continue

            # New file found - process it. Per-file errors are logged and
            # retried on the next poll rather than killing the service.
            try:
                scan_data = load_scan_file(latest_file)
                scan_number = scan_data.get('scan_number', 0)

                # Push to Hz (Prefect task; retries on transient failure).
                push_scan_to_hz(scan_data, latest_file)
                last_file_mtime = mtime
                scans_pushed += 1

                # Log every 10 scans to keep the log volume sane.
                if scans_pushed % 10 == 0:
                    logger.info(f"📊 Pushed {scans_pushed} scans (latest: #{scan_number})")
                else:
                    logger.debug(f"Pushed scan #{scan_number}")
            except Exception as e:
                logger.error(f"Error processing {latest_file.name}: {e}")
                time.sleep(1.0)  # Brief delay on error
                continue

            time.sleep(POLL_INTERVAL)
    except KeyboardInterrupt:
        logger.info("🛑 Interrupted by user")
    except Exception as e:
        logger.error(f"❌ Fatal error: {e}")
        raise
    finally:
        logger.info(f"✅ Scan bridge stopped. Total scans pushed: {scans_pushed}")


if __name__ == "__main__":
    scan_bridge_flow()