# Source file: DOLPHIN/prod/scan_bridge_prefect_flow.py
# (web-page chrome from the original paste removed so this file parses as Python)
#!/usr/bin/env python3
"""
DOLPHIN Scan Bridge - Prefect Managed Service
==============================================
Long-running flow that continuously watches for Arrow scan files
and pushes them to Hazelcast. Self-healing via Prefect.
Usage:
prefect deploy scan_bridge_prefect_flow.py:scan_bridge_flow \
--name scan-bridge --pool dolphin-services
prefect worker start --pool dolphin-services
"""
import os
import sys
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional
sys.path.insert(0, '/mnt/dolphinng5_predict')
sys.path.insert(0, '/mnt/dolphinng5_predict/prod')
from prefect import flow, task, get_run_logger
from prefect.runtime import flow_run
import pyarrow as pa
import pyarrow.ipc as ipc
import hazelcast
# Configuration
# NOTE(review): ARROW_DIR captures the date at import time, so it goes stale
# after midnight. It is only used for the startup log line; the polling loop
# recomputes the dated directory on every call (see get_latest_arrow_file).
ARROW_DIR = Path('/mnt/ng6_data/arrow_scans') / datetime.now().strftime('%Y-%m-%d')
HZ_CLUSTER = "dolphin"  # Hazelcast cluster name
HZ_HOST = "127.0.0.1:5701"  # Hazelcast member address (host:port)
POLL_INTERVAL = 5.0 # seconds when idle
HEALTH_LOG_INTERVAL = 60 # log status every 60 iterations (~5 min)
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts numpy values to JSON-native Python types.

    Handles:
      - numpy arrays (anything with .tolist()) -> Python list
      - numpy scalars (anything with .item())  -> Python scalar
      - lists/tuples mixing numpy and plain values -> list of plain values
    """

    def default(self, obj):
        # Check .tolist() before .item(): numpy scalars have both, but
        # .item() raises ValueError on arrays with more than one element.
        if hasattr(obj, 'tolist'):  # numpy array or scalar
            return obj.tolist()
        if hasattr(obj, 'item'):  # other numpy-like scalar
            return obj.item()
        if isinstance(obj, (list, tuple)):
            # Convert element-wise, passing already-serializable values
            # through untouched. (Recursing into default() for plain
            # ints/strings would raise TypeError from the base class.)
            return [
                x.tolist() if hasattr(x, 'tolist')
                else x.item() if hasattr(x, 'item')
                else x
                for x in obj
            ]
        return super().default(obj)
def load_scan_file(filepath: Path) -> dict:
    """Read a single-row Arrow IPC file and return it as a plain dict.

    Columns whose name ends in ``_json`` are JSON-decoded (when non-empty)
    and stored under the name with every ``_json`` occurrence stripped;
    all other columns are passed through as their first-row Python value.
    """
    with pa.memory_map(str(filepath), 'r') as source:
        table = ipc.open_file(source).read_all()
        parsed = {}
        for name in table.column_names:
            value = table.column(name)[0].as_py()
            if name.endswith('_json') and value:
                # Embedded JSON payload - decode it before forwarding.
                parsed[name.replace('_json', '')] = json.loads(value)
            else:
                parsed[name] = value
        return parsed
def get_latest_arrow_file() -> Optional[Path]:
    """Return the most recently modified .arrow file in today's scan dir.

    The dated directory is recomputed on every call so the bridge rolls
    over to a new directory at midnight. Returns None when the directory
    does not exist or contains no .arrow files.
    """
    scan_dir = Path('/mnt/ng6_data/arrow_scans') / datetime.now().strftime('%Y-%m-%d')
    if not scan_dir.exists():
        return None
    try:
        candidates = [
            entry for entry in scan_dir.iterdir()
            if entry.name.endswith('.arrow') and entry.is_file()
        ]
    except FileNotFoundError:
        # Directory vanished between the exists() check and the listing.
        return None
    if not candidates:
        return None
    return max(candidates, key=lambda entry: entry.stat().st_mtime)
@task(name="push-to-hazelcast", retries=3, retry_delay_seconds=5)
def push_scan_to_hz(scan_data: dict, filepath: Path) -> bool:
    """Push one parsed scan to the Hazelcast DOLPHIN_FEATURES map.

    Annotates the payload with bridge metadata (UTC timestamp, source tag,
    and the source file's mtime) before serializing. A fresh client is
    created per call and always shut down, even on failure.
    """
    client = hazelcast.HazelcastClient(
        cluster_name=HZ_CLUSTER,
        cluster_members=[HZ_HOST],
    )
    try:
        hz_map = client.get_map('DOLPHIN_FEATURES').blocking()
        # Stamp bridge provenance onto the payload.
        scan_data['bridge_ts'] = datetime.now(timezone.utc).isoformat()
        scan_data['bridge_source'] = 'scan_bridge_prefect'
        scan_data['file_mtime'] = filepath.stat().st_mtime
        payload = json.dumps(scan_data, cls=NumpyEncoder)
        hz_map.put("latest_eigen_scan", payload)
        return True
    finally:
        client.shutdown()
@task(name="health-check")
def check_hz_connection() -> bool:
    """Return True if a Hazelcast client can connect, False otherwise.

    Opens a throwaway client purely as a connectivity probe and shuts it
    down immediately; any exception is treated as "not reachable".
    """
    try:
        probe = hazelcast.HazelcastClient(
            cluster_name=HZ_CLUSTER,
            cluster_members=[HZ_HOST],
        )
    except Exception:
        return False
    probe.shutdown()
    return True
@flow(
    name="scan-bridge-flow",
    description="Continuously watch Arrow files and push to Hazelcast",
    log_prints=True,
    task_runner=None,  # Use default sequential runner
)
def scan_bridge_flow():
    """
    Main scan bridge flow - runs indefinitely.
    - Watches /mnt/ng6_data/arrow_scans/ for new .arrow files
    - Pushes parsed data to Hazelcast DOLPHIN_FEATURES
    - Handles idle periods (no new scans)
    - Self-healing via Prefect retries/restarts

    Raises:
        RuntimeError: if the initial Hazelcast connectivity check fails.
    """
    logger = get_run_logger()
    logger.info("=" * 70)
    logger.info("🐬 DOLPHIN Scan Bridge - Prefect Managed")
    logger.info("=" * 70)
    logger.info(f"Arrow directory: {ARROW_DIR}")
    logger.info(f"Hazelcast: {HZ_HOST} (cluster: {HZ_CLUSTER})")
    logger.info(f"Poll interval: {POLL_INTERVAL}s")
    logger.info("=" * 70)

    # Fail fast if Hazelcast is unreachable; Prefect restarts the flow.
    if not check_hz_connection():
        logger.error("❌ Cannot connect to Hazelcast - exiting")
        raise RuntimeError("Hazelcast connection failed")
    logger.info("✅ Connected to Hazelcast")

    # (Removed dead variable `last_scan_number`: it was assigned but never read.)
    last_file_mtime = 0  # mtime of the last file successfully pushed
    iterations = 0
    scans_pushed = 0
    try:
        while True:
            iterations += 1
            latest_file = get_latest_arrow_file()

            # No arrow files at all yet (directory missing or empty).
            if not latest_file:
                if iterations % HEALTH_LOG_INTERVAL == 0:
                    logger.info(f"⏳ No arrow files yet (iteration {iterations})")
                time.sleep(POLL_INTERVAL)
                continue

            # mtime-based change detection: skip files already pushed.
            mtime = latest_file.stat().st_mtime
            if mtime <= last_file_mtime:
                if iterations % HEALTH_LOG_INTERVAL == 0:
                    logger.info(f"⏳ Idle - waiting for new scans (pushed: {scans_pushed})")
                time.sleep(POLL_INTERVAL)
                continue

            # New file found - parse it and push to Hazelcast.
            try:
                scan_data = load_scan_file(latest_file)
                scan_number = scan_data.get('scan_number', 0)
                push_scan_to_hz(scan_data, latest_file)
                # Only advance the watermark after a successful push, so a
                # failed file is retried on the next iteration.
                last_file_mtime = mtime
                scans_pushed += 1
                # Log at INFO only every 10th scan to keep log volume down.
                if scans_pushed % 10 == 0:
                    logger.info(f"📊 Pushed {scans_pushed} scans (latest: #{scan_number})")
                else:
                    logger.debug(f"Pushed scan #{scan_number}")
            except Exception as e:
                # Best-effort service: log, back off briefly, keep running.
                logger.error(f"Error processing {latest_file.name}: {e}")
                time.sleep(1.0)  # Brief delay on error
                continue

            time.sleep(POLL_INTERVAL)
    except KeyboardInterrupt:
        logger.info("🛑 Interrupted by user")
    except Exception as e:
        logger.error(f"❌ Fatal error: {e}")
        raise
    finally:
        logger.info(f"✅ Scan bridge stopped. Total scans pushed: {scans_pushed}")
if __name__ == "__main__":
    # Allow running the bridge directly (outside a Prefect worker) for local testing.
    scan_bridge_flow()