#!/usr/bin/env python3
"""
DOLPHIN Scan Bridge - Prefect Managed Service
==============================================

Long-running flow that continuously watches for Arrow scan files
and pushes them to Hazelcast. Self-healing via Prefect.

Usage:
    prefect deploy scan_bridge_prefect_flow.py:scan_bridge_flow \
        --name scan-bridge --pool dolphin-services

    prefect worker start --pool dolphin-services
"""

import os
import sys
import json
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional

# Make project modules importable before the project-dependent imports below.
sys.path.insert(0, '/mnt/dolphinng5_predict')
sys.path.insert(0, '/mnt/dolphinng5_predict/prod')

from prefect import flow, task, get_run_logger
from prefect.runtime import flow_run

import pyarrow as pa
import pyarrow.ipc as ipc
import hazelcast
|
|
# Configuration
# NOTE(review): ARROW_DIR is snapshotted at import time (today's date) and is
# only used for the startup log line; get_latest_arrow_file() recomputes the
# dated directory on every poll, so date rollover is handled there.
ARROW_DIR = Path('/mnt/ng6_data/arrow_scans') / datetime.now().strftime('%Y-%m-%d')
HZ_CLUSTER = "dolphin"      # Hazelcast cluster name
HZ_HOST = "127.0.0.1:5701"  # Hazelcast member address (host:port)
POLL_INTERVAL = 5.0  # seconds when idle
HEALTH_LOG_INTERVAL = 60  # log status every 60 iterations (~5 min)
|
|
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts numpy values to native Python types.

    json.JSONEncoder calls default() only for objects it cannot serialize
    natively, so numpy scalars/arrays nested inside plain dicts and lists
    are still routed here one object at a time.
    """

    def default(self, obj):
        # numpy arrays (any size) and scalars: tolist() yields native
        # Python values. Checked before 'item' because multi-element
        # arrays also expose .item(), which raises ValueError for size != 1.
        if hasattr(obj, 'tolist'):
            return obj.tolist()
        if hasattr(obj, 'item'):  # other 0-d scalar wrappers
            return obj.item()
        if isinstance(obj, (list, tuple)):
            # Defensive: normally unreachable (the encoder handles lists
            # natively). Return a plain list instead of recursing default()
            # onto elements that are already serializable.
            return list(obj)
        return super().default(obj)
|
|
def load_scan_file(filepath: Path) -> dict:
    """Load and parse an Arrow scan file.

    Reads the first row of every column. Columns whose name ends in
    '_json' are JSON-decoded and stored under the name minus that suffix.

    Args:
        filepath: Path to an Arrow IPC file (first row of each column is read).

    Returns:
        dict mapping column name -> first-row value, with JSON columns decoded.
    """
    with pa.memory_map(str(filepath), 'r') as source:
        table = ipc.open_file(source).read_all()

    result = {}
    for col in table.column_names:
        val = table.column(col)[0].as_py()
        # Parse JSON columns. Strip only the trailing '_json'; str.replace
        # would also mangle a '_json' appearing mid-name.
        if col.endswith('_json') and val:
            result[col[:-len('_json')]] = json.loads(val)
        else:
            result[col] = val

    return result
|
|
def get_latest_arrow_file() -> Optional[Path]:
    """Find the most recently modified .arrow file."""
    # Today's dated scan directory, recomputed per call so date rollover
    # is handled automatically.
    scan_dir = Path('/mnt/ng6_data/arrow_scans') / datetime.now().strftime('%Y-%m-%d')

    if not scan_dir.exists():
        return None

    newest: Optional[Path] = None
    newest_mtime = 0

    try:
        with os.scandir(scan_dir) as entries:
            for candidate in entries:
                if not (candidate.name.endswith('.arrow') and candidate.is_file()):
                    continue
                modified = candidate.stat().st_mtime
                if modified > newest_mtime:
                    newest_mtime, newest = modified, Path(candidate.path)
    except FileNotFoundError:
        # Directory (or an entry) vanished between exists() and the scan.
        return None

    return newest
|
|
@task(name="push-to-hazelcast", retries=3, retry_delay_seconds=5)
def push_scan_to_hz(scan_data: dict, filepath: Path) -> bool:
    """Push scan data to Hazelcast DOLPHIN_FEATURES map."""
    # One short-lived client per push keeps each Prefect retry independent.
    client = hazelcast.HazelcastClient(
        cluster_name=HZ_CLUSTER,
        cluster_members=[HZ_HOST],
    )

    try:
        # Tag the payload with bridge provenance before publishing.
        scan_data['bridge_ts'] = datetime.now(timezone.utc).isoformat()
        scan_data['bridge_source'] = 'scan_bridge_prefect'
        scan_data['file_mtime'] = filepath.stat().st_mtime

        target_map = client.get_map('DOLPHIN_FEATURES').blocking()
        target_map.put("latest_eigen_scan", json.dumps(scan_data, cls=NumpyEncoder))

        return True
    finally:
        client.shutdown()
|
|
@task(name="health-check")
def check_hz_connection() -> bool:
    """Verify Hazelcast connectivity."""
    # Connect-then-disconnect probe; any failure means "not reachable".
    try:
        probe = hazelcast.HazelcastClient(
            cluster_name=HZ_CLUSTER,
            cluster_members=[HZ_HOST],
        )
        probe.shutdown()
    except Exception:
        return False
    return True
|
|
@flow(
    name="scan-bridge-flow",
    description="Continuously watch Arrow files and push to Hazelcast",
    log_prints=True,
    task_runner=None,  # Use default sequential runner
)
def scan_bridge_flow():
    """
    Main scan bridge flow - runs indefinitely.

    - Watches /mnt/ng6_data/arrow_scans/ for new .arrow files
    - Pushes parsed data to Hazelcast DOLPHIN_FEATURES
    - Handles idle periods (no new scans)
    - Self-healing via Prefect retries/restarts

    Raises:
        RuntimeError: if the initial Hazelcast connectivity check fails.
    """
    logger = get_run_logger()

    logger.info("=" * 70)
    logger.info("🐬 DOLPHIN Scan Bridge - Prefect Managed")
    logger.info("=" * 70)
    logger.info(f"Arrow directory: {ARROW_DIR}")
    logger.info(f"Hazelcast: {HZ_HOST} (cluster: {HZ_CLUSTER})")
    logger.info(f"Poll interval: {POLL_INTERVAL}s")
    logger.info("=" * 70)

    # Fail fast if Hazelcast is unreachable; Prefect restarts the flow.
    if not check_hz_connection():
        logger.error("❌ Cannot connect to Hazelcast - exiting")
        raise RuntimeError("Hazelcast connection failed")

    logger.info("✅ Connected to Hazelcast")

    # mtime of the last file pushed; files at or below it are already done.
    # (Removed unused last_scan_number local — it was assigned and never read.)
    last_file_mtime = 0
    iterations = 0
    scans_pushed = 0

    try:
        while True:
            iterations += 1

            # Find latest file (today's dated directory is recomputed inside).
            latest_file = get_latest_arrow_file()

            if not latest_file:
                if iterations % HEALTH_LOG_INTERVAL == 0:
                    logger.info(f"⏳ No arrow files yet (iteration {iterations})")
                time.sleep(POLL_INTERVAL)
                continue

            # Check if file is new
            mtime = latest_file.stat().st_mtime
            if mtime <= last_file_mtime:
                if iterations % HEALTH_LOG_INTERVAL == 0:
                    logger.info(f"⏳ Idle - waiting for new scans (pushed: {scans_pushed})")
                time.sleep(POLL_INTERVAL)
                continue

            # New file found - process it
            try:
                scan_data = load_scan_file(latest_file)
                scan_number = scan_data.get('scan_number', 0)

                # Push to Hz (task-level retries absorb transient failures)
                push_scan_to_hz(scan_data, latest_file)

                last_file_mtime = mtime
                scans_pushed += 1

                # Log every 10 scans
                if scans_pushed % 10 == 0:
                    logger.info(f"📊 Pushed {scans_pushed} scans (latest: #{scan_number})")
                else:
                    logger.debug(f"Pushed scan #{scan_number}")

            except Exception as e:
                # Best-effort loop: log, back off briefly, keep polling.
                logger.error(f"Error processing {latest_file.name}: {e}")
                time.sleep(1.0)  # Brief delay on error
                continue

            time.sleep(POLL_INTERVAL)

    except KeyboardInterrupt:
        logger.info("🛑 Interrupted by user")
    except Exception as e:
        logger.error(f"❌ Fatal error: {e}")
        raise
    finally:
        logger.info(f"✅ Scan bridge stopped. Total scans pushed: {scans_pushed}")
|
|
if __name__ == "__main__":
    # Run the flow directly (outside a Prefect deployment) for local testing.
    scan_bridge_flow()