#!/usr/bin/env python3
"""
DOLPHIN Scan Bridge Service (Linux)
====================================
Watches Arrow scan files and pushes to Hazelcast.
Handles DolphinNG6 restart/scan_number resets by using file timestamps.
"""

import os
import sys
import time
import json
import logging
from pathlib import Path
from datetime import datetime, timezone

import pyarrow as pa
import pyarrow.ipc as ipc
import numpy as np

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(name)s: %(message)s'
)
logger = logging.getLogger("ScanBridge")

# The sys.path entries must be inserted before the imports that follow them.
sys.path.insert(0, '/mnt/dolphinng5_predict')
sys.path.insert(0, '/mnt/dolphinng5_predict/nautilus_dolphin')
import hazelcast
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

sys.path.insert(0, '/mnt/dolphinng5_predict/prod')
from dolphin_exit_handler import install_exit_handler
install_exit_handler("scan_bridge")


class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to plain Python."""

    def default(self, obj):
        """Return a JSON-serializable equivalent of *obj*.

        NumPy booleans, integers, floats and arrays are converted to
        ``bool``, ``int``, ``float`` and ``list``; anything else is deferred
        to ``json.JSONEncoder.default`` (which raises ``TypeError``).
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


class ArrowScanReader:
    """Reads a single scan record from an Arrow IPC file."""

    # Columns with this suffix hold JSON text that is decoded on load.
    _JSON_SUFFIX = '_json'

    def load_scan(self, filepath):
        """Load the first row of the Arrow file at *filepath* as a dict.

        Columns whose name ends with ``_json`` and whose value is truthy are
        decoded with ``json.loads`` and stored under the column name with
        that suffix removed (``assets_json`` -> ``assets``). All other
        columns (including empty/None ``*_json`` ones) are stored unchanged
        under their original name.

        Args:
            filepath: Path (``str`` or ``Path``) of the Arrow IPC file.

        Returns:
            dict mapping column names to the Python value of row 0.

        Raises:
            Whatever pyarrow raises for an unreadable file or a table with no
            rows, and ``json.JSONDecodeError`` for malformed JSON columns.
        """
        suffix = self._JSON_SUFFIX
        result = {}
        with pa.memory_map(str(filepath), 'r') as source:
            table = ipc.open_file(source).read_all()
            for col in table.column_names:
                val = table.column(col)[0].as_py()
                # Parse JSON columns (assets_json, asset_prices_json, etc.)
                if col.endswith(suffix) and val:
                    # Strip only the trailing suffix; str.replace would also
                    # remove any '_json' occurring earlier in the name.
                    result[col[:-len(suffix)]] = json.loads(val)
                else:
                    result[col] = val
        return result


class ScanHandler(FileSystemEventHandler):
    """Watchdog handler that pushes newly created ``.arrow`` scans to Hazelcast."""

    def __init__(self, reader, hz_map):
        """Store collaborators and reset counters.

        Args:
            reader: Object with a ``load_scan(filepath)`` method returning a dict.
            hz_map: Map-like object with a ``put(key, value)`` method.
        """
        self.reader = reader
        self.hz_map = hz_map
        self.last_mtime = 0      # mtime of the most recently pushed file
        self.scans_pushed = 0    # number of scans successfully pushed

    def on_created(self, event):
        """Process the created path if it is an ``.arrow`` file."""
        if event.is_directory or not event.src_path.endswith('.arrow'):
            return
        self._process(event.src_path)

    def _process(self, filepath):
        """Load *filepath* and publish it under the ``latest_eigen_scan`` key.

        Files whose mtime is not newer than the last pushed file are skipped,
        so ordering relies on file timestamps rather than scan numbers.
        Every exception is logged and swallowed so one bad file cannot stop
        the caller.

        NOTE(review): this is invoked both from ``main`` (catch-up) and from
        watchdog's ``on_created`` callback; no locking guards
        ``last_mtime``/``scans_pushed`` — confirm whether that matters.
        """
        try:
            # Brief pause before reading — presumably to let the writer
            # finish the file; TODO confirm.
            time.sleep(0.02)
            mtime = Path(filepath).stat().st_mtime
            if mtime <= self.last_mtime:
                return

            scan = self.reader.load_scan(filepath)
            scan['bridge_ts'] = datetime.now(timezone.utc).isoformat()
            scan['file_mtime'] = mtime

            self.hz_map.put("latest_eigen_scan", json.dumps(scan, cls=NumpyEncoder))
            self.last_mtime = mtime
            self.scans_pushed += 1

            if self.scans_pushed % 100 == 0:
                logger.info(f"Pushed {self.scans_pushed} | #{scan.get('scan_number')} | "
                            f"{len(scan.get('assets', []))} assets | {len(scan.get('asset_prices', []))} prices")
        except Exception as e:
            # Best-effort: keep running, but record the file and traceback.
            logger.exception("Error processing %s: %s", filepath, e)


def get_latest_file(arrow_dir):
    """Return the most recently modified ``.arrow`` file in *arrow_dir*.

    Args:
        arrow_dir: Directory to scan (non-recursive).

    Returns:
        ``(Path, mtime)`` of the newest ``.arrow`` file, or ``(None, 0)`` if
        the directory does not exist or contains no such file.
    """
    latest_file = None
    latest_mtime = 0
    try:
        with os.scandir(arrow_dir) as it:
            for entry in it:
                if entry.name.endswith('.arrow') and entry.is_file():
                    mtime = entry.stat().st_mtime
                    if mtime > latest_mtime:
                        latest_mtime = mtime
                        latest_file = Path(entry.path)
    except FileNotFoundError:
        return None, 0
    return latest_file, latest_mtime


ARROW_BASE = Path('/mnt/dolphinng6_data/arrow_scans')


def main():
    """Run the bridge until interrupted.

    Connects to Hazelcast, then loops once per second: whenever the local
    date changes (or on first start) the watchdog observer is re-pointed at
    ``ARROW_BASE/<YYYY-MM-DD>`` and the newest file already present there is
    pushed. The observer and the Hazelcast client are shut down on exit.
    """
    hz = hazelcast.HazelcastClient(cluster_name="dolphin", cluster_members=["127.0.0.1:5701"])
    observer = None
    try:
        # Inside the try so the client is shut down even if setup fails.
        hz_map = hz.get_map("DOLPHIN_FEATURES").blocking()
        logger.info("Connected to Hazelcast")

        reader = ArrowScanReader()
        handler = ScanHandler(reader, hz_map)
        current_date = None

        while True:
            today = datetime.now().strftime('%Y-%m-%d')
            arrow_dir = ARROW_BASE / today

            # Day rolled over or first start — rewire observer
            if today != current_date:
                if observer is not None:
                    observer.stop()
                    observer.join()
                arrow_dir.mkdir(parents=True, exist_ok=True)
                observer = Observer()
                observer.schedule(handler, str(arrow_dir), recursive=False)
                observer.start()
                current_date = today
                logger.info(f"Watching: {arrow_dir}")

                # Catch up on any files already present
                latest_file, _ = get_latest_file(arrow_dir)
                if latest_file:
                    handler._process(str(latest_file))
                    logger.info(f"Caught up to scan #{handler.scans_pushed} pushed so far")

            time.sleep(1)
    except KeyboardInterrupt:
        pass
    finally:
        try:
            if observer is not None:
                observer.stop()
                observer.join()
        finally:
            # Always release the Hazelcast client, even if stopping the
            # observer raised.
            hz.shutdown()


if __name__ == "__main__":
    main()