Files
DOLPHIN/prod/scan_bridge_service.py

167 lines
5.0 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
DOLPHIN Scan Bridge Service (Linux)
====================================
Watches Arrow scan files and pushes to Hazelcast.
Handles DolphinNG6 restart/scan_number resets by using file timestamps.
"""
import json
import logging
import os
import sys
import threading
import time
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pyarrow as pa
import pyarrow.ipc as ipc
# Configure root logging: INFO level, with timestamp, level and logger name
# on every record.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] %(name)s: %(message)s'
)
# Module-wide logger used by the scan handler and the main loop.
logger = logging.getLogger("ScanBridge")
# Project directories are prepended to sys.path before the imports below, so
# the order of these statements matters.
# NOTE(review): presumably some of the following imports resolve from these
# directories — confirm which ones actually need them.
sys.path.insert(0, '/mnt/dolphinng5_predict')
sys.path.insert(0, '/mnt/dolphinng5_predict/nautilus_dolphin')
import hazelcast
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
# NOTE(review): dolphin_exit_handler is presumably located in the prod
# directory added here — confirm.
sys.path.insert(0, '/mnt/dolphinng5_predict/prod')
from dolphin_exit_handler import install_exit_handler
# Executed at import time, before main() runs.
# NOTE(review): install_exit_handler is defined outside this file; presumably
# it registers shutdown hooks labelled "scan_bridge" — confirm.
install_exit_handler("scan_bridge")
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to native values.

    Pass it as ``cls=NumpyEncoder`` to ``json.dumps``.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of *obj*.

        Args:
            obj: A value the base encoder could not serialize on its own.

        Returns:
            ``bool`` for ``np.bool_``, ``int`` for NumPy integers, ``float``
            for NumPy floats, and a (possibly nested) list for ``np.ndarray``.

        Raises:
            TypeError: Raised by the base class for any other type.
        """
        # np.bool_ is not a subclass of np.integer, so without this branch a
        # NumPy boolean would fall through to the base class and raise.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
class ArrowScanReader:
    """Reads the first row of an Arrow IPC scan file into a plain dict."""

    # Columns whose name ends with this suffix hold JSON-encoded text.
    _JSON_SUFFIX = '_json'

    def load_scan(self, filepath):
        """Load the first row of the Arrow file at *filepath*.

        Columns named ``<name>_json`` with a truthy value are decoded with
        ``json.loads`` and stored under ``<name>``; every other column
        (including ``*_json`` columns whose value is empty or null) is stored
        unchanged under its own name.

        Args:
            filepath: Path (``str`` or path-like) of the Arrow IPC file.

        Returns:
            dict mapping column names to the Python values of the first row.

        Raises:
            IndexError: If the file contains no rows.
            json.JSONDecodeError: If a ``*_json`` column holds invalid JSON.
        """
        with pa.memory_map(str(filepath), 'r') as source:
            table = ipc.open_file(source).read_all()
            if table.num_rows == 0:
                # Same exception type that indexing row 0 would raise, but
                # with the file named so the log entry is actionable.
                raise IndexError(f"Arrow scan file has no rows: {filepath}")
            # Convert to Python objects while the memory map is still open.
            first_row = {
                col: table.column(col)[0].as_py()
                for col in table.column_names
            }

        suffix = self._JSON_SUFFIX
        result = {}
        for col, val in first_row.items():
            # Parse JSON columns (assets_json, asset_prices_json, etc.)
            if col.endswith(suffix) and val:
                # Strip only the trailing suffix; str.replace would also
                # remove any '_json' occurring earlier in the column name.
                result[col[:-len(suffix)]] = json.loads(val)
            else:
                result[col] = val
        return result
class ScanHandler(FileSystemEventHandler):
    """Watchdog handler that publishes newly created ``.arrow`` scans.

    Each accepted scan is loaded through ``reader``, stamped with
    ``bridge_ts`` and ``file_mtime``, and stored as JSON under the key
    ``"latest_eigen_scan"`` of ``hz_map``. Files whose modification time is
    not newer than the last published one are ignored.
    """

    def __init__(self, reader, hz_map):
        """Store collaborators and reset counters.

        Args:
            reader: Object exposing ``load_scan(filepath) -> dict``.
            hz_map: Map-like object exposing ``put(key, value)``.
        """
        super().__init__()
        self.reader = reader
        self.hz_map = hz_map
        self.last_mtime = 0
        self.scans_pushed = 0
        # _process is invoked from the watchdog observer thread (through
        # on_created) and directly from main() for catch-up, so the
        # check-and-update of last_mtime must be serialized; otherwise an
        # older scan can be published over a newer one.
        self._lock = threading.Lock()

    def on_created(self, event):
        """Process a newly created ``.arrow`` file; ignore everything else."""
        if event.is_directory or not event.src_path.endswith('.arrow'):
            return
        self._process(event.src_path)

    def _process(self, filepath):
        """Publish the scan stored at *filepath* if it is newer than the last.

        Errors are logged with traceback and never propagated, so a bad file
        cannot kill the calling thread.

        Args:
            filepath: Path of the ``.arrow`` file, as a string.
        """
        try:
            # Short pause before touching the file.
            # NOTE(review): presumably gives the writer time to finish the
            # file after the creation event — confirm.
            time.sleep(0.02)
            with self._lock:
                mtime = Path(filepath).stat().st_mtime
                if mtime <= self.last_mtime:
                    return
                scan = self.reader.load_scan(filepath)
                scan['bridge_ts'] = datetime.now(timezone.utc).isoformat()
                scan['file_mtime'] = mtime
                self.hz_map.put("latest_eigen_scan", json.dumps(scan, cls=NumpyEncoder))
                # Only advance the watermark after a successful put.
                self.last_mtime = mtime
                self.scans_pushed += 1
                if self.scans_pushed % 100 == 0:
                    logger.info(f"Pushed {self.scans_pushed} | #{scan.get('scan_number')} | "
                                f"{len(scan.get('assets', []))} assets | {len(scan.get('asset_prices', []))} prices")
        except Exception as e:
            # Include the file path and the traceback; the bare message alone
            # does not say which scan failed or where.
            logger.exception(f"Error processing {filepath}: {e}")
def get_latest_file(arrow_dir):
    """Return the most recently modified ``.arrow`` file in *arrow_dir*.

    Only regular files directly inside the directory are considered.

    Args:
        arrow_dir: Directory to scan (``str`` or path-like).

    Returns:
        Tuple ``(path, mtime)`` where ``path`` is a ``Path`` and ``mtime`` its
        ``st_mtime``; ``(None, 0)`` if the directory does not exist or holds
        no matching file.
    """
    latest_file = None
    latest_mtime = 0
    try:
        with os.scandir(arrow_dir) as it:
            for entry in it:
                if not entry.name.endswith('.arrow'):
                    continue
                try:
                    if not entry.is_file():
                        continue
                    mtime = entry.stat().st_mtime
                except FileNotFoundError:
                    # The entry vanished between listing and stat(); skip it
                    # instead of discarding the candidates found so far.
                    continue
                if mtime > latest_mtime:
                    latest_mtime = mtime
                    latest_file = Path(entry.path)
    except FileNotFoundError:
        # The directory itself does not exist.
        return None, 0
    return latest_file, latest_mtime
# Root directory of the Arrow scan files; main() watches the per-day
# subdirectory ARROW_BASE/<YYYY-MM-DD>.
ARROW_BASE = Path('/mnt/dolphinng6_data/arrow_scans')
def main():
    """Run the bridge: watch today's scan directory and push scans to Hazelcast.

    Connects to the local Hazelcast cluster, then loops forever. Whenever the
    local calendar date changes (or on first start) the watchdog observer is
    re-pointed at ``ARROW_BASE/<YYYY-MM-DD>`` (created if missing) and the
    newest file already present is pushed once. The loop ends on
    ``KeyboardInterrupt``; the observer and the Hazelcast client are always
    shut down on exit.
    """
    hz = hazelcast.HazelcastClient(cluster_name="dolphin", cluster_members=["127.0.0.1:5701"])
    observer = None
    try:
        # Setup lives inside the try so the client is shut down even when
        # obtaining the map fails.
        hz_map = hz.get_map("DOLPHIN_FEATURES").blocking()
        logger.info("Connected to Hazelcast")
        reader = ArrowScanReader()
        handler = ScanHandler(reader, hz_map)
        current_date = None
        while True:
            # Local (naive) date decides which directory is watched.
            today = datetime.now().strftime('%Y-%m-%d')
            arrow_dir = ARROW_BASE / today
            # Day rolled over or first start — rewire observer
            if today != current_date:
                if observer is not None:
                    observer.stop()
                    observer.join()
                arrow_dir.mkdir(parents=True, exist_ok=True)
                observer = Observer()
                observer.schedule(handler, str(arrow_dir), recursive=False)
                observer.start()
                current_date = today
                logger.info(f"Watching: {arrow_dir}")
                # Catch up on any files already present
                latest_file, _ = get_latest_file(arrow_dir)
                if latest_file:
                    handler._process(str(latest_file))
                    logger.info(f"Caught up to scan #{handler.scans_pushed} pushed so far")
            time.sleep(1)
    except KeyboardInterrupt:
        pass
    finally:
        try:
            if observer is not None:
                observer.stop()
                observer.join()
        finally:
            # Runs even if stopping the observer raises.
            hz.shutdown()
if __name__ == "__main__":
main()