initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
56
prod/prefect_services/watchdog_service_prefect.py
Executable file
56
prod/prefect_services/watchdog_service_prefect.py
Executable file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env python3
|
||||
"""System Watchdog Service - Prefect Managed"""
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from prefect import flow, task, get_run_logger
|
||||
|
||||
# Absolute path of the watchdog script that the flow launches as a child process.
SERVICE_SCRIPT = "/mnt/dolphinng5_predict/prod/system_watchdog_service.py"
|
||||
|
||||
@task
def check_safety() -> bool:
    """Check whether Safety data is present in Hazelcast (Hz).

    Connects to the "dolphin" cluster at 127.0.0.1:5701, reads the 'latest'
    entry of the 'DOLPHIN_SAFETY' map, and reports whether a value is stored
    there.

    Returns:
        True if the 'latest' entry is not None; False if it is missing or if
        the probe fails (import error, connection failure, read error). The
        check is best-effort and does not raise for ordinary errors.
    """
    try:
        # Imported lazily so a missing hazelcast package is reported as
        # "no safety data" instead of breaking module import.
        import hazelcast

        client = hazelcast.HazelcastClient(
            cluster_name="dolphin",
            cluster_members=["127.0.0.1:5701"],
        )
        try:
            safety = client.get_map('DOLPHIN_SAFETY').blocking()
            return safety.get('latest') is not None
        finally:
            # Always release the client, even when the map read fails;
            # otherwise every failed probe leaks a connected client.
            client.shutdown()
    except Exception:
        # Best-effort probe: any ordinary failure means "no safety data".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the caller's shutdown handling still runs.
        return False
|
||||
|
||||
def _spawn_watchdog() -> subprocess.Popen:
    """Start the watchdog script as a child of the current interpreter."""
    # stdout/stderr are inherited rather than piped: nothing in this flow
    # reads from the child, so a PIPE would eventually fill the OS buffer
    # and block the watchdog on its next write.
    return subprocess.Popen([sys.executable, SERVICE_SCRIPT])


@flow(name="watchdog-service")
def watchdog_service_flow():
    """Manage System Watchdog Service.

    Launches SERVICE_SCRIPT as a child process and then loops forever,
    waking every 60 seconds to either restart the child (if it has exited)
    or log whether Safety data is currently available via check_safety().

    The loop only ends on KeyboardInterrupt or an unexpected exception; in
    both cases a still-running child is terminated and reaped before the
    flow returns or re-raises.
    """
    logger = get_run_logger()
    logger.info("Starting Watchdog Service...")

    proc = _spawn_watchdog()
    logger.info(f"Watchdog started (PID: {proc.pid})")

    try:
        while True:
            time.sleep(60)
            if proc.poll() is not None:
                logger.error("Watchdog died, restarting...")
                proc = _spawn_watchdog()
                # Log the new PID so operators can track the replacement.
                logger.info(f"Watchdog started (PID: {proc.pid})")
            else:
                healthy = check_safety()
                logger.info(f"Safety data: {'✅' if healthy else '⏳'}")
    except KeyboardInterrupt:
        logger.info("Stopping Watchdog...")
    finally:
        # Clean up on every exit path, not just Ctrl-C, so a crash in the
        # monitor loop does not leave an orphaned watchdog process behind.
        if proc.poll() is None:
            proc.terminate()
            proc.wait()
|
||||
|
||||
if __name__ == "__main__":
    # Executed directly as a script: run the watchdog flow in-process.
    watchdog_service_flow()
|
||||
Reference in New Issue
Block a user