#!/usr/bin/env python3
"""
DOLPHIN OBF-Universe — Lightweight L2 health monitor for all USDT perps
========================================================================
Subscribes to @depth5@500ms for all active USDT perpetual pairs (~400 assets),
computes OB health metrics every 500 ms (matching WS push cadence), and serves them via:
  - Hazelcast key  : obf_universe_latest  (JSON dict keyed by symbol)
  - Parquet archive: /mnt/ng6_data/ob_universe/  (Hive partitioned, no pruning)

Used by Asset Picker to score tradable universe on real-time liquidity/health.

Architecture:
  - One asyncio WS connection per batch of MAX_STREAMS_PER_CONN (200) symbols,
    i.e. two connections for ~400 assets (Binance cap: 300 streams each)
  - Subscribe to {symbol}@depth5@500ms — absolute snapshots, no book maintenance
  - In-memory state: Dict[symbol → {bids, asks, ts}]   (lock-protected)
  - Snapshot thread: every SNAPSHOT_INTERVAL_S compute health + push HZ
  - Flush thread   : every FLUSH_INTERVAL_S write Parquet

Binance rate limits:
  - WebSocket push streams carry ZERO REST weight
  - depth5@500ms: ~2 msg/s per stream, 800 msg/s for 400 assets — trivial

Usage:
  python prod/obf_universe_service.py
  python prod/obf_universe_service.py --snapshot-interval 30 --dry-run
"""

import argparse
import asyncio
import hashlib
import json
import logging
import sys
import threading
import time
from collections import deque
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import pyarrow as pa
import pyarrow.parquet as pq
import requests

# Repository root is two directory levels above this file. Both the root and
# its "prod" directory are pushed to the front of sys.path so that sibling
# modules can be imported by bare name (e.g. the lazily imported `ch_writer`
# in the snapshot loop — presumably located under prod/; verify).
HCM_DIR = Path(__file__).parent.parent
sys.path.insert(0, str(HCM_DIR))
sys.path.insert(0, str(HCM_DIR / "prod"))

# NOTE: configures the root logger at import time (module-level side effect).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s — %(message)s",
)
logger = logging.getLogger("obf_universe")

# ── Constants ──────────────────────────────────────────────────────────────────
SNAPSHOT_INTERVAL_S  = 0.5         # HZ publish cadence — matches @depth5@500ms WS push
PARQUET_BUFFER_EVERY_N = 20        # buffer to Parquet every Nth snapshot (= every 10s at the default interval)
FLUSH_INTERVAL_S     = 300         # parquet flush cadence (5 min)
FIRST_FLUSH_S        = 60          # delay before the first flush (shorter than FLUSH_INTERVAL_S)
STALE_THRESHOLD_S    = 10.0        # skip asset if book timestamp is older than N s
MAX_STREAMS_PER_CONN = 200         # symbols per WS connection; conservative (<300 Binance limit); ~400 assets → two connections
PARQUET_BASE_DIR     = Path("/mnt/ng6_data/ob_universe")   # root of the Hive-partitioned archive
HZ_KEY               = "obf_universe_latest"               # key written inside the DOLPHIN_FEATURES map
WS_URL               = "wss://fstream.binance.com/ws"
EXCHANGE_INFO_URL    = "https://fapi.binance.com/fapi/v1/exchangeInfo"

# ── Parquet Schema (lightweight — health metrics only) ─────────────────────────
# One row per (snapshot, asset). Field names match the keys of the dict returned
# by _compute_health, plus "local_ts" (wall-clock time of the snapshot) and "asset".
OB_UNIVERSE_SCHEMA = pa.schema([
    pa.field("local_ts",         pa.float64()),
    pa.field("asset",            pa.string()),
    pa.field("best_bid",         pa.float64()),
    pa.field("best_ask",         pa.float64()),
    pa.field("spread_bps",       pa.float32()),
    pa.field("depth_1pct_usd",   pa.float64()),
    pa.field("depth_quality",    pa.float32()),
    pa.field("fill_probability", pa.float32()),
    pa.field("imbalance",        pa.float32()),
    pa.field("n_bid_levels",     pa.int32()),
    pa.field("n_ask_levels",     pa.int32()),
])


def _compute_health(bids: list, asks: list) -> Optional[dict]:
    """Compute OB health metrics from raw depth5 bid/ask levels.

    bids/asks: [[price_str, qty_str], ...]  (up to 5 levels, best first)
    Returns None if book is empty.
    """
    if not bids or not asks:
        return None
    try:
        bid0 = float(bids[0][0])
        ask0 = float(asks[0][0])
        if bid0 <= 0 or ask0 <= 0:
            return None

        mid = (bid0 + ask0) / 2.0
        spread_bps = (ask0 - bid0) / mid * 10_000.0

        threshold_1pct = mid * 0.01
        bid_notional = sum(
            float(p) * float(q)
            for p, q in bids
            if float(p) >= mid - threshold_1pct
        )
        ask_notional = sum(
            float(p) * float(q)
            for p, q in asks
            if float(p) <= mid + threshold_1pct
        )
        depth_1pct_usd  = bid_notional + ask_notional
        depth_quality   = float(min(1.0, depth_1pct_usd / 1_000_000.0))
        fill_probability = float(max(0.0, min(1.0, 1.0 - spread_bps / 20.0)))
        total = bid_notional + ask_notional
        imbalance = float((bid_notional - ask_notional) / max(total, 1e-9))

        return {
            "best_bid":         bid0,
            "best_ask":         ask0,
            "spread_bps":       round(spread_bps, 4),
            "depth_1pct_usd":   round(depth_1pct_usd, 2),
            "depth_quality":    round(depth_quality, 4),
            "fill_probability": round(fill_probability, 4),
            "imbalance":        round(imbalance, 4),
            "n_bid_levels":     len(bids),
            "n_ask_levels":     len(asks),
        }
    except (ValueError, IndexError, ZeroDivisionError):
        return None


class OBUniverseService:
    """
    Two-connection WS health monitor for the full USDT perp universe.

    Thread safety:
      _state is protected by _state_lock (updated by WS threads, read by snapshot thread)
    """

    def __init__(self, hz_host: str = "localhost:5701", hz_cluster: str = "dolphin",
                 snapshot_interval_s: float = SNAPSHOT_INTERVAL_S,
                 dry_run: bool = False):
        """Record configuration and create empty runtime state.

        Nothing is connected or started here; see ``start``.

        Args:
            hz_host: Hazelcast member address (``host:port``).
            hz_cluster: Hazelcast cluster name.
            snapshot_interval_s: seconds between health snapshots.
            dry_run: when True, Hazelcast and Parquet writes are skipped.
        """
        # Configuration
        self.hz_host, self.hz_cluster = hz_host, hz_cluster
        self.snapshot_interval = snapshot_interval_s
        self.dry_run = dry_run

        # Lifecycle flag, subscribed universe and Hazelcast handles
        self._running = False
        self._symbols: List[str] = []
        self._hz_client = None
        self._imap = None

        # Latest book per symbol: symbol → {bids, asks, ts}
        self._state_lock = threading.Lock()
        self._state: Dict[str, dict] = {}

        # Pending Parquet rows per symbol (deque of health dicts)
        self._buf_lock = threading.Lock()
        self._buf: Dict[str, deque] = {}

        # Counters, all starting at zero
        self.stats = dict.fromkeys(
            (
                "snapshots", "hz_pushes", "hz_failures",
                "parquet_files", "parquet_rows", "ws_msgs",
            ),
            0,
        )

    # ── Universe fetch ─────────────────────────────────────────────────────────

    def _fetch_universe(self) -> List[str]:
        """Return the sorted symbols of all tradable USDT perpetuals.

        Queries EXCHANGE_INFO_URL and keeps entries whose quote asset is USDT,
        contract type is PERPETUAL and status is TRADING. On any failure
        (network, HTTP status, unexpected payload) the error is logged and a
        three-symbol fallback list is returned instead.
        """
        def _is_live_usdt_perp(entry) -> bool:
            return (
                entry.get("quoteAsset") == "USDT"
                and entry.get("contractType") == "PERPETUAL"
                and entry.get("status") == "TRADING"
            )

        try:
            response = requests.get(EXCHANGE_INFO_URL, timeout=10)
            response.raise_for_status()
            listing = response.json()["symbols"]

            names = []
            for entry in listing:
                if _is_live_usdt_perp(entry):
                    names.append(entry["symbol"])

            logger.info("Universe: %d active USDT perps fetched", len(names))
            names.sort()
            return names
        except Exception as e:
            logger.error("Failed to fetch universe: %s", e)
            # minimal fallback so the service can still start
            return ["BTCUSDT", "ETHUSDT", "SOLUSDT"]

    # ── Hazelcast ─────────────────────────────────────────────────────────────

    def _connect_hz(self) -> bool:
        """(Re)connect to Hazelcast and bind the DOLPHIN_FEATURES map.

        Any existing client is shut down first (errors ignored). On success
        ``_hz_client`` and ``_imap`` are set and True is returned; on any
        failure — including a missing ``hazelcast`` package — both are reset
        to None, a warning is logged and False is returned.
        """
        try:
            import hazelcast

            previous = self._hz_client
            if previous:
                # Best-effort teardown of the old client before replacing it.
                try:
                    previous.shutdown()
                except Exception:
                    pass

            client = hazelcast.HazelcastClient(
                cluster_name=self.hz_cluster,
                cluster_members=[self.hz_host],
            )
            self._hz_client = client
            self._imap = client.get_map("DOLPHIN_FEATURES").blocking()
            logger.info("HZ connected")
            return True
        except Exception as e:
            logger.warning("HZ connect failed: %s", e)
            self._hz_client = self._imap = None
            return False

    # ── WebSocket connections ─────────────────────────────────────────────────

    def _update_state(self, symbol: str, bids: list, asks: list, ts: float):
        """Thread-safe state update from WS thread."""
        with self._state_lock:
            self._state[symbol] = {"bids": bids, "asks": asks, "ts": ts}
        self.stats["ws_msgs"] += 1

    async def _ws_connection(self, symbols: List[str], conn_id: int):
        """Async WS loop for one connection. Runs in a background thread's event loop.

        Subscribes to ``{symbol}@depth5@500ms`` for every symbol in ``symbols``
        and feeds each book message into ``_update_state``. Reconnects for as
        long as ``_running`` is True: after 5 s on an error, after 1 s when the
        server closes the connection cleanly.

        Args:
            symbols: symbols handled by this connection.
            conn_id: connection index, used as the SUBSCRIBE request id and in logs.
        """
        import websockets

        streams = [f"{s.lower()}@depth5@500ms" for s in symbols]
        logger.info("WS conn %d: subscribing to %d streams", conn_id, len(streams))

        while self._running:
            try:
                async with websockets.connect(
                    WS_URL,
                    ping_interval=20,
                    ping_timeout=30,
                    max_size=2**23,       # 8 MB — handle bursts
                ) as ws:
                    await ws.send(json.dumps({
                        "method": "SUBSCRIBE",
                        "params": streams,
                        "id": conn_id,
                    }))
                    logger.info("WS conn %d: subscribed", conn_id)

                    async for raw in ws:
                        if not self._running:
                            break
                        try:
                            msg = json.loads(raw)
                        except ValueError:
                            # JSONDecodeError and UnicodeDecodeError are both ValueErrors
                            continue

                        # A single odd frame must not tear down a connection
                        # that carries up to MAX_STREAMS_PER_CONN symbols.
                        if not isinstance(msg, dict):
                            continue

                        # Skip subscription ack / errors
                        if "result" in msg or "error" in msg:
                            continue

                        symbol = msg.get("s")
                        if not symbol:
                            continue

                        # "T" is read as milliseconds; fall back to local time
                        # when it is missing or not numeric.
                        try:
                            ts = float(msg["T"]) / 1000.0
                        except (KeyError, TypeError, ValueError):
                            ts = time.time()

                        self._update_state(
                            symbol, msg.get("b", []), msg.get("a", []), ts
                        )

                # Reached when the message iterator ends without raising
                # (clean close). Pause briefly so repeated clean closes
                # cannot turn into a tight reconnect loop.
                if self._running:
                    logger.info(
                        "WS conn %d: connection closed — reconnecting in 1s", conn_id
                    )
                    await asyncio.sleep(1)

            except Exception as e:
                if self._running:
                    logger.warning("WS conn %d error: %s — reconnecting in 5s", conn_id, e)
                    await asyncio.sleep(5)

    def _run_ws_thread(self, symbols: List[str], conn_id: int):
        """Thread target: drive one WS connection on a private event loop.

        A fresh loop is created and installed for the calling thread, runs
        ``_ws_connection`` until it returns, and is always closed afterwards.
        """
        ws_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(ws_loop)
        try:
            session = self._ws_connection(symbols, conn_id)
            ws_loop.run_until_complete(session)
        finally:
            ws_loop.close()

    # ── Snapshot loop ─────────────────────────────────────────────────────────

    def _snapshot_loop(self):
        """Every ``snapshot_interval`` seconds: compute health for all assets, push to HZ.

        Runs until ``_running`` becomes False. Two safeguards keep the loop alive:

        * If there is no Hazelcast map (e.g. the initial connect failed), a
          reconnect is attempted at most every 30 s. NOTE: ``_connect_hz`` is
          called synchronously, so snapshots pause while it is in progress.
        * Any exception raised by one snapshot is logged and the loop carries
          on with the next tick instead of silently ending the thread.
        """
        hz_retry_interval_s = 30.0
        next_hz_retry = time.monotonic() + hz_retry_interval_s
        next_tick = time.monotonic() + self.snapshot_interval

        while self._running:
            now_mono = time.monotonic()
            if now_mono < next_tick:
                # Sleep in slices of at most 1 s so a stop request is noticed.
                time.sleep(min(1.0, next_tick - now_mono))
                continue
            next_tick = now_mono + self.snapshot_interval

            if (not self.dry_run and self._imap is None
                    and now_mono >= next_hz_retry):
                self._connect_hz()
                next_hz_retry = time.monotonic() + hz_retry_interval_s

            try:
                self._snapshot_once()
            except Exception:
                logger.exception("Snapshot iteration failed")

    def _snapshot_once(self):
        """Take one snapshot: compute health, buffer Parquet rows, publish to HZ."""
        now_ts = time.time()
        cutoff = now_ts - STALE_THRESHOLD_S

        # Shallow copy is enough: book dicts are replaced, never mutated.
        with self._state_lock:
            state_copy = dict(self._state)

        health_map: Dict[str, dict] = {}
        n_stale = 0
        for symbol, book in state_copy.items():
            if book["ts"] < cutoff:
                n_stale += 1
                continue
            h = _compute_health(book["bids"], book["asks"])
            if h is not None:
                health_map[symbol] = h

        self.stats["snapshots"] += 1

        # Buffer rows for Parquet flush every Nth snapshot (decoupled from HZ rate)
        if self.stats["snapshots"] % PARQUET_BUFFER_EVERY_N == 0:
            with self._buf_lock:
                for symbol, h in health_map.items():
                    if symbol not in self._buf:
                        self._buf[symbol] = deque()
                    self._buf[symbol].append(
                        {"local_ts": now_ts, "asset": symbol, **h}
                    )

        payload = {
            "_snapshot_utc": datetime.now(timezone.utc).isoformat(),
            "_n_assets":     len(health_map),
            "_n_stale":      n_stale,
            "_n_total":      len(state_copy),
            **health_map,
        }

        if self.dry_run:
            self.stats["hz_pushes"] += 1
        elif self._imap is not None:
            self._publish_to_hz(payload, health_map, state_copy, cutoff, now_ts)

        if self.stats["snapshots"] % 120 == 1:
            logger.info(
                "Snapshot #%d: health=%d/%d stale=%d hz_ok=%d",
                self.stats["snapshots"], len(health_map),
                len(state_copy), n_stale, self.stats["hz_pushes"],
            )

    def _publish_to_hz(self, payload: dict, health_map: Dict[str, dict],
                       state_copy: Dict[str, dict], cutoff: float, now_ts: float):
        """Push the universe payload and the per-asset OB entries to Hazelcast.

        Per-asset entries use the key ``asset_{symbol}_ob`` and hold
        ``bid_notional[5]``, ``ask_notional[5]``, ``bid_depth[5]``,
        ``ask_depth[5]`` and ``timestamp``.
        """
        # Serialise everything before touching Hazelcast, so malformed market
        # data is never mistaken for an HZ failure (which forces a reconnect).
        blob = json.dumps(payload)
        per_asset_kv: Dict[str, str] = {}
        for symbol, book in state_copy.items():
            if book["ts"] < cutoff:
                continue
            try:
                bn, bd = self._book_levels(book["bids"])
                an, ad = self._book_levels(book["asks"])
            except (TypeError, ValueError, IndexError) as e:
                logger.debug("Skipping malformed book for %s: %s", symbol, e)
                continue
            per_asset_kv[f"asset_{symbol}_ob"] = json.dumps({
                "timestamp":    book["ts"],
                "bid_notional": bn,
                "ask_notional": an,
                "bid_depth":    bd,
                "ask_depth":    ad,
            })

        try:
            self._imap.put(HZ_KEY, blob)
            self.stats["hz_pushes"] += 1
            self._mirror_to_ch(health_map, now_ts)
            if per_asset_kv:
                self._imap.put_all(per_asset_kv)
        except Exception as e:
            self.stats["hz_failures"] += 1
            logger.warning("HZ push failed: %s", e)
            self._connect_hz()  # immediate reconnect attempt

    def _mirror_to_ch(self, health_map: Dict[str, dict], now_ts: float):
        """Best-effort copy of the health rows through ``ch_writer.ch_put``.

        Any failure (including ``ch_writer`` not being importable) is logged
        at debug level and otherwise ignored.
        """
        try:
            from ch_writer import ch_put
            snap_ts_ms = int(now_ts * 1_000)
            for sym, h in health_map.items():
                ch_put("obf_universe", {
                    "ts":               snap_ts_ms,
                    "symbol":           sym,
                    "spread_bps":       float(h.get("spread_bps", 0)),
                    "depth_1pct_usd":   float(h.get("depth_1pct_usd", 0)),
                    "depth_quality":    float(h.get("depth_quality", 0)),
                    "fill_probability": float(h.get("fill_probability", 0)),
                    "imbalance":        float(h.get("imbalance", 0)),
                    "best_bid":         float(h.get("best_bid", 0)),
                    "best_ask":         float(h.get("best_ask", 0)),
                    "n_bid_levels":     int(h.get("n_bid_levels", 0)),
                    "n_ask_levels":     int(h.get("n_ask_levels", 0)),
                })
        except Exception as e:
            logger.debug("ch_writer mirror skipped: %s", e)

    @staticmethod
    def _book_levels(side: list, n: int = 5) -> Tuple[List[float], List[float]]:
        """Return (notional, quantity) lists for the first ``n`` levels, zero-padded to ``n``."""
        out_notional: List[float] = []
        out_depth: List[float] = []
        for level in side[:n]:
            p, q = float(level[0]), float(level[1])
            out_notional.append(round(p * q, 2))
            out_depth.append(round(q, 6))
        padding = [0.0] * (n - len(out_notional))
        return out_notional + padding, out_depth + padding

    # ── Parquet flush ─────────────────────────────────────────────────────────

    def _flush_loop(self):
        """Thread target: periodically write buffered rows to Parquet.

        Waits FIRST_FLUSH_S before the first flush and FLUSH_INTERVAL_S before
        each later one. ``_running`` is checked after every wait; a failing
        flush is logged and does not end the loop.
        """
        delay = FIRST_FLUSH_S
        while True:
            time.sleep(delay)
            if not self._running:
                return
            try:
                self._flush()
            except Exception as e:
                logger.error("Parquet flush error: %s", e)
            delay = FLUSH_INTERVAL_S

    def _flush(self):
        """Drain the row buffer and write one Parquet file per asset.

        Files go to ``PARQUET_BASE_DIR/exchange=binance/symbol=…/date=…/`` as
        ``part-<HH-MM-SS-ffffff>.parquet`` and are written to a ``.tmp`` path
        first, then renamed. An ``.md5`` sidecar holding the file's hex digest
        is written next to each file.

        The buffer is emptied up front, so rows of an asset whose write fails
        are dropped (the failure is logged). In dry-run mode nothing is
        written; rows are only counted.
        """
        with self._buf_lock:
            snapshot = {s: list(q) for s, q in self._buf.items() if q}
            for q in self._buf.values():
                q.clear()

        if not snapshot:
            return

        total = 0
        n_assets = 0

        if self.dry_run:
            total = sum(len(rows) for rows in snapshot.values())
            n_assets = len(snapshot)
        else:
            now      = datetime.now(timezone.utc)
            date_str = now.strftime("%Y-%m-%d")
            ts_str   = now.strftime("%H-%M-%S-%f")
            filename = f"part-{ts_str}.parquet"

            # Fallback values for columns missing from a row (loop-invariant).
            sentinel = {"local_ts": 0.0, "asset": "", "best_bid": 0.0, "best_ask": 0.0,
                        "spread_bps": float("nan"), "depth_1pct_usd": 0.0,
                        "depth_quality": float("nan"), "fill_probability": float("nan"),
                        "imbalance": float("nan"), "n_bid_levels": 0, "n_ask_levels": 0}

            for symbol, rows in snapshot.items():
                tmp_path = None
                try:
                    out_dir = (
                        PARQUET_BASE_DIR
                        / "exchange=binance"
                        / f"symbol={symbol}"
                        / f"date={date_str}"
                    )
                    out_dir.mkdir(parents=True, exist_ok=True)
                    out_path = out_dir / filename
                    tmp_path = out_path.with_suffix(".tmp")

                    arrays = {
                        f.name: pa.array(
                            [row.get(f.name, sentinel[f.name]) for row in rows],
                            type=f.type,
                        )
                        for f in OB_UNIVERSE_SCHEMA
                    }
                    table = pa.table(arrays, schema=OB_UNIVERSE_SCHEMA)
                    pq.write_table(table, tmp_path, compression="snappy")
                    tmp_path.rename(out_path)
                    tmp_path = None   # published — nothing left to clean up
                except Exception as e:
                    logger.error("Parquet write failed for %s: %s", symbol, e)
                    # Do not leave a partial .tmp file behind.
                    if tmp_path is not None:
                        try:
                            tmp_path.unlink()
                        except OSError:
                            pass
                    continue

                # The data file is in place: count it even if the sidecar fails.
                self.stats["parquet_files"] += 1
                total += len(rows)
                n_assets += 1

                try:
                    digest = hashlib.md5(out_path.read_bytes()).hexdigest()
                    (out_dir / f"{filename}.md5").write_text(digest)
                except Exception as e:
                    logger.error("Parquet md5 sidecar failed for %s: %s", symbol, e)

        self.stats["parquet_rows"] += total
        if total:
            logger.info("Parquet flush: %d rows, %d assets", total, n_assets)

    # ── Lifecycle ─────────────────────────────────────────────────────────────

    def start(self, dry_run: bool = False):
        """Fetch the universe, connect to Hazelcast and launch all worker threads.

        Args:
            dry_run: when True, switches the service into dry-run mode
                (a False value never turns an existing dry-run off).

        Starts one daemon WS thread per batch of MAX_STREAMS_PER_CONN symbols,
        then the snapshot thread, then the flush thread. Returns immediately.
        """
        if dry_run:
            self.dry_run = True

        self._running = True
        self._symbols = self._fetch_universe()
        universe = self._symbols

        if not self.dry_run:
            self._connect_hz()

        # Seed an empty book per symbol; ts=0.0 marks it stale until data arrives.
        self._state.update(
            (sym, {"bids": [], "asks": [], "ts": 0.0}) for sym in universe
        )

        # Split the universe into per-connection batches.
        batches: List[List[str]] = [
            universe[offset:offset + MAX_STREAMS_PER_CONN]
            for offset in range(0, len(universe), MAX_STREAMS_PER_CONN)
        ]

        self._ws_threads = []
        for conn_id, batch in enumerate(batches):
            worker = threading.Thread(
                name=f"obf-universe-ws-{conn_id}",
                target=self._run_ws_thread,
                args=(batch, conn_id),
                daemon=True,
            )
            worker.start()
            self._ws_threads.append(worker)
        logger.info(
            "Started %d WS connections for %d symbols", len(batches), len(universe)
        )

        self._snapshot_thread = threading.Thread(
            name="obf-universe-snapshot", target=self._snapshot_loop, daemon=True
        )
        self._snapshot_thread.start()

        self._flush_thread = threading.Thread(
            name="obf-universe-flush", target=self._flush_loop, daemon=True
        )
        self._flush_thread.start()

    def stop(self):
        """Signal all loops to stop, flush pending Parquet rows, close Hazelcast.

        The worker threads are daemons and are not joined. Because the flush
        thread may be asleep for up to FLUSH_INTERVAL_S, rows still sitting in
        the buffer are written here; otherwise they would be lost on shutdown.
        Errors during the final flush or the client shutdown are contained so
        that stopping never raises.
        """
        logger.info(
            "Stopping — snapshots=%d hz=%d parquet_rows=%d ws_msgs=%d",
            self.stats["snapshots"], self.stats["hz_pushes"],
            self.stats["parquet_rows"], self.stats["ws_msgs"],
        )
        self._running = False

        # Final drain of the Parquet buffer (no-op when it is empty).
        try:
            self._flush()
        except Exception as e:
            logger.error("Final Parquet flush failed: %s", e)

        if self._hz_client:
            try:
                self._hz_client.shutdown()
            except Exception:
                pass

    def run_forever(self):
        """Park the calling thread until the service stops or is interrupted.

        SIGTERM is mapped onto KeyboardInterrupt so that Ctrl-C and a
        termination signal share one exit path. ``stop`` is always called on
        the way out.
        """
        import signal

        def _raise_interrupt(_signum, _frame):
            raise KeyboardInterrupt

        signal.signal(signal.SIGTERM, _raise_interrupt)

        try:
            # Wake every 5 s to re-check the running flag.
            while self._running:
                time.sleep(5)
        except KeyboardInterrupt:
            pass
        finally:
            self.stop()


# ── Entry point ────────────────────────────────────────────────────────────────

def main():
    """Command-line entry point: parse options, start the service and block.

    Exits with an argparse usage error when ``--snapshot-interval`` is not a
    positive number, because a zero, negative or NaN interval would make the
    snapshot loop spin without sleeping.
    """
    parser = argparse.ArgumentParser(description="DOLPHIN OBF-Universe service")
    parser.add_argument("--snapshot-interval", type=float, default=SNAPSHOT_INTERVAL_S,
                        help="Health snapshot interval in seconds (default: %(default)s)")
    parser.add_argument("--hz-host", default="localhost:5701")
    parser.add_argument("--hz-cluster", default="dolphin")
    parser.add_argument("--dry-run", action="store_true",
                        help="Skip HZ writes and Parquet writes (testing)")
    args = parser.parse_args()

    # "not x > 0" also rejects NaN, which "x <= 0" would let through.
    if not args.snapshot_interval > 0:
        parser.error("--snapshot-interval must be a positive number of seconds")

    svc = OBUniverseService(
        hz_host=args.hz_host,
        hz_cluster=args.hz_cluster,
        snapshot_interval_s=args.snapshot_interval,
    )
    svc.start(dry_run=args.dry_run)
    svc.run_forever()


if __name__ == "__main__":
    main()