"""
ch_writer.py — Dolphin ClickHouse fire-and-forget writer.

All inserts are async (CH async_insert=1, wait_for_async_insert=0).
Uses HTTP INSERT with JSONEachRow — zero external dependencies.

OTel transport note:
    This file is the single integration point. To switch to OTel transport
    (e.g., when Uptrace is the primary sink), replace _flush() internals only.
    All caller code (ch_put calls across services) stays unchanged.

Usage:
    from ch_writer import ch_put
    ch_put("eigen_scans", {"ts": int(time.time() * 1e6), "scan_number": n, ...})

Environment overrides (optional):
    CH_URL  — default: http://localhost:8123
    CH_USER — default: dolphin
    CH_PASS — default: dolphin_ch_2026
    CH_DB   — default: dolphin
"""
import json
import logging
import os
import random
import struct
import threading
import time
import urllib.request

from collections import defaultdict
from queue import Empty, Full, Queue
log = logging.getLogger("ch_writer")

# Connection settings; each can be overridden via the environment
# (see the module docstring for the full list).
# NOTE(review): hard-coded credential defaults are a local-dev convenience —
# confirm real deployments always set CH_USER / CH_PASS in the environment.
CH_URL = os.environ.get("CH_URL", "http://localhost:8123")
CH_USER = os.environ.get("CH_USER", "dolphin")
CH_PASS = os.environ.get("CH_PASS", "dolphin_ch_2026")
CH_DB = os.environ.get("CH_DB", "dolphin")
# ─── Timestamp helpers ────────────────────────────────────────────────────────

def ts_us() -> int:
    """Return the current Unix time in microseconds — for DateTime64(6) fields."""
    now = time.time()
    return int(now * 1_000_000)
def ts_ms() -> int:
    """Return the current Unix time in milliseconds — for DateTime64(3) fields."""
    now = time.time()
    return int(now * 1_000)
# ─── UUIDv7 — time-ordered distributed trace ID ───────────────────────────────

def uuid7() -> str:
    """
    Generate a UUIDv7 — RFC 9562 time-ordered UUID.

    Layout (128 bits):
        [0:48]    Unix timestamp milliseconds — sortable, embeds timing
        [48:52]   Version = 0b0111 (7)
        [52:64]   rand_a (12 bits) — sub-ms uniqueness
        [64:66]   Variant = 0b10
        [66:128]  rand_b (62 bits) — entropy

    Properties:
      - Lexicographically sortable by time (no JOIN to recover timestamp)
      - CH can use as ORDER BY component alongside ts columns
      - Drop-in for UUIDv4 (same string format, same String column type)
      - Pure stdlib — no dependencies

    Usage:
        scan_uuid = uuid7()  # NG7: one per scan
        # Pass downstream to trade_events, obf_fast_intrade, posture_events
        # This IS the distributed trace ID across the causal chain.
    """
    unix_ms = int(time.time() * 1_000)
    sub_ms = random.getrandbits(12)      # rand_a
    entropy = random.getrandbits(62)     # rand_b

    # High 64 bits: 48-bit timestamp | version nibble 0x7 | 12-bit rand_a.
    high = (unix_ms << 16) | (0x7 << 12) | sub_ms
    # Low 64 bits: variant bits 0b10 | 62-bit rand_b.
    low = (0b10 << 62) | entropy

    digits = (high.to_bytes(8, "big") + low.to_bytes(8, "big")).hex()
    return (
        f"{digits[0:8]}-{digits[8:12]}-"
        f"{digits[12:16]}-{digits[16:20]}-{digits[20:32]}"
    )
# ─── Internal writer ──────────────────────────────────────────────────────────

class _CHWriter:
    """
    Thread-safe, non-blocking ClickHouse writer.

    Batches rows per table and flushes every ``flush_interval_s`` seconds
    from a single daemon thread. The caller's thread is NEVER blocked —
    ``put`` uses ``queue.put_nowait()`` and drops silently if the queue is
    full (observability is best-effort by design).
    """

    def __init__(self, flush_interval_s: float = 1.0, maxqueue: int = 50_000, db: str = CH_DB):
        """
        Args:
            flush_interval_s: Seconds between flush attempts.
            maxqueue: Max rows buffered before ``put`` starts dropping.
            db: Target ClickHouse database name.
        """
        self._q: Queue = Queue(maxsize=maxqueue)
        self._interval = flush_interval_s
        self._db = db
        self._dropped = 0  # lifetime count of rows lost to a full queue
        self._t = threading.Thread(
            target=self._run, daemon=True, name=f"ch-writer-{db}"
        )
        self._t.start()

    def put(self, table: str, row: dict) -> None:
        """Non-blocking enqueue. Silently drops on full queue."""
        try:
            self._q.put_nowait((table, row))
        except Full:
            self._dropped += 1
            # Log only every 1000th drop to avoid log spam under backpressure.
            if self._dropped % 1000 == 1:
                log.warning("ch_writer: %d rows dropped (queue full)", self._dropped)

    def _run(self) -> None:
        """Writer-thread loop: drain the queue into per-table batches, flush on deadline."""
        batch: dict[str, list] = defaultdict(list)
        deadline = time.monotonic() + self._interval
        while True:
            # Wait at least 5 ms so a busy queue cannot spin this loop hot.
            remaining = max(0.005, deadline - time.monotonic())
            try:
                table, row = self._q.get(timeout=remaining)
            except Empty:
                # Timeout — fall through to the flush check. Catching Empty
                # (not Exception) keeps real bugs from being swallowed here.
                pass
            else:
                batch[table].append(row)
            if time.monotonic() >= deadline:
                if batch:
                    self._flush(batch)
                    batch = defaultdict(list)
                deadline = time.monotonic() + self._interval

    def _flush(self, batch: dict[str, list]) -> None:
        """POST each table's rows as JSONEachRow; errors are logged, never raised."""
        for table, rows in batch.items():
            if not rows:
                continue
            body = "\n".join(json.dumps(r) for r in rows).encode()
            # NOTE(review): `table` is interpolated into the URL unescaped —
            # callers pass internal constants only; never expose to untrusted input.
            url = (
                f"{CH_URL}/?database={self._db}"
                f"&query=INSERT+INTO+{table}+FORMAT+JSONEachRow"
                f"&async_insert=1&wait_for_async_insert=0"
            )
            req = urllib.request.Request(url, data=body, method="POST")
            req.add_header("X-ClickHouse-User", CH_USER)
            req.add_header("X-ClickHouse-Key", CH_PASS)
            req.add_header("Content-Type", "application/octet-stream")
            try:
                with urllib.request.urlopen(req, timeout=5) as resp:
                    if resp.status not in (200, 201):
                        log.debug(
                            "CH flush [%s]: HTTP %s", table, resp.status
                        )
            except Exception as e:
                # Observability writes must never surface to callers
                log.debug("CH flush error [%s]: %s", table, e)
# ─── Module-level singletons ─────────────────────────────────────────────────

# One background writer thread per target database, created at import time.
# BLUE environment writer:
_writer = _CHWriter(db="dolphin")
# GREEN (NT TradingNode) environment writer:
_writer_green = _CHWriter(db="dolphin_green")
def ch_put(table: str, row: dict) -> None:
    """
    Fire-and-forget insert into dolphin.<table> (BLUE environment).

    Non-blocking; rows may be silently dropped under backpressure.

    Args:
        table: ClickHouse table name (without database prefix), e.g. "eigen_scans"
        row: Dict of column_name → value. Timestamps should be:
            - DateTime64(6) fields: int microseconds (use ts_us())
            - DateTime64(3) fields: int milliseconds (use ts_ms())
            - Date fields: "YYYY-MM-DD" string
    """
    _writer.put(table, row)
def ch_put_green(table: str, row: dict) -> None:
    """
    Fire-and-forget insert into dolphin_green.<table> (GREEN / NT TradingNode environment).

    Same signature and semantics as ch_put — drop-in for GREEN services.
    """
    _writer_green.put(table, row)