# Source: DOLPHIN/prod/tests/test_data_integrity.py
"""
DOLPHIN Data Integrity Test Suite
=====================================
Verifies that NG7 scanner output is consistent between:
- Disk : /mnt/dolphinng6_data/arrow_scans/YYYY-MM-DD/scan_NNNNNN_HHMMSS.arrow
- HZ : DOLPHIN_FEATURES["latest_eigen_scan"]
Run:
/home/dolphin/siloqy_env/bin/python3 -m pytest prod/tests/test_data_integrity.py -v -s
All tests are READ-ONLY and non-destructive.
"""
import json
import math
import time
from datetime import datetime, timezone, date
from pathlib import Path
import hazelcast
import pyarrow as pa
import pyarrow.ipc as ipc
import pytest
# ── Config ────────────────────────────────────────────────────────────────────
ARROW_BASE = Path('/mnt/dolphinng6_data/arrow_scans')  # root of per-day scan directories
HZ_CLUSTER = 'dolphin'            # Hazelcast cluster name
HZ_MEMBERS = ['127.0.0.1:5701']   # Hazelcast member addresses (local node)
HZ_KEY = 'latest_eigen_scan'      # key inside the HZ map holding the newest scan
HZ_MAP = 'DOLPHIN_FEATURES'       # HZ map name
# Columns every scan row must carry (flat NG7 schema; *_json columns hold JSON strings).
REQUIRED_COLUMNS = {
    'scan_number', 'timestamp_ns', 'timestamp_iso',
    'w50_velocity', 'w150_velocity', 'w300_velocity', 'w750_velocity',
    'vel_div', 'assets_json', 'asset_prices_json',
    'data_quality_score', 'missing_asset_count', 'schema_version',
}
# Test thresholds — tune here, not inside individual tests.
MAX_BTC_PCT_CHANGE = 2.0   # % — flag if BTC moves >2% between consecutive scans
MAX_VEL_DIV_ABS = 50.0     # flag extreme eigenvalue velocities
MAX_SCAN_GAP = 5           # max allowed gap in scan_number sequence
HZ_FRESHNESS_S = 60.0      # HZ scan must be < 60s old
MAX_NAN_RATIO = 0.05       # at most 5% of scans may have NaN vel_div
DATA_QUALITY_MIN = 0.80    # data_quality_score floor
# ── Helpers ───────────────────────────────────────────────────────────────────
def _today_dir() -> Path:
    """Arrow scan directory for the current local date (YYYY-MM-DD)."""
    return ARROW_BASE / str(date.today())
def _read_arrow(path: Path) -> dict:
    """Load a single-row Arrow file into a flat dict.

    Each ``*_json`` column that holds a non-empty string additionally gets a
    parsed twin under the suffix-stripped name (e.g. ``assets_json`` -> ``assets``).
    """
    with pa.memory_map(str(path), 'r') as src:
        table = ipc.open_file(src).read_all()
        # Single-row table: take element 0 of every column.
        record = {name: table[name][0].as_py() for name in table.column_names}
    for name in list(record):
        if name.endswith('_json') and record[name]:
            record[name[:-5]] = json.loads(record[name])
    return record
def _get_hz_scan() -> dict:
    """Fetch and JSON-decode DOLPHIN_FEATURES[latest_eigen_scan] from Hazelcast.

    Returns:
        Parsed scan dict, or {} when the key is missing/empty.

    Fix: the client is now shut down in a ``finally`` block — previously a
    failing map lookup leaked the Hazelcast client connection.
    """
    c = hazelcast.HazelcastClient(
        cluster_name=HZ_CLUSTER, cluster_members=HZ_MEMBERS, connection_timeout=3.0
    )
    try:
        raw = c.get_map(HZ_MAP).blocking().get(HZ_KEY)
    finally:
        c.shutdown()
    if not raw:
        return {}
    return json.loads(raw)
def _first_file_per_scan(day_dir: Path) -> dict[int, Path]:
"""Return {scan_number: first_file} for every scan in the directory."""
seen: dict[int, Path] = {}
for f in sorted(day_dir.glob('*.arrow')):
try:
sn = int(f.name.split('_')[1])
except (IndexError, ValueError):
continue
if sn not in seen:
seen[sn] = f
return seen
# ── Fixtures ─────────────────────────────────────────────────────────────────
@pytest.fixture(scope='module')
def today_dir():
    """Today's arrow-scan directory; skips the whole module when absent."""
    path = _today_dir()
    if not path.exists():
        pytest.skip(f'Today dir not found: {path}')
    return path
@pytest.fixture(scope='module')
def scan_index(today_dir):
    """{scan_number: first_file} for today; skips when no scans exist."""
    index = _first_file_per_scan(today_dir)
    if not index:
        pytest.skip('No scan files found for today')
    return index
@pytest.fixture(scope='module')
def recent_scans(scan_index):
    """Last 100 scans as list of dicts, sorted by scan_number."""
    loaded = []
    for number in sorted(scan_index)[-100:]:
        try:
            loaded.append(_read_arrow(scan_index[number]))
        except Exception as exc:
            pytest.fail(f'Cannot read scan #{number}: {exc}')
    return loaded
# ══════════════════════════════════════════════════════════════════════════════
# DISK TESTS
# ══════════════════════════════════════════════════════════════════════════════
class TestDiskFiles:
def test_today_dir_exists(self, today_dir):
"""Arrow scan directory exists for today."""
assert today_dir.exists(), f'Missing: {today_dir}'
def test_recent_files_readable(self, scan_index):
"""Last 50 files open without error."""
errors = []
for sn in sorted(scan_index)[-50:]:
try:
_read_arrow(scan_index[sn])
except Exception as e:
errors.append(f'#{sn}: {e}')
assert not errors, f'Unreadable files:\n' + '\n'.join(errors)
def test_no_large_scan_gaps(self, scan_index):
"""No gap > MAX_SCAN_GAP in scan_number sequence (last 200 scans)."""
nums = sorted(scan_index)[-200:]
gaps = [(nums[i], nums[i+1], nums[i+1]-nums[i])
for i in range(len(nums)-1)
if nums[i+1] - nums[i] > MAX_SCAN_GAP]
assert not gaps, f'Gaps in scan sequence: {gaps}'
def test_required_columns_present(self, recent_scans):
"""Every scan has all required columns."""
missing = []
for row in recent_scans:
absent = REQUIRED_COLUMNS - set(row.keys())
if absent:
missing.append(f"scan #{row.get('scan_number')}: missing {absent}")
assert not missing, '\n'.join(missing)
def test_schema_version(self, recent_scans):
"""Schema version is 5.x across recent scans."""
bad = [row.get('scan_number') for row in recent_scans
if not str(row.get('schema_version', '')).startswith('5')]
assert not bad, f'Unexpected schema_version in scans: {bad}'
def test_data_quality_score(self, recent_scans):
"""data_quality_score >= DATA_QUALITY_MIN for recent scans."""
bad = [(row.get('scan_number'), row.get('data_quality_score'))
for row in recent_scans
if (row.get('data_quality_score') or 0) < DATA_QUALITY_MIN]
assert not bad, f'Low data quality: {bad}'
def test_vel_div_matches_window_velocities(self, recent_scans):
"""vel_div == w50_velocity - w150_velocity (or both NaN)."""
mismatches = []
for row in recent_scans:
vd = row.get('vel_div')
v50 = row.get('w50_velocity')
v150 = row.get('w150_velocity')
if vd is None or v50 is None or v150 is None:
continue
if math.isnan(float(vd)) and (math.isnan(float(v50)) or math.isnan(float(v150))):
continue # NaN is OK if inputs are also NaN
expected = float(v50) - float(v150)
if not math.isnan(expected) and abs(float(vd) - expected) > 1e-6:
mismatches.append(
f"scan #{row.get('scan_number')}: vel_div={vd:.6f} expected={expected:.6f}"
)
assert not mismatches, 'vel_div mismatch:\n' + '\n'.join(mismatches[:10])
def test_vel_div_nan_ratio(self, recent_scans):
"""NaN vel_div rate must be below MAX_NAN_RATIO."""
nan_count = sum(
1 for row in recent_scans
if row.get('vel_div') is None or
(isinstance(row.get('vel_div'), float) and math.isnan(row['vel_div']))
)
ratio = nan_count / max(len(recent_scans), 1)
assert ratio <= MAX_NAN_RATIO, (
f'NaN vel_div rate {ratio:.1%} > {MAX_NAN_RATIO:.0%} '
f'({nan_count}/{len(recent_scans)} scans)'
)
def test_btc_price_continuity(self, recent_scans):
"""BTC price changes between consecutive scans must be < MAX_BTC_PCT_CHANGE%."""
violations = []
prev = None
for row in recent_scans:
assets = row.get('assets', [])
prices = row.get('asset_prices', [])
price_map = dict(zip(assets, prices))
btc = price_map.get('BTCUSDT')
if btc and prev:
pct = abs(btc - prev) / prev * 100
if pct > MAX_BTC_PCT_CHANGE:
violations.append(
f"scan #{row.get('scan_number')}: "
f"BTC ${prev:.2f}→${btc:.2f} ({pct:+.2f}%)"
)
if btc:
prev = btc
assert not violations, 'BTC price jump(s):\n' + '\n'.join(violations)
def test_btc_price_nonzero(self, recent_scans):
"""BTC price is non-zero in all recent scans."""
bad = []
for row in recent_scans:
assets = row.get('assets', [])
prices = row.get('asset_prices', [])
price_map = dict(zip(assets, prices))
btc = price_map.get('BTCUSDT', 0)
if not btc or btc <= 0:
bad.append(row.get('scan_number'))
assert not bad, f'Zero/missing BTC price in scans: {bad[:10]}'
def test_no_duplicate_scan_content(self, today_dir, scan_index):
"""Audit duplicate files per scan_number (last 50 scans).
NG7 writes two files per scan latest timestamp wins (most recent is the final version).
WARN if vel_div differs; the latest file is assumed authoritative.
Only hard-fails if the LATEST file has vel_div that differs from what HZ received.
"""
recent_sns = set(sorted(scan_index)[-50:])
all_files: dict[int, list[Path]] = {}
for f in sorted(today_dir.glob('*.arrow')):
try:
sn = int(f.name.split('_')[1])
except (IndexError, ValueError):
continue
if sn in recent_sns:
all_files.setdefault(sn, []).append(f)
dups_with_diff = []
for sn, files in sorted(all_files.items()):
if len(files) < 2:
continue
try:
vds = []
for f in sorted(files): # sorted = chronological by HHMMSS
row = _read_arrow(f)
vd = row.get('vel_div')
vds.append((f.name, None if (vd is None or (isinstance(vd, float) and math.isnan(vd))) else round(float(vd), 8)))
unique_vds = {v for _, v in vds if v is not None}
if len(unique_vds) > 1:
dups_with_diff.append(f'scan #{sn}: {vds}')
except Exception:
pass
if dups_with_diff:
print(f'\nINFO: {len(dups_with_diff)} scans have 2 files with differing vel_div '
f'(NG7 writes preliminary + final; latest file is authoritative):')
for d in dups_with_diff[:5]:
print(f' {d}')
# Not a hard failure — this is expected NG7 behavior (two-phase write).
# The scan_bridge / trader always reads the LATEST HZ push, not disk.
# ══════════════════════════════════════════════════════════════════════════════
# HZ TESTS
# ══════════════════════════════════════════════════════════════════════════════
class TestHZScan:
    """Liveness checks on the Hazelcast copy of the latest scan."""

    def test_hz_latest_scan_present(self):
        """DOLPHIN_FEATURES[latest_eigen_scan] key exists and is parseable."""
        scan = _get_hz_scan()
        assert scan, 'latest_eigen_scan missing or empty in HZ'
        has_known_field = 'scan_number' in scan or 'vel_div' in scan
        assert has_known_field, f'Unexpected structure: {list(scan.keys())[:10]}'

    def test_hz_scan_freshness(self):
        """HZ scan timestamp is within HZ_FRESHNESS_S seconds of now."""
        scan = _get_hz_scan()
        # NG7 writes flat schema: timestamp_iso is top-level
        ts_raw = scan.get('timestamp_iso') or scan.get('ts_iso') or scan.get('timestamp')
        if not ts_raw:
            pytest.skip(f'No timestamp field in HZ scan — keys: {list(scan.keys())[:10]}')
        try:
            # Unix epoch seconds first (NG7 uses timestamp_ns / 1e9 or raw float)
            age_s = abs(time.time() - float(ts_raw))
        except (ValueError, TypeError):
            # Fall back to ISO-8601; compare naive-to-naive or aware-to-UTC.
            parsed = datetime.fromisoformat(str(ts_raw))
            reference = datetime.now() if parsed.tzinfo is None else datetime.now(timezone.utc)
            age_s = abs((reference - parsed).total_seconds())
        assert age_s < HZ_FRESHNESS_S, \
            f'HZ scan stale: {age_s:.0f}s old (limit {HZ_FRESHNESS_S}s)'
# ══════════════════════════════════════════════════════════════════════════════
# DISK ↔ HZ PARITY TESTS
# ══════════════════════════════════════════════════════════════════════════════
class TestDiskHZParity:
    """Agreement between the newest on-disk Arrow scans and the live HZ copy."""

    def test_scan_number_matches(self, scan_index):
        """HZ scan_number is >= disk latest and not more than 30 scans ahead (~5 min).

        NG7 writes to HZ live; disk is flushed asynchronously, so HZ leading
        disk is expected.
        """
        disk_latest_sn = max(scan_index.keys())
        hz_scan = _get_hz_scan()
        hz_sn = hz_scan.get('scan_number')
        if hz_sn is None:
            pytest.skip('HZ scan has no scan_number field')
        hz_sn = int(hz_sn)
        gap = hz_sn - disk_latest_sn
        print(f'\n HZ scan #{hz_sn} disk latest #{disk_latest_sn} gap={gap:+d}')
        # HZ should be >= disk (or at most 3 behind if disk flushed recently)
        assert gap >= -3, f'Disk is ahead of HZ by {-gap} scans — unexpected'
        assert gap <= 30, f'HZ is {gap} scans ahead of disk — disk may have stopped writing'

    def test_vel_div_matches(self, scan_index):
        """vel_div for the latest common scan_number agrees between disk and HZ.

        Uses the newest disk scan that HZ has also reached (HZ may be ahead).
        NG7 writes two files per scan; the LATEST file (final version) is used.
        """
        hz_scan = _get_hz_scan()
        hz_sn = hz_scan.get('scan_number')
        if hz_sn is None:
            pytest.skip('HZ scan has no scan_number')
        hz_sn = int(hz_sn)
        # Find the newest scan that exists on BOTH disk and HZ
        disk_sns = sorted(scan_index.keys(), reverse=True)
        check_sn = None
        for sn in disk_sns[:5]:  # try last 5 disk scans
            if sn <= hz_sn:
                check_sn = sn
                break
        if check_sn is None:
            pytest.skip('No overlapping scan_number between disk and HZ')
        # Use the LATEST file for this scan_number (NG7 final write).
        # (Removed redundant in-function `from pathlib import Path` — Path is
        # already imported at module level.)
        today_dir = _today_dir()
        candidates = sorted(today_dir.glob(f'scan_{check_sn:06d}_*.arrow'), reverse=True)
        if not candidates:
            pytest.skip(f'scan #{check_sn} file not found')
        disk_row = _read_arrow(candidates[0])  # latest = final version
        disk_vd = disk_row.get('vel_div')
        hz_vd = hz_scan.get('vel_div') if hz_sn == check_sn else None
        if hz_vd is None and hz_sn != check_sn:
            pytest.skip(f'HZ has scan #{hz_sn}, comparing disk #{check_sn} for internal consistency only')
        if disk_vd is None or hz_vd is None:
            pytest.skip('vel_div absent in one source')
        if (isinstance(disk_vd, float) and math.isnan(disk_vd) and
                isinstance(hz_vd, float) and math.isnan(hz_vd)):
            return  # both NaN counts as agreement
        assert abs(float(disk_vd) - float(hz_vd)) < 1e-6, (
            f'vel_div mismatch scan #{check_sn}: disk={disk_vd} hz={hz_vd}'
        )

    def test_btc_price_matches(self, scan_index):
        """BTC price for latest common scan_number agrees between disk and HZ."""
        hz_scan = _get_hz_scan()
        hz_sn = hz_scan.get('scan_number')
        if hz_sn is None:
            pytest.skip('HZ scan has no scan_number')
        hz_sn = int(hz_sn)
        disk_sns = sorted(scan_index.keys(), reverse=True)
        check_sn = next((sn for sn in disk_sns[:5] if sn <= hz_sn), None)
        if check_sn is None:
            pytest.skip('No overlapping scan on disk')
        if check_sn != hz_sn:
            pytest.skip(f'HZ at #{hz_sn}, disk latest common #{check_sn} — comparing disk self-consistency')
        today_dir = _today_dir()
        candidates = sorted(today_dir.glob(f'scan_{check_sn:06d}_*.arrow'), reverse=True)
        if not candidates:
            pytest.skip(f'scan #{check_sn} file not found')
        disk_row = _read_arrow(candidates[0])
        d_assets = disk_row.get('assets', [])
        d_prices = disk_row.get('asset_prices', [])
        disk_btc = dict(zip(d_assets, d_prices)).get('BTCUSDT')
        h_assets = hz_scan.get('assets', [])
        h_prices = hz_scan.get('asset_prices', [])
        hz_btc = dict(zip(h_assets, h_prices)).get('BTCUSDT')
        if disk_btc is None or hz_btc is None:
            pytest.skip('BTC price absent in one source')
        # Fix: a zero disk price previously crashed with ZeroDivisionError;
        # report it as the data-integrity failure it actually is.
        if disk_btc <= 0:
            pytest.fail(f'Non-positive disk BTC price in scan #{check_sn}: {disk_btc}')
        pct_diff = abs(disk_btc - hz_btc) / disk_btc * 100
        assert pct_diff < 0.01, (
            f'BTC price mismatch scan #{check_sn}: disk=${disk_btc:.2f} hz=${hz_btc:.2f}'
        )
# ══════════════════════════════════════════════════════════════════════════════
# SIGNAL SANITY TESTS (not parity — sanity of the signal values themselves)
# ══════════════════════════════════════════════════════════════════════════════
class TestSignalSanity:
    """Sanity of the signal values themselves (not disk/HZ parity)."""

    def test_extreme_vel_div_flagged(self, recent_scans):
        """Scans with |vel_div| > MAX_VEL_DIV_ABS are printed as a warning (not fail)."""
        extremes = [
            # Fix: `row.get('timestamp_iso', '')` returns None when the key
            # exists with a None value, crashing the slice — use `or ''`.
            (row.get('scan_number'), row.get('vel_div'),
             (row.get('timestamp_iso') or '')[:19])
            for row in recent_scans
            if row.get('vel_div') is not None
            and isinstance(row['vel_div'], float)
            and not math.isnan(row['vel_div'])
            and abs(row['vel_div']) > MAX_VEL_DIV_ABS
        ]
        if extremes:
            print(f'\nWARN: {len(extremes)} extreme |vel_div| > {MAX_VEL_DIV_ABS}:')
            for sn, vd, ts in extremes[:10]:
                print(f' scan #{sn} {ts} vel_div={vd:.3f}')
        # Not a hard fail — eigenvalue rotation events are real. Just report.

    def test_vol_ok_coherence(self, recent_scans):
        """vol_ok computation on disk prices agrees with expected BTC vol threshold."""
        import numpy as np
        VOL_WINDOW = 50           # number of trailing BTC prices used
        VOL_THRESH = 0.00026414   # NOTE(review): presumably mirrors the trader's threshold — confirm
        btc_prices = []
        for row in recent_scans:
            assets = row.get('assets', [])
            prices = row.get('asset_prices', [])
            btc = dict(zip(assets, prices)).get('BTCUSDT')
            if btc:
                btc_prices.append(float(btc))
        if len(btc_prices) < VOL_WINDOW + 2:
            pytest.skip(f'Need {VOL_WINDOW+2} scans with BTC price, got {len(btc_prices)}')
        arr = np.array(btc_prices[-VOL_WINDOW:])
        # Standard deviation of simple returns over the trailing window.
        dvol = float(np.std(np.diff(arr) / arr[:-1]))
        vol_ok = dvol > VOL_THRESH
        print(f'\nvol_ok={vol_ok} dvol={dvol:.6f} threshold={VOL_THRESH}')
        # Not asserting — reporting the computed value to verify coherence with trader
if __name__ == '__main__':
    # Convenience entry point: running this file directly delegates to pytest.
    import subprocess
    import sys
    subprocess.run([sys.executable, '-m', 'pytest', __file__, '-v', '-s'])