"""
|
||
|
|
Arrow vs. JSON Fidelity Comparator
|
||
|
|
====================================
|
||
|
|
Runs both the legacy JSON adapter and the new Arrow NG5 adapter
|
||
|
|
on the same date range and compares:
|
||
|
|
- Signal values (vel_div, instability, lambda_max_velocity)
|
||
|
|
- Asset prices
|
||
|
|
- Bar counts
|
||
|
|
|
||
|
|
This is the definitive data-parity test between DOLPHIN NG3 (JSON)
|
||
|
|
and DOLPHIN NG5 (Arrow).
|
||
|
|
|
||
|
|
Usage:
|
||
|
|
python compare_arrow_vs_json.py \
|
||
|
|
--arrow-dir "C:/.../correlation_arb512/arrow_scans" \
|
||
|
|
--json-dir "C:/.../correlation_arb512/eigenvalues" \
|
||
|
|
--date 2026-02-25 \
|
||
|
|
--n 50
|
||
|
|
"""
|
||
|
|
|
||
|
|
import sys
|
||
|
|
import json
|
||
|
|
import argparse
|
||
|
|
import logging
|
||
|
|
from pathlib import Path
|
||
|
|
from datetime import datetime
|
||
|
|
from typing import Dict, List, Optional
|
||
|
|
|
||
|
|
import numpy as np
|
||
|
|
import pandas as pd
|
||
|
|
|
||
|
|
logging.basicConfig(
|
||
|
|
level=logging.INFO,
|
||
|
|
format='%(asctime)s | %(levelname)-8s | %(message)s',
|
||
|
|
datefmt='%H:%M:%S',
|
||
|
|
)
|
||
|
|
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
project_root = Path(__file__).parent
|
||
|
|
sys.path.insert(0, str(project_root))
|
||
|
|
|
||
|
|
from nautilus_dolphin.nautilus.arrow_data_adapter import ArrowEigenvalueDataAdapter
|
||
|
|
from nautilus_dolphin.nautilus.data_adapter import JSONEigenvalueDataAdapter
|
||
|
|
|
||
|
|
|
||
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
def load_arrow_scans(arrow_scans_dir: str, date_str: str, n: int) -> List[dict]:
|
||
|
|
"""Load n arrow scans for date, return list of signal dicts."""
|
||
|
|
adapter = ArrowEigenvalueDataAdapter(
|
||
|
|
arrow_scans_dir=arrow_scans_dir,
|
||
|
|
venue="BINANCE_FUTURES",
|
||
|
|
)
|
||
|
|
date = datetime.strptime(date_str, '%Y-%m-%d')
|
||
|
|
adapter.load_date_range(date, date)
|
||
|
|
|
||
|
|
records = []
|
||
|
|
for filepath in adapter._scan_files[:n]:
|
||
|
|
scan = adapter.load_scan_file(filepath)
|
||
|
|
if scan is None:
|
||
|
|
continue
|
||
|
|
|
||
|
|
w50 = scan['windows'].get('50', {}).get('tracking_data', {})
|
||
|
|
w150 = scan['windows'].get('150', {}).get('tracking_data', {})
|
||
|
|
pricing = scan.get('pricing_data', {})
|
||
|
|
|
||
|
|
records.append({
|
||
|
|
'file': filepath.name,
|
||
|
|
'timestamp': scan.get('parsed_timestamp'),
|
||
|
|
'vel_div': w50.get('lambda_max_velocity', 0) - w150.get('lambda_max_velocity', 0),
|
||
|
|
'v50_vel': w50.get('lambda_max_velocity', 0),
|
||
|
|
'v150_vel': w150.get('lambda_max_velocity', 0),
|
||
|
|
'instab_50': w50.get('instability_score', 0),
|
||
|
|
'instab_150': w150.get('instability_score', 0),
|
||
|
|
'w50_lambda': w50.get('lambda_max', 0),
|
||
|
|
'prices': pricing.get('current_prices', {}),
|
||
|
|
})
|
||
|
|
|
||
|
|
return records
|
||
|
|
|
||
|
|
|
||
|
|
def load_json_scans(json_eigenvalues_dir: str, date_str: str, n: int) -> List[dict]:
|
||
|
|
"""Load n JSON scans for date, return list of signal dicts."""
|
||
|
|
adapter = JSONEigenvalueDataAdapter(
|
||
|
|
eigenvalues_dir=json_eigenvalues_dir,
|
||
|
|
venue="BINANCE_FUTURES",
|
||
|
|
)
|
||
|
|
date = datetime.strptime(date_str, '%Y-%m-%d')
|
||
|
|
adapter.load_date_range(date, date)
|
||
|
|
|
||
|
|
records = []
|
||
|
|
for filepath in adapter._scan_files[:n]:
|
||
|
|
scan = adapter.load_scan_file(filepath)
|
||
|
|
if scan is None:
|
||
|
|
continue
|
||
|
|
|
||
|
|
windows = scan.get('windows', {})
|
||
|
|
w50 = windows.get('50', {}).get('tracking_data', {})
|
||
|
|
w150 = windows.get('150', {}).get('tracking_data', {})
|
||
|
|
pricing = scan.get('pricing_data', {})
|
||
|
|
|
||
|
|
records.append({
|
||
|
|
'file': filepath.name,
|
||
|
|
'timestamp': scan.get('parsed_timestamp'),
|
||
|
|
'vel_div': w50.get('lambda_max_velocity', 0) - w150.get('lambda_max_velocity', 0),
|
||
|
|
'v50_vel': w50.get('lambda_max_velocity', 0),
|
||
|
|
'v150_vel': w150.get('lambda_max_velocity', 0),
|
||
|
|
'instab_50': w50.get('instability_score', 0),
|
||
|
|
'instab_150': w150.get('instability_score', 0),
|
||
|
|
'w50_lambda': w50.get('lambda_max', 0),
|
||
|
|
'prices': pricing.get('current_prices', {}),
|
||
|
|
})
|
||
|
|
|
||
|
|
return records
|
||
|
|
|
||
|
|
|
||
|
|
def align_by_timestamp(
|
||
|
|
arrow_records: List[dict],
|
||
|
|
json_records: List[dict],
|
||
|
|
) -> pd.DataFrame:
|
||
|
|
"""
|
||
|
|
Align arrow and json records by nearest timestamp.
|
||
|
|
NG5 uses timestamp_ns for precise alignment; we use a 30-second tolerance.
|
||
|
|
"""
|
||
|
|
arrow_df = pd.DataFrame(arrow_records).sort_values('timestamp').reset_index(drop=True)
|
||
|
|
json_df = pd.DataFrame(json_records).sort_values('timestamp').reset_index(drop=True)
|
||
|
|
|
||
|
|
# Merge on nearest timestamp (30-second window)
|
||
|
|
rows = []
|
||
|
|
used_j = set()
|
||
|
|
for _, a_row in arrow_df.iterrows():
|
||
|
|
if a_row['timestamp'] is None:
|
||
|
|
continue
|
||
|
|
# Find closest JSON record within 30 seconds
|
||
|
|
diffs = abs(json_df['timestamp'] - a_row['timestamp']).dt.total_seconds()
|
||
|
|
min_idx = diffs.idxmin()
|
||
|
|
if diffs[min_idx] <= 30 and min_idx not in used_j:
|
||
|
|
j_row = json_df.iloc[min_idx]
|
||
|
|
used_j.add(min_idx)
|
||
|
|
rows.append({
|
||
|
|
'a_file': a_row['file'],
|
||
|
|
'j_file': j_row['file'],
|
||
|
|
'a_ts': a_row['timestamp'],
|
||
|
|
'j_ts': j_row['timestamp'],
|
||
|
|
'dt_sec': float(diffs[min_idx]),
|
||
|
|
# Signal comparison
|
||
|
|
'a_vel_div': a_row['vel_div'],
|
||
|
|
'j_vel_div': j_row['vel_div'],
|
||
|
|
'a_v50_vel': a_row['v50_vel'],
|
||
|
|
'j_v50_vel': j_row['v50_vel'],
|
||
|
|
'a_w50_lambda': a_row['w50_lambda'],
|
||
|
|
'j_w50_lambda': j_row['w50_lambda'],
|
||
|
|
'a_instab_50': a_row['instab_50'],
|
||
|
|
'j_instab_50': j_row['instab_50'],
|
||
|
|
})
|
||
|
|
|
||
|
|
return pd.DataFrame(rows)
|
||
|
|
|
||
|
|
|
||
|
|
def compare_and_report(
|
||
|
|
aligned: pd.DataFrame,
|
||
|
|
tol: float = 1e-6,
|
||
|
|
) -> dict:
|
||
|
|
"""Compute agreement statistics for each signal field."""
|
||
|
|
if aligned.empty:
|
||
|
|
logger.warning("No aligned records — check date / tolerance")
|
||
|
|
return {}
|
||
|
|
|
||
|
|
results = {}
|
||
|
|
for field in ['vel_div', 'v50_vel', 'w50_lambda', 'instab_50']:
|
||
|
|
a_col = f'a_{field}'
|
||
|
|
j_col = f'j_{field}'
|
||
|
|
if a_col not in aligned.columns:
|
||
|
|
continue
|
||
|
|
|
||
|
|
diff = aligned[a_col].fillna(0) - aligned[j_col].fillna(0)
|
||
|
|
rel_err = diff.abs() / (aligned[j_col].abs().replace(0, np.nan))
|
||
|
|
|
||
|
|
results[field] = {
|
||
|
|
'n_pairs': len(aligned),
|
||
|
|
'max_abs_diff': float(diff.abs().max()),
|
||
|
|
'mean_abs_diff': float(diff.abs().mean()),
|
||
|
|
'max_rel_err': float(rel_err.max(skipna=True)),
|
||
|
|
'mean_rel_err': float(rel_err.mean(skipna=True)),
|
||
|
|
'pct_within_tol': float((diff.abs() <= tol).mean() * 100),
|
||
|
|
'corr': float(aligned[a_col].corr(aligned[j_col])),
|
||
|
|
}
|
||
|
|
|
||
|
|
return results
|
||
|
|
|
||
|
|
|
||
|
|
def print_report(stats: dict, aligned: pd.DataFrame, n_show: int = 10):
|
||
|
|
print("\n" + "=" * 70)
|
||
|
|
print("ARROW NG5 vs. JSON NG3 — DATA FIDELITY REPORT")
|
||
|
|
print("=" * 70)
|
||
|
|
print(f"Aligned pairs: {len(aligned)}")
|
||
|
|
print(f"Max timestamp offset: {aligned['dt_sec'].max():.1f}s")
|
||
|
|
print()
|
||
|
|
print(f"{'Field':<18} {'MaxAbsDiff':>12} {'MeanAbsDiff':>12} {'Corr':>8} {'%ExactTol':>10}")
|
||
|
|
print("-" * 65)
|
||
|
|
for field, s in stats.items():
|
||
|
|
print(
|
||
|
|
f"{field:<18} {s['max_abs_diff']:>12.6f} {s['mean_abs_diff']:>12.6f} "
|
||
|
|
f"{s['corr']:>8.6f} {s['pct_within_tol']:>9.1f}%"
|
||
|
|
)
|
||
|
|
|
||
|
|
print()
|
||
|
|
if 'vel_div' in stats:
|
||
|
|
corr = stats['vel_div']['corr']
|
||
|
|
if corr > 0.9999:
|
||
|
|
verdict = "PASS — Arrow and JSON are numerically identical (corr > 0.9999)"
|
||
|
|
elif corr > 0.999:
|
||
|
|
verdict = "PASS — Minor floating-point differences (corr > 0.999)"
|
||
|
|
elif corr > 0.99:
|
||
|
|
verdict = "WARNING — Noticeable differences (corr > 0.99)"
|
||
|
|
else:
|
||
|
|
verdict = "FAIL — Significant discrepancy (corr <= 0.99)"
|
||
|
|
print(f"VERDICT: {verdict}\n")
|
||
|
|
|
||
|
|
if n_show > 0 and not aligned.empty:
|
||
|
|
print(f"First {n_show} aligned pairs (vel_div comparison):")
|
||
|
|
print("-" * 70)
|
||
|
|
show = aligned[['a_file', 'j_file', 'dt_sec', 'a_vel_div', 'j_vel_div']].head(n_show)
|
||
|
|
print(show.to_string(index=False))
|
||
|
|
print()
|
||
|
|
|
||
|
|
|
||
|
|
def main():
|
||
|
|
parser = argparse.ArgumentParser(description="Arrow NG5 vs. JSON NG3 fidelity check")
|
||
|
|
parser.add_argument("--arrow-dir", required=True,
|
||
|
|
help="Path to NG5 arrow_scans directory")
|
||
|
|
parser.add_argument("--json-dir", required=True,
|
||
|
|
help="Path to NG3 eigenvalues directory")
|
||
|
|
parser.add_argument("--date", default="2026-02-25",
|
||
|
|
help="Date to compare (YYYY-MM-DD)")
|
||
|
|
parser.add_argument("--n", type=int, default=50,
|
||
|
|
help="Number of scans to compare per source")
|
||
|
|
parser.add_argument("--tol", type=float, default=1e-6,
|
||
|
|
help="Tolerance for exact-match check")
|
||
|
|
parser.add_argument("--output", default=None,
|
||
|
|
help="Save report JSON to this path")
|
||
|
|
args = parser.parse_args()
|
||
|
|
|
||
|
|
logger.info(f"Loading {args.n} Arrow scans from {args.date}...")
|
||
|
|
arrow_records = load_arrow_scans(args.arrow_dir, args.date, args.n)
|
||
|
|
logger.info(f"Loaded {len(arrow_records)} arrow records")
|
||
|
|
|
||
|
|
logger.info(f"Loading {args.n} JSON scans from {args.date}...")
|
||
|
|
json_records = load_json_scans(args.json_dir, args.date, args.n)
|
||
|
|
logger.info(f"Loaded {len(json_records)} json records")
|
||
|
|
|
||
|
|
logger.info("Aligning by timestamp...")
|
||
|
|
aligned = align_by_timestamp(arrow_records, json_records)
|
||
|
|
logger.info(f"Aligned: {len(aligned)} pairs")
|
||
|
|
|
||
|
|
stats = compare_and_report(aligned, tol=args.tol)
|
||
|
|
print_report(stats, aligned, n_show=10)
|
||
|
|
|
||
|
|
if args.output:
|
||
|
|
report = {
|
||
|
|
'date': args.date,
|
||
|
|
'n_arrow': len(arrow_records),
|
||
|
|
'n_json': len(json_records),
|
||
|
|
'n_aligned': len(aligned),
|
||
|
|
'stats': stats,
|
||
|
|
}
|
||
|
|
with open(args.output, 'w') as f:
|
||
|
|
json.dump(report, f, indent=2, default=str)
|
||
|
|
logger.info(f"Report saved to {args.output}")
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
|
||
|
|
main()
|