initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
273
nautilus_dolphin/compare_arrow_vs_json.py
Executable file
273
nautilus_dolphin/compare_arrow_vs_json.py
Executable file
@@ -0,0 +1,273 @@
|
||||
"""
|
||||
Arrow vs. JSON Fidelity Comparator
|
||||
====================================
|
||||
Runs both the legacy JSON adapter and the new Arrow NG5 adapter
|
||||
on the same date range and compares:
|
||||
- Signal values (vel_div, instability, lambda_max_velocity)
|
||||
- Asset prices
|
||||
- Bar counts
|
||||
|
||||
This is the definitive data-parity test between DOLPHIN NG3 (JSON)
|
||||
and DOLPHIN NG5 (Arrow).
|
||||
|
||||
Usage:
|
||||
python compare_arrow_vs_json.py \
|
||||
--arrow-dir "C:/.../correlation_arb512/arrow_scans" \
|
||||
--json-dir "C:/.../correlation_arb512/eigenvalues" \
|
||||
--date 2026-02-25 \
|
||||
--n 50
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Console logging: timestamped, INFO-level progress lines for this script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)-8s | %(message)s',
    datefmt='%H:%M:%S',
)
logger = logging.getLogger(__name__)

# Make the directory containing this script importable so the package-style
# imports below resolve when the script is run directly (not via -m).
# NOTE(review): the imports are 'nautilus_dolphin.…' but this file itself
# appears to live inside nautilus_dolphin/ — inserting Path(__file__).parent
# would only work if a nested nautilus_dolphin/ package exists there;
# confirm against the repo layout (may need .parent.parent).
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from nautilus_dolphin.nautilus.arrow_data_adapter import ArrowEigenvalueDataAdapter
from nautilus_dolphin.nautilus.data_adapter import JSONEigenvalueDataAdapter
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def load_arrow_scans(arrow_scans_dir: str, date_str: str, n: int) -> List[dict]:
    """Load up to *n* Arrow scans for *date_str* and return signal dicts.

    Each returned dict carries the fields compared downstream: velocity
    divergence (w50 - w150), per-window velocities, instability scores,
    lambda_max of the 50-window, and the current price map.

    Args:
        arrow_scans_dir: Path to the NG5 arrow_scans directory.
        date_str: Date to load, formatted YYYY-MM-DD.
        n: Maximum number of scan files to read.

    Returns:
        List of per-scan signal dicts (possibly fewer than *n* if scans
        fail to load).
    """
    adapter = ArrowEigenvalueDataAdapter(
        arrow_scans_dir=arrow_scans_dir,
        venue="BINANCE_FUTURES",
    )
    date = datetime.strptime(date_str, '%Y-%m-%d')
    adapter.load_date_range(date, date)

    records = []
    for filepath in adapter._scan_files[:n]:
        scan = adapter.load_scan_file(filepath)
        if scan is None:
            continue

        # Use .get() like load_json_scans does: a scan without a 'windows'
        # key should degrade to empty signal dicts, not raise KeyError.
        windows = scan.get('windows', {})
        w50 = windows.get('50', {}).get('tracking_data', {})
        w150 = windows.get('150', {}).get('tracking_data', {})
        pricing = scan.get('pricing_data', {})

        records.append({
            'file': filepath.name,
            'timestamp': scan.get('parsed_timestamp'),
            'vel_div': w50.get('lambda_max_velocity', 0) - w150.get('lambda_max_velocity', 0),
            'v50_vel': w50.get('lambda_max_velocity', 0),
            'v150_vel': w150.get('lambda_max_velocity', 0),
            'instab_50': w50.get('instability_score', 0),
            'instab_150': w150.get('instability_score', 0),
            'w50_lambda': w50.get('lambda_max', 0),
            'prices': pricing.get('current_prices', {}),
        })

    return records
|
||||
|
||||
|
||||
def load_json_scans(json_eigenvalues_dir: str, date_str: str, n: int) -> List[dict]:
    """Load n JSON scans for date, return list of signal dicts."""
    json_adapter = JSONEigenvalueDataAdapter(
        eigenvalues_dir=json_eigenvalues_dir,
        venue="BINANCE_FUTURES",
    )
    target_day = datetime.strptime(date_str, '%Y-%m-%d')
    json_adapter.load_date_range(target_day, target_day)

    extracted = []
    for scan_path in json_adapter._scan_files[:n]:
        parsed = json_adapter.load_scan_file(scan_path)
        if parsed is None:
            continue

        # Pull the fast (50) and slow (150) window tracking blocks.
        window_map = parsed.get('windows', {})
        fast = window_map.get('50', {}).get('tracking_data', {})
        slow = window_map.get('150', {}).get('tracking_data', {})
        price_block = parsed.get('pricing_data', {})

        fast_vel = fast.get('lambda_max_velocity', 0)
        slow_vel = slow.get('lambda_max_velocity', 0)

        extracted.append({
            'file': scan_path.name,
            'timestamp': parsed.get('parsed_timestamp'),
            'vel_div': fast_vel - slow_vel,
            'v50_vel': fast_vel,
            'v150_vel': slow_vel,
            'instab_50': fast.get('instability_score', 0),
            'instab_150': slow.get('instability_score', 0),
            'w50_lambda': fast.get('lambda_max', 0),
            'prices': price_block.get('current_prices', {}),
        })

    return extracted
|
||||
|
||||
|
||||
def align_by_timestamp(
    arrow_records: List[dict],
    json_records: List[dict],
    tolerance_sec: float = 30.0,
) -> pd.DataFrame:
    """
    Align arrow and json records by nearest timestamp.

    Each Arrow record is paired with its closest unmatched JSON record
    within *tolerance_sec* seconds (default 30, matching NG5 alignment).

    Args:
        arrow_records: Signal dicts from load_arrow_scans.
        json_records: Signal dicts from load_json_scans.
        tolerance_sec: Maximum allowed timestamp offset for a pair.

    Returns:
        DataFrame with one row per aligned pair (empty if either input
        is empty or nothing aligns).
    """
    # Guard: an empty list would produce a DataFrame without a 'timestamp'
    # column and crash sort_values / idxmin below.
    if not arrow_records or not json_records:
        return pd.DataFrame()

    arrow_df = pd.DataFrame(arrow_records).sort_values('timestamp').reset_index(drop=True)
    json_df = pd.DataFrame(json_records).sort_values('timestamp').reset_index(drop=True)

    rows = []
    used_j = set()
    for _, a_row in arrow_df.iterrows():
        # pd.isna catches both None and NaT (None becomes NaT once the
        # records are placed in a datetime64 column, so `is None` misses it).
        if pd.isna(a_row['timestamp']):
            continue

        diffs = abs(json_df['timestamp'] - a_row['timestamp']).dt.total_seconds()
        # Mask already-consumed JSON records so a used nearest neighbour
        # doesn't block a valid second-nearest match (the previous greedy
        # `min_idx not in used_j` check silently dropped such pairs).
        if used_j:
            diffs = diffs.copy()
            diffs.loc[list(used_j)] = np.nan
        if diffs.isna().all():
            continue

        min_idx = diffs.idxmin()
        if diffs[min_idx] <= tolerance_sec:
            j_row = json_df.iloc[min_idx]
            used_j.add(min_idx)
            rows.append({
                'a_file': a_row['file'],
                'j_file': j_row['file'],
                'a_ts': a_row['timestamp'],
                'j_ts': j_row['timestamp'],
                'dt_sec': float(diffs[min_idx]),
                # Signal comparison
                'a_vel_div': a_row['vel_div'],
                'j_vel_div': j_row['vel_div'],
                'a_v50_vel': a_row['v50_vel'],
                'j_v50_vel': j_row['v50_vel'],
                'a_w50_lambda': a_row['w50_lambda'],
                'j_w50_lambda': j_row['w50_lambda'],
                'a_instab_50': a_row['instab_50'],
                'j_instab_50': j_row['instab_50'],
            })

    return pd.DataFrame(rows)
|
||||
|
||||
|
||||
def compare_and_report(
    aligned: pd.DataFrame,
    tol: float = 1e-6,
) -> dict:
    """Compute agreement statistics for each signal field.

    For every compared field present in *aligned*, reports pair count,
    absolute/relative error extremes and means, the share of pairs within
    *tol*, and the Pearson correlation between the two sources.
    """
    if aligned.empty:
        logger.warning("No aligned records — check date / tolerance")
        return {}

    stats_by_field = {}
    for signal in ('vel_div', 'v50_vel', 'w50_lambda', 'instab_50'):
        arrow_col = f'a_{signal}'
        json_col = f'j_{signal}'
        if arrow_col not in aligned.columns:
            continue

        delta = aligned[arrow_col].fillna(0) - aligned[json_col].fillna(0)
        abs_delta = delta.abs()
        # Zero denominators become NaN so they drop out of the relative-error
        # aggregates instead of producing inf.
        denom = aligned[json_col].abs().replace(0, np.nan)
        relative = abs_delta / denom

        stats_by_field[signal] = {
            'n_pairs': len(aligned),
            'max_abs_diff': float(abs_delta.max()),
            'mean_abs_diff': float(abs_delta.mean()),
            'max_rel_err': float(relative.max(skipna=True)),
            'mean_rel_err': float(relative.mean(skipna=True)),
            'pct_within_tol': float((abs_delta <= tol).mean() * 100),
            'corr': float(aligned[arrow_col].corr(aligned[json_col])),
        }

    return stats_by_field
|
||||
|
||||
|
||||
def print_report(stats: dict, aligned: pd.DataFrame, n_show: int = 10) -> None:
    """Print a human-readable fidelity report to stdout.

    Args:
        stats: Per-field statistics from compare_and_report.
        aligned: Aligned pairs DataFrame from align_by_timestamp.
        n_show: Number of sample pairs to print (0 disables the preview).
    """
    print("\n" + "=" * 70)
    print("ARROW NG5 vs. JSON NG3 — DATA FIDELITY REPORT")
    print("=" * 70)
    print(f"Aligned pairs: {len(aligned)}")

    # Guard: an empty frame has no 'dt_sec' column, so the original
    # aligned['dt_sec'].max() raised KeyError when nothing aligned.
    if aligned.empty:
        print("No aligned pairs — nothing to report.\n")
        return

    print(f"Max timestamp offset: {aligned['dt_sec'].max():.1f}s")
    print()
    print(f"{'Field':<18} {'MaxAbsDiff':>12} {'MeanAbsDiff':>12} {'Corr':>8} {'%ExactTol':>10}")
    print("-" * 65)
    for field, s in stats.items():
        print(
            f"{field:<18} {s['max_abs_diff']:>12.6f} {s['mean_abs_diff']:>12.6f} "
            f"{s['corr']:>8.6f} {s['pct_within_tol']:>9.1f}%"
        )

    print()
    # Verdict keys off the vel_div correlation — the primary trading signal.
    if 'vel_div' in stats:
        corr = stats['vel_div']['corr']
        if corr > 0.9999:
            verdict = "PASS — Arrow and JSON are numerically identical (corr > 0.9999)"
        elif corr > 0.999:
            verdict = "PASS — Minor floating-point differences (corr > 0.999)"
        elif corr > 0.99:
            verdict = "WARNING — Noticeable differences (corr > 0.99)"
        else:
            verdict = "FAIL — Significant discrepancy (corr <= 0.99)"
        print(f"VERDICT: {verdict}\n")

    if n_show > 0:
        print(f"First {n_show} aligned pairs (vel_div comparison):")
        print("-" * 70)
        show = aligned[['a_file', 'j_file', 'dt_sec', 'a_vel_div', 'j_vel_div']].head(n_show)
        print(show.to_string(index=False))
        print()
|
||||
|
||||
|
||||
def main():
    """CLI entry point: load both sources, align, compare, and report."""
    parser = argparse.ArgumentParser(description="Arrow NG5 vs. JSON NG3 fidelity check")
    # Declarative flag table keeps the option surface easy to scan.
    arg_specs = [
        ("--arrow-dir", {"required": True,
                         "help": "Path to NG5 arrow_scans directory"}),
        ("--json-dir", {"required": True,
                        "help": "Path to NG3 eigenvalues directory"}),
        ("--date", {"default": "2026-02-25",
                    "help": "Date to compare (YYYY-MM-DD)"}),
        ("--n", {"type": int, "default": 50,
                 "help": "Number of scans to compare per source"}),
        ("--tol", {"type": float, "default": 1e-6,
                   "help": "Tolerance for exact-match check"}),
        ("--output", {"default": None,
                      "help": "Save report JSON to this path"}),
    ]
    for flag, options in arg_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    # Load the same slice of data through both adapters.
    logger.info(f"Loading {args.n} Arrow scans from {args.date}...")
    arrow_records = load_arrow_scans(args.arrow_dir, args.date, args.n)
    logger.info(f"Loaded {len(arrow_records)} arrow records")

    logger.info(f"Loading {args.n} JSON scans from {args.date}...")
    json_records = load_json_scans(args.json_dir, args.date, args.n)
    logger.info(f"Loaded {len(json_records)} json records")

    logger.info("Aligning by timestamp...")
    aligned = align_by_timestamp(arrow_records, json_records)
    logger.info(f"Aligned: {len(aligned)} pairs")

    stats = compare_and_report(aligned, tol=args.tol)
    print_report(stats, aligned, n_show=10)

    # Optional machine-readable report for CI / archival.
    if args.output:
        report = {
            'date': args.date,
            'n_arrow': len(arrow_records),
            'n_json': len(json_records),
            'n_aligned': len(aligned),
            'stats': stats,
        }
        with open(args.output, 'w') as f:
            json.dump(report, f, indent=2, default=str)
        logger.info(f"Report saved to {args.output}")
|
||||
|
||||
|
||||
# Script entry point: run the fidelity comparison when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user