"""
Resolution Alignment Check
===========================
GATE TEST: Before training Titan VAE on 1m klines, verify that T0-T4 feature
distributions at 1m resolution are sufficiently aligned with 5s production data.

If they diverge, a 1m-trained model cannot be used at 5s inference — the latent
dims would represent different physics (50-min vs 4.2-min eigenvalue dynamics).

Tests:
  1. Per-dim KS test (distribution shape match)
  2. Per-dim mean/std ratio (scale match)
  3. PCA cosine similarity (covariance structure match)
  4. T1 feature correlation matrix alignment (Frobenius distance)

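Pass criteria (all must hold; as implemented in main()):
  - KS p > 0.05 (or a degenerate, near-constant dim) for >= 70% of T1 dims
  - std ratio (5s/1m) within [0.1, 10] for >= 70% of T1 dims
  - PCA cosine >= 0.80 for both PC1 and PC2 of the joint T1+T2 block
    (shared dominant variance axes)
  - normalised Frobenius distance between the T1 correlation matrices < 0.40

T2 KS/scale statistics and all mean ratios are computed and written to the
report for inspection, but they do not enter the verdict.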

Output: resolution_alignment_report.json
"""

import sys
sys.stdout.reconfigure(encoding='utf-8', errors='replace')
from pathlib import Path
import json
import numpy as np
import pandas as pd
from scipy import stats

# Two directory levels above this script; put on sys.path below so that the
# `dvae` package can be imported.
ROOT   = Path(__file__).parent.parent
# NOTE(review): the two cache locations are machine-specific absolute Windows
# paths; the script will report "No files found" anywhere else.
# Parquet directory for the 5s production data (printed as "5s source").
VBT5s  = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache")
# Parquet directory for the 1m klines data (printed as "1m source").
VBT1m  = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict\vbt_cache_klines")
# The JSON report is written next to this script.
OUT    = Path(__file__).parent / "resolution_alignment_report.json"

# Must run before the project import on the next line.
sys.path.insert(0, str(ROOT))
from dvae.titan_sensor import build_feature_vector

# Columns that extract_features() excludes when it derives the list of asset
# columns from a parquet file; every other column is treated as an asset.
META_COLS = {'timestamp','scan_number','v50_lambda_max_velocity','v150_lambda_max_velocity',
             'v300_lambda_max_velocity','v750_lambda_max_velocity','vel_div',
             'instability_50','instability_150'}

# Both limits are file counts: extract_features() takes the first N parquet
# files in sorted-by-name order (the log messages treat one file as one day).
N_5S_DAYS  = 10   # use first N days from vbt_cache (5s data)
N_1M_DAYS  = 60   # use first N days from klines (1m data), i.e. the earliest by file name


def extract_features(parquet_dir, n_days, label, date_before=None, *,
                     start_row=50, stride=10):
    """Build a feature matrix from the parquet files in *parquet_dir*.

    Args:
        parquet_dir: ``pathlib.Path`` of a directory holding ``*.parquet``
            files.  Files whose *name* contains ``catalog`` are ignored.
        n_days: Maximum number of files to use; the first ``n_days`` files
            in sorted-by-name order are taken.
        label: Short tag used to prefix the progress messages.
        date_before: Optional string; when given, only files whose stem
            sorts strictly before it are kept.
        start_row: First row index of each file passed to
            ``build_feature_vector`` (default 50, the original constant).
        stride: Step between sampled row indices (default 10, chosen to
            keep runtime low).

    Returns:
        ``(X, stems)`` where ``X`` is a 2-D ``float64`` array with one row per
        sampled (file, row index) pair and NaN/inf replaced by 0, and
        ``stems`` lists the stems of the files used.  Returns ``(None, [])``
        when no file matches or when no rows could be sampled, so callers
        only need a single ``X is None`` check.
    """
    # Filter on the file name only: testing the full path would drop every
    # file whenever a parent directory happens to contain "catalog".
    files = sorted(
        f for f in parquet_dir.glob("*.parquet") if 'catalog' not in f.name
    )
    if date_before:
        files = [f for f in files if f.stem < date_before]
    files = files[:n_days]
    if not files:
        print(f"  [{label}] No files found in {parquet_dir}")
        return None, []

    print(f"  [{label}] {len(files)} days  ({files[0].stem} to {files[-1].stem})")

    rows = []
    for pf in files:
        df = pd.read_parquet(pf)
        assets = [c for c in df.columns if c not in META_COLS]
        rows.extend(
            build_feature_vector(df, ri, assets)
            for ri in range(start_row, len(df), stride)
        )

    if not rows:
        # Without this guard np.array([]) would be 1-D with shape (0,) and
        # every later X[:, d] lookup in the caller would raise IndexError.
        print(f"  [{label}] No feature rows extracted "
              f"(no file has more than {start_row} rows)")
        return None, []

    X = np.array(rows, dtype=np.float64)
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
    print(f"  [{label}] Feature matrix: {X.shape}")
    return X, [f.stem for f in files]


def ks_test_per_dim(X5, X1, dims):
    """Run a two-sample KS test on each requested column.

    Returns a dict keyed by dim; each value is
    ``{"p": float, "stat": float, "trivial": bool}``.

    A dim whose standard deviation is below 1e-10 in *either* matrix is not
    tested: it is recorded with ``p=1.0``, ``stat=0.0`` and ``trivial=True``.
    NOTE(review): this also applies when only one side is constant, so such
    a dim is reported as compatible even though the other side varies.
    """
    flat_tol = 1e-10
    outcome = {}
    for dim in dims:
        col5 = X5[:, dim]
        col1 = X1[:, dim]
        is_flat = any(np.std(col) < flat_tol for col in (col5, col1))
        if is_flat:
            entry = {"p": 1.0, "stat": 0.0, "trivial": True}
        else:
            ks = stats.ks_2samp(col5, col1)
            entry = {
                "p": float(ks.pvalue),
                "stat": float(ks.statistic),
                "trivial": False,
            }
        outcome[dim] = entry
    return outcome


def scale_check(X5, X1, dims):
    """Compare per-dim mean and standard deviation of the two matrices.

    For every dim the result holds both means, both stds and the 5s/1m
    ratios ``|mean_5s| / |mean_1m|`` and ``std_5s / std_1m``.  A ratio whose
    1m denominator is not above 1e-10 is reported as ``inf``.

    Returns a dict keyed by dim with the keys ``mean_5s``, ``mean_1m``,
    ``mean_ratio``, ``std_5s``, ``std_1m`` and ``std_ratio``.
    """
    tiny = 1e-10

    def _ratio(numer, denom):
        # The same epsilon both gates the division and pads the denominator.
        if denom > tiny:
            return numer / (denom + tiny)
        return float('inf')

    summary = {}
    for dim in dims:
        col5, col1 = X5[:, dim], X1[:, dim]
        mean5, mean1 = float(np.mean(col5)), float(np.mean(col1))
        std5, std1 = float(np.std(col5)), float(np.std(col1))
        summary[dim] = {
            "mean_5s": mean5,
            "mean_1m": mean1,
            "mean_ratio": _ratio(abs(mean5), abs(mean1)),
            "std_5s": std5,
            "std_1m": std1,
            "std_ratio": _ratio(std5, std1),
        }
    return summary


def pca_cosine(X5, X1, n_components=4):
    """Compare the leading principal axes of two data matrices.

    Each matrix is mean-centred per column and decomposed with an SVD; the
    first ``n_components`` right singular vectors serve as principal axes.
    The i-th axis of ``X5`` is compared with the i-th axis of ``X1`` only
    (no cross-matching between different ranks).

    Returns a list of ``n_components`` absolute cosine similarities; the
    absolute value removes the arbitrary sign of a singular vector.
    """
    def _principal_axes(data):
        centred = data - data.mean(axis=0)
        right_vectors = np.linalg.svd(centred, full_matrices=False)[2]
        return right_vectors[:n_components]   # (n_components, d)

    axes5 = _principal_axes(X5)
    axes1 = _principal_axes(X1)

    similarities = []
    for idx in range(n_components):
        u, v = axes5[idx], axes1[idx]
        # The small constant keeps the division defined for a zero vector.
        denom = np.linalg.norm(u) * np.linalg.norm(v) + 1e-12
        similarities.append(abs(float(np.dot(u, v) / denom)))
    return similarities


def corr_frobenius(X5, X1, dims):
    """Frobenius distance between the correlation matrices of two datasets.

    The correlation matrix of the selected columns is computed for each
    input; NaN entries (e.g. from constant columns) are replaced by 0 before
    the matrices are subtracted.

    Returns ``(distance, distance / sqrt(2 * len(dims)))``.

    NOTE(review): ``sqrt(2 * n)`` is a reference scale, not a true upper
    bound.  Two 2x2 correlation matrices with off-diagonals +1 and -1 are
    ``sqrt(8)`` apart, giving a scaled value of ``sqrt(2) > 1``, so the
    second return value is not confined to [0, 1].
    """
    corr5, corr1 = (
        np.nan_to_num(np.corrcoef(M[:, dims].T)) for M in (X5, X1)
    )
    distance = float(np.linalg.norm(corr5 - corr1, 'fro'))
    scale = float(np.sqrt(2 * len(dims)))
    return distance, distance / scale


def _count_ks_pass(ks_results):
    """Count dims whose KS p-value exceeds 0.05 or that were flagged trivial."""
    return sum(1 for v in ks_results.values() if v['p'] > 0.05 or v.get('trivial'))


def _count_scale_pass(scale_results):
    """Count dims whose 5s/1m std ratio lies in [0.1, 10]."""
    # An infinite or NaN ratio already fails the chained comparison, so no
    # separate inf check is needed.
    return sum(1 for v in scale_results.values() if 0.1 <= v['std_ratio'] <= 10)


def main():
    """Run the 5s-vs-1m alignment gate, print the verdict and write the report.

    Builds one feature matrix per resolution, compares the T1 (dims 8-27) and
    T2 (dims 28-77) blocks with KS tests, scale ratios, a correlation-matrix
    distance and a PCA cosine check, then writes everything to ``OUT``.

    The verdict uses the T1 KS and std-ratio pass rates (>= 70%), the first
    two PCA cosines (>= 0.80) and the scaled Frobenius distance (< 0.40).

    Returns None.  Returns early, without writing a report, when either
    feature matrix is missing, has too few rows or has too few columns.
    """
    print("=== Resolution Alignment Check ===")
    print(f"5s source: {VBT5s}")
    print(f"1m source: {VBT1m}")

    # Informational listing only: extract_features() does its own selection
    # (the first N_1M_DAYS files in sorted order, i.e. the earliest by name).
    klines_files = sorted(
        f for f in VBT1m.glob("*.parquet") if 'catalog' not in f.name
    )
    first_day = klines_files[0].stem if klines_files else '?'
    last_day = klines_files[-1].stem if klines_files else '?'
    print(f"  1m klines available: {len(klines_files)} days  ({first_day} to {last_day})")

    print("\nBuilding feature matrices...")
    X5, days5 = extract_features(VBT5s,  N_5S_DAYS,  "5s-prod")
    X1, days1 = extract_features(VBT1m,  N_1M_DAYS,  "1m-klines")

    if X5 is None or X1 is None:
        print("ERROR: could not load data from one or both sources.")
        return

    # T1 dims: 8-27 (eigenvalue velocity features)
    # T2 dims: 28-77 (per-asset z-scores)
    T1_DIMS = list(range(8, 28))
    T2_DIMS = list(range(28, 78))
    N_PCS = 4

    # Fail with a clear message instead of an IndexError further down when a
    # matrix is empty (1-D), too short for the PCA or narrower than T2.
    n_cols_needed = T2_DIMS[-1] + 1
    for tag, X in (("5s", X5), ("1m", X1)):
        if X.ndim != 2 or X.shape[0] < N_PCS or X.shape[1] < n_cols_needed:
            print(f"ERROR: {tag} feature matrix has shape {X.shape}; "
                  f"need at least {N_PCS} rows and {n_cols_needed} columns.")
            return

    report = {
        "n_5s_samples": len(X5),
        "n_1m_samples": len(X1),
        "5s_days":  days5,
        "1m_days":  days1,
    }

    print("\n--- T1 eigenvalue velocity features (dims 8-27) ---")
    ks_t1  = ks_test_per_dim(X5, X1, T1_DIMS)
    sc_t1  = scale_check(X5, X1, T1_DIMS)
    frob_t1, frob_t1_norm = corr_frobenius(X5, X1, T1_DIMS)

    pval_pass_t1 = _count_ks_pass(ks_t1)
    scale_pass_t1 = _count_scale_pass(sc_t1)
    print(f"  KS p>0.05: {pval_pass_t1}/{len(T1_DIMS)} dims")
    print(f"  std_ratio in [0.1,10]: {scale_pass_t1}/{len(T1_DIMS)} dims")
    print(f"  Correlation matrix Frobenius dist: {frob_t1:.3f}  (normalised: {frob_t1_norm:.3f})")
    print("  Per-dim std ratios (5s/1m):")
    for d in T1_DIMS:
        r = sc_t1[d]['std_ratio']
        flag = "OK" if 0.1 <= r <= 10 else "MISMATCH"
        print(f"    dim{d:3d}: std_5s={sc_t1[d]['std_5s']:.4f}  std_1m={sc_t1[d]['std_1m']:.4f}  ratio={r:.2f}  {flag}")

    print("\n--- T2 per-asset z-score features (dims 28-77) ---")
    ks_t2  = ks_test_per_dim(X5, X1, T2_DIMS)
    sc_t2  = scale_check(X5, X1, T2_DIMS)
    pval_pass_t2 = _count_ks_pass(ks_t2)
    scale_pass_t2 = _count_scale_pass(sc_t2)
    print(f"  KS p>0.05: {pval_pass_t2}/{len(T2_DIMS)} dims")
    print(f"  std_ratio in [0.1,10]: {scale_pass_t2}/{len(T2_DIMS)} dims")

    print("\n--- PCA cosine similarity (T1+T2 joint, top-4 components) ---")
    joint_dims = T1_DIMS + T2_DIMS
    pca_cos = pca_cosine(X5[:, joint_dims], X1[:, joint_dims], n_components=N_PCS)
    for i, c in enumerate(pca_cos):
        flag = "OK" if c >= 0.80 else "DIVERGED"
        print(f"  PC{i+1}: cosine={c:.3f}  {flag}")

    # Pass/fail verdict.  Only T1 statistics feed the KS and scale criteria;
    # the T2 numbers are reported for inspection.
    ks_ok    = (pval_pass_t1 / len(T1_DIMS)) >= 0.70
    scale_ok = (scale_pass_t1 / len(T1_DIMS)) >= 0.70
    pca_ok   = pca_cos[0] >= 0.80 and pca_cos[1] >= 0.80
    frob_ok  = frob_t1_norm < 0.40

    verdict = {
        "ks_pass":    bool(ks_ok),
        "scale_pass": bool(scale_ok),
        "pca_pass":   bool(pca_ok),
        "frob_pass":  bool(frob_ok),
        "PROCEED":    bool(ks_ok and scale_ok and pca_ok and frob_ok),
    }

    print("\n=== VERDICT ===")
    print(f"  KS distribution match:  {'PASS' if ks_ok    else 'FAIL'}  ({pval_pass_t1}/{len(T1_DIMS)} dims pass)")
    print(f"  Scale match:            {'PASS' if scale_ok else 'FAIL'}  ({scale_pass_t1}/{len(T1_DIMS)} dims pass)")
    print(f"  PCA structure match:    {'PASS' if pca_ok   else 'FAIL'}  (PC1={pca_cos[0]:.3f} PC2={pca_cos[1]:.3f})")
    print(f"  Corr matrix alignment:  {'PASS' if frob_ok  else 'FAIL'}  (normalised Frobenius={frob_t1_norm:.3f} < 0.40)")
    print(f"\n  PROCEED with 1m training: {'YES' if verdict['PROCEED'] else 'NO'}")
    if not verdict['PROCEED']:
        print("  --> Feature distributions diverge across resolutions.")
        print("  --> A 1m-trained model would learn different physics than 5s production.")
        print("  --> Training on 1m would NOT yield useful representations at 5s inference.")
        print("  --> Alternative: use time-normalised features (fixed-time windows, not bar-count windows)")
    else:
        print("  --> Feature distributions are compatible across resolutions.")
        print("  --> 1m training on 2021-2024 data is a valid OOS approach.")

    report.update({
        "t1_ks":   {str(k): v for k, v in ks_t1.items()},
        "t1_scale": {str(k): v for k, v in sc_t1.items()},
        "t2_ks":   {str(k): v for k, v in ks_t2.items()},
        "t2_scale": {str(k): v for k, v in sc_t2.items()},
        "pca_cosines": pca_cos,
        "t1_frob_norm": frob_t1_norm,
        "verdict": verdict,
    })

    # NOTE(review): scale_check() can emit float('inf') ratios, which
    # json.dump writes as the non-standard token `Infinity`.  Python reads it
    # back, but strict JSON parsers reject the file.
    with open(OUT, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)
    print(f"\nReport: {OUT}")


# Run the alignment check only when executed as a script, not on import.
if __name__ == '__main__':
    main()