""" Unit tests: LSTM weight save/load fix ====================================== Tests that DisentangledVAEGenerator.save_model() correctly persists W_ih/W_hh/b_h and that TitanSensor loads them instead of random re-initialising. Run BEFORE and AFTER retrain to catch regressions. Usage: cd nautilus_dolphin python dvae/test_lstm_weight_fix.py """ import sys, json, os, tempfile sys.stdout.reconfigure(encoding='utf-8', errors='replace') from pathlib import Path import numpy as np ROOT = Path(__file__).parent.parent # nautilus_dolphin/ PROJECT = ROOT.parent # project root (disentangled_vae_joint_generator.py lives here) sys.path.insert(0, str(ROOT)) sys.path.insert(0, str(PROJECT)) MODEL_PATH = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict" r"\dvae_regime_model_TITAN_ULTRA_250_ULTRA261_MCDAIN.json") MODEL_PATH_GD = Path(r"C:\Users\Lenovo\Documents\- DOLPHIN NG HD HCM TSF Predict" r"\dvae_regime_model_TITAN_ULTRA_GD.json") _PASS = "[PASS]" _FAIL = "[FAIL]" # ── Helpers ─────────────────────────────────────────────────────────────────── def _make_dummy_generator(): """ Build a minimal DisentangledVAEGenerator-like object with known numpy weights, bypassing FLINT _init_weights(). Used to test save/load roundtrip without requiring FLINT/arb to be present. 
""" import types, importlib # Import the class but intercept _init_weights so it doesn't call crypto_random_arb mod = importlib.import_module('disentangled_vae_joint_generator') cls = mod.DisentangledVAEGenerator obj = cls.__new__(cls) obj.input_dim = 8 obj.hidden_dim = 4 obj.latent_dim = 2 obj.regime_dim = 2 obj.prec = 64 obj.beta = 1.0 obj.is_trained = True obj.edain = None obj.latent_names = {0: "A", 1: "B"} rng = np.random.RandomState(7777) obj.W_ih = rng.randn(8, 16).astype(np.float64) # (input_dim, hidden*4) obj.W_hh = rng.randn(4, 16).astype(np.float64) # (hidden_dim, hidden*4) obj.b_h = rng.randn(16).astype(np.float64) obj.W_mu = rng.randn(4, 2).astype(np.float64) obj.W_logvar = rng.randn(4, 2).astype(np.float64) obj.b_mu = rng.randn(2).astype(np.float64) obj.b_logvar = rng.randn(2).astype(np.float64) obj.W_dec = rng.randn(2, 4).astype(np.float64) obj.W_out = rng.randn(4, 8).astype(np.float64) obj.b_dec = rng.randn(4).astype(np.float64) obj.b_out = rng.randn(8).astype(np.float64) return obj def _load_sensor(): from dvae.titan_sensor import TitanSensor return TitanSensor(str(MODEL_PATH)) # ── Test 1: save_model() roundtrip (no FLINT needed) ───────────────────────── def test_save_model_includes_lstm(): print("\n[T1] save_model() includes W_ih / W_hh / b_h ...") gen = _make_dummy_generator() with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: tmp = f.name try: gen.save_model(tmp) with open(tmp) as f: m = json.load(f) for key in ('W_ih', 'W_hh', 'b_h', 'W_mu', 'W_logvar', 'W_dec', 'W_out', 'b_mu', 'b_logvar', 'b_dec', 'b_out'): assert key in m, f"{_FAIL}: key '{key}' missing from saved JSON" W_ih_rt = np.array(m['W_ih']) assert W_ih_rt.shape == gen.W_ih.shape, \ f"{_FAIL}: W_ih shape mismatch {W_ih_rt.shape} vs {gen.W_ih.shape}" assert np.allclose(W_ih_rt, gen.W_ih, atol=1e-15), \ f"{_FAIL}: W_ih values differ after roundtrip (max={np.max(np.abs(W_ih_rt - gen.W_ih)):.2e})" W_hh_rt = np.array(m['W_hh']) assert np.allclose(W_hh_rt, 
gen.W_hh, atol=1e-15), \ f"{_FAIL}: W_hh values differ after roundtrip" b_h_rt = np.array(m['b_h']) assert np.allclose(b_h_rt, gen.b_h, atol=1e-15), \ f"{_FAIL}: b_h values differ after roundtrip" print(f" {_PASS} W_ih {gen.W_ih.shape} roundtrip exact") print(f" {_PASS} W_hh {gen.W_hh.shape} roundtrip exact") print(f" {_PASS} b_h {gen.b_h.shape} roundtrip exact") print(f" {_PASS} all 11 weight keys present") finally: os.unlink(tmp) # ── Test 2: TitanSensor loads W_ih from JSON, not random seed=42 ───────────── def test_sensor_loads_from_json_not_random(): print("\n[T2] TitanSensor loads LSTM weights from JSON (not seed=42 random) ...") assert MODEL_PATH.exists(), f"{_FAIL}: model not found at {MODEL_PATH}" with open(MODEL_PATH) as f: m = json.load(f) assert 'W_ih' in m, \ f"{_FAIL}: W_ih missing from model JSON — model was saved before the fix. Retrain first." sensor = _load_sensor() assert sensor.lstm_weights_valid, \ f"{_FAIL}: sensor.lstm_weights_valid=False — W_ih missing from JSON" W_ih_json = np.array(m['W_ih'], dtype=np.float64) max_diff = np.max(np.abs(sensor.W_ih - W_ih_json)) assert max_diff < 1e-12, \ f"{_FAIL}: sensor.W_ih != JSON W_ih (max_diff={max_diff:.3e})" print(f" {_PASS} sensor.W_ih == JSON W_ih (max_diff={max_diff:.2e})") W_hh_json = np.array(m['W_hh'], dtype=np.float64) max_diff_hh = np.max(np.abs(sensor.W_hh - W_hh_json)) assert max_diff_hh < 1e-12, \ f"{_FAIL}: sensor.W_hh != JSON W_hh (max_diff={max_diff_hh:.3e})" print(f" {_PASS} sensor.W_hh == JSON W_hh (max_diff={max_diff_hh:.2e})") # Confirm it is NOT the seed=42 random initialisation rng_42 = np.random.RandomState(42) W_ih_42 = rng_42.randn(261, 512) * 0.1 diff_vs_42 = np.max(np.abs(sensor.W_ih - W_ih_42)) assert diff_vs_42 > 0.01, \ f"{_FAIL}: sensor.W_ih matches seed=42 random (diff={diff_vs_42:.3e}) — LSTM still wrong" print(f" {_PASS} sensor.W_ih is NOT seed=42 random (diff_vs_42={diff_vs_42:.3f})") # ── Test 3: recon_err is finite and in plausible range ──────────────────────── 
def test_recon_err_plausible():
    """recon_err must be finite and orders of magnitude below the pre-fix ~1e14."""
    print("\n[T3] recon_err is finite and << 10^6 (was ~10^14 pre-fix) ...")
    sensor = _load_sensor()
    # NOTE: the original imported build_feature_vector here but never used it;
    # the unused import has been removed.
    rng = np.random.RandomState(42)
    results = {}
    for label, x in [
        ("zeros", np.zeros(261)),
        ("ones", np.ones(261) * 0.01),
        ("random_s", rng.randn(261) * 0.05),
        ("random_l", rng.randn(261) * 2.0),
    ]:
        z_mu, recon_err, z_logvar = sensor.encode(x)
        results[label] = recon_err
        assert np.isfinite(recon_err), \
            f"{_FAIL}: recon_err not finite for '{label}' input: {recon_err}"
        # Pre-fix: ~10^14. Post-fix: should be O(1) or O(100) at worst.
        assert recon_err < 1e8, \
            f"{_FAIL}: recon_err={recon_err:.3e} suspiciously large for '{label}'"
        print(f" {_PASS} [{label:10s}] recon_err={recon_err:.4e} z_mu[0:4]={z_mu[:4].round(4)}")
    # Distribution check: recon_err should vary with input (not uniform noise)
    errs = list(results.values())
    assert max(errs) / (min(errs) + 1e-12) > 2.0, \
        f"{_FAIL}: recon_err suspiciously uniform across inputs ({errs}) — LSTM may still be wrong"
    print(f" {_PASS} recon_err varies meaningfully across inputs (ratio={max(errs)/(min(errs)+1e-12):.1f}x)")


# ── Test 4: Encoding is deterministic ─────────────────────────────────────────

def test_encoding_deterministic():
    """Two independently-loaded sensors must encode the same input identically."""
    print("\n[T4] encode() is deterministic across two sensor instances ...")
    sensor1 = _load_sensor()
    sensor2 = _load_sensor()
    x = np.random.RandomState(99).randn(261) * 0.1
    z1, e1, _ = sensor1.encode(x)
    z2, e2, _ = sensor2.encode(x)
    assert np.allclose(z1, z2, atol=1e-14), \
        f"{_FAIL}: z_mu differs between two sensors (max={np.max(np.abs(z1-z2)):.2e})"
    assert abs(e1 - e2) < 1e-10, \
        f"{_FAIL}: recon_err differs between two sensors ({e1:.6e} vs {e2:.6e})"
    print(f" {_PASS} z_mu identical (max_diff={np.max(np.abs(z1-z2)):.2e})")
    print(f" {_PASS} recon_err identical ({e1:.6e})")


# ── Test 5: Legacy model emits RuntimeWarning, sensor.lstm_weights_valid=False ─

def test_legacy_model_warns():
    """Loading a pre-fix JSON (no W_ih/W_hh) must warn, not silently re-init."""
    print("\n[T5] Legacy model (no W_ih in JSON) emits RuntimeWarning ...")
    from dvae.titan_sensor import TitanSensor
    import warnings
    # Build a minimal legacy-style JSON (no W_ih/W_hh)
    legacy = {
        "W_mu": np.zeros((128, 32)).tolist(),
        "W_logvar": np.zeros((128, 32)).tolist(),
        "W_dec": np.zeros((32, 128)).tolist(),
        "W_out": np.zeros((128, 261)).tolist(),
        "latent_names": {},
        "precision_bits": 512,
    }
    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
        json.dump(legacy, f)
        tmp = f.name
    try:
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            s = TitanSensor(tmp)
        assert not s.lstm_weights_valid, \
            f"{_FAIL}: lstm_weights_valid should be False for legacy model"
        assert any(issubclass(w.category, RuntimeWarning) for w in caught), \
            f"{_FAIL}: no RuntimeWarning emitted for legacy model"
        print(f" {_PASS} lstm_weights_valid=False")
        print(f" {_PASS} RuntimeWarning emitted")
    finally:
        os.unlink(tmp)


# ── Test 6: z_mu dimensionality and range ─────────────────────────────────────

def test_latent_dimensionality():
    """z_mu/z_logvar must be finite (32,) vectors with plausible magnitudes."""
    print("\n[T6] z_mu has correct dimensionality and plausible range ...")
    sensor = _load_sensor()
    x = np.random.RandomState(55).randn(261) * 0.1
    z_mu, recon_err, z_logvar = sensor.encode(x)
    assert z_mu.shape == (32,), f"{_FAIL}: z_mu shape {z_mu.shape} != (32,)"
    assert z_logvar.shape == (32,), f"{_FAIL}: z_logvar shape {z_logvar.shape} != (32,)"
    assert np.all(np.isfinite(z_mu)), f"{_FAIL}: z_mu contains non-finite values"
    assert np.all(np.isfinite(z_logvar)), f"{_FAIL}: z_logvar contains non-finite values"
    assert np.abs(z_mu).max() < 1e3, \
        f"{_FAIL}: z_mu values suspiciously large (max={np.abs(z_mu).max():.2e})"
    print(f" {_PASS} z_mu.shape=(32,) z_mu range=[{z_mu.min():.3f}, {z_mu.max():.3f}]")
    print(f" {_PASS} z_logvar.shape=(32,) range=[{z_logvar.min():.3f}, {z_logvar.max():.3f}]")


# ── Test 7: GD-trained model has correct LSTM weights and sane recon_err ──────

def test_gd_model():
    """GD model must share the MCDAIN LSTM basis and reconstruct in-dist inputs."""
    print("\n[T7] GD-trained model: W_ih present, recon_err << MCDAIN model ...")
    assert MODEL_PATH_GD.exists(), f"{_FAIL}: GD model not found at {MODEL_PATH_GD}"
    from dvae.titan_sensor import TitanSensor
    sensor_gd = TitanSensor(str(MODEL_PATH_GD))
    assert sensor_gd.lstm_weights_valid, f"{_FAIL}: GD sensor.lstm_weights_valid=False"
    print(f" {_PASS} GD model: lstm_weights_valid=True")
    # Verify W_ih from GD model matches W_ih from MCDAIN model
    # (they should be THE SAME — GD model was initialized from MCDAIN model's W_ih)
    sensor_mc = _load_sensor()
    diff_wih = np.max(np.abs(sensor_gd.W_ih - sensor_mc.W_ih))
    assert diff_wih < 1e-12, \
        f"{_FAIL}: GD model W_ih != MCDAIN model W_ih (diff={diff_wih:.3e}) — should be same LSTM basis"
    print(f" {_PASS} GD model W_ih == MCDAIN model W_ih (same LSTM basis, diff={diff_wih:.2e})")
    # Reconstruction error: generate inputs GUARANTEED to be in-distribution by
    # sampling x = norm_mean + norm_std * randn (within ±2σ of training corpus).
    # After encode()'s normalization step: v_norm = clip(randn, -2, 2) → perfectly O(1).
    # recon_err should be close to the training-set value (p50=0.59).
    with open(MODEL_PATH_GD) as fj:
        gd_json = json.load(fj)
    nm_gd = np.array(gd_json['norm_mean'])
    ns_gd = np.array(gd_json['norm_std'])
    rng = np.random.RandomState(42)
    errs_gd = []
    errs_mc = []
    for _ in range(20):
        # Guaranteed in-distribution: raw input = corpus_mean + corpus_std * noise
        x_raw = nm_gd + ns_gd * np.clip(rng.randn(261), -2, 2)
        x_raw[78:] = 0.0   # T3/T4/T5 are always 0 in corpus
        _, e_gd, _ = sensor_gd.encode(x_raw)
        _, e_mc, _ = sensor_mc.encode(x_raw)
        errs_gd.append(e_gd)
        errs_mc.append(e_mc)
    med_gd = float(np.median(errs_gd))
    med_mc = float(np.median(errs_mc))
    print(f" GD median recon_err={med_gd:.4e} (in-distribution) MCDAIN median recon_err={med_mc:.4e}")
    # GD trained with proper GD — should reconstruct in-distribution inputs well
    assert med_gd < 10.0, f"{_FAIL}: GD recon_err too large ({med_gd:.4e}) — model didn't learn"
    print(f" {_PASS} GD recon_err < 10.0 for in-distribution inputs (model learned)")


# ── Test 8: GD-v2 normalization stored and applied at inference ───────────────

def test_normalization_stored_and_applied():
    """norm_mean/norm_std must be in the JSON, loaded, and applied by encode()."""
    print("\n[T8] GD-v2 model: norm_mean/norm_std present and applied by TitanSensor ...")
    assert MODEL_PATH_GD.exists(), f"{_FAIL}: GD model not found at {MODEL_PATH_GD}"
    from dvae.titan_sensor import TitanSensor
    # 8a: JSON must contain norm_mean / norm_std
    with open(MODEL_PATH_GD) as f:
        m = json.load(f)
    assert 'norm_mean' in m, f"{_FAIL}: GD model JSON missing 'norm_mean'"
    assert 'norm_std' in m, f"{_FAIL}: GD model JSON missing 'norm_std'"
    nm = np.array(m['norm_mean'])
    ns = np.array(m['norm_std'])
    assert nm.shape == (261,), f"{_FAIL}: norm_mean shape {nm.shape} != (261,)"
    assert ns.shape == (261,), f"{_FAIL}: norm_std shape {ns.shape} != (261,)"
    assert np.all(ns > 0), f"{_FAIL}: norm_std has zero or negative entries"
    print(f" {_PASS} norm_mean/norm_std present, shape=(261,), all std>0")
    # 8b: TitanSensor must load them (not None)
    sensor = TitanSensor(str(MODEL_PATH_GD))
    assert sensor.norm_mean is not None, f"{_FAIL}: sensor.norm_mean is None — not loaded"
    assert sensor.norm_std is not None, f"{_FAIL}: sensor.norm_std is None — not loaded"
    assert np.allclose(sensor.norm_mean, nm, atol=1e-12), \
        f"{_FAIL}: sensor.norm_mean != JSON norm_mean"
    print(f" {_PASS} sensor.norm_mean loaded from JSON")
    # 8c: recon_err on realistic inputs (matching build_feature_vector() output structure)
    # Before fix (MCDAIN-trained, no normalization stored): encode(raw) → huge recon_err
    # After fix (GD-v2, normalization stored and applied): encode(raw) → O(1-50)
    rng = np.random.RandomState(42)
    errs = []
    for _ in range(30):
        x = np.zeros(261)
        x[0:8] = rng.randn(8) * 0.7                 # T0: time encoding
        x[8:28] = rng.randn(20) * 0.02              # T1: eigenvalue velocity scale
        x[28:78] = np.clip(rng.randn(50), -5, 5)    # T2: return z-scores [-5,5]
        # T3/T4/T5 = 0 (Stage 1 contract — matches training corpus)
        _, e, _ = sensor.encode(x)
        errs.append(e)
    med_err = float(np.median(errs))
    max_err = float(np.max(errs))
    print(f" GD-v2 realistic-input recon_err: median={med_err:.4e} max={max_err:.4e}")
    assert med_err < 100.0, \
        f"{_FAIL}: recon_err too large ({med_err:.4e}) — normalization may not be applied"
    print(f" {_PASS} recon_err O(1-100) for realistic-scale inputs")
    # 8d: T5 dims are zero after normalization (non-zero norm_mean would shift them)
    x_zeros = np.zeros(261)
    _, e_z, _ = sensor.encode(x_zeros)
    # The sensor zeros T5 after normalization — this just checks it doesn't crash
    assert np.isfinite(e_z), f"{_FAIL}: encode(zeros) returned non-finite recon_err={e_z}"
    print(f" {_PASS} encode(zeros) is finite after normalization: recon_err={e_z:.4e}")


# ── Runner ────────────────────────────────────────────────────────────────────

if __name__ == '__main__':
    print("=" * 60)
    print("TitanSensor LSTM weight fix — unit tests")
    print("=" * 60)
    n_pass = 0
    n_fail = 0
    tests = [
        ("T1: save_model roundtrip", test_save_model_includes_lstm),
        ("T2: sensor loads JSON weights", test_sensor_loads_from_json_not_random),
        ("T3: recon_err plausible", test_recon_err_plausible),
        ("T4: encoding deterministic", test_encoding_deterministic),
        ("T5: legacy model warns", test_legacy_model_warns),
        ("T6: latent dimensionality", test_latent_dimensionality),
        ("T7: GD model quality", test_gd_model),
        ("T8: normalization stored and applied", test_normalization_stored_and_applied),
    ]
    for name, fn in tests:
        try:
            fn()
            n_pass += 1
        except AssertionError as e:
            # Include the test name for parity with the EXCEPTION branch below.
            print(f" {_FAIL} ASSERTION in {name}: {e}")
            n_fail += 1
        except Exception as e:
            print(f" {_FAIL} EXCEPTION in {name}: {type(e).__name__}: {e}")
            n_fail += 1
    print()
    print("=" * 60)
    print(f"Results: {n_pass}/{n_pass+n_fail} PASSED {n_fail} FAILED")
    print("=" * 60)
    if n_fail > 0:
        sys.exit(1)