Files
DOLPHIN/nautilus_dolphin/dvae/corpus_builder.py
hjnormey 01c19662cb initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
2026-04-21 16:58:38 +02:00

549 lines
23 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
DOLPHIN Multi-Generation Corpus Builder (Memory-Efficient, 5-Tier)
====================================================================
Loads ALL available Dolphin data into a unified feature matrix.
TIERS (distinct, layered, can be frozen/trained independently):
Tier 0 (8 dims) ALWAYS — breadth (bull/bear), cyclic time, has_eigen flag
Tier 1 (20 dims) NG3+ — eigenvalue structure: 4 windows × 5 features
Tier 2 (50 dims) NG3+ — per-asset volatility cross-section (50 symbols)
Tier 3 (25 dims) NG3+ — ExF macro indicators (dvol, fng, funding, OI, etc.)
Tier 4 (8 dims) ALWAYS — EsoF: lunar, fibonacci, session, cycle (computed)
Total: 111 dims. Missing tiers are zero-filled; mask tracks availability.
Memory strategy:
- NEVER accumulate raw JSON dicts — parse → extract → discard immediately
- Buffer rows in fixed-size chunks, converted to float32 arrays as they fill
- Per-date ExF NPZ loaded once and reused for all scans of that day
- Pre-allocate output array based on estimated sample count
"""
import json
import re
import math
import numpy as np
from pathlib import Path
from datetime import datetime
from typing import Optional, Iterator, Tuple
# ── Paths ──────────────────────────────────────────────────────────────────
# NOTE(review): machine-specific absolute Windows path — confirm before
# running this module on any other host.
BASE = Path(r"C:\Users\Lenovo\Documents")
NG1_DIR = BASE / "- Dolphin NG"
NG2_DIR = BASE / "- Dolphin NG2"
NG4_DIR = BASE / "- DOLPHIN NG4" / "- Results"
NG5_DIR = BASE / "- Dolphin NG5"
NG3_EIGEN = BASE / "- Dolphin NG HD (NG3)" / "correlation_arb512" / "eigenvalues"
HERE = Path(__file__).parent  # module directory; default corpus-cache target
# ── Tier dimensions ────────────────────────────────────────────────────────
T0 = 8 # breadth + time + flag
T1 = 20 # eigenvalues (4 windows × 5)
T2 = 50 # per-asset volatility
T3 = 25 # ExF macro indicators
T4 = 8 # EsoF esoteric
DIMS = [T0, T1, T2, T3, T4]
TOTAL = sum(DIMS) # 111
OFF = [0, T0, T0+T1, T0+T1+T2, T0+T1+T2+T3] # slice offsets
WINDOWS = [50, 150, 300, 750]  # correlation window lengths consumed by _tier1
EPS = 1e-8  # division guard for ratio features
# ── ExF indicator selection (from the 85-field NPZ, keep reliable ones) ───
EXF_FIELDS = [
    'dvol_btc', 'dvol_eth', # implied vol
    'fng', 'fng_prev', # fear & greed
    'btc_dom', 'eth_dom', # dominance
    'chg24_btc', 'chg24_eth', # 24h returns
    'dispersion', 'correlation', # cross-market
    'imbal_btc', 'imbal_eth', # OB imbalance
    'funding_btc', 'funding_eth', # perp funding
    'mvrv', # on-chain
    'tvl', # DeFi
    'pcr_vol', 'pcr_oi', # options
    'basis', 'liq_proxy', # futures
    'spread', 'vol24', # microstructure
    'hashrate', # mining
    'btc_price', # price level
    'fng_vol', # FnG volatility component
]
# Import-time sanity check: field list must exactly fill the Tier-3 slice.
assert len(EXF_FIELDS) == T3, f"EXF_FIELDS len={len(EXF_FIELDS)} != T3={T3}"
# ExF normalisation constants (robust: divide by median absolute scale)
# NOTE: 'fng_vol' has no entry here — _tier3 falls back to a 1.0 divisor.
EXF_SCALE = {
    'dvol_btc': 50.0, 'dvol_eth': 50.0,
    'fng': 50.0, 'fng_prev': 50.0,
    'btc_dom': 50.0, 'eth_dom': 10.0,
    'chg24_btc': 5.0, 'chg24_eth': 5.0,
    'dispersion': 5.0, 'correlation': 1.0,
    'imbal_btc': 1.0, 'imbal_eth': 1.0,
    'funding_btc': 0.001, 'funding_eth': 0.001,
    'mvrv': 3.0,
    'tvl': 1e11,
    'pcr_vol': 1.0, 'pcr_oi': 1.0,
    'basis': 0.1, 'liq_proxy': 1.0,
    'spread': 0.01, 'vol24': 1e10,
    'hashrate': 1e9,
    'btc_price': 1e5,
}
# ── Time helpers ───────────────────────────────────────────────────────────
def _parse_ts(s: str) -> Optional[datetime]:
for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S",
"%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S"):
try:
return datetime.strptime(str(s)[:26], fmt)
except ValueError:
continue
return None
def _tier0(bull_pct: float, bear_pct: float, ts: datetime, has_eigen: bool) -> np.ndarray:
bull = np.clip(bull_pct / 100.0, 0, 1)
bear = np.clip(bear_pct / 100.0, 0, 1)
side = max(0.0, 1.0 - bull - bear)
h = ts.hour + ts.minute / 60.0
d = ts.weekday()
return np.array([
bull, bear, side,
math.sin(2 * math.pi * h / 24),
math.cos(2 * math.pi * h / 24),
math.sin(2 * math.pi * d / 7),
math.cos(2 * math.pi * d / 7),
1.0 if has_eigen else 0.0,
], dtype=np.float32)
def _tier1(windows: dict) -> Tuple[np.ndarray, bool]:
    """Tier-1 (20 dims): eigenvalue structure, 5 features per window.

    For each of the 4 correlation WINDOWS extracts: clipped log of
    lambda_max, velocity normalised by |lambda_max|, gap ratio,
    instability score and regime-transition probability.

    Fix: windows whose lambda_max is absent or non-positive are now
    skipped entirely so their 5-slot span stays zero-filled (matching
    the module's "missing data is zero-filled" contract). Previously
    the log(1e-6) fallback wrote ~-1.38 into slot 0 — and any stray
    regime-signal values into slots 3-4 — for every missing window.

    Returns:
        (vec, valid) — valid is True iff at least one window carried a
        positive lambda_max.
    """
    vec = np.zeros(T1, dtype=np.float32)
    if not windows:
        return vec, False
    valid = False
    for i, w in enumerate(WINDOWS):
        # Keys may be int or str depending on the producing generation.
        wdata = windows.get(w) or windows.get(str(w)) or {}
        td = wdata.get('tracking_data') or wdata
        rs = wdata.get('regime_signals') or {}
        lmax = float(td.get('lambda_max', 0) or 0)
        if lmax <= 0:
            continue  # missing/invalid window: leave its span zeroed
        valid = True
        vel = float(td.get('lambda_max_velocity', 0) or 0)
        gap = float(td.get('eigenvalue_gap', 0) or 0)
        inst = float(rs.get('instability_score', 0) or 0)
        rtp = float(rs.get('regime_transition_probability', 0) or 0)
        log_lmax = math.log(max(lmax, 1e-6))
        vel_norm = np.clip(vel / (abs(lmax) + EPS), -5, 5)
        gap_ratio = np.clip(gap / (lmax + EPS), 0, 10)
        base = i * 5
        vec[base] = np.float32(np.clip(log_lmax / 10.0, -3, 3))
        vec[base+1] = np.float32(vel_norm)
        vec[base+2] = np.float32(gap_ratio)
        vec[base+3] = np.float32(np.clip(inst, 0, 1))
        vec[base+4] = np.float32(np.clip(rtp, 0, 1))
    return vec, valid
def _tier2(pricing: dict) -> Tuple[np.ndarray, bool]:
    """Tier-2 (50 dims): z-scored per-asset volatility cross-section.

    Takes the first T2 values of pricing['volatility'], z-scores them
    (EPS-guarded), clips to ±5 and writes them at the front of the
    vector; unused slots stay zero. Returns (vec, has_data).
    """
    out = np.zeros(T2, dtype=np.float32)
    vol_map = (pricing or {}).get('volatility') or {}
    if not vol_map:
        return out, False
    raw = np.array(list(vol_map.values())[:T2], dtype=np.float32)
    if raw.size == 0:
        return out, False
    zscored = np.clip((raw - raw.mean()) / (raw.std() + EPS), -5, 5)
    out[:zscored.size] = zscored
    return out, True
def _tier3(exf_lookup: Optional[dict]) -> np.ndarray:
    """Tier-3 (25 dims): scaled ExF macro indicators.

    Each EXF_FIELDS entry is divided by its EXF_SCALE constant (1.0
    when no constant is defined) and clipped to ±10. Absent or falsy
    values contribute zero; a None/empty lookup yields an all-zero vec.
    """
    out = np.zeros(T3, dtype=np.float32)
    if not exf_lookup:
        return out
    for idx, name in enumerate(EXF_FIELDS):
        raw = exf_lookup.get(name, 0.0) or 0.0
        divisor = EXF_SCALE.get(name, 1.0)
        out[idx] = np.float32(np.clip(float(raw) / divisor, -10, 10))
    return out
def _tier4(ts) -> np.ndarray:
"""
EsoF Tier-4: 8 computed esoteric features from timestamp alone.
Accepts Unix float timestamp OR datetime object.
No external data needed — all derived from ts.
"""
import calendar as cal_mod
# Normalise to both float-seconds and datetime
if isinstance(ts, (int, float)):
ts_f = float(ts)
dt = datetime.utcfromtimestamp(ts_f)
else:
dt = ts
ts_f = dt.timestamp()
# Moon illumination approx (simplified Meeus formula)
# JD of Unix epoch (1970-01-01 00:00 UTC) = 2440587.5
jd = 2440587.5 + ts_f / 86400.0
D = jd - 2451545.0 # days since J2000.0
# Moon phase angle (degrees)
moon_age = (D % 29.53058867) / 29.53058867 # 0=new, 0.5=full
moon_illum = 0.5 * (1 - math.cos(2 * math.pi * moon_age))
# Mercury retrograde cycles (~3x/year, each ~21 days) — simplified
merc_cycle = (D % 115.88) / 115.88
merc_retro = 1.0 if 0.82 < merc_cycle < 1.0 else 0.0 # last ~18/115 of cycle
# Fibonacci time: minutes into day
mins = dt.hour * 60 + dt.minute
fib_mins = [1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1440]
dists = [abs(mins - f) for f in fib_mins]
fib_proximity = 1.0 / (1.0 + min(dists) / 60.0) # 1=at fib, 0=far
# Session (0=Asia, 0.33=London, 0.67=NY, 1=Close)
h = dt.hour + dt.minute / 60.0
if 0 <= h < 7: session = 0.0
elif 7 <= h < 13: session = 0.33
elif 13 <= h < 21: session = 0.67
else: session = 1.0
# Market cycle position (annual)
doy = dt.timetuple().tm_yday
days_in_year = 366 if cal_mod.isleap(dt.year) else 365
cycle_pos = doy / days_in_year
# Day of week sin/cos (weekly cycle)
dow_sin = math.sin(2 * math.pi * dt.weekday() / 7)
dow_cos = math.cos(2 * math.pi * dt.weekday() / 7)
return np.array([
moon_illum, # lunar phase
moon_age, # 0=new, 0.5=full, 1=new
merc_retro, # binary: Mercury Rx
fib_proximity, # nearness to Fibonacci time
session, # liquidity session
cycle_pos, # annual cycle position
dow_sin, dow_cos, # weekly cycle
], dtype=np.float32)
# ── ExF NPZ loader (per-date, cached) ─────────────────────────────────────
class ExFCache:
    """Per-date cache of ExF indicator lookups.

    Loads the first ``*__Indicators.npz`` found under ``<base>/<date>``
    once per requested date and keeps it until a different date is
    asked for. Only indicators flagged successful with a non-zero value
    are kept in the lookup; anything else yields None.
    """
    def __init__(self, eigen_base: Path):
        self._base = eigen_base
        self._current_date: Optional[str] = None
        self._lookup: Optional[dict] = None
    def get(self, date_str: str) -> Optional[dict]:
        """Return {field: value} for date_str, or None if unavailable."""
        if date_str != self._current_date:
            self._current_date = date_str
            self._lookup = self._load(date_str)
        return self._lookup
    def _load(self, date_str: str) -> Optional[dict]:
        # One NPZ per day is expected; any file matching the suffix will do.
        matches = list((self._base / date_str).glob('*__Indicators.npz'))
        if not matches:
            return None
        try:
            data = np.load(matches[0], allow_pickle=True)
            fields = list(data['api_names'])
            values = data['api_indicators']
            success = data['api_success']
            return {
                name: float(val)
                for name, val, ok in zip(fields, values, success)
                if ok and float(val) != 0
            }
        except Exception:
            return None
# ── Streaming generators (memory-efficient) ───────────────────────────────
def _stream_ng1_ng2() -> Iterator[np.ndarray]:
    """Yield 111-dim rows from NG1/NG2 regime_result_*.json.

    These generations only carry breadth ratios, so only Tier 0 and the
    computed Tier 4 are populated; everything else stays zero.
    Unreadable/unparseable files are skipped silently.
    """
    import os
    for root in (NG1_DIR, NG2_DIR):
        if not root.exists():
            continue
        # os.scandir (unsorted) — much faster than sorted(rglob) on 300K+
        # files; NG1/NG2 keep everything at the top level.
        for entry in os.scandir(str(root)):
            fname = entry.name
            if not fname.startswith('regime_result_') or not fname.endswith('.json'):
                continue
            try:
                payload = json.loads(
                    Path(entry.path).read_text(encoding='utf-8', errors='replace'))
                stamp = _parse_ts(payload.get('timestamp', ''))
                if stamp is None:
                    continue
                bull_pct = float(payload.get('up_ratio', 0)) * 100
                bear_pct = float(payload.get('down_ratio', 0)) * 100
                row = np.zeros(TOTAL, dtype=np.float32)
                row[OFF[0]:OFF[0]+T0] = _tier0(bull_pct, bear_pct, stamp, False)
                row[OFF[4]:OFF[4]+T4] = _tier4(stamp)
                yield row
            except Exception:
                continue
def _stream_ng4() -> Iterator[np.ndarray]:
    """Yield 111-dim rows parsed from NG4 plain-text result logs.

    Each log line matching the REGIME STATUS pattern contributes one
    row (Tier 0 + Tier 4 only). Any exception while processing a file
    skips the remainder of that file.
    """
    if not NG4_DIR.exists():
        return
    status_re = re.compile(
        r'(\d{4}-\d{2}-\d{2}T[\d:.]+Z).*REGIME STATUS: \w+ \| Bull: ([\d.]+)% Bear: ([\d.]+)%'
    )
    for log_file in sorted(NG4_DIR.glob('*.txt')):
        try:
            for line in log_file.read_text(encoding='utf-8', errors='replace').splitlines():
                hit = status_re.search(line)
                if hit is None:
                    continue
                # Normalise "...T...Z" to the space-separated form _parse_ts knows.
                stamp = _parse_ts(hit.group(1).replace('T', ' ').rstrip('Z'))
                if stamp is None:
                    continue
                row = np.zeros(TOTAL, dtype=np.float32)
                row[OFF[0]:OFF[0]+T0] = _tier0(
                    float(hit.group(2)), float(hit.group(3)), stamp, False)
                row[OFF[4]:OFF[4]+T4] = _tier4(stamp)
                yield row
        except Exception:
            continue
def _stream_ng5_local() -> Iterator[np.ndarray]:
    """Yield 111-dim rows from NG5-local regime_result_*.json files.

    Populates Tiers 0/1/2/4; Tier 3 stays zero because NG5-local scans
    carry no companion ExF NPZ. Unreadable files are skipped silently.
    """
    import os
    if not NG5_DIR.exists():
        return
    for entry in os.scandir(str(NG5_DIR)):
        fname = entry.name
        if not fname.startswith('regime_result_') or not fname.endswith('.json'):
            continue
        try:
            doc = json.loads(Path(entry.path).read_text(encoding='utf-8', errors='replace'))
            stamp = _parse_ts(str(doc.get('timestamp', '')))
            if stamp is None:
                continue
            eigen_vec, got_eigen = _tier1(doc.get('multi_window_results') or {})
            price_vec, _ = _tier2(doc.get('pricing_data') or {})
            row = np.zeros(TOTAL, dtype=np.float32)
            row[OFF[0]:OFF[0]+T0] = _tier0(
                float(doc.get('bull_pct', 50)), float(doc.get('bear_pct', 50)),
                stamp, got_eigen)
            row[OFF[1]:OFF[1]+T1] = eigen_vec
            row[OFF[2]:OFF[2]+T2] = price_vec
            row[OFF[4]:OFF[4]+T4] = _tier4(stamp)
            yield row
        except Exception:
            continue
def _stream_ng3_scans(exf_cache: ExFCache,
                      date_from: str = '2025-12-31',
                      max_per_day: Optional[int] = None) -> Iterator[np.ndarray]:
    """
    Stream NG3/NG5 scan JSONs one at a time — never accumulates in memory.
    ExF loaded once per date from companion NPZ.
    max_per_day: limit scans per day (subsample for very long training days).
    """
    if not NG3_EIGEN.exists():
        return
    day_dirs = sorted(
        p for p in NG3_EIGEN.iterdir()
        if p.is_dir() and not p.name.endswith('_SKIP') and p.name >= date_from
    )
    for day_dir in day_dirs:
        # One Tier-3 vector per calendar day, shared by every scan in it.
        exf_vec = _tier3(exf_cache.get(day_dir.name))
        emitted = 0
        for scan_file in sorted(day_dir.glob('scan_*.json')):
            if '__Indicators' in scan_file.name:
                continue
            if max_per_day and emitted >= max_per_day:
                break
            try:
                # Read and immediately parse — never accumulate raw text.
                raw = scan_file.read_text(encoding='utf-8', errors='replace')
                doc = json.loads(raw)
                stamp = _parse_ts(str(doc.get('timestamp', '')))
                if stamp is None:
                    continue
                window_data = doc.get('windows') or doc.get('multi_window_results') or {}
                pricing = doc.get('pricing_data') or {}
                changes = pricing.get('price_changes', {})
                if changes:
                    # Derive breadth from the sign of per-asset price changes.
                    moves = list(changes.values())
                    denom = max(len(moves), 1)
                    bull = 100.0 * sum(1 for m in moves if float(m) > 0) / denom
                    bear = 100.0 * sum(1 for m in moves if float(m) < 0) / denom
                else:
                    bull, bear = 50.0, 50.0
                eigen_vec, got_eigen = _tier1(window_data)
                price_vec, _ = _tier2(pricing)
                row = np.zeros(TOTAL, dtype=np.float32)
                row[OFF[0]:OFF[0]+T0] = _tier0(bull, bear, stamp, got_eigen)
                row[OFF[1]:OFF[1]+T1] = eigen_vec
                row[OFF[2]:OFF[2]+T2] = price_vec
                row[OFF[3]:OFF[3]+T3] = exf_vec
                row[OFF[4]:OFF[4]+T4] = _tier4(stamp)
                yield row
                emitted += 1
                del doc, raw  # explicit release before the next read
            except Exception:
                continue
# ── Master corpus builder ──────────────────────────────────────────────────
class DolphinCorpus:
    """
    Unified DOLPHIN corpus across all generations, 5 tiers, 111 dims.
    Attributes:
    X : (N, 111) float32 — the feature matrix
    mask : (N, 5) bool — [t0, t1_eigen, t2_price, t3_exf, t4_esof]
    sources : (N,) int8 — 0=NG1/2, 1=NG4, 2=NG5-local, 3=NG3-scan
    """
    # Module constants re-exported on the class for caller convenience.
    DIMS = DIMS
    TOTAL = TOTAL
    OFF = OFF
    def __init__(self):
        # All three arrays are populated by build() or load(); None until then.
        self.X = None
        self.mask = None
        self.sources = None
    def build(self,
              ng3_date_from: str = '2025-12-31',
              max_scans_per_day: Optional[int] = None,
              max_per_source: Optional[int] = None,
              max_ng5: int = 3_000,
              chunk_size: int = 50_000,
              verbose: bool = True) -> 'DolphinCorpus':
        """
        Memory-efficient build using streaming generators.
        chunk_size: accumulate this many rows before extending array.
        max_per_source: cap rows from NG1/NG2/NG4 (breadth-only sources).
        max_ng5: separate cap for NG5-local (files are larger, reads ~26/s).

        Returns self so calls can be chained (build().save()...).
        """
        print("Building DOLPHIN multi-generation corpus (streaming)...", flush=True)
        exf_cache = ExFCache(NG3_EIGEN)
        # Per-source caps: NG5-local is separately capped (slow reads)
        _caps = {
            0: max_per_source, # NG1/NG2
            1: max_per_source, # NG4
            2: max_ng5, # NG5-local — separate low cap
            3: None, # NG3-scan — limited by max_scans_per_day
        }
        # Generators are created lazily here and consumed one row at a time.
        sources_list = [
            (0, _stream_ng1_ng2(), "NG1/NG2"),
            (1, _stream_ng4(), "NG4"),
            (2, _stream_ng5_local(), "NG5-local"),
            (3, _stream_ng3_scans(exf_cache, ng3_date_from, max_scans_per_day), "NG3-scan"),
        ]
        all_chunks, all_src = [], []
        buf_rows, buf_src = [], []
        total = 0
        for src_id, gen, name in sources_list:
            src_count = 0
            cap = _caps.get(src_id)
            for row in gen:
                buf_rows.append(row)
                buf_src.append(src_id)
                src_count += 1
                total += 1
                # Flush a full buffer into one float32 chunk so the Python
                # list never grows beyond chunk_size rows.
                if len(buf_rows) >= chunk_size:
                    all_chunks.append(np.array(buf_rows, dtype=np.float32))
                    all_src.extend(buf_src)
                    buf_rows.clear(); buf_src.clear()
                    if verbose:
                        print(f" {name}: {src_count:,} (total so far: {total:,})", flush=True)
                # Cap check happens after appending, so the capping row is kept.
                if cap and src_count >= cap:
                    break
            if verbose:
                print(f" {name}: {src_count:,} samples", flush=True)
        # Flush remainder
        if buf_rows:
            all_chunks.append(np.array(buf_rows, dtype=np.float32))
            all_src.extend(buf_src)
        self.X = np.vstack(all_chunks) if all_chunks else np.empty((0, TOTAL), dtype=np.float32)
        self.sources = np.array(all_src, dtype=np.int8)
        # Sanitise in place: NaN/±inf become 0 so downstream training is safe.
        np.nan_to_num(self.X, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
        # Build mask from has_eigen flag (bit7 of T0) and non-zero tiers
        has_eigen = self.X[:, OFF[0] + 7] > 0.5 # T0[-1]
        has_price = np.any(self.X[:, OFF[2]:OFF[2]+T2] != 0, axis=1)
        has_exf = np.any(self.X[:, OFF[3]:OFF[3]+T3] != 0, axis=1)
        self.mask = np.column_stack([
            np.ones(len(self.X), dtype=bool), # T0 always
            has_eigen,
            has_price,
            has_exf,
            np.ones(len(self.X), dtype=bool), # T4 always (computed)
        ])
        if verbose:
            print(f"\nCorpus summary:")
            print(f" Total : {len(self.X):,}")
            print(f" Shape : {self.X.shape} ({self.X.nbytes/1e6:.0f} MB)")
            print(f" T1 eigen : {self.mask[:,1].sum():,} ({100*self.mask[:,1].mean():.1f}%)")
            print(f" T2 price : {self.mask[:,2].sum():,} ({100*self.mask[:,2].mean():.1f}%)")
            print(f" T3 exf : {self.mask[:,3].sum():,} ({100*self.mask[:,3].mean():.1f}%)")
        return self
    def save(self, path: str):
        """Write X/mask/sources to a compressed NPZ ('.npz' appended if missing)."""
        p = path if path.endswith('.npz') else path + '.npz'
        np.savez_compressed(p, X=self.X, mask=self.mask, sources=self.sources)
        print(f"Corpus saved: {p} ({self.X.nbytes/1e6:.0f} MB uncompressed, compressed ~10x)")
    @classmethod
    def load(cls, path: str) -> 'DolphinCorpus':
        """Load a corpus previously written by save(); returns a new instance."""
        c = cls()
        p = path if path.endswith('.npz') else path + '.npz'
        d = np.load(p)
        c.X, c.mask, c.sources = d['X'], d['mask'], d['sources']
        print(f"Corpus loaded: {len(c.X):,} samples, {c.X.shape[1]} dims")
        return c
    # ── Tier slices ─────────────────────────────────────────────────────
    # Views (not copies) into X's second axis, one per tier.
    def t0(self): return self.X[:, OFF[0]:OFF[0]+T0]
    def t1(self): return self.X[:, OFF[1]:OFF[1]+T1]
    def t2(self): return self.X[:, OFF[2]:OFF[2]+T2]
    def t3(self): return self.X[:, OFF[3]:OFF[3]+T3]
    def t4(self): return self.X[:, OFF[4]:OFF[4]+T4]
    def tier_names(self):
        """Human-readable tier labels, index-aligned with DIMS/OFF."""
        return ['breadth+time', 'eigenvalues', 'per-asset-vol', 'ExF-macro', 'EsoF']
    def describe(self):
        """Print size, tier dims, per-tier mask counts and per-source counts."""
        print(f"Corpus: N={len(self.X):,} dims={TOTAL} ({self.X.nbytes/1e6:.0f}MB)")
        print(f"Tiers: {list(zip(self.tier_names(), DIMS))}")
        print(f"Masks: {[(t, self.mask[:,i].sum()) for i, t in enumerate(self.tier_names())]}")
        src_names = {0: 'NG1/2', 1: 'NG4', 2: 'NG5-local', 3: 'NG3-scan'}
        for sid, name in src_names.items():
            n = (self.sources == sid).sum()
            if n > 0:
                print(f" {name:12s}: {n:,}")
if __name__ == '__main__':
    # CLI: optional first argument caps the number of NG3 scans per day.
    import sys
    per_day_cap = None
    if len(sys.argv) > 1:
        per_day_cap = int(sys.argv[1])
    corpus = DolphinCorpus().build(verbose=True, max_scans_per_day=per_day_cap)
    corpus.save(str(HERE / 'corpus_cache'))
    corpus.describe()