""" DOLPHIN Multi-Generation Corpus Builder (Memory-Efficient, 5-Tier) ==================================================================== Loads ALL available Dolphin data into a unified feature matrix. TIERS (distinct, layered, can be frozen/trained independently): Tier 0 (8 dims) ALWAYS — breadth (bull/bear), cyclic time, has_eigen flag Tier 1 (20 dims) NG3+ — eigenvalue structure: 4 windows × 5 features Tier 2 (50 dims) NG3+ — per-asset volatility cross-section (50 symbols) Tier 3 (25 dims) NG3+ — ExF macro indicators (dvol, fng, funding, OI, etc.) Tier 4 (8 dims) ALWAYS — EsoF: lunar, fibonacci, session, cycle (computed) Total: 111 dims. Missing tiers are zero-filled; mask tracks availability. Memory strategy: - NEVER accumulate raw JSON dicts — parse → extract → discard immediately - Write to memory-mapped numpy array (np.memmap) in fixed-size chunks - Per-date ExF NPZ loaded once and reused for all scans of that day - Pre-allocate output array based on estimated sample count """ import json import re import math import numpy as np from pathlib import Path from datetime import datetime from typing import Optional, Iterator, Tuple # ── Paths ────────────────────────────────────────────────────────────────── BASE = Path(r"C:\Users\Lenovo\Documents") NG1_DIR = BASE / "- Dolphin NG" NG2_DIR = BASE / "- Dolphin NG2" NG4_DIR = BASE / "- DOLPHIN NG4" / "- Results" NG5_DIR = BASE / "- Dolphin NG5" NG3_EIGEN = BASE / "- Dolphin NG HD (NG3)" / "correlation_arb512" / "eigenvalues" HERE = Path(__file__).parent # ── Tier dimensions ──────────────────────────────────────────────────────── T0 = 8 # breadth + time + flag T1 = 20 # eigenvalues (4 windows × 5) T2 = 50 # per-asset volatility T3 = 25 # ExF macro indicators T4 = 8 # EsoF esoteric DIMS = [T0, T1, T2, T3, T4] TOTAL = sum(DIMS) # 111 OFF = [0, T0, T0+T1, T0+T1+T2, T0+T1+T2+T3] # slice offsets WINDOWS = [50, 150, 300, 750] EPS = 1e-8 # ── ExF indicator selection (from the 85-field NPZ, keep reliable ones) ─── 
EXF_FIELDS = [
    'dvol_btc', 'dvol_eth',        # implied vol
    'fng', 'fng_prev',             # fear & greed
    'btc_dom', 'eth_dom',          # dominance
    'chg24_btc', 'chg24_eth',      # 24h returns
    'dispersion', 'correlation',   # cross-market
    'imbal_btc', 'imbal_eth',      # OB imbalance
    'funding_btc', 'funding_eth',  # perp funding
    'mvrv',                        # on-chain
    'tvl',                         # DeFi
    'pcr_vol', 'pcr_oi',           # options
    'basis', 'liq_proxy',          # futures
    'spread', 'vol24',             # microstructure
    'hashrate',                    # mining
    'btc_price',                   # price level
    'fng_vol',                     # FnG volatility component
]
assert len(EXF_FIELDS) == T3, f"EXF_FIELDS len={len(EXF_FIELDS)} != T3={T3}"

# ExF normalisation constants (robust: divide by median absolute scale)
# NOTE(review): 'fng_vol' has no entry here, so _tier3 silently falls back to
# scale 1.0 via .get() — confirm raw fng_vol values are already O(1).
EXF_SCALE = {
    'dvol_btc': 50.0, 'dvol_eth': 50.0,
    'fng': 50.0, 'fng_prev': 50.0,
    'btc_dom': 50.0, 'eth_dom': 10.0,
    'chg24_btc': 5.0, 'chg24_eth': 5.0,
    'dispersion': 5.0, 'correlation': 1.0,
    'imbal_btc': 1.0, 'imbal_eth': 1.0,
    'funding_btc': 0.001, 'funding_eth': 0.001,
    'mvrv': 3.0,
    'tvl': 1e11,
    'pcr_vol': 1.0, 'pcr_oi': 1.0,
    'basis': 0.1, 'liq_proxy': 1.0,
    'spread': 0.01, 'vol24': 1e10,
    'hashrate': 1e9,
    'btc_price': 1e5,
}


# ── Time helpers ───────────────────────────────────────────────────────────
def _parse_ts(s: str) -> Optional[datetime]:
    """Parse a timestamp string in any of the formats the generations emit.

    Input is truncated to 26 chars so trailing timezone suffixes / extra
    precision do not break %f parsing. Returns None if no format matches.
    """
    for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S",
                "%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.strptime(str(s)[:26], fmt)
        except ValueError:
            continue
    return None


def _tier0(bull_pct: float, bear_pct: float, ts: datetime,
           has_eigen: bool) -> np.ndarray:
    """Tier 0 (8 dims): breadth fractions, cyclic time encodings, eigen flag.

    Layout: [bull, bear, sideways, sin/cos(hour-of-day), sin/cos(day-of-week),
    has_eigen]. Percentages are clipped into [0, 1]; sideways is the residual.
    """
    bull = np.clip(bull_pct / 100.0, 0, 1)
    bear = np.clip(bear_pct / 100.0, 0, 1)
    side = max(0.0, 1.0 - bull - bear)  # whatever is neither bull nor bear
    h = ts.hour + ts.minute / 60.0
    d = ts.weekday()
    return np.array([
        bull, bear, side,
        math.sin(2 * math.pi * h / 24), math.cos(2 * math.pi * h / 24),
        math.sin(2 * math.pi * d / 7), math.cos(2 * math.pi * d / 7),
        1.0 if has_eigen else 0.0,
    ], dtype=np.float32)


def _tier1(windows: dict) -> Tuple[np.ndarray, bool]:
    """Tier 1 (20 dims): eigenvalue structure, 5 features per window.

    Per window: [clipped log(lambda_max), velocity/|lambda_max|,
    gap/lambda_max, instability, transition probability].
    Returns (vec, valid) where valid is True iff any window reported a
    positive lambda_max.
    """
    vec = np.zeros(T1, dtype=np.float32)
    if not windows:
        return vec, False
    valid = False
    for i, w in enumerate(WINDOWS):
        # Window keys may be int or str depending on the JSON writer.
        wdata = windows.get(w) or windows.get(str(w)) or {}
        td = wdata.get('tracking_data') or wdata
        rs = wdata.get('regime_signals') or {}
        lmax = float(td.get('lambda_max', 0) or 0)
        if lmax > 0: valid = True
        vel = float(td.get('lambda_max_velocity', 0) or 0)
        gap = float(td.get('eigenvalue_gap', 0) or 0)
        inst = float(rs.get('instability_score', 0) or 0)
        rtp = float(rs.get('regime_transition_probability', 0) or 0)
        log_lmax = math.log(max(lmax, 1e-6))  # floor avoids log(0)
        vel_norm = np.clip(vel / (abs(lmax) + EPS), -5, 5)
        gap_ratio = np.clip(gap / (lmax + EPS), 0, 10)
        base = i * 5
        vec[base] = np.float32(np.clip(log_lmax / 10.0, -3, 3))
        vec[base+1] = np.float32(vel_norm)
        vec[base+2] = np.float32(gap_ratio)
        vec[base+3] = np.float32(np.clip(inst, 0, 1))
        vec[base+4] = np.float32(np.clip(rtp, 0, 1))
    return vec, valid


def _tier2(pricing: dict) -> Tuple[np.ndarray, bool]:
    """Tier 2 (50 dims): z-scored per-asset volatility cross-section.

    Takes the first T2 values of pricing['volatility'] (dict insertion
    order), standardises across the cross-section, clips to ±5.
    Returns (vec, valid).
    """
    vec = np.zeros(T2, dtype=np.float32)
    vol = (pricing or {}).get('volatility') or {}
    if not vol:
        return vec, False
    vals = np.array(list(vol.values())[:T2], dtype=np.float32)
    if len(vals) == 0:
        return vec, False
    mu, sd = vals.mean(), vals.std() + EPS
    vals = np.clip((vals - mu) / sd, -5, 5)
    n = min(T2, len(vals))
    vec[:n] = vals[:n]
    return vec, True


def _tier3(exf_lookup: Optional[dict]) -> np.ndarray:
    """Extract ExF Tier-3 vector from per-date indicator dict.

    Each field is divided by its EXF_SCALE entry (default 1.0) and clipped
    to ±10. Missing lookup → all-zero vector (tier masked off downstream).
    """
    vec = np.zeros(T3, dtype=np.float32)
    if not exf_lookup:
        return vec
    for i, field in enumerate(EXF_FIELDS):
        v = exf_lookup.get(field, 0.0) or 0.0
        scale = EXF_SCALE.get(field, 1.0)
        vec[i] = np.float32(np.clip(float(v) / scale, -10, 10))
    return vec


def _tier4(ts) -> np.ndarray:
    """
    EsoF Tier-4: 8 computed esoteric features from timestamp alone.
    Accepts Unix float timestamp OR datetime object.
    No external data needed — all derived from ts.
    """
    import calendar as cal_mod
    from datetime import timezone

    # Normalise to both float-seconds and datetime
    if isinstance(ts, (int, float)):
        ts_f = float(ts)
        # Equivalent to the deprecated datetime.utcfromtimestamp(ts_f):
        # aware UTC conversion, then drop tzinfo to stay naive.
        dt = datetime.fromtimestamp(ts_f, tz=timezone.utc).replace(tzinfo=None)
    else:
        dt = ts
        # NOTE(review): a naive datetime is interpreted in the machine's
        # LOCAL timezone here, while float input above is treated as UTC.
        # If scan timestamps are UTC-naive, moon/retro features shift by the
        # local UTC offset — confirm intent.
        ts_f = dt.timestamp()

    # Moon illumination approx (simplified Meeus formula)
    # JD of Unix epoch (1970-01-01 00:00 UTC) = 2440587.5
    jd = 2440587.5 + ts_f / 86400.0
    D = jd - 2451545.0  # days since J2000.0
    # Moon phase angle (degrees)
    moon_age = (D % 29.53058867) / 29.53058867  # 0=new, 0.5=full
    moon_illum = 0.5 * (1 - math.cos(2 * math.pi * moon_age))

    # Mercury retrograde cycles (~3x/year, each ~21 days) — simplified
    merc_cycle = (D % 115.88) / 115.88
    merc_retro = 1.0 if 0.82 < merc_cycle < 1.0 else 0.0  # last ~18/115 of cycle

    # Fibonacci time: minutes into day
    mins = dt.hour * 60 + dt.minute
    fib_mins = [1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1440]
    dists = [abs(mins - f) for f in fib_mins]
    fib_proximity = 1.0 / (1.0 + min(dists) / 60.0)  # 1=at fib, 0=far

    # Session (0=Asia, 0.33=London, 0.67=NY, 1=Close)
    h = dt.hour + dt.minute / 60.0
    if 0 <= h < 7:
        session = 0.0
    elif 7 <= h < 13:
        session = 0.33
    elif 13 <= h < 21:
        session = 0.67
    else:
        session = 1.0

    # Market cycle position (annual)
    doy = dt.timetuple().tm_yday
    days_in_year = 366 if cal_mod.isleap(dt.year) else 365
    cycle_pos = doy / days_in_year

    # Day of week sin/cos (weekly cycle)
    dow_sin = math.sin(2 * math.pi * dt.weekday() / 7)
    dow_cos = math.cos(2 * math.pi * dt.weekday() / 7)

    return np.array([
        moon_illum,      # lunar phase
        moon_age,        # 0=new, 0.5=full, 1=new
        merc_retro,      # binary: Mercury Rx
        fib_proximity,   # nearness to Fibonacci time
        session,         # liquidity session
        cycle_pos,       # annual cycle position
        dow_sin, dow_cos,  # weekly cycle
    ], dtype=np.float32)


# ── ExF NPZ loader (per-date, cached) ─────────────────────────────────────
class ExFCache:
    """Loads ExF NPZ once per date directory, provides field lookup.

    Caches a single date's {indicator_name: value} dict; any call for the
    same date string returns the cached dict without touching disk.
    """

    def __init__(self, eigen_base: Path):
        self._base = eigen_base
        self._current_date: Optional[str] = None  # date of the cached lookup
        self._lookup: Optional[dict] = None       # None = load failed / no file

    def get(self, date_str: str) -> Optional[dict]:
        """Return the indicator dict for date_str, or None if unavailable.

        Only successfully-fetched, non-zero indicators are kept.
        """
        if date_str == self._current_date:
            return self._lookup
        self._current_date = date_str
        self._lookup = None
        date_dir = self._base / date_str
        # Find ANY __Indicators.npz in this dir
        npz_files = list(date_dir.glob('*__Indicators.npz'))
        if not npz_files:
            return None
        try:
            # Context manager closes the NpzFile's underlying file handle
            # (np.load leaves it open otherwise — a per-date handle leak).
            with np.load(npz_files[0], allow_pickle=True) as d:
                names = list(d['api_names'])
                vals = d['api_indicators']
                ok = d['api_success']
                self._lookup = {n: float(v) for n, v, s in zip(names, vals, ok)
                                if s and float(v) != 0}
        except Exception:
            # Best-effort: a corrupt/missing NPZ simply yields no ExF tier.
            self._lookup = None
        return self._lookup


# ── Streaming generators (memory-efficient) ───────────────────────────────
def _stream_ng1_ng2() -> Iterator[np.ndarray]:
    """Yield TOTAL-dim rows from NG1/NG2 regime_result_*.json files.

    Only Tier 0 (breadth from up/down ratios) and Tier 4 are populated.
    """
    import os
    for ng_dir in [NG1_DIR, NG2_DIR]:
        if not ng_dir.exists():
            continue
        # Use os.scandir (non-sorted) — much faster than sorted(rglob) on 300K+ files
        # NG1/NG2 files are all at the top level
        for entry in os.scandir(str(ng_dir)):
            f = Path(entry.path)
            if not (entry.name.startswith('regime_result_') and entry.name.endswith('.json')):
                continue
            try:
                txt = f.read_text(encoding='utf-8', errors='replace')
                d = json.loads(txt)
                ts = _parse_ts(d.get('timestamp', ''))
                if ts is None:
                    continue
                bull = float(d.get('up_ratio', 0)) * 100
                bear = float(d.get('down_ratio', 0)) * 100
                t0 = _tier0(bull, bear, ts, False)
                t4 = _tier4(ts)
                row = np.zeros(TOTAL, dtype=np.float32)
                row[OFF[0]:OFF[0]+T0] = t0
                row[OFF[4]:OFF[4]+T4] = t4
                yield row
            except Exception:
                continue  # skip unreadable/malformed files, keep streaming


def _stream_ng4() -> Iterator[np.ndarray]:
    """Yield TOTAL-dim rows parsed from NG4 text logs (REGIME STATUS lines).

    Only Tier 0 and Tier 4 are populated.
    """
    if not NG4_DIR.exists():
        return
    log_re = re.compile(
        r'(\d{4}-\d{2}-\d{2}T[\d:.]+Z).*REGIME STATUS: \w+ \| Bull: ([\d.]+)% Bear: ([\d.]+)%'
    )
    for f in sorted(NG4_DIR.glob('*.txt')):
        try:
            for line in f.read_text(encoding='utf-8', errors='replace').splitlines():
                m = log_re.search(line)
                if not m:
                    continue
                # Normalise "2025-01-02T03:04:05Z" → "2025-01-02 03:04:05"
                ts = _parse_ts(m.group(1).replace('T', ' ').rstrip('Z'))
                if ts is None:
                    continue
                bull, bear = float(m.group(2)), float(m.group(3))
                t0 = _tier0(bull, bear, ts, False)
                t4 = _tier4(ts)
                row = np.zeros(TOTAL, dtype=np.float32)
                row[OFF[0]:OFF[0]+T0] = t0
                row[OFF[4]:OFF[4]+T4] = t4
                yield row
        except Exception:
            continue


def _stream_ng5_local() -> Iterator[np.ndarray]:
    """Yield rows from NG5-local scans: Tiers 0, 1, 2, 4 (no ExF NPZ)."""
    import os
    if not NG5_DIR.exists():
        return
    for entry in os.scandir(str(NG5_DIR)):
        f = Path(entry.path)
        if not (entry.name.startswith('regime_result_') and entry.name.endswith('.json')):
            continue
        try:
            d = json.loads(f.read_text(encoding='utf-8', errors='replace'))
            ts = _parse_ts(str(d.get('timestamp', '')))
            if ts is None:
                continue
            bull = float(d.get('bull_pct', 50))
            bear = float(d.get('bear_pct', 50))
            mwr = d.get('multi_window_results') or {}
            pricing = d.get('pricing_data') or {}
            t1, has_eigen = _tier1(mwr)
            t2, has_price = _tier2(pricing)
            t0 = _tier0(bull, bear, ts, has_eigen)
            t4 = _tier4(ts)
            row = np.zeros(TOTAL, dtype=np.float32)
            row[OFF[0]:OFF[0]+T0] = t0
            row[OFF[1]:OFF[1]+T1] = t1
            row[OFF[2]:OFF[2]+T2] = t2
            row[OFF[4]:OFF[4]+T4] = t4
            # No ExF for NG5 local (no companion NPZ per scan)
            yield row
        except Exception:
            continue


def _stream_ng3_scans(exf_cache: ExFCache,
                      date_from: str = '2025-12-31',
                      max_per_day: Optional[int] = None) -> Iterator[np.ndarray]:
    """
    Stream NG3/NG5 scan JSONs one at a time — never accumulates in memory.
    ExF loaded once per date from companion NPZ.
    max_per_day: limit scans per day (subsample for very long training days).

    NOTE(review): the date_from default ('2025-12-31') excludes every earlier
    date directory via lexicographic compare — confirm this cutoff is wanted
    by default rather than a debugging leftover.
    """
    if not NG3_EIGEN.exists():
        return
    date_dirs = sorted(
        d for d in NG3_EIGEN.iterdir()
        if d.is_dir() and not d.name.endswith('_SKIP') and d.name >= date_from
    )
    for date_dir in date_dirs:
        exf = exf_cache.get(date_dir.name)
        t3 = _tier3(exf)  # same ExF vector reused for the whole day
        day_count = 0
        for f in sorted(date_dir.glob('scan_*.json')):
            if '__Indicators' in f.name:
                continue
            if max_per_day and day_count >= max_per_day:
                break
            try:
                # Read and immediately parse — don't accumulate
                txt = f.read_text(encoding='utf-8', errors='replace')
                d = json.loads(txt)
                ts = _parse_ts(str(d.get('timestamp', '')))
                if ts is None:
                    continue
                windows = d.get('windows') or d.get('multi_window_results') or {}
                pricing = d.get('pricing_data') or {}
                pc = pricing.get('price_changes', {})
                if pc:
                    # Breadth derived from the sign of per-asset price changes.
                    vs = list(pc.values())
                    bull = 100.0 * sum(1 for v in vs if float(v) > 0) / max(len(vs), 1)
                    bear = 100.0 * sum(1 for v in vs if float(v) < 0) / max(len(vs), 1)
                else:
                    bull, bear = 50.0, 50.0
                t1, has_eigen = _tier1(windows)
                t2, _ = _tier2(pricing)
                t0 = _tier0(bull, bear, ts, has_eigen)
                t4 = _tier4(ts)
                row = np.zeros(TOTAL, dtype=np.float32)
                row[OFF[0]:OFF[0]+T0] = t0
                row[OFF[1]:OFF[1]+T1] = t1
                row[OFF[2]:OFF[2]+T2] = t2
                row[OFF[3]:OFF[3]+T3] = t3  # same ExF for all scans of this day
                row[OFF[4]:OFF[4]+T4] = t4
                yield row
                day_count += 1
                del d, txt  # explicit release
            except Exception:
                continue


# ── Master corpus builder ──────────────────────────────────────────────────
class DolphinCorpus:
    """
    Unified DOLPHIN corpus across all generations, 5 tiers, 111 dims.

    Attributes:
        X       : (N, 111) float32 — the feature matrix
        mask    : (N, 5)   bool    — [t0, t1_eigen, t2_price, t3_exf, t4_esof]
        sources : (N,)     int8    — 0=NG1/2, 1=NG4, 2=NG5-local, 3=NG3-scan
    """
    DIMS = DIMS
    TOTAL = TOTAL
    OFF = OFF

    def __init__(self):
        self.X = None
        self.mask = None
        self.sources = None

    def build(self,
              ng3_date_from: str = '2025-12-31',
              max_scans_per_day: Optional[int] = None,
              max_per_source: Optional[int] = None,
              max_ng5: int = 3_000,
              chunk_size: int = 50_000,
              verbose: bool = True) -> 'DolphinCorpus':
        """
        Memory-efficient build using streaming generators.
        chunk_size: accumulate this many rows before extending array.
        max_per_source: cap rows from NG1/NG2/NG4 (breadth-only sources).
        max_ng5: separate cap for NG5-local (files are larger, reads ~26/s).

        Returns self (fluent), with X / mask / sources populated.
        """
        print("Building DOLPHIN multi-generation corpus (streaming)...", flush=True)
        exf_cache = ExFCache(NG3_EIGEN)

        # Per-source caps: NG5-local is separately capped (slow reads)
        _caps = {
            0: max_per_source,  # NG1/NG2
            1: max_per_source,  # NG4
            2: max_ng5,         # NG5-local — separate low cap
            3: None,            # NG3-scan — limited by max_scans_per_day
        }
        sources_list = [
            (0, _stream_ng1_ng2(), "NG1/NG2"),
            (1, _stream_ng4(), "NG4"),
            (2, _stream_ng5_local(), "NG5-local"),
            (3, _stream_ng3_scans(exf_cache, ng3_date_from, max_scans_per_day), "NG3-scan"),
        ]

        # buf_* hold at most chunk_size rows; all_chunks holds frozen arrays.
        all_chunks, all_src = [], []
        buf_rows, buf_src = [], []
        total = 0
        for src_id, gen, name in sources_list:
            src_count = 0
            cap = _caps.get(src_id)
            for row in gen:
                buf_rows.append(row)
                buf_src.append(src_id)
                src_count += 1
                total += 1
                if len(buf_rows) >= chunk_size:
                    all_chunks.append(np.array(buf_rows, dtype=np.float32))
                    all_src.extend(buf_src)
                    buf_rows.clear(); buf_src.clear()
                    if verbose:
                        print(f" {name}: {src_count:,} (total so far: {total:,})", flush=True)
                if cap and src_count >= cap:
                    break
            if verbose:
                print(f" {name}: {src_count:,} samples", flush=True)

        # Flush remainder
        if buf_rows:
            all_chunks.append(np.array(buf_rows, dtype=np.float32))
            all_src.extend(buf_src)
        self.X = np.vstack(all_chunks) if all_chunks else np.empty((0, TOTAL), dtype=np.float32)
        self.sources = np.array(all_src, dtype=np.int8)
        # Sanitise in place: NaN/±inf → 0 so downstream training never sees them.
        np.nan_to_num(self.X, copy=False, nan=0.0, posinf=0.0, neginf=0.0)

        # Build mask from has_eigen flag (bit7 of T0) and non-zero tiers
        has_eigen = self.X[:, OFF[0] + 7] > 0.5  # T0[-1]
        has_price = np.any(self.X[:, OFF[2]:OFF[2]+T2] != 0, axis=1)
        has_exf = np.any(self.X[:, OFF[3]:OFF[3]+T3] != 0, axis=1)
        self.mask = np.column_stack([
            np.ones(len(self.X), dtype=bool),  # T0 always
            has_eigen,
            has_price,
            has_exf,
            np.ones(len(self.X), dtype=bool),  # T4 always (computed)
        ])

        if verbose:
            print(f"\nCorpus summary:")
            print(f" Total : {len(self.X):,}")
            print(f" Shape : {self.X.shape} ({self.X.nbytes/1e6:.0f} MB)")
            print(f" T1 eigen : {self.mask[:,1].sum():,} ({100*self.mask[:,1].mean():.1f}%)")
            print(f" T2 price : {self.mask[:,2].sum():,} ({100*self.mask[:,2].mean():.1f}%)")
            print(f" T3 exf : {self.mask[:,3].sum():,} ({100*self.mask[:,3].mean():.1f}%)")
        return self

    def save(self, path: str):
        """Save X/mask/sources to a compressed .npz (suffix added if absent)."""
        p = path if path.endswith('.npz') else path + '.npz'
        np.savez_compressed(p, X=self.X, mask=self.mask, sources=self.sources)
        print(f"Corpus saved: {p} ({self.X.nbytes/1e6:.0f} MB uncompressed, compressed ~10x)")

    @classmethod
    def load(cls, path: str) -> 'DolphinCorpus':
        """Load a corpus previously written by save()."""
        c = cls()
        p = path if path.endswith('.npz') else path + '.npz'
        d = np.load(p)
        c.X, c.mask, c.sources = d['X'], d['mask'], d['sources']
        print(f"Corpus loaded: {len(c.X):,} samples, {c.X.shape[1]} dims")
        return c

    # ── Tier slices (views into X, not copies) ────────────────────────
    def t0(self): return self.X[:, OFF[0]:OFF[0]+T0]
    def t1(self): return self.X[:, OFF[1]:OFF[1]+T1]
    def t2(self): return self.X[:, OFF[2]:OFF[2]+T2]
    def t3(self): return self.X[:, OFF[3]:OFF[3]+T3]
    def t4(self): return self.X[:, OFF[4]:OFF[4]+T4]

    def tier_names(self):
        """Human-readable tier labels, index-aligned with DIMS and mask columns."""
        return ['breadth+time', 'eigenvalues', 'per-asset-vol', 'ExF-macro', 'EsoF']

    def describe(self):
        """Print shape, per-tier mask counts, and per-source sample counts."""
        print(f"Corpus: N={len(self.X):,} dims={TOTAL} ({self.X.nbytes/1e6:.0f}MB)")
        print(f"Tiers: {list(zip(self.tier_names(), DIMS))}")
        print(f"Masks: {[(t, self.mask[:,i].sum()) for i, t in enumerate(self.tier_names())]}")
        src_names = {0: 'NG1/2', 1: 'NG4', 2: 'NG5-local', 3: 'NG3-scan'}
        for sid, name in src_names.items():
            n = (self.sources == sid).sum()
            if n > 0:
                print(f" {name:12s}: {n:,}")


if __name__ == '__main__':
    import sys
    # Optional CLI arg: max NG3 scans per day (subsampling).
    max_per_day = int(sys.argv[1]) if len(sys.argv) > 1 else None
    corpus = DolphinCorpus().build(verbose=True, max_scans_per_day=max_per_day)
    corpus.save(str(HERE / 'corpus_cache'))
    corpus.describe()