# Includes core prod + GREEN/BLUE subsystems:
#   - prod/ (BLUE harness, configs, scripts, docs)
#   - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
#   - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
#   - Observability/ (EsoF advisor, TUI, dashboards)
#   - external_factors/ (EsoF producer)
#   - mc_forewarning_qlabs_fork/ (MC regime/envelope)
# Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
# File stats: 549 lines, 23 KiB, Python, executable file.
"""
DOLPHIN Multi-Generation Corpus Builder (Memory-Efficient, 5-Tier)
==================================================================
Loads ALL available Dolphin data into a unified feature matrix.

TIERS (distinct, layered, can be frozen/trained independently):
  Tier 0 (8 dims)  ALWAYS — breadth (bull/bear), cyclic time, has_eigen flag
  Tier 1 (20 dims) NG3+   — eigenvalue structure: 4 windows × 5 features
  Tier 2 (50 dims) NG3+   — per-asset volatility cross-section (50 symbols)
  Tier 3 (25 dims) NG3+   — ExF macro indicators (dvol, fng, funding, OI, etc.)
  Tier 4 (8 dims)  ALWAYS — EsoF: lunar, fibonacci, session, cycle (computed)

Total: 111 dims. Missing tiers are zero-filled; mask tracks availability.

Memory strategy:
- NEVER accumulate raw JSON dicts — parse → extract → discard immediately
- Buffer rows in fixed-size chunks (chunk_size) and densify incrementally,
  concatenating once at the end (an np.memmap output with a pre-allocated,
  estimated sample count was the original plan; the current implementation
  uses bounded in-RAM chunks instead)
- Per-date ExF NPZ loaded once and reused for all scans of that day
"""
|
||
|
||
import json
|
||
import re
|
||
import math
|
||
import numpy as np
|
||
from pathlib import Path
|
||
from datetime import datetime
|
||
from typing import Optional, Iterator, Tuple
|
||
|
||
# ── Paths ──────────────────────────────────────────────────────────────────
# Absolute roots for each DOLPHIN generation's output. Windows-specific;
# change BASE when relocating the source data directories.
BASE = Path(r"C:\Users\Lenovo\Documents")
NG1_DIR = BASE / "- Dolphin NG"    # NG1: flat dir of regime_result_*.json (breadth only)
NG2_DIR = BASE / "- Dolphin NG2"   # NG2: same layout/schema as NG1
NG4_DIR = BASE / "- DOLPHIN NG4" / "- Results"  # NG4: plain-text *.txt log files
NG5_DIR = BASE / "- Dolphin NG5"   # NG5: richer JSONs (eigen windows + pricing)
NG3_EIGEN = BASE / "- Dolphin NG HD (NG3)" / "correlation_arb512" / "eigenvalues"  # NG3: per-date scan dirs + ExF NPZ
HERE = Path(__file__).parent       # directory of this script (corpus cache lands here)
|
||
|
||
# ── Tier dimensions ────────────────────────────────────────────────────────
# Width of each tier inside the concatenated feature row; OFF holds the
# starting column of every tier, so tier i occupies X[:, OFF[i]:OFF[i]+DIMS[i]].
T0 = 8   # breadth + time + flag
T1 = 20  # eigenvalues (4 windows × 5)
T2 = 50  # per-asset volatility
T3 = 25  # ExF macro indicators
T4 = 8   # EsoF esoteric

DIMS = [T0, T1, T2, T3, T4]
TOTAL = sum(DIMS)  # 111
OFF = [0, T0, T0+T1, T0+T1+T2, T0+T1+T2+T3]  # slice offsets

WINDOWS = [50, 150, 300, 750]  # eigenvalue window lengths expected by Tier 1
EPS = 1e-8                     # numerical guard against divide-by-zero
|
||
|
||
# ── ExF indicator selection (from the 85-field NPZ, keep reliable ones) ───
# Order matters: index i of this list is column i of the Tier-3 vector.
EXF_FIELDS = [
    'dvol_btc', 'dvol_eth',        # implied vol
    'fng', 'fng_prev',             # fear & greed
    'btc_dom', 'eth_dom',          # dominance
    'chg24_btc', 'chg24_eth',      # 24h returns
    'dispersion', 'correlation',   # cross-market
    'imbal_btc', 'imbal_eth',      # OB imbalance
    'funding_btc', 'funding_eth',  # perp funding
    'mvrv',                        # on-chain
    'tvl',                         # DeFi
    'pcr_vol', 'pcr_oi',           # options
    'basis', 'liq_proxy',          # futures
    'spread', 'vol24',             # microstructure
    'hashrate',                    # mining
    'btc_price',                   # price level
    'fng_vol',                     # FnG volatility component
]
# Keep the field list and the Tier-3 width in lock-step.
assert len(EXF_FIELDS) == T3, f"EXF_FIELDS len={len(EXF_FIELDS)} != T3={T3}"
|
||
|
||
# ExF normalisation constants (robust: divide by median absolute scale)
# Fields absent from this dict (e.g. 'fng_vol') fall back to 1.0 in _tier3.
EXF_SCALE = {
    'dvol_btc': 50.0, 'dvol_eth': 50.0,
    'fng': 50.0, 'fng_prev': 50.0,
    'btc_dom': 50.0, 'eth_dom': 10.0,
    'chg24_btc': 5.0, 'chg24_eth': 5.0,
    'dispersion': 5.0, 'correlation': 1.0,
    'imbal_btc': 1.0, 'imbal_eth': 1.0,
    'funding_btc': 0.001, 'funding_eth': 0.001,
    'mvrv': 3.0,
    'tvl': 1e11,
    'pcr_vol': 1.0, 'pcr_oi': 1.0,
    'basis': 0.1, 'liq_proxy': 1.0,
    'spread': 0.01, 'vol24': 1e10,
    'hashrate': 1e9,
    'btc_price': 1e5,
}
|
||
|
||
|
||
# ── Time helpers ───────────────────────────────────────────────────────────
|
||
|
||
def _parse_ts(s: str) -> Optional[datetime]:
|
||
for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S",
|
||
"%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S"):
|
||
try:
|
||
return datetime.strptime(str(s)[:26], fmt)
|
||
except ValueError:
|
||
continue
|
||
return None
|
||
|
||
|
||
def _tier0(bull_pct: float, bear_pct: float, ts: datetime, has_eigen: bool) -> np.ndarray:
|
||
bull = np.clip(bull_pct / 100.0, 0, 1)
|
||
bear = np.clip(bear_pct / 100.0, 0, 1)
|
||
side = max(0.0, 1.0 - bull - bear)
|
||
h = ts.hour + ts.minute / 60.0
|
||
d = ts.weekday()
|
||
return np.array([
|
||
bull, bear, side,
|
||
math.sin(2 * math.pi * h / 24),
|
||
math.cos(2 * math.pi * h / 24),
|
||
math.sin(2 * math.pi * d / 7),
|
||
math.cos(2 * math.pi * d / 7),
|
||
1.0 if has_eigen else 0.0,
|
||
], dtype=np.float32)
|
||
|
||
|
||
def _tier1(windows: dict) -> Tuple[np.ndarray, bool]:
    """Tier-1 (20 dims): eigenvalue structure, 5 features per window.

    `windows` maps window length (int or str key, depending on producer)
    to a dict carrying 'tracking_data' and 'regime_signals'. Windows
    without a positive lambda_max stay zero-filled.
    Returns (vector, True-if-any-window-had-a-positive-lambda_max).
    """
    vec = np.zeros(T1, dtype=np.float32)
    if not windows:
        return vec, False
    valid = False
    for i, w in enumerate(WINDOWS):
        # Keys may be ints or strings depending on the generating code.
        wdata = windows.get(w) or windows.get(str(w)) or {}
        td = wdata.get('tracking_data') or wdata  # fall back to flat schema
        rs = wdata.get('regime_signals') or {}
        lmax = float(td.get('lambda_max', 0) or 0)
        if lmax > 0:
            valid = True
            vel = float(td.get('lambda_max_velocity', 0) or 0)
            gap = float(td.get('eigenvalue_gap', 0) or 0)
            inst = float(rs.get('instability_score', 0) or 0)
            rtp = float(rs.get('regime_transition_probability', 0) or 0)
            log_lmax = math.log(max(lmax, 1e-6))
            # Normalise velocity/gap by lambda_max so features are scale-free.
            vel_norm = np.clip(vel / (abs(lmax) + EPS), -5, 5)
            gap_ratio = np.clip(gap / (lmax + EPS), 0, 10)
            base = i * 5  # 5 feature slots per window
            vec[base] = np.float32(np.clip(log_lmax / 10.0, -3, 3))
            vec[base+1] = np.float32(vel_norm)
            vec[base+2] = np.float32(gap_ratio)
            vec[base+3] = np.float32(np.clip(inst, 0, 1))
            vec[base+4] = np.float32(np.clip(rtp, 0, 1))
    return vec, valid
|
||
|
||
|
||
def _tier2(pricing: dict) -> Tuple[np.ndarray, bool]:
    """Tier-2 (50 dims): per-asset volatility cross-section, z-scored.

    Takes the first T2 entries of pricing['volatility'], standardises
    them against their own mean/std, clips to ±5, and left-fills the
    output vector. Returns (vector, had_any_data).
    """
    out = np.zeros(T2, dtype=np.float32)
    vol_map = (pricing or {}).get('volatility') or {}
    if not vol_map:
        return out, False
    raw = np.array(list(vol_map.values())[:T2], dtype=np.float32)
    if raw.size == 0:
        return out, False
    # Z-score within the cross-section; EPS avoids a zero divisor.
    zscores = np.clip((raw - raw.mean()) / (raw.std() + EPS), -5, 5)
    count = min(T2, raw.size)
    out[:count] = zscores[:count]
    return out, True
|
||
|
||
|
||
def _tier3(exf_lookup: Optional[dict]) -> np.ndarray:
    """Tier-3 (25 dims): scaled ExF macro indicators.

    Each field in EXF_FIELDS is divided by its robust scale constant and
    clipped to ±10; absent or None fields contribute zero. A missing
    lookup (no NPZ for the date) yields an all-zero vector.
    """
    out = np.zeros(T3, dtype=np.float32)
    if not exf_lookup:
        return out
    for idx, name in enumerate(EXF_FIELDS):
        raw = exf_lookup.get(name, 0.0) or 0.0
        divisor = EXF_SCALE.get(name, 1.0)  # unlisted fields pass through unscaled
        out[idx] = np.float32(np.clip(float(raw) / divisor, -10, 10))
    return out
|
||
|
||
|
||
def _tier4(ts) -> np.ndarray:
|
||
"""
|
||
EsoF Tier-4: 8 computed esoteric features from timestamp alone.
|
||
Accepts Unix float timestamp OR datetime object.
|
||
No external data needed — all derived from ts.
|
||
"""
|
||
import calendar as cal_mod
|
||
# Normalise to both float-seconds and datetime
|
||
if isinstance(ts, (int, float)):
|
||
ts_f = float(ts)
|
||
dt = datetime.utcfromtimestamp(ts_f)
|
||
else:
|
||
dt = ts
|
||
ts_f = dt.timestamp()
|
||
# Moon illumination approx (simplified Meeus formula)
|
||
# JD of Unix epoch (1970-01-01 00:00 UTC) = 2440587.5
|
||
jd = 2440587.5 + ts_f / 86400.0
|
||
D = jd - 2451545.0 # days since J2000.0
|
||
# Moon phase angle (degrees)
|
||
moon_age = (D % 29.53058867) / 29.53058867 # 0=new, 0.5=full
|
||
moon_illum = 0.5 * (1 - math.cos(2 * math.pi * moon_age))
|
||
# Mercury retrograde cycles (~3x/year, each ~21 days) — simplified
|
||
merc_cycle = (D % 115.88) / 115.88
|
||
merc_retro = 1.0 if 0.82 < merc_cycle < 1.0 else 0.0 # last ~18/115 of cycle
|
||
# Fibonacci time: minutes into day
|
||
mins = dt.hour * 60 + dt.minute
|
||
fib_mins = [1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1440]
|
||
dists = [abs(mins - f) for f in fib_mins]
|
||
fib_proximity = 1.0 / (1.0 + min(dists) / 60.0) # 1=at fib, 0=far
|
||
# Session (0=Asia, 0.33=London, 0.67=NY, 1=Close)
|
||
h = dt.hour + dt.minute / 60.0
|
||
if 0 <= h < 7: session = 0.0
|
||
elif 7 <= h < 13: session = 0.33
|
||
elif 13 <= h < 21: session = 0.67
|
||
else: session = 1.0
|
||
# Market cycle position (annual)
|
||
doy = dt.timetuple().tm_yday
|
||
days_in_year = 366 if cal_mod.isleap(dt.year) else 365
|
||
cycle_pos = doy / days_in_year
|
||
# Day of week sin/cos (weekly cycle)
|
||
dow_sin = math.sin(2 * math.pi * dt.weekday() / 7)
|
||
dow_cos = math.cos(2 * math.pi * dt.weekday() / 7)
|
||
return np.array([
|
||
moon_illum, # lunar phase
|
||
moon_age, # 0=new, 0.5=full, 1=new
|
||
merc_retro, # binary: Mercury Rx
|
||
fib_proximity, # nearness to Fibonacci time
|
||
session, # liquidity session
|
||
cycle_pos, # annual cycle position
|
||
dow_sin, dow_cos, # weekly cycle
|
||
], dtype=np.float32)
|
||
|
||
|
||
# ── ExF NPZ loader (per-date, cached) ─────────────────────────────────────
|
||
|
||
class ExFCache:
    """Per-date cache for ExF indicator NPZ files.

    Holds at most one date's lookup dict at a time; repeated get() calls
    for the same date reuse it, so each day's NPZ is read only once.
    """

    def __init__(self, eigen_base: Path):
        self._base = eigen_base
        self._current_date: Optional[str] = None
        self._lookup: Optional[dict] = None

    def get(self, date_str: str) -> Optional[dict]:
        """Return {field: value} for date_str, or None when unavailable."""
        if date_str == self._current_date:
            return self._lookup  # cache hit (may legitimately be None)
        self._current_date = date_str
        self._lookup = None
        # Any __Indicators.npz in the date directory will do.
        candidates = list((self._base / date_str).glob('*__Indicators.npz'))
        if not candidates:
            return None
        try:
            npz = np.load(candidates[0], allow_pickle=True)
            field_names = list(npz['api_names'])
            field_values = npz['api_indicators']
            field_ok = npz['api_success']
            # Keep only fields that both succeeded and are non-zero.
            table = {}
            for name, value, ok in zip(field_names, field_values, field_ok):
                if ok and float(value) != 0:
                    table[name] = float(value)
            self._lookup = table
        except Exception:
            # Corrupt or partial NPZ: treat the whole date as missing.
            self._lookup = None
        return self._lookup
|
||
|
||
|
||
# ── Streaming generators (memory-efficient) ───────────────────────────────
|
||
|
||
def _stream_ng1_ng2() -> Iterator[np.ndarray]:
    """Yield feature rows from NG1/NG2 regime_result_*.json files.

    Breadth-only sources: only Tier 0 and Tier 4 are populated; other
    tiers stay zero. Unreadable or malformed files are skipped silently.
    """
    import os
    for ng_dir in [NG1_DIR, NG2_DIR]:
        if not ng_dir.exists():
            continue
        # os.scandir (unsorted) is far faster than sorted(rglob) on the
        # 300K+ flat files these directories hold; all files are top-level.
        for entry in os.scandir(str(ng_dir)):
            fname = entry.name
            if not fname.startswith('regime_result_') or not fname.endswith('.json'):
                continue
            try:
                payload = json.loads(
                    Path(entry.path).read_text(encoding='utf-8', errors='replace'))
                stamp = _parse_ts(payload.get('timestamp', ''))
                if stamp is None:
                    continue
                # Ratios in [0, 1] → breadth percentages.
                bull_pct = float(payload.get('up_ratio', 0)) * 100
                bear_pct = float(payload.get('down_ratio', 0)) * 100
                row = np.zeros(TOTAL, dtype=np.float32)
                row[OFF[0]:OFF[0] + T0] = _tier0(bull_pct, bear_pct, stamp, False)
                row[OFF[4]:OFF[4] + T4] = _tier4(stamp)
                yield row
            except Exception:
                continue
|
||
|
||
|
||
def _stream_ng4() -> Iterator[np.ndarray]:
    """Yield feature rows parsed out of NG4 plain-text result logs.

    Each matching log line carries an ISO timestamp plus Bull/Bear
    breadth percentages; only Tiers 0 and 4 are populated.
    """
    if not NG4_DIR.exists():
        return
    status_re = re.compile(
        r'(\d{4}-\d{2}-\d{2}T[\d:.]+Z).*REGIME STATUS: \w+ \| Bull: ([\d.]+)% Bear: ([\d.]+)%'
    )
    for log_path in sorted(NG4_DIR.glob('*.txt')):
        try:
            for line in log_path.read_text(encoding='utf-8', errors='replace').splitlines():
                hit = status_re.search(line)
                if hit is None:
                    continue
                # Normalise "T...Z" into the space-separated form _parse_ts accepts.
                stamp = _parse_ts(hit.group(1).replace('T', ' ').rstrip('Z'))
                if stamp is None:
                    continue
                row = np.zeros(TOTAL, dtype=np.float32)
                row[OFF[0]:OFF[0] + T0] = _tier0(
                    float(hit.group(2)), float(hit.group(3)), stamp, False)
                row[OFF[4]:OFF[4] + T4] = _tier4(stamp)
                yield row
        except Exception:
            # Skip the rest of a file that fails to read or parse.
            continue
|
||
|
||
|
||
def _stream_ng5_local() -> Iterator[np.ndarray]:
    """Yield feature rows from NG5-local regime_result_*.json files.

    NG5 scans carry eigen windows and pricing data, so Tiers 0/1/2/4 are
    populated; Tier 3 stays zero — there is no companion ExF NPZ per scan.
    """
    import os
    if not NG5_DIR.exists():
        return
    for entry in os.scandir(str(NG5_DIR)):
        fname = entry.name
        if not (fname.startswith('regime_result_') and fname.endswith('.json')):
            continue
        try:
            payload = json.loads(
                Path(entry.path).read_text(encoding='utf-8', errors='replace'))
            stamp = _parse_ts(str(payload.get('timestamp', '')))
            if stamp is None:
                continue
            eigen_vec, eigen_ok = _tier1(payload.get('multi_window_results') or {})
            price_vec, _ = _tier2(payload.get('pricing_data') or {})
            row = np.zeros(TOTAL, dtype=np.float32)
            row[OFF[0]:OFF[0] + T0] = _tier0(
                float(payload.get('bull_pct', 50)),
                float(payload.get('bear_pct', 50)),
                stamp, eigen_ok)
            row[OFF[1]:OFF[1] + T1] = eigen_vec
            row[OFF[2]:OFF[2] + T2] = price_vec
            row[OFF[4]:OFF[4] + T4] = _tier4(stamp)
            yield row
        except Exception:
            continue
|
||
|
||
|
||
def _stream_ng3_scans(exf_cache: ExFCache,
                      date_from: str = '2025-12-31',
                      max_per_day: Optional[int] = None) -> Iterator[np.ndarray]:
    """
    Stream NG3/NG5 scan JSONs one at a time — never accumulates in memory.
    ExF loaded once per date from companion NPZ.
    max_per_day: limit scans per day (subsample for very long training days).
    """
    if not NG3_EIGEN.exists():
        return
    # Date dirs are named YYYY-MM-DD, so lexicographic >= is a correct
    # chronological filter; *_SKIP dirs are deliberately excluded.
    date_dirs = sorted(
        d for d in NG3_EIGEN.iterdir()
        if d.is_dir() and not d.name.endswith('_SKIP') and d.name >= date_from
    )
    for date_dir in date_dirs:
        # One ExF lookup + Tier-3 vector per day, shared across all scans.
        exf = exf_cache.get(date_dir.name)
        t3 = _tier3(exf)
        day_count = 0
        for f in sorted(date_dir.glob('scan_*.json')):
            if '__Indicators' in f.name:
                continue
            if max_per_day and day_count >= max_per_day:
                break
            try:
                # Read and immediately parse — don't accumulate
                txt = f.read_text(encoding='utf-8', errors='replace')
                d = json.loads(txt)
                ts = _parse_ts(str(d.get('timestamp', '')))
                if ts is None:
                    continue
                # Two possible schemas for the eigen windows.
                windows = d.get('windows') or d.get('multi_window_results') or {}
                pricing = d.get('pricing_data') or {}
                pc = pricing.get('price_changes', {})
                if pc:
                    # Derive breadth from the sign of per-symbol price changes.
                    vs = list(pc.values())
                    bull = 100.0 * sum(1 for v in vs if float(v) > 0) / max(len(vs), 1)
                    bear = 100.0 * sum(1 for v in vs if float(v) < 0) / max(len(vs), 1)
                else:
                    # No price data: neutral breadth.
                    bull, bear = 50.0, 50.0
                t1, has_eigen = _tier1(windows)
                t2, _ = _tier2(pricing)
                t0 = _tier0(bull, bear, ts, has_eigen)
                t4 = _tier4(ts)
                row = np.zeros(TOTAL, dtype=np.float32)
                row[OFF[0]:OFF[0]+T0] = t0
                row[OFF[1]:OFF[1]+T1] = t1
                row[OFF[2]:OFF[2]+T2] = t2
                row[OFF[3]:OFF[3]+T3] = t3  # same ExF for all scans of this day
                row[OFF[4]:OFF[4]+T4] = t4
                yield row
                day_count += 1  # counted only for successfully yielded scans
                del d, txt  # explicit release before reading the next file
            except Exception:
                continue
|
||
|
||
|
||
# ── Master corpus builder ──────────────────────────────────────────────────
|
||
|
||
class DolphinCorpus:
    """
    Unified DOLPHIN corpus across all generations, 5 tiers, 111 dims.

    Attributes:
        X : (N, 111) float32 — the feature matrix
        mask : (N, 5) bool — [t0, t1_eigen, t2_price, t3_exf, t4_esof]
        sources : (N,) int8 — 0=NG1/2, 1=NG4, 2=NG5-local, 3=NG3-scan
    """

    # Re-exported layout constants so corpus consumers don't need to
    # import this module's globals.
    DIMS = DIMS
    TOTAL = TOTAL
    OFF = OFF

    def __init__(self):
        # All three populated by build() or load(); None until then.
        self.X = None
        self.mask = None
        self.sources = None

    def build(self,
              ng3_date_from: str = '2025-12-31',
              max_scans_per_day: Optional[int] = None,
              max_per_source: Optional[int] = None,
              max_ng5: int = 3_000,
              chunk_size: int = 50_000,
              verbose: bool = True) -> 'DolphinCorpus':
        """
        Memory-efficient build using streaming generators.
        chunk_size: accumulate this many rows before extending array.
        max_per_source: cap rows from NG1/NG2/NG4 (breadth-only sources).
        max_ng5: separate cap for NG5-local (files are larger, reads ~26/s).
        """
        print("Building DOLPHIN multi-generation corpus (streaming)...", flush=True)
        exf_cache = ExFCache(NG3_EIGEN)

        # Per-source caps: NG5-local is separately capped (slow reads)
        _caps = {
            0: max_per_source,  # NG1/NG2
            1: max_per_source,  # NG4
            2: max_ng5,         # NG5-local — separate low cap
            3: None,            # NG3-scan — limited by max_scans_per_day
        }

        # Generators are created lazily here and consumed one row at a time.
        sources_list = [
            (0, _stream_ng1_ng2(), "NG1/NG2"),
            (1, _stream_ng4(), "NG4"),
            (2, _stream_ng5_local(), "NG5-local"),
            (3, _stream_ng3_scans(exf_cache, ng3_date_from, max_scans_per_day), "NG3-scan"),
        ]

        all_chunks, all_src = [], []   # densified chunks + per-row source ids
        buf_rows, buf_src = [], []     # current in-progress row buffer
        total = 0

        for src_id, gen, name in sources_list:
            src_count = 0
            cap = _caps.get(src_id)
            for row in gen:
                buf_rows.append(row)
                buf_src.append(src_id)
                src_count += 1
                total += 1
                # Densify periodically so peak memory stays bounded by
                # roughly chunk_size rows of Python-object overhead.
                if len(buf_rows) >= chunk_size:
                    all_chunks.append(np.array(buf_rows, dtype=np.float32))
                    all_src.extend(buf_src)
                    buf_rows.clear(); buf_src.clear()
                    if verbose:
                        print(f" {name}: {src_count:,} (total so far: {total:,})", flush=True)
                if cap and src_count >= cap:
                    break
            if verbose:
                print(f" {name}: {src_count:,} samples", flush=True)

        # Flush remainder
        if buf_rows:
            all_chunks.append(np.array(buf_rows, dtype=np.float32))
            all_src.extend(buf_src)

        self.X = np.vstack(all_chunks) if all_chunks else np.empty((0, TOTAL), dtype=np.float32)
        self.sources = np.array(all_src, dtype=np.int8)
        # Sanitize in place: NaN/inf from malformed inputs become 0.
        np.nan_to_num(self.X, copy=False, nan=0.0, posinf=0.0, neginf=0.0)

        # Build mask from has_eigen flag (bit7 of T0) and non-zero tiers
        has_eigen = self.X[:, OFF[0] + 7] > 0.5  # T0[-1]
        has_price = np.any(self.X[:, OFF[2]:OFF[2]+T2] != 0, axis=1)
        has_exf = np.any(self.X[:, OFF[3]:OFF[3]+T3] != 0, axis=1)
        self.mask = np.column_stack([
            np.ones(len(self.X), dtype=bool),  # T0 always
            has_eigen,
            has_price,
            has_exf,
            np.ones(len(self.X), dtype=bool),  # T4 always (computed)
        ])

        if verbose:
            print(f"\nCorpus summary:")
            print(f" Total : {len(self.X):,}")
            print(f" Shape : {self.X.shape} ({self.X.nbytes/1e6:.0f} MB)")
            print(f" T1 eigen : {self.mask[:,1].sum():,} ({100*self.mask[:,1].mean():.1f}%)")
            print(f" T2 price : {self.mask[:,2].sum():,} ({100*self.mask[:,2].mean():.1f}%)")
            print(f" T3 exf : {self.mask[:,3].sum():,} ({100*self.mask[:,3].mean():.1f}%)")
        return self

    def save(self, path: str):
        """Persist X/mask/sources to a compressed NPZ ('.npz' appended if absent)."""
        p = path if path.endswith('.npz') else path + '.npz'
        np.savez_compressed(p, X=self.X, mask=self.mask, sources=self.sources)
        print(f"Corpus saved: {p} ({self.X.nbytes/1e6:.0f} MB uncompressed, compressed ~10x)")

    @classmethod
    def load(cls, path: str) -> 'DolphinCorpus':
        """Load a corpus previously written by save()."""
        c = cls()
        p = path if path.endswith('.npz') else path + '.npz'
        d = np.load(p)
        c.X, c.mask, c.sources = d['X'], d['mask'], d['sources']
        print(f"Corpus loaded: {len(c.X):,} samples, {c.X.shape[1]} dims")
        return c

    # ── Tier slices ─────────────────────────────────────────────────────
    # Views (not copies) into the corresponding tier columns of X.
    def t0(self): return self.X[:, OFF[0]:OFF[0]+T0]
    def t1(self): return self.X[:, OFF[1]:OFF[1]+T1]
    def t2(self): return self.X[:, OFF[2]:OFF[2]+T2]
    def t3(self): return self.X[:, OFF[3]:OFF[3]+T3]
    def t4(self): return self.X[:, OFF[4]:OFF[4]+T4]

    def tier_names(self):
        """Human-readable tier labels, index-aligned with DIMS/OFF/mask columns."""
        return ['breadth+time', 'eigenvalues', 'per-asset-vol', 'ExF-macro', 'EsoF']

    def describe(self):
        """Print a one-screen summary: size, tier dims, mask counts, per-source counts."""
        print(f"Corpus: N={len(self.X):,} dims={TOTAL} ({self.X.nbytes/1e6:.0f}MB)")
        print(f"Tiers: {list(zip(self.tier_names(), DIMS))}")
        print(f"Masks: {[(t, self.mask[:,i].sum()) for i, t in enumerate(self.tier_names())]}")
        src_names = {0: 'NG1/2', 1: 'NG4', 2: 'NG5-local', 3: 'NG3-scan'}
        for sid, name in src_names.items():
            n = (self.sources == sid).sum()
            if n > 0:
                print(f" {name:12s}: {n:,}")
|
||
|
||
|
||
if __name__ == '__main__':
    import sys
    # Optional argv[1]: cap on scans per NG3 day (subsampling knob).
    cap_args = sys.argv[1:2]
    per_day_cap = int(cap_args[0]) if cap_args else None
    corpus = DolphinCorpus().build(verbose=True, max_scans_per_day=per_day_cap)
    corpus.save(str(HERE / 'corpus_cache'))
    corpus.describe()
|