Files
DOLPHIN/nautilus_dolphin/dvae/corpus_builder.py
hjnormey 01c19662cb initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
2026-04-21 16:58:38 +02:00

549 lines
23 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
DOLPHIN Multi-Generation Corpus Builder (Memory-Efficient, 5-Tier)
====================================================================
Loads ALL available Dolphin data into a unified feature matrix.
TIERS (distinct, layered, can be frozen/trained independently):
Tier 0 (8 dims) ALWAYS — breadth (bull/bear), cyclic time, has_eigen flag
Tier 1 (20 dims) NG3+ — eigenvalue structure: 4 windows × 5 features
Tier 2 (50 dims) NG3+ — per-asset volatility cross-section (50 symbols)
Tier 3 (25 dims) NG3+ — ExF macro indicators (dvol, fng, funding, OI, etc.)
Tier 4 (8 dims) ALWAYS — EsoF: lunar, fibonacci, session, cycle (computed)
Total: 111 dims. Missing tiers are zero-filled; mask tracks availability.
Memory strategy:
- NEVER accumulate raw JSON dicts — parse → extract → discard immediately
- Buffer rows in fixed-size chunks, converted to float32 arrays as they fill
- Per-date ExF NPZ loaded once and reused for all scans of that day
- Pre-allocate output array based on estimated sample count
"""
import json
import re
import math
import numpy as np
from pathlib import Path
from datetime import datetime
from typing import Optional, Iterator, Tuple
# ── Paths ──────────────────────────────────────────────────────────────────
# NOTE(review): machine-specific absolute Windows path — confirm before
# running this module on any other host.
BASE = Path(r"C:\Users\Lenovo\Documents")
NG1_DIR = BASE / "- Dolphin NG"
NG2_DIR = BASE / "- Dolphin NG2"
NG4_DIR = BASE / "- DOLPHIN NG4" / "- Results"
NG5_DIR = BASE / "- Dolphin NG5"
NG3_EIGEN = BASE / "- Dolphin NG HD (NG3)" / "correlation_arb512" / "eigenvalues"
HERE = Path(__file__).parent  # module directory; default corpus-cache target
# ── Tier dimensions ────────────────────────────────────────────────────────
T0 = 8 # breadth + time + flag
T1 = 20 # eigenvalues (4 windows × 5)
T2 = 50 # per-asset volatility
T3 = 25 # ExF macro indicators
T4 = 8 # EsoF esoteric
DIMS = [T0, T1, T2, T3, T4]
TOTAL = sum(DIMS) # 111
OFF = [0, T0, T0+T1, T0+T1+T2, T0+T1+T2+T3] # slice offsets
WINDOWS = [50, 150, 300, 750]  # correlation window lengths consumed by _tier1
EPS = 1e-8  # division guard for ratio features
# ── ExF indicator selection (from the 85-field NPZ, keep reliable ones) ───
EXF_FIELDS = [
    'dvol_btc', 'dvol_eth', # implied vol
    'fng', 'fng_prev', # fear & greed
    'btc_dom', 'eth_dom', # dominance
    'chg24_btc', 'chg24_eth', # 24h returns
    'dispersion', 'correlation', # cross-market
    'imbal_btc', 'imbal_eth', # OB imbalance
    'funding_btc', 'funding_eth', # perp funding
    'mvrv', # on-chain
    'tvl', # DeFi
    'pcr_vol', 'pcr_oi', # options
    'basis', 'liq_proxy', # futures
    'spread', 'vol24', # microstructure
    'hashrate', # mining
    'btc_price', # price level
    'fng_vol', # FnG volatility component
]
# Import-time sanity check: field list must exactly fill the Tier-3 slice.
assert len(EXF_FIELDS) == T3, f"EXF_FIELDS len={len(EXF_FIELDS)} != T3={T3}"
# ExF normalisation constants (robust: divide by median absolute scale)
# NOTE: 'fng_vol' has no entry here — _tier3 falls back to a 1.0 divisor.
EXF_SCALE = {
    'dvol_btc': 50.0, 'dvol_eth': 50.0,
    'fng': 50.0, 'fng_prev': 50.0,
    'btc_dom': 50.0, 'eth_dom': 10.0,
    'chg24_btc': 5.0, 'chg24_eth': 5.0,
    'dispersion': 5.0, 'correlation': 1.0,
    'imbal_btc': 1.0, 'imbal_eth': 1.0,
    'funding_btc': 0.001, 'funding_eth': 0.001,
    'mvrv': 3.0,
    'tvl': 1e11,
    'pcr_vol': 1.0, 'pcr_oi': 1.0,
    'basis': 0.1, 'liq_proxy': 1.0,
    'spread': 0.01, 'vol24': 1e10,
    'hashrate': 1e9,
    'btc_price': 1e5,
}
# ── Time helpers ───────────────────────────────────────────────────────────
def _parse_ts(s: str) -> Optional[datetime]:
for fmt in ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S",
"%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S"):
try:
return datetime.strptime(str(s)[:26], fmt)
except ValueError:
continue
return None
def _tier0(bull_pct: float, bear_pct: float, ts: datetime, has_eigen: bool) -> np.ndarray:
bull = np.clip(bull_pct / 100.0, 0, 1)
bear = np.clip(bear_pct / 100.0, 0, 1)
side = max(0.0, 1.0 - bull - bear)
h = ts.hour + ts.minute / 60.0
d = ts.weekday()
return np.array([
bull, bear, side,
math.sin(2 * math.pi * h / 24),
math.cos(2 * math.pi * h / 24),
math.sin(2 * math.pi * d / 7),
math.cos(2 * math.pi * d / 7),
1.0 if has_eigen else 0.0,
], dtype=np.float32)
def _tier1(windows: dict) -> Tuple[np.ndarray, bool]:
    """Tier-1 (20 dims): eigenvalue structure, 5 features per window.

    For each of the 4 correlation WINDOWS extracts: clipped log of
    lambda_max, velocity normalised by |lambda_max|, gap ratio,
    instability score and regime-transition probability.

    Fix: windows whose lambda_max is absent or non-positive are now
    skipped entirely so their 5-slot span stays zero-filled (matching
    the module's "missing data is zero-filled" contract). Previously
    the log(1e-6) fallback wrote ~-1.38 into slot 0 — and any stray
    regime-signal values into slots 3-4 — for every missing window.

    Returns:
        (vec, valid) — valid is True iff at least one window carried a
        positive lambda_max.
    """
    vec = np.zeros(T1, dtype=np.float32)
    if not windows:
        return vec, False
    valid = False
    for i, w in enumerate(WINDOWS):
        # Keys may be int or str depending on the producing generation.
        wdata = windows.get(w) or windows.get(str(w)) or {}
        td = wdata.get('tracking_data') or wdata
        rs = wdata.get('regime_signals') or {}
        lmax = float(td.get('lambda_max', 0) or 0)
        if lmax <= 0:
            continue  # missing/invalid window: leave its span zeroed
        valid = True
        vel = float(td.get('lambda_max_velocity', 0) or 0)
        gap = float(td.get('eigenvalue_gap', 0) or 0)
        inst = float(rs.get('instability_score', 0) or 0)
        rtp = float(rs.get('regime_transition_probability', 0) or 0)
        log_lmax = math.log(max(lmax, 1e-6))
        vel_norm = np.clip(vel / (abs(lmax) + EPS), -5, 5)
        gap_ratio = np.clip(gap / (lmax + EPS), 0, 10)
        base = i * 5
        vec[base] = np.float32(np.clip(log_lmax / 10.0, -3, 3))
        vec[base+1] = np.float32(vel_norm)
        vec[base+2] = np.float32(gap_ratio)
        vec[base+3] = np.float32(np.clip(inst, 0, 1))
        vec[base+4] = np.float32(np.clip(rtp, 0, 1))
    return vec, valid
def _tier2(pricing: dict) -> Tuple[np.ndarray, bool]:
    """Tier-2 (50 dims): z-scored per-asset volatility cross-section.

    Takes the first T2 values of pricing['volatility'], z-scores them
    (EPS-guarded), clips to ±5 and writes them at the front of the
    vector; unused slots stay zero. Returns (vec, has_data).
    """
    out = np.zeros(T2, dtype=np.float32)
    vol_map = (pricing or {}).get('volatility') or {}
    if not vol_map:
        return out, False
    raw = np.array(list(vol_map.values())[:T2], dtype=np.float32)
    if raw.size == 0:
        return out, False
    zscored = np.clip((raw - raw.mean()) / (raw.std() + EPS), -5, 5)
    out[:zscored.size] = zscored
    return out, True
def _tier3(exf_lookup: Optional[dict]) -> np.ndarray:
    """Tier-3 (25 dims): scaled ExF macro indicators.

    Each EXF_FIELDS entry is divided by its EXF_SCALE constant (1.0
    when no constant is defined) and clipped to ±10. Absent or falsy
    values contribute zero; a None/empty lookup yields an all-zero vec.
    """
    out = np.zeros(T3, dtype=np.float32)
    if not exf_lookup:
        return out
    for idx, name in enumerate(EXF_FIELDS):
        raw = exf_lookup.get(name, 0.0) or 0.0
        divisor = EXF_SCALE.get(name, 1.0)
        out[idx] = np.float32(np.clip(float(raw) / divisor, -10, 10))
    return out
def _tier4(ts) -> np.ndarray:
"""
EsoF Tier-4: 8 computed esoteric features from timestamp alone.
Accepts Unix float timestamp OR datetime object.
No external data needed — all derived from ts.
"""
import calendar as cal_mod
# Normalise to both float-seconds and datetime
if isinstance(ts, (int, float)):
ts_f = float(ts)
dt = datetime.utcfromtimestamp(ts_f)
else:
dt = ts
ts_f = dt.timestamp()
# Moon illumination approx (simplified Meeus formula)
# JD of Unix epoch (1970-01-01 00:00 UTC) = 2440587.5
jd = 2440587.5 + ts_f / 86400.0
D = jd - 2451545.0 # days since J2000.0
# Moon phase angle (degrees)
moon_age = (D % 29.53058867) / 29.53058867 # 0=new, 0.5=full
moon_illum = 0.5 * (1 - math.cos(2 * math.pi * moon_age))
# Mercury retrograde cycles (~3x/year, each ~21 days) — simplified
merc_cycle = (D % 115.88) / 115.88
merc_retro = 1.0 if 0.82 < merc_cycle < 1.0 else 0.0 # last ~18/115 of cycle
# Fibonacci time: minutes into day
mins = dt.hour * 60 + dt.minute
fib_mins = [1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1440]
dists = [abs(mins - f) for f in fib_mins]
fib_proximity = 1.0 / (1.0 + min(dists) / 60.0) # 1=at fib, 0=far
# Session (0=Asia, 0.33=London, 0.67=NY, 1=Close)
h = dt.hour + dt.minute / 60.0
if 0 <= h < 7: session = 0.0
elif 7 <= h < 13: session = 0.33
elif 13 <= h < 21: session = 0.67
else: session = 1.0
# Market cycle position (annual)
doy = dt.timetuple().tm_yday
days_in_year = 366 if cal_mod.isleap(dt.year) else 365
cycle_pos = doy / days_in_year
# Day of week sin/cos (weekly cycle)
dow_sin = math.sin(2 * math.pi * dt.weekday() / 7)
dow_cos = math.cos(2 * math.pi * dt.weekday() / 7)
return np.array([
moon_illum, # lunar phase
moon_age, # 0=new, 0.5=full, 1=new
merc_retro, # binary: Mercury Rx
fib_proximity, # nearness to Fibonacci time
session, # liquidity session
cycle_pos, # annual cycle position
dow_sin, dow_cos, # weekly cycle
], dtype=np.float32)
# ── ExF NPZ loader (per-date, cached) ─────────────────────────────────────
class ExFCache:
    """Per-date cache of ExF indicator lookups.

    Loads the first ``*__Indicators.npz`` found under ``<base>/<date>``
    once per requested date and keeps it until a different date is
    asked for. Only indicators flagged successful with a non-zero value
    are kept in the lookup; anything else yields None.
    """
    def __init__(self, eigen_base: Path):
        self._base = eigen_base
        self._current_date: Optional[str] = None
        self._lookup: Optional[dict] = None
    def get(self, date_str: str) -> Optional[dict]:
        """Return {field: value} for date_str, or None if unavailable."""
        if date_str != self._current_date:
            self._current_date = date_str
            self._lookup = self._load(date_str)
        return self._lookup
    def _load(self, date_str: str) -> Optional[dict]:
        # One NPZ per day is expected; any file matching the suffix will do.
        matches = list((self._base / date_str).glob('*__Indicators.npz'))
        if not matches:
            return None
        try:
            data = np.load(matches[0], allow_pickle=True)
            fields = list(data['api_names'])
            values = data['api_indicators']
            success = data['api_success']
            return {
                name: float(val)
                for name, val, ok in zip(fields, values, success)
                if ok and float(val) != 0
            }
        except Exception:
            return None
# ── Streaming generators (memory-efficient) ───────────────────────────────
def _stream_ng1_ng2() -> Iterator[np.ndarray]:
    """Yield 111-dim rows from NG1/NG2 regime_result_*.json.

    These generations only carry breadth ratios, so only Tier 0 and the
    computed Tier 4 are populated; everything else stays zero.
    Unreadable/unparseable files are skipped silently.
    """
    import os
    for root in (NG1_DIR, NG2_DIR):
        if not root.exists():
            continue
        # os.scandir (unsorted) — much faster than sorted(rglob) on 300K+
        # files; NG1/NG2 keep everything at the top level.
        for entry in os.scandir(str(root)):
            fname = entry.name
            if not fname.startswith('regime_result_') or not fname.endswith('.json'):
                continue
            try:
                payload = json.loads(
                    Path(entry.path).read_text(encoding='utf-8', errors='replace'))
                stamp = _parse_ts(payload.get('timestamp', ''))
                if stamp is None:
                    continue
                bull_pct = float(payload.get('up_ratio', 0)) * 100
                bear_pct = float(payload.get('down_ratio', 0)) * 100
                row = np.zeros(TOTAL, dtype=np.float32)
                row[OFF[0]:OFF[0]+T0] = _tier0(bull_pct, bear_pct, stamp, False)
                row[OFF[4]:OFF[4]+T4] = _tier4(stamp)
                yield row
            except Exception:
                continue
def _stream_ng4() -> Iterator[np.ndarray]:
    """Yield 111-dim rows parsed from NG4 plain-text result logs.

    Each log line matching the REGIME STATUS pattern contributes one
    row (Tier 0 + Tier 4 only). Any exception while processing a file
    skips the remainder of that file.
    """
    if not NG4_DIR.exists():
        return
    status_re = re.compile(
        r'(\d{4}-\d{2}-\d{2}T[\d:.]+Z).*REGIME STATUS: \w+ \| Bull: ([\d.]+)% Bear: ([\d.]+)%'
    )
    for log_file in sorted(NG4_DIR.glob('*.txt')):
        try:
            for line in log_file.read_text(encoding='utf-8', errors='replace').splitlines():
                hit = status_re.search(line)
                if hit is None:
                    continue
                # Normalise "...T...Z" to the space-separated form _parse_ts knows.
                stamp = _parse_ts(hit.group(1).replace('T', ' ').rstrip('Z'))
                if stamp is None:
                    continue
                row = np.zeros(TOTAL, dtype=np.float32)
                row[OFF[0]:OFF[0]+T0] = _tier0(
                    float(hit.group(2)), float(hit.group(3)), stamp, False)
                row[OFF[4]:OFF[4]+T4] = _tier4(stamp)
                yield row
        except Exception:
            continue
def _stream_ng5_local() -> Iterator[np.ndarray]:
    """Yield 111-dim rows from NG5-local regime_result_*.json files.

    Populates Tiers 0/1/2/4; Tier 3 stays zero because NG5-local scans
    carry no companion ExF NPZ. Unreadable files are skipped silently.
    """
    import os
    if not NG5_DIR.exists():
        return
    for entry in os.scandir(str(NG5_DIR)):
        fname = entry.name
        if not fname.startswith('regime_result_') or not fname.endswith('.json'):
            continue
        try:
            doc = json.loads(Path(entry.path).read_text(encoding='utf-8', errors='replace'))
            stamp = _parse_ts(str(doc.get('timestamp', '')))
            if stamp is None:
                continue
            eigen_vec, got_eigen = _tier1(doc.get('multi_window_results') or {})
            price_vec, _ = _tier2(doc.get('pricing_data') or {})
            row = np.zeros(TOTAL, dtype=np.float32)
            row[OFF[0]:OFF[0]+T0] = _tier0(
                float(doc.get('bull_pct', 50)), float(doc.get('bear_pct', 50)),
                stamp, got_eigen)
            row[OFF[1]:OFF[1]+T1] = eigen_vec
            row[OFF[2]:OFF[2]+T2] = price_vec
            row[OFF[4]:OFF[4]+T4] = _tier4(stamp)
            yield row
        except Exception:
            continue
def _stream_ng3_scans(exf_cache: ExFCache,
                      date_from: str = '2025-12-31',
                      max_per_day: Optional[int] = None) -> Iterator[np.ndarray]:
    """
    Stream NG3/NG5 scan JSONs one at a time — never accumulates in memory.
    ExF loaded once per date from companion NPZ.
    max_per_day: limit scans per day (subsample for very long training days).
    """
    if not NG3_EIGEN.exists():
        return
    day_dirs = sorted(
        p for p in NG3_EIGEN.iterdir()
        if p.is_dir() and not p.name.endswith('_SKIP') and p.name >= date_from
    )
    for day_dir in day_dirs:
        # One Tier-3 vector per calendar day, shared by every scan in it.
        exf_vec = _tier3(exf_cache.get(day_dir.name))
        emitted = 0
        for scan_file in sorted(day_dir.glob('scan_*.json')):
            if '__Indicators' in scan_file.name:
                continue
            if max_per_day and emitted >= max_per_day:
                break
            try:
                # Read and immediately parse — never accumulate raw text.
                raw = scan_file.read_text(encoding='utf-8', errors='replace')
                doc = json.loads(raw)
                stamp = _parse_ts(str(doc.get('timestamp', '')))
                if stamp is None:
                    continue
                window_data = doc.get('windows') or doc.get('multi_window_results') or {}
                pricing = doc.get('pricing_data') or {}
                changes = pricing.get('price_changes', {})
                if changes:
                    # Derive breadth from the sign of per-asset price changes.
                    moves = list(changes.values())
                    denom = max(len(moves), 1)
                    bull = 100.0 * sum(1 for m in moves if float(m) > 0) / denom
                    bear = 100.0 * sum(1 for m in moves if float(m) < 0) / denom
                else:
                    bull, bear = 50.0, 50.0
                eigen_vec, got_eigen = _tier1(window_data)
                price_vec, _ = _tier2(pricing)
                row = np.zeros(TOTAL, dtype=np.float32)
                row[OFF[0]:OFF[0]+T0] = _tier0(bull, bear, stamp, got_eigen)
                row[OFF[1]:OFF[1]+T1] = eigen_vec
                row[OFF[2]:OFF[2]+T2] = price_vec
                row[OFF[3]:OFF[3]+T3] = exf_vec
                row[OFF[4]:OFF[4]+T4] = _tier4(stamp)
                yield row
                emitted += 1
                del doc, raw  # explicit release before the next read
            except Exception:
                continue
# ── Master corpus builder ──────────────────────────────────────────────────
class DolphinCorpus:
    """
    Unified DOLPHIN corpus across all generations, 5 tiers, 111 dims.
    Attributes:
    X : (N, 111) float32 — the feature matrix
    mask : (N, 5) bool — [t0, t1_eigen, t2_price, t3_exf, t4_esof]
    sources : (N,) int8 — 0=NG1/2, 1=NG4, 2=NG5-local, 3=NG3-scan
    """
    # Module constants re-exported on the class for caller convenience.
    DIMS = DIMS
    TOTAL = TOTAL
    OFF = OFF
    def __init__(self):
        # All three arrays are populated by build() or load(); None until then.
        self.X = None
        self.mask = None
        self.sources = None
    def build(self,
              ng3_date_from: str = '2025-12-31',
              max_scans_per_day: Optional[int] = None,
              max_per_source: Optional[int] = None,
              max_ng5: int = 3_000,
              chunk_size: int = 50_000,
              verbose: bool = True) -> 'DolphinCorpus':
        """
        Memory-efficient build using streaming generators.
        chunk_size: accumulate this many rows before extending array.
        max_per_source: cap rows from NG1/NG2/NG4 (breadth-only sources).
        max_ng5: separate cap for NG5-local (files are larger, reads ~26/s).

        Returns self so calls can be chained (build().save()...).
        """
        print("Building DOLPHIN multi-generation corpus (streaming)...", flush=True)
        exf_cache = ExFCache(NG3_EIGEN)
        # Per-source caps: NG5-local is separately capped (slow reads)
        _caps = {
            0: max_per_source, # NG1/NG2
            1: max_per_source, # NG4
            2: max_ng5, # NG5-local — separate low cap
            3: None, # NG3-scan — limited by max_scans_per_day
        }
        # Generators are created lazily here and consumed one row at a time.
        sources_list = [
            (0, _stream_ng1_ng2(), "NG1/NG2"),
            (1, _stream_ng4(), "NG4"),
            (2, _stream_ng5_local(), "NG5-local"),
            (3, _stream_ng3_scans(exf_cache, ng3_date_from, max_scans_per_day), "NG3-scan"),
        ]
        all_chunks, all_src = [], []
        buf_rows, buf_src = [], []
        total = 0
        for src_id, gen, name in sources_list:
            src_count = 0
            cap = _caps.get(src_id)
            for row in gen:
                buf_rows.append(row)
                buf_src.append(src_id)
                src_count += 1
                total += 1
                # Flush a full buffer into one float32 chunk so the Python
                # list never grows beyond chunk_size rows.
                if len(buf_rows) >= chunk_size:
                    all_chunks.append(np.array(buf_rows, dtype=np.float32))
                    all_src.extend(buf_src)
                    buf_rows.clear(); buf_src.clear()
                    if verbose:
                        print(f" {name}: {src_count:,} (total so far: {total:,})", flush=True)
                # Cap check happens after appending, so the capping row is kept.
                if cap and src_count >= cap:
                    break
            if verbose:
                print(f" {name}: {src_count:,} samples", flush=True)
        # Flush remainder
        if buf_rows:
            all_chunks.append(np.array(buf_rows, dtype=np.float32))
            all_src.extend(buf_src)
        self.X = np.vstack(all_chunks) if all_chunks else np.empty((0, TOTAL), dtype=np.float32)
        self.sources = np.array(all_src, dtype=np.int8)
        # Sanitise in place: NaN/±inf become 0 so downstream training is safe.
        np.nan_to_num(self.X, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
        # Build mask from has_eigen flag (bit7 of T0) and non-zero tiers
        has_eigen = self.X[:, OFF[0] + 7] > 0.5 # T0[-1]
        has_price = np.any(self.X[:, OFF[2]:OFF[2]+T2] != 0, axis=1)
        has_exf = np.any(self.X[:, OFF[3]:OFF[3]+T3] != 0, axis=1)
        self.mask = np.column_stack([
            np.ones(len(self.X), dtype=bool), # T0 always
            has_eigen,
            has_price,
            has_exf,
            np.ones(len(self.X), dtype=bool), # T4 always (computed)
        ])
        if verbose:
            print(f"\nCorpus summary:")
            print(f" Total : {len(self.X):,}")
            print(f" Shape : {self.X.shape} ({self.X.nbytes/1e6:.0f} MB)")
            print(f" T1 eigen : {self.mask[:,1].sum():,} ({100*self.mask[:,1].mean():.1f}%)")
            print(f" T2 price : {self.mask[:,2].sum():,} ({100*self.mask[:,2].mean():.1f}%)")
            print(f" T3 exf : {self.mask[:,3].sum():,} ({100*self.mask[:,3].mean():.1f}%)")
        return self
    def save(self, path: str):
        """Write X/mask/sources to a compressed NPZ ('.npz' appended if missing)."""
        p = path if path.endswith('.npz') else path + '.npz'
        np.savez_compressed(p, X=self.X, mask=self.mask, sources=self.sources)
        print(f"Corpus saved: {p} ({self.X.nbytes/1e6:.0f} MB uncompressed, compressed ~10x)")
    @classmethod
    def load(cls, path: str) -> 'DolphinCorpus':
        """Load a corpus previously written by save(); returns a new instance."""
        c = cls()
        p = path if path.endswith('.npz') else path + '.npz'
        d = np.load(p)
        c.X, c.mask, c.sources = d['X'], d['mask'], d['sources']
        print(f"Corpus loaded: {len(c.X):,} samples, {c.X.shape[1]} dims")
        return c
    # ── Tier slices ─────────────────────────────────────────────────────
    # Views (not copies) into X's second axis, one per tier.
    def t0(self): return self.X[:, OFF[0]:OFF[0]+T0]
    def t1(self): return self.X[:, OFF[1]:OFF[1]+T1]
    def t2(self): return self.X[:, OFF[2]:OFF[2]+T2]
    def t3(self): return self.X[:, OFF[3]:OFF[3]+T3]
    def t4(self): return self.X[:, OFF[4]:OFF[4]+T4]
    def tier_names(self):
        """Human-readable tier labels, index-aligned with DIMS/OFF."""
        return ['breadth+time', 'eigenvalues', 'per-asset-vol', 'ExF-macro', 'EsoF']
    def describe(self):
        """Print size, tier dims, per-tier mask counts and per-source counts."""
        print(f"Corpus: N={len(self.X):,} dims={TOTAL} ({self.X.nbytes/1e6:.0f}MB)")
        print(f"Tiers: {list(zip(self.tier_names(), DIMS))}")
        print(f"Masks: {[(t, self.mask[:,i].sum()) for i, t in enumerate(self.tier_names())]}")
        src_names = {0: 'NG1/2', 1: 'NG4', 2: 'NG5-local', 3: 'NG3-scan'}
        for sid, name in src_names.items():
            n = (self.sources == sid).sum()
            if n > 0:
                print(f" {name:12s}: {n:,}")
if __name__ == '__main__':
    # CLI: optional first argument caps the number of NG3 scans per day.
    import sys
    per_day_cap = None
    if len(sys.argv) > 1:
        per_day_cap = int(sys.argv[1])
    corpus = DolphinCorpus().build(verbose=True, max_scans_per_day=per_day_cap)
    corpus.save(str(HERE / 'corpus_cache'))
    corpus.describe()