initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
466
external_factors/backfill_runner.py
Executable file
466
external_factors/backfill_runner.py
Executable file
@@ -0,0 +1,466 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DOLPHIN BACKFILL RUNNER v2.0
|
||||
============================
|
||||
Spiders DOLPHIN scan directories, enriches with external factors matrix.
|
||||
|
||||
INDICATOR SOURCES:
|
||||
1. API_HISTORICAL: Fetched with scan timestamp (CoinMetrics, FRED, DeFi Llama, etc.)
|
||||
2. SCAN_DERIVED: Computed from scan's market_prices, tracking_data, per_asset_signals
|
||||
3. UNAVAILABLE: No historical API AND cannot compute from scan → NaN
|
||||
|
||||
Output: {original_name}__Indicators.npz (sorts alphabetically next to source)
|
||||
|
||||
Author: HJ / Claude
|
||||
Version: 2.0.0
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import numpy as np
|
||||
import asyncio
|
||||
import aiohttp
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple, Any, Set
|
||||
import logging
|
||||
import time
|
||||
import argparse
|
||||
|
||||
# Import external factors module
|
||||
from external_factors_matrix import (
|
||||
ExternalFactorsFetcher, Config, INDICATORS, N_INDICATORS,
|
||||
HistoricalSupport, Stationarity, Category
|
||||
)
|
||||
|
||||
# Module-wide logging: timestamped INFO-level records on the root handler.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
|
||||
|
||||
# =============================================================================
|
||||
# INDICATOR SOURCE CLASSIFICATION
|
||||
# =============================================================================
|
||||
|
||||
class IndicatorSource:
    """Classifies each indicator by how it can be obtained for backfill.

    Two class-level buckets are filled once by classify():
      API_HISTORICAL - indicator ids fetchable for an arbitrary timestamp
      UNAVAILABLE    - ids with no history and no scan-derived equivalent
    """

    # Indicators that HAVE historical API support (fetch with timestamp)
    API_HISTORICAL: Set[int] = set()

    # Indicators that are UNAVAILABLE (no history, can't derive from scan)
    UNAVAILABLE: Set[int] = set()

    @classmethod
    def classify(cls):
        """Populate both buckets from the INDICATORS registry."""
        with_history = (HistoricalSupport.FULL, HistoricalSupport.PARTIAL)
        for ind in INDICATORS:
            bucket = cls.API_HISTORICAL if ind.historical in with_history else cls.UNAVAILABLE
            bucket.add(ind.id)

        logger.info(f"Indicator sources: API_HISTORICAL={len(cls.API_HISTORICAL)}, "
                    f"UNAVAILABLE={len(cls.UNAVAILABLE)}")

    @classmethod
    def get_unavailable_names(cls) -> List[str]:
        """Names of the unavailable indicators, in id order (ids are 1-based)."""
        return [INDICATORS[ind_id - 1].name for ind_id in sorted(cls.UNAVAILABLE)]


# Populate the buckets at import time so module-level consumers can rely on them.
IndicatorSource.classify()
|
||||
|
||||
# =============================================================================
|
||||
# CONFIGURATION
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class BackfillConfig:
    """Runtime configuration for a backfill run; fields mirror the CLI flags.

    FIX: the original declared `scan_dir: Path(r"C:\\...")` — a call expression
    in annotation position, so the hard-coded Windows path was evaluated as the
    field's *annotation* and never acted as a default.  Declared here as a
    proper required Path field; the default path belongs to the CLI in main().
    """
    # Root directory spidered (rglob) for scan_*.json files. Required.
    scan_dir: Path
    # None => each .npz is written next to its source scan file.
    output_dir: Optional[str] = None
    # Skip scans whose __Indicators.npz output already exists.
    skip_existing: bool = True
    # Process scans but do not write any output.
    dry_run: bool = False
    # FRED API key; empty string means unauthenticated access.
    fred_api_key: str = ""
    # Seconds slept between scans to stay inside upstream API rate limits.
    rate_limit_delay: float = 0.5
    verbose: bool = False
|
||||
|
||||
# =============================================================================
|
||||
# SCAN DATA
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class ScanData:
    """One parsed DOLPHIN scan file: prices plus per-window signal payloads."""
    path: Path
    scan_number: int
    timestamp: datetime
    market_prices: Dict[str, float]
    windows: Dict[str, Dict]

    @property
    def n_assets(self) -> int:
        """Number of assets with a recorded price."""
        return len(self.market_prices)

    @property
    def symbols(self) -> List[str]:
        """Asset symbols in deterministic (alphabetical) order."""
        return sorted(self.market_prices)

    def get_tracking(self, window: str) -> Dict:
        """Eigenvalue tracking payload for one window ({} when absent)."""
        window_data = self.windows.get(window, {})
        return window_data.get('tracking_data', {})

    def get_regime(self, window: str) -> Dict:
        """Regime-signal payload for one window ({} when absent)."""
        window_data = self.windows.get(window, {})
        return window_data.get('regime_signals', {})

    def get_asset_signals(self, window: str) -> Dict:
        """Per-asset signal payload for one window ({} when absent)."""
        window_data = self.windows.get(window, {})
        return window_data.get('per_asset_signals', {})
|
||||
|
||||
# =============================================================================
|
||||
# INDICATORS FROM SCAN DATA
|
||||
# =============================================================================
|
||||
|
||||
# Correlation window lengths (in scans) used throughout the indicator layout.
WINDOWS = ['50', '150', '300', '750']


def _windowed(key_prefix: str, label_fmt: str) -> List[Tuple[str, str]]:
    """Expand one (key, label) template across every window, e.g.
    ("lambda_max_w50", "Lambda max window 50") ... for all of WINDOWS."""
    return [(f"{key_prefix}_w{w}", label_fmt.format(w=w)) for w in WINDOWS]


# Global scan-derived indicators (eigenvalue-based, from tracking_data/regime_signals).
# Order here defines the column order of the 'scan_global' output vector.
SCAN_GLOBAL_INDICATORS = [
    # Per-window metric groups
    *_windowed("lambda_max", "Lambda max window {w}"),
    *_windowed("lambda_min", "Lambda min window {w}"),
    *_windowed("lambda_vel", "Lambda velocity window {w}"),
    *_windowed("lambda_acc", "Lambda acceleration window {w}"),
    *_windowed("eigrot_max", "Eigenvector rotation window {w}"),
    *_windowed("eiggap", "Eigenvalue gap window {w}"),
    *_windowed("instab", "Instability window {w}"),
    *_windowed("transp", "Transition prob window {w}"),
    *_windowed("coher", "Coherence window {w}"),
    # Cross-window aggregates
    ("lambda_max_mean", "Mean lambda max"),
    ("lambda_max_std", "Std lambda max"),
    ("instab_mean", "Mean instability"),
    ("instab_max", "Max instability"),
    ("coher_mean", "Mean coherence"),
    ("coher_min", "Min coherence"),
    ("coher_trend", "Coherence trend (w750-w50)"),
    # Derived from market prices
    ("n_assets", "Number of assets"),
    ("price_dispersion", "Log price dispersion"),
]

N_SCAN_GLOBAL = len(SCAN_GLOBAL_INDICATORS)

# Per-asset indicators; order defines the column order of 'asset_matrix'.
PER_ASSET_INDICATORS = [
    ("price", "Price"),
    ("log_price", "Log price"),
    ("price_rank", "Price percentile"),
    ("price_btc", "Price / BTC"),
    ("price_eth", "Price / ETH"),
    *_windowed("align", "Alignment w{w}"),
    *_windowed("decouple", "Decoupling w{w}"),
    *_windowed("anomaly", "Anomaly w{w}"),
    *_windowed("eigvec", "Eigenvector w{w}"),
    ("align_mean", "Mean alignment"),
    ("align_std", "Alignment std"),
    ("anomaly_max", "Max anomaly"),
    ("decouple_max", "Max |decoupling|"),
]

N_PER_ASSET = len(PER_ASSET_INDICATORS)
|
||||
|
||||
# =============================================================================
|
||||
# PROCESSOR
|
||||
# =============================================================================
|
||||
|
||||
class ScanProcessor:
    """Turns one DOLPHIN scan JSON file into three indicator blocks:

    1. API_HISTORICAL indicators fetched for the scan's timestamp,
    2. global indicators derived from the scan's eigenvalue tracking data,
    3. a per-asset indicator matrix.
    """

    def __init__(self, config: BackfillConfig):
        self.config = config
        self.fetcher = ExternalFactorsFetcher(Config(fred_api_key=config.fred_api_key))

    def load_scan(self, path: Path) -> Optional[ScanData]:
        """Parse a scan JSON file into ScanData; returns None (and logs) on failure.

        A missing or malformed timestamp falls back to "now" (UTC) so the
        scan is still processed instead of being dropped.
        """
        try:
            with open(path, 'r') as f:
                data = json.load(f)

            ts_str = data.get('timestamp', '')
            try:
                timestamp = datetime.fromisoformat(ts_str.replace('Z', '+00:00'))
                if timestamp.tzinfo is None:
                    timestamp = timestamp.replace(tzinfo=timezone.utc)
            # FIX: was a bare `except:` — catch only parse/type failures so
            # KeyboardInterrupt / SystemExit are not silently swallowed.
            except (ValueError, AttributeError, TypeError):
                timestamp = datetime.now(timezone.utc)

            return ScanData(
                path=path,
                scan_number=data.get('scan_number', 0),
                timestamp=timestamp,
                market_prices=data.get('market_prices', {}),
                windows=data.get('windows', {})
            )
        except Exception as e:
            logger.error(f"Load failed {path}: {e}")
            return None

    async def fetch_api_indicators(self, timestamp: datetime) -> Tuple[np.ndarray, np.ndarray]:
        """Fetch indicators with historical API support.

        Returns (matrix, success): values and per-indicator success flags, both
        of length N_INDICATORS.  Indicators without historical support are
        forced to NaN/False.  On total fetch failure returns all-NaN/all-False.
        """
        try:
            result = await self.fetcher.fetch_all(target_date=timestamp)
            matrix = result['matrix']
            # result['details'] is keyed by 1-based indicator id
            success = np.array([
                result['details'].get(i+1, {}).get('success', False)
                for i in range(N_INDICATORS)
            ])

            # Mark non-historical indicators as NaN so present-day values
            # cannot leak into a historical backfill.
            for i in range(N_INDICATORS):
                if (i+1) not in IndicatorSource.API_HISTORICAL:
                    success[i] = False
                    matrix[i] = np.nan

            return matrix, success
        except Exception as e:
            logger.warning(f"API fetch failed: {e}")
            return np.full(N_INDICATORS, np.nan), np.zeros(N_INDICATORS, dtype=bool)

    def compute_scan_global(self, scan: ScanData) -> np.ndarray:
        """Compute global indicators from the scan's tracking_data/regime_signals.

        Output order must match SCAN_GLOBAL_INDICATORS exactly: nine metric
        groups (metric-major, windows inner), then aggregates, then
        price-derived values.
        """
        values = []

        # Per-window metrics; metric-major ordering mirrors SCAN_GLOBAL_INDICATORS.
        # (Replaces nine copy-pasted loops with two key-driven ones.)
        tracking_keys = [
            'lambda_max', 'lambda_min', 'lambda_max_velocity',
            'lambda_max_acceleration', 'eigenvector_rotation_max', 'eigenvalue_gap',
        ]
        regime_keys = [
            'instability_score', 'regime_transition_probability', 'market_coherence',
        ]
        for key in tracking_keys:
            for w in WINDOWS:
                values.append(scan.get_tracking(w).get(key, np.nan))
        for key in regime_keys:
            for w in WINDOWS:
                values.append(scan.get_regime(w).get(key, np.nan))

        # Cross-window aggregates (nan-aware so partial scans still produce values)
        lmax = [scan.get_tracking(w).get('lambda_max', np.nan) for w in WINDOWS]
        values.append(np.nanmean(lmax))
        values.append(np.nanstd(lmax))

        instab = [scan.get_regime(w).get('instability_score', np.nan) for w in WINDOWS]
        values.append(np.nanmean(instab))
        values.append(np.nanmax(instab))

        coher = [scan.get_regime(w).get('market_coherence', np.nan) for w in WINDOWS]
        values.append(np.nanmean(coher))
        values.append(np.nanmin(coher))
        # Coherence trend: slowest window minus fastest (w750 - w50)
        values.append(coher[3] - coher[0] if not np.isnan(coher[3]) and not np.isnan(coher[0]) else np.nan)

        # Derived from market prices
        prices = np.array(list(scan.market_prices.values())) if scan.market_prices else np.array([])
        values.append(len(prices))
        # Clip at 1e-10 so log() stays finite for zero/degenerate prices
        values.append(np.std(np.log(np.maximum(prices, 1e-10))) if len(prices) > 0 else np.nan)

        return np.array(values)

    def compute_per_asset(self, scan: ScanData) -> Tuple[np.ndarray, List[str]]:
        """Compute the (n_assets x N_PER_ASSET) indicator matrix.

        Columns follow PER_ASSET_INDICATORS order; also returns the sorted
        symbol list that indexes the rows.
        """
        symbols = scan.symbols
        n = len(symbols)
        if n == 0:
            return np.zeros((0, N_PER_ASSET)), []

        matrix = np.zeros((n, N_PER_ASSET))
        prices = np.array([scan.market_prices[s] for s in symbols])

        btc_p = scan.market_prices.get('BTC', scan.market_prices.get('BTCUSDT', np.nan))
        eth_p = scan.market_prices.get('ETH', scan.market_prices.get('ETHUSDT', np.nan))

        col = 0
        matrix[:, col] = prices; col += 1
        matrix[:, col] = np.log(np.maximum(prices, 1e-10)); col += 1
        # Double argsort yields ranks; /n maps them into a [0, 1) percentile.
        matrix[:, col] = np.argsort(np.argsort(prices)) / n; col += 1
        # NaN reference price compares False against 0, so the whole column
        # becomes NaN when BTC/ETH is absent from the scan.
        matrix[:, col] = prices / btc_p if btc_p > 0 else np.nan; col += 1
        matrix[:, col] = prices / eth_p if eth_p > 0 else np.nan; col += 1

        # Per-window signals (metric-major, matching PER_ASSET_INDICATORS order)
        for metric in ['market_alignment', 'decoupling_velocity', 'anomaly_score', 'eigenvector_component']:
            for w in WINDOWS:
                sigs = scan.get_asset_signals(w)
                for i, sym in enumerate(symbols):
                    matrix[i, col] = sigs.get(sym, {}).get(metric, np.nan)
                col += 1

        # Aggregates over the fixed column layout above:
        # cols 5-8 = alignment, 9-12 = decoupling, 13-16 = anomaly
        align_cols = list(range(5, 9))
        matrix[:, col] = np.nanmean(matrix[:, align_cols], axis=1); col += 1
        matrix[:, col] = np.nanstd(matrix[:, align_cols], axis=1); col += 1

        anomaly_cols = list(range(13, 17))
        matrix[:, col] = np.nanmax(matrix[:, anomaly_cols], axis=1); col += 1

        decouple_cols = list(range(9, 13))
        matrix[:, col] = np.nanmax(np.abs(matrix[:, decouple_cols]), axis=1); col += 1

        return matrix, symbols

    async def process(self, path: Path) -> Optional[Dict[str, Any]]:
        """Run the full pipeline for one scan file.

        Returns the dict that OutputWriter.save() serializes, or None when
        the scan could not be loaded.
        """
        start = time.time()

        scan = self.load_scan(path)
        if scan is None:
            return None

        # 1. API historical indicators
        api_matrix, api_success = await self.fetch_api_indicators(scan.timestamp)

        # 2. Scan-derived global
        scan_global = self.compute_scan_global(scan)

        # 3. Per-asset
        asset_matrix, asset_symbols = self.compute_per_asset(scan)

        return {
            'scan_number': scan.scan_number,
            'timestamp': scan.timestamp.isoformat(),
            'processing_time': time.time() - start,

            'api_indicators': api_matrix,
            'api_success': api_success,
            'api_names': np.array([ind.name for ind in INDICATORS], dtype='U32'),

            'scan_global': scan_global,
            'scan_global_names': np.array([n for n, _ in SCAN_GLOBAL_INDICATORS], dtype='U32'),

            'asset_matrix': asset_matrix,
            'asset_symbols': np.array(asset_symbols, dtype='U16'),
            'asset_names': np.array([n for n, _ in PER_ASSET_INDICATORS], dtype='U32'),

            'n_assets': len(asset_symbols),
            # Success rate computed over historical-capable indicators only.
            'api_success_rate': np.nanmean(api_success[list(i-1 for i in IndicatorSource.API_HISTORICAL)]),
        }
|
||||
|
||||
# =============================================================================
|
||||
# OUTPUT
|
||||
# =============================================================================
|
||||
|
||||
class OutputWriter:
    """Serializes processed scan results to compressed .npz archives."""

    def __init__(self, config: BackfillConfig):
        self.config = config

    def get_output_path(self, scan_path: Path) -> Path:
        """Output path: <dir>/<scan stem>__Indicators.npz, where <dir> is the
        configured output directory or the scan file's own directory.  The
        directory is created on demand."""
        target = scan_path.parent
        if self.config.output_dir:
            target = Path(self.config.output_dir)
        target.mkdir(parents=True, exist_ok=True)
        return target / f"{scan_path.stem}__Indicators.npz"

    def save(self, data: Dict[str, Any], scan_path: Path) -> Path:
        """Write one result dict; every value is coerced to an ndarray so the
        whole payload fits np.savez_compressed.  Returns the written path."""
        def _as_array(value):
            # ndarrays pass through; strings become 1-element U64 arrays;
            # anything else (ints, floats) is wrapped as a 1-element array.
            if isinstance(value, np.ndarray):
                return value
            if isinstance(value, str):
                return np.array([value], dtype='U64')
            return np.array([value])

        out_path = self.get_output_path(scan_path)
        payload = {key: _as_array(value) for key, value in data.items()}
        np.savez_compressed(out_path, **payload)
        return out_path
|
||||
|
||||
# =============================================================================
|
||||
# RUNNER
|
||||
# =============================================================================
|
||||
|
||||
class BackfillRunner:
    """Orchestrates a backfill: discovers scan files, processes each one,
    writes the .npz output, and accumulates run statistics."""

    def __init__(self, config: BackfillConfig):
        self.config = config
        self.processor = ScanProcessor(config)
        self.writer = OutputWriter(config)
        self.stats = {'processed': 0, 'failed': 0, 'skipped': 0}

    def find_scans(self) -> List[Path]:
        """Recursively collect scan_*.json files under scan_dir, optionally
        dropping those whose output archive already exists."""
        candidates = sorted(Path(self.config.scan_dir).rglob("scan_*.json"))
        if not self.config.skip_existing:
            return candidates
        return [p for p in candidates if not self.writer.get_output_path(p).exists()]

    async def run(self):
        """Process every discovered scan sequentially.

        Logs progress every ten files, sleeps between scans to respect API
        rate limits, and returns the stats dict."""
        unavail = IndicatorSource.get_unavailable_names()
        logger.info(f"Skipping {len(unavail)} unavailable indicators: {unavail[:5]}...")

        files = self.find_scans()
        logger.info(f"Processing {len(files)} files...")

        for idx, scan_path in enumerate(files, start=1):
            try:
                result = await self.processor.process(scan_path)
                if result:
                    if not self.config.dry_run:
                        self.writer.save(result, scan_path)
                    self.stats['processed'] += 1
                else:
                    self.stats['failed'] += 1
            except Exception as e:
                logger.error(f"Error {scan_path.name}: {e}")
                self.stats['failed'] += 1

            if idx % 10 == 0:
                logger.info(f"Progress: {idx}/{len(files)}")

            if self.config.rate_limit_delay > 0:
                await asyncio.sleep(self.config.rate_limit_delay)

        logger.info(f"Done: {self.stats}")
        return self.stats
|
||||
|
||||
# =============================================================================
|
||||
# UTILITY
|
||||
# =============================================================================
|
||||
|
||||
def load_indicators(path: str) -> Dict[str, np.ndarray]:
    """Load a saved indicator .npz archive back into a plain dict.

    FIX: `dict(np.load(...))` left the underlying NpzFile (and its file
    handle) open; the context manager closes it after all arrays are read.

    NOTE(review): allow_pickle=True is unsafe on untrusted files — it is kept
    for backward compatibility, but confirm these archives are always locally
    produced by OutputWriter before loading third-party files.
    """
    with np.load(path, allow_pickle=True) as archive:
        # Materialize every array before the archive closes (access is lazy).
        return {key: archive[key] for key in archive.files}
|
||||
|
||||
def summary(path: str) -> str:
    """Human-readable multi-line summary of one indicator .npz file."""
    payload = load_indicators(path)
    parts = [
        f"Timestamp: {payload['timestamp'][0]}",
        f"Assets: {payload['n_assets'][0]}",
        f"API success: {payload['api_success_rate'][0]:.1%}",
        f"API shape: {payload['api_indicators'].shape}",
        f"Scan global: {payload['scan_global'].shape}",
        f"Per-asset: {payload['asset_matrix'].shape}",
    ]
    return "\n".join(parts)
|
||||
|
||||
# =============================================================================
|
||||
# CLI
|
||||
# =============================================================================
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, build the config, run the backfill.

    FIX (security): the FRED API key was hard-coded in source; it now comes
    from --fred-key or the FRED_API_KEY environment variable.  The scan_dir
    positional argument (previously commented out in favor of a hard-coded
    path) is restored as optional, with the historical path as its default
    so existing invocations behave identically.
    """
    parser = argparse.ArgumentParser(description="DOLPHIN Backfill Runner")
    parser.add_argument(
        "scan_dir", nargs="?",
        default=r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues",
        help="Directory with scan JSON files")
    parser.add_argument("-o", "--output", help="Output directory")
    parser.add_argument("--fred-key", default="",
                        help="FRED API key (falls back to $FRED_API_KEY)")
    parser.add_argument("--no-skip", action="store_true", help="Reprocess existing")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--delay", type=float, default=0.5)

    args = parser.parse_args()

    config = BackfillConfig(
        scan_dir=Path(args.scan_dir),
        output_dir=args.output,
        # SECURITY: never commit API keys; read from the environment instead.
        fred_api_key=args.fred_key or os.environ.get("FRED_API_KEY", ""),
        skip_existing=not args.no_skip,
        dry_run=args.dry_run,
        rate_limit_delay=args.delay,
    )

    runner = BackfillRunner(config)
    asyncio.run(runner.run())


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user