# Source: DOLPHIN/external_factors/backfill_runner.py (467 lines, 18 KiB)
#!/usr/bin/env python3
"""
DOLPHIN BACKFILL RUNNER v2.0
============================
Spiders DOLPHIN scan directories, enriches with external factors matrix.
INDICATOR SOURCES:
1. API_HISTORICAL: Fetched with scan timestamp (CoinMetrics, FRED, DeFi Llama, etc.)
2. SCAN_DERIVED: Computed from scan's market_prices, tracking_data, per_asset_signals
3. UNAVAILABLE: No historical API AND cannot compute from scan -> emitted as NaN
Output: {original_name}__Indicators.npz (sorts alphabetically next to source)
Author: HJ / Claude
Version: 2.0.0
"""
import os
import sys
import json
import numpy as np
import asyncio
import aiohttp
from pathlib import Path
from datetime import datetime, timezone
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Any, Set
import logging
import time
import argparse
# Import external factors module
from external_factors_matrix import (
ExternalFactorsFetcher, Config, INDICATORS, N_INDICATORS,
HistoricalSupport, Stationarity, Category
)
# Module-wide logging: timestamped INFO-level progress/diagnostic messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)
# =============================================================================
# INDICATOR SOURCE CLASSIFICATION
# =============================================================================
class IndicatorSource:
    """Classifies each indicator by how it can be obtained for backfill."""

    # Indicators that HAVE historical API support (fetch with timestamp)
    API_HISTORICAL: Set[int] = set()
    # Indicators that are UNAVAILABLE (no history, can't derive from scan)
    UNAVAILABLE: Set[int] = set()

    @classmethod
    def classify(cls):
        """Sort every indicator id into the API_HISTORICAL or UNAVAILABLE bucket."""
        fetchable = (HistoricalSupport.FULL, HistoricalSupport.PARTIAL)
        for ind in INDICATORS:
            bucket = cls.API_HISTORICAL if ind.historical in fetchable else cls.UNAVAILABLE
            bucket.add(ind.id)
        logger.info(f"Indicator sources: API_HISTORICAL={len(cls.API_HISTORICAL)}, "
                    f"UNAVAILABLE={len(cls.UNAVAILABLE)}")

    @classmethod
    def get_unavailable_names(cls) -> List[str]:
        """Names of non-backfillable indicators, in ascending id order."""
        # Indicator ids are 1-based, INDICATORS is 0-indexed.
        return [INDICATORS[ind_id - 1].name for ind_id in sorted(cls.UNAVAILABLE)]


# Populate the classification sets once at import time.
IndicatorSource.classify()
# =============================================================================
# CONFIGURATION
# =============================================================================
@dataclass
class BackfillConfig:
    """Runtime options for one backfill run.

    BUG FIX: the original declared ``scan_dir: Path(r"C:\\...")`` — a *Path
    instance* in the annotation slot instead of a type, with a user-specific
    Windows path baked in. The field behaved as a required, untyped field.
    It is now a properly typed required field; the caller (``main``) supplies
    the directory.
    """
    scan_dir: Path                        # root directory spidered for scan_*.json
    output_dir: Optional[str] = None      # None -> write .npz next to each source file
    skip_existing: bool = True            # skip scans whose output .npz already exists
    dry_run: bool = False                 # process but do not write output files
    fred_api_key: str = ""                # FRED API key for external-factor fetches
    rate_limit_delay: float = 0.5         # seconds slept between scans
    verbose: bool = False                 # NOTE(review): not consumed in this module
# =============================================================================
# SCAN DATA
# =============================================================================
@dataclass
class ScanData:
    """In-memory representation of one DOLPHIN scan JSON file."""
    path: Path                        # source file the scan was loaded from
    scan_number: int
    timestamp: datetime
    market_prices: Dict[str, float]   # symbol -> price
    windows: Dict[str, Dict]          # window key ('50', ...) -> window payload

    @property
    def n_assets(self) -> int:
        """Number of assets with a price in this scan."""
        return len(self.market_prices)

    @property
    def symbols(self) -> List[str]:
        """Asset symbols in deterministic (alphabetical) order."""
        return sorted(self.market_prices)

    def _section(self, window: str, key: str) -> Dict:
        # Missing window or missing section both degrade to an empty dict.
        return self.windows.get(window, {}).get(key, {})

    def get_tracking(self, window: str) -> Dict:
        """Eigenvalue tracking data for one window."""
        return self._section(window, 'tracking_data')

    def get_regime(self, window: str) -> Dict:
        """Regime signals for one window."""
        return self._section(window, 'regime_signals')

    def get_asset_signals(self, window: str) -> Dict:
        """Per-asset signal dicts for one window."""
        return self._section(window, 'per_asset_signals')
# =============================================================================
# INDICATORS FROM SCAN DATA
# =============================================================================
# Correlation window lengths used throughout (string keys into the scan JSON).
WINDOWS = ['50', '150', '300', '750']


def _windowed(prefix: str, label: str) -> List[Tuple[str, str]]:
    """One (name, description) entry per correlation window."""
    return [(f"{prefix}_w{w}", f"{label} window {w}") for w in WINDOWS]


# Global scan-derived indicators (eigenvalue-based, from tracking_data/regime_signals)
SCAN_GLOBAL_INDICATORS = (
    _windowed("lambda_max", "Lambda max")
    + _windowed("lambda_min", "Lambda min")
    + _windowed("lambda_vel", "Lambda velocity")
    + _windowed("lambda_acc", "Lambda acceleration")
    + _windowed("eigrot_max", "Eigenvector rotation")
    + _windowed("eiggap", "Eigenvalue gap")
    + _windowed("instab", "Instability")
    + _windowed("transp", "Transition prob")
    + _windowed("coher", "Coherence")
    + [
        # Cross-window aggregates
        ("lambda_max_mean", "Mean lambda max"),
        ("lambda_max_std", "Std lambda max"),
        ("instab_mean", "Mean instability"),
        ("instab_max", "Max instability"),
        ("coher_mean", "Mean coherence"),
        ("coher_min", "Min coherence"),
        ("coher_trend", "Coherence trend (w750-w50)"),
        # Derived from market prices
        ("n_assets", "Number of assets"),
        ("price_dispersion", "Log price dispersion"),
    ]
)
N_SCAN_GLOBAL = len(SCAN_GLOBAL_INDICATORS)

# Per-asset indicators; the column order here is relied on by index ranges
# elsewhere in this file (alignment cols 5-8, decoupling 9-12, anomaly 13-16).
_PER_ASSET_WINDOWED = [("align", "Alignment"), ("decouple", "Decoupling"),
                       ("anomaly", "Anomaly"), ("eigvec", "Eigenvector")]
PER_ASSET_INDICATORS = (
    [
        ("price", "Price"),
        ("log_price", "Log price"),
        ("price_rank", "Price percentile"),
        ("price_btc", "Price / BTC"),
        ("price_eth", "Price / ETH"),
    ]
    + [(f"{pre}_w{w}", f"{lbl} w{w}")
       for pre, lbl in _PER_ASSET_WINDOWED for w in WINDOWS]
    + [
        ("align_mean", "Mean alignment"),
        ("align_std", "Alignment std"),
        ("anomaly_max", "Max anomaly"),
        ("decouple_max", "Max |decoupling|"),
    ]
)
N_PER_ASSET = len(PER_ASSET_INDICATORS)
# =============================================================================
# PROCESSOR
# =============================================================================
class ScanProcessor:
    """Turns one scan JSON file into the full indicator payload.

    Three indicator groups are produced per scan:
      1. API-historical externals, fetched at the scan's timestamp.
      2. Scan-derived globals (eigenvalue / regime metrics per window).
      3. Per-asset indicators from market prices and per-asset signals.
    """

    def __init__(self, config: BackfillConfig):
        self.config = config
        self.fetcher = ExternalFactorsFetcher(Config(fred_api_key=config.fred_api_key))

    def load_scan(self, path: Path) -> Optional[ScanData]:
        """Parse one scan JSON file into a ScanData; logs and returns None on failure."""
        try:
            with open(path, 'r') as f:
                data = json.load(f)
            ts_str = data.get('timestamp', '')
            try:
                timestamp = datetime.fromisoformat(ts_str.replace('Z', '+00:00'))
                if timestamp.tzinfo is None:
                    timestamp = timestamp.replace(tzinfo=timezone.utc)
            except (ValueError, AttributeError):
                # FIX: was a bare `except:` (also caught KeyboardInterrupt/SystemExit).
                # A missing/unparseable timestamp falls back to "now" so the scan is
                # still processed, at the cost of an approximate backfill date.
                timestamp = datetime.now(timezone.utc)
            return ScanData(
                path=path,
                scan_number=data.get('scan_number', 0),
                timestamp=timestamp,
                market_prices=data.get('market_prices', {}),
                windows=data.get('windows', {})
            )
        except Exception as e:
            logger.error(f"Load failed {path}: {e}")
            return None

    async def fetch_api_indicators(self, timestamp: datetime) -> Tuple[np.ndarray, np.ndarray]:
        """Fetch indicators with historical API support.

        Returns (values, success_mask), both of length N_INDICATORS. Entries for
        indicators without historical support are forced to NaN / False. On any
        fetch failure the whole vector degrades to NaN / all-False.
        """
        try:
            result = await self.fetcher.fetch_all(target_date=timestamp)
            matrix = result['matrix']
            # Indicator ids are 1-based keys into result['details'].
            success = np.array([
                result['details'].get(i + 1, {}).get('success', False)
                for i in range(N_INDICATORS)
            ])
            # Mark non-historical indicators as NaN regardless of fetch outcome.
            for i in range(N_INDICATORS):
                if (i + 1) not in IndicatorSource.API_HISTORICAL:
                    success[i] = False
                    matrix[i] = np.nan
            return matrix, success
        except Exception as e:
            logger.warning(f"API fetch failed: {e}")
            return np.full(N_INDICATORS, np.nan), np.zeros(N_INDICATORS, dtype=bool)

    def compute_scan_global(self, scan: ScanData) -> np.ndarray:
        """Compute the global indicator vector from tracking/regime data.

        Output order must match SCAN_GLOBAL_INDICATORS exactly: per-window metric
        groups first (metric-major, window-minor), then aggregates, then
        price-derived values.
        """
        values = []
        # Per-window metrics from tracking_data, grouped by metric.
        tracking_keys = ['lambda_max', 'lambda_min', 'lambda_max_velocity',
                         'lambda_max_acceleration', 'eigenvector_rotation_max',
                         'eigenvalue_gap']
        for key in tracking_keys:
            for w in WINDOWS:
                values.append(scan.get_tracking(w).get(key, np.nan))
        # Per-window metrics from regime_signals.
        regime_keys = ['instability_score', 'regime_transition_probability',
                       'market_coherence']
        for key in regime_keys:
            for w in WINDOWS:
                values.append(scan.get_regime(w).get(key, np.nan))
        # Aggregates across windows (nan-aware; all-NaN inputs yield NaN).
        lmax = [scan.get_tracking(w).get('lambda_max', np.nan) for w in WINDOWS]
        values.append(np.nanmean(lmax))
        values.append(np.nanstd(lmax))
        instab = [scan.get_regime(w).get('instability_score', np.nan) for w in WINDOWS]
        values.append(np.nanmean(instab))
        values.append(np.nanmax(instab))
        coher = [scan.get_regime(w).get('market_coherence', np.nan) for w in WINDOWS]
        values.append(np.nanmean(coher))
        values.append(np.nanmin(coher))
        # Trend: longest window minus shortest (w750 - w50); NaN if either missing.
        values.append(coher[3] - coher[0] if not np.isnan(coher[3]) and not np.isnan(coher[0]) else np.nan)
        # Price-derived globals.
        prices = np.array(list(scan.market_prices.values())) if scan.market_prices else np.array([])
        values.append(len(prices))
        # Clamp at 1e-10 so zero/negative prices don't produce -inf/NaN logs.
        values.append(np.std(np.log(np.maximum(prices, 1e-10))) if len(prices) > 0 else np.nan)
        return np.array(values)

    def compute_per_asset(self, scan: ScanData) -> Tuple[np.ndarray, List[str]]:
        """Compute the (n_assets, N_PER_ASSET) matrix; columns follow PER_ASSET_INDICATORS."""
        symbols = scan.symbols
        n = len(symbols)
        if n == 0:
            return np.zeros((0, N_PER_ASSET)), []
        matrix = np.zeros((n, N_PER_ASSET))
        prices = np.array([scan.market_prices[s] for s in symbols])
        # Reference prices for ratio columns; NaN if neither ticker variant exists.
        btc_p = scan.market_prices.get('BTC', scan.market_prices.get('BTCUSDT', np.nan))
        eth_p = scan.market_prices.get('ETH', scan.market_prices.get('ETHUSDT', np.nan))
        col = 0
        matrix[:, col] = prices; col += 1
        matrix[:, col] = np.log(np.maximum(prices, 1e-10)); col += 1  # clamp avoids log(0)
        matrix[:, col] = np.argsort(np.argsort(prices)) / n; col += 1  # rank percentile in [0, 1)
        # NaN > 0 is False, so a missing reference price yields a NaN column.
        matrix[:, col] = prices / btc_p if btc_p > 0 else np.nan; col += 1
        matrix[:, col] = prices / eth_p if eth_p > 0 else np.nan; col += 1
        # Per-window signals: metric-major, window-minor (matches PER_ASSET_INDICATORS).
        for metric in ['market_alignment', 'decoupling_velocity', 'anomaly_score', 'eigenvector_component']:
            for w in WINDOWS:
                sigs = scan.get_asset_signals(w)
                for i, sym in enumerate(symbols):
                    matrix[i, col] = sigs.get(sym, {}).get(metric, np.nan)
                col += 1
        # Aggregates over the per-window columns laid down above.
        align_cols = list(range(5, 9))      # align_w50 .. align_w750
        matrix[:, col] = np.nanmean(matrix[:, align_cols], axis=1); col += 1
        matrix[:, col] = np.nanstd(matrix[:, align_cols], axis=1); col += 1
        anomaly_cols = list(range(13, 17))  # anomaly_w50 .. anomaly_w750
        matrix[:, col] = np.nanmax(matrix[:, anomaly_cols], axis=1); col += 1
        decouple_cols = list(range(9, 13))  # decouple_w50 .. decouple_w750
        matrix[:, col] = np.nanmax(np.abs(matrix[:, decouple_cols]), axis=1); col += 1
        return matrix, symbols

    async def process(self, path: Path) -> Optional[Dict[str, Any]]:
        """Run the full pipeline for one scan file; returns the savable payload or None."""
        start = time.time()
        scan = self.load_scan(path)
        if scan is None:
            return None
        # 1. API historical indicators
        api_matrix, api_success = await self.fetch_api_indicators(scan.timestamp)
        # 2. Scan-derived global
        scan_global = self.compute_scan_global(scan)
        # 3. Per-asset
        asset_matrix, asset_symbols = self.compute_per_asset(scan)
        # FIX: guard against an empty API_HISTORICAL set — np.nanmean over an
        # empty index list warns and returns NaN; report 0.0 instead. The mask
        # is boolean (never NaN), so a plain mean is equivalent when non-empty.
        hist_idx = [i - 1 for i in IndicatorSource.API_HISTORICAL]
        api_success_rate = float(np.mean(api_success[hist_idx])) if hist_idx else 0.0
        return {
            'scan_number': scan.scan_number,
            'timestamp': scan.timestamp.isoformat(),
            'processing_time': time.time() - start,
            'api_indicators': api_matrix,
            'api_success': api_success,
            'api_names': np.array([ind.name for ind in INDICATORS], dtype='U32'),
            'scan_global': scan_global,
            'scan_global_names': np.array([n for n, _ in SCAN_GLOBAL_INDICATORS], dtype='U32'),
            'asset_matrix': asset_matrix,
            'asset_symbols': np.array(asset_symbols, dtype='U16'),
            'asset_names': np.array([n for n, _ in PER_ASSET_INDICATORS], dtype='U32'),
            'n_assets': len(asset_symbols),
            'api_success_rate': api_success_rate,
        }
# =============================================================================
# OUTPUT
# =============================================================================
class OutputWriter:
    """Serializes processed indicator payloads to compressed .npz files."""

    def __init__(self, config: BackfillConfig):
        self.config = config

    def get_output_path(self, scan_path: Path) -> Path:
        """Output path for a scan: in config.output_dir if set, else next to the source."""
        if self.config.output_dir:
            out_dir = Path(self.config.output_dir)
        else:
            out_dir = scan_path.parent
        out_dir.mkdir(parents=True, exist_ok=True)
        return out_dir / f"{scan_path.stem}__Indicators.npz"

    def save(self, data: Dict[str, Any], scan_path: Path) -> Path:
        """Write every entry as an array; scalars and strings become 1-element arrays."""
        out_path = self.get_output_path(scan_path)
        arrays = {}
        for key, value in data.items():
            if isinstance(value, np.ndarray):
                arrays[key] = value
            elif isinstance(value, str):
                arrays[key] = np.array([value], dtype='U64')
            else:
                arrays[key] = np.array([value])
        np.savez_compressed(out_path, **arrays)
        return out_path
# =============================================================================
# RUNNER
# =============================================================================
class BackfillRunner:
    """Walks the scan directory and processes every scan_*.json file in order."""

    def __init__(self, config: BackfillConfig):
        self.config = config
        self.processor = ScanProcessor(config)
        self.writer = OutputWriter(config)
        self.stats = {'processed': 0, 'failed': 0, 'skipped': 0}

    def find_scans(self) -> List[Path]:
        """All scan JSONs under scan_dir; already-processed files are dropped when skipping."""
        candidates = sorted(Path(self.config.scan_dir).rglob("scan_*.json"))
        if not self.config.skip_existing:
            return candidates
        return [p for p in candidates if not self.writer.get_output_path(p).exists()]

    async def run(self):
        """Process every scan sequentially, honoring the rate-limit delay; returns stats."""
        unavail = IndicatorSource.get_unavailable_names()
        logger.info(f"Skipping {len(unavail)} unavailable indicators: {unavail[:5]}...")
        files = self.find_scans()
        total = len(files)
        logger.info(f"Processing {total} files...")
        for idx, scan_path in enumerate(files, start=1):
            try:
                result = await self.processor.process(scan_path)
                if result:
                    # Dry runs still count as processed; they just skip the write.
                    if not self.config.dry_run:
                        self.writer.save(result, scan_path)
                    self.stats['processed'] += 1
                else:
                    self.stats['failed'] += 1
            except Exception as e:
                logger.error(f"Error {scan_path.name}: {e}")
                self.stats['failed'] += 1
            if idx % 10 == 0:
                logger.info(f"Progress: {idx}/{total}")
            if self.config.rate_limit_delay > 0:
                await asyncio.sleep(self.config.rate_limit_delay)
        logger.info(f"Done: {self.stats}")
        return self.stats
# =============================================================================
# UTILITY
# =============================================================================
def load_indicators(path: str) -> Dict[str, np.ndarray]:
    """Load a saved .npz indicator file into a plain dict of arrays."""
    with np.load(path, allow_pickle=True) as npz:
        return {key: npz[key] for key in npz.files}
def summary(path: str) -> str:
    """Human-readable multi-line summary of an indicator .npz file."""
    d = dict(np.load(path, allow_pickle=True))
    lines = [
        f"Timestamp: {d['timestamp'][0]}",
        f"Assets: {d['n_assets'][0]}",
        f"API success: {d['api_success_rate'][0]:.1%}",
        f"API shape: {d['api_indicators'].shape}",
        f"Scan global: {d['scan_global'].shape}",
        f"Per-asset: {d['asset_matrix'].shape}",
    ]
    return "\n".join(lines)
# =============================================================================
# CLI
# =============================================================================
def main():
    """CLI entry point: parse arguments, build the config, run the backfill.

    SECURITY FIX: the original hard-coded a live FRED API key in source (and
    repeated it in a comment). The key now comes from --fred-key or the
    FRED_API_KEY environment variable. The scan directory, previously a
    hard-coded user-specific Windows path with the real positional argument
    commented out, is restored as an optional positional that defaults to the
    legacy path for backward compatibility.
    """
    parser = argparse.ArgumentParser(description="DOLPHIN Backfill Runner")
    parser.add_argument(
        "scan_dir", nargs="?",
        default=r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues",
        help="Directory with scan JSON files (defaults to the legacy DOLPHIN eigenvalues dir)")
    parser.add_argument("-o", "--output", help="Output directory")
    parser.add_argument("--fred-key", default="",
                        help="FRED API key (falls back to FRED_API_KEY env var)")
    parser.add_argument("--no-skip", action="store_true", help="Reprocess existing")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--delay", type=float, default=0.5)
    args = parser.parse_args()
    config = BackfillConfig(
        scan_dir=Path(args.scan_dir),
        output_dir=args.output,
        # Never hard-code secrets: CLI flag wins, then environment, then empty.
        fred_api_key=args.fred_key or os.environ.get("FRED_API_KEY", ""),
        skip_existing=not args.no_skip,
        dry_run=args.dry_run,
        rate_limit_delay=args.delay,
    )
    runner = BackfillRunner(config)
    asyncio.run(runner.run())


if __name__ == "__main__":
    main()