#!/usr/bin/env python3
"""
DOLPHIN BACKFILL RUNNER v2.0
============================
Spiders DOLPHIN scan directories, enriches with external factors matrix.

INDICATOR SOURCES:
1. API_HISTORICAL: Fetched with scan timestamp (CoinMetrics, FRED, DeFi Llama, etc.)
2. SCAN_DERIVED: Computed from scan's market_prices, tracking_data, per_asset_signals
3. UNAVAILABLE: No historical API AND cannot compute from scan -> NaN

Output: {original_name}__Indicators.npz (sorts alphabetically next to source)

Author: HJ / Claude
Version: 2.0.0
"""

import argparse
import asyncio
import json
import logging
import os
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple

import aiohttp  # noqa: F401 -- runtime dependency of ExternalFactorsFetcher
import numpy as np

# Import external factors module
from external_factors_matrix import (
    ExternalFactorsFetcher, Config, INDICATORS, N_INDICATORS,
    HistoricalSupport, Stationarity, Category  # noqa: F401
)

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s [%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)

# Default spider root, overridable via --scan-dir.
DEFAULT_SCAN_DIR = Path(
    r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues"
)


# =============================================================================
# INDICATOR SOURCE CLASSIFICATION
# =============================================================================

class IndicatorSource:
    """Classifies each indicator by how it can be obtained for backfill."""

    # Indicator IDs that HAVE historical API support (fetch with timestamp)
    API_HISTORICAL: Set[int] = set()

    # Indicator IDs that are UNAVAILABLE (no history, can't derive from scan)
    UNAVAILABLE: Set[int] = set()

    @classmethod
    def classify(cls):
        """Classify all indicators by their backfill source.

        Idempotent: sets are only ever added to with the same membership.
        """
        for ind in INDICATORS:
            if ind.historical in [HistoricalSupport.FULL, HistoricalSupport.PARTIAL]:
                cls.API_HISTORICAL.add(ind.id)
            else:
                cls.UNAVAILABLE.add(ind.id)
        logger.info(f"Indicator sources: API_HISTORICAL={len(cls.API_HISTORICAL)}, "
                    f"UNAVAILABLE={len(cls.UNAVAILABLE)}")

    @classmethod
    def get_unavailable_names(cls) -> List[str]:
        """Names of indicators that cannot be backfilled.

        NOTE(review): assumes indicator IDs are 1-based and sequential in
        INDICATORS -- confirm against external_factors_matrix.
        """
        return [INDICATORS[i - 1].name for i in sorted(cls.UNAVAILABLE)]


# Initialize classification at import time so sets are ready for processing.
IndicatorSource.classify()


# =============================================================================
# CONFIGURATION
# =============================================================================

@dataclass
class BackfillConfig:
    """Runner settings.

    FIX: the original declared ``scan_dir: Path(r"...")`` -- a Path *call*
    used as the type annotation with no default value.  It is now a proper
    annotated field with a default.
    """
    scan_dir: Path = DEFAULT_SCAN_DIR      # directory to spider for scan_*.json
    output_dir: Optional[str] = None       # None -> write next to each source file
    skip_existing: bool = True             # skip scans whose .npz already exists
    dry_run: bool = False                  # process but do not write output
    fred_api_key: str = ""                 # FRED API key (never hard-code!)
    rate_limit_delay: float = 0.5          # seconds between scans (API politeness)
    verbose: bool = False


# =============================================================================
# SCAN DATA
# =============================================================================

@dataclass
class ScanData:
    """One parsed DOLPHIN scan JSON file."""
    path: Path
    scan_number: int
    timestamp: datetime
    market_prices: Dict[str, float]   # symbol -> last price
    windows: Dict[str, Dict]          # window name ('50', ...) -> window payload

    @property
    def n_assets(self) -> int:
        return len(self.market_prices)

    @property
    def symbols(self) -> List[str]:
        """Symbols in deterministic (sorted) order for stable matrix rows."""
        return sorted(self.market_prices.keys())

    def get_tracking(self, window: str) -> Dict:
        return self.windows.get(window, {}).get('tracking_data', {})

    def get_regime(self, window: str) -> Dict:
        return self.windows.get(window, {}).get('regime_signals', {})

    def get_asset_signals(self, window: str) -> Dict:
        return self.windows.get(window, {}).get('per_asset_signals', {})


# =============================================================================
# INDICATORS FROM SCAN DATA
# =============================================================================

WINDOWS = ['50', '150', '300', '750']

# Global scan-derived indicators (eigenvalue-based, from tracking_data/regime_signals).
# Order here MUST match the append order in ScanProcessor.compute_scan_global.
SCAN_GLOBAL_INDICATORS = [
    *[(f"lambda_max_w{w}", f"Lambda max window {w}") for w in WINDOWS],
    *[(f"lambda_min_w{w}", f"Lambda min window {w}") for w in WINDOWS],
    *[(f"lambda_vel_w{w}", f"Lambda velocity window {w}") for w in WINDOWS],
    *[(f"lambda_acc_w{w}", f"Lambda acceleration window {w}") for w in WINDOWS],
    *[(f"eigrot_max_w{w}", f"Eigenvector rotation window {w}") for w in WINDOWS],
    *[(f"eiggap_w{w}", f"Eigenvalue gap window {w}") for w in WINDOWS],
    *[(f"instab_w{w}", f"Instability window {w}") for w in WINDOWS],
    *[(f"transp_w{w}", f"Transition prob window {w}") for w in WINDOWS],
    *[(f"coher_w{w}", f"Coherence window {w}") for w in WINDOWS],
    # Aggregates
    ("lambda_max_mean", "Mean lambda max"),
    ("lambda_max_std", "Std lambda max"),
    ("instab_mean", "Mean instability"),
    ("instab_max", "Max instability"),
    ("coher_mean", "Mean coherence"),
    ("coher_min", "Min coherence"),
    ("coher_trend", "Coherence trend (w750-w50)"),
    # From prices
    ("n_assets", "Number of assets"),
    ("price_dispersion", "Log price dispersion"),
]

N_SCAN_GLOBAL = len(SCAN_GLOBAL_INDICATORS)

# Per-asset indicators.  Order here MUST match the column order in
# ScanProcessor.compute_per_asset.
PER_ASSET_INDICATORS = [
    ("price", "Price"),
    ("log_price", "Log price"),
    ("price_rank", "Price percentile"),
    ("price_btc", "Price / BTC"),
    ("price_eth", "Price / ETH"),
    *[(f"align_w{w}", f"Alignment w{w}") for w in WINDOWS],
    *[(f"decouple_w{w}", f"Decoupling w{w}") for w in WINDOWS],
    *[(f"anomaly_w{w}", f"Anomaly w{w}") for w in WINDOWS],
    *[(f"eigvec_w{w}", f"Eigenvector w{w}") for w in WINDOWS],
    ("align_mean", "Mean alignment"),
    ("align_std", "Alignment std"),
    ("anomaly_max", "Max anomaly"),
    ("decouple_max", "Max |decoupling|"),
]

N_PER_ASSET = len(PER_ASSET_INDICATORS)


# =============================================================================
# PROCESSOR
# =============================================================================

class ScanProcessor:
    """Loads one scan file and produces all indicator arrays for it."""

    def __init__(self, config: BackfillConfig):
        self.config = config
        self.fetcher = ExternalFactorsFetcher(Config(fred_api_key=config.fred_api_key))

    def load_scan(self, path: Path) -> Optional[ScanData]:
        """Parse a scan JSON file; returns None on any load failure.

        A missing/unparseable timestamp falls back to "now" (UTC) rather than
        failing the whole scan.
        """
        try:
            with open(path, 'r') as f:
                data = json.load(f)
            ts_str = data.get('timestamp', '')
            try:
                timestamp = datetime.fromisoformat(ts_str.replace('Z', '+00:00'))
                if timestamp.tzinfo is None:
                    timestamp = timestamp.replace(tzinfo=timezone.utc)
            except (ValueError, TypeError, AttributeError):
                # FIX: was a bare `except:` -- narrowed to what fromisoformat/
                # str.replace can actually raise.
                timestamp = datetime.now(timezone.utc)
            return ScanData(
                path=path,
                scan_number=data.get('scan_number', 0),
                timestamp=timestamp,
                market_prices=data.get('market_prices', {}),
                windows=data.get('windows', {}),
            )
        except Exception as e:
            logger.error(f"Load failed {path}: {e}")
            return None

    async def fetch_api_indicators(self, timestamp: datetime) -> Tuple[np.ndarray, np.ndarray]:
        """Fetch indicators with historical API support.

        Returns (matrix, success) where indicators without historical support
        are forced to NaN / success=False.  On total failure returns an
        all-NaN matrix so processing can continue.
        """
        try:
            result = await self.fetcher.fetch_all(target_date=timestamp)
            matrix = result['matrix']
            success = np.array([
                result['details'].get(i + 1, {}).get('success', False)
                for i in range(N_INDICATORS)
            ])
            # Mark non-historical indicators as NaN -- live-only values would
            # be anachronistic for a backfill.
            for i in range(N_INDICATORS):
                if (i + 1) not in IndicatorSource.API_HISTORICAL:
                    success[i] = False
                    matrix[i] = np.nan
            return matrix, success
        except Exception as e:
            logger.warning(f"API fetch failed: {e}")
            return np.full(N_INDICATORS, np.nan), np.zeros(N_INDICATORS, dtype=bool)

    def compute_scan_global(self, scan: ScanData) -> np.ndarray:
        """Compute global indicators from scan's tracking_data and regime_signals.

        Append order MUST mirror SCAN_GLOBAL_INDICATORS.
        """
        values: List[float] = []

        # Per-window metrics, one key at a time across all windows
        for key in ('lambda_max', 'lambda_min', 'lambda_max_velocity',
                    'lambda_max_acceleration', 'eigenvector_rotation_max',
                    'eigenvalue_gap'):
            for w in WINDOWS:
                values.append(scan.get_tracking(w).get(key, np.nan))
        for key in ('instability_score', 'regime_transition_probability',
                    'market_coherence'):
            for w in WINDOWS:
                values.append(scan.get_regime(w).get(key, np.nan))

        # Aggregates (nan-aware so partially-populated scans still contribute)
        lmax = [scan.get_tracking(w).get('lambda_max', np.nan) for w in WINDOWS]
        values.append(np.nanmean(lmax))
        values.append(np.nanstd(lmax))
        instab = [scan.get_regime(w).get('instability_score', np.nan) for w in WINDOWS]
        values.append(np.nanmean(instab))
        values.append(np.nanmax(instab))
        coher = [scan.get_regime(w).get('market_coherence', np.nan) for w in WINDOWS]
        values.append(np.nanmean(coher))
        values.append(np.nanmin(coher))
        # Coherence trend: longest window minus shortest (w750 - w50)
        if not np.isnan(coher[-1]) and not np.isnan(coher[0]):
            values.append(coher[-1] - coher[0])
        else:
            values.append(np.nan)

        # From prices
        prices = (np.array(list(scan.market_prices.values()))
                  if scan.market_prices else np.array([]))
        values.append(len(prices))
        values.append(np.std(np.log(np.maximum(prices, 1e-10)))
                      if len(prices) > 0 else np.nan)

        return np.array(values)

    def compute_per_asset(self, scan: ScanData) -> Tuple[np.ndarray, List[str]]:
        """Compute per-asset indicator matrix (rows: sorted symbols).

        Column order MUST mirror PER_ASSET_INDICATORS.
        """
        symbols = scan.symbols
        n = len(symbols)
        if n == 0:
            return np.zeros((0, N_PER_ASSET)), []

        matrix = np.zeros((n, N_PER_ASSET))
        prices = np.array([scan.market_prices[s] for s in symbols])
        btc_p = scan.market_prices.get('BTC', scan.market_prices.get('BTCUSDT', np.nan))
        eth_p = scan.market_prices.get('ETH', scan.market_prices.get('ETHUSDT', np.nan))

        col = 0
        matrix[:, col] = prices; col += 1
        matrix[:, col] = np.log(np.maximum(prices, 1e-10)); col += 1
        matrix[:, col] = np.argsort(np.argsort(prices)) / n; col += 1
        # NaN / non-positive reference prices propagate NaN cross-rates
        matrix[:, col] = prices / btc_p if btc_p > 0 else np.nan; col += 1
        matrix[:, col] = prices / eth_p if eth_p > 0 else np.nan; col += 1

        # Per-window signals: one column per (metric, window) pair
        for metric in ['market_alignment', 'decoupling_velocity',
                       'anomaly_score', 'eigenvector_component']:
            for w in WINDOWS:
                sigs = scan.get_asset_signals(w)
                for i, sym in enumerate(symbols):
                    matrix[i, col] = sigs.get(sym, {}).get(metric, np.nan)
                col += 1

        # Aggregates.  FIX: column ranges were hard-coded (5-9, 13-17, 9-13);
        # now derived from the layout so they track WINDOWS automatically.
        n_base = 5                 # price-derived columns before window signals
        nw = len(WINDOWS)
        align_cols = list(range(n_base, n_base + nw))
        decouple_cols = list(range(n_base + nw, n_base + 2 * nw))
        anomaly_cols = list(range(n_base + 2 * nw, n_base + 3 * nw))

        matrix[:, col] = np.nanmean(matrix[:, align_cols], axis=1); col += 1
        matrix[:, col] = np.nanstd(matrix[:, align_cols], axis=1); col += 1
        matrix[:, col] = np.nanmax(matrix[:, anomaly_cols], axis=1); col += 1
        matrix[:, col] = np.nanmax(np.abs(matrix[:, decouple_cols]), axis=1); col += 1

        return matrix, symbols

    async def process(self, path: Path) -> Optional[Dict[str, Any]]:
        """Full pipeline for one scan file; None if the scan could not load."""
        start = time.time()
        scan = self.load_scan(path)
        if scan is None:
            return None

        # 1. API historical indicators
        api_matrix, api_success = await self.fetch_api_indicators(scan.timestamp)
        # 2. Scan-derived global
        scan_global = self.compute_scan_global(scan)
        # 3. Per-asset
        asset_matrix, asset_symbols = self.compute_per_asset(scan)

        # FIX: guard against an empty API_HISTORICAL set (mean of empty slice)
        hist_idx = [i - 1 for i in IndicatorSource.API_HISTORICAL]
        success_rate = float(np.mean(api_success[hist_idx])) if hist_idx else np.nan

        return {
            'scan_number': scan.scan_number,
            'timestamp': scan.timestamp.isoformat(),
            'processing_time': time.time() - start,
            'api_indicators': api_matrix,
            'api_success': api_success,
            'api_names': np.array([ind.name for ind in INDICATORS], dtype='U32'),
            'scan_global': scan_global,
            'scan_global_names': np.array([n for n, _ in SCAN_GLOBAL_INDICATORS], dtype='U32'),
            'asset_matrix': asset_matrix,
            'asset_symbols': np.array(asset_symbols, dtype='U16'),
            'asset_names': np.array([n for n, _ in PER_ASSET_INDICATORS], dtype='U32'),
            'n_assets': len(asset_symbols),
            'api_success_rate': success_rate,
        }


# =============================================================================
# OUTPUT
# =============================================================================

class OutputWriter:
    """Writes processed indicator dicts as compressed .npz next to the source."""

    def __init__(self, config: BackfillConfig):
        self.config = config

    def get_output_path(self, scan_path: Path) -> Path:
        """Output path; creates the directory as a side effect."""
        out_dir = (Path(self.config.output_dir) if self.config.output_dir
                   else scan_path.parent)
        out_dir.mkdir(parents=True, exist_ok=True)
        return out_dir / f"{scan_path.stem}__Indicators.npz"

    def save(self, data: Dict[str, Any], scan_path: Path) -> Path:
        """Serialize to .npz; scalars/strings are wrapped in 1-element arrays.

        NOTE(review): strings longer than 64 chars would be truncated by the
        U64 dtype -- current values (ISO timestamps) fit comfortably.
        """
        out_path = self.get_output_path(scan_path)
        save_data = {}
        for k, v in data.items():
            if isinstance(v, np.ndarray):
                save_data[k] = v
            elif isinstance(v, str):
                save_data[k] = np.array([v], dtype='U64')
            else:
                save_data[k] = np.array([v])
        np.savez_compressed(out_path, **save_data)
        return out_path


# =============================================================================
# RUNNER
# =============================================================================

class BackfillRunner:
    """Spiders the scan directory and drives processing + output."""

    def __init__(self, config: BackfillConfig):
        self.config = config
        self.processor = ScanProcessor(config)
        self.writer = OutputWriter(config)
        self.stats = {'processed': 0, 'failed': 0, 'skipped': 0}

    def find_scans(self) -> List[Path]:
        """Recursively collect scan_*.json, optionally skipping done files."""
        root = Path(self.config.scan_dir)
        files = sorted(root.rglob("scan_*.json"))
        if self.config.skip_existing:
            files = [f for f in files if not self.writer.get_output_path(f).exists()]
        return files

    async def run(self):
        """Process all pending scans sequentially; returns the stats dict."""
        unavail = IndicatorSource.get_unavailable_names()
        logger.info(f"Skipping {len(unavail)} unavailable indicators: {unavail[:5]}...")

        files = self.find_scans()
        logger.info(f"Processing {len(files)} files...")

        for i, path in enumerate(files):
            try:
                result = await self.processor.process(path)
                if result:
                    if not self.config.dry_run:
                        self.writer.save(result, path)
                    self.stats['processed'] += 1
                else:
                    self.stats['failed'] += 1
            except Exception as e:
                # Keep going on per-file errors; failures are counted.
                logger.error(f"Error {path.name}: {e}")
                self.stats['failed'] += 1

            if (i + 1) % 10 == 0:
                logger.info(f"Progress: {i+1}/{len(files)}")
            if self.config.rate_limit_delay > 0:
                await asyncio.sleep(self.config.rate_limit_delay)

        logger.info(f"Done: {self.stats}")
        return self.stats


# =============================================================================
# UTILITY
# =============================================================================

def load_indicators(path: str) -> Dict[str, np.ndarray]:
    """Load .npz indicator file into a plain dict of arrays."""
    return dict(np.load(path, allow_pickle=True))


def summary(path: str) -> str:
    """Human-readable summary of an indicator file."""
    d = load_indicators(path)
    return f"""Timestamp: {d['timestamp'][0]}
Assets: {d['n_assets'][0]}
API success: {d['api_success_rate'][0]:.1%}
API shape: {d['api_indicators'].shape}
Scan global: {d['scan_global'].shape}
Per-asset: {d['asset_matrix'].shape}"""


# =============================================================================
# CLI
# =============================================================================

def main():
    parser = argparse.ArgumentParser(description="DOLPHIN Backfill Runner")
    # FIX: scan directory was hard-coded and the positional arg commented out;
    # now an option with the same default, so existing invocations still work.
    parser.add_argument("--scan-dir", default=str(DEFAULT_SCAN_DIR),
                        help="Directory with scan JSON files")
    parser.add_argument("-o", "--output", help="Output directory")
    parser.add_argument("--fred-key", default="",
                        help="FRED API key (or set FRED_API_KEY env var)")
    parser.add_argument("--no-skip", action="store_true", help="Reprocess existing")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--delay", type=float, default=0.5)
    args = parser.parse_args()

    # SECURITY FIX: a FRED API key was previously hard-coded here (and in a
    # comment) -- it is committed history and must be ROTATED.  Keys now come
    # only from the CLI flag or the FRED_API_KEY environment variable.
    fred_key = args.fred_key or os.environ.get("FRED_API_KEY", "")

    config = BackfillConfig(
        scan_dir=Path(args.scan_dir),
        output_dir=args.output,
        fred_api_key=fred_key,
        skip_existing=not args.no_skip,
        dry_run=args.dry_run,
        rate_limit_delay=args.delay,
    )

    runner = BackfillRunner(config)
    asyncio.run(runner.run())


if __name__ == "__main__":
    main()