initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems:
- prod/ (BLUE harness, configs, scripts, docs)
- nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved)
- adaptive_exit/ (AEM engine + models/bucket_assignments.pkl)
- Observability/ (EsoF advisor, TUI, dashboards)
- external_factors/ (EsoF producer)
- mc_forewarning_qlabs_fork/ (MC regime/envelope)

Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
466
external_factors/backfill_runner.py
Executable file
466
external_factors/backfill_runner.py
Executable file
@@ -0,0 +1,466 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DOLPHIN BACKFILL RUNNER v2.0
|
||||
============================
|
||||
Spiders DOLPHIN scan directories, enriches with external factors matrix.
|
||||
|
||||
INDICATOR SOURCES:
|
||||
1. API_HISTORICAL: Fetched with scan timestamp (CoinMetrics, FRED, DeFi Llama, etc.)
|
||||
2. SCAN_DERIVED: Computed from scan's market_prices, tracking_data, per_asset_signals
|
||||
3. UNAVAILABLE: No historical API AND cannot compute from scan → NaN
|
||||
|
||||
Output: {original_name}__Indicators.npz (sorts alphabetically next to source)
|
||||
|
||||
Author: HJ / Claude
|
||||
Version: 2.0.0
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import numpy as np
|
||||
import asyncio
|
||||
import aiohttp
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Optional, Tuple, Any, Set
|
||||
import logging
|
||||
import time
|
||||
import argparse
|
||||
|
||||
# Import external factors module
|
||||
from external_factors_matrix import (
|
||||
ExternalFactorsFetcher, Config, INDICATORS, N_INDICATORS,
|
||||
HistoricalSupport, Stationarity, Category
|
||||
)
|
||||
|
||||
# Module-wide logging: timestamped INFO-level records on the root handler.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
|
||||
|
||||
# =============================================================================
|
||||
# INDICATOR SOURCE CLASSIFICATION
|
||||
# =============================================================================
|
||||
|
||||
class IndicatorSource:
    """Classifies each indicator by how it can be obtained for backfill.

    Two class-level buckets are filled once by classify():
      API_HISTORICAL - indicator ids fetchable for an arbitrary timestamp
      UNAVAILABLE    - ids with no history and no scan-derived equivalent
    """

    # Indicators that HAVE historical API support (fetch with timestamp)
    API_HISTORICAL: Set[int] = set()

    # Indicators that are UNAVAILABLE (no history, can't derive from scan)
    UNAVAILABLE: Set[int] = set()

    @classmethod
    def classify(cls):
        """Populate both buckets from the INDICATORS registry."""
        with_history = (HistoricalSupport.FULL, HistoricalSupport.PARTIAL)
        for ind in INDICATORS:
            bucket = cls.API_HISTORICAL if ind.historical in with_history else cls.UNAVAILABLE
            bucket.add(ind.id)

        logger.info(f"Indicator sources: API_HISTORICAL={len(cls.API_HISTORICAL)}, "
                    f"UNAVAILABLE={len(cls.UNAVAILABLE)}")

    @classmethod
    def get_unavailable_names(cls) -> List[str]:
        """Names of the unavailable indicators, in id order (ids are 1-based)."""
        return [INDICATORS[ind_id - 1].name for ind_id in sorted(cls.UNAVAILABLE)]


# Populate the buckets at import time so module-level consumers can rely on them.
IndicatorSource.classify()
|
||||
|
||||
# =============================================================================
|
||||
# CONFIGURATION
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class BackfillConfig:
    """Runtime configuration for a backfill run; fields mirror the CLI flags.

    FIX: the original declared `scan_dir: Path(r"C:\\...")` — a call expression
    in annotation position, so the hard-coded Windows path was evaluated as the
    field's *annotation* and never acted as a default.  Declared here as a
    proper required Path field; the default path belongs to the CLI in main().
    """
    # Root directory spidered (rglob) for scan_*.json files. Required.
    scan_dir: Path
    # None => each .npz is written next to its source scan file.
    output_dir: Optional[str] = None
    # Skip scans whose __Indicators.npz output already exists.
    skip_existing: bool = True
    # Process scans but do not write any output.
    dry_run: bool = False
    # FRED API key; empty string means unauthenticated access.
    fred_api_key: str = ""
    # Seconds slept between scans to stay inside upstream API rate limits.
    rate_limit_delay: float = 0.5
    verbose: bool = False
|
||||
|
||||
# =============================================================================
|
||||
# SCAN DATA
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class ScanData:
    """One parsed DOLPHIN scan file: prices plus per-window signal payloads."""
    path: Path
    scan_number: int
    timestamp: datetime
    market_prices: Dict[str, float]
    windows: Dict[str, Dict]

    @property
    def n_assets(self) -> int:
        """Number of assets with a recorded price."""
        return len(self.market_prices)

    @property
    def symbols(self) -> List[str]:
        """Asset symbols in deterministic (alphabetical) order."""
        return sorted(self.market_prices)

    def get_tracking(self, window: str) -> Dict:
        """Eigenvalue tracking payload for one window ({} when absent)."""
        window_data = self.windows.get(window, {})
        return window_data.get('tracking_data', {})

    def get_regime(self, window: str) -> Dict:
        """Regime-signal payload for one window ({} when absent)."""
        window_data = self.windows.get(window, {})
        return window_data.get('regime_signals', {})

    def get_asset_signals(self, window: str) -> Dict:
        """Per-asset signal payload for one window ({} when absent)."""
        window_data = self.windows.get(window, {})
        return window_data.get('per_asset_signals', {})
|
||||
|
||||
# =============================================================================
|
||||
# INDICATORS FROM SCAN DATA
|
||||
# =============================================================================
|
||||
|
||||
# Correlation window lengths (in scans) used throughout the indicator layout.
WINDOWS = ['50', '150', '300', '750']


def _windowed(key_prefix: str, label_fmt: str) -> List[Tuple[str, str]]:
    """Expand one (key, label) template across every window, e.g.
    ("lambda_max_w50", "Lambda max window 50") ... for all of WINDOWS."""
    return [(f"{key_prefix}_w{w}", label_fmt.format(w=w)) for w in WINDOWS]


# Global scan-derived indicators (eigenvalue-based, from tracking_data/regime_signals).
# Order here defines the column order of the 'scan_global' output vector.
SCAN_GLOBAL_INDICATORS = [
    # Per-window metric groups
    *_windowed("lambda_max", "Lambda max window {w}"),
    *_windowed("lambda_min", "Lambda min window {w}"),
    *_windowed("lambda_vel", "Lambda velocity window {w}"),
    *_windowed("lambda_acc", "Lambda acceleration window {w}"),
    *_windowed("eigrot_max", "Eigenvector rotation window {w}"),
    *_windowed("eiggap", "Eigenvalue gap window {w}"),
    *_windowed("instab", "Instability window {w}"),
    *_windowed("transp", "Transition prob window {w}"),
    *_windowed("coher", "Coherence window {w}"),
    # Cross-window aggregates
    ("lambda_max_mean", "Mean lambda max"),
    ("lambda_max_std", "Std lambda max"),
    ("instab_mean", "Mean instability"),
    ("instab_max", "Max instability"),
    ("coher_mean", "Mean coherence"),
    ("coher_min", "Min coherence"),
    ("coher_trend", "Coherence trend (w750-w50)"),
    # Derived from market prices
    ("n_assets", "Number of assets"),
    ("price_dispersion", "Log price dispersion"),
]

N_SCAN_GLOBAL = len(SCAN_GLOBAL_INDICATORS)

# Per-asset indicators; order defines the column order of 'asset_matrix'.
PER_ASSET_INDICATORS = [
    ("price", "Price"),
    ("log_price", "Log price"),
    ("price_rank", "Price percentile"),
    ("price_btc", "Price / BTC"),
    ("price_eth", "Price / ETH"),
    *_windowed("align", "Alignment w{w}"),
    *_windowed("decouple", "Decoupling w{w}"),
    *_windowed("anomaly", "Anomaly w{w}"),
    *_windowed("eigvec", "Eigenvector w{w}"),
    ("align_mean", "Mean alignment"),
    ("align_std", "Alignment std"),
    ("anomaly_max", "Max anomaly"),
    ("decouple_max", "Max |decoupling|"),
]

N_PER_ASSET = len(PER_ASSET_INDICATORS)
|
||||
|
||||
# =============================================================================
|
||||
# PROCESSOR
|
||||
# =============================================================================
|
||||
|
||||
class ScanProcessor:
    """Turns one DOLPHIN scan JSON file into three indicator blocks:

    1. API_HISTORICAL indicators fetched for the scan's timestamp,
    2. global indicators derived from the scan's eigenvalue tracking data,
    3. a per-asset indicator matrix.
    """

    def __init__(self, config: BackfillConfig):
        self.config = config
        self.fetcher = ExternalFactorsFetcher(Config(fred_api_key=config.fred_api_key))

    def load_scan(self, path: Path) -> Optional[ScanData]:
        """Parse a scan JSON file into ScanData; returns None (and logs) on failure.

        A missing or malformed timestamp falls back to "now" (UTC) so the
        scan is still processed instead of being dropped.
        """
        try:
            with open(path, 'r') as f:
                data = json.load(f)

            ts_str = data.get('timestamp', '')
            try:
                timestamp = datetime.fromisoformat(ts_str.replace('Z', '+00:00'))
                if timestamp.tzinfo is None:
                    timestamp = timestamp.replace(tzinfo=timezone.utc)
            # FIX: was a bare `except:` — catch only parse/type failures so
            # KeyboardInterrupt / SystemExit are not silently swallowed.
            except (ValueError, AttributeError, TypeError):
                timestamp = datetime.now(timezone.utc)

            return ScanData(
                path=path,
                scan_number=data.get('scan_number', 0),
                timestamp=timestamp,
                market_prices=data.get('market_prices', {}),
                windows=data.get('windows', {})
            )
        except Exception as e:
            logger.error(f"Load failed {path}: {e}")
            return None

    async def fetch_api_indicators(self, timestamp: datetime) -> Tuple[np.ndarray, np.ndarray]:
        """Fetch indicators with historical API support.

        Returns (matrix, success): values and per-indicator success flags, both
        of length N_INDICATORS.  Indicators without historical support are
        forced to NaN/False.  On total fetch failure returns all-NaN/all-False.
        """
        try:
            result = await self.fetcher.fetch_all(target_date=timestamp)
            matrix = result['matrix']
            # result['details'] is keyed by 1-based indicator id
            success = np.array([
                result['details'].get(i+1, {}).get('success', False)
                for i in range(N_INDICATORS)
            ])

            # Mark non-historical indicators as NaN so present-day values
            # cannot leak into a historical backfill.
            for i in range(N_INDICATORS):
                if (i+1) not in IndicatorSource.API_HISTORICAL:
                    success[i] = False
                    matrix[i] = np.nan

            return matrix, success
        except Exception as e:
            logger.warning(f"API fetch failed: {e}")
            return np.full(N_INDICATORS, np.nan), np.zeros(N_INDICATORS, dtype=bool)

    def compute_scan_global(self, scan: ScanData) -> np.ndarray:
        """Compute global indicators from the scan's tracking_data/regime_signals.

        Output order must match SCAN_GLOBAL_INDICATORS exactly: nine metric
        groups (metric-major, windows inner), then aggregates, then
        price-derived values.
        """
        values = []

        # Per-window metrics; metric-major ordering mirrors SCAN_GLOBAL_INDICATORS.
        # (Replaces nine copy-pasted loops with two key-driven ones.)
        tracking_keys = [
            'lambda_max', 'lambda_min', 'lambda_max_velocity',
            'lambda_max_acceleration', 'eigenvector_rotation_max', 'eigenvalue_gap',
        ]
        regime_keys = [
            'instability_score', 'regime_transition_probability', 'market_coherence',
        ]
        for key in tracking_keys:
            for w in WINDOWS:
                values.append(scan.get_tracking(w).get(key, np.nan))
        for key in regime_keys:
            for w in WINDOWS:
                values.append(scan.get_regime(w).get(key, np.nan))

        # Cross-window aggregates (nan-aware so partial scans still produce values)
        lmax = [scan.get_tracking(w).get('lambda_max', np.nan) for w in WINDOWS]
        values.append(np.nanmean(lmax))
        values.append(np.nanstd(lmax))

        instab = [scan.get_regime(w).get('instability_score', np.nan) for w in WINDOWS]
        values.append(np.nanmean(instab))
        values.append(np.nanmax(instab))

        coher = [scan.get_regime(w).get('market_coherence', np.nan) for w in WINDOWS]
        values.append(np.nanmean(coher))
        values.append(np.nanmin(coher))
        # Coherence trend: slowest window minus fastest (w750 - w50)
        values.append(coher[3] - coher[0] if not np.isnan(coher[3]) and not np.isnan(coher[0]) else np.nan)

        # Derived from market prices
        prices = np.array(list(scan.market_prices.values())) if scan.market_prices else np.array([])
        values.append(len(prices))
        # Clip at 1e-10 so log() stays finite for zero/degenerate prices
        values.append(np.std(np.log(np.maximum(prices, 1e-10))) if len(prices) > 0 else np.nan)

        return np.array(values)

    def compute_per_asset(self, scan: ScanData) -> Tuple[np.ndarray, List[str]]:
        """Compute the (n_assets x N_PER_ASSET) indicator matrix.

        Columns follow PER_ASSET_INDICATORS order; also returns the sorted
        symbol list that indexes the rows.
        """
        symbols = scan.symbols
        n = len(symbols)
        if n == 0:
            return np.zeros((0, N_PER_ASSET)), []

        matrix = np.zeros((n, N_PER_ASSET))
        prices = np.array([scan.market_prices[s] for s in symbols])

        btc_p = scan.market_prices.get('BTC', scan.market_prices.get('BTCUSDT', np.nan))
        eth_p = scan.market_prices.get('ETH', scan.market_prices.get('ETHUSDT', np.nan))

        col = 0
        matrix[:, col] = prices; col += 1
        matrix[:, col] = np.log(np.maximum(prices, 1e-10)); col += 1
        # Double argsort yields ranks; /n maps them into a [0, 1) percentile.
        matrix[:, col] = np.argsort(np.argsort(prices)) / n; col += 1
        # NaN reference price compares False against 0, so the whole column
        # becomes NaN when BTC/ETH is absent from the scan.
        matrix[:, col] = prices / btc_p if btc_p > 0 else np.nan; col += 1
        matrix[:, col] = prices / eth_p if eth_p > 0 else np.nan; col += 1

        # Per-window signals (metric-major, matching PER_ASSET_INDICATORS order)
        for metric in ['market_alignment', 'decoupling_velocity', 'anomaly_score', 'eigenvector_component']:
            for w in WINDOWS:
                sigs = scan.get_asset_signals(w)
                for i, sym in enumerate(symbols):
                    matrix[i, col] = sigs.get(sym, {}).get(metric, np.nan)
                col += 1

        # Aggregates over the fixed column layout above:
        # cols 5-8 = alignment, 9-12 = decoupling, 13-16 = anomaly
        align_cols = list(range(5, 9))
        matrix[:, col] = np.nanmean(matrix[:, align_cols], axis=1); col += 1
        matrix[:, col] = np.nanstd(matrix[:, align_cols], axis=1); col += 1

        anomaly_cols = list(range(13, 17))
        matrix[:, col] = np.nanmax(matrix[:, anomaly_cols], axis=1); col += 1

        decouple_cols = list(range(9, 13))
        matrix[:, col] = np.nanmax(np.abs(matrix[:, decouple_cols]), axis=1); col += 1

        return matrix, symbols

    async def process(self, path: Path) -> Optional[Dict[str, Any]]:
        """Run the full pipeline for one scan file.

        Returns the dict that OutputWriter.save() serializes, or None when
        the scan could not be loaded.
        """
        start = time.time()

        scan = self.load_scan(path)
        if scan is None:
            return None

        # 1. API historical indicators
        api_matrix, api_success = await self.fetch_api_indicators(scan.timestamp)

        # 2. Scan-derived global
        scan_global = self.compute_scan_global(scan)

        # 3. Per-asset
        asset_matrix, asset_symbols = self.compute_per_asset(scan)

        return {
            'scan_number': scan.scan_number,
            'timestamp': scan.timestamp.isoformat(),
            'processing_time': time.time() - start,

            'api_indicators': api_matrix,
            'api_success': api_success,
            'api_names': np.array([ind.name for ind in INDICATORS], dtype='U32'),

            'scan_global': scan_global,
            'scan_global_names': np.array([n for n, _ in SCAN_GLOBAL_INDICATORS], dtype='U32'),

            'asset_matrix': asset_matrix,
            'asset_symbols': np.array(asset_symbols, dtype='U16'),
            'asset_names': np.array([n for n, _ in PER_ASSET_INDICATORS], dtype='U32'),

            'n_assets': len(asset_symbols),
            # Success rate computed over historical-capable indicators only.
            'api_success_rate': np.nanmean(api_success[list(i-1 for i in IndicatorSource.API_HISTORICAL)]),
        }
|
||||
|
||||
# =============================================================================
|
||||
# OUTPUT
|
||||
# =============================================================================
|
||||
|
||||
class OutputWriter:
    """Serializes processed scan results to compressed .npz archives."""

    def __init__(self, config: BackfillConfig):
        self.config = config

    def get_output_path(self, scan_path: Path) -> Path:
        """Output path: <dir>/<scan stem>__Indicators.npz, where <dir> is the
        configured output directory or the scan file's own directory.  The
        directory is created on demand."""
        target = scan_path.parent
        if self.config.output_dir:
            target = Path(self.config.output_dir)
        target.mkdir(parents=True, exist_ok=True)
        return target / f"{scan_path.stem}__Indicators.npz"

    def save(self, data: Dict[str, Any], scan_path: Path) -> Path:
        """Write one result dict; every value is coerced to an ndarray so the
        whole payload fits np.savez_compressed.  Returns the written path."""
        def _as_array(value):
            # ndarrays pass through; strings become 1-element U64 arrays;
            # anything else (ints, floats) is wrapped as a 1-element array.
            if isinstance(value, np.ndarray):
                return value
            if isinstance(value, str):
                return np.array([value], dtype='U64')
            return np.array([value])

        out_path = self.get_output_path(scan_path)
        payload = {key: _as_array(value) for key, value in data.items()}
        np.savez_compressed(out_path, **payload)
        return out_path
|
||||
|
||||
# =============================================================================
|
||||
# RUNNER
|
||||
# =============================================================================
|
||||
|
||||
class BackfillRunner:
    """Orchestrates a backfill: discovers scan files, processes each one,
    writes the .npz output, and accumulates run statistics."""

    def __init__(self, config: BackfillConfig):
        self.config = config
        self.processor = ScanProcessor(config)
        self.writer = OutputWriter(config)
        self.stats = {'processed': 0, 'failed': 0, 'skipped': 0}

    def find_scans(self) -> List[Path]:
        """Recursively collect scan_*.json files under scan_dir, optionally
        dropping those whose output archive already exists."""
        candidates = sorted(Path(self.config.scan_dir).rglob("scan_*.json"))
        if not self.config.skip_existing:
            return candidates
        return [p for p in candidates if not self.writer.get_output_path(p).exists()]

    async def run(self):
        """Process every discovered scan sequentially.

        Logs progress every ten files, sleeps between scans to respect API
        rate limits, and returns the stats dict."""
        unavail = IndicatorSource.get_unavailable_names()
        logger.info(f"Skipping {len(unavail)} unavailable indicators: {unavail[:5]}...")

        files = self.find_scans()
        logger.info(f"Processing {len(files)} files...")

        for idx, scan_path in enumerate(files, start=1):
            try:
                result = await self.processor.process(scan_path)
                if result:
                    if not self.config.dry_run:
                        self.writer.save(result, scan_path)
                    self.stats['processed'] += 1
                else:
                    self.stats['failed'] += 1
            except Exception as e:
                logger.error(f"Error {scan_path.name}: {e}")
                self.stats['failed'] += 1

            if idx % 10 == 0:
                logger.info(f"Progress: {idx}/{len(files)}")

            if self.config.rate_limit_delay > 0:
                await asyncio.sleep(self.config.rate_limit_delay)

        logger.info(f"Done: {self.stats}")
        return self.stats
|
||||
|
||||
# =============================================================================
|
||||
# UTILITY
|
||||
# =============================================================================
|
||||
|
||||
def load_indicators(path: str) -> Dict[str, np.ndarray]:
    """Load a saved indicator .npz archive back into a plain dict.

    FIX: `dict(np.load(...))` left the underlying NpzFile (and its file
    handle) open; the context manager closes it after all arrays are read.

    NOTE(review): allow_pickle=True is unsafe on untrusted files — it is kept
    for backward compatibility, but confirm these archives are always locally
    produced by OutputWriter before loading third-party files.
    """
    with np.load(path, allow_pickle=True) as archive:
        # Materialize every array before the archive closes (access is lazy).
        return {key: archive[key] for key in archive.files}
|
||||
|
||||
def summary(path: str) -> str:
    """Human-readable multi-line summary of one indicator .npz file."""
    payload = load_indicators(path)
    parts = [
        f"Timestamp: {payload['timestamp'][0]}",
        f"Assets: {payload['n_assets'][0]}",
        f"API success: {payload['api_success_rate'][0]:.1%}",
        f"API shape: {payload['api_indicators'].shape}",
        f"Scan global: {payload['scan_global'].shape}",
        f"Per-asset: {payload['asset_matrix'].shape}",
    ]
    return "\n".join(parts)
|
||||
|
||||
# =============================================================================
|
||||
# CLI
|
||||
# =============================================================================
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, build the config, run the backfill.

    FIX (security): the FRED API key was hard-coded in source; it now comes
    from --fred-key or the FRED_API_KEY environment variable.  The scan_dir
    positional argument (previously commented out in favor of a hard-coded
    path) is restored as optional, with the historical path as its default
    so existing invocations behave identically.
    """
    parser = argparse.ArgumentParser(description="DOLPHIN Backfill Runner")
    parser.add_argument(
        "scan_dir", nargs="?",
        default=r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues",
        help="Directory with scan JSON files")
    parser.add_argument("-o", "--output", help="Output directory")
    parser.add_argument("--fred-key", default="",
                        help="FRED API key (falls back to $FRED_API_KEY)")
    parser.add_argument("--no-skip", action="store_true", help="Reprocess existing")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--delay", type=float, default=0.5)

    args = parser.parse_args()

    config = BackfillConfig(
        scan_dir=Path(args.scan_dir),
        output_dir=args.output,
        # SECURITY: never commit API keys; read from the environment instead.
        fred_api_key=args.fred_key or os.environ.get("FRED_API_KEY", ""),
        skip_existing=not args.no_skip,
        dry_run=args.dry_run,
        rate_limit_delay=args.delay,
    )

    runner = BackfillRunner(config)
    asyncio.run(runner.run())


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user