#!/usr/bin/env python3
"""
DOLPHIN BACKFILL RUNNER v2.0
============================
Spiders DOLPHIN scan directories, enriches with external factors matrix.

INDICATOR SOURCES:
1. API_HISTORICAL: Fetched with scan timestamp (CoinMetrics, FRED, DeFi Llama, etc.)
2. SCAN_DERIVED: Computed from scan's market_prices, tracking_data, per_asset_signals
3. UNAVAILABLE: No historical API AND cannot compute from scan -> NaN

Output: {original_name}__Indicators.npz (sorts alphabetically next to source)

Author: HJ / Claude
Version: 2.0.0
"""

import argparse
import asyncio
import json
import logging
import os
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple

import aiohttp  # noqa: F401 -- runtime dependency of ExternalFactorsFetcher
import numpy as np

# Import external factors module
from external_factors_matrix import (
    ExternalFactorsFetcher, Config, INDICATORS, N_INDICATORS,
    HistoricalSupport, Stationarity, Category  # noqa: F401
)

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s [%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)

# Default spider root, overridable via --scan-dir.
DEFAULT_SCAN_DIR = Path(
    r"C:\Users\Lenovo\Documents\- Dolphin NG HD (NG3)\correlation_arb512\eigenvalues"
)


# =============================================================================
# INDICATOR SOURCE CLASSIFICATION
# =============================================================================

class IndicatorSource:
    """Classifies each indicator by how it can be obtained for backfill."""

    # Indicator IDs that HAVE historical API support (fetch with timestamp)
    API_HISTORICAL: Set[int] = set()

    # Indicator IDs that are UNAVAILABLE (no history, can't derive from scan)
    UNAVAILABLE: Set[int] = set()

    @classmethod
    def classify(cls):
        """Classify all indicators by their backfill source.

        Idempotent: sets are only ever added to with the same membership.
        """
        for ind in INDICATORS:
            if ind.historical in [HistoricalSupport.FULL, HistoricalSupport.PARTIAL]:
                cls.API_HISTORICAL.add(ind.id)
            else:
                cls.UNAVAILABLE.add(ind.id)
        logger.info(f"Indicator sources: API_HISTORICAL={len(cls.API_HISTORICAL)}, "
                    f"UNAVAILABLE={len(cls.UNAVAILABLE)}")

    @classmethod
    def get_unavailable_names(cls) -> List[str]:
        """Names of indicators that cannot be backfilled.

        NOTE(review): assumes indicator IDs are 1-based and sequential in
        INDICATORS -- confirm against external_factors_matrix.
        """
        return [INDICATORS[i - 1].name for i in sorted(cls.UNAVAILABLE)]


# Initialize classification at import time so sets are ready for processing.
IndicatorSource.classify()


# =============================================================================
# CONFIGURATION
# =============================================================================

@dataclass
class BackfillConfig:
    """Runner settings.

    FIX: the original declared ``scan_dir: Path(r"...")`` -- a Path *call*
    used as the type annotation with no default value.  It is now a proper
    annotated field with a default.
    """
    scan_dir: Path = DEFAULT_SCAN_DIR      # directory to spider for scan_*.json
    output_dir: Optional[str] = None       # None -> write next to each source file
    skip_existing: bool = True             # skip scans whose .npz already exists
    dry_run: bool = False                  # process but do not write output
    fred_api_key: str = ""                 # FRED API key (never hard-code!)
    rate_limit_delay: float = 0.5          # seconds between scans (API politeness)
    verbose: bool = False


# =============================================================================
# SCAN DATA
# =============================================================================

@dataclass
class ScanData:
    """One parsed DOLPHIN scan JSON file."""
    path: Path
    scan_number: int
    timestamp: datetime
    market_prices: Dict[str, float]   # symbol -> last price
    windows: Dict[str, Dict]          # window name ('50', ...) -> window payload

    @property
    def n_assets(self) -> int:
        return len(self.market_prices)

    @property
    def symbols(self) -> List[str]:
        """Symbols in deterministic (sorted) order for stable matrix rows."""
        return sorted(self.market_prices.keys())

    def get_tracking(self, window: str) -> Dict:
        return self.windows.get(window, {}).get('tracking_data', {})

    def get_regime(self, window: str) -> Dict:
        return self.windows.get(window, {}).get('regime_signals', {})

    def get_asset_signals(self, window: str) -> Dict:
        return self.windows.get(window, {}).get('per_asset_signals', {})


# =============================================================================
# INDICATORS FROM SCAN DATA
# =============================================================================

WINDOWS = ['50', '150', '300', '750']

# Global scan-derived indicators (eigenvalue-based, from tracking_data/regime_signals).
# Order here MUST match the append order in ScanProcessor.compute_scan_global.
SCAN_GLOBAL_INDICATORS = [
    *[(f"lambda_max_w{w}", f"Lambda max window {w}") for w in WINDOWS],
    *[(f"lambda_min_w{w}", f"Lambda min window {w}") for w in WINDOWS],
    *[(f"lambda_vel_w{w}", f"Lambda velocity window {w}") for w in WINDOWS],
    *[(f"lambda_acc_w{w}", f"Lambda acceleration window {w}") for w in WINDOWS],
    *[(f"eigrot_max_w{w}", f"Eigenvector rotation window {w}") for w in WINDOWS],
    *[(f"eiggap_w{w}", f"Eigenvalue gap window {w}") for w in WINDOWS],
    *[(f"instab_w{w}", f"Instability window {w}") for w in WINDOWS],
    *[(f"transp_w{w}", f"Transition prob window {w}") for w in WINDOWS],
    *[(f"coher_w{w}", f"Coherence window {w}") for w in WINDOWS],
    # Aggregates
    ("lambda_max_mean", "Mean lambda max"),
    ("lambda_max_std", "Std lambda max"),
    ("instab_mean", "Mean instability"),
    ("instab_max", "Max instability"),
    ("coher_mean", "Mean coherence"),
    ("coher_min", "Min coherence"),
    ("coher_trend", "Coherence trend (w750-w50)"),
    # From prices
    ("n_assets", "Number of assets"),
    ("price_dispersion", "Log price dispersion"),
]

N_SCAN_GLOBAL = len(SCAN_GLOBAL_INDICATORS)

# Per-asset indicators.  Order here MUST match the column order in
# ScanProcessor.compute_per_asset.
PER_ASSET_INDICATORS = [
    ("price", "Price"),
    ("log_price", "Log price"),
    ("price_rank", "Price percentile"),
    ("price_btc", "Price / BTC"),
    ("price_eth", "Price / ETH"),
    *[(f"align_w{w}", f"Alignment w{w}") for w in WINDOWS],
    *[(f"decouple_w{w}", f"Decoupling w{w}") for w in WINDOWS],
    *[(f"anomaly_w{w}", f"Anomaly w{w}") for w in WINDOWS],
    *[(f"eigvec_w{w}", f"Eigenvector w{w}") for w in WINDOWS],
    ("align_mean", "Mean alignment"),
    ("align_std", "Alignment std"),
    ("anomaly_max", "Max anomaly"),
    ("decouple_max", "Max |decoupling|"),
]

N_PER_ASSET = len(PER_ASSET_INDICATORS)


# =============================================================================
# PROCESSOR
# =============================================================================

class ScanProcessor:
    """Loads one scan file and produces all indicator arrays for it."""

    def __init__(self, config: BackfillConfig):
        self.config = config
        self.fetcher = ExternalFactorsFetcher(Config(fred_api_key=config.fred_api_key))

    def load_scan(self, path: Path) -> Optional[ScanData]:
        """Parse a scan JSON file; returns None on any load failure.

        A missing/unparseable timestamp falls back to "now" (UTC) rather than
        failing the whole scan.
        """
        try:
            with open(path, 'r') as f:
                data = json.load(f)
            ts_str = data.get('timestamp', '')
            try:
                timestamp = datetime.fromisoformat(ts_str.replace('Z', '+00:00'))
                if timestamp.tzinfo is None:
                    timestamp = timestamp.replace(tzinfo=timezone.utc)
            except (ValueError, TypeError, AttributeError):
                # FIX: was a bare `except:` -- narrowed to what fromisoformat/
                # str.replace can actually raise.
                timestamp = datetime.now(timezone.utc)
            return ScanData(
                path=path,
                scan_number=data.get('scan_number', 0),
                timestamp=timestamp,
                market_prices=data.get('market_prices', {}),
                windows=data.get('windows', {}),
            )
        except Exception as e:
            logger.error(f"Load failed {path}: {e}")
            return None

    async def fetch_api_indicators(self, timestamp: datetime) -> Tuple[np.ndarray, np.ndarray]:
        """Fetch indicators with historical API support.

        Returns (matrix, success) where indicators without historical support
        are forced to NaN / success=False.  On total failure returns an
        all-NaN matrix so processing can continue.
        """
        try:
            result = await self.fetcher.fetch_all(target_date=timestamp)
            matrix = result['matrix']
            success = np.array([
                result['details'].get(i + 1, {}).get('success', False)
                for i in range(N_INDICATORS)
            ])
            # Mark non-historical indicators as NaN -- live-only values would
            # be anachronistic for a backfill.
            for i in range(N_INDICATORS):
                if (i + 1) not in IndicatorSource.API_HISTORICAL:
                    success[i] = False
                    matrix[i] = np.nan
            return matrix, success
        except Exception as e:
            logger.warning(f"API fetch failed: {e}")
            return np.full(N_INDICATORS, np.nan), np.zeros(N_INDICATORS, dtype=bool)

    def compute_scan_global(self, scan: ScanData) -> np.ndarray:
        """Compute global indicators from scan's tracking_data and regime_signals.

        Append order MUST mirror SCAN_GLOBAL_INDICATORS.
        """
        values: List[float] = []

        # Per-window metrics, one key at a time across all windows
        for key in ('lambda_max', 'lambda_min', 'lambda_max_velocity',
                    'lambda_max_acceleration', 'eigenvector_rotation_max',
                    'eigenvalue_gap'):
            for w in WINDOWS:
                values.append(scan.get_tracking(w).get(key, np.nan))
        for key in ('instability_score', 'regime_transition_probability',
                    'market_coherence'):
            for w in WINDOWS:
                values.append(scan.get_regime(w).get(key, np.nan))

        # Aggregates (nan-aware so partially-populated scans still contribute)
        lmax = [scan.get_tracking(w).get('lambda_max', np.nan) for w in WINDOWS]
        values.append(np.nanmean(lmax))
        values.append(np.nanstd(lmax))
        instab = [scan.get_regime(w).get('instability_score', np.nan) for w in WINDOWS]
        values.append(np.nanmean(instab))
        values.append(np.nanmax(instab))
        coher = [scan.get_regime(w).get('market_coherence', np.nan) for w in WINDOWS]
        values.append(np.nanmean(coher))
        values.append(np.nanmin(coher))
        # Coherence trend: longest window minus shortest (w750 - w50)
        if not np.isnan(coher[-1]) and not np.isnan(coher[0]):
            values.append(coher[-1] - coher[0])
        else:
            values.append(np.nan)

        # From prices
        prices = (np.array(list(scan.market_prices.values()))
                  if scan.market_prices else np.array([]))
        values.append(len(prices))
        values.append(np.std(np.log(np.maximum(prices, 1e-10)))
                      if len(prices) > 0 else np.nan)

        return np.array(values)

    def compute_per_asset(self, scan: ScanData) -> Tuple[np.ndarray, List[str]]:
        """Compute per-asset indicator matrix (rows: sorted symbols).

        Column order MUST mirror PER_ASSET_INDICATORS.
        """
        symbols = scan.symbols
        n = len(symbols)
        if n == 0:
            return np.zeros((0, N_PER_ASSET)), []

        matrix = np.zeros((n, N_PER_ASSET))
        prices = np.array([scan.market_prices[s] for s in symbols])
        btc_p = scan.market_prices.get('BTC', scan.market_prices.get('BTCUSDT', np.nan))
        eth_p = scan.market_prices.get('ETH', scan.market_prices.get('ETHUSDT', np.nan))

        col = 0
        matrix[:, col] = prices; col += 1
        matrix[:, col] = np.log(np.maximum(prices, 1e-10)); col += 1
        matrix[:, col] = np.argsort(np.argsort(prices)) / n; col += 1
        # NaN / non-positive reference prices propagate NaN cross-rates
        matrix[:, col] = prices / btc_p if btc_p > 0 else np.nan; col += 1
        matrix[:, col] = prices / eth_p if eth_p > 0 else np.nan; col += 1

        # Per-window signals: one column per (metric, window) pair
        for metric in ['market_alignment', 'decoupling_velocity',
                       'anomaly_score', 'eigenvector_component']:
            for w in WINDOWS:
                sigs = scan.get_asset_signals(w)
                for i, sym in enumerate(symbols):
                    matrix[i, col] = sigs.get(sym, {}).get(metric, np.nan)
                col += 1

        # Aggregates.  FIX: column ranges were hard-coded (5-9, 13-17, 9-13);
        # now derived from the layout so they track WINDOWS automatically.
        n_base = 5                 # price-derived columns before window signals
        nw = len(WINDOWS)
        align_cols = list(range(n_base, n_base + nw))
        decouple_cols = list(range(n_base + nw, n_base + 2 * nw))
        anomaly_cols = list(range(n_base + 2 * nw, n_base + 3 * nw))

        matrix[:, col] = np.nanmean(matrix[:, align_cols], axis=1); col += 1
        matrix[:, col] = np.nanstd(matrix[:, align_cols], axis=1); col += 1
        matrix[:, col] = np.nanmax(matrix[:, anomaly_cols], axis=1); col += 1
        matrix[:, col] = np.nanmax(np.abs(matrix[:, decouple_cols]), axis=1); col += 1

        return matrix, symbols

    async def process(self, path: Path) -> Optional[Dict[str, Any]]:
        """Full pipeline for one scan file; None if the scan could not load."""
        start = time.time()
        scan = self.load_scan(path)
        if scan is None:
            return None

        # 1. API historical indicators
        api_matrix, api_success = await self.fetch_api_indicators(scan.timestamp)
        # 2. Scan-derived global
        scan_global = self.compute_scan_global(scan)
        # 3. Per-asset
        asset_matrix, asset_symbols = self.compute_per_asset(scan)

        # FIX: guard against an empty API_HISTORICAL set (mean of empty slice)
        hist_idx = [i - 1 for i in IndicatorSource.API_HISTORICAL]
        success_rate = float(np.mean(api_success[hist_idx])) if hist_idx else np.nan

        return {
            'scan_number': scan.scan_number,
            'timestamp': scan.timestamp.isoformat(),
            'processing_time': time.time() - start,
            'api_indicators': api_matrix,
            'api_success': api_success,
            'api_names': np.array([ind.name for ind in INDICATORS], dtype='U32'),
            'scan_global': scan_global,
            'scan_global_names': np.array([n for n, _ in SCAN_GLOBAL_INDICATORS], dtype='U32'),
            'asset_matrix': asset_matrix,
            'asset_symbols': np.array(asset_symbols, dtype='U16'),
            'asset_names': np.array([n for n, _ in PER_ASSET_INDICATORS], dtype='U32'),
            'n_assets': len(asset_symbols),
            'api_success_rate': success_rate,
        }


# =============================================================================
# OUTPUT
# =============================================================================

class OutputWriter:
    """Writes processed indicator dicts as compressed .npz next to the source."""

    def __init__(self, config: BackfillConfig):
        self.config = config

    def get_output_path(self, scan_path: Path) -> Path:
        """Output path; creates the directory as a side effect."""
        out_dir = (Path(self.config.output_dir) if self.config.output_dir
                   else scan_path.parent)
        out_dir.mkdir(parents=True, exist_ok=True)
        return out_dir / f"{scan_path.stem}__Indicators.npz"

    def save(self, data: Dict[str, Any], scan_path: Path) -> Path:
        """Serialize to .npz; scalars/strings are wrapped in 1-element arrays.

        NOTE(review): strings longer than 64 chars would be truncated by the
        U64 dtype -- current values (ISO timestamps) fit comfortably.
        """
        out_path = self.get_output_path(scan_path)
        save_data = {}
        for k, v in data.items():
            if isinstance(v, np.ndarray):
                save_data[k] = v
            elif isinstance(v, str):
                save_data[k] = np.array([v], dtype='U64')
            else:
                save_data[k] = np.array([v])
        np.savez_compressed(out_path, **save_data)
        return out_path


# =============================================================================
# RUNNER
# =============================================================================

class BackfillRunner:
    """Spiders the scan directory and drives processing + output."""

    def __init__(self, config: BackfillConfig):
        self.config = config
        self.processor = ScanProcessor(config)
        self.writer = OutputWriter(config)
        self.stats = {'processed': 0, 'failed': 0, 'skipped': 0}

    def find_scans(self) -> List[Path]:
        """Recursively collect scan_*.json, optionally skipping done files."""
        root = Path(self.config.scan_dir)
        files = sorted(root.rglob("scan_*.json"))
        if self.config.skip_existing:
            files = [f for f in files if not self.writer.get_output_path(f).exists()]
        return files

    async def run(self):
        """Process all pending scans sequentially; returns the stats dict."""
        unavail = IndicatorSource.get_unavailable_names()
        logger.info(f"Skipping {len(unavail)} unavailable indicators: {unavail[:5]}...")

        files = self.find_scans()
        logger.info(f"Processing {len(files)} files...")

        for i, path in enumerate(files):
            try:
                result = await self.processor.process(path)
                if result:
                    if not self.config.dry_run:
                        self.writer.save(result, path)
                    self.stats['processed'] += 1
                else:
                    self.stats['failed'] += 1
            except Exception as e:
                # Keep going on per-file errors; failures are counted.
                logger.error(f"Error {path.name}: {e}")
                self.stats['failed'] += 1

            if (i + 1) % 10 == 0:
                logger.info(f"Progress: {i+1}/{len(files)}")
            if self.config.rate_limit_delay > 0:
                await asyncio.sleep(self.config.rate_limit_delay)

        logger.info(f"Done: {self.stats}")
        return self.stats


# =============================================================================
# UTILITY
# =============================================================================

def load_indicators(path: str) -> Dict[str, np.ndarray]:
    """Load .npz indicator file into a plain dict of arrays."""
    return dict(np.load(path, allow_pickle=True))


def summary(path: str) -> str:
    """Human-readable summary of an indicator file."""
    d = load_indicators(path)
    return f"""Timestamp: {d['timestamp'][0]}
Assets: {d['n_assets'][0]}
API success: {d['api_success_rate'][0]:.1%}
API shape: {d['api_indicators'].shape}
Scan global: {d['scan_global'].shape}
Per-asset: {d['asset_matrix'].shape}"""


# =============================================================================
# CLI
# =============================================================================

def main():
    parser = argparse.ArgumentParser(description="DOLPHIN Backfill Runner")
    # FIX: scan directory was hard-coded and the positional arg commented out;
    # now an option with the same default, so existing invocations still work.
    parser.add_argument("--scan-dir", default=str(DEFAULT_SCAN_DIR),
                        help="Directory with scan JSON files")
    parser.add_argument("-o", "--output", help="Output directory")
    parser.add_argument("--fred-key", default="",
                        help="FRED API key (or set FRED_API_KEY env var)")
    parser.add_argument("--no-skip", action="store_true", help="Reprocess existing")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--delay", type=float, default=0.5)
    args = parser.parse_args()

    # SECURITY FIX: a FRED API key was previously hard-coded here (and in a
    # comment) -- it is committed history and must be ROTATED.  Keys now come
    # only from the CLI flag or the FRED_API_KEY environment variable.
    fred_key = args.fred_key or os.environ.get("FRED_API_KEY", "")

    config = BackfillConfig(
        scan_dir=Path(args.scan_dir),
        output_dir=args.output,
        fred_api_key=fred_key,
        skip_existing=not args.no_skip,
        dry_run=args.dry_run,
        rate_limit_delay=args.delay,
    )

    runner = BackfillRunner(config)
    asyncio.run(runner.run())


if __name__ == "__main__":
    main()