#!/usr/bin/env python3
"""
EXF PERSISTENCE SERVICE v1.0
============================

Off-hot-path persistence for External Factors indicators.

Design:
- Background thread, non-blocking to hot path
- Writes ALL fetched indicators to NPZ (DOLPHIN-compliant format)
- Separate from Hazelcast push (HZ = instant, Disk = durability)
- Integrity checksums for validation
- Automatic cleanup of old files

DOLPHIN Format:
    /mnt/ng6_data/eigenvalues/{YYYY-MM-DD}/extf_snapshot_{timestamp}__Indicators.npz

Author: DOLPHIN ExF System
Date: 2026-03-17
"""

import copy
import hashlib
import json
import logging
import shutil
import threading
import time
from collections import deque
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np

logger = logging.getLogger(__name__)

# Configuration
DATA_DIR = Path("/mnt/ng6_data/eigenvalues")
FLUSH_INTERVAL_S = 300        # 5 minutes (off hot path)
MAX_FILE_AGE_DAYS = 7         # Keep 7 days of history
REQUIRED_INDICATORS_MIN = 20  # Minimum indicators for data sufficiency


@dataclass
class PersistenceStats:
    """Statistics for monitoring."""
    files_written: int = 0
    files_failed: int = 0
    bytes_written: int = 0
    last_write_time: float = 0.0
    last_write_path: Optional[Path] = None
    integrity_errors: int = 0
    # Ring buffer of recent write records (bounded so it cannot grow forever).
    history: deque = field(default_factory=lambda: deque(maxlen=100))


class ExFPersistenceService:
    """
    Off-hot-path persistence service for External Factors.

    Usage:
        svc = ExFPersistenceService()
        svc.start()

        # In hot path (exf_fetcher_flow):
        svc.update_snapshot(indicators_dict)

        svc.stop()
    """

    def __init__(self,
                 data_dir: Path = DATA_DIR,
                 flush_interval_s: float = FLUSH_INTERVAL_S):
        """
        Args:
            data_dir: Root directory for NPZ snapshots (date subdirs created under it).
            flush_interval_s: Seconds between background disk flushes.
        """
        self.data_dir = Path(data_dir)
        self.flush_interval_s = flush_interval_s
        self._stats = PersistenceStats()
        self._latest_snapshot: Optional[Dict[str, Any]] = None
        self._snapshot_lock = threading.Lock()
        self._running = False
        self._thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()

        # Ensure data directory exists
        self.data_dir.mkdir(parents=True, exist_ok=True)

    # ----- Public API (thread-safe, non-blocking) -----

    def update_snapshot(self, indicators: Dict[str, Any]) -> None:
        """
        Update the latest snapshot (called from hot path).

        This is non-blocking - just updates in-memory reference.
        """
        with self._snapshot_lock:
            # Deep copy so nested structures (e.g. '_staleness_s') cannot be
            # mutated by the hot path while the flush thread is writing them.
            # (A shallow dict() copy would still share the nested dicts.)
            self._latest_snapshot = copy.deepcopy(indicators)

    def get_stats(self) -> Dict[str, Any]:
        """Get persistence statistics."""
        with self._snapshot_lock:
            return {
                'files_written': self._stats.files_written,
                'files_failed': self._stats.files_failed,
                'bytes_written': self._stats.bytes_written,
                'last_write_time': self._stats.last_write_time,
                'last_write_path': str(self._stats.last_write_path) if self._stats.last_write_path else None,
                'integrity_errors': self._stats.integrity_errors,
                'has_snapshot': self._latest_snapshot is not None,
                'snapshot_keys': len(self._latest_snapshot) if self._latest_snapshot else 0,
            }

    def check_data_sufficiency(self) -> Dict[str, Any]:
        """
        Check if we have sufficient data for ACB and alpha engine.

        Returns sufficiency score and details.
        """
        with self._snapshot_lock:
            if not self._latest_snapshot:
                return {'sufficient': False, 'score': 0.0, 'reason': 'no_snapshot'}

            snapshot = self._latest_snapshot

            # ACB-critical indicators (must have fresh data)
            acb_critical = [
                'funding_btc', 'funding_eth', 'dvol_btc', 'dvol_eth',
                'fng', 'vix', 'ls_btc', 'taker', 'oi_btc'
            ]

            # Count available and fresh indicators
            acb_available = sum(1 for k in acb_critical if k in snapshot)
            total_available = sum(1 for k in snapshot.keys() if not k.startswith('_'))

            # Check staleness
            staleness = snapshot.get('_staleness_s', {})
            stale_count = sum(1 for k, v in staleness.items()
                              if isinstance(v, (int, float)) and v > 60)  # >60s = stale

            # Calculate sufficiency score: ACB coverage dominates (50%),
            # overall indicator coverage 30%, freshness 20%.
            acb_score = acb_available / len(acb_critical)
            coverage_score = min(1.0, total_available / REQUIRED_INDICATORS_MIN)
            freshness_score = 1.0 - (stale_count / max(1, len(staleness)))

            overall_score = (acb_score * 0.5) + (coverage_score * 0.3) + (freshness_score * 0.2)

            return {
                'sufficient': overall_score > 0.7,
                'score': round(overall_score, 3),
                'acb_critical': f"{acb_available}/{len(acb_critical)}",
                'total_indicators': total_available,
                'stale_indicators': stale_count,
                'freshness': round(freshness_score, 3),
            }

    # ----- Background Operations -----

    def _compute_checksum(self, data: bytes) -> str:
        """Compute SHA-256 checksum for integrity verification.

        NOTE: must stay SHA-256 — the sidecar file written by _write_npz is
        named '*.sha256', so the algorithm and extension must agree.
        """
        return hashlib.sha256(data).hexdigest()

    def _write_npz(self, snapshot: Dict[str, Any]) -> Optional[Path]:
        """
        Write snapshot to NPZ file in DOLPHIN-compliant format.

        Numeric scalars/sequences become float64 arrays; '_'-prefixed and
        non-numeric entries are folded into a JSON '_metadata' field.

        Returns path on success, None on failure.
        """
        try:
            now = datetime.now(timezone.utc)
            date_str = now.strftime('%Y-%m-%d')
            timestamp = now.strftime('%Y-%m-%d_%H-%M-%S')

            # Create date directory
            date_dir = self.data_dir / date_str
            date_dir.mkdir(parents=True, exist_ok=True)

            # Prepare filename
            filename = f"extf_snapshot_{timestamp}__Indicators.npz"
            filepath = date_dir / filename

            # Separate numeric data from metadata
            numeric_data = {}
            metadata = {
                '_timestamp_utc': now.isoformat(),
                '_version': '1.0',
                '_service': 'ExFPersistence',
            }

            for key, value in snapshot.items():
                if key.startswith('_'):
                    # Metadata/structural keys
                    if isinstance(value, dict):
                        metadata[key] = json.dumps(value)
                    else:
                        metadata[key] = value
                elif isinstance(value, (int, float)):
                    if value == value:  # Not NaN (NaN != NaN)
                        numeric_data[key] = np.array([value], dtype=np.float64)
                elif isinstance(value, (list, tuple)) and len(value) > 0:
                    try:
                        numeric_data[key] = np.array(value, dtype=np.float64)
                    except (ValueError, TypeError):
                        metadata[key] = json.dumps(value)
                else:
                    metadata[key] = str(value)

            # Write NPZ with compression
            np.savez_compressed(
                filepath,
                _metadata=json.dumps(metadata),
                **numeric_data
            )

            # Verify checksum
            file_bytes = filepath.read_bytes()
            checksum = self._compute_checksum(file_bytes)

            # Write checksum sidecar in standard `sha256sum` format:
            # "<hex digest>  <filename>" so it can be verified with
            # `sha256sum -c`.
            checksum_path = filepath.with_suffix('.npz.sha256')
            checksum_path.write_text(f"{checksum}  {filepath.name}\n")

            logger.info(f"Wrote ExF snapshot: {filepath.name} ({len(numeric_data)} indicators, {len(file_bytes)} bytes)")
            return filepath

        except Exception as e:
            logger.error(f"Failed to write ExF snapshot: {e}")
            self._stats.files_failed += 1
            return None

    def _cleanup_old_files(self) -> int:
        """Remove files older than MAX_FILE_AGE_DAYS.

        Returns count removed."""
        removed = 0
        cutoff = time.time() - (MAX_FILE_AGE_DAYS * 24 * 3600)

        try:
            for date_dir in self.data_dir.iterdir():
                if not date_dir.is_dir():
                    continue

                # Check if directory is old (by mtime, not by parsing the name)
                try:
                    dir_time = date_dir.stat().st_mtime
                    if dir_time < cutoff:
                        # Remove old directory tree
                        shutil.rmtree(date_dir)
                        removed += 1
                        logger.info(f"Cleaned up old ExF directory: {date_dir.name}")
                except Exception as e:
                    # Best-effort: a single failed dir must not abort cleanup
                    logger.warning(f"Failed to cleanup {date_dir}: {e}")
        except Exception as e:
            logger.warning(f"Cleanup error: {e}")

        return removed

    def _flush_loop(self) -> None:
        """Background thread: periodically flush to disk."""
        logger.info(f"ExF Persistence loop started (interval={self.flush_interval_s}s)")

        while not self._stop_event.is_set():
            try:
                # Grab a private copy of the current snapshot under the lock,
                # then do all disk I/O outside the lock.
                with self._snapshot_lock:
                    snapshot = self._latest_snapshot.copy() if self._latest_snapshot else None

                if snapshot:
                    # Write to disk
                    filepath = self._write_npz(snapshot)
                    if filepath:
                        file_size = filepath.stat().st_size
                        self._stats.files_written += 1
                        self._stats.bytes_written += file_size
                        self._stats.last_write_time = time.time()
                        self._stats.last_write_path = filepath

                        # Record history entry
                        sufficiency = self.check_data_sufficiency()
                        self._stats.history.append({
                            'timestamp': datetime.now(timezone.utc).isoformat(),
                            'path': str(filepath),
                            'size': file_size,
                            'indicators': len([k for k in snapshot.keys() if not k.startswith('_')]),
                            'sufficient': sufficiency['sufficient'],
                            'score': sufficiency['score'],
                        })

                        # Periodic cleanup (every ~20 flushes = ~100 min)
                        if self._stats.files_written % 20 == 0:
                            self._cleanup_old_files()

            except Exception as e:
                logger.error(f"Flush loop error: {e}")
                self._stats.files_failed += 1

            # Wait for next flush (wakes early if stop() is called)
            self._stop_event.wait(timeout=self.flush_interval_s)

        logger.info("ExF Persistence loop stopped")

    # ----- Lifecycle -----

    def start(self) -> None:
        """Start the persistence service."""
        if self._running:
            return

        self._running = True
        self._stop_event.clear()
        self._thread = threading.Thread(target=self._flush_loop, daemon=True)
        self._thread.start()
        logger.info(f"ExFPersistenceService started (data_dir={self.data_dir})")

    def stop(self) -> None:
        """Stop the persistence service."""
        if not self._running:
            return

        self._running = False
        self._stop_event.set()
        if self._thread:
            self._thread.join(timeout=10)
        logger.info("ExFPersistenceService stopped")

    def force_flush(self) -> Optional[Path]:
        """Force an immediate flush (for testing/debugging)."""
        with self._snapshot_lock:
            snapshot = self._latest_snapshot.copy() if self._latest_snapshot else None
        if snapshot:
            return self._write_npz(snapshot)
        return None


# =====================================================================
# INTEGRITY CHECKER
# =====================================================================

class ExFIntegrityChecker:
    """
    Verifies integrity between Hazelcast cache and disk persistence.

    Can be run periodically to detect data divergence.
    """

    def __init__(self, hz_client, persistence_service: ExFPersistenceService):
        # hz_client: Hazelcast client exposing get_map(); its map .get()
        # returns a future (we call .result() on it).
        self.hz_client = hz_client
        self.persistence = persistence_service

    def check_integrity(self) -> Dict[str, Any]:
        """
        Compare HZ data with last persisted file.

        Returns integrity report.
        """
        try:
            # Get HZ data
            features_map = self.hz_client.get_map("DOLPHIN_FEATURES")
            hz_data_raw = features_map.get("exf_latest").result()

            if not hz_data_raw:
                return {'status': 'error', 'reason': 'hz_data_missing'}

            hz_data = json.loads(hz_data_raw) if isinstance(hz_data_raw, str) else hz_data_raw

            # Get last persisted file
            stats = self.persistence.get_stats()
            last_path = stats.get('last_write_path')

            if not last_path:
                return {'status': 'error', 'reason': 'no_persisted_file'}

            # Load persisted data
            with np.load(last_path, allow_pickle=True) as npz:
                metadata = json.loads(str(npz['_metadata']))
                persisted_indicators = set(k for k in npz.keys() if not k.startswith('_'))

            # Compare
            hz_indicators = set(k for k in hz_data.keys() if not k.startswith('_'))

            missing_in_persist = hz_indicators - persisted_indicators
            missing_in_hz = persisted_indicators - hz_indicators

            # Check timestamp drift
            hz_time = hz_data.get('_pushed_at', '')
            persist_time = metadata.get('_timestamp_utc', '')

            time_drift_s = None
            if hz_time and persist_time:
                try:
                    hz_dt = datetime.fromisoformat(hz_time.replace('Z', '+00:00'))
                    p_dt = datetime.fromisoformat(persist_time.replace('Z', '+00:00'))
                    time_drift_s = abs((hz_dt - p_dt).total_seconds())
                except (ValueError, TypeError):
                    # Unparseable timestamps: drift stays unknown (None)
                    pass

            integrity_ok = len(missing_in_persist) == 0 and len(missing_in_hz) == 0
            # Explicit None check: a drift of 0.0 is valid data, not "missing".
            if time_drift_s is not None and time_drift_s > 600:  # >10 min drift
                integrity_ok = False

            return {
                'status': 'ok' if integrity_ok else 'mismatch',
                'hz_indicators': len(hz_indicators),
                'persisted_indicators': len(persisted_indicators),
                'missing_in_persist': list(missing_in_persist)[:5],
                'missing_in_hz': list(missing_in_hz)[:5],
                'time_drift_seconds': time_drift_s,
                'hz_timestamp': hz_time,
                'persist_timestamp': persist_time,
                'last_persist_path': last_path,
            }

        except Exception as e:
            logger.error(f"Integrity check failed: {e}")
            return {'status': 'error', 'reason': str(e)}


# =====================================================================
# STANDALONE TEST
# =====================================================================

if __name__ == "__main__":
    # Test the persistence service
    logging.basicConfig(level=logging.INFO)

    # Create test data
    test_snapshot = {
        'funding_btc': 0.0001,
        'funding_eth': -0.0002,
        'basis': 0.01,
        'spread': 0.001,
        'vix': 20.0,
        '_pushed_at': datetime.now(timezone.utc).isoformat(),
        '_staleness_s': {'funding_btc': 10.0, 'basis': 0.5},
    }

    # Start service
    svc = ExFPersistenceService(flush_interval_s=5)
    svc.start()

    # Update snapshot
    svc.update_snapshot(test_snapshot)

    # Wait for flush
    print("Waiting for flush...")
    time.sleep(6)

    # Check stats
    stats = svc.get_stats()
    print(f"Stats: {json.dumps(stats, indent=2, default=str)}")

    # Check sufficiency
    suff = svc.check_data_sufficiency()
    print(f"Sufficiency: {json.dumps(suff, indent=2)}")

    # Stop
    svc.stop()
    print("Test complete")