initial: import DOLPHIN baseline 2026-04-21 from dolphinng5_predict working tree
Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
This commit is contained in:
451
prod/exf_persistence.py
Executable file
451
prod/exf_persistence.py
Executable file
@@ -0,0 +1,451 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
EXF PERSISTENCE SERVICE v1.0
|
||||
============================
|
||||
Off-hot-path persistence for External Factors indicators.
|
||||
|
||||
Design:
|
||||
- Background thread, non-blocking to hot path
|
||||
- Writes ALL fetched indicators to NPZ (DOLPHIN-compliant format)
|
||||
- Separate from Hazelcast push (HZ = instant, Disk = durability)
|
||||
- Integrity checksums for validation
|
||||
- Automatic cleanup of old files
|
||||
|
||||
DOLPHIN Format:
|
||||
/mnt/ng6_data/eigenvalues/{YYYY-MM-DD}/extf_snapshot_{timestamp}__Indicators.npz
|
||||
|
||||
Author: DOLPHIN ExF System
|
||||
Date: 2026-03-17
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
import hashlib
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, Any, Optional, List
|
||||
from dataclasses import dataclass, field
|
||||
from collections import deque
|
||||
|
||||
logger = logging.getLogger(__name__)

# Configuration
DATA_DIR = Path("/mnt/ng6_data/eigenvalues")  # DOLPHIN eigenvalues root (date subdirs created below)
FLUSH_INTERVAL_S = 300  # 5 minutes (off hot path)
MAX_FILE_AGE_DAYS = 7  # Keep 7 days of history
REQUIRED_INDICATORS_MIN = 20  # Minimum indicators for data sufficiency
|
||||
|
||||
|
||||
@dataclass
class PersistenceStats:
    """Mutable counters used to monitor the persistence service."""
    files_written: int = 0        # successful NPZ writes
    files_failed: int = 0         # write attempts that raised
    bytes_written: int = 0        # cumulative size of all written files
    last_write_time: float = 0.0  # time.time() of most recent successful write
    last_write_path: Optional[Path] = None  # path of most recent successful write
    integrity_errors: int = 0     # reserved for integrity-check failures
    # Rolling log of the last 100 flush results (dicts: timestamp/path/size/...).
    history: deque = field(default_factory=lambda: deque(maxlen=100))
|
||||
|
||||
|
||||
class ExFPersistenceService:
    """
    Off-hot-path persistence service for External Factors.

    A daemon thread periodically writes the most recent indicator snapshot
    to a compressed NPZ under ``data_dir`` (DOLPHIN layout:
    ``{data_dir}/{YYYY-MM-DD}/extf_snapshot_{ts}__Indicators.npz``), writes
    a sha256sum-compatible ``.sha256`` companion file, and prunes date
    directories older than MAX_FILE_AGE_DAYS.

    Usage:
        svc = ExFPersistenceService()
        svc.start()

        # In hot path (exf_fetcher_flow):
        svc.update_snapshot(indicators_dict)

        svc.stop()
    """

    def __init__(self, data_dir: Path = DATA_DIR, flush_interval_s: float = FLUSH_INTERVAL_S):
        """
        Args:
            data_dir: Root directory for NPZ snapshots (created if missing).
            flush_interval_s: Seconds between background flushes.
        """
        self.data_dir = Path(data_dir)
        self.flush_interval_s = flush_interval_s
        self._stats = PersistenceStats()
        self._latest_snapshot: Optional[Dict[str, Any]] = None
        self._snapshot_lock = threading.Lock()
        self._running = False
        self._thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()

        # Ensure data directory exists
        self.data_dir.mkdir(parents=True, exist_ok=True)

    # ----- Public API (thread-safe, non-blocking) -----

    def update_snapshot(self, indicators: Dict[str, Any]) -> None:
        """
        Update the latest snapshot (called from hot path).

        Non-blocking: takes a *shallow* copy under the lock so later
        top-level mutation of the caller's dict cannot affect the pending
        flush. NOTE: nested values (e.g. the '_staleness_s' dict) are
        shared, not deep-copied — callers must not mutate them in place.
        """
        with self._snapshot_lock:
            self._latest_snapshot = dict(indicators)

    def get_stats(self) -> Dict[str, Any]:
        """Return a point-in-time dict of persistence statistics.

        NOTE(review): stats counters are mutated by the flush thread
        without holding _snapshot_lock; reads here may lag by one update.
        """
        with self._snapshot_lock:
            return {
                'files_written': self._stats.files_written,
                'files_failed': self._stats.files_failed,
                'bytes_written': self._stats.bytes_written,
                'last_write_time': self._stats.last_write_time,
                'last_write_path': str(self._stats.last_write_path) if self._stats.last_write_path else None,
                'integrity_errors': self._stats.integrity_errors,
                'has_snapshot': self._latest_snapshot is not None,
                'snapshot_keys': len(self._latest_snapshot) if self._latest_snapshot else 0,
            }

    def check_data_sufficiency(self) -> Dict[str, Any]:
        """
        Check if we have sufficient data for ACB and alpha engine.

        Returns a dict with 'sufficient' (overall score > 0.7), the
        weighted 'score' (ACB coverage 50%, total coverage 30%,
        freshness 20%), and per-component details.
        """
        with self._snapshot_lock:
            if not self._latest_snapshot:
                return {'sufficient': False, 'score': 0.0, 'reason': 'no_snapshot'}

            snapshot = self._latest_snapshot

        # ACB-critical indicators (must have fresh data)
        acb_critical = [
            'funding_btc', 'funding_eth', 'dvol_btc', 'dvol_eth',
            'fng', 'vix', 'ls_btc', 'taker', 'oi_btc'
        ]

        # Count available and fresh indicators ('_'-prefixed keys are metadata)
        acb_available = sum(1 for k in acb_critical if k in snapshot)
        total_available = sum(1 for k in snapshot.keys() if not k.startswith('_'))

        # Check staleness (per-indicator age in seconds; >60s counts as stale)
        staleness = snapshot.get('_staleness_s', {})
        stale_count = sum(1 for k, v in staleness.items()
                          if isinstance(v, (int, float)) and v > 60)

        # Calculate sufficiency score components, each in [0, 1]
        acb_score = acb_available / len(acb_critical)
        coverage_score = min(1.0, total_available / REQUIRED_INDICATORS_MIN)
        freshness_score = 1.0 - (stale_count / max(1, len(staleness)))

        overall_score = (acb_score * 0.5) + (coverage_score * 0.3) + (freshness_score * 0.2)

        return {
            'sufficient': overall_score > 0.7,
            'score': round(overall_score, 3),
            'acb_critical': f"{acb_available}/{len(acb_critical)}",
            'total_indicators': total_available,
            'stale_indicators': stale_count,
            'freshness': round(freshness_score, 3),
        }

    # ----- Background Operations -----

    def _compute_checksum(self, data: bytes) -> str:
        """Compute SHA-256 checksum for integrity verification.

        Uses SHA-256 to match the '.sha256' companion file name
        (previously MD5, which contradicted the file extension).
        """
        return hashlib.sha256(data).hexdigest()

    def _write_npz(self, snapshot: Dict[str, Any]) -> Optional[Path]:
        """
        Write snapshot to NPZ file in DOLPHIN-compliant format.

        Numeric scalars become 1-element float64 arrays (NaNs skipped),
        numeric sequences become float64 arrays; '_'-prefixed and
        non-numeric values are folded into a JSON '_metadata' entry.
        A sha256sum-compatible checksum file is written alongside.

        Returns path on success, None on failure (failure is counted
        in stats and logged, never raised).
        """
        try:
            now = datetime.now(timezone.utc)
            date_str = now.strftime('%Y-%m-%d')
            timestamp = now.strftime('%Y-%m-%d_%H-%M-%S')

            # Create date directory
            date_dir = self.data_dir / date_str
            date_dir.mkdir(parents=True, exist_ok=True)

            # Prepare filename
            filename = f"extf_snapshot_{timestamp}__Indicators.npz"
            filepath = date_dir / filename

            # Separate numeric data from metadata
            numeric_data: Dict[str, np.ndarray] = {}
            metadata: Dict[str, Any] = {
                '_timestamp_utc': now.isoformat(),
                '_version': '1.0',
                '_service': 'ExFPersistence',
            }

            for key, value in snapshot.items():
                if key.startswith('_'):
                    # Metadata/structural keys (e.g. '_staleness_s' dict)
                    if isinstance(value, dict):
                        metadata[key] = json.dumps(value)
                    else:
                        metadata[key] = value
                elif isinstance(value, (int, float)):
                    if value == value:  # NaN != NaN, so this skips NaNs
                        numeric_data[key] = np.array([value], dtype=np.float64)
                elif isinstance(value, (list, tuple)) and len(value) > 0:
                    try:
                        numeric_data[key] = np.array(value, dtype=np.float64)
                    except (ValueError, TypeError):
                        # Non-numeric sequence: keep as JSON metadata instead
                        metadata[key] = json.dumps(value)
                else:
                    metadata[key] = str(value)

            # Write NPZ with compression; metadata rides along as one JSON blob
            np.savez_compressed(
                filepath,
                _metadata=json.dumps(metadata),
                **numeric_data
            )

            # Checksum the bytes actually on disk (verifies the write landed)
            file_bytes = filepath.read_bytes()
            checksum = self._compute_checksum(file_bytes)

            # Companion file in `sha256sum -c` format: "<digest>  <filename>"
            # (original wrote an "(unknown)" placeholder instead of the name)
            checksum_path = filepath.with_suffix('.npz.sha256')
            checksum_path.write_text(f"{checksum}  {filepath.name}\n")

            logger.info(f"Wrote ExF snapshot: {filepath.name} ({len(numeric_data)} indicators, {len(file_bytes)} bytes)")

            return filepath

        except Exception as e:
            logger.error(f"Failed to write ExF snapshot: {e}")
            self._stats.files_failed += 1
            return None

    def _cleanup_old_files(self) -> int:
        """Remove date directories older than MAX_FILE_AGE_DAYS. Returns count removed."""
        import shutil  # local import kept function-scoped; hoisted out of the loop

        removed = 0
        cutoff = time.time() - (MAX_FILE_AGE_DAYS * 24 * 3600)

        try:
            for date_dir in self.data_dir.iterdir():
                if not date_dir.is_dir():
                    continue

                # Age by directory mtime (refreshed whenever a file is added)
                try:
                    dir_time = date_dir.stat().st_mtime
                    if dir_time < cutoff:
                        shutil.rmtree(date_dir)
                        removed += 1
                        logger.info(f"Cleaned up old ExF directory: {date_dir.name}")
                except Exception as e:
                    # Best-effort per-directory; keep scanning the rest
                    logger.warning(f"Failed to cleanup {date_dir}: {e}")

        except Exception as e:
            logger.warning(f"Cleanup error: {e}")

        return removed

    def _flush_loop(self) -> None:
        """Background thread: periodically flush the latest snapshot to disk."""
        logger.info(f"ExF Persistence loop started (interval={self.flush_interval_s}s)")

        while not self._stop_event.is_set():
            try:
                # Grab the current snapshot under the lock, release before I/O
                with self._snapshot_lock:
                    snapshot = self._latest_snapshot.copy() if self._latest_snapshot else None

                if snapshot:
                    filepath = self._write_npz(snapshot)

                    if filepath:
                        file_size = filepath.stat().st_size
                        self._stats.files_written += 1
                        self._stats.bytes_written += file_size
                        self._stats.last_write_time = time.time()
                        self._stats.last_write_path = filepath

                        # Record history entry
                        sufficiency = self.check_data_sufficiency()
                        self._stats.history.append({
                            'timestamp': datetime.now(timezone.utc).isoformat(),
                            'path': str(filepath),
                            'size': file_size,
                            'indicators': len([k for k in snapshot.keys() if not k.startswith('_')]),
                            'sufficient': sufficiency['sufficient'],
                            'score': sufficiency['score'],
                        })

                # Periodic cleanup (every ~20 successful flushes ≈ 100 min).
                # The > 0 guard prevents the old bug where 0 % 20 == 0 ran
                # cleanup on every iteration before the first successful write.
                if self._stats.files_written > 0 and self._stats.files_written % 20 == 0:
                    self._cleanup_old_files()

            except Exception as e:
                logger.error(f"Flush loop error: {e}")
                self._stats.files_failed += 1

            # Wait for next flush (wakes early if stop() is called)
            self._stop_event.wait(timeout=self.flush_interval_s)

        logger.info("ExF Persistence loop stopped")

    # ----- Lifecycle -----

    def start(self) -> None:
        """Start the persistence service (idempotent)."""
        if self._running:
            return

        self._running = True
        self._stop_event.clear()
        # Daemon thread: never blocks interpreter shutdown
        self._thread = threading.Thread(target=self._flush_loop, daemon=True)
        self._thread.start()
        logger.info(f"ExFPersistenceService started (data_dir={self.data_dir})")

    def stop(self) -> None:
        """Stop the persistence service (idempotent; joins up to 10s)."""
        if not self._running:
            return

        self._running = False
        self._stop_event.set()

        if self._thread:
            self._thread.join(timeout=10)

        logger.info("ExFPersistenceService stopped")

    def force_flush(self) -> Optional[Path]:
        """Force an immediate synchronous flush (for testing/debugging)."""
        with self._snapshot_lock:
            snapshot = self._latest_snapshot.copy() if self._latest_snapshot else None

        if snapshot:
            return self._write_npz(snapshot)
        return None
|
||||
|
||||
|
||||
# =====================================================================
|
||||
# INTEGRITY CHECKER
|
||||
# =====================================================================
|
||||
|
||||
class ExFIntegrityChecker:
    """
    Verifies integrity between Hazelcast cache and disk persistence.

    Compares the indicator key set in the HZ 'DOLPHIN_FEATURES' map entry
    'exf_latest' against the keys in the most recently persisted NPZ, and
    measures timestamp drift between the two. Can be run periodically to
    detect data divergence.
    """

    def __init__(self, hz_client, persistence_service: ExFPersistenceService):
        """
        Args:
            hz_client: Hazelcast client exposing get_map(); map.get() is
                assumed to return a future with .result() — TODO confirm
                against the client version in use.
            persistence_service: The running ExFPersistenceService.
        """
        self.hz_client = hz_client
        self.persistence = persistence_service

    def check_integrity(self) -> Dict[str, Any]:
        """
        Compare HZ data with the last persisted file.

        Returns a report dict: status 'ok' | 'mismatch' | 'error', key-set
        diffs (truncated to 5 each), and timestamp drift in seconds.
        'mismatch' is reported for any key difference or >10 min drift.
        """
        try:
            # Get HZ data (async client: .result() blocks on the future)
            features_map = self.hz_client.get_map("DOLPHIN_FEATURES")
            hz_data_raw = features_map.get("exf_latest").result()

            if not hz_data_raw:
                return {'status': 'error', 'reason': 'hz_data_missing'}

            hz_data = json.loads(hz_data_raw) if isinstance(hz_data_raw, str) else hz_data_raw

            # Get last persisted file
            stats = self.persistence.get_stats()
            last_path = stats.get('last_write_path')

            if not last_path:
                return {'status': 'error', 'reason': 'no_persisted_file'}

            # Load persisted data ('_metadata' holds the JSON metadata blob)
            with np.load(last_path, allow_pickle=True) as npz:
                metadata = json.loads(str(npz['_metadata']))
                persisted_indicators = set(k for k in npz.keys() if not k.startswith('_'))

            # Compare key sets ('_'-prefixed keys are structural, not indicators)
            hz_indicators = set(k for k in hz_data.keys() if not k.startswith('_'))

            missing_in_persist = hz_indicators - persisted_indicators
            missing_in_hz = persisted_indicators - hz_indicators

            # Check timestamp drift
            hz_time = hz_data.get('_pushed_at', '')
            persist_time = metadata.get('_timestamp_utc', '')

            time_drift_s = None
            if hz_time and persist_time:
                try:
                    # Normalize 'Z' suffix for fromisoformat (pre-3.11 compat)
                    hz_dt = datetime.fromisoformat(hz_time.replace('Z', '+00:00'))
                    p_dt = datetime.fromisoformat(persist_time.replace('Z', '+00:00'))
                    time_drift_s = abs((hz_dt - p_dt).total_seconds())
                except (ValueError, TypeError):
                    # Unparseable timestamp — drift stays unknown (was a bare
                    # except that silently swallowed everything)
                    pass

            integrity_ok = len(missing_in_persist) == 0 and len(missing_in_hz) == 0
            if time_drift_s is not None and time_drift_s > 600:  # >10 min drift
                integrity_ok = False

            return {
                'status': 'ok' if integrity_ok else 'mismatch',
                'hz_indicators': len(hz_indicators),
                'persisted_indicators': len(persisted_indicators),
                'missing_in_persist': list(missing_in_persist)[:5],
                'missing_in_hz': list(missing_in_hz)[:5],
                'time_drift_seconds': time_drift_s,
                'hz_timestamp': hz_time,
                'persist_timestamp': persist_time,
                'last_persist_path': last_path,
            }

        except Exception as e:
            logger.error(f"Integrity check failed: {e}")
            return {'status': 'error', 'reason': str(e)}
|
||||
|
||||
|
||||
# =====================================================================
|
||||
# STANDALONE TEST
|
||||
# =====================================================================
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test: feed one synthetic snapshot through a short-interval
    # service instance and print the resulting stats and sufficiency.
    logging.basicConfig(level=logging.INFO)

    # Synthetic indicator payload with both data and metadata keys
    sample = {
        'funding_btc': 0.0001,
        'funding_eth': -0.0002,
        'basis': 0.01,
        'spread': 0.001,
        'vix': 20.0,
        '_pushed_at': datetime.now(timezone.utc).isoformat(),
        '_staleness_s': {'funding_btc': 10.0, 'basis': 0.5},
    }

    service = ExFPersistenceService(flush_interval_s=5)
    service.start()
    service.update_snapshot(sample)

    # Give the background loop one cycle to write the NPZ
    print("Waiting for flush...")
    time.sleep(6)

    print(f"Stats: {json.dumps(service.get_stats(), indent=2, default=str)}")
    print(f"Sufficiency: {json.dumps(service.check_data_sufficiency(), indent=2)}")

    service.stop()
    print("Test complete")
|
||||
Reference in New Issue
Block a user