Includes core prod + GREEN/BLUE subsystems: - prod/ (BLUE harness, configs, scripts, docs) - nautilus_dolphin/ (GREEN Nautilus-native impl + dvae/ preserved) - adaptive_exit/ (AEM engine + models/bucket_assignments.pkl) - Observability/ (EsoF advisor, TUI, dashboards) - external_factors/ (EsoF producer) - mc_forewarning_qlabs_fork/ (MC regime/envelope) Excludes runtime caches, logs, backups, and reproducible artifacts per .gitignore.
452 lines
16 KiB
Python
Executable File
452 lines
16 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
EXF PERSISTENCE SERVICE v1.0
|
|
============================
|
|
Off-hot-path persistence for External Factors indicators.
|
|
|
|
Design:
|
|
- Background thread, non-blocking to hot path
|
|
- Writes ALL fetched indicators to NPZ (DOLPHIN-compliant format)
|
|
- Separate from Hazelcast push (HZ = instant, Disk = durability)
|
|
- Integrity checksums for validation
|
|
- Automatic cleanup of old files
|
|
|
|
DOLPHIN Format:
|
|
/mnt/ng6_data/eigenvalues/{YYYY-MM-DD}/extf_snapshot_{timestamp}__Indicators.npz
|
|
|
|
Author: DOLPHIN ExF System
|
|
Date: 2026-03-17
|
|
"""
|
|
|
|
import copy
import hashlib
import json
import logging
import threading
import time
from collections import deque
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np
|
|
|
|
# Logger named after this module.
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------
# Root directory; snapshots land in one sub-directory per UTC date.
DATA_DIR = Path("/mnt/ng6_data/eigenvalues")
# Seconds between background flushes (5 minutes, off the hot path).
FLUSH_INTERVAL_S = 5 * 60
# Retention window, in days, used by the periodic cleanup.
MAX_FILE_AGE_DAYS = 7
# Indicator count at which the coverage part of the sufficiency score saturates.
REQUIRED_INDICATORS_MIN = 20
|
|
|
|
|
|
def _new_history() -> deque:
    """Return an empty ring buffer that keeps only the 100 newest entries."""
    return deque(maxlen=100)


@dataclass
class PersistenceStats:
    """Counters and recent-write history kept by the persistence service."""

    # Number of snapshot files written successfully.
    files_written: int = 0
    # Number of write or flush attempts that failed.
    files_failed: int = 0
    # Cumulative on-disk size of the written files, in bytes.
    bytes_written: int = 0
    # time.time() of the most recent successful write (0.0 = none yet).
    last_write_time: float = 0.0
    # Path of the most recently written file, if any.
    last_write_path: Optional[Path] = None
    # Counter reserved for integrity failures.
    integrity_errors: int = 0
    # One record per successful write; oldest entries fall off after 100.
    history: deque = field(default_factory=_new_history)
|
|
|
|
|
|
class ExFPersistenceService:
    """
    Off-hot-path persistence service for External Factors.

    The hot path hands over the newest indicator dict with
    update_snapshot(); a daemon thread writes it to a compressed NPZ file
    every ``flush_interval_s`` seconds, next to a SHA-256 checksum sidecar.

    Usage:
        svc = ExFPersistenceService()
        svc.start()

        # In hot path (exf_fetcher_flow):
        svc.update_snapshot(indicators_dict)

        svc.stop()
    """

    # ACB-critical indicators: their presence makes up half of the score.
    _ACB_CRITICAL = (
        'funding_btc', 'funding_eth', 'dvol_btc', 'dvol_eth',
        'fng', 'vix', 'ls_btc', 'taker', 'oi_btc',
    )
    # Staleness (seconds) above which an indicator counts as stale.
    _STALE_AFTER_S = 60
    # Run the retention cleanup after this many successful loop writes.
    _CLEANUP_EVERY_N_WRITES = 20
    # Files owned by this service inside a date directory (data + sidecars).
    _SNAPSHOT_GLOB = "extf_snapshot_*"
    # Scalar types stored as one-element float64 arrays.
    _NUMERIC_SCALARS = (int, float, np.integer, np.floating)

    def __init__(self, data_dir: Path = DATA_DIR, flush_interval_s: float = FLUSH_INTERVAL_S):
        """
        Args:
            data_dir: Root directory; one sub-directory per UTC date is
                created below it. Created immediately if missing.
            flush_interval_s: Seconds between background flushes.
        """
        self.data_dir = Path(data_dir)
        self.flush_interval_s = flush_interval_s
        self._stats = PersistenceStats()
        self._latest_snapshot: Optional[Dict[str, Any]] = None
        # Guards both _latest_snapshot and _stats.
        self._snapshot_lock = threading.Lock()
        self._running = False
        self._thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()

        # Ensure data directory exists
        self.data_dir.mkdir(parents=True, exist_ok=True)

    # ----- Public API (thread-safe, non-blocking) -----

    def update_snapshot(self, indicators: Dict[str, Any]) -> None:
        """
        Replace the latest snapshot (called from the hot path).

        A deep copy is stored, so later mutation of ``indicators`` or of
        nested containers such as ``_staleness_s`` cannot change what the
        background thread serializes. No disk I/O happens here.
        """
        # Copy outside the lock; the critical section is one assignment.
        snapshot = copy.deepcopy(dict(indicators))
        with self._snapshot_lock:
            self._latest_snapshot = snapshot

    def get_stats(self) -> Dict[str, Any]:
        """Return a point-in-time dict of the persistence counters."""
        with self._snapshot_lock:
            stats = self._stats
            snapshot = self._latest_snapshot
            return {
                'files_written': stats.files_written,
                'files_failed': stats.files_failed,
                'bytes_written': stats.bytes_written,
                'last_write_time': stats.last_write_time,
                'last_write_path': str(stats.last_write_path) if stats.last_write_path else None,
                'integrity_errors': stats.integrity_errors,
                'has_snapshot': snapshot is not None,
                'snapshot_keys': len(snapshot) if snapshot else 0,
            }

    def check_data_sufficiency(self) -> Dict[str, Any]:
        """
        Check if the latest snapshot is sufficient for ACB and alpha engine.

        Returns:
            Dict with ``sufficient`` (score > 0.7), ``score`` and the
            components behind it, or ``{'sufficient': False, 'score': 0.0,
            'reason': 'no_snapshot'}`` when nothing has been received.
        """
        # Stored snapshots are replaced, never mutated, so a reference is safe.
        with self._snapshot_lock:
            snapshot = self._latest_snapshot
        return self._score_snapshot(snapshot)

    def _score_snapshot(self, snapshot: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        """Score one snapshot: 50% ACB coverage, 30% total coverage, 20% freshness."""
        if not snapshot:
            return {'sufficient': False, 'score': 0.0, 'reason': 'no_snapshot'}

        acb_critical = self._ACB_CRITICAL
        acb_available = sum(1 for k in acb_critical if k in snapshot)
        total_available = sum(1 for k in snapshot if not k.startswith('_'))

        # A missing or malformed staleness map is treated as "nothing stale".
        staleness = snapshot.get('_staleness_s')
        if not isinstance(staleness, dict):
            staleness = {}
        stale_count = sum(
            1 for v in staleness.values()
            if isinstance(v, (int, float)) and v > self._STALE_AFTER_S
        )

        acb_score = acb_available / len(acb_critical)
        coverage_score = min(1.0, total_available / REQUIRED_INDICATORS_MIN)
        freshness_score = 1.0 - (stale_count / max(1, len(staleness)))

        overall_score = (acb_score * 0.5) + (coverage_score * 0.3) + (freshness_score * 0.2)

        return {
            'sufficient': overall_score > 0.7,
            'score': round(overall_score, 3),
            'acb_critical': f"{acb_available}/{len(acb_critical)}",
            'total_indicators': total_available,
            'stale_indicators': stale_count,
            'freshness': round(freshness_score, 3),
        }

    # ----- Background Operations -----

    def _compute_checksum(self, data: bytes) -> str:
        """Return the SHA-256 hex digest of ``data`` (matches the .sha256 sidecar)."""
        return hashlib.sha256(data).hexdigest()

    def _record_failure(self) -> None:
        """Count one failed write/flush under the stats lock."""
        with self._snapshot_lock:
            self._stats.files_failed += 1

    def _split_snapshot(self, snapshot: Dict[str, Any], now: datetime):
        """
        Split a snapshot into (numeric arrays, JSON-able metadata).

        - keys starting with '_' go to metadata (dicts as JSON strings)
        - int/float/NumPy scalars become one-element float64 arrays;
          NaN scalars are skipped and not written at all
        - non-empty lists/tuples become float64 arrays when convertible,
          otherwise a JSON string in metadata
        - everything else is stored in metadata as ``str(value)``
        """
        numeric_data: Dict[str, np.ndarray] = {}
        metadata: Dict[str, Any] = {
            '_timestamp_utc': now.isoformat(),
            '_version': '1.0',
            '_service': 'ExFPersistence',
        }

        for key, value in snapshot.items():
            if key.startswith('_'):
                # Metadata/structural keys
                if isinstance(value, dict):
                    metadata[key] = json.dumps(value, default=str)
                else:
                    metadata[key] = value
            elif isinstance(value, self._NUMERIC_SCALARS):
                if value == value:  # Not NaN
                    numeric_data[key] = np.array([value], dtype=np.float64)
            elif isinstance(value, (list, tuple)) and len(value) > 0:
                try:
                    numeric_data[key] = np.array(value, dtype=np.float64)
                except (ValueError, TypeError):
                    metadata[key] = json.dumps(value, default=str)
            else:
                metadata[key] = str(value)

        return numeric_data, metadata

    def _write_npz(self, snapshot: Dict[str, Any]) -> Optional[Path]:
        """
        Write snapshot to NPZ file in DOLPHIN-compliant format.

        Layout: ``<data_dir>/<YYYY-MM-DD>/extf_snapshot_<ts>__Indicators.npz``
        plus ``<same name>.sha256`` holding "<sha256>  <filename>".

        Returns path on success, None on failure (failure is logged and
        counted in ``files_failed``; nothing is raised).
        """
        try:
            now = datetime.now(timezone.utc)
            date_str = now.strftime('%Y-%m-%d')
            timestamp = now.strftime('%Y-%m-%d_%H-%M-%S')

            # Create date directory
            date_dir = self.data_dir / date_str
            date_dir.mkdir(parents=True, exist_ok=True)

            filename = f"extf_snapshot_{timestamp}__Indicators.npz"
            filepath = date_dir / filename

            numeric_data, metadata = self._split_snapshot(snapshot, now)

            # default=str keeps one odd metadata value from aborting the write.
            np.savez_compressed(
                filepath,
                _metadata=json.dumps(metadata, default=str),
                **numeric_data
            )

            # Checksum of the bytes on disk, so readers can verify the file.
            file_bytes = filepath.read_bytes()
            checksum = self._compute_checksum(file_bytes)

            # Two spaces: the line format `sha256sum -c` accepts.
            checksum_path = filepath.with_suffix('.npz.sha256')
            checksum_path.write_text(f"{checksum}  {filename}\n")

            logger.info(f"Wrote ExF snapshot: {filepath.name} ({len(numeric_data)} indicators, {len(file_bytes)} bytes)")

            return filepath

        except Exception as e:
            # Top-level boundary of the write: log with traceback, never raise.
            logger.exception(f"Failed to write ExF snapshot: {e}")
            self._record_failure()
            return None

    def _cleanup_old_files(self) -> int:
        """
        Remove snapshot files older than MAX_FILE_AGE_DAYS.

        Only files matching ``extf_snapshot_*`` are deleted, judged by their
        own mtime; anything else in a date directory is left alone. A date
        directory is removed only when this pass emptied it.

        Returns:
            Number of files removed.
        """
        removed = 0
        cutoff = time.time() - (MAX_FILE_AGE_DAYS * 24 * 3600)

        try:
            date_dirs = [d for d in self.data_dir.iterdir() if d.is_dir()]
        except OSError as e:
            logger.warning(f"Cleanup error: {e}")
            return removed

        for date_dir in date_dirs:
            try:
                removed_here = 0
                for path in date_dir.glob(self._SNAPSHOT_GLOB):
                    if path.is_file() and path.stat().st_mtime < cutoff:
                        path.unlink()
                        removed_here += 1
                removed += removed_here

                if removed_here and not any(date_dir.iterdir()):
                    date_dir.rmdir()
                    logger.info(f"Cleaned up old ExF directory: {date_dir.name}")
            except OSError as e:
                logger.warning(f"Failed to cleanup {date_dir}: {e}")

        return removed

    def _flush_once(self) -> Optional[Path]:
        """
        Write the current snapshot once and record the result in the stats.

        Returns the written path, or None when there is no snapshot or the
        write failed. Must not be called while holding ``_snapshot_lock``.
        """
        with self._snapshot_lock:
            snapshot = self._latest_snapshot

        if not snapshot:
            return None

        filepath = self._write_npz(snapshot)
        if filepath is None:
            return None

        try:
            file_size = filepath.stat().st_size
        except OSError:
            # File vanished between write and stat; still count the write.
            file_size = 0

        # Score the snapshot that was written, not whatever arrived since.
        sufficiency = self._score_snapshot(snapshot)
        entry = {
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'path': str(filepath),
            'size': file_size,
            'indicators': sum(1 for k in snapshot if not k.startswith('_')),
            'sufficient': sufficiency['sufficient'],
            'score': sufficiency['score'],
        }

        with self._snapshot_lock:
            self._stats.files_written += 1
            self._stats.bytes_written += file_size
            self._stats.last_write_time = time.time()
            self._stats.last_write_path = filepath
            self._stats.history.append(entry)

        return filepath

    def _flush_loop(self) -> None:
        """Background thread: periodically flush to disk."""
        logger.info(f"ExF Persistence loop started (interval={self.flush_interval_s}s)")

        writes_since_cleanup = 0
        while not self._stop_event.is_set():
            try:
                if self._flush_once() is not None:
                    writes_since_cleanup += 1

                # Periodic cleanup (every ~20 flushes = ~100 min). Counting
                # only successful writes keeps it from running every tick
                # while there is no snapshot yet.
                if writes_since_cleanup >= self._CLEANUP_EVERY_N_WRITES:
                    writes_since_cleanup = 0
                    self._cleanup_old_files()

            except Exception as e:
                logger.error(f"Flush loop error: {e}")
                self._record_failure()

            # Wait for next flush; returns early when stop() sets the event.
            self._stop_event.wait(timeout=self.flush_interval_s)

        logger.info("ExF Persistence loop stopped")

    # ----- Lifecycle -----

    def start(self) -> None:
        """Start the background flush thread (no-op if already running)."""
        if self._running:
            return

        self._running = True
        self._stop_event.clear()
        self._thread = threading.Thread(target=self._flush_loop, daemon=True)
        self._thread.start()
        logger.info(f"ExFPersistenceService started (data_dir={self.data_dir})")

    def stop(self) -> None:
        """Signal the flush thread to stop and wait up to 10 s for it."""
        if not self._running:
            return

        self._running = False
        self._stop_event.set()

        if self._thread:
            self._thread.join(timeout=10)
            if self._thread.is_alive():
                logger.warning("ExF Persistence thread did not stop within 10s")

        logger.info("ExFPersistenceService stopped")

    def force_flush(self) -> Optional[Path]:
        """
        Force an immediate flush (for testing/debugging).

        Runs in the caller's thread and updates the same statistics as a
        background flush, so ``get_stats()['last_write_path']`` reflects it.

        Returns:
            Path of the written file, or None if there is no snapshot or
            the write failed.
        """
        return self._flush_once()
|
|
|
|
|
|
# =====================================================================
|
|
# INTEGRITY CHECKER
|
|
# =====================================================================
|
|
|
|
class ExFIntegrityChecker:
    """
    Verifies integrity between Hazelcast cache and disk persistence.

    Compares the indicator keys and timestamps of the ``exf_latest`` entry
    in the ``DOLPHIN_FEATURES`` map with the last file the persistence
    service reports having written. Can be run periodically to detect data
    divergence.
    """

    # Timestamp gap (seconds) beyond which HZ and disk count as diverged.
    _MAX_DRIFT_S = 600

    def __init__(self, hz_client, persistence_service: "ExFPersistenceService"):
        """
        Args:
            hz_client: Object whose ``get_map(name).get(key)`` returns a
                future-like object with ``result()``.
            persistence_service: Source of ``get_stats()['last_write_path']``.
        """
        self.hz_client = hz_client
        self.persistence = persistence_service

    @staticmethod
    def _is_nan(value: Any) -> bool:
        """True for a float NaN, which the writer does not persist."""
        return isinstance(value, float) and value != value

    def check_integrity(self) -> Dict[str, Any]:
        """
        Compare HZ data with last persisted file.

        Returns:
            ``{'status': 'ok' | 'mismatch', ...details}`` on a completed
            comparison, or ``{'status': 'error', 'reason': ...}`` when either
            side is unavailable or an unexpected error occurs (never raises).
        """
        try:
            # Get HZ data
            features_map = self.hz_client.get_map("DOLPHIN_FEATURES")
            hz_data_raw = features_map.get("exf_latest").result()

            if not hz_data_raw:
                return {'status': 'error', 'reason': 'hz_data_missing'}

            if isinstance(hz_data_raw, (str, bytes, bytearray)):
                hz_data = json.loads(hz_data_raw)
            else:
                hz_data = hz_data_raw

            # Get last persisted file
            stats = self.persistence.get_stats()
            last_path = stats.get('last_write_path')

            if not last_path:
                return {'status': 'error', 'reason': 'no_persisted_file'}

            # _metadata is a plain unicode array, so pickle is not needed;
            # keeping it off prevents a tampered file from executing code.
            with np.load(last_path, allow_pickle=False) as npz:
                metadata = json.loads(npz['_metadata'].item())
                persisted_indicators = {k for k in npz.files if not k.startswith('_')}

            # Non-numeric indicators are persisted as strings in _metadata.
            persisted_indicators.update(k for k in metadata if not k.startswith('_'))

            # Compare
            hz_indicators = {k for k in hz_data if not k.startswith('_')}
            # NaN scalars are skipped by the writer, so their absence on
            # disk is expected rather than a divergence.
            hz_expected = {k for k in hz_indicators if not self._is_nan(hz_data[k])}

            missing_in_persist = hz_expected - persisted_indicators
            missing_in_hz = persisted_indicators - hz_indicators

            # Check timestamp drift
            hz_time = hz_data.get('_pushed_at', '')
            persist_time = metadata.get('_timestamp_utc', '')

            time_drift_s = None
            if hz_time and persist_time:
                try:
                    hz_dt = datetime.fromisoformat(hz_time.replace('Z', '+00:00'))
                    p_dt = datetime.fromisoformat(persist_time.replace('Z', '+00:00'))
                    time_drift_s = abs((hz_dt - p_dt).total_seconds())
                except (ValueError, TypeError, AttributeError) as e:
                    # Unparseable, non-string, or naive-vs-aware timestamps:
                    # drift stays unknown instead of failing the whole check.
                    logger.debug(f"Timestamp drift not computable: {e}")

            integrity_ok = not missing_in_persist and not missing_in_hz
            if time_drift_s is not None and time_drift_s > self._MAX_DRIFT_S:  # >10 min drift
                integrity_ok = False

            return {
                'status': 'ok' if integrity_ok else 'mismatch',
                'hz_indicators': len(hz_indicators),
                'persisted_indicators': len(persisted_indicators),
                # Sorted so the 5-item sample is stable between runs.
                'missing_in_persist': sorted(missing_in_persist)[:5],
                'missing_in_hz': sorted(missing_in_hz)[:5],
                'time_drift_seconds': time_drift_s,
                'hz_timestamp': hz_time,
                'persist_timestamp': persist_time,
                'last_persist_path': last_path,
            }

        except Exception as e:
            logger.error(f"Integrity check failed: {e}")
            return {'status': 'error', 'reason': str(e)}
|
|
|
|
|
|
# =====================================================================
|
|
# STANDALONE TEST
|
|
# =====================================================================
|
|
|
|
if __name__ == "__main__":
    # Standalone smoke test of the persistence service.
    import tempfile

    logging.basicConfig(level=logging.INFO)

    # Synthetic indicator values; they must never reach the real data dir.
    test_snapshot = {
        'funding_btc': 0.0001,
        'funding_eth': -0.0002,
        'basis': 0.01,
        'spread': 0.001,
        'vix': 20.0,
        '_pushed_at': datetime.now(timezone.utc).isoformat(),
        '_staleness_s': {'funding_btc': 10.0, 'basis': 0.5},
    }

    # Write into a throwaway directory instead of the production DATA_DIR.
    with tempfile.TemporaryDirectory(prefix="exf_persistence_test_") as tmp_dir:
        svc = ExFPersistenceService(data_dir=Path(tmp_dir), flush_interval_s=5)
        svc.start()
        try:
            # Update snapshot
            svc.update_snapshot(test_snapshot)

            # Wait for flush (interval is 5 s, so 6 s covers one cycle)
            print("Waiting for flush...")
            time.sleep(6)

            # Check stats
            stats = svc.get_stats()
            print(f"Stats: {json.dumps(stats, indent=2, default=str)}")

            # Check sufficiency
            suff = svc.check_data_sufficiency()
            print(f"Sufficiency: {json.dumps(suff, indent=2)}")
        finally:
            # Stop the thread before the directory is removed, even on error.
            svc.stop()

    print("Test complete")
|