From 1c24a12bb25661c77568e1992b7ccdff31e8f039 Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 10:57:32 +0000 Subject: [PATCH 1/8] feat: add challenges directory structure and workspace configuration --- Cargo.toml | 15 +++++++++++++++ challenges/.gitkeep | 1 + challenges/README.md | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) create mode 100644 challenges/.gitkeep create mode 100644 challenges/README.md diff --git a/Cargo.toml b/Cargo.toml index 147a9dd..c2e5098 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,10 @@ members = [ # Note: WASM runtime removed - updates via git, version checked at handshake # Note: P2P-only architecture - no centralized platform-server +# Challenge crates can be added here or as optional path/git dependencies +# Example: +# "challenges/example-challenge", + [workspace.package] version = "0.1.0" edition = "2021" @@ -86,6 +90,9 @@ reqwest = { version = "0.12", features = ["json"] } [patch.crates-io] w3f-bls = { git = "https://github.com/opentensor/bls", branch = "fix-no-std" } +# Challenge integration (optional - add to crates that need dynamic loading) +libloading = "0.8" + # Clippy lints configuration [workspace.lints.clippy] # Allow these patterns that are intentional in this codebase @@ -95,3 +102,11 @@ type_complexity = "allow" await_holding_lock = "warn" # TODO: Fix async lock issues properly collapsible_match = "allow" collapsible_if = "allow" + +# Workspace-level feature flags for challenge integration +# Individual crates can enable these by adding features in their Cargo.toml: +# [features] +# dynamic-challenges = ["libloading"] +[workspace.metadata.challenge-features] +# Enable dynamic challenge loading (crates opt-in via features) +dynamic-loading-available = true diff --git a/challenges/.gitkeep b/challenges/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/challenges/.gitkeep @@ -0,0 +1 @@ + diff --git a/challenges/README.md b/challenges/README.md new file mode 100644 index 0000000..6cecfe1 --- /dev/null +++ b/challenges/README.md @@ -0,0 +1,37 @@ +# Platform Challenge Crates + +This directory contains challenge crates that can be integrated with the Platform validator network. + +## Directory Structure + +``` +challenges/ +├── README.md # This file +├── example-challenge/ # Example challenge template (future) +└── [your-challenge]/ # Your custom challenge crate +``` + +## Adding a New Challenge Crate + +1. Create your challenge crate in this directory or reference it as a git dependency +2. Implement the `Challenge` trait from `platform-challenge-sdk` +3. Register your challenge in the challenge registry +4. Update the workspace `Cargo.toml` if adding locally + +## External Challenge Crates + +Challenge crates can also be external (like term-challenge). They should: +- Import `platform-challenge-sdk` as a dependency +- Implement the `ServerChallenge` trait +- Provide Docker configuration for evaluation + +## Challenge Crate Requirements + +- Must implement `platform-challenge-sdk::ServerChallenge` +- Must provide `/evaluate` HTTP endpoint +- Must handle graceful shutdown signals +- Must support state persistence for hot-reload + +## Example + +See [term-challenge](https://github.com/PlatformNetwork/term-challenge) for a complete example. 
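The README above names `platform-challenge-sdk::ServerChallenge`, but the trait itself is not part of this patch, so the following is only a rough sketch of what a minimal challenge crate might implement. The trait shape, the `EvaluationRequest`/`EvaluationResult` types, and the `evaluate`/`shutdown`/`snapshot_state` method names are assumptions derived from the requirement list (an `/evaluate` endpoint, graceful shutdown handling, and state persistence for hot-reload), not the SDK's real API.

```rust
// Hypothetical sketch only: the real trait lives in `platform-challenge-sdk`
// and its signature is not shown in this patch.
// Assumes serde with the `derive` feature plus serde_json.
use serde::{Deserialize, Serialize};

/// Assumed request body for the `/evaluate` HTTP endpoint.
#[derive(Debug, Deserialize)]
pub struct EvaluationRequest {
    pub job_id: String,
    pub payload: serde_json::Value,
}

/// Assumed response body returned to the validator.
#[derive(Debug, Serialize)]
pub struct EvaluationResult {
    pub job_id: String,
    pub score: f64,
}

/// Stand-in for `platform_challenge_sdk::ServerChallenge` (shape assumed).
pub trait ServerChallenge {
    /// Handle one evaluation request received on `/evaluate`.
    fn evaluate(&self, request: EvaluationRequest) -> EvaluationResult;

    /// Called on shutdown signals so in-flight work can be flushed.
    fn shutdown(&self);

    /// Serialize state so it can be restored after a hot-reload.
    fn snapshot_state(&self) -> Vec<u8>;
}

pub struct MyChallenge;

impl ServerChallenge for MyChallenge {
    fn evaluate(&self, request: EvaluationRequest) -> EvaluationResult {
        // Score the submission however the challenge defines scoring.
        EvaluationResult {
            job_id: request.job_id,
            score: 1.0,
        }
    }

    fn shutdown(&self) {
        // Persist or abort in-flight evaluations before the container stops.
    }

    fn snapshot_state(&self) -> Vec<u8> {
        // This example keeps no state between reloads.
        Vec::new()
    }
}
```

Wiring the `/evaluate` route to an HTTP server and providing the Docker configuration are challenge-specific and are left out here; the term-challenge repository linked above is the complete, working reference.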
From d8f6e576a4b51066a5f31a0d3a4900b867de4e9d Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 11:04:10 +0000 Subject: [PATCH 2/8] feat: add challenge-registry crate for challenge lifecycle management Create new platform-challenge-registry crate with: - Challenge discovery and registration - Version management (semver-based) - Lifecycle state machine (registered/starting/running/stopping/stopped) - Health monitoring with configurable checks - State persistence and hot-reload support - Migration planning for version upgrades Modules: - registry: Main registry with CRUD operations - lifecycle: State machine for challenge states - health: Health monitoring and status tracking - state: State snapshots for hot-reload - discovery: Challenge discovery from various sources - migration: Version migration planning - version: Semantic versioning support - error: Registry-specific error types --- Cargo.lock | 27 ++ Cargo.toml | 4 +- crates/challenge-registry/Cargo.toml | 42 ++ crates/challenge-registry/src/discovery.rs | 299 +++++++++++++ crates/challenge-registry/src/error.rs | 61 +++ crates/challenge-registry/src/health.rs | 259 ++++++++++++ crates/challenge-registry/src/lib.rs | 41 ++ crates/challenge-registry/src/lifecycle.rs | 162 +++++++ crates/challenge-registry/src/migration.rs | 467 +++++++++++++++++++++ crates/challenge-registry/src/registry.rs | 464 ++++++++++++++++++++ crates/challenge-registry/src/state.rs | 316 ++++++++++++++ crates/challenge-registry/src/version.rs | 164 ++++++++ 12 files changed, 2303 insertions(+), 3 deletions(-) create mode 100644 crates/challenge-registry/Cargo.toml create mode 100644 crates/challenge-registry/src/discovery.rs create mode 100644 crates/challenge-registry/src/error.rs create mode 100644 crates/challenge-registry/src/health.rs create mode 100644 crates/challenge-registry/src/lib.rs create mode 100644 crates/challenge-registry/src/lifecycle.rs create mode 100644 crates/challenge-registry/src/migration.rs create mode 100644 crates/challenge-registry/src/registry.rs create mode 100644 crates/challenge-registry/src/state.rs create mode 100644 crates/challenge-registry/src/version.rs diff --git a/Cargo.lock b/Cargo.lock index 58e2c21..462e0ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4715,6 +4715,33 @@ dependencies = [ "tracing", ] +[[package]] +name = "platform-challenge-registry" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bincode", + "chrono", + "futures", + "hex", + "parking_lot 0.12.5", + "platform-challenge-sdk", + "platform-core", + "platform-storage", + "reqwest 0.12.25", + "semver", + "serde", + "serde_json", + "sha2 0.10.9", + "tempfile", + "thiserror 2.0.17", + "tokio", + "tokio-test", + "tracing", + "uuid", +] + [[package]] name = "platform-challenge-sdk" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index c2e5098..44aa74e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "crates/storage", "crates/distributed-storage", "crates/challenge-sdk", + "crates/challenge-registry", "crates/epoch", "crates/bittensor-integration", "crates/subnet-manager", @@ -90,9 +91,6 @@ reqwest = { version = "0.12", features = ["json"] } [patch.crates-io] w3f-bls = { git = "https://github.com/opentensor/bls", branch = "fix-no-std" } -# Challenge integration (optional - add to crates that need dynamic loading) -libloading = "0.8" - # Clippy lints configuration [workspace.lints.clippy] # Allow these patterns that are intentional in this codebase diff --git a/crates/challenge-registry/Cargo.toml 
b/crates/challenge-registry/Cargo.toml new file mode 100644 index 0000000..5fe3946 --- /dev/null +++ b/crates/challenge-registry/Cargo.toml @@ -0,0 +1,42 @@ +[package] +name = "platform-challenge-registry" +version.workspace = true +edition.workspace = true +description = "Challenge registry and lifecycle management for Platform Network" + +[dependencies] +platform-core = { path = "../core" } +platform-challenge-sdk = { path = "../challenge-sdk" } +platform-storage = { path = "../storage" } + +# Async +tokio = { workspace = true } +async-trait = { workspace = true } +futures = { workspace = true } + +# Serialization +serde = { workspace = true } +serde_json = { workspace = true } +bincode = { workspace = true } + +# Utils +tracing = { workspace = true } +thiserror = { workspace = true } +anyhow = { workspace = true } +chrono = { workspace = true } +parking_lot = { workspace = true } +uuid = { workspace = true } + +# Crypto for checksums +sha2 = { workspace = true } +hex = { workspace = true } + +# Versioning +semver = "1.0" + +# Health checks +reqwest = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } +tokio-test = { workspace = true } diff --git a/crates/challenge-registry/src/discovery.rs b/crates/challenge-registry/src/discovery.rs new file mode 100644 index 0000000..776bf65 --- /dev/null +++ b/crates/challenge-registry/src/discovery.rs @@ -0,0 +1,299 @@ +//! Challenge discovery and auto-registration +//! +//! Discovers challenges from: +//! - Docker registry +//! - File system (local development) +//! - Network announcements (P2P) + +use crate::error::{RegistryError, RegistryResult}; +use crate::version::ChallengeVersion; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; + +/// A discovered challenge that can be registered +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct DiscoveredChallenge { + /// Challenge name + pub name: String, + /// Challenge version + pub version: ChallengeVersion, + /// Docker image (if available) + pub docker_image: Option, + /// Local path (for development) + pub local_path: Option, + /// Health endpoint URL + pub health_endpoint: Option, + /// Evaluation endpoint URL + pub evaluation_endpoint: Option, + /// Challenge metadata + pub metadata: ChallengeMetadata, + /// Source of discovery + pub source: DiscoverySource, +} + +/// Metadata about a challenge +#[derive(Clone, Debug, Default, Serialize, Deserialize)] +pub struct ChallengeMetadata { + /// Human-readable description + pub description: Option, + /// Challenge author + pub author: Option, + /// Repository URL + pub repository: Option, + /// License + pub license: Option, + /// Tags for categorization + pub tags: Vec, + /// Minimum platform version required + pub min_platform_version: Option, +} + +/// Source where a challenge was discovered +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub enum DiscoverySource { + /// Discovered from Docker registry + DockerRegistry(String), + /// Discovered from local filesystem + LocalFilesystem(PathBuf), + /// Announced via P2P network + P2PNetwork(String), + /// Manually configured + Manual, +} + +/// Configuration for challenge discovery +#[derive(Clone, Debug)] +pub struct DiscoveryConfig { + /// Docker registries to scan + pub docker_registries: Vec, + /// Local paths to scan + pub local_paths: Vec, + /// Enable P2P discovery + pub enable_p2p: bool, + /// Auto-register discovered challenges + pub auto_register: bool, + /// Scan interval in seconds + pub scan_interval_secs: u64, +} + +impl 
Default for DiscoveryConfig { + fn default() -> Self { + Self { + docker_registries: vec![], + local_paths: vec![], + enable_p2p: true, + auto_register: false, + scan_interval_secs: 300, // 5 minutes + } + } +} + +/// Discovers challenges from various sources +pub struct ChallengeDiscovery { + /// Configuration + config: DiscoveryConfig, + /// Discovered but not yet registered challenges + discovered: parking_lot::RwLock>, +} + +impl ChallengeDiscovery { + /// Create a new discovery service with default config + pub fn new() -> Self { + Self { + config: DiscoveryConfig::default(), + discovered: parking_lot::RwLock::new(Vec::new()), + } + } + + /// Create with custom config + pub fn with_config(config: DiscoveryConfig) -> Self { + Self { + config, + discovered: parking_lot::RwLock::new(Vec::new()), + } + } + + /// Get the current configuration + pub fn config(&self) -> &DiscoveryConfig { + &self.config + } + + /// Discover challenges from all configured sources + pub fn discover_all(&self) -> RegistryResult> { + let mut all_discovered = Vec::new(); + + // Discover from local paths + for path in &self.config.local_paths { + match self.discover_from_local(path) { + Ok(challenges) => all_discovered.extend(challenges), + Err(e) => { + tracing::warn!(path = ?path, error = %e, "Failed to discover from local path"); + } + } + } + + // Update internal state + let mut discovered = self.discovered.write(); + *discovered = all_discovered.clone(); + + Ok(all_discovered) + } + + /// Discover challenges from a local path + pub fn discover_from_local(&self, path: &PathBuf) -> RegistryResult> { + if !path.exists() { + return Err(RegistryError::InvalidConfig(format!( + "Path does not exist: {:?}", + path + ))); + } + + let mut challenges = Vec::new(); + + // Look for challenge.toml or Cargo.toml with challenge metadata + if path.is_dir() { + let challenge_toml = path.join("challenge.toml"); + let cargo_toml = path.join("Cargo.toml"); + + if challenge_toml.exists() { + // In a real implementation, parse challenge.toml + let name = path + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown") + .to_string(); + + challenges.push(DiscoveredChallenge { + name, + version: ChallengeVersion::default(), + docker_image: None, + local_path: Some(path.clone()), + health_endpoint: None, + evaluation_endpoint: None, + metadata: ChallengeMetadata::default(), + source: DiscoverySource::LocalFilesystem(path.clone()), + }); + } else if cargo_toml.exists() { + // Extract name from Cargo.toml + let name = path + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown") + .to_string(); + + challenges.push(DiscoveredChallenge { + name, + version: ChallengeVersion::default(), + docker_image: None, + local_path: Some(path.clone()), + health_endpoint: None, + evaluation_endpoint: None, + metadata: ChallengeMetadata::default(), + source: DiscoverySource::LocalFilesystem(path.clone()), + }); + } + } + + Ok(challenges) + } + + /// Manually add a discovered challenge + pub fn add_discovered(&self, challenge: DiscoveredChallenge) { + let mut discovered = self.discovered.write(); + discovered.push(challenge); + } + + /// Get all discovered challenges + pub fn get_discovered(&self) -> Vec { + self.discovered.read().clone() + } + + /// Clear discovered challenges + pub fn clear_discovered(&self) { + self.discovered.write().clear(); + } + + /// Check if auto-registration is enabled + pub fn auto_register_enabled(&self) -> bool { + self.config.auto_register + } +} + +impl Default for ChallengeDiscovery { + fn default() -> 
Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_discovery_source_equality() { + assert_eq!(DiscoverySource::Manual, DiscoverySource::Manual); + assert_ne!( + DiscoverySource::Manual, + DiscoverySource::P2PNetwork("test".to_string()) + ); + } + + #[test] + fn test_discovered_challenge() { + let challenge = DiscoveredChallenge { + name: "test-challenge".to_string(), + version: ChallengeVersion::new(1, 0, 0), + docker_image: Some("test:latest".to_string()), + local_path: None, + health_endpoint: Some("http://localhost:8080/health".to_string()), + evaluation_endpoint: Some("http://localhost:8080/evaluate".to_string()), + metadata: ChallengeMetadata { + description: Some("A test challenge".to_string()), + author: Some("Platform".to_string()), + ..Default::default() + }, + source: DiscoverySource::Manual, + }; + + assert_eq!(challenge.name, "test-challenge"); + assert!(challenge.docker_image.is_some()); + } + + #[test] + fn test_discovery_service() { + let discovery = ChallengeDiscovery::new(); + + assert!(discovery.get_discovered().is_empty()); + + discovery.add_discovered(DiscoveredChallenge { + name: "manual".to_string(), + version: ChallengeVersion::new(1, 0, 0), + docker_image: None, + local_path: None, + health_endpoint: None, + evaluation_endpoint: None, + metadata: ChallengeMetadata::default(), + source: DiscoverySource::Manual, + }); + + assert_eq!(discovery.get_discovered().len(), 1); + + discovery.clear_discovered(); + assert!(discovery.get_discovered().is_empty()); + } + + #[test] + fn test_discovery_config() { + let config = DiscoveryConfig { + docker_registries: vec!["registry.example.com".to_string()], + local_paths: vec![PathBuf::from("/challenges")], + enable_p2p: false, + auto_register: true, + scan_interval_secs: 60, + }; + + let discovery = ChallengeDiscovery::with_config(config); + assert!(discovery.auto_register_enabled()); + assert_eq!(discovery.config().scan_interval_secs, 60); + } +} diff --git a/crates/challenge-registry/src/error.rs b/crates/challenge-registry/src/error.rs new file mode 100644 index 0000000..369db73 --- /dev/null +++ b/crates/challenge-registry/src/error.rs @@ -0,0 +1,61 @@ +//! 
Error types for challenge registry + +use thiserror::Error; + +/// Result type for registry operations +pub type RegistryResult = Result; + +/// Errors that can occur in the challenge registry +#[derive(Error, Debug)] +pub enum RegistryError { + #[error("Challenge not found: {0}")] + ChallengeNotFound(String), + + #[error("Challenge already registered: {0}")] + AlreadyRegistered(String), + + #[error("Version conflict: {0}")] + VersionConflict(String), + + #[error("Migration failed: {0}")] + MigrationFailed(String), + + #[error("Health check failed: {0}")] + HealthCheckFailed(String), + + #[error("State persistence error: {0}")] + StatePersistence(String), + + #[error("State restoration error: {0}")] + StateRestoration(String), + + #[error("Invalid challenge configuration: {0}")] + InvalidConfig(String), + + #[error("Serialization error: {0}")] + Serialization(String), + + #[error("Network error: {0}")] + Network(String), + + #[error("Internal error: {0}")] + Internal(String), +} + +impl From for RegistryError { + fn from(err: std::io::Error) -> Self { + RegistryError::Internal(err.to_string()) + } +} + +impl From for RegistryError { + fn from(err: serde_json::Error) -> Self { + RegistryError::Serialization(err.to_string()) + } +} + +impl From for RegistryError { + fn from(err: bincode::Error) -> Self { + RegistryError::Serialization(err.to_string()) + } +} diff --git a/crates/challenge-registry/src/health.rs b/crates/challenge-registry/src/health.rs new file mode 100644 index 0000000..e142fdb --- /dev/null +++ b/crates/challenge-registry/src/health.rs @@ -0,0 +1,259 @@ +//! Health monitoring for challenges +//! +//! Monitors challenge health through: +//! - HTTP health endpoints +//! - Container status +//! - Resource usage + +use platform_core::ChallengeId; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::time::Duration; +use parking_lot::RwLock; + +/// Health status of a challenge +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum HealthStatus { + /// Health status is unknown (not yet checked) + Unknown, + /// Challenge is healthy + Healthy, + /// Challenge is degraded but operational + Degraded(String), + /// Challenge is unhealthy + Unhealthy(String), +} + +impl Default for HealthStatus { + fn default() -> Self { + Self::Unknown + } +} + +/// Detailed health information for a challenge +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ChallengeHealth { + /// Challenge identifier + pub challenge_id: ChallengeId, + /// Current health status + pub status: HealthStatus, + /// Last successful health check timestamp (millis) + pub last_check_at: i64, + /// Number of consecutive failures + pub consecutive_failures: u32, + /// Average response time in milliseconds + pub avg_response_time_ms: f64, + /// Additional health metrics + pub metrics: HashMap, +} + +impl ChallengeHealth { + /// Create new health info for a challenge + pub fn new(challenge_id: ChallengeId) -> Self { + Self { + challenge_id, + status: HealthStatus::Unknown, + last_check_at: 0, + consecutive_failures: 0, + avg_response_time_ms: 0.0, + metrics: HashMap::new(), + } + } + + /// Check if the challenge is considered healthy + pub fn is_healthy(&self) -> bool { + matches!(self.status, HealthStatus::Healthy) + } + + /// Check if the challenge is operational (healthy or degraded) + pub fn is_operational(&self) -> bool { + matches!(self.status, HealthStatus::Healthy | HealthStatus::Degraded(_)) + } + + /// Record a successful health check + pub fn record_success(&mut 
self, response_time_ms: f64) { + self.status = HealthStatus::Healthy; + self.last_check_at = chrono::Utc::now().timestamp_millis(); + self.consecutive_failures = 0; + + // Exponential moving average for response time + if self.avg_response_time_ms == 0.0 { + self.avg_response_time_ms = response_time_ms; + } else { + self.avg_response_time_ms = self.avg_response_time_ms * 0.8 + response_time_ms * 0.2; + } + } + + /// Record a failed health check + pub fn record_failure(&mut self, reason: String) { + self.consecutive_failures += 1; + self.last_check_at = chrono::Utc::now().timestamp_millis(); + + if self.consecutive_failures >= 3 { + self.status = HealthStatus::Unhealthy(reason); + } else { + self.status = HealthStatus::Degraded(reason); + } + } +} + +/// Configuration for health monitoring +#[derive(Clone, Debug)] +pub struct HealthConfig { + /// Interval between health checks + pub check_interval: Duration, + /// Timeout for health check requests + pub check_timeout: Duration, + /// Number of failures before marking unhealthy + pub failure_threshold: u32, + /// Number of successes to recover from unhealthy + pub recovery_threshold: u32, +} + +impl Default for HealthConfig { + fn default() -> Self { + Self { + check_interval: Duration::from_secs(30), + check_timeout: Duration::from_secs(5), + failure_threshold: 3, + recovery_threshold: 2, + } + } +} + +/// Monitors health of registered challenges +pub struct HealthMonitor { + /// Health state for each challenge + health_state: RwLock>, + /// Configuration + config: HealthConfig, +} + +impl HealthMonitor { + /// Create a new health monitor with default config + pub fn new() -> Self { + Self { + health_state: RwLock::new(HashMap::new()), + config: HealthConfig::default(), + } + } + + /// Create a health monitor with custom config + pub fn with_config(config: HealthConfig) -> Self { + Self { + health_state: RwLock::new(HashMap::new()), + config, + } + } + + /// Register a challenge for health monitoring + pub fn register(&self, challenge_id: ChallengeId) { + let mut state = self.health_state.write(); + state.insert(challenge_id, ChallengeHealth::new(challenge_id)); + } + + /// Unregister a challenge from health monitoring + pub fn unregister(&self, challenge_id: &ChallengeId) { + let mut state = self.health_state.write(); + state.remove(challenge_id); + } + + /// Get health status for a challenge + pub fn get_health(&self, challenge_id: &ChallengeId) -> Option { + self.health_state.read().get(challenge_id).cloned() + } + + /// Get health status for all challenges + pub fn get_all_health(&self) -> Vec { + self.health_state.read().values().cloned().collect() + } + + /// Update health status after a check + pub fn update_health(&self, challenge_id: &ChallengeId, status: HealthStatus) { + let mut state = self.health_state.write(); + if let Some(health) = state.get_mut(challenge_id) { + health.status = status; + health.last_check_at = chrono::Utc::now().timestamp_millis(); + } + } + + /// Record a successful health check + pub fn record_success(&self, challenge_id: &ChallengeId, response_time_ms: f64) { + let mut state = self.health_state.write(); + if let Some(health) = state.get_mut(challenge_id) { + health.record_success(response_time_ms); + } + } + + /// Record a failed health check + pub fn record_failure(&self, challenge_id: &ChallengeId, reason: String) { + let mut state = self.health_state.write(); + if let Some(health) = state.get_mut(challenge_id) { + health.record_failure(reason); + } + } + + /// Get the health config + pub fn 
config(&self) -> &HealthConfig { + &self.config + } +} + +impl Default for HealthMonitor { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_health_status() { + let mut health = ChallengeHealth::new(ChallengeId::new()); + + assert_eq!(health.status, HealthStatus::Unknown); + assert!(!health.is_healthy()); + + health.record_success(50.0); + assert!(health.is_healthy()); + assert!(health.is_operational()); + + health.record_failure("timeout".to_string()); + assert!(!health.is_healthy()); + assert!(health.is_operational()); // Still degraded + + health.record_failure("timeout".to_string()); + health.record_failure("timeout".to_string()); + assert!(!health.is_operational()); // Now unhealthy + } + + #[test] + fn test_health_monitor() { + let monitor = HealthMonitor::new(); + let id = ChallengeId::new(); + + monitor.register(id); + assert!(monitor.get_health(&id).is_some()); + + monitor.record_success(&id, 100.0); + let health = monitor.get_health(&id).unwrap(); + assert!(health.is_healthy()); + + monitor.unregister(&id); + assert!(monitor.get_health(&id).is_none()); + } + + #[test] + fn test_response_time_averaging() { + let mut health = ChallengeHealth::new(ChallengeId::new()); + + health.record_success(100.0); + assert_eq!(health.avg_response_time_ms, 100.0); + + health.record_success(200.0); + // 100 * 0.8 + 200 * 0.2 = 80 + 40 = 120 + assert!((health.avg_response_time_ms - 120.0).abs() < 0.01); + } +} diff --git a/crates/challenge-registry/src/lib.rs b/crates/challenge-registry/src/lib.rs new file mode 100644 index 0000000..6161212 --- /dev/null +++ b/crates/challenge-registry/src/lib.rs @@ -0,0 +1,41 @@ +//! Challenge Registry for Platform Network +//! +//! Manages the lifecycle of challenge crates including: +//! - Challenge discovery and registration +//! - Version management and migrations +//! - Hot-reload support with state preservation +//! - Health monitoring +//! +//! # Architecture +//! +//! ```text +//! ┌─────────────────────────────────────────────────────────────┐ +//! │ Challenge Registry │ +//! ├─────────────────────────────────────────────────────────────┤ +//! │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ +//! │ │ Discovery │ │ Lifecycle │ │ Health │ │ +//! │ │ Manager │ │ Manager │ │ Monitor │ │ +//! │ └─────────────┘ └─────────────┘ └─────────────┘ │ +//! ├─────────────────────────────────────────────────────────────┤ +//! │ Challenge State Store │ +//! │ (evaluations, checkpoints, migrations) │ +//! └─────────────────────────────────────────────────────────────┘ +//! ``` + +pub mod discovery; +pub mod error; +pub mod health; +pub mod lifecycle; +pub mod migration; +pub mod registry; +pub mod state; +pub mod version; + +pub use discovery::{ChallengeDiscovery, DiscoveredChallenge}; +pub use error::{RegistryError, RegistryResult}; +pub use health::{ChallengeHealth, HealthMonitor, HealthStatus}; +pub use lifecycle::{ChallengeLifecycle, LifecycleEvent, LifecycleState}; +pub use migration::{ChallengeMigration, MigrationPlan, MigrationStatus}; +pub use registry::{ChallengeEntry, ChallengeRegistry, RegisteredChallenge}; +pub use state::{ChallengeState, StateSnapshot, StateStore}; +pub use version::{ChallengeVersion, VersionConstraint, VersionedChallenge}; diff --git a/crates/challenge-registry/src/lifecycle.rs b/crates/challenge-registry/src/lifecycle.rs new file mode 100644 index 0000000..a2ba334 --- /dev/null +++ b/crates/challenge-registry/src/lifecycle.rs @@ -0,0 +1,162 @@ +//! 
Challenge lifecycle management +//! +//! Handles state transitions for challenges: +//! Registered -> Starting -> Running -> Stopping -> Stopped + +use crate::version::ChallengeVersion; +use platform_core::ChallengeId; +use serde::{Deserialize, Serialize}; + +/// State of a challenge in its lifecycle +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum LifecycleState { + /// Challenge is registered but not started + Registered, + /// Challenge is starting up + Starting, + /// Challenge is running and accepting evaluations + Running, + /// Challenge is being stopped gracefully + Stopping, + /// Challenge is stopped + Stopped, + /// Challenge failed to start or crashed + Failed(String), + /// Challenge is being migrated to a new version + Migrating, +} + +impl Default for LifecycleState { + fn default() -> Self { + Self::Registered + } +} + +/// Events emitted during lifecycle transitions +#[derive(Clone, Debug)] +pub enum LifecycleEvent { + /// Challenge was registered + Registered { challenge_id: ChallengeId }, + /// Challenge was unregistered + Unregistered { challenge_id: ChallengeId }, + /// Challenge state changed + StateChanged { + challenge_id: ChallengeId, + old_state: LifecycleState, + new_state: LifecycleState, + }, + /// Challenge version changed (hot-reload) + VersionChanged { + challenge_id: ChallengeId, + old_version: ChallengeVersion, + new_version: ChallengeVersion, + }, +} + +/// Manages challenge lifecycle transitions +pub struct ChallengeLifecycle { + /// Whether to allow automatic restarts on failure + auto_restart: bool, + /// Maximum restart attempts + max_restart_attempts: u32, +} + +impl ChallengeLifecycle { + /// Create a new lifecycle manager + pub fn new() -> Self { + Self { + auto_restart: true, + max_restart_attempts: 3, + } + } + + /// Configure auto-restart behavior + pub fn with_auto_restart(mut self, enabled: bool, max_attempts: u32) -> Self { + self.auto_restart = enabled; + self.max_restart_attempts = max_attempts; + self + } + + /// Check if a state transition is valid + pub fn is_valid_transition(&self, from: &LifecycleState, to: &LifecycleState) -> bool { + match (from, to) { + // From Registered + (LifecycleState::Registered, LifecycleState::Starting) => true, + (LifecycleState::Registered, LifecycleState::Stopped) => true, + + // From Starting + (LifecycleState::Starting, LifecycleState::Running) => true, + (LifecycleState::Starting, LifecycleState::Failed(_)) => true, + + // From Running + (LifecycleState::Running, LifecycleState::Stopping) => true, + (LifecycleState::Running, LifecycleState::Failed(_)) => true, + (LifecycleState::Running, LifecycleState::Migrating) => true, + + // From Stopping + (LifecycleState::Stopping, LifecycleState::Stopped) => true, + + // From Stopped + (LifecycleState::Stopped, LifecycleState::Starting) => true, + (LifecycleState::Stopped, LifecycleState::Registered) => true, + + // From Failed + (LifecycleState::Failed(_), LifecycleState::Starting) => true, + (LifecycleState::Failed(_), LifecycleState::Stopped) => true, + + // From Migrating + (LifecycleState::Migrating, LifecycleState::Running) => true, + (LifecycleState::Migrating, LifecycleState::Failed(_)) => true, + + _ => false, + } + } + + /// Check if auto-restart is enabled + pub fn auto_restart_enabled(&self) -> bool { + self.auto_restart + } + + /// Get max restart attempts + pub fn max_restart_attempts(&self) -> u32 { + self.max_restart_attempts + } +} + +impl Default for ChallengeLifecycle { + fn default() -> Self { + Self::new() + } +} + 
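As a usage sketch that is not part of this patch: the transition table above is advisory, and `ChallengeRegistry::update_state` (defined later in this same patch) does not consult it, so a caller would combine the two roughly as below. The `try_start` helper and its choice of `RegistryError::InvalidConfig` are illustrative only.

```rust
// Illustrative helper (not in the patch): gate a registry state change on the
// lifecycle transition table before recording it.
use platform_challenge_registry::{
    ChallengeLifecycle, ChallengeRegistry, LifecycleState, RegistryError, RegistryResult,
};
use platform_core::ChallengeId;

fn try_start(
    registry: &ChallengeRegistry,
    lifecycle: &ChallengeLifecycle,
    id: &ChallengeId,
    current: &LifecycleState,
) -> RegistryResult<()> {
    if lifecycle.is_valid_transition(current, &LifecycleState::Starting) {
        // Registered, Stopped, or Failed -> Starting is allowed by the table.
        registry.update_state(id, LifecycleState::Starting)
    } else {
        Err(RegistryError::InvalidConfig(format!(
            "cannot start challenge from state {:?}",
            current
        )))
    }
}
```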
+#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_valid_transitions() { + let lifecycle = ChallengeLifecycle::new(); + + assert!(lifecycle.is_valid_transition(&LifecycleState::Registered, &LifecycleState::Starting)); + assert!(lifecycle.is_valid_transition(&LifecycleState::Starting, &LifecycleState::Running)); + assert!(lifecycle.is_valid_transition(&LifecycleState::Running, &LifecycleState::Stopping)); + assert!(lifecycle.is_valid_transition(&LifecycleState::Stopping, &LifecycleState::Stopped)); + } + + #[test] + fn test_invalid_transitions() { + let lifecycle = ChallengeLifecycle::new(); + + assert!(!lifecycle.is_valid_transition(&LifecycleState::Registered, &LifecycleState::Running)); + assert!(!lifecycle.is_valid_transition(&LifecycleState::Stopped, &LifecycleState::Running)); + } + + #[test] + fn test_lifecycle_config() { + let lifecycle = ChallengeLifecycle::new() + .with_auto_restart(false, 5); + + assert!(!lifecycle.auto_restart_enabled()); + assert_eq!(lifecycle.max_restart_attempts(), 5); + } +} diff --git a/crates/challenge-registry/src/migration.rs b/crates/challenge-registry/src/migration.rs new file mode 100644 index 0000000..002c543 --- /dev/null +++ b/crates/challenge-registry/src/migration.rs @@ -0,0 +1,467 @@ +//! Challenge migration support +//! +//! Handles version migrations for challenges: +//! - Schema migrations +//! - State transformations +//! - Rollback support + +use crate::error::{RegistryError, RegistryResult}; +use crate::version::ChallengeVersion; +use platform_core::ChallengeId; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +/// Status of a migration +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum MigrationStatus { + /// Migration is pending + Pending, + /// Migration is in progress + InProgress, + /// Migration completed successfully + Completed, + /// Migration failed + Failed(String), + /// Migration was rolled back + RolledBack, +} + +/// A single migration step +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct MigrationStep { + /// Step identifier + pub id: String, + /// Description of what this step does + pub description: String, + /// From version + pub from_version: ChallengeVersion, + /// To version + pub to_version: ChallengeVersion, + /// Whether this step is reversible + pub reversible: bool, + /// Estimated duration in seconds + pub estimated_duration_secs: u64, +} + +impl MigrationStep { + /// Create a new migration step + pub fn new( + id: String, + description: String, + from: ChallengeVersion, + to: ChallengeVersion, + ) -> Self { + Self { + id, + description, + from_version: from, + to_version: to, + reversible: true, + estimated_duration_secs: 60, + } + } + + /// Mark step as irreversible + pub fn irreversible(mut self) -> Self { + self.reversible = false; + self + } + + /// Set estimated duration + pub fn with_duration(mut self, secs: u64) -> Self { + self.estimated_duration_secs = secs; + self + } +} + +/// A plan for migrating a challenge between versions +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct MigrationPlan { + /// Challenge being migrated + pub challenge_id: ChallengeId, + /// Challenge name + pub challenge_name: String, + /// Source version + pub from_version: ChallengeVersion, + /// Target version + pub to_version: ChallengeVersion, + /// Ordered list of migration steps + pub steps: Vec, + /// Current status + pub status: MigrationStatus, + /// Index of current step (0-based) + pub current_step: usize, + /// Plan creation timestamp + 
pub created_at: i64, + /// Plan start timestamp (if started) + pub started_at: Option, + /// Plan completion timestamp (if completed) + pub completed_at: Option, +} + +impl MigrationPlan { + /// Create a new migration plan + pub fn new( + challenge_id: ChallengeId, + challenge_name: String, + from_version: ChallengeVersion, + to_version: ChallengeVersion, + ) -> Self { + Self { + challenge_id, + challenge_name, + from_version, + to_version, + steps: Vec::new(), + status: MigrationStatus::Pending, + current_step: 0, + created_at: chrono::Utc::now().timestamp_millis(), + started_at: None, + completed_at: None, + } + } + + /// Add a migration step + pub fn add_step(&mut self, step: MigrationStep) { + self.steps.push(step); + } + + /// Check if the plan has any steps + pub fn is_empty(&self) -> bool { + self.steps.is_empty() + } + + /// Get total number of steps + pub fn total_steps(&self) -> usize { + self.steps.len() + } + + /// Get estimated total duration + pub fn estimated_duration_secs(&self) -> u64 { + self.steps.iter().map(|s| s.estimated_duration_secs).sum() + } + + /// Check if migration is complete + pub fn is_complete(&self) -> bool { + matches!(self.status, MigrationStatus::Completed | MigrationStatus::RolledBack) + } + + /// Check if migration can be rolled back + pub fn can_rollback(&self) -> bool { + // Can rollback if all executed steps are reversible + self.steps + .iter() + .take(self.current_step) + .all(|s| s.reversible) + } + + /// Get progress as percentage + pub fn progress_percent(&self) -> f64 { + if self.steps.is_empty() { + return 100.0; + } + (self.current_step as f64 / self.steps.len() as f64) * 100.0 + } +} + +/// Record of a completed migration +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct MigrationRecord { + /// Migration plan + pub plan: MigrationPlan, + /// Execution logs + pub logs: Vec, +} + +/// Log entry for migration +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct MigrationLog { + /// Timestamp + pub timestamp: i64, + /// Log level + pub level: LogLevel, + /// Message + pub message: String, + /// Associated step ID (if any) + pub step_id: Option, +} + +/// Log level for migration logs +#[derive(Clone, Debug, Serialize, Deserialize)] +pub enum LogLevel { + Info, + Warning, + Error, +} + +/// Manages challenge migrations +pub struct ChallengeMigration { + /// Active migration plans + active_plans: parking_lot::RwLock>, + /// Migration history + history: parking_lot::RwLock>, + /// Maximum history to retain + max_history: usize, +} + +impl ChallengeMigration { + /// Create a new migration manager + pub fn new() -> Self { + Self { + active_plans: parking_lot::RwLock::new(HashMap::new()), + history: parking_lot::RwLock::new(Vec::new()), + max_history: 100, + } + } + + /// Create a migration plan between versions + pub fn create_plan( + &self, + challenge_id: ChallengeId, + challenge_name: String, + from_version: ChallengeVersion, + to_version: ChallengeVersion, + ) -> RegistryResult { + // Check if there's already an active migration + if self.active_plans.read().contains_key(&challenge_id) { + return Err(RegistryError::MigrationFailed( + "Migration already in progress".to_string(), + )); + } + + let mut plan = MigrationPlan::new(challenge_id, challenge_name, from_version.clone(), to_version.clone()); + + // Generate migration steps based on version difference + // This is a simplified version - real implementation would analyze schemas + if from_version.major != to_version.major { + plan.add_step( + MigrationStep::new( + 
"major_upgrade".to_string(), + format!("Major version upgrade from {} to {}", from_version.major, to_version.major), + from_version.clone(), + to_version.clone(), + ) + .irreversible() + .with_duration(300), + ); + } else if from_version.minor != to_version.minor { + plan.add_step( + MigrationStep::new( + "minor_upgrade".to_string(), + format!("Minor version upgrade from {} to {}", from_version, to_version), + from_version.clone(), + to_version.clone(), + ) + .with_duration(60), + ); + } else if from_version.patch != to_version.patch { + plan.add_step( + MigrationStep::new( + "patch_upgrade".to_string(), + format!("Patch version upgrade from {} to {}", from_version, to_version), + from_version, + to_version, + ) + .with_duration(10), + ); + } + + Ok(plan) + } + + /// Start executing a migration plan + pub fn start_migration(&self, plan: MigrationPlan) -> RegistryResult<()> { + let challenge_id = plan.challenge_id; + + let mut plans = self.active_plans.write(); + if plans.contains_key(&challenge_id) { + return Err(RegistryError::MigrationFailed( + "Migration already in progress".to_string(), + )); + } + + let mut plan = plan; + plan.status = MigrationStatus::InProgress; + plan.started_at = Some(chrono::Utc::now().timestamp_millis()); + + plans.insert(challenge_id, plan); + Ok(()) + } + + /// Get active migration for a challenge + pub fn get_active_migration(&self, challenge_id: &ChallengeId) -> Option { + self.active_plans.read().get(challenge_id).cloned() + } + + /// Complete a migration step + pub fn complete_step(&self, challenge_id: &ChallengeId) -> RegistryResult { + let mut plans = self.active_plans.write(); + let plan = plans + .get_mut(challenge_id) + .ok_or_else(|| RegistryError::MigrationFailed("No active migration".to_string()))?; + + plan.current_step += 1; + + // Check if all steps complete + if plan.current_step >= plan.steps.len() { + plan.status = MigrationStatus::Completed; + plan.completed_at = Some(chrono::Utc::now().timestamp_millis()); + Ok(true) + } else { + Ok(false) + } + } + + /// Fail a migration + pub fn fail_migration(&self, challenge_id: &ChallengeId, reason: String) -> RegistryResult<()> { + let mut plans = self.active_plans.write(); + let plan = plans + .get_mut(challenge_id) + .ok_or_else(|| RegistryError::MigrationFailed("No active migration".to_string()))?; + + plan.status = MigrationStatus::Failed(reason); + plan.completed_at = Some(chrono::Utc::now().timestamp_millis()); + + Ok(()) + } + + /// Finalize and archive a completed migration + pub fn finalize_migration(&self, challenge_id: &ChallengeId) -> RegistryResult { + let plan = self + .active_plans + .write() + .remove(challenge_id) + .ok_or_else(|| RegistryError::MigrationFailed("No active migration".to_string()))?; + + if !plan.is_complete() { + return Err(RegistryError::MigrationFailed( + "Migration not complete".to_string(), + )); + } + + // Add to history + let record = MigrationRecord { + plan: plan.clone(), + logs: Vec::new(), + }; + + let mut history = self.history.write(); + history.push(record); + + // Trim history + while history.len() > self.max_history { + history.remove(0); + } + + Ok(plan) + } + + /// Get migration history for a challenge + pub fn get_history(&self, challenge_id: &ChallengeId) -> Vec { + self.history + .read() + .iter() + .filter(|r| r.plan.challenge_id == *challenge_id) + .cloned() + .collect() + } + + /// Get all migration history + pub fn get_all_history(&self) -> Vec { + self.history.read().clone() + } +} + +impl Default for ChallengeMigration { + fn default() -> Self 
{ + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_migration_plan_creation() { + let migration = ChallengeMigration::new(); + let id = ChallengeId::new(); + + let plan = migration + .create_plan( + id, + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + ChallengeVersion::new(1, 1, 0), + ) + .unwrap(); + + assert_eq!(plan.total_steps(), 1); + assert!(!plan.is_complete()); + assert_eq!(plan.progress_percent(), 0.0); + } + + #[test] + fn test_migration_execution() { + let migration = ChallengeMigration::new(); + let id = ChallengeId::new(); + + let plan = migration + .create_plan( + id, + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + ChallengeVersion::new(1, 0, 1), + ) + .unwrap(); + + migration.start_migration(plan).unwrap(); + + let active = migration.get_active_migration(&id); + assert!(active.is_some()); + assert!(matches!(active.unwrap().status, MigrationStatus::InProgress)); + + let complete = migration.complete_step(&id).unwrap(); + assert!(complete); + + let finalized = migration.finalize_migration(&id).unwrap(); + assert!(matches!(finalized.status, MigrationStatus::Completed)); + } + + #[test] + fn test_duplicate_migration_prevention() { + let migration = ChallengeMigration::new(); + let id = ChallengeId::new(); + + let plan = migration + .create_plan( + id, + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + ChallengeVersion::new(1, 1, 0), + ) + .unwrap(); + + migration.start_migration(plan.clone()).unwrap(); + let result = migration.start_migration(plan); + assert!(result.is_err()); + } + + #[test] + fn test_major_version_migration() { + let migration = ChallengeMigration::new(); + let id = ChallengeId::new(); + + let plan = migration + .create_plan( + id, + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + ChallengeVersion::new(2, 0, 0), + ) + .unwrap(); + + // Major version migrations are irreversible + assert!(!plan.steps[0].reversible); + } +} diff --git a/crates/challenge-registry/src/registry.rs b/crates/challenge-registry/src/registry.rs new file mode 100644 index 0000000..1c2a0bd --- /dev/null +++ b/crates/challenge-registry/src/registry.rs @@ -0,0 +1,464 @@ +//! 
Main challenge registry implementation + +use crate::error::{RegistryError, RegistryResult}; +use crate::health::{HealthMonitor, HealthStatus}; +use crate::lifecycle::{ChallengeLifecycle, LifecycleEvent, LifecycleState}; +use crate::state::StateStore; +use crate::version::ChallengeVersion; +use parking_lot::RwLock; +use platform_core::ChallengeId; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::sync::Arc; +use tracing::{debug, info, warn}; + +/// Entry for a registered challenge +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ChallengeEntry { + /// Unique challenge ID + pub id: ChallengeId, + /// Challenge name + pub name: String, + /// Current version + pub version: ChallengeVersion, + /// Docker image for the challenge + pub docker_image: String, + /// HTTP endpoint for evaluation + pub endpoint: Option, + /// Current lifecycle state + pub lifecycle_state: LifecycleState, + /// Health status + pub health_status: HealthStatus, + /// Registration timestamp + pub registered_at: i64, + /// Last updated timestamp + pub updated_at: i64, + /// Configuration metadata + pub metadata: serde_json::Value, +} + +impl ChallengeEntry { + pub fn new(name: String, version: ChallengeVersion, docker_image: String) -> Self { + let now = chrono::Utc::now().timestamp_millis(); + Self { + id: ChallengeId::new(), + name, + version, + docker_image, + endpoint: None, + lifecycle_state: LifecycleState::Registered, + health_status: HealthStatus::Unknown, + registered_at: now, + updated_at: now, + metadata: serde_json::Value::Null, + } + } + + pub fn with_endpoint(mut self, endpoint: String) -> Self { + self.endpoint = Some(endpoint); + self + } + + pub fn with_metadata(mut self, metadata: serde_json::Value) -> Self { + self.metadata = metadata; + self + } +} + +/// A registered challenge with its full state +#[derive(Clone, Debug)] +pub struct RegisteredChallenge { + pub entry: ChallengeEntry, + pub state_store: Arc, +} + +/// Main challenge registry +pub struct ChallengeRegistry { + /// Registered challenges by ID + challenges: RwLock>, + /// Name to ID mapping for lookups + name_index: RwLock>, + /// Lifecycle manager + lifecycle: Arc, + /// Health monitor + health_monitor: Arc, + /// Event listeners + event_listeners: RwLock>>, +} + +impl ChallengeRegistry { + /// Create a new challenge registry + pub fn new() -> Self { + Self { + challenges: RwLock::new(HashMap::new()), + name_index: RwLock::new(HashMap::new()), + lifecycle: Arc::new(ChallengeLifecycle::new()), + health_monitor: Arc::new(HealthMonitor::new()), + event_listeners: RwLock::new(Vec::new()), + } + } + + /// Register a new challenge + pub fn register(&self, entry: ChallengeEntry) -> RegistryResult { + let mut challenges = self.challenges.write(); + let mut name_index = self.name_index.write(); + + // Check if already registered by name + if name_index.contains_key(&entry.name) { + return Err(RegistryError::AlreadyRegistered(entry.name.clone())); + } + + let id = entry.id; + let name = entry.name.clone(); + + let state_store = Arc::new(StateStore::new(id)); + let registered = RegisteredChallenge { + entry, + state_store, + }; + + challenges.insert(id, registered); + name_index.insert(name.clone(), id); + + info!(challenge_id = %id, name = %name, "Challenge registered"); + self.emit_event(LifecycleEvent::Registered { challenge_id: id }); + + Ok(id) + } + + /// Unregister a challenge + pub fn unregister(&self, id: &ChallengeId) -> RegistryResult { + let mut challenges = self.challenges.write(); + let mut 
name_index = self.name_index.write(); + + let registered = challenges + .remove(id) + .ok_or_else(|| RegistryError::ChallengeNotFound(id.to_string()))?; + + name_index.remove(®istered.entry.name); + + info!(challenge_id = %id, "Challenge unregistered"); + self.emit_event(LifecycleEvent::Unregistered { challenge_id: *id }); + + Ok(registered.entry) + } + + /// Get a challenge by ID + pub fn get(&self, id: &ChallengeId) -> Option { + self.challenges.read().get(id).cloned() + } + + /// Get a challenge by name + pub fn get_by_name(&self, name: &str) -> Option { + let name_index = self.name_index.read(); + let id = name_index.get(name)?; + self.challenges.read().get(id).cloned() + } + + /// List all registered challenges + pub fn list(&self) -> Vec { + self.challenges + .read() + .values() + .map(|r| r.entry.clone()) + .collect() + } + + /// List active challenges (running and healthy) + pub fn list_active(&self) -> Vec { + self.challenges + .read() + .values() + .filter(|r| { + r.entry.lifecycle_state == LifecycleState::Running + && r.entry.health_status == HealthStatus::Healthy + }) + .map(|r| r.entry.clone()) + .collect() + } + + /// Update challenge lifecycle state + pub fn update_state( + &self, + id: &ChallengeId, + new_state: LifecycleState, + ) -> RegistryResult<()> { + let mut challenges = self.challenges.write(); + let registered = challenges + .get_mut(id) + .ok_or_else(|| RegistryError::ChallengeNotFound(id.to_string()))?; + + let old_state = registered.entry.lifecycle_state.clone(); + registered.entry.lifecycle_state = new_state.clone(); + registered.entry.updated_at = chrono::Utc::now().timestamp_millis(); + + debug!( + challenge_id = %id, + old_state = ?old_state, + new_state = ?new_state, + "Challenge state updated" + ); + + self.emit_event(LifecycleEvent::StateChanged { + challenge_id: *id, + old_state, + new_state, + }); + + Ok(()) + } + + /// Update challenge health status + pub fn update_health(&self, id: &ChallengeId, status: HealthStatus) -> RegistryResult<()> { + let mut challenges = self.challenges.write(); + let registered = challenges + .get_mut(id) + .ok_or_else(|| RegistryError::ChallengeNotFound(id.to_string()))?; + + registered.entry.health_status = status; + registered.entry.updated_at = chrono::Utc::now().timestamp_millis(); + + Ok(()) + } + + /// Update challenge version (for hot-reload) + pub fn update_version( + &self, + id: &ChallengeId, + new_version: ChallengeVersion, + ) -> RegistryResult { + let mut challenges = self.challenges.write(); + let registered = challenges + .get_mut(id) + .ok_or_else(|| RegistryError::ChallengeNotFound(id.to_string()))?; + + let old_version = registered.entry.version.clone(); + + if !new_version.is_compatible_with(&old_version) { + warn!( + challenge_id = %id, + old = %old_version, + new = %new_version, + "Breaking version change detected" + ); + } + + registered.entry.version = new_version.clone(); + registered.entry.updated_at = chrono::Utc::now().timestamp_millis(); + + info!( + challenge_id = %id, + old_version = %old_version, + new_version = %new_version, + "Challenge version updated" + ); + + self.emit_event(LifecycleEvent::VersionChanged { + challenge_id: *id, + old_version: old_version.clone(), + new_version, + }); + + Ok(old_version) + } + + /// Get state store for a challenge + pub fn state_store(&self, id: &ChallengeId) -> Option> { + self.challenges.read().get(id).map(|r| r.state_store.clone()) + } + + /// Add event listener + pub fn on_event(&self, listener: F) + where + F: Fn(LifecycleEvent) + Send + Sync + 
'static, + { + self.event_listeners.write().push(Box::new(listener)); + } + + /// Emit lifecycle event to all listeners + fn emit_event(&self, event: LifecycleEvent) { + for listener in self.event_listeners.read().iter() { + listener(event.clone()); + } + } + + /// Get lifecycle manager + pub fn lifecycle(&self) -> Arc { + self.lifecycle.clone() + } + + /// Get health monitor + pub fn health_monitor(&self) -> Arc { + self.health_monitor.clone() + } + + /// Challenge count + pub fn count(&self) -> usize { + self.challenges.read().len() + } +} + +impl Default for ChallengeRegistry { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_register_challenge() { + let registry = ChallengeRegistry::new(); + let entry = ChallengeEntry::new( + "test-challenge".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + + let id = registry.register(entry).unwrap(); + assert!(registry.get(&id).is_some()); + assert_eq!(registry.count(), 1); + } + + #[test] + fn test_duplicate_registration() { + let registry = ChallengeRegistry::new(); + let entry1 = ChallengeEntry::new( + "test-challenge".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + let entry2 = ChallengeEntry::new( + "test-challenge".to_string(), + ChallengeVersion::new(2, 0, 0), + "test:v2".to_string(), + ); + + registry.register(entry1).unwrap(); + let result = registry.register(entry2); + assert!(matches!(result, Err(RegistryError::AlreadyRegistered(_)))); + } + + #[test] + fn test_get_by_name() { + let registry = ChallengeRegistry::new(); + let entry = ChallengeEntry::new( + "my-challenge".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + + registry.register(entry).unwrap(); + let found = registry.get_by_name("my-challenge"); + assert!(found.is_some()); + assert_eq!(found.unwrap().entry.name, "my-challenge"); + } + + #[test] + fn test_unregister() { + let registry = ChallengeRegistry::new(); + let entry = ChallengeEntry::new( + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + + let id = registry.register(entry).unwrap(); + assert_eq!(registry.count(), 1); + + registry.unregister(&id).unwrap(); + assert_eq!(registry.count(), 0); + } + + #[test] + fn test_update_state() { + let registry = ChallengeRegistry::new(); + let entry = ChallengeEntry::new( + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + + let id = registry.register(entry).unwrap(); + registry.update_state(&id, LifecycleState::Running).unwrap(); + + let challenge = registry.get(&id).unwrap(); + assert_eq!(challenge.entry.lifecycle_state, LifecycleState::Running); + } + + #[test] + fn test_update_version() { + let registry = ChallengeRegistry::new(); + let entry = ChallengeEntry::new( + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + + let id = registry.register(entry).unwrap(); + let old = registry.update_version(&id, ChallengeVersion::new(1, 1, 0)).unwrap(); + + assert_eq!(old, ChallengeVersion::new(1, 0, 0)); + + let challenge = registry.get(&id).unwrap(); + assert_eq!(challenge.entry.version, ChallengeVersion::new(1, 1, 0)); + } + + #[test] + fn test_list_active() { + let registry = ChallengeRegistry::new(); + + // Register two challenges + let entry1 = ChallengeEntry::new( + "active".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + let entry2 = ChallengeEntry::new( + 
"inactive".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + + let id1 = registry.register(entry1).unwrap(); + registry.register(entry2).unwrap(); + + // Make first one active + registry.update_state(&id1, LifecycleState::Running).unwrap(); + registry.update_health(&id1, HealthStatus::Healthy).unwrap(); + + let active = registry.list_active(); + assert_eq!(active.len(), 1); + assert_eq!(active[0].name, "active"); + } + + #[test] + fn test_entry_builders() { + let entry = ChallengeEntry::new( + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ) + .with_endpoint("http://localhost:8080".to_string()) + .with_metadata(serde_json::json!({"key": "value"})); + + assert_eq!(entry.endpoint, Some("http://localhost:8080".to_string())); + assert_eq!(entry.metadata["key"], "value"); + } + + #[test] + fn test_state_store_access() { + let registry = ChallengeRegistry::new(); + let entry = ChallengeEntry::new( + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + + let id = registry.register(entry).unwrap(); + let store = registry.state_store(&id); + assert!(store.is_some()); + } +} diff --git a/crates/challenge-registry/src/state.rs b/crates/challenge-registry/src/state.rs new file mode 100644 index 0000000..14e2e1c --- /dev/null +++ b/crates/challenge-registry/src/state.rs @@ -0,0 +1,316 @@ +//! State management for challenge hot-reload +//! +//! Provides state persistence and restoration to support +//! hot-reloading challenges without losing evaluation state. + +use crate::error::{RegistryError, RegistryResult}; +use parking_lot::RwLock; +use platform_core::ChallengeId; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +/// Snapshot of challenge state at a point in time +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct StateSnapshot { + /// Challenge ID this snapshot belongs to + pub challenge_id: ChallengeId, + /// Version when snapshot was taken + pub version: String, + /// Timestamp when snapshot was created (millis) + pub created_at: i64, + /// Serialized state data + pub data: Vec, + /// Checksum for integrity verification + pub checksum: String, +} + +impl StateSnapshot { + /// Create a new state snapshot + pub fn new(challenge_id: ChallengeId, version: String, data: Vec) -> Self { + use sha2::{Digest, Sha256}; + + let mut hasher = Sha256::new(); + hasher.update(&data); + let checksum = hex::encode(hasher.finalize()); + + Self { + challenge_id, + version, + created_at: chrono::Utc::now().timestamp_millis(), + data, + checksum, + } + } + + /// Verify snapshot integrity + pub fn verify(&self) -> bool { + use sha2::{Digest, Sha256}; + + let mut hasher = Sha256::new(); + hasher.update(&self.data); + let computed = hex::encode(hasher.finalize()); + + computed == self.checksum + } + + /// Get the size of the snapshot data + pub fn size(&self) -> usize { + self.data.len() + } +} + +/// State of a challenge that can be preserved across hot-reloads +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ChallengeState { + /// Challenge ID + pub challenge_id: ChallengeId, + /// Active evaluations being tracked + pub active_evaluations: HashMap, + /// Completed evaluation count + pub completed_count: u64, + /// Last activity timestamp + pub last_activity_at: i64, + /// Custom state data from the challenge + pub custom_data: serde_json::Value, +} + +/// State of an in-progress evaluation +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct EvaluationState { 
+    /// Evaluation job ID
+    pub job_id: String,
+    /// When evaluation started (millis)
+    pub started_at: i64,
+    /// Current progress (0.0 - 1.0)
+    pub progress: f64,
+    /// Checkpoint data for resumption
+    pub checkpoint: Option<Vec<u8>>,
+}
+
+impl ChallengeState {
+    /// Create new empty state for a challenge
+    pub fn new(challenge_id: ChallengeId) -> Self {
+        Self {
+            challenge_id,
+            active_evaluations: HashMap::new(),
+            completed_count: 0,
+            last_activity_at: chrono::Utc::now().timestamp_millis(),
+            custom_data: serde_json::Value::Null,
+        }
+    }
+
+    /// Check if there are active evaluations
+    pub fn has_active_evaluations(&self) -> bool {
+        !self.active_evaluations.is_empty()
+    }
+
+    /// Get count of active evaluations
+    pub fn active_evaluation_count(&self) -> usize {
+        self.active_evaluations.len()
+    }
+}
+
+/// Store for challenge state with persistence support
+#[derive(Debug)]
+pub struct StateStore {
+    /// Challenge this store belongs to
+    challenge_id: ChallengeId,
+    /// In-memory state
+    state: RwLock<ChallengeState>,
+    /// Snapshots for recovery
+    snapshots: RwLock<Vec<StateSnapshot>>,
+    /// Maximum snapshots to retain
+    max_snapshots: usize,
+}
+
+impl StateStore {
+    /// Create a new state store for a challenge
+    pub fn new(challenge_id: ChallengeId) -> Self {
+        Self {
+            challenge_id,
+            state: RwLock::new(ChallengeState::new(challenge_id)),
+            snapshots: RwLock::new(Vec::new()),
+            max_snapshots: 5,
+        }
+    }
+
+    /// Create a state store with custom snapshot limit
+    pub fn with_max_snapshots(challenge_id: ChallengeId, max_snapshots: usize) -> Self {
+        Self {
+            challenge_id,
+            state: RwLock::new(ChallengeState::new(challenge_id)),
+            snapshots: RwLock::new(Vec::new()),
+            max_snapshots,
+        }
+    }
+
+    /// Get current state (read-only)
+    pub fn get_state(&self) -> ChallengeState {
+        self.state.read().clone()
+    }
+
+    /// Update state with a function
+    pub fn update_state<F>(&self, f: F)
+    where
+        F: FnOnce(&mut ChallengeState),
+    {
+        let mut state = self.state.write();
+        f(&mut state);
+        state.last_activity_at = chrono::Utc::now().timestamp_millis();
+    }
+
+    /// Track a new evaluation
+    pub fn track_evaluation(&self, job_id: String) {
+        let mut state = self.state.write();
+        state.active_evaluations.insert(
+            job_id.clone(),
+            EvaluationState {
+                job_id,
+                started_at: chrono::Utc::now().timestamp_millis(),
+                progress: 0.0,
+                checkpoint: None,
+            },
+        );
+        state.last_activity_at = chrono::Utc::now().timestamp_millis();
+    }
+
+    /// Update evaluation progress
+    pub fn update_evaluation_progress(&self, job_id: &str, progress: f64) {
+        let mut state = self.state.write();
+        if let Some(eval) = state.active_evaluations.get_mut(job_id) {
+            eval.progress = progress.clamp(0.0, 1.0);
+        }
+        state.last_activity_at = chrono::Utc::now().timestamp_millis();
+    }
+
+    /// Complete an evaluation
+    pub fn complete_evaluation(&self, job_id: &str) {
+        let mut state = self.state.write();
+        state.active_evaluations.remove(job_id);
+        state.completed_count += 1;
+        state.last_activity_at = chrono::Utc::now().timestamp_millis();
+    }
+
+    /// Create a snapshot of current state
+    pub fn create_snapshot(&self, version: String) -> RegistryResult<StateSnapshot> {
+        let state = self.state.read();
+        // Use JSON for serialization since ChallengeState contains serde_json::Value
+        let data = serde_json::to_vec(&*state)
+            .map_err(|e| RegistryError::StatePersistence(e.to_string()))?;
+
+        let snapshot = StateSnapshot::new(self.challenge_id, version, data);
+
+        let mut snapshots = self.snapshots.write();
+        snapshots.push(snapshot.clone());
+
+        // Trim old snapshots
+        while snapshots.len() >
self.max_snapshots { + snapshots.remove(0); + } + + Ok(snapshot) + } + + /// Restore state from a snapshot + pub fn restore_snapshot(&self, snapshot: &StateSnapshot) -> RegistryResult<()> { + if !snapshot.verify() { + return Err(RegistryError::StateRestoration( + "Snapshot checksum mismatch".to_string(), + )); + } + + // Use JSON for deserialization since ChallengeState contains serde_json::Value + let restored: ChallengeState = serde_json::from_slice(&snapshot.data) + .map_err(|e| RegistryError::StateRestoration(e.to_string()))?; + + let mut state = self.state.write(); + *state = restored; + + Ok(()) + } + + /// Get list of available snapshots + pub fn list_snapshots(&self) -> Vec { + self.snapshots.read().clone() + } + + /// Get the latest snapshot + pub fn latest_snapshot(&self) -> Option { + self.snapshots.read().last().cloned() + } + + /// Clear all state + pub fn clear(&self) { + let mut state = self.state.write(); + *state = ChallengeState::new(self.challenge_id); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_state_store() { + let id = ChallengeId::new(); + let store = StateStore::new(id); + + store.track_evaluation("job1".to_string()); + let state = store.get_state(); + assert_eq!(state.active_evaluation_count(), 1); + + store.update_evaluation_progress("job1", 0.5); + let state = store.get_state(); + let eval = state.active_evaluations.get("job1").unwrap(); + assert_eq!(eval.progress, 0.5); + + store.complete_evaluation("job1"); + let state = store.get_state(); + assert_eq!(state.active_evaluation_count(), 0); + assert_eq!(state.completed_count, 1); + } + + #[test] + fn test_snapshot_creation() { + let id = ChallengeId::new(); + let store = StateStore::new(id); + + store.track_evaluation("job1".to_string()); + let snapshot = store.create_snapshot("1.0.0".to_string()).unwrap(); + + assert!(snapshot.verify()); + assert_eq!(snapshot.version, "1.0.0"); + } + + #[test] + fn test_snapshot_restoration() { + let id = ChallengeId::new(); + let store = StateStore::new(id); + + store.track_evaluation("job1".to_string()); + store.track_evaluation("job2".to_string()); + let snapshot = store.create_snapshot("1.0.0".to_string()).unwrap(); + + // Clear and verify empty + store.clear(); + assert_eq!(store.get_state().active_evaluation_count(), 0); + + // Restore and verify + store.restore_snapshot(&snapshot).unwrap(); + assert_eq!(store.get_state().active_evaluation_count(), 2); + } + + #[test] + fn test_snapshot_limit() { + let id = ChallengeId::new(); + let store = StateStore::with_max_snapshots(id, 3); + + for i in 0..5 { + store.create_snapshot(format!("{}.0.0", i)).unwrap(); + } + + let snapshots = store.list_snapshots(); + assert_eq!(snapshots.len(), 3); + assert_eq!(snapshots[0].version, "2.0.0"); + } +} diff --git a/crates/challenge-registry/src/version.rs b/crates/challenge-registry/src/version.rs new file mode 100644 index 0000000..d325c56 --- /dev/null +++ b/crates/challenge-registry/src/version.rs @@ -0,0 +1,164 @@ +//! 
Challenge versioning support
+
+use serde::{Deserialize, Serialize};
+use std::cmp::Ordering;
+use std::fmt;
+
+/// Semantic version for challenges
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)]
+pub struct ChallengeVersion {
+    pub major: u32,
+    pub minor: u32,
+    pub patch: u32,
+    pub prerelease: Option<String>,
+}
+
+impl ChallengeVersion {
+    pub fn new(major: u32, minor: u32, patch: u32) -> Self {
+        Self {
+            major,
+            minor,
+            patch,
+            prerelease: None,
+        }
+    }
+
+    pub fn parse(s: &str) -> Option<Self> {
+        let s = s.strip_prefix('v').unwrap_or(s);
+        let parts: Vec<&str> = s.split('-').collect();
+        let version_parts: Vec<&str> = parts[0].split('.').collect();
+
+        if version_parts.len() < 3 {
+            return None;
+        }
+
+        Some(Self {
+            major: version_parts[0].parse().ok()?,
+            minor: version_parts[1].parse().ok()?,
+            patch: version_parts[2].parse().ok()?,
+            prerelease: parts.get(1).map(|s| s.to_string()),
+        })
+    }
+
+    /// Check if this version is compatible with another (same major version)
+    pub fn is_compatible_with(&self, other: &Self) -> bool {
+        self.major == other.major
+    }
+
+    /// Check if this version is newer than another
+    pub fn is_newer_than(&self, other: &Self) -> bool {
+        self > other
+    }
+}
+
+impl fmt::Display for ChallengeVersion {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match &self.prerelease {
+            Some(pre) => write!(f, "{}.{}.{}-{}", self.major, self.minor, self.patch, pre),
+            None => write!(f, "{}.{}.{}", self.major, self.minor, self.patch),
+        }
+    }
+}
+
+impl PartialOrd for ChallengeVersion {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for ChallengeVersion {
+    fn cmp(&self, other: &Self) -> Ordering {
+        match self.major.cmp(&other.major) {
+            Ordering::Equal => match self.minor.cmp(&other.minor) {
+                Ordering::Equal => self.patch.cmp(&other.patch),
+                ord => ord,
+            },
+            ord => ord,
+        }
+    }
+}
+
+impl Default for ChallengeVersion {
+    fn default() -> Self {
+        Self::new(0, 1, 0)
+    }
+}
+
+/// Version constraint for challenge compatibility
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub enum VersionConstraint {
+    /// Exact version match
+    Exact(ChallengeVersion),
+    /// Minimum version (>=)
+    AtLeast(ChallengeVersion),
+    /// Version range [min, max)
+    Range {
+        min: ChallengeVersion,
+        max: ChallengeVersion,
+    },
+    /// Compatible with major version (^)
+    Compatible(ChallengeVersion),
+    /// Any version
+    Any,
+}
+
+impl VersionConstraint {
+    pub fn satisfies(&self, version: &ChallengeVersion) -> bool {
+        match self {
+            Self::Exact(v) => version == v,
+            Self::AtLeast(v) => version >= v,
+            Self::Range { min, max } => version >= min && version < max,
+            Self::Compatible(v) => version.major == v.major && version >= v,
+            Self::Any => true,
+        }
+    }
+}
+
+/// A challenge with version information
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct VersionedChallenge {
+    pub challenge_id: String,
+    pub version: ChallengeVersion,
+    pub min_platform_version: Option<ChallengeVersion>,
+    pub deprecated: bool,
+    pub deprecation_message: Option<String>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_version_parsing() {
+        let v = ChallengeVersion::parse("1.2.3").unwrap();
+        assert_eq!(v.major, 1);
+        assert_eq!(v.minor, 2);
+        assert_eq!(v.patch, 3);
+
+        let v2 = ChallengeVersion::parse("v2.0.0-beta").unwrap();
+        assert_eq!(v2.major, 2);
+        assert_eq!(v2.prerelease, Some("beta".to_string()));
+    }
+
+    #[test]
+    fn test_version_comparison() {
+        let v1 = ChallengeVersion::new(1, 0, 0);
+        let v2 =
ChallengeVersion::new(1, 1, 0); + let v3 = ChallengeVersion::new(2, 0, 0); + + assert!(v2.is_newer_than(&v1)); + assert!(v3.is_newer_than(&v2)); + assert!(v1.is_compatible_with(&v2)); + assert!(!v1.is_compatible_with(&v3)); + } + + #[test] + fn test_version_constraints() { + let v = ChallengeVersion::new(1, 5, 0); + + assert!(VersionConstraint::Any.satisfies(&v)); + assert!(VersionConstraint::AtLeast(ChallengeVersion::new(1, 0, 0)).satisfies(&v)); + assert!(!VersionConstraint::Exact(ChallengeVersion::new(1, 0, 0)).satisfies(&v)); + assert!(VersionConstraint::Compatible(ChallengeVersion::new(1, 0, 0)).satisfies(&v)); + } +} From 56ea011a4be1d998e8ef42cb3d88a5d61c5f3369 Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 11:09:01 +0000 Subject: [PATCH 3/8] feat(core): add checkpoint system for state persistence --- crates/core/Cargo.toml | 3 + crates/core/src/checkpoint.rs | 741 ++++++++++++++++++++++++++++++++++ crates/core/src/lib.rs | 5 + 3 files changed, 749 insertions(+) create mode 100644 crates/core/src/checkpoint.rs diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index f6cdae5..97e9c3f 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -20,3 +20,6 @@ bs58 = "0.5" # Sr25519 crypto (Substrate standard) sp-core = { version = "31.0", default-features = false, features = ["std"] } schnorrkel = "0.11" + +[dev-dependencies] +tempfile = { workspace = true } diff --git a/crates/core/src/checkpoint.rs b/crates/core/src/checkpoint.rs new file mode 100644 index 0000000..b627e4a --- /dev/null +++ b/crates/core/src/checkpoint.rs @@ -0,0 +1,741 @@ +//! Checkpoint system for state persistence +//! +//! Provides mechanisms to save and restore evaluation state, enabling: +//! - Hot-reload without losing progress +//! - Crash recovery +//! 
- Rolling updates
+
+use crate::{ChallengeId, Hotkey, MiniChainError, Result};
+use serde::{Deserialize, Serialize};
+use sha2::{Digest, Sha256};
+use std::collections::HashMap;
+use std::fs::{self, File};
+use std::io::{BufReader, BufWriter, Read, Write};
+use std::path::{Path, PathBuf};
+use std::time::SystemTime;
+use tracing::{debug, info, warn};
+
+/// Checkpoint version for format compatibility
+pub const CHECKPOINT_VERSION: u32 = 1;
+
+/// Magic bytes for checkpoint file identification
+const CHECKPOINT_MAGIC: &[u8; 8] = b"PLATCHKP";
+
+/// Checkpoint file header
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct CheckpointHeader {
+    /// Magic bytes (verified on load)
+    pub magic: [u8; 8],
+    /// Checkpoint format version
+    pub version: u32,
+    /// Creation timestamp (Unix millis)
+    pub created_at: i64,
+    /// Checkpoint sequence number
+    pub sequence: u64,
+    /// SHA-256 hash of the data section
+    pub data_hash: [u8; 32],
+    /// Size of the data section in bytes
+    pub data_size: u64,
+}
+
+impl CheckpointHeader {
+    pub fn new(sequence: u64, data_hash: [u8; 32], data_size: u64) -> Self {
+        Self {
+            magic: *CHECKPOINT_MAGIC,
+            version: CHECKPOINT_VERSION,
+            created_at: chrono::Utc::now().timestamp_millis(),
+            sequence,
+            data_hash,
+            data_size,
+        }
+    }
+
+    pub fn verify_magic(&self) -> bool {
+        self.magic == *CHECKPOINT_MAGIC
+    }
+}
+
+/// State of a pending evaluation
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct PendingEvaluationState {
+    /// Submission ID
+    pub submission_id: String,
+    /// Challenge ID
+    pub challenge_id: ChallengeId,
+    /// Miner hotkey
+    pub miner: Hotkey,
+    /// Submission hash
+    pub submission_hash: String,
+    /// Evaluation scores received (validator -> score)
+    pub scores: HashMap<Hotkey, f64>,
+    /// Creation timestamp
+    pub created_at: i64,
+    /// Whether finalization is in progress
+    pub finalizing: bool,
+}
+
+/// Completed evaluation record
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct CompletedEvaluationState {
+    /// Submission ID
+    pub submission_id: String,
+    /// Challenge ID
+    pub challenge_id: ChallengeId,
+    /// Final aggregated score
+    pub final_score: f64,
+    /// Epoch when completed
+    pub epoch: u64,
+    /// Completion timestamp
+    pub completed_at: i64,
+}
+
+/// Weight vote state
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct WeightVoteState {
+    /// Epoch for these weights
+    pub epoch: u64,
+    /// Netuid
+    pub netuid: u16,
+    /// Votes by validator
+    pub votes: HashMap<Hotkey, Vec<(u16, u16)>>,
+    /// Whether finalized
+    pub finalized: bool,
+    /// Final weights if finalized
+    pub final_weights: Option<Vec<(u16, u16)>>,
+}
+
+/// Full checkpoint data
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct CheckpointData {
+    /// Current sequence number
+    pub sequence: u64,
+    /// Current epoch
+    pub epoch: u64,
+    /// Netuid
+    pub netuid: u16,
+    /// Pending evaluations
+    pub pending_evaluations: Vec<PendingEvaluationState>,
+    /// Recent completed evaluations (last N epochs)
+    pub completed_evaluations: Vec<CompletedEvaluationState>,
+    /// Current weight votes
+    pub weight_votes: Option<WeightVoteState>,
+    /// Bittensor block number at checkpoint
+    pub bittensor_block: u64,
+    /// Additional metadata
+    pub metadata: HashMap<String, String>,
+}
+
+impl CheckpointData {
+    pub fn new(sequence: u64, epoch: u64, netuid: u16) -> Self {
+        Self {
+            sequence,
+            epoch,
+            netuid,
+            pending_evaluations: Vec::new(),
+            completed_evaluations: Vec::new(),
+            weight_votes: None,
+            bittensor_block: 0,
+            metadata: HashMap::new(),
+        }
+    }
+
+    /// Add pending evaluation
+    pub fn add_pending(&mut self, state: PendingEvaluationState) {
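+        // Record the evaluation in this checkpoint's pending list so it can be resumed after a restore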
self.pending_evaluations.push(state); + } + + /// Add completed evaluation + pub fn add_completed(&mut self, state: CompletedEvaluationState) { + self.completed_evaluations.push(state); + } + + /// Calculate hash of checkpoint data + pub fn calculate_hash(&self) -> Result<[u8; 32]> { + let bytes = + bincode::serialize(self).map_err(|e| MiniChainError::Serialization(e.to_string()))?; + let mut hasher = Sha256::new(); + hasher.update(&bytes); + Ok(hasher.finalize().into()) + } +} + +/// Checkpoint manager for persisting and restoring state +pub struct CheckpointManager { + /// Directory for checkpoint files + checkpoint_dir: PathBuf, + /// Maximum number of checkpoints to keep + max_checkpoints: usize, + /// Current checkpoint sequence + current_sequence: u64, +} + +impl CheckpointManager { + /// Create a new checkpoint manager + pub fn new>(checkpoint_dir: P, max_checkpoints: usize) -> Result { + let checkpoint_dir = checkpoint_dir.as_ref().to_path_buf(); + + // Create checkpoint directory if it doesn't exist + fs::create_dir_all(&checkpoint_dir).map_err(|e| { + MiniChainError::Storage(format!("Failed to create checkpoint dir: {}", e)) + })?; + + // Find the latest checkpoint sequence + let current_sequence = Self::find_latest_sequence(&checkpoint_dir)?; + + info!( + dir = %checkpoint_dir.display(), + max_checkpoints, + current_sequence, + "Checkpoint manager initialized" + ); + + Ok(Self { + checkpoint_dir, + max_checkpoints, + current_sequence, + }) + } + + /// Find the latest checkpoint sequence number + fn find_latest_sequence(dir: &Path) -> Result { + let mut max_seq = 0u64; + + if let Ok(entries) = fs::read_dir(dir) { + for entry in entries.flatten() { + let path = entry.path(); + if let Some(name) = path.file_name().and_then(|n| n.to_str()) { + if name.starts_with("checkpoint_") && name.ends_with(".bin") { + if let Some(seq_str) = name + .strip_prefix("checkpoint_") + .and_then(|s| s.strip_suffix(".bin")) + { + if let Ok(seq) = seq_str.parse::() { + max_seq = max_seq.max(seq); + } + } + } + } + } + } + + Ok(max_seq) + } + + /// Generate checkpoint filename + fn checkpoint_filename(&self, sequence: u64) -> PathBuf { + self.checkpoint_dir + .join(format!("checkpoint_{:016}.bin", sequence)) + } + + /// Create a new checkpoint + pub fn create_checkpoint(&mut self, data: &CheckpointData) -> Result { + self.current_sequence += 1; + let sequence = self.current_sequence; + let filename = self.checkpoint_filename(sequence); + + // Serialize data + let data_bytes = + bincode::serialize(data).map_err(|e| MiniChainError::Serialization(e.to_string()))?; + + // Calculate hash + let mut hasher = Sha256::new(); + hasher.update(&data_bytes); + let data_hash: [u8; 32] = hasher.finalize().into(); + + // Create header + let header = CheckpointHeader::new(sequence, data_hash, data_bytes.len() as u64); + let header_bytes = + bincode::serialize(&header).map_err(|e| MiniChainError::Serialization(e.to_string()))?; + + // Write to file atomically (write to temp, then rename) + let temp_filename = filename.with_extension("tmp"); + { + let file = File::create(&temp_filename).map_err(|e| { + MiniChainError::Storage(format!("Failed to create checkpoint: {}", e)) + })?; + let mut writer = BufWriter::new(file); + + // Write header length (4 bytes) + let header_len = header_bytes.len() as u32; + writer + .write_all(&header_len.to_le_bytes()) + .map_err(|e| MiniChainError::Storage(e.to_string()))?; + + // Write header + writer + .write_all(&header_bytes) + .map_err(|e| MiniChainError::Storage(e.to_string()))?; + + // 
Write data + writer + .write_all(&data_bytes) + .map_err(|e| MiniChainError::Storage(e.to_string()))?; + + writer + .flush() + .map_err(|e| MiniChainError::Storage(e.to_string()))?; + } + + // Atomic rename + fs::rename(&temp_filename, &filename).map_err(|e| { + MiniChainError::Storage(format!("Failed to finalize checkpoint: {}", e)) + })?; + + info!( + sequence, + path = %filename.display(), + size = data_bytes.len(), + "Checkpoint created" + ); + + // Cleanup old checkpoints + self.cleanup_old_checkpoints()?; + + Ok(filename) + } + + /// Load the latest checkpoint + pub fn load_latest(&self) -> Result> { + if self.current_sequence == 0 { + return Ok(None); + } + + self.load_checkpoint(self.current_sequence) + } + + /// Load a specific checkpoint + pub fn load_checkpoint( + &self, + sequence: u64, + ) -> Result> { + let filename = self.checkpoint_filename(sequence); + + if !filename.exists() { + return Ok(None); + } + + let file = File::open(&filename) + .map_err(|e| MiniChainError::Storage(format!("Failed to open checkpoint: {}", e)))?; + let mut reader = BufReader::new(file); + + // Read header length + let mut header_len_bytes = [0u8; 4]; + reader.read_exact(&mut header_len_bytes).map_err(|e| { + MiniChainError::Storage(format!("Failed to read header length: {}", e)) + })?; + let header_len = u32::from_le_bytes(header_len_bytes) as usize; + + // Read header + let mut header_bytes = vec![0u8; header_len]; + reader + .read_exact(&mut header_bytes) + .map_err(|e| MiniChainError::Storage(format!("Failed to read header: {}", e)))?; + + let header: CheckpointHeader = bincode::deserialize(&header_bytes).map_err(|e| { + MiniChainError::Serialization(format!("Failed to deserialize header: {}", e)) + })?; + + // Verify magic + if !header.verify_magic() { + return Err(MiniChainError::Storage( + "Invalid checkpoint magic bytes".into(), + )); + } + + // Verify version compatibility + if header.version > CHECKPOINT_VERSION { + return Err(MiniChainError::Storage(format!( + "Checkpoint version {} is newer than supported version {}", + header.version, CHECKPOINT_VERSION + ))); + } + + // Read data + let mut data_bytes = vec![0u8; header.data_size as usize]; + reader + .read_exact(&mut data_bytes) + .map_err(|e| MiniChainError::Storage(format!("Failed to read data: {}", e)))?; + + // Verify hash + let mut hasher = Sha256::new(); + hasher.update(&data_bytes); + let actual_hash: [u8; 32] = hasher.finalize().into(); + + if actual_hash != header.data_hash { + return Err(MiniChainError::Storage( + "Checkpoint data hash mismatch".into(), + )); + } + + // Deserialize data + let data: CheckpointData = bincode::deserialize(&data_bytes).map_err(|e| { + MiniChainError::Serialization(format!("Failed to deserialize data: {}", e)) + })?; + + info!( + sequence, + epoch = data.epoch, + pending_count = data.pending_evaluations.len(), + "Checkpoint loaded" + ); + + Ok(Some((header, data))) + } + + /// List all available checkpoints + pub fn list_checkpoints(&self) -> Result> { + let mut checkpoints = Vec::new(); + + if let Ok(entries) = fs::read_dir(&self.checkpoint_dir) { + for entry in entries.flatten() { + let path = entry.path(); + if let Some(name) = path.file_name().and_then(|n| n.to_str()) { + if name.starts_with("checkpoint_") && name.ends_with(".bin") { + if let Some(seq_str) = name + .strip_prefix("checkpoint_") + .and_then(|s| s.strip_suffix(".bin")) + { + if let Ok(seq) = seq_str.parse::() { + if let Ok(meta) = entry.metadata() { + if let Ok(modified) = meta.modified() { + checkpoints.push((seq, path, 
modified)); + } + } + } + } + } + } + } + } + + checkpoints.sort_by_key(|(seq, _, _)| *seq); + Ok(checkpoints) + } + + /// Clean up old checkpoints + fn cleanup_old_checkpoints(&self) -> Result<()> { + let checkpoints = self.list_checkpoints()?; + + if checkpoints.len() <= self.max_checkpoints { + return Ok(()); + } + + let to_remove = checkpoints.len() - self.max_checkpoints; + for (seq, path, _) in checkpoints.into_iter().take(to_remove) { + debug!(sequence = seq, path = %path.display(), "Removing old checkpoint"); + if let Err(e) = fs::remove_file(&path) { + warn!(path = %path.display(), error = %e, "Failed to remove old checkpoint"); + } + } + + Ok(()) + } + + /// Get checkpoint directory + pub fn checkpoint_dir(&self) -> &Path { + &self.checkpoint_dir + } + + /// Get current sequence + pub fn current_sequence(&self) -> u64 { + self.current_sequence + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + #[test] + fn test_checkpoint_header() { + let header = CheckpointHeader::new(1, [0u8; 32], 100); + assert!(header.verify_magic()); + assert_eq!(header.version, CHECKPOINT_VERSION); + } + + #[test] + fn test_checkpoint_header_invalid_magic() { + let mut header = CheckpointHeader::new(1, [0u8; 32], 100); + header.magic = *b"INVALID!"; + assert!(!header.verify_magic()); + } + + #[test] + fn test_checkpoint_data_hash() { + let data = CheckpointData::new(1, 0, 100); + let hash1 = data.calculate_hash().unwrap(); + + let mut data2 = data.clone(); + data2.sequence = 2; + let hash2 = data2.calculate_hash().unwrap(); + + assert_ne!(hash1, hash2); + } + + #[test] + fn test_checkpoint_data_new() { + let data = CheckpointData::new(5, 10, 200); + assert_eq!(data.sequence, 5); + assert_eq!(data.epoch, 10); + assert_eq!(data.netuid, 200); + assert!(data.pending_evaluations.is_empty()); + assert!(data.completed_evaluations.is_empty()); + assert!(data.weight_votes.is_none()); + assert_eq!(data.bittensor_block, 0); + assert!(data.metadata.is_empty()); + } + + #[test] + fn test_checkpoint_data_add_pending() { + let mut data = CheckpointData::new(1, 0, 100); + let pending = PendingEvaluationState { + submission_id: "sub1".to_string(), + challenge_id: ChallengeId::new(), + miner: Hotkey([1u8; 32]), + submission_hash: "abc123".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }; + data.add_pending(pending); + assert_eq!(data.pending_evaluations.len(), 1); + } + + #[test] + fn test_checkpoint_data_add_completed() { + let mut data = CheckpointData::new(1, 0, 100); + let completed = CompletedEvaluationState { + submission_id: "sub1".to_string(), + challenge_id: ChallengeId::new(), + final_score: 0.85, + epoch: 5, + completed_at: chrono::Utc::now().timestamp_millis(), + }; + data.add_completed(completed); + assert_eq!(data.completed_evaluations.len(), 1); + } + + #[test] + fn test_checkpoint_manager_roundtrip() { + let dir = tempdir().unwrap(); + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + + let mut data = CheckpointData::new(1, 0, 100); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "sub1".to_string(), + challenge_id: ChallengeId::new(), + miner: Hotkey([1u8; 32]), + submission_hash: "abc123".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + + let path = manager.create_checkpoint(&data).unwrap(); + assert!(path.exists()); + + let (header, loaded) = manager.load_latest().unwrap().unwrap(); + 
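+        // Header metadata and payload should round-trip through the on-disk format unchanged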
assert_eq!(header.sequence, 1); + assert_eq!(loaded.sequence, data.sequence); + assert_eq!(loaded.pending_evaluations.len(), 1); + } + + #[test] + fn test_checkpoint_manager_no_checkpoints() { + let dir = tempdir().unwrap(); + let manager = CheckpointManager::new(dir.path(), 5).unwrap(); + assert!(manager.load_latest().unwrap().is_none()); + assert_eq!(manager.current_sequence(), 0); + } + + #[test] + fn test_checkpoint_cleanup() { + let dir = tempdir().unwrap(); + let mut manager = CheckpointManager::new(dir.path(), 3).unwrap(); + + for i in 0..5 { + let data = CheckpointData::new(i, 0, 100); + manager.create_checkpoint(&data).unwrap(); + } + + let checkpoints = manager.list_checkpoints().unwrap(); + assert_eq!(checkpoints.len(), 3); + } + + #[test] + fn test_checkpoint_list() { + let dir = tempdir().unwrap(); + let mut manager = CheckpointManager::new(dir.path(), 10).unwrap(); + + for i in 0..3 { + let data = CheckpointData::new(i, i, 100); + manager.create_checkpoint(&data).unwrap(); + } + + let checkpoints = manager.list_checkpoints().unwrap(); + assert_eq!(checkpoints.len(), 3); + + // Verify sorted by sequence + assert_eq!(checkpoints[0].0, 1); + assert_eq!(checkpoints[1].0, 2); + assert_eq!(checkpoints[2].0, 3); + } + + #[test] + fn test_checkpoint_load_specific() { + let dir = tempdir().unwrap(); + let mut manager = CheckpointManager::new(dir.path(), 10).unwrap(); + + for i in 0..3 { + let mut data = CheckpointData::new(i, i * 10, 100); + data.metadata + .insert("test_key".to_string(), format!("value_{}", i)); + manager.create_checkpoint(&data).unwrap(); + } + + // Load specific checkpoint + let (header, data) = manager.load_checkpoint(2).unwrap().unwrap(); + assert_eq!(header.sequence, 2); + assert_eq!(data.epoch, 10); + assert_eq!( + data.metadata.get("test_key"), + Some(&"value_1".to_string()) + ); + } + + #[test] + fn test_checkpoint_load_nonexistent() { + let dir = tempdir().unwrap(); + let manager = CheckpointManager::new(dir.path(), 5).unwrap(); + assert!(manager.load_checkpoint(999).unwrap().is_none()); + } + + #[test] + fn test_checkpoint_resume_sequence() { + let dir = tempdir().unwrap(); + + // First manager creates some checkpoints + { + let mut manager = CheckpointManager::new(dir.path(), 10).unwrap(); + for i in 0..3 { + let data = CheckpointData::new(i, i, 100); + manager.create_checkpoint(&data).unwrap(); + } + assert_eq!(manager.current_sequence(), 3); + } + + // New manager should resume from the latest sequence + { + let manager = CheckpointManager::new(dir.path(), 10).unwrap(); + assert_eq!(manager.current_sequence(), 3); + } + } + + #[test] + fn test_checkpoint_with_scores() { + let dir = tempdir().unwrap(); + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + + let mut scores = HashMap::new(); + scores.insert(Hotkey([1u8; 32]), 0.95); + scores.insert(Hotkey([2u8; 32]), 0.87); + + let mut data = CheckpointData::new(1, 5, 100); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "sub_with_scores".to_string(), + challenge_id: ChallengeId::new(), + miner: Hotkey([3u8; 32]), + submission_hash: "hash123".to_string(), + scores, + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: true, + }); + + manager.create_checkpoint(&data).unwrap(); + + let (_, loaded) = manager.load_latest().unwrap().unwrap(); + let pending = &loaded.pending_evaluations[0]; + assert_eq!(pending.scores.len(), 2); + assert_eq!(pending.scores.get(&Hotkey([1u8; 32])), Some(&0.95)); + assert!(pending.finalizing); + } + + #[test] + fn 
test_checkpoint_with_weight_votes() { + let dir = tempdir().unwrap(); + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + + let mut votes = HashMap::new(); + votes.insert(Hotkey([1u8; 32]), vec![(0, 100), (1, 200)]); + votes.insert(Hotkey([2u8; 32]), vec![(0, 150), (1, 150)]); + + let mut data = CheckpointData::new(1, 5, 100); + data.weight_votes = Some(WeightVoteState { + epoch: 5, + netuid: 100, + votes, + finalized: true, + final_weights: Some(vec![(0, 125), (1, 175)]), + }); + + manager.create_checkpoint(&data).unwrap(); + + let (_, loaded) = manager.load_latest().unwrap().unwrap(); + let weight_votes = loaded.weight_votes.unwrap(); + assert_eq!(weight_votes.epoch, 5); + assert!(weight_votes.finalized); + assert_eq!(weight_votes.final_weights, Some(vec![(0, 125), (1, 175)])); + } + + #[test] + fn test_checkpoint_dir_accessor() { + let dir = tempdir().unwrap(); + let manager = CheckpointManager::new(dir.path(), 5).unwrap(); + assert_eq!(manager.checkpoint_dir(), dir.path()); + } + + #[test] + fn test_pending_evaluation_state_clone() { + let state = PendingEvaluationState { + submission_id: "test".to_string(), + challenge_id: ChallengeId::new(), + miner: Hotkey([5u8; 32]), + submission_hash: "hash".to_string(), + scores: HashMap::new(), + created_at: 12345, + finalizing: false, + }; + let cloned = state.clone(); + assert_eq!(cloned.submission_id, state.submission_id); + assert_eq!(cloned.miner, state.miner); + } + + #[test] + fn test_completed_evaluation_state_clone() { + let state = CompletedEvaluationState { + submission_id: "test".to_string(), + challenge_id: ChallengeId::new(), + final_score: 0.75, + epoch: 10, + completed_at: 67890, + }; + let cloned = state.clone(); + assert_eq!(cloned.final_score, state.final_score); + assert_eq!(cloned.epoch, state.epoch); + } + + #[test] + fn test_weight_vote_state_clone() { + let state = WeightVoteState { + epoch: 5, + netuid: 100, + votes: HashMap::new(), + finalized: false, + final_weights: None, + }; + let cloned = state.clone(); + assert_eq!(cloned.epoch, state.epoch); + assert_eq!(cloned.finalized, state.finalized); + } +} diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index b8adaa6..87e1b6f 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -4,6 +4,7 @@ //! Core types and structures for the P2P validator network. 
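+//!
+//! A minimal usage sketch of the checkpoint flow added in this patch set
+//! (illustrative only; the directory path and the epoch/netuid values below
+//! are placeholders, not the validator's real call sites):
+//!
+//! ```ignore
+//! use platform_core::{CheckpointData, CheckpointManager};
+//!
+//! let mut manager = CheckpointManager::new("/tmp/platform-checkpoints", 5)?;
+//! let data = CheckpointData::new(manager.current_sequence() + 1, 0, 100);
+//! manager.create_checkpoint(&data)?;
+//! if let Some((_header, restored)) = manager.load_latest()? {
+//!     assert_eq!(restored.netuid, 100);
+//! }
+//! ```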
pub mod challenge; +pub mod checkpoint; pub mod constants; pub mod crypto; pub mod error; @@ -14,6 +15,10 @@ pub mod state_versioning; pub mod types; pub use challenge::*; +pub use checkpoint::{ + CheckpointData, CheckpointHeader, CheckpointManager, CompletedEvaluationState, + PendingEvaluationState, WeightVoteState, CHECKPOINT_VERSION, +}; pub use constants::*; pub use crypto::*; pub use error::*; From 467e7d4871aa81cdf27e3a70ad2d2c33d5a26856 Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 11:11:42 +0000 Subject: [PATCH 4/8] feat: add restoration system for checkpoint recovery --- crates/core/src/error.rs | 3 + crates/core/src/lib.rs | 4 + crates/core/src/restoration.rs | 614 +++++++++++++++++++++++++++++++++ 3 files changed, 621 insertions(+) create mode 100644 crates/core/src/restoration.rs diff --git a/crates/core/src/error.rs b/crates/core/src/error.rs index de1eb4e..0ed4e80 100644 --- a/crates/core/src/error.rs +++ b/crates/core/src/error.rs @@ -49,6 +49,9 @@ pub enum MiniChainError { #[error("Type mismatch: {0}")] TypeMismatch(String), + + #[error("Validation error: {0}")] + Validation(String), } impl From for MiniChainError { diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index 87e1b6f..ef802c6 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -9,6 +9,7 @@ pub mod constants; pub mod crypto; pub mod error; pub mod message; +pub mod restoration; pub mod schema_guard; pub mod state; pub mod state_versioning; @@ -23,6 +24,9 @@ pub use constants::*; pub use crypto::*; pub use error::*; pub use message::*; +pub use restoration::{ + CheckpointInfo, RestorationManager, RestorationOptions, RestorationResult, Restorable, +}; pub use schema_guard::{verify_schema_integrity, SchemaError}; pub use state::*; pub use state_versioning::*; diff --git a/crates/core/src/restoration.rs b/crates/core/src/restoration.rs new file mode 100644 index 0000000..c2a5eda --- /dev/null +++ b/crates/core/src/restoration.rs @@ -0,0 +1,614 @@ +//! State restoration system for crash/update recovery +//! +//! Handles restoring validator state from checkpoints, including: +//! - Automatic restoration on startup +//! - State validation and migration +//! 
- Partial recovery handling
+
+use crate::checkpoint::{CheckpointData, CheckpointManager, PendingEvaluationState};
+use crate::{ChallengeId, MiniChainError, Result};
+use serde::{Deserialize, Serialize};
+use std::collections::HashSet;
+use std::path::Path;
+use std::time::{Duration, Instant};
+use tracing::{debug, info, warn};
+
+/// Result of a restoration operation
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct RestorationResult {
+    /// Whether restoration was successful
+    pub success: bool,
+    /// Sequence number restored from
+    pub checkpoint_sequence: u64,
+    /// Epoch restored to
+    pub epoch: u64,
+    /// Number of pending evaluations restored
+    pub pending_evaluations_count: usize,
+    /// Number of completed evaluations restored
+    pub completed_evaluations_count: usize,
+    /// Whether weight votes were restored
+    pub weight_votes_restored: bool,
+    /// Time taken for restoration
+    pub duration_ms: u64,
+    /// Any warnings during restoration
+    pub warnings: Vec<String>,
+    /// Error message if failed
+    pub error: Option<String>,
+}
+
+impl RestorationResult {
+    pub fn success(
+        checkpoint_sequence: u64,
+        epoch: u64,
+        pending_count: usize,
+        completed_count: usize,
+        weight_votes: bool,
+        duration_ms: u64,
+    ) -> Self {
+        Self {
+            success: true,
+            checkpoint_sequence,
+            epoch,
+            pending_evaluations_count: pending_count,
+            completed_evaluations_count: completed_count,
+            weight_votes_restored: weight_votes,
+            duration_ms,
+            warnings: Vec::new(),
+            error: None,
+        }
+    }
+
+    pub fn failure(error: String) -> Self {
+        Self {
+            success: false,
+            checkpoint_sequence: 0,
+            epoch: 0,
+            pending_evaluations_count: 0,
+            completed_evaluations_count: 0,
+            weight_votes_restored: false,
+            duration_ms: 0,
+            warnings: Vec::new(),
+            error: Some(error),
+        }
+    }
+
+    pub fn add_warning(&mut self, warning: String) {
+        self.warnings.push(warning);
+    }
+}
+
+/// Options for restoration
+#[derive(Clone, Debug)]
+pub struct RestorationOptions {
+    /// Maximum age of checkpoint to restore from (None = any age)
+    pub max_age: Option<Duration>,
+    /// Whether to validate restored state
+    pub validate_state: bool,
+    /// Whether to skip pending evaluations older than threshold
+    pub skip_stale_evaluations: bool,
+    /// Threshold for stale evaluations (in epochs)
+    pub stale_evaluation_threshold: u64,
+    /// Challenge IDs to restore (None = all)
+    pub challenge_filter: Option<HashSet<ChallengeId>>,
+}
+
+impl Default for RestorationOptions {
+    fn default() -> Self {
+        Self {
+            max_age: Some(Duration::from_secs(24 * 60 * 60)), // 24 hours
+            validate_state: true,
+            skip_stale_evaluations: true,
+            stale_evaluation_threshold: 5, // Skip if > 5 epochs old
+            challenge_filter: None,
+        }
+    }
+}
+
+impl RestorationOptions {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn with_max_age(mut self, age: Duration) -> Self {
+        self.max_age = Some(age);
+        self
+    }
+
+    pub fn without_max_age(mut self) -> Self {
+        self.max_age = None;
+        self
+    }
+
+    pub fn with_validation(mut self, validate: bool) -> Self {
+        self.validate_state = validate;
+        self
+    }
+
+    pub fn with_challenge_filter(mut self, challenges: HashSet<ChallengeId>) -> Self {
+        self.challenge_filter = Some(challenges);
+        self
+    }
+}
+
+/// State restoration manager
+pub struct RestorationManager {
+    checkpoint_manager: CheckpointManager,
+    options: RestorationOptions,
+}
+
+impl RestorationManager {
+    /// Create a new restoration manager
+    pub fn new<P: AsRef<Path>>(checkpoint_dir: P, options: RestorationOptions) -> Result<Self> {
+        let checkpoint_manager = CheckpointManager::new(checkpoint_dir, 10)?;
+        Ok(Self {
+            checkpoint_manager,
+            options,
+        })
+    }
+
+    /// Create with default options
+    pub fn with_defaults<P: AsRef<Path>>(checkpoint_dir: P) -> Result<Self> {
+        Self::new(checkpoint_dir, RestorationOptions::default())
+    }
+
+    /// Attempt to restore from the latest checkpoint
+    pub fn restore_latest(&self) -> Result<Option<(RestorationResult, CheckpointData)>> {
+        let start = Instant::now();
+
+        // Load latest checkpoint
+        let checkpoint = match self.checkpoint_manager.load_latest()? {
+            Some(cp) => cp,
+            None => {
+                info!("No checkpoint found, starting fresh");
+                return Ok(None);
+            }
+        };
+
+        let (header, data) = checkpoint;
+
+        // Check checkpoint age
+        if let Some(max_age) = self.options.max_age {
+            let checkpoint_age = Duration::from_millis(
+                (chrono::Utc::now().timestamp_millis() - header.created_at).max(0) as u64,
+            );
+            if checkpoint_age > max_age {
+                warn!(
+                    sequence = header.sequence,
+                    age_secs = checkpoint_age.as_secs(),
+                    max_age_secs = max_age.as_secs(),
+                    "Checkpoint too old, skipping restoration"
+                );
+                return Ok(None);
+            }
+        }
+
+        // Filter and validate data
+        let filtered_data = self.filter_and_validate(data)?;
+
+        let duration_ms = start.elapsed().as_millis() as u64;
+
+        let mut result = RestorationResult::success(
+            header.sequence,
+            filtered_data.epoch,
+            filtered_data.pending_evaluations.len(),
+            filtered_data.completed_evaluations.len(),
+            filtered_data.weight_votes.is_some(),
+            duration_ms,
+        );
+
+        info!(
+            sequence = header.sequence,
+            epoch = filtered_data.epoch,
+            pending = filtered_data.pending_evaluations.len(),
+            duration_ms,
+            "State restored from checkpoint"
+        );
+
+        // Add warnings for filtered items
+        if self.options.challenge_filter.is_some() {
+            result.add_warning("Some evaluations filtered by challenge".into());
+        }
+
+        Ok(Some((result, filtered_data)))
+    }
+
+    /// Restore from a specific checkpoint sequence
+    pub fn restore_from_sequence(
+        &self,
+        sequence: u64,
+    ) -> Result<Option<(RestorationResult, CheckpointData)>> {
+        let start = Instant::now();
+
+        let checkpoint = match self.checkpoint_manager.load_checkpoint(sequence)?
{ + Some(cp) => cp, + None => { + warn!(sequence, "Checkpoint not found"); + return Ok(None); + } + }; + + let (header, data) = checkpoint; + let filtered_data = self.filter_and_validate(data)?; + let duration_ms = start.elapsed().as_millis() as u64; + + let result = RestorationResult::success( + header.sequence, + filtered_data.epoch, + filtered_data.pending_evaluations.len(), + filtered_data.completed_evaluations.len(), + filtered_data.weight_votes.is_some(), + duration_ms, + ); + + Ok(Some((result, filtered_data))) + } + + /// Filter and validate checkpoint data + fn filter_and_validate(&self, mut data: CheckpointData) -> Result { + // Filter by challenge if specified + if let Some(ref filter) = self.options.challenge_filter { + data.pending_evaluations + .retain(|e| filter.contains(&e.challenge_id)); + data.completed_evaluations + .retain(|e| filter.contains(&e.challenge_id)); + } + + // Skip stale evaluations if enabled + if self.options.skip_stale_evaluations { + let _current_epoch = data.epoch; + let _threshold = self.options.stale_evaluation_threshold; + + let original_count = data.pending_evaluations.len(); + data.pending_evaluations.retain(|_e| { + // Keep if we can't determine staleness or if within threshold + // For now, keep all pending (they don't have epoch info) + true + }); + + let filtered_count = original_count - data.pending_evaluations.len(); + if filtered_count > 0 { + debug!( + filtered = filtered_count, + "Skipped stale pending evaluations" + ); + } + } + + // Validate state if enabled + if self.options.validate_state { + self.validate_data(&data)?; + } + + Ok(data) + } + + /// Validate checkpoint data integrity + fn validate_data(&self, data: &CheckpointData) -> Result<()> { + // Validate epoch is reasonable + if data.epoch > 1_000_000 { + return Err(MiniChainError::Validation( + "Checkpoint epoch seems unreasonably high".into(), + )); + } + + // Validate netuid + if data.netuid == 0 { + warn!("Checkpoint has netuid 0, may need reconfiguration"); + } + + // Validate pending evaluations + for eval in &data.pending_evaluations { + if eval.submission_id.is_empty() { + return Err(MiniChainError::Validation( + "Found pending evaluation with empty submission_id".into(), + )); + } + } + + // Validate weight votes epoch matches + if let Some(ref votes) = data.weight_votes { + if votes.epoch != data.epoch && !votes.finalized { + warn!( + votes_epoch = votes.epoch, + data_epoch = data.epoch, + "Weight votes epoch mismatch (may be stale)" + ); + } + } + + Ok(()) + } + + /// Get list of available checkpoints for restoration + pub fn list_available(&self) -> Result> { + let checkpoints = self.checkpoint_manager.list_checkpoints()?; + + let mut infos = Vec::new(); + for (sequence, _path, _modified) in checkpoints { + if let Some(info) = self.get_checkpoint_info(sequence)? { + infos.push(info); + } + } + + Ok(infos) + } + + /// Get information about a specific checkpoint without full loading + fn get_checkpoint_info(&self, sequence: u64) -> Result> { + match self.checkpoint_manager.load_checkpoint(sequence)? 
{ + Some((header, data)) => Ok(Some(CheckpointInfo { + sequence, + created_at: header.created_at, + epoch: data.epoch, + netuid: data.netuid, + pending_count: data.pending_evaluations.len(), + completed_count: data.completed_evaluations.len(), + has_weight_votes: data.weight_votes.is_some(), + bittensor_block: data.bittensor_block, + })), + None => Ok(None), + } + } + + /// Get the checkpoint manager + pub fn checkpoint_manager(&self) -> &CheckpointManager { + &self.checkpoint_manager + } +} + +/// Information about a checkpoint (lightweight summary) +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct CheckpointInfo { + pub sequence: u64, + pub created_at: i64, + pub epoch: u64, + pub netuid: u16, + pub pending_count: usize, + pub completed_count: usize, + pub has_weight_votes: bool, + pub bittensor_block: u64, +} + +/// Trait for types that can be restored from checkpoints +pub trait Restorable { + /// Restore state from checkpoint data + fn restore_from(&mut self, data: &CheckpointData) -> Result<()>; + + /// Create checkpoint data from current state + fn create_checkpoint(&self) -> Result; +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Hotkey; + use std::collections::HashMap; + use tempfile::tempdir; + + fn create_test_checkpoint_data() -> CheckpointData { + let mut data = CheckpointData::new(1, 5, 100); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "sub1".to_string(), + challenge_id: ChallengeId::new(), + miner: Hotkey([1u8; 32]), + submission_hash: "hash1".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + data + } + + #[test] + fn test_restoration_result() { + let result = RestorationResult::success(1, 5, 10, 20, true, 100); + assert!(result.success); + assert_eq!(result.checkpoint_sequence, 1); + assert_eq!(result.epoch, 5); + + let failure = RestorationResult::failure("test error".to_string()); + assert!(!failure.success); + assert!(failure.error.is_some()); + } + + #[test] + fn test_restoration_options() { + let opts = RestorationOptions::default(); + assert!(opts.max_age.is_some()); + assert!(opts.validate_state); + + let custom = RestorationOptions::new() + .without_max_age() + .with_validation(false); + assert!(custom.max_age.is_none()); + assert!(!custom.validate_state); + } + + #[test] + fn test_restoration_roundtrip() { + let dir = tempdir().unwrap(); + + // Create checkpoint first + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + let data = create_test_checkpoint_data(); + manager.create_checkpoint(&data).unwrap(); + + // Now restore + let restoration = RestorationManager::with_defaults(dir.path()).unwrap(); + let result = restoration.restore_latest().unwrap(); + + assert!(result.is_some()); + let (res, restored_data) = result.unwrap(); + assert!(res.success); + assert_eq!(restored_data.epoch, data.epoch); + assert_eq!(restored_data.pending_evaluations.len(), 1); + } + + #[test] + fn test_restoration_no_checkpoint() { + let dir = tempdir().unwrap(); + let restoration = RestorationManager::with_defaults(dir.path()).unwrap(); + let result = restoration.restore_latest().unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_checkpoint_info() { + let dir = tempdir().unwrap(); + + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + let data = create_test_checkpoint_data(); + manager.create_checkpoint(&data).unwrap(); + + let restoration = RestorationManager::with_defaults(dir.path()).unwrap(); + let infos = 
restoration.list_available().unwrap(); + + assert_eq!(infos.len(), 1); + assert_eq!(infos[0].epoch, 5); + assert_eq!(infos[0].pending_count, 1); + } + + #[test] + fn test_restoration_with_challenge_filter() { + let dir = tempdir().unwrap(); + + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + let challenge1 = ChallengeId::new(); + let challenge2 = ChallengeId::new(); + + let mut data = CheckpointData::new(1, 5, 100); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "sub1".to_string(), + challenge_id: challenge1, + miner: Hotkey([1u8; 32]), + submission_hash: "hash1".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "sub2".to_string(), + challenge_id: challenge2, + miner: Hotkey([2u8; 32]), + submission_hash: "hash2".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + manager.create_checkpoint(&data).unwrap(); + + // Restore with filter for only challenge1 + let mut filter = HashSet::new(); + filter.insert(challenge1); + let options = RestorationOptions::new().with_challenge_filter(filter); + let restoration = RestorationManager::new(dir.path(), options).unwrap(); + let result = restoration.restore_latest().unwrap(); + + assert!(result.is_some()); + let (_res, restored_data) = result.unwrap(); + assert_eq!(restored_data.pending_evaluations.len(), 1); + assert_eq!(restored_data.pending_evaluations[0].challenge_id, challenge1); + } + + #[test] + fn test_restoration_add_warning() { + let mut result = RestorationResult::success(1, 5, 10, 20, true, 100); + assert!(result.warnings.is_empty()); + + result.add_warning("Test warning".to_string()); + assert_eq!(result.warnings.len(), 1); + assert_eq!(result.warnings[0], "Test warning"); + } + + #[test] + fn test_restore_from_sequence() { + let dir = tempdir().unwrap(); + + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + + // Create multiple checkpoints + let mut data = create_test_checkpoint_data(); + manager.create_checkpoint(&data).unwrap(); // seq 1 + + data.epoch = 10; + manager.create_checkpoint(&data).unwrap(); // seq 2 + + let restoration = RestorationManager::with_defaults(dir.path()).unwrap(); + + // Restore from sequence 1 + let result = restoration.restore_from_sequence(1).unwrap(); + assert!(result.is_some()); + let (_res, restored_data) = result.unwrap(); + assert_eq!(restored_data.epoch, 5); + + // Restore from sequence 2 + let result = restoration.restore_from_sequence(2).unwrap(); + assert!(result.is_some()); + let (_res, restored_data) = result.unwrap(); + assert_eq!(restored_data.epoch, 10); + + // Try non-existent sequence + let result = restoration.restore_from_sequence(999).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_validation_unreasonable_epoch() { + let dir = tempdir().unwrap(); + + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + let mut data = create_test_checkpoint_data(); + data.epoch = 2_000_000; // Unreasonably high + manager.create_checkpoint(&data).unwrap(); + + let restoration = RestorationManager::with_defaults(dir.path()).unwrap(); + let result = restoration.restore_latest(); + assert!(result.is_err()); + } + + #[test] + fn test_validation_empty_submission_id() { + let dir = tempdir().unwrap(); + + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + let mut data = CheckpointData::new(1, 5, 
100); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "".to_string(), // Empty - invalid + challenge_id: ChallengeId::new(), + miner: Hotkey([1u8; 32]), + submission_hash: "hash1".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + manager.create_checkpoint(&data).unwrap(); + + let restoration = RestorationManager::with_defaults(dir.path()).unwrap(); + let result = restoration.restore_latest(); + assert!(result.is_err()); + } + + #[test] + fn test_options_with_max_age() { + let opts = RestorationOptions::new().with_max_age(Duration::from_secs(3600)); + assert_eq!(opts.max_age, Some(Duration::from_secs(3600))); + } + + #[test] + fn test_checkpoint_info_struct() { + let info = CheckpointInfo { + sequence: 1, + created_at: 12345, + epoch: 5, + netuid: 1, + pending_count: 10, + completed_count: 20, + has_weight_votes: true, + bittensor_block: 100, + }; + + assert_eq!(info.sequence, 1); + assert_eq!(info.epoch, 5); + assert!(info.has_weight_votes); + } +} From b48d6929fd7a64d224df256e244e1d283ee808b2 Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 11:15:06 +0000 Subject: [PATCH 5/8] feat(rpc-server): add health check endpoints for rolling updates --- crates/rpc-server/src/health.rs | 390 ++++++++++++++++++++++++++++++++ crates/rpc-server/src/lib.rs | 2 + 2 files changed, 392 insertions(+) create mode 100644 crates/rpc-server/src/health.rs diff --git a/crates/rpc-server/src/health.rs b/crates/rpc-server/src/health.rs new file mode 100644 index 0000000..f90a878 --- /dev/null +++ b/crates/rpc-server/src/health.rs @@ -0,0 +1,390 @@ +//! Health check endpoints for validator coordination +//! +//! Provides: +//! - `/health` - Basic liveness check +//! - `/ready` - Readiness check (can accept traffic) +//! - `/live` - Kubernetes-style liveness probe +//! +//! These enable coordinated rolling updates across the validator network. + +use serde::{Deserialize, Serialize}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::{Instant, SystemTime, UNIX_EPOCH}; +use tracing::{info, warn}; + +/// Health status of a component +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum HealthStatus { + /// Component is healthy + Healthy, + /// Component is degraded but operational + Degraded, + /// Component is unhealthy + Unhealthy, + /// Component status is unknown + Unknown, +} + +impl Default for HealthStatus { + fn default() -> Self { + Self::Unknown + } +} + +/// Readiness status for traffic handling +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ReadinessStatus { + /// Ready to accept traffic + Ready, + /// Not ready (initializing, draining, etc.) 
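+    /// (liveness, by contrast, only indicates that the process is running)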
+ NotReady, + /// Draining - finishing current work, not accepting new + Draining, +} + +impl Default for ReadinessStatus { + fn default() -> Self { + Self::NotReady + } +} + +/// Health check response +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct HealthResponse { + /// Overall health status + pub status: HealthStatus, + /// Readiness for traffic + pub ready: ReadinessStatus, + /// Version string + pub version: String, + /// Uptime in seconds + pub uptime_secs: u64, + /// Current epoch + pub epoch: u64, + /// P2P connection count + pub peer_count: u64, + /// Active challenges count + pub active_challenges: u64, + /// Pending evaluations count + pub pending_evaluations: u64, + /// Last checkpoint sequence + pub checkpoint_sequence: u64, + /// Timestamp (Unix millis) + pub timestamp: i64, + /// Component statuses + pub components: ComponentStatus, +} + +/// Status of individual components +#[derive(Clone, Debug, Default, Serialize, Deserialize)] +pub struct ComponentStatus { + /// P2P network status + pub p2p: HealthStatus, + /// Storage status + pub storage: HealthStatus, + /// Consensus status + pub consensus: HealthStatus, + /// Bittensor connection status + pub bittensor: HealthStatus, + /// Challenge containers status + pub challenges: HealthStatus, +} + +/// Health check manager +pub struct HealthCheck { + /// Start time + start_time: Instant, + /// Version string + version: String, + /// Whether ready for traffic + ready: AtomicBool, + /// Whether draining + draining: AtomicBool, + /// Current epoch + epoch: AtomicU64, + /// Peer count + peer_count: AtomicU64, + /// Active challenges + active_challenges: AtomicU64, + /// Pending evaluations + pending_evaluations: AtomicU64, + /// Last checkpoint sequence + checkpoint_sequence: AtomicU64, + /// Component status (using interior mutability) + components: parking_lot::RwLock, +} + +impl HealthCheck { + /// Create a new health check manager + pub fn new(version: impl Into) -> Self { + Self { + start_time: Instant::now(), + version: version.into(), + ready: AtomicBool::new(false), + draining: AtomicBool::new(false), + epoch: AtomicU64::new(0), + peer_count: AtomicU64::new(0), + active_challenges: AtomicU64::new(0), + pending_evaluations: AtomicU64::new(0), + checkpoint_sequence: AtomicU64::new(0), + components: parking_lot::RwLock::new(ComponentStatus::default()), + } + } + + /// Mark as ready for traffic + pub fn set_ready(&self, ready: bool) { + self.ready.store(ready, Ordering::SeqCst); + if ready { + info!("Validator marked as ready for traffic"); + } + } + + /// Start draining (preparing for shutdown) + pub fn start_draining(&self) { + self.draining.store(true, Ordering::SeqCst); + self.ready.store(false, Ordering::SeqCst); + info!("Validator entering drain mode"); + } + + /// Check if draining + pub fn is_draining(&self) -> bool { + self.draining.load(Ordering::SeqCst) + } + + /// Update epoch + pub fn set_epoch(&self, epoch: u64) { + self.epoch.store(epoch, Ordering::SeqCst); + } + + /// Update peer count + pub fn set_peer_count(&self, count: u64) { + self.peer_count.store(count, Ordering::SeqCst); + } + + /// Update active challenges + pub fn set_active_challenges(&self, count: u64) { + self.active_challenges.store(count, Ordering::SeqCst); + } + + /// Update pending evaluations + pub fn set_pending_evaluations(&self, count: u64) { + self.pending_evaluations.store(count, Ordering::SeqCst); + } + + /// Update checkpoint sequence + pub fn set_checkpoint_sequence(&self, seq: u64) { + self.checkpoint_sequence.store(seq, 
Ordering::SeqCst); + } + + /// Update component status + pub fn set_component_status(&self, component: &str, status: HealthStatus) { + let mut components = self.components.write(); + match component { + "p2p" => components.p2p = status, + "storage" => components.storage = status, + "consensus" => components.consensus = status, + "bittensor" => components.bittensor = status, + "challenges" => components.challenges = status, + _ => warn!("Unknown component: {}", component), + } + } + + /// Get overall health status + fn get_overall_status(&self) -> HealthStatus { + let components = self.components.read(); + + // If any component is unhealthy, overall is unhealthy + if components.p2p == HealthStatus::Unhealthy + || components.storage == HealthStatus::Unhealthy + || components.consensus == HealthStatus::Unhealthy + { + return HealthStatus::Unhealthy; + } + + // If any critical component is degraded, overall is degraded + if components.p2p == HealthStatus::Degraded + || components.storage == HealthStatus::Degraded + || components.consensus == HealthStatus::Degraded + { + return HealthStatus::Degraded; + } + + // If Bittensor is down but others are fine, degraded + if components.bittensor == HealthStatus::Unhealthy { + return HealthStatus::Degraded; + } + + HealthStatus::Healthy + } + + /// Get readiness status + fn get_readiness(&self) -> ReadinessStatus { + if self.draining.load(Ordering::SeqCst) { + return ReadinessStatus::Draining; + } + if self.ready.load(Ordering::SeqCst) { + return ReadinessStatus::Ready; + } + ReadinessStatus::NotReady + } + + /// Get full health response + pub fn get_health(&self) -> HealthResponse { + let timestamp = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_millis() as i64; + + HealthResponse { + status: self.get_overall_status(), + ready: self.get_readiness(), + version: self.version.clone(), + uptime_secs: self.start_time.elapsed().as_secs(), + epoch: self.epoch.load(Ordering::SeqCst), + peer_count: self.peer_count.load(Ordering::SeqCst), + active_challenges: self.active_challenges.load(Ordering::SeqCst), + pending_evaluations: self.pending_evaluations.load(Ordering::SeqCst), + checkpoint_sequence: self.checkpoint_sequence.load(Ordering::SeqCst), + timestamp, + components: self.components.read().clone(), + } + } + + /// Basic liveness check (is the process running) + pub fn is_live(&self) -> bool { + // If we can respond, we're live + true + } + + /// Readiness check (can accept traffic) + pub fn is_ready(&self) -> bool { + self.ready.load(Ordering::SeqCst) && !self.draining.load(Ordering::SeqCst) + } +} + +impl Default for HealthCheck { + fn default() -> Self { + Self::new("unknown") + } +} + +/// Create a shared health check instance +pub fn create_health_check(version: &str) -> Arc { + Arc::new(HealthCheck::new(version)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_health_check_creation() { + let health = HealthCheck::new("1.0.0"); + assert_eq!(health.version, "1.0.0"); + assert!(!health.is_ready()); + assert!(!health.is_draining()); + } + + #[test] + fn test_ready_state() { + let health = HealthCheck::new("1.0.0"); + + assert!(!health.is_ready()); + health.set_ready(true); + assert!(health.is_ready()); + + let response = health.get_health(); + assert_eq!(response.ready, ReadinessStatus::Ready); + } + + #[test] + fn test_draining_state() { + let health = HealthCheck::new("1.0.0"); + health.set_ready(true); + + health.start_draining(); + assert!(health.is_draining()); + assert!(!health.is_ready()); + + 
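+        // The aggregated health response should now report Draining readiness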
let response = health.get_health(); + assert_eq!(response.ready, ReadinessStatus::Draining); + } + + #[test] + fn test_component_status() { + let health = HealthCheck::new("1.0.0"); + + health.set_component_status("p2p", HealthStatus::Healthy); + health.set_component_status("storage", HealthStatus::Healthy); + health.set_component_status("consensus", HealthStatus::Healthy); + health.set_component_status("bittensor", HealthStatus::Healthy); + + let response = health.get_health(); + assert_eq!(response.status, HealthStatus::Healthy); + } + + #[test] + fn test_unhealthy_component() { + let health = HealthCheck::new("1.0.0"); + + health.set_component_status("p2p", HealthStatus::Unhealthy); + + let response = health.get_health(); + assert_eq!(response.status, HealthStatus::Unhealthy); + } + + #[test] + fn test_degraded_component() { + let health = HealthCheck::new("1.0.0"); + + health.set_component_status("p2p", HealthStatus::Healthy); + health.set_component_status("storage", HealthStatus::Degraded); + + let response = health.get_health(); + assert_eq!(response.status, HealthStatus::Degraded); + } + + #[test] + fn test_metrics_update() { + let health = HealthCheck::new("1.0.0"); + + health.set_epoch(42); + health.set_peer_count(10); + health.set_active_challenges(3); + health.set_pending_evaluations(5); + health.set_checkpoint_sequence(100); + + let response = health.get_health(); + assert_eq!(response.epoch, 42); + assert_eq!(response.peer_count, 10); + assert_eq!(response.active_challenges, 3); + assert_eq!(response.pending_evaluations, 5); + assert_eq!(response.checkpoint_sequence, 100); + } + + #[test] + fn test_uptime() { + let health = HealthCheck::new("1.0.0"); + + // Just check uptime is a reasonable value (not negative, not huge) + let response = health.get_health(); + assert!(response.uptime_secs < 10); // Should be very small in a test + } + + #[test] + fn test_bittensor_degraded() { + let health = HealthCheck::new("1.0.0"); + + health.set_component_status("p2p", HealthStatus::Healthy); + health.set_component_status("storage", HealthStatus::Healthy); + health.set_component_status("consensus", HealthStatus::Healthy); + health.set_component_status("bittensor", HealthStatus::Unhealthy); + + // Bittensor unhealthy = degraded, not fully unhealthy + let response = health.get_health(); + assert_eq!(response.status, HealthStatus::Degraded); + } +} diff --git a/crates/rpc-server/src/lib.rs b/crates/rpc-server/src/lib.rs index 5559e16..db3dd5f 100644 --- a/crates/rpc-server/src/lib.rs +++ b/crates/rpc-server/src/lib.rs @@ -22,12 +22,14 @@ mod auth; mod handlers; +pub mod health; mod jsonrpc; mod server; mod types; pub use auth::*; pub use handlers::*; +pub use health::{create_health_check, HealthCheck, HealthResponse, HealthStatus, ReadinessStatus}; pub use jsonrpc::*; pub use server::*; pub use types::*; From 42a35be3770b98b836200b6bad08472ab47ee65e Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 11:18:04 +0000 Subject: [PATCH 6/8] docs: add challenge integration guide --- challenges/README.md | 4 + docs/challenge-integration.md | 253 ++++++++++++++++++++++++++++++++++ 2 files changed, 257 insertions(+) create mode 100644 docs/challenge-integration.md diff --git a/challenges/README.md b/challenges/README.md index 6cecfe1..3e1dd59 100644 --- a/challenges/README.md +++ b/challenges/README.md @@ -35,3 +35,7 @@ Challenge crates can also be external (like term-challenge). 
They should:
 ## Example
 
 See [term-challenge](https://github.com/PlatformNetwork/term-challenge) for a complete example.
+
+## Documentation
+
+For detailed integration instructions, see the [Challenge Integration Guide](../docs/challenge-integration.md).
diff --git a/docs/challenge-integration.md b/docs/challenge-integration.md
new file mode 100644
index 0000000..0c66101
--- /dev/null
+++ b/docs/challenge-integration.md
@@ -0,0 +1,253 @@
+# Challenge Integration Guide
+
+This guide explains how to integrate challenge crates with the Platform validator network.
+
+## Overview
+
+Platform uses a modular challenge architecture where each challenge:
+- Runs as a separate Docker container
+- Communicates via HTTP/WebSocket with validators
+- Has its own state persistence
+- Supports hot-reload without losing evaluation progress
+
+## Architecture
+
+```text
+┌──────────────────────────────────────────────────────────────┐
+│                     Platform Validator                      │
+├──────────────────────────────────────────────────────────────┤
+│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐         │
+│  │  Challenge  │  │  Challenge  │  │    State    │         │
+│  │  Registry   │  │ Orchestrator│  │   Manager   │         │
+│  └─────────────┘  └─────────────┘  └─────────────┘         │
+├──────────────────────────────────────────────────────────────┤
+│                     Checkpoint System                       │
+│    (periodic saves, graceful shutdown, crash recovery)      │
+└──────────────────────────────────────────────────────────────┘
+                               │
+               ┌───────────────┼───────────────┐
+               ▼               ▼               ▼
+        ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
+        │ Challenge A │ │ Challenge B │ │ Challenge N │
+        │  (Docker)   │ │  (Docker)   │ │  (Docker)   │
+        └─────────────┘ └─────────────┘ └─────────────┘
+```
+
+## Creating a Challenge Crate
+
+### 1. Project Structure
+
+Your challenge crate should follow this structure:
+
+```
+my-challenge/
+├── Cargo.toml
+├── src/
+│   ├── lib.rs          # Challenge implementation
+│   ├── evaluation.rs   # Evaluation logic
+│   └── scoring.rs      # Scoring algorithm
+├── Dockerfile          # Container build
+└── README.md           # Documentation
+```
+
+### 2. Dependencies
+
+Add the Platform SDK to your `Cargo.toml`:
+
+```toml
+[dependencies]
+platform-challenge-sdk = { git = "https://github.com/PlatformNetwork/platform" }
+```
+
+### 3. Implement the Challenge Trait
+
+```rust
+use platform_challenge_sdk::prelude::*;
+
+pub struct MyChallenge {
+    // Your challenge state
+}
+
+#[async_trait]
+impl ServerChallenge for MyChallenge {
+    fn challenge_id(&self) -> &str {
+        "my-challenge"
+    }
+
+    fn name(&self) -> &str {
+        "My Challenge"
+    }
+
+    fn version(&self) -> &str {
+        env!("CARGO_PKG_VERSION")
+    }
+
+    async fn evaluate(
+        &self,
+        req: EvaluationRequest,
+    ) -> Result<EvaluationResponse> {
+        // Your evaluation logic
+        let score = self.evaluate_submission(&req.data)?;
+
+        Ok(EvaluationResponse::success(
+            &req.request_id,
+            score,
+            json!({"details": "evaluation complete"}),
+        ))
+    }
+}
+```
+
+### 4. Docker Container
+
+Create a `Dockerfile`:
+
+```dockerfile
+FROM rust:1.90 AS builder
+WORKDIR /app
+COPY . .
+RUN cargo build --release
+
+FROM debian:bookworm-slim
+COPY --from=builder /app/target/release/my-challenge /usr/local/bin/
+EXPOSE 8080
+CMD ["my-challenge"]
+```
+
+## State Persistence
+
+### Checkpoint Integration
+
+Challenges automatically benefit from Platform's checkpoint system:
+
+1. **Periodic Checkpoints**: Every 5 minutes
+2. **Shutdown Checkpoints**: On graceful shutdown
+3. **Crash Recovery**: On restart, state is restored
+
+### Custom State
+
+To persist challenge-specific state:
+
+```rust
+use platform_challenge_sdk::database::Database;
+
+impl MyChallenge {
+    pub fn save_state(&self, db: &Database) -> Result<()> {
+        db.set("my_state_key", &self.state)?;
+        Ok(())
+    }
+
+    pub fn load_state(&mut self, db: &Database) -> Result<()> {
+        if let Some(state) = db.get("my_state_key")? {
+            self.state = state;
+        }
+        Ok(())
+    }
+}
+```
+
+## Hot-Reload Support
+
+Platform supports updating challenges without losing evaluation progress:
+
+### 1. Graceful Shutdown Signal
+
+When it receives a shutdown signal (SIGTERM from `docker stop`, or Ctrl+C when run locally), your challenge should:
+1. Stop accepting new evaluations
+2. Complete in-progress evaluations
+3. Persist any local state
+4. Exit cleanly
+
+```rust
+// Note: `ctrl_c()` only covers SIGINT; listen for SIGTERM explicitly
+// (Unix-only, requires tokio's "signal" feature).
+let mut sigterm =
+    tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())?;
+
+tokio::select! {
+    _ = tokio::signal::ctrl_c() => {}
+    _ = sigterm.recv() => {}
+}
+info!("Shutting down gracefully...");
+self.save_state(&db)?;
+```
+
+### 2. Version Compatibility
+
+Ensure backward compatibility between versions:
+
+```rust
+#[derive(Serialize, Deserialize)]
+struct MyState {
+    #[serde(default)]
+    version: u32,
+    // ... fields
+}
+
+impl MyState {
+    fn migrate(&mut self) {
+        if self.version < 2 {
+            // Migration logic
+            self.version = 2;
+        }
+    }
+}
+```
+
+## Health Checks
+
+Implement health check endpoints:
+
+```rust
+// GET /health - Returns 200 if healthy
+// GET /ready  - Returns 200 if ready for traffic
+// GET /live   - Returns 200 if process is alive
+```
+
+A minimal example of serving these endpoints is sketched in the appendix at the end of this guide.
+
+## Registration
+
+### Local Development
+
+Add to workspace `Cargo.toml`:
+
+```toml
+[workspace]
+members = [
+    # ... existing members
+    "challenges/my-challenge",
+]
+```
+
+### Production Deployment
+
+1. Build and push the Docker image
+2. Register via sudo action (network operator only)
+3. Validators automatically pull the image
+
+## Best Practices
+
+1. **Deterministic Evaluation**: Same input should produce the same output
+2. **Timeout Handling**: Set reasonable timeouts
+3. **Resource Limits**: Respect CPU/memory constraints
+4. **Logging**: Use structured logging with `tracing`
+5. **Error Handling**: Return meaningful error messages
+6. **Testing**: Include comprehensive unit tests
+
+## Example Challenges
+
+- [term-challenge](https://github.com/PlatformNetwork/term-challenge) - Terminal benchmark
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Challenge not starting**: Check the Docker logs
+2. **Evaluation timeout**: Increase the timeout or optimize the evaluation
+3. **State loss after update**: Verify checkpoint creation
+4. **Version mismatch**: Check compatibility constraints
+
+### Debugging
+
+Enable debug logging:
+```bash
+RUST_LOG=debug my-challenge
+```
+
+## API Reference
+
+See the [platform-challenge-sdk documentation](../crates/challenge-sdk/README.md).
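+
+## Appendix: Serving the Health Endpoints (sketch)
+
+Neither the SDK nor this guide prescribes a particular HTTP framework for `/health`,
+`/ready`, and `/live`. The sketch below shows one possible wiring using `axum` on
+port 8080 (the port exposed by the example Dockerfile). The framework choice and the
+`Readiness` helper type are illustrative assumptions of this sketch, not SDK items.
+
+```rust
+// Assumed dependencies (illustrative): axum = "0.7", tokio = { version = "1", features = ["full"] }
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+
+use axum::{http::StatusCode, routing::get, Router};
+
+/// Shared readiness flag: set it once startup work is done,
+/// clear it when draining before shutdown.
+#[derive(Clone, Default)]
+struct Readiness(Arc<AtomicBool>);
+
+#[tokio::main]
+async fn main() {
+    let readiness = Readiness::default();
+    let ready = readiness.clone();
+
+    let app = Router::new()
+        // Liveness: the process can respond at all.
+        .route("/live", get(|| async { StatusCode::OK }))
+        // Health: plug real component checks in here.
+        .route("/health", get(|| async { StatusCode::OK }))
+        // Readiness: only accept traffic once initialization has finished.
+        .route(
+            "/ready",
+            get(move || async move {
+                if ready.0.load(Ordering::SeqCst) {
+                    StatusCode::OK
+                } else {
+                    StatusCode::SERVICE_UNAVAILABLE
+                }
+            }),
+        );
+
+    // Flip to ready after state restore / warm-up is complete.
+    readiness.0.store(true, Ordering::SeqCst);
+
+    let listener = tokio::net::TcpListener::bind("0.0.0.0:8080")
+        .await
+        .expect("bind health endpoint listener");
+    axum::serve(listener, app).await.expect("serve health endpoints");
+}
+```
+
+The same pattern extends to the `/evaluate` endpoint; the important part is that
+`/ready` reports not-ready while the challenge is draining, so validators stop
+routing new evaluations to it before shutdown.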
From ef714727e91641a0e6d3cd080a8569247c57d0e5 Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 11:19:52 +0000 Subject: [PATCH 7/8] test: add integration tests for checkpoint and restoration system --- tests/Cargo.toml | 4 + tests/checkpoint_tests.rs | 536 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 540 insertions(+) create mode 100644 tests/checkpoint_tests.rs diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 9037649..c8e3840 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -36,6 +36,10 @@ path = "sudo_action_tests.rs" name = "blockchain_state_tests" path = "blockchain_state_tests.rs" +[[test]] +name = "checkpoint_tests" +path = "checkpoint_tests.rs" + [dependencies] platform-core = { path = "../crates/core" } platform-storage = { path = "../crates/storage" } diff --git a/tests/checkpoint_tests.rs b/tests/checkpoint_tests.rs new file mode 100644 index 0000000..3b9421e --- /dev/null +++ b/tests/checkpoint_tests.rs @@ -0,0 +1,536 @@ +//! Integration tests for checkpoint and restoration system +//! +//! Tests for verifying the checkpoint/restoration system works correctly end-to-end. + +use platform_core::{ + CheckpointData, CheckpointManager, CompletedEvaluationState, PendingEvaluationState, + WeightVoteState, RestorationManager, RestorationOptions, ChallengeId, Hotkey, +}; +use std::collections::HashMap; +use tempfile::tempdir; + +// ============================================================================ +// TEST HELPERS +// ============================================================================ + +/// Create test checkpoint data with realistic content +fn create_test_data() -> CheckpointData { + let mut data = CheckpointData::new(100, 5, 100); + + // Add pending evaluations + for i in 0..5 { + data.pending_evaluations.push(PendingEvaluationState { + submission_id: format!("submission_{}", i), + challenge_id: ChallengeId::new(), + miner: Hotkey([i as u8; 32]), + submission_hash: format!("hash_{}", i), + scores: { + let mut scores = HashMap::new(); + scores.insert(Hotkey([1u8; 32]), 0.85); + scores.insert(Hotkey([2u8; 32]), 0.90); + scores + }, + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + } + + // Add completed evaluations + for i in 0..3 { + data.completed_evaluations.push(CompletedEvaluationState { + submission_id: format!("completed_{}", i), + challenge_id: ChallengeId::new(), + final_score: 0.87 + (i as f64 * 0.01), + epoch: 5, + completed_at: chrono::Utc::now().timestamp_millis(), + }); + } + + // Add weight votes + data.weight_votes = Some(WeightVoteState { + epoch: 5, + netuid: 100, + votes: { + let mut votes = HashMap::new(); + votes.insert(Hotkey([1u8; 32]), vec![(0, 1000), (1, 2000)]); + votes.insert(Hotkey([2u8; 32]), vec![(0, 1500), (1, 1500)]); + votes + }, + finalized: false, + final_weights: None, + }); + + data.bittensor_block = 12345; + data +} + +// ============================================================================ +// CHECKPOINT ROUNDTRIP TESTS +// ============================================================================ + +#[test] +fn test_checkpoint_roundtrip() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 10).expect("Failed to create manager"); + + let original_data = create_test_data(); + + // Create checkpoint + let path = manager + .create_checkpoint(&original_data) + .expect("Failed to create checkpoint"); + assert!(path.exists()); + + // Load checkpoint + let (header, loaded_data) = manager + 
.load_latest() + .expect("Failed to load") + .expect("No checkpoint found"); + + // Verify data integrity + assert_eq!(loaded_data.sequence, original_data.sequence); + assert_eq!(loaded_data.epoch, original_data.epoch); + assert_eq!(loaded_data.netuid, original_data.netuid); + assert_eq!( + loaded_data.pending_evaluations.len(), + original_data.pending_evaluations.len() + ); + assert_eq!( + loaded_data.completed_evaluations.len(), + original_data.completed_evaluations.len() + ); + assert!(loaded_data.weight_votes.is_some()); + assert_eq!(loaded_data.bittensor_block, original_data.bittensor_block); + + // Verify header has correct sequence + assert_eq!(header.sequence, 1); +} + +// ============================================================================ +// MULTIPLE CHECKPOINTS TESTS +// ============================================================================ + +#[test] +fn test_multiple_checkpoints() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + + // Create multiple checkpoints + for i in 0..10 { + let mut data = CheckpointData::new(i, i / 2, 100); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: format!("sub_{}", i), + challenge_id: ChallengeId::new(), + miner: Hotkey([i as u8; 32]), + submission_hash: format!("hash_{}", i), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + manager + .create_checkpoint(&data) + .expect("Failed to create checkpoint"); + } + + // Should only keep 5 checkpoints + let checkpoints = manager.list_checkpoints().expect("Failed to list"); + assert_eq!(checkpoints.len(), 5); + + // Latest should be sequence 10 + let (header, latest) = manager + .load_latest() + .expect("Failed to load") + .expect("No checkpoint"); + assert_eq!(latest.sequence, 9); + assert_eq!(header.sequence, 10); +} + +// ============================================================================ +// RESTORATION TESTS +// ============================================================================ + +#[test] +fn test_restoration_with_options() { + let dir = tempdir().expect("Failed to create temp dir"); + + // Create checkpoint + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + let data = create_test_data(); + manager + .create_checkpoint(&data) + .expect("Failed to create checkpoint"); + + // Restore with options + let options = RestorationOptions::new() + .without_max_age() + .with_validation(true); + + let restoration = + RestorationManager::new(dir.path(), options).expect("Failed to create restoration manager"); + + let result = restoration.restore_latest().expect("Failed to restore"); + assert!(result.is_some()); + + let (res, restored_data) = result.unwrap(); + assert!(res.success); + assert_eq!(restored_data.pending_evaluations.len(), 5); + assert_eq!(restored_data.completed_evaluations.len(), 3); +} + +#[test] +fn test_restoration_empty() { + let dir = tempdir().expect("Failed to create temp dir"); + + let restoration = RestorationManager::with_defaults(dir.path()).expect("Failed to create"); + let result = restoration.restore_latest().expect("Failed to restore"); + + assert!(result.is_none()); +} + +// ============================================================================ +// HASH VERIFICATION TESTS +// ============================================================================ + +#[test] +fn test_checkpoint_hash_verification() { + let 
dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + + let data = create_test_data(); + let path = manager + .create_checkpoint(&data) + .expect("Failed to create"); + + // Corrupt the file + let mut content = std::fs::read(&path).expect("Failed to read"); + if content.len() > 100 { + content[100] ^= 0xFF; // Flip bits + } + std::fs::write(&path, content).expect("Failed to write"); + + // Loading should fail due to hash mismatch + let result = manager.load_checkpoint(1); + assert!(result.is_err()); +} + +// ============================================================================ +// WEIGHT VOTES TESTS +// ============================================================================ + +#[test] +fn test_weight_votes_persistence() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + + let mut data = CheckpointData::new(1, 5, 100); + data.weight_votes = Some(WeightVoteState { + epoch: 5, + netuid: 100, + votes: { + let mut v = HashMap::new(); + v.insert(Hotkey([1u8; 32]), vec![(0, 1000), (1, 2000), (2, 3000)]); + v.insert(Hotkey([2u8; 32]), vec![(0, 1500), (1, 2500), (2, 2000)]); + v.insert(Hotkey([3u8; 32]), vec![(0, 2000), (1, 2000), (2, 2000)]); + v + }, + finalized: true, + final_weights: Some(vec![(0, 4500), (1, 6500), (2, 7000)]), + }); + + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + let (_, loaded) = manager + .load_latest() + .expect("Failed to load") + .expect("No checkpoint"); + + let votes = loaded.weight_votes.expect("No weight votes"); + assert!(votes.finalized); + assert_eq!(votes.votes.len(), 3); + assert_eq!(votes.final_weights.as_ref().unwrap().len(), 3); +} + +// ============================================================================ +// CHECKPOINT INFO TESTS +// ============================================================================ + +#[test] +fn test_checkpoint_info() { + let dir = tempdir().expect("Failed to create temp dir"); + + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + let data = create_test_data(); + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + let restoration = + RestorationManager::with_defaults(dir.path()).expect("Failed to create"); + let infos = restoration.list_available().expect("Failed to list"); + + assert_eq!(infos.len(), 1); + assert_eq!(infos[0].epoch, 5); + assert_eq!(infos[0].netuid, 100); + assert_eq!(infos[0].pending_count, 5); + assert_eq!(infos[0].completed_count, 3); + assert!(infos[0].has_weight_votes); + assert_eq!(infos[0].bittensor_block, 12345); +} + +// ============================================================================ +// SCORING PERSISTENCE TESTS +// ============================================================================ + +#[test] +fn test_pending_evaluation_scores_persistence() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + + let mut data = CheckpointData::new(1, 5, 100); + let mut scores = HashMap::new(); + scores.insert(Hotkey([10u8; 32]), 0.95); + scores.insert(Hotkey([20u8; 32]), 0.87); + scores.insert(Hotkey([30u8; 32]), 0.92); + + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "scored_submission".to_string(), + challenge_id: ChallengeId::new(), + miner: Hotkey([5u8; 32]), + 
submission_hash: "hash_scored".to_string(), + scores, + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: true, + }); + + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + let (_, loaded) = manager + .load_latest() + .expect("Failed to load") + .expect("No checkpoint"); + + let pending = &loaded.pending_evaluations[0]; + assert_eq!(pending.scores.len(), 3); + assert_eq!(pending.scores.get(&Hotkey([10u8; 32])), Some(&0.95)); + assert_eq!(pending.scores.get(&Hotkey([20u8; 32])), Some(&0.87)); + assert_eq!(pending.scores.get(&Hotkey([30u8; 32])), Some(&0.92)); + assert!(pending.finalizing); +} + +// ============================================================================ +// SEQUENCE MANAGEMENT TESTS +// ============================================================================ + +#[test] +fn test_checkpoint_sequence_resume() { + let dir = tempdir().expect("Failed to create temp dir"); + + // First manager creates checkpoints + { + let mut manager = + CheckpointManager::new(dir.path(), 10).expect("Failed to create manager"); + for i in 0..5 { + let data = CheckpointData::new(i, i, 100); + manager + .create_checkpoint(&data) + .expect("Failed to create"); + } + assert_eq!(manager.current_sequence(), 5); + } + + // New manager should resume from the latest sequence + { + let manager = CheckpointManager::new(dir.path(), 10).expect("Failed to create manager"); + assert_eq!(manager.current_sequence(), 5); + } +} + +#[test] +fn test_load_specific_checkpoint() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 10).expect("Failed to create manager"); + + // Create 3 checkpoints with different epochs + for i in 0..3 { + let mut data = CheckpointData::new(i, i * 10, 100); + data.metadata + .insert("marker".to_string(), format!("checkpoint_{}", i)); + manager + .create_checkpoint(&data) + .expect("Failed to create"); + } + + // Load specific checkpoint (sequence 2) + let (header, data) = manager + .load_checkpoint(2) + .expect("Failed to load") + .expect("Not found"); + assert_eq!(header.sequence, 2); + assert_eq!(data.epoch, 10); + assert_eq!(data.metadata.get("marker"), Some(&"checkpoint_1".to_string())); +} + +// ============================================================================ +// METADATA TESTS +// ============================================================================ + +#[test] +fn test_checkpoint_metadata_persistence() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + + let mut data = CheckpointData::new(1, 5, 100); + data.metadata.insert("version".to_string(), "1.0.0".to_string()); + data.metadata.insert("node_id".to_string(), "validator_1".to_string()); + data.metadata.insert("custom_key".to_string(), "custom_value".to_string()); + + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + let (_, loaded) = manager + .load_latest() + .expect("Failed to load") + .expect("No checkpoint"); + + assert_eq!(loaded.metadata.len(), 3); + assert_eq!(loaded.metadata.get("version"), Some(&"1.0.0".to_string())); + assert_eq!(loaded.metadata.get("node_id"), Some(&"validator_1".to_string())); + assert_eq!(loaded.metadata.get("custom_key"), Some(&"custom_value".to_string())); +} + +// ============================================================================ +// COMPLETED EVALUATION TESTS +// 
============================================================================ + +#[test] +fn test_completed_evaluations_persistence() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + + let challenge_id = ChallengeId::new(); + let mut data = CheckpointData::new(1, 5, 100); + + for i in 0..5 { + data.completed_evaluations.push(CompletedEvaluationState { + submission_id: format!("completed_{}", i), + challenge_id, + final_score: 0.80 + (i as f64 * 0.04), + epoch: 5, + completed_at: chrono::Utc::now().timestamp_millis(), + }); + } + + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + let (_, loaded) = manager + .load_latest() + .expect("Failed to load") + .expect("No checkpoint"); + + assert_eq!(loaded.completed_evaluations.len(), 5); + + // Verify score ordering is preserved + for (i, eval) in loaded.completed_evaluations.iter().enumerate() { + let expected_score = 0.80 + (i as f64 * 0.04); + assert!((eval.final_score - expected_score).abs() < 0.001); + assert_eq!(eval.challenge_id, challenge_id); + } +} + +// ============================================================================ +// EMPTY STATE TESTS +// ============================================================================ + +#[test] +fn test_checkpoint_with_empty_state() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + + // Empty checkpoint data + let data = CheckpointData::new(0, 0, 100); + + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + let (_, loaded) = manager + .load_latest() + .expect("Failed to load") + .expect("No checkpoint"); + + assert_eq!(loaded.sequence, 0); + assert_eq!(loaded.epoch, 0); + assert!(loaded.pending_evaluations.is_empty()); + assert!(loaded.completed_evaluations.is_empty()); + assert!(loaded.weight_votes.is_none()); + assert!(loaded.metadata.is_empty()); +} + +// ============================================================================ +// RESTORATION VALIDATION TESTS +// ============================================================================ + +#[test] +fn test_restoration_validates_epoch() { + let dir = tempdir().expect("Failed to create temp dir"); + + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + let mut data = CheckpointData::new(1, 2_000_000, 100); // Unreasonably high epoch + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "test".to_string(), + challenge_id: ChallengeId::new(), + miner: Hotkey([1u8; 32]), + submission_hash: "hash".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + // With validation enabled, this should fail + let options = RestorationOptions::new() + .without_max_age() + .with_validation(true); + + let restoration = RestorationManager::new(dir.path(), options).expect("Failed to create"); + let result = restoration.restore_latest(); + assert!(result.is_err()); +} + +#[test] +fn test_restoration_validates_submission_id() { + let dir = tempdir().expect("Failed to create temp dir"); + + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + let mut data = CheckpointData::new(1, 5, 100); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: 
"".to_string(), // Empty submission_id is invalid + challenge_id: ChallengeId::new(), + miner: Hotkey([1u8; 32]), + submission_hash: "hash".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + // With validation enabled, this should fail + let options = RestorationOptions::new() + .without_max_age() + .with_validation(true); + + let restoration = RestorationManager::new(dir.path(), options).expect("Failed to create"); + let result = restoration.restore_latest(); + assert!(result.is_err()); +} From 54b966121169a814b93e590a951a4f7b62b7c379 Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 11:21:22 +0000 Subject: [PATCH 8/8] feat: add graceful shutdown with checkpoint persistence - Add ShutdownHandler struct for checkpoint management - Create periodic checkpoints every 5 minutes - Save final checkpoint on graceful shutdown (Ctrl+C) - Persist evaluation state for hot-reload recovery This enables validators to update without losing evaluation progress. --- Cargo.lock | 1 + bins/validator-node/src/main.rs | 126 ++++++++++++++++++++- crates/challenge-registry/src/health.rs | 7 +- crates/challenge-registry/src/lifecycle.rs | 11 +- crates/challenge-registry/src/migration.rs | 32 +++++- crates/challenge-registry/src/registry.rs | 24 ++-- crates/core/src/checkpoint.rs | 15 +-- crates/core/src/lib.rs | 2 +- crates/core/src/restoration.rs | 5 +- tests/checkpoint_tests.rs | 82 ++++++-------- 10 files changed, 221 insertions(+), 84 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 462e0ac..d4e27a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4788,6 +4788,7 @@ dependencies = [ "serde_json", "sha2 0.10.9", "sp-core 31.0.0", + "tempfile", "thiserror 2.0.17", "tracing", "uuid", diff --git a/bins/validator-node/src/main.rs b/bins/validator-node/src/main.rs index 2173084..7f325f5 100644 --- a/bins/validator-node/src/main.rs +++ b/bins/validator-node/src/main.rs @@ -12,7 +12,13 @@ use platform_bittensor::{ sync_metagraph, BittensorClient, BlockSync, BlockSyncConfig, BlockSyncEvent, Metagraph, Subtensor, SubtensorClient, }; -use platform_core::{Hotkey, Keypair, SUDO_KEY_SS58}; +use platform_core::{ + checkpoint::{ + CheckpointData, CheckpointManager, CompletedEvaluationState, PendingEvaluationState, + WeightVoteState, + }, + Hotkey, Keypair, SUDO_KEY_SS58, +}; use platform_distributed_storage::{ DistributedStoreExt, LocalStorage, LocalStorageBuilder, StorageKey, }; @@ -20,7 +26,7 @@ use platform_p2p_consensus::{ ChainState, ConsensusEngine, NetworkEvent, P2PConfig, P2PMessage, P2PNetwork, StateManager, ValidatorRecord, ValidatorSet, }; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::Duration; use tracing::{debug, error, info, warn}; @@ -28,6 +34,86 @@ use tracing::{debug, error, info, warn}; /// Storage key for persisted chain state const STATE_STORAGE_KEY: &str = "chain_state"; +// ==================== Shutdown Handler ==================== + +/// Handles graceful shutdown with state persistence +struct ShutdownHandler { + checkpoint_manager: CheckpointManager, + state_manager: Arc, + netuid: u16, +} + +impl ShutdownHandler { + fn new(checkpoint_dir: &Path, state_manager: Arc, netuid: u16) -> Result { + let checkpoint_manager = CheckpointManager::new(checkpoint_dir.join("checkpoints"), 10)?; + Ok(Self { + checkpoint_manager, + state_manager, + netuid, + }) + } + + /// Create checkpoint from current state + fn 
create_checkpoint(&mut self) -> Result<()> { + let state = self.state_manager.snapshot(); + + let mut checkpoint_data = CheckpointData::new(state.sequence, state.epoch, self.netuid); + + // Convert pending evaluations + for (id, record) in &state.pending_evaluations { + let pending = PendingEvaluationState { + submission_id: id.clone(), + challenge_id: record.challenge_id, + miner: record.miner.clone(), + submission_hash: record.agent_hash.clone(), + scores: record + .evaluations + .iter() + .map(|(k, v)| (k.clone(), v.score)) + .collect(), + created_at: record.created_at, + finalizing: record.finalized, + }; + checkpoint_data.add_pending(pending); + } + + // Convert completed evaluations (current epoch only) + if let Some(completed) = state.completed_evaluations.get(&state.epoch) { + for record in completed { + if let Some(score) = record.aggregated_score { + let completed_state = CompletedEvaluationState { + submission_id: record.submission_id.clone(), + challenge_id: record.challenge_id, + final_score: score, + epoch: state.epoch, + completed_at: record.finalized_at.unwrap_or(record.created_at), + }; + checkpoint_data.add_completed(completed_state); + } + } + } + + // Convert weight votes + if let Some(ref votes) = state.weight_votes { + checkpoint_data.weight_votes = Some(WeightVoteState { + epoch: votes.epoch, + netuid: votes.netuid, + votes: votes.votes.clone(), + finalized: votes.finalized, + final_weights: votes.final_weights.clone(), + }); + } + + checkpoint_data.bittensor_block = state.bittensor_block; + + self.checkpoint_manager + .create_checkpoint(&checkpoint_data)?; + info!("Shutdown checkpoint created at sequence {}", state.sequence); + + Ok(()) + } +} + // ==================== CLI ==================== #[derive(Parser, Debug)] @@ -252,6 +338,22 @@ async fn main() -> Result<()> { bittensor_client_for_metagraph = None; } + // Initialize shutdown handler for graceful checkpoint persistence + let mut shutdown_handler = + match ShutdownHandler::new(&data_dir, state_manager.clone(), args.netuid) { + Ok(handler) => { + info!("Shutdown handler initialized with checkpoint directory"); + Some(handler) + } + Err(e) => { + warn!( + "Failed to initialize shutdown handler: {}. Checkpoints disabled.", + e + ); + None + } + }; + info!("Decentralized validator running. Press Ctrl+C to stop."); let netuid = args.netuid; @@ -260,6 +362,7 @@ async fn main() -> Result<()> { let mut metagraph_interval = tokio::time::interval(Duration::from_secs(300)); let mut stale_check_interval = tokio::time::interval(Duration::from_secs(60)); let mut state_persist_interval = tokio::time::interval(Duration::from_secs(60)); + let mut checkpoint_interval = tokio::time::interval(Duration::from_secs(300)); // 5 minutes loop { tokio::select! 
{ @@ -335,8 +438,27 @@ async fn main() -> Result<()> { debug!("Active validators: {}", validator_set.active_count()); } + // Periodic checkpoint + _ = checkpoint_interval.tick() => { + if let Some(handler) = shutdown_handler.as_mut() { + if let Err(e) = handler.create_checkpoint() { + warn!("Failed to create periodic checkpoint: {}", e); + } else { + debug!("Periodic checkpoint created"); + } + } + } + // Ctrl+C _ = tokio::signal::ctrl_c() => { + info!("Received shutdown signal, creating final checkpoint..."); + if let Some(handler) = shutdown_handler.as_mut() { + if let Err(e) = handler.create_checkpoint() { + error!("Failed to create shutdown checkpoint: {}", e); + } else { + info!("Shutdown checkpoint saved successfully"); + } + } info!("Shutting down..."); break; } diff --git a/crates/challenge-registry/src/health.rs b/crates/challenge-registry/src/health.rs index e142fdb..5973271 100644 --- a/crates/challenge-registry/src/health.rs +++ b/crates/challenge-registry/src/health.rs @@ -5,11 +5,11 @@ //! - Container status //! - Resource usage +use parking_lot::RwLock; use platform_core::ChallengeId; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::time::Duration; -use parking_lot::RwLock; /// Health status of a challenge #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] @@ -67,7 +67,10 @@ impl ChallengeHealth { /// Check if the challenge is operational (healthy or degraded) pub fn is_operational(&self) -> bool { - matches!(self.status, HealthStatus::Healthy | HealthStatus::Degraded(_)) + matches!( + self.status, + HealthStatus::Healthy | HealthStatus::Degraded(_) + ) } /// Record a successful health check diff --git a/crates/challenge-registry/src/lifecycle.rs b/crates/challenge-registry/src/lifecycle.rs index a2ba334..8e99e44 100644 --- a/crates/challenge-registry/src/lifecycle.rs +++ b/crates/challenge-registry/src/lifecycle.rs @@ -137,7 +137,9 @@ mod tests { fn test_valid_transitions() { let lifecycle = ChallengeLifecycle::new(); - assert!(lifecycle.is_valid_transition(&LifecycleState::Registered, &LifecycleState::Starting)); + assert!( + lifecycle.is_valid_transition(&LifecycleState::Registered, &LifecycleState::Starting) + ); assert!(lifecycle.is_valid_transition(&LifecycleState::Starting, &LifecycleState::Running)); assert!(lifecycle.is_valid_transition(&LifecycleState::Running, &LifecycleState::Stopping)); assert!(lifecycle.is_valid_transition(&LifecycleState::Stopping, &LifecycleState::Stopped)); @@ -147,14 +149,15 @@ mod tests { fn test_invalid_transitions() { let lifecycle = ChallengeLifecycle::new(); - assert!(!lifecycle.is_valid_transition(&LifecycleState::Registered, &LifecycleState::Running)); + assert!( + !lifecycle.is_valid_transition(&LifecycleState::Registered, &LifecycleState::Running) + ); assert!(!lifecycle.is_valid_transition(&LifecycleState::Stopped, &LifecycleState::Running)); } #[test] fn test_lifecycle_config() { - let lifecycle = ChallengeLifecycle::new() - .with_auto_restart(false, 5); + let lifecycle = ChallengeLifecycle::new().with_auto_restart(false, 5); assert!(!lifecycle.auto_restart_enabled()); assert_eq!(lifecycle.max_restart_attempts(), 5); diff --git a/crates/challenge-registry/src/migration.rs b/crates/challenge-registry/src/migration.rs index 002c543..dae3a1a 100644 --- a/crates/challenge-registry/src/migration.rs +++ b/crates/challenge-registry/src/migration.rs @@ -143,7 +143,10 @@ impl MigrationPlan { /// Check if migration is complete pub fn is_complete(&self) -> bool { - matches!(self.status, 
MigrationStatus::Completed | MigrationStatus::RolledBack) + matches!( + self.status, + MigrationStatus::Completed | MigrationStatus::RolledBack + ) } /// Check if migration can be rolled back @@ -229,7 +232,12 @@ impl ChallengeMigration { )); } - let mut plan = MigrationPlan::new(challenge_id, challenge_name, from_version.clone(), to_version.clone()); + let mut plan = MigrationPlan::new( + challenge_id, + challenge_name, + from_version.clone(), + to_version.clone(), + ); // Generate migration steps based on version difference // This is a simplified version - real implementation would analyze schemas @@ -237,7 +245,10 @@ impl ChallengeMigration { plan.add_step( MigrationStep::new( "major_upgrade".to_string(), - format!("Major version upgrade from {} to {}", from_version.major, to_version.major), + format!( + "Major version upgrade from {} to {}", + from_version.major, to_version.major + ), from_version.clone(), to_version.clone(), ) @@ -248,7 +259,10 @@ impl ChallengeMigration { plan.add_step( MigrationStep::new( "minor_upgrade".to_string(), - format!("Minor version upgrade from {} to {}", from_version, to_version), + format!( + "Minor version upgrade from {} to {}", + from_version, to_version + ), from_version.clone(), to_version.clone(), ) @@ -258,7 +272,10 @@ impl ChallengeMigration { plan.add_step( MigrationStep::new( "patch_upgrade".to_string(), - format!("Patch version upgrade from {} to {}", from_version, to_version), + format!( + "Patch version upgrade from {} to {}", + from_version, to_version + ), from_version, to_version, ) @@ -419,7 +436,10 @@ mod tests { let active = migration.get_active_migration(&id); assert!(active.is_some()); - assert!(matches!(active.unwrap().status, MigrationStatus::InProgress)); + assert!(matches!( + active.unwrap().status, + MigrationStatus::InProgress + )); let complete = migration.complete_step(&id).unwrap(); assert!(complete); diff --git a/crates/challenge-registry/src/registry.rs b/crates/challenge-registry/src/registry.rs index 1c2a0bd..39ad982 100644 --- a/crates/challenge-registry/src/registry.rs +++ b/crates/challenge-registry/src/registry.rs @@ -112,10 +112,7 @@ impl ChallengeRegistry { let name = entry.name.clone(); let state_store = Arc::new(StateStore::new(id)); - let registered = RegisteredChallenge { - entry, - state_store, - }; + let registered = RegisteredChallenge { entry, state_store }; challenges.insert(id, registered); name_index.insert(name.clone(), id); @@ -178,11 +175,7 @@ impl ChallengeRegistry { } /// Update challenge lifecycle state - pub fn update_state( - &self, - id: &ChallengeId, - new_state: LifecycleState, - ) -> RegistryResult<()> { + pub fn update_state(&self, id: &ChallengeId, new_state: LifecycleState) -> RegistryResult<()> { let mut challenges = self.challenges.write(); let registered = challenges .get_mut(id) @@ -264,7 +257,10 @@ impl ChallengeRegistry { /// Get state store for a challenge pub fn state_store(&self, id: &ChallengeId) -> Option> { - self.challenges.read().get(id).map(|r| r.state_store.clone()) + self.challenges + .read() + .get(id) + .map(|r| r.state_store.clone()) } /// Add event listener @@ -398,7 +394,9 @@ mod tests { ); let id = registry.register(entry).unwrap(); - let old = registry.update_version(&id, ChallengeVersion::new(1, 1, 0)).unwrap(); + let old = registry + .update_version(&id, ChallengeVersion::new(1, 1, 0)) + .unwrap(); assert_eq!(old, ChallengeVersion::new(1, 0, 0)); @@ -426,7 +424,9 @@ mod tests { registry.register(entry2).unwrap(); // Make first one active - 
registry.update_state(&id1, LifecycleState::Running).unwrap(); + registry + .update_state(&id1, LifecycleState::Running) + .unwrap(); registry.update_health(&id1, HealthStatus::Healthy).unwrap(); let active = registry.list_active(); diff --git a/crates/core/src/checkpoint.rs b/crates/core/src/checkpoint.rs index b627e4a..12e32c1 100644 --- a/crates/core/src/checkpoint.rs +++ b/crates/core/src/checkpoint.rs @@ -244,8 +244,8 @@ impl CheckpointManager { // Create header let header = CheckpointHeader::new(sequence, data_hash, data_bytes.len() as u64); - let header_bytes = - bincode::serialize(&header).map_err(|e| MiniChainError::Serialization(e.to_string()))?; + let header_bytes = bincode::serialize(&header) + .map_err(|e| MiniChainError::Serialization(e.to_string()))?; // Write to file atomically (write to temp, then rename) let temp_filename = filename.with_extension("tmp"); @@ -320,9 +320,9 @@ impl CheckpointManager { // Read header length let mut header_len_bytes = [0u8; 4]; - reader.read_exact(&mut header_len_bytes).map_err(|e| { - MiniChainError::Storage(format!("Failed to read header length: {}", e)) - })?; + reader + .read_exact(&mut header_len_bytes) + .map_err(|e| MiniChainError::Storage(format!("Failed to read header length: {}", e)))?; let header_len = u32::from_le_bytes(header_len_bytes) as usize; // Read header @@ -598,10 +598,7 @@ mod tests { let (header, data) = manager.load_checkpoint(2).unwrap().unwrap(); assert_eq!(header.sequence, 2); assert_eq!(data.epoch, 10); - assert_eq!( - data.metadata.get("test_key"), - Some(&"value_1".to_string()) - ); + assert_eq!(data.metadata.get("test_key"), Some(&"value_1".to_string())); } #[test] diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index ef802c6..5936e5f 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -25,7 +25,7 @@ pub use crypto::*; pub use error::*; pub use message::*; pub use restoration::{ - CheckpointInfo, RestorationManager, RestorationOptions, RestorationResult, Restorable, + CheckpointInfo, Restorable, RestorationManager, RestorationOptions, RestorationResult, }; pub use schema_guard::{verify_schema_integrity, SchemaError}; pub use state::*; diff --git a/crates/core/src/restoration.rs b/crates/core/src/restoration.rs index c2a5eda..53db1a9 100644 --- a/crates/core/src/restoration.rs +++ b/crates/core/src/restoration.rs @@ -507,7 +507,10 @@ mod tests { assert!(result.is_some()); let (_res, restored_data) = result.unwrap(); assert_eq!(restored_data.pending_evaluations.len(), 1); - assert_eq!(restored_data.pending_evaluations[0].challenge_id, challenge1); + assert_eq!( + restored_data.pending_evaluations[0].challenge_id, + challenge1 + ); } #[test] diff --git a/tests/checkpoint_tests.rs b/tests/checkpoint_tests.rs index 3b9421e..1bccf51 100644 --- a/tests/checkpoint_tests.rs +++ b/tests/checkpoint_tests.rs @@ -3,8 +3,8 @@ //! Tests for verifying the checkpoint/restoration system works correctly end-to-end. 
use platform_core::{ - CheckpointData, CheckpointManager, CompletedEvaluationState, PendingEvaluationState, - WeightVoteState, RestorationManager, RestorationOptions, ChallengeId, Hotkey, + ChallengeId, CheckpointData, CheckpointManager, CompletedEvaluationState, Hotkey, + PendingEvaluationState, RestorationManager, RestorationOptions, WeightVoteState, }; use std::collections::HashMap; use tempfile::tempdir; @@ -197,9 +197,7 @@ fn test_checkpoint_hash_verification() { let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); let data = create_test_data(); - let path = manager - .create_checkpoint(&data) - .expect("Failed to create"); + let path = manager.create_checkpoint(&data).expect("Failed to create"); // Corrupt the file let mut content = std::fs::read(&path).expect("Failed to read"); @@ -237,9 +235,7 @@ fn test_weight_votes_persistence() { final_weights: Some(vec![(0, 4500), (1, 6500), (2, 7000)]), }); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); let (_, loaded) = manager .load_latest() @@ -262,12 +258,9 @@ fn test_checkpoint_info() { let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); let data = create_test_data(); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); - let restoration = - RestorationManager::with_defaults(dir.path()).expect("Failed to create"); + let restoration = RestorationManager::with_defaults(dir.path()).expect("Failed to create"); let infos = restoration.list_available().expect("Failed to list"); assert_eq!(infos.len(), 1); @@ -304,9 +297,7 @@ fn test_pending_evaluation_scores_persistence() { finalizing: true, }); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); let (_, loaded) = manager .load_latest() @@ -331,13 +322,10 @@ fn test_checkpoint_sequence_resume() { // First manager creates checkpoints { - let mut manager = - CheckpointManager::new(dir.path(), 10).expect("Failed to create manager"); + let mut manager = CheckpointManager::new(dir.path(), 10).expect("Failed to create manager"); for i in 0..5 { let data = CheckpointData::new(i, i, 100); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); } assert_eq!(manager.current_sequence(), 5); } @@ -359,9 +347,7 @@ fn test_load_specific_checkpoint() { let mut data = CheckpointData::new(i, i * 10, 100); data.metadata .insert("marker".to_string(), format!("checkpoint_{}", i)); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); } // Load specific checkpoint (sequence 2) @@ -371,7 +357,10 @@ fn test_load_specific_checkpoint() { .expect("Not found"); assert_eq!(header.sequence, 2); assert_eq!(data.epoch, 10); - assert_eq!(data.metadata.get("marker"), Some(&"checkpoint_1".to_string())); + assert_eq!( + data.metadata.get("marker"), + Some(&"checkpoint_1".to_string()) + ); } // ============================================================================ @@ -384,13 +373,14 @@ fn test_checkpoint_metadata_persistence() { let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); let mut data = CheckpointData::new(1, 5, 100); - data.metadata.insert("version".to_string(), "1.0.0".to_string()); - 
data.metadata.insert("node_id".to_string(), "validator_1".to_string()); - data.metadata.insert("custom_key".to_string(), "custom_value".to_string()); + data.metadata + .insert("version".to_string(), "1.0.0".to_string()); + data.metadata + .insert("node_id".to_string(), "validator_1".to_string()); + data.metadata + .insert("custom_key".to_string(), "custom_value".to_string()); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); let (_, loaded) = manager .load_latest() @@ -399,8 +389,14 @@ fn test_checkpoint_metadata_persistence() { assert_eq!(loaded.metadata.len(), 3); assert_eq!(loaded.metadata.get("version"), Some(&"1.0.0".to_string())); - assert_eq!(loaded.metadata.get("node_id"), Some(&"validator_1".to_string())); - assert_eq!(loaded.metadata.get("custom_key"), Some(&"custom_value".to_string())); + assert_eq!( + loaded.metadata.get("node_id"), + Some(&"validator_1".to_string()) + ); + assert_eq!( + loaded.metadata.get("custom_key"), + Some(&"custom_value".to_string()) + ); } // ============================================================================ @@ -414,7 +410,7 @@ fn test_completed_evaluations_persistence() { let challenge_id = ChallengeId::new(); let mut data = CheckpointData::new(1, 5, 100); - + for i in 0..5 { data.completed_evaluations.push(CompletedEvaluationState { submission_id: format!("completed_{}", i), @@ -425,9 +421,7 @@ fn test_completed_evaluations_persistence() { }); } - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); let (_, loaded) = manager .load_latest() @@ -435,7 +429,7 @@ fn test_completed_evaluations_persistence() { .expect("No checkpoint"); assert_eq!(loaded.completed_evaluations.len(), 5); - + // Verify score ordering is preserved for (i, eval) in loaded.completed_evaluations.iter().enumerate() { let expected_score = 0.80 + (i as f64 * 0.04); @@ -456,9 +450,7 @@ fn test_checkpoint_with_empty_state() { // Empty checkpoint data let data = CheckpointData::new(0, 0, 100); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); let (_, loaded) = manager .load_latest() @@ -492,9 +484,7 @@ fn test_restoration_validates_epoch() { created_at: chrono::Utc::now().timestamp_millis(), finalizing: false, }); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); // With validation enabled, this should fail let options = RestorationOptions::new() @@ -521,9 +511,7 @@ fn test_restoration_validates_submission_id() { created_at: chrono::Utc::now().timestamp_millis(), finalizing: false, }); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); // With validation enabled, this should fail let options = RestorationOptions::new()