From 1c24a12bb25661c77568e1992b7ccdff31e8f039 Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 10:57:32 +0000 Subject: [PATCH 1/8] feat: add challenges directory structure and workspace configuration --- Cargo.toml | 15 +++++++++++++++ challenges/.gitkeep | 1 + challenges/README.md | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) create mode 100644 challenges/.gitkeep create mode 100644 challenges/README.md diff --git a/Cargo.toml b/Cargo.toml index 147a9dd..c2e5098 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,10 @@ members = [ # Note: WASM runtime removed - updates via git, version checked at handshake # Note: P2P-only architecture - no centralized platform-server +# Challenge crates can be added here or as optional path/git dependencies +# Example: +# "challenges/example-challenge", + [workspace.package] version = "0.1.0" edition = "2021" @@ -86,6 +90,9 @@ reqwest = { version = "0.12", features = ["json"] } [patch.crates-io] w3f-bls = { git = "https://github.com/opentensor/bls", branch = "fix-no-std" } +# Challenge integration (optional - add to crates that need dynamic loading) +libloading = "0.8" + # Clippy lints configuration [workspace.lints.clippy] # Allow these patterns that are intentional in this codebase @@ -95,3 +102,11 @@ type_complexity = "allow" await_holding_lock = "warn" # TODO: Fix async lock issues properly collapsible_match = "allow" collapsible_if = "allow" + +# Workspace-level feature flags for challenge integration +# Individual crates can enable these by adding features in their Cargo.toml: +# [features] +# dynamic-challenges = ["libloading"] +[workspace.metadata.challenge-features] +# Enable dynamic challenge loading (crates opt-in via features) +dynamic-loading-available = true diff --git a/challenges/.gitkeep b/challenges/.gitkeep new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/challenges/.gitkeep @@ -0,0 +1 @@ + diff --git a/challenges/README.md b/challenges/README.md new file mode 100644 index 0000000..6cecfe1 --- /dev/null +++ b/challenges/README.md @@ -0,0 +1,37 @@ +# Platform Challenge Crates + +This directory contains challenge crates that can be integrated with the Platform validator network. + +## Directory Structure + +``` +challenges/ +├── README.md # This file +├── example-challenge/ # Example challenge template (future) +└── [your-challenge]/ # Your custom challenge crate +``` + +## Adding a New Challenge Crate + +1. Create your challenge crate in this directory or reference it as a git dependency +2. Implement the `Challenge` trait from `platform-challenge-sdk` +3. Register your challenge in the challenge registry +4. Update the workspace `Cargo.toml` if adding locally + +## External Challenge Crates + +Challenge crates can also be external (like term-challenge). They should: +- Import `platform-challenge-sdk` as a dependency +- Implement the `ServerChallenge` trait +- Provide Docker configuration for evaluation + +## Challenge Crate Requirements + +- Must implement `platform-challenge-sdk::ServerChallenge` +- Must provide `/evaluate` HTTP endpoint +- Must handle graceful shutdown signals +- Must support state persistence for hot-reload + +## Example + +See [term-challenge](https://github.com/PlatformNetwork/term-challenge) for a complete example. 
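The README above names `platform-challenge-sdk::ServerChallenge`, but the trait itself is not part of this patch, so the following is only a rough sketch of what a minimal challenge crate might implement. The trait shape, the `EvaluationRequest`/`EvaluationResult` types, and the `evaluate`/`shutdown`/`snapshot_state` method names are assumptions derived from the requirement list (an `/evaluate` endpoint, graceful shutdown handling, and state persistence for hot-reload), not the SDK's real API.

```rust
// Hypothetical sketch only: the real trait lives in `platform-challenge-sdk`
// and its signature is not shown in this patch.
// Assumes serde with the `derive` feature plus serde_json.
use serde::{Deserialize, Serialize};

/// Assumed request body for the `/evaluate` HTTP endpoint.
#[derive(Debug, Deserialize)]
pub struct EvaluationRequest {
    pub job_id: String,
    pub payload: serde_json::Value,
}

/// Assumed response body returned to the validator.
#[derive(Debug, Serialize)]
pub struct EvaluationResult {
    pub job_id: String,
    pub score: f64,
}

/// Stand-in for `platform_challenge_sdk::ServerChallenge` (shape assumed).
pub trait ServerChallenge {
    /// Handle one evaluation request received on `/evaluate`.
    fn evaluate(&self, request: EvaluationRequest) -> EvaluationResult;

    /// Called on shutdown signals so in-flight work can be flushed.
    fn shutdown(&self);

    /// Serialize state so it can be restored after a hot-reload.
    fn snapshot_state(&self) -> Vec<u8>;
}

pub struct MyChallenge;

impl ServerChallenge for MyChallenge {
    fn evaluate(&self, request: EvaluationRequest) -> EvaluationResult {
        // Score the submission however the challenge defines scoring.
        EvaluationResult {
            job_id: request.job_id,
            score: 1.0,
        }
    }

    fn shutdown(&self) {
        // Persist or abort in-flight evaluations before the container stops.
    }

    fn snapshot_state(&self) -> Vec<u8> {
        // This example keeps no state between reloads.
        Vec::new()
    }
}
```

Wiring the `/evaluate` route to an HTTP server and providing the Docker configuration are challenge-specific and are left out here; the term-challenge repository linked above is the complete, working reference.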
From d8f6e576a4b51066a5f31a0d3a4900b867de4e9d Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 11:04:10 +0000 Subject: [PATCH 2/8] feat: add challenge-registry crate for challenge lifecycle management Create new platform-challenge-registry crate with: - Challenge discovery and registration - Version management (semver-based) - Lifecycle state machine (registered/starting/running/stopping/stopped) - Health monitoring with configurable checks - State persistence and hot-reload support - Migration planning for version upgrades Modules: - registry: Main registry with CRUD operations - lifecycle: State machine for challenge states - health: Health monitoring and status tracking - state: State snapshots for hot-reload - discovery: Challenge discovery from various sources - migration: Version migration planning - version: Semantic versioning support - error: Registry-specific error types --- Cargo.lock | 27 ++ Cargo.toml | 4 +- crates/challenge-registry/Cargo.toml | 42 ++ crates/challenge-registry/src/discovery.rs | 299 +++++++++++++ crates/challenge-registry/src/error.rs | 61 +++ crates/challenge-registry/src/health.rs | 259 ++++++++++++ crates/challenge-registry/src/lib.rs | 41 ++ crates/challenge-registry/src/lifecycle.rs | 162 +++++++ crates/challenge-registry/src/migration.rs | 467 +++++++++++++++++++++ crates/challenge-registry/src/registry.rs | 464 ++++++++++++++++++++ crates/challenge-registry/src/state.rs | 316 ++++++++++++++ crates/challenge-registry/src/version.rs | 164 ++++++++ 12 files changed, 2303 insertions(+), 3 deletions(-) create mode 100644 crates/challenge-registry/Cargo.toml create mode 100644 crates/challenge-registry/src/discovery.rs create mode 100644 crates/challenge-registry/src/error.rs create mode 100644 crates/challenge-registry/src/health.rs create mode 100644 crates/challenge-registry/src/lib.rs create mode 100644 crates/challenge-registry/src/lifecycle.rs create mode 100644 crates/challenge-registry/src/migration.rs create mode 100644 crates/challenge-registry/src/registry.rs create mode 100644 crates/challenge-registry/src/state.rs create mode 100644 crates/challenge-registry/src/version.rs diff --git a/Cargo.lock b/Cargo.lock index 58e2c21..462e0ac 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4715,6 +4715,33 @@ dependencies = [ "tracing", ] +[[package]] +name = "platform-challenge-registry" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-trait", + "bincode", + "chrono", + "futures", + "hex", + "parking_lot 0.12.5", + "platform-challenge-sdk", + "platform-core", + "platform-storage", + "reqwest 0.12.25", + "semver", + "serde", + "serde_json", + "sha2 0.10.9", + "tempfile", + "thiserror 2.0.17", + "tokio", + "tokio-test", + "tracing", + "uuid", +] + [[package]] name = "platform-challenge-sdk" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index c2e5098..44aa74e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "crates/storage", "crates/distributed-storage", "crates/challenge-sdk", + "crates/challenge-registry", "crates/epoch", "crates/bittensor-integration", "crates/subnet-manager", @@ -90,9 +91,6 @@ reqwest = { version = "0.12", features = ["json"] } [patch.crates-io] w3f-bls = { git = "https://github.com/opentensor/bls", branch = "fix-no-std" } -# Challenge integration (optional - add to crates that need dynamic loading) -libloading = "0.8" - # Clippy lints configuration [workspace.lints.clippy] # Allow these patterns that are intentional in this codebase diff --git a/crates/challenge-registry/Cargo.toml 
b/crates/challenge-registry/Cargo.toml new file mode 100644 index 0000000..5fe3946 --- /dev/null +++ b/crates/challenge-registry/Cargo.toml @@ -0,0 +1,42 @@ +[package] +name = "platform-challenge-registry" +version.workspace = true +edition.workspace = true +description = "Challenge registry and lifecycle management for Platform Network" + +[dependencies] +platform-core = { path = "../core" } +platform-challenge-sdk = { path = "../challenge-sdk" } +platform-storage = { path = "../storage" } + +# Async +tokio = { workspace = true } +async-trait = { workspace = true } +futures = { workspace = true } + +# Serialization +serde = { workspace = true } +serde_json = { workspace = true } +bincode = { workspace = true } + +# Utils +tracing = { workspace = true } +thiserror = { workspace = true } +anyhow = { workspace = true } +chrono = { workspace = true } +parking_lot = { workspace = true } +uuid = { workspace = true } + +# Crypto for checksums +sha2 = { workspace = true } +hex = { workspace = true } + +# Versioning +semver = "1.0" + +# Health checks +reqwest = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } +tokio-test = { workspace = true } diff --git a/crates/challenge-registry/src/discovery.rs b/crates/challenge-registry/src/discovery.rs new file mode 100644 index 0000000..776bf65 --- /dev/null +++ b/crates/challenge-registry/src/discovery.rs @@ -0,0 +1,299 @@ +//! Challenge discovery and auto-registration +//! +//! Discovers challenges from: +//! - Docker registry +//! - File system (local development) +//! - Network announcements (P2P) + +use crate::error::{RegistryError, RegistryResult}; +use crate::version::ChallengeVersion; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; + +/// A discovered challenge that can be registered +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct DiscoveredChallenge { + /// Challenge name + pub name: String, + /// Challenge version + pub version: ChallengeVersion, + /// Docker image (if available) + pub docker_image: Option, + /// Local path (for development) + pub local_path: Option, + /// Health endpoint URL + pub health_endpoint: Option, + /// Evaluation endpoint URL + pub evaluation_endpoint: Option, + /// Challenge metadata + pub metadata: ChallengeMetadata, + /// Source of discovery + pub source: DiscoverySource, +} + +/// Metadata about a challenge +#[derive(Clone, Debug, Default, Serialize, Deserialize)] +pub struct ChallengeMetadata { + /// Human-readable description + pub description: Option, + /// Challenge author + pub author: Option, + /// Repository URL + pub repository: Option, + /// License + pub license: Option, + /// Tags for categorization + pub tags: Vec, + /// Minimum platform version required + pub min_platform_version: Option, +} + +/// Source where a challenge was discovered +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub enum DiscoverySource { + /// Discovered from Docker registry + DockerRegistry(String), + /// Discovered from local filesystem + LocalFilesystem(PathBuf), + /// Announced via P2P network + P2PNetwork(String), + /// Manually configured + Manual, +} + +/// Configuration for challenge discovery +#[derive(Clone, Debug)] +pub struct DiscoveryConfig { + /// Docker registries to scan + pub docker_registries: Vec, + /// Local paths to scan + pub local_paths: Vec, + /// Enable P2P discovery + pub enable_p2p: bool, + /// Auto-register discovered challenges + pub auto_register: bool, + /// Scan interval in seconds + pub scan_interval_secs: u64, +} + +impl 
Default for DiscoveryConfig { + fn default() -> Self { + Self { + docker_registries: vec![], + local_paths: vec![], + enable_p2p: true, + auto_register: false, + scan_interval_secs: 300, // 5 minutes + } + } +} + +/// Discovers challenges from various sources +pub struct ChallengeDiscovery { + /// Configuration + config: DiscoveryConfig, + /// Discovered but not yet registered challenges + discovered: parking_lot::RwLock>, +} + +impl ChallengeDiscovery { + /// Create a new discovery service with default config + pub fn new() -> Self { + Self { + config: DiscoveryConfig::default(), + discovered: parking_lot::RwLock::new(Vec::new()), + } + } + + /// Create with custom config + pub fn with_config(config: DiscoveryConfig) -> Self { + Self { + config, + discovered: parking_lot::RwLock::new(Vec::new()), + } + } + + /// Get the current configuration + pub fn config(&self) -> &DiscoveryConfig { + &self.config + } + + /// Discover challenges from all configured sources + pub fn discover_all(&self) -> RegistryResult> { + let mut all_discovered = Vec::new(); + + // Discover from local paths + for path in &self.config.local_paths { + match self.discover_from_local(path) { + Ok(challenges) => all_discovered.extend(challenges), + Err(e) => { + tracing::warn!(path = ?path, error = %e, "Failed to discover from local path"); + } + } + } + + // Update internal state + let mut discovered = self.discovered.write(); + *discovered = all_discovered.clone(); + + Ok(all_discovered) + } + + /// Discover challenges from a local path + pub fn discover_from_local(&self, path: &PathBuf) -> RegistryResult> { + if !path.exists() { + return Err(RegistryError::InvalidConfig(format!( + "Path does not exist: {:?}", + path + ))); + } + + let mut challenges = Vec::new(); + + // Look for challenge.toml or Cargo.toml with challenge metadata + if path.is_dir() { + let challenge_toml = path.join("challenge.toml"); + let cargo_toml = path.join("Cargo.toml"); + + if challenge_toml.exists() { + // In a real implementation, parse challenge.toml + let name = path + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown") + .to_string(); + + challenges.push(DiscoveredChallenge { + name, + version: ChallengeVersion::default(), + docker_image: None, + local_path: Some(path.clone()), + health_endpoint: None, + evaluation_endpoint: None, + metadata: ChallengeMetadata::default(), + source: DiscoverySource::LocalFilesystem(path.clone()), + }); + } else if cargo_toml.exists() { + // Extract name from Cargo.toml + let name = path + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown") + .to_string(); + + challenges.push(DiscoveredChallenge { + name, + version: ChallengeVersion::default(), + docker_image: None, + local_path: Some(path.clone()), + health_endpoint: None, + evaluation_endpoint: None, + metadata: ChallengeMetadata::default(), + source: DiscoverySource::LocalFilesystem(path.clone()), + }); + } + } + + Ok(challenges) + } + + /// Manually add a discovered challenge + pub fn add_discovered(&self, challenge: DiscoveredChallenge) { + let mut discovered = self.discovered.write(); + discovered.push(challenge); + } + + /// Get all discovered challenges + pub fn get_discovered(&self) -> Vec { + self.discovered.read().clone() + } + + /// Clear discovered challenges + pub fn clear_discovered(&self) { + self.discovered.write().clear(); + } + + /// Check if auto-registration is enabled + pub fn auto_register_enabled(&self) -> bool { + self.config.auto_register + } +} + +impl Default for ChallengeDiscovery { + fn default() -> 
Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_discovery_source_equality() { + assert_eq!(DiscoverySource::Manual, DiscoverySource::Manual); + assert_ne!( + DiscoverySource::Manual, + DiscoverySource::P2PNetwork("test".to_string()) + ); + } + + #[test] + fn test_discovered_challenge() { + let challenge = DiscoveredChallenge { + name: "test-challenge".to_string(), + version: ChallengeVersion::new(1, 0, 0), + docker_image: Some("test:latest".to_string()), + local_path: None, + health_endpoint: Some("http://localhost:8080/health".to_string()), + evaluation_endpoint: Some("http://localhost:8080/evaluate".to_string()), + metadata: ChallengeMetadata { + description: Some("A test challenge".to_string()), + author: Some("Platform".to_string()), + ..Default::default() + }, + source: DiscoverySource::Manual, + }; + + assert_eq!(challenge.name, "test-challenge"); + assert!(challenge.docker_image.is_some()); + } + + #[test] + fn test_discovery_service() { + let discovery = ChallengeDiscovery::new(); + + assert!(discovery.get_discovered().is_empty()); + + discovery.add_discovered(DiscoveredChallenge { + name: "manual".to_string(), + version: ChallengeVersion::new(1, 0, 0), + docker_image: None, + local_path: None, + health_endpoint: None, + evaluation_endpoint: None, + metadata: ChallengeMetadata::default(), + source: DiscoverySource::Manual, + }); + + assert_eq!(discovery.get_discovered().len(), 1); + + discovery.clear_discovered(); + assert!(discovery.get_discovered().is_empty()); + } + + #[test] + fn test_discovery_config() { + let config = DiscoveryConfig { + docker_registries: vec!["registry.example.com".to_string()], + local_paths: vec![PathBuf::from("/challenges")], + enable_p2p: false, + auto_register: true, + scan_interval_secs: 60, + }; + + let discovery = ChallengeDiscovery::with_config(config); + assert!(discovery.auto_register_enabled()); + assert_eq!(discovery.config().scan_interval_secs, 60); + } +} diff --git a/crates/challenge-registry/src/error.rs b/crates/challenge-registry/src/error.rs new file mode 100644 index 0000000..369db73 --- /dev/null +++ b/crates/challenge-registry/src/error.rs @@ -0,0 +1,61 @@ +//! 
Error types for challenge registry + +use thiserror::Error; + +/// Result type for registry operations +pub type RegistryResult = Result; + +/// Errors that can occur in the challenge registry +#[derive(Error, Debug)] +pub enum RegistryError { + #[error("Challenge not found: {0}")] + ChallengeNotFound(String), + + #[error("Challenge already registered: {0}")] + AlreadyRegistered(String), + + #[error("Version conflict: {0}")] + VersionConflict(String), + + #[error("Migration failed: {0}")] + MigrationFailed(String), + + #[error("Health check failed: {0}")] + HealthCheckFailed(String), + + #[error("State persistence error: {0}")] + StatePersistence(String), + + #[error("State restoration error: {0}")] + StateRestoration(String), + + #[error("Invalid challenge configuration: {0}")] + InvalidConfig(String), + + #[error("Serialization error: {0}")] + Serialization(String), + + #[error("Network error: {0}")] + Network(String), + + #[error("Internal error: {0}")] + Internal(String), +} + +impl From for RegistryError { + fn from(err: std::io::Error) -> Self { + RegistryError::Internal(err.to_string()) + } +} + +impl From for RegistryError { + fn from(err: serde_json::Error) -> Self { + RegistryError::Serialization(err.to_string()) + } +} + +impl From for RegistryError { + fn from(err: bincode::Error) -> Self { + RegistryError::Serialization(err.to_string()) + } +} diff --git a/crates/challenge-registry/src/health.rs b/crates/challenge-registry/src/health.rs new file mode 100644 index 0000000..e142fdb --- /dev/null +++ b/crates/challenge-registry/src/health.rs @@ -0,0 +1,259 @@ +//! Health monitoring for challenges +//! +//! Monitors challenge health through: +//! - HTTP health endpoints +//! - Container status +//! - Resource usage + +use platform_core::ChallengeId; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::time::Duration; +use parking_lot::RwLock; + +/// Health status of a challenge +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum HealthStatus { + /// Health status is unknown (not yet checked) + Unknown, + /// Challenge is healthy + Healthy, + /// Challenge is degraded but operational + Degraded(String), + /// Challenge is unhealthy + Unhealthy(String), +} + +impl Default for HealthStatus { + fn default() -> Self { + Self::Unknown + } +} + +/// Detailed health information for a challenge +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ChallengeHealth { + /// Challenge identifier + pub challenge_id: ChallengeId, + /// Current health status + pub status: HealthStatus, + /// Last successful health check timestamp (millis) + pub last_check_at: i64, + /// Number of consecutive failures + pub consecutive_failures: u32, + /// Average response time in milliseconds + pub avg_response_time_ms: f64, + /// Additional health metrics + pub metrics: HashMap, +} + +impl ChallengeHealth { + /// Create new health info for a challenge + pub fn new(challenge_id: ChallengeId) -> Self { + Self { + challenge_id, + status: HealthStatus::Unknown, + last_check_at: 0, + consecutive_failures: 0, + avg_response_time_ms: 0.0, + metrics: HashMap::new(), + } + } + + /// Check if the challenge is considered healthy + pub fn is_healthy(&self) -> bool { + matches!(self.status, HealthStatus::Healthy) + } + + /// Check if the challenge is operational (healthy or degraded) + pub fn is_operational(&self) -> bool { + matches!(self.status, HealthStatus::Healthy | HealthStatus::Degraded(_)) + } + + /// Record a successful health check + pub fn record_success(&mut 
self, response_time_ms: f64) { + self.status = HealthStatus::Healthy; + self.last_check_at = chrono::Utc::now().timestamp_millis(); + self.consecutive_failures = 0; + + // Exponential moving average for response time + if self.avg_response_time_ms == 0.0 { + self.avg_response_time_ms = response_time_ms; + } else { + self.avg_response_time_ms = self.avg_response_time_ms * 0.8 + response_time_ms * 0.2; + } + } + + /// Record a failed health check + pub fn record_failure(&mut self, reason: String) { + self.consecutive_failures += 1; + self.last_check_at = chrono::Utc::now().timestamp_millis(); + + if self.consecutive_failures >= 3 { + self.status = HealthStatus::Unhealthy(reason); + } else { + self.status = HealthStatus::Degraded(reason); + } + } +} + +/// Configuration for health monitoring +#[derive(Clone, Debug)] +pub struct HealthConfig { + /// Interval between health checks + pub check_interval: Duration, + /// Timeout for health check requests + pub check_timeout: Duration, + /// Number of failures before marking unhealthy + pub failure_threshold: u32, + /// Number of successes to recover from unhealthy + pub recovery_threshold: u32, +} + +impl Default for HealthConfig { + fn default() -> Self { + Self { + check_interval: Duration::from_secs(30), + check_timeout: Duration::from_secs(5), + failure_threshold: 3, + recovery_threshold: 2, + } + } +} + +/// Monitors health of registered challenges +pub struct HealthMonitor { + /// Health state for each challenge + health_state: RwLock>, + /// Configuration + config: HealthConfig, +} + +impl HealthMonitor { + /// Create a new health monitor with default config + pub fn new() -> Self { + Self { + health_state: RwLock::new(HashMap::new()), + config: HealthConfig::default(), + } + } + + /// Create a health monitor with custom config + pub fn with_config(config: HealthConfig) -> Self { + Self { + health_state: RwLock::new(HashMap::new()), + config, + } + } + + /// Register a challenge for health monitoring + pub fn register(&self, challenge_id: ChallengeId) { + let mut state = self.health_state.write(); + state.insert(challenge_id, ChallengeHealth::new(challenge_id)); + } + + /// Unregister a challenge from health monitoring + pub fn unregister(&self, challenge_id: &ChallengeId) { + let mut state = self.health_state.write(); + state.remove(challenge_id); + } + + /// Get health status for a challenge + pub fn get_health(&self, challenge_id: &ChallengeId) -> Option { + self.health_state.read().get(challenge_id).cloned() + } + + /// Get health status for all challenges + pub fn get_all_health(&self) -> Vec { + self.health_state.read().values().cloned().collect() + } + + /// Update health status after a check + pub fn update_health(&self, challenge_id: &ChallengeId, status: HealthStatus) { + let mut state = self.health_state.write(); + if let Some(health) = state.get_mut(challenge_id) { + health.status = status; + health.last_check_at = chrono::Utc::now().timestamp_millis(); + } + } + + /// Record a successful health check + pub fn record_success(&self, challenge_id: &ChallengeId, response_time_ms: f64) { + let mut state = self.health_state.write(); + if let Some(health) = state.get_mut(challenge_id) { + health.record_success(response_time_ms); + } + } + + /// Record a failed health check + pub fn record_failure(&self, challenge_id: &ChallengeId, reason: String) { + let mut state = self.health_state.write(); + if let Some(health) = state.get_mut(challenge_id) { + health.record_failure(reason); + } + } + + /// Get the health config + pub fn 
config(&self) -> &HealthConfig { + &self.config + } +} + +impl Default for HealthMonitor { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_health_status() { + let mut health = ChallengeHealth::new(ChallengeId::new()); + + assert_eq!(health.status, HealthStatus::Unknown); + assert!(!health.is_healthy()); + + health.record_success(50.0); + assert!(health.is_healthy()); + assert!(health.is_operational()); + + health.record_failure("timeout".to_string()); + assert!(!health.is_healthy()); + assert!(health.is_operational()); // Still degraded + + health.record_failure("timeout".to_string()); + health.record_failure("timeout".to_string()); + assert!(!health.is_operational()); // Now unhealthy + } + + #[test] + fn test_health_monitor() { + let monitor = HealthMonitor::new(); + let id = ChallengeId::new(); + + monitor.register(id); + assert!(monitor.get_health(&id).is_some()); + + monitor.record_success(&id, 100.0); + let health = monitor.get_health(&id).unwrap(); + assert!(health.is_healthy()); + + monitor.unregister(&id); + assert!(monitor.get_health(&id).is_none()); + } + + #[test] + fn test_response_time_averaging() { + let mut health = ChallengeHealth::new(ChallengeId::new()); + + health.record_success(100.0); + assert_eq!(health.avg_response_time_ms, 100.0); + + health.record_success(200.0); + // 100 * 0.8 + 200 * 0.2 = 80 + 40 = 120 + assert!((health.avg_response_time_ms - 120.0).abs() < 0.01); + } +} diff --git a/crates/challenge-registry/src/lib.rs b/crates/challenge-registry/src/lib.rs new file mode 100644 index 0000000..6161212 --- /dev/null +++ b/crates/challenge-registry/src/lib.rs @@ -0,0 +1,41 @@ +//! Challenge Registry for Platform Network +//! +//! Manages the lifecycle of challenge crates including: +//! - Challenge discovery and registration +//! - Version management and migrations +//! - Hot-reload support with state preservation +//! - Health monitoring +//! +//! # Architecture +//! +//! ```text +//! ┌─────────────────────────────────────────────────────────────┐ +//! │ Challenge Registry │ +//! ├─────────────────────────────────────────────────────────────┤ +//! │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ +//! │ │ Discovery │ │ Lifecycle │ │ Health │ │ +//! │ │ Manager │ │ Manager │ │ Monitor │ │ +//! │ └─────────────┘ └─────────────┘ └─────────────┘ │ +//! ├─────────────────────────────────────────────────────────────┤ +//! │ Challenge State Store │ +//! │ (evaluations, checkpoints, migrations) │ +//! └─────────────────────────────────────────────────────────────┘ +//! ``` + +pub mod discovery; +pub mod error; +pub mod health; +pub mod lifecycle; +pub mod migration; +pub mod registry; +pub mod state; +pub mod version; + +pub use discovery::{ChallengeDiscovery, DiscoveredChallenge}; +pub use error::{RegistryError, RegistryResult}; +pub use health::{ChallengeHealth, HealthMonitor, HealthStatus}; +pub use lifecycle::{ChallengeLifecycle, LifecycleEvent, LifecycleState}; +pub use migration::{ChallengeMigration, MigrationPlan, MigrationStatus}; +pub use registry::{ChallengeEntry, ChallengeRegistry, RegisteredChallenge}; +pub use state::{ChallengeState, StateSnapshot, StateStore}; +pub use version::{ChallengeVersion, VersionConstraint, VersionedChallenge}; diff --git a/crates/challenge-registry/src/lifecycle.rs b/crates/challenge-registry/src/lifecycle.rs new file mode 100644 index 0000000..a2ba334 --- /dev/null +++ b/crates/challenge-registry/src/lifecycle.rs @@ -0,0 +1,162 @@ +//! 
Challenge lifecycle management +//! +//! Handles state transitions for challenges: +//! Registered -> Starting -> Running -> Stopping -> Stopped + +use crate::version::ChallengeVersion; +use platform_core::ChallengeId; +use serde::{Deserialize, Serialize}; + +/// State of a challenge in its lifecycle +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum LifecycleState { + /// Challenge is registered but not started + Registered, + /// Challenge is starting up + Starting, + /// Challenge is running and accepting evaluations + Running, + /// Challenge is being stopped gracefully + Stopping, + /// Challenge is stopped + Stopped, + /// Challenge failed to start or crashed + Failed(String), + /// Challenge is being migrated to a new version + Migrating, +} + +impl Default for LifecycleState { + fn default() -> Self { + Self::Registered + } +} + +/// Events emitted during lifecycle transitions +#[derive(Clone, Debug)] +pub enum LifecycleEvent { + /// Challenge was registered + Registered { challenge_id: ChallengeId }, + /// Challenge was unregistered + Unregistered { challenge_id: ChallengeId }, + /// Challenge state changed + StateChanged { + challenge_id: ChallengeId, + old_state: LifecycleState, + new_state: LifecycleState, + }, + /// Challenge version changed (hot-reload) + VersionChanged { + challenge_id: ChallengeId, + old_version: ChallengeVersion, + new_version: ChallengeVersion, + }, +} + +/// Manages challenge lifecycle transitions +pub struct ChallengeLifecycle { + /// Whether to allow automatic restarts on failure + auto_restart: bool, + /// Maximum restart attempts + max_restart_attempts: u32, +} + +impl ChallengeLifecycle { + /// Create a new lifecycle manager + pub fn new() -> Self { + Self { + auto_restart: true, + max_restart_attempts: 3, + } + } + + /// Configure auto-restart behavior + pub fn with_auto_restart(mut self, enabled: bool, max_attempts: u32) -> Self { + self.auto_restart = enabled; + self.max_restart_attempts = max_attempts; + self + } + + /// Check if a state transition is valid + pub fn is_valid_transition(&self, from: &LifecycleState, to: &LifecycleState) -> bool { + match (from, to) { + // From Registered + (LifecycleState::Registered, LifecycleState::Starting) => true, + (LifecycleState::Registered, LifecycleState::Stopped) => true, + + // From Starting + (LifecycleState::Starting, LifecycleState::Running) => true, + (LifecycleState::Starting, LifecycleState::Failed(_)) => true, + + // From Running + (LifecycleState::Running, LifecycleState::Stopping) => true, + (LifecycleState::Running, LifecycleState::Failed(_)) => true, + (LifecycleState::Running, LifecycleState::Migrating) => true, + + // From Stopping + (LifecycleState::Stopping, LifecycleState::Stopped) => true, + + // From Stopped + (LifecycleState::Stopped, LifecycleState::Starting) => true, + (LifecycleState::Stopped, LifecycleState::Registered) => true, + + // From Failed + (LifecycleState::Failed(_), LifecycleState::Starting) => true, + (LifecycleState::Failed(_), LifecycleState::Stopped) => true, + + // From Migrating + (LifecycleState::Migrating, LifecycleState::Running) => true, + (LifecycleState::Migrating, LifecycleState::Failed(_)) => true, + + _ => false, + } + } + + /// Check if auto-restart is enabled + pub fn auto_restart_enabled(&self) -> bool { + self.auto_restart + } + + /// Get max restart attempts + pub fn max_restart_attempts(&self) -> u32 { + self.max_restart_attempts + } +} + +impl Default for ChallengeLifecycle { + fn default() -> Self { + Self::new() + } +} + 
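As a usage sketch that is not part of this patch: the transition table above is advisory, and `ChallengeRegistry::update_state` (defined later in this same patch) does not consult it, so a caller would combine the two roughly as below. The `try_start` helper and its choice of `RegistryError::InvalidConfig` are illustrative only.

```rust
// Illustrative helper (not in the patch): gate a registry state change on the
// lifecycle transition table before recording it.
use platform_challenge_registry::{
    ChallengeLifecycle, ChallengeRegistry, LifecycleState, RegistryError, RegistryResult,
};
use platform_core::ChallengeId;

fn try_start(
    registry: &ChallengeRegistry,
    lifecycle: &ChallengeLifecycle,
    id: &ChallengeId,
    current: &LifecycleState,
) -> RegistryResult<()> {
    if lifecycle.is_valid_transition(current, &LifecycleState::Starting) {
        // Registered, Stopped, or Failed -> Starting is allowed by the table.
        registry.update_state(id, LifecycleState::Starting)
    } else {
        Err(RegistryError::InvalidConfig(format!(
            "cannot start challenge from state {:?}",
            current
        )))
    }
}
```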
+#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_valid_transitions() { + let lifecycle = ChallengeLifecycle::new(); + + assert!(lifecycle.is_valid_transition(&LifecycleState::Registered, &LifecycleState::Starting)); + assert!(lifecycle.is_valid_transition(&LifecycleState::Starting, &LifecycleState::Running)); + assert!(lifecycle.is_valid_transition(&LifecycleState::Running, &LifecycleState::Stopping)); + assert!(lifecycle.is_valid_transition(&LifecycleState::Stopping, &LifecycleState::Stopped)); + } + + #[test] + fn test_invalid_transitions() { + let lifecycle = ChallengeLifecycle::new(); + + assert!(!lifecycle.is_valid_transition(&LifecycleState::Registered, &LifecycleState::Running)); + assert!(!lifecycle.is_valid_transition(&LifecycleState::Stopped, &LifecycleState::Running)); + } + + #[test] + fn test_lifecycle_config() { + let lifecycle = ChallengeLifecycle::new() + .with_auto_restart(false, 5); + + assert!(!lifecycle.auto_restart_enabled()); + assert_eq!(lifecycle.max_restart_attempts(), 5); + } +} diff --git a/crates/challenge-registry/src/migration.rs b/crates/challenge-registry/src/migration.rs new file mode 100644 index 0000000..002c543 --- /dev/null +++ b/crates/challenge-registry/src/migration.rs @@ -0,0 +1,467 @@ +//! Challenge migration support +//! +//! Handles version migrations for challenges: +//! - Schema migrations +//! - State transformations +//! - Rollback support + +use crate::error::{RegistryError, RegistryResult}; +use crate::version::ChallengeVersion; +use platform_core::ChallengeId; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +/// Status of a migration +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum MigrationStatus { + /// Migration is pending + Pending, + /// Migration is in progress + InProgress, + /// Migration completed successfully + Completed, + /// Migration failed + Failed(String), + /// Migration was rolled back + RolledBack, +} + +/// A single migration step +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct MigrationStep { + /// Step identifier + pub id: String, + /// Description of what this step does + pub description: String, + /// From version + pub from_version: ChallengeVersion, + /// To version + pub to_version: ChallengeVersion, + /// Whether this step is reversible + pub reversible: bool, + /// Estimated duration in seconds + pub estimated_duration_secs: u64, +} + +impl MigrationStep { + /// Create a new migration step + pub fn new( + id: String, + description: String, + from: ChallengeVersion, + to: ChallengeVersion, + ) -> Self { + Self { + id, + description, + from_version: from, + to_version: to, + reversible: true, + estimated_duration_secs: 60, + } + } + + /// Mark step as irreversible + pub fn irreversible(mut self) -> Self { + self.reversible = false; + self + } + + /// Set estimated duration + pub fn with_duration(mut self, secs: u64) -> Self { + self.estimated_duration_secs = secs; + self + } +} + +/// A plan for migrating a challenge between versions +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct MigrationPlan { + /// Challenge being migrated + pub challenge_id: ChallengeId, + /// Challenge name + pub challenge_name: String, + /// Source version + pub from_version: ChallengeVersion, + /// Target version + pub to_version: ChallengeVersion, + /// Ordered list of migration steps + pub steps: Vec, + /// Current status + pub status: MigrationStatus, + /// Index of current step (0-based) + pub current_step: usize, + /// Plan creation timestamp + 
pub created_at: i64, + /// Plan start timestamp (if started) + pub started_at: Option, + /// Plan completion timestamp (if completed) + pub completed_at: Option, +} + +impl MigrationPlan { + /// Create a new migration plan + pub fn new( + challenge_id: ChallengeId, + challenge_name: String, + from_version: ChallengeVersion, + to_version: ChallengeVersion, + ) -> Self { + Self { + challenge_id, + challenge_name, + from_version, + to_version, + steps: Vec::new(), + status: MigrationStatus::Pending, + current_step: 0, + created_at: chrono::Utc::now().timestamp_millis(), + started_at: None, + completed_at: None, + } + } + + /// Add a migration step + pub fn add_step(&mut self, step: MigrationStep) { + self.steps.push(step); + } + + /// Check if the plan has any steps + pub fn is_empty(&self) -> bool { + self.steps.is_empty() + } + + /// Get total number of steps + pub fn total_steps(&self) -> usize { + self.steps.len() + } + + /// Get estimated total duration + pub fn estimated_duration_secs(&self) -> u64 { + self.steps.iter().map(|s| s.estimated_duration_secs).sum() + } + + /// Check if migration is complete + pub fn is_complete(&self) -> bool { + matches!(self.status, MigrationStatus::Completed | MigrationStatus::RolledBack) + } + + /// Check if migration can be rolled back + pub fn can_rollback(&self) -> bool { + // Can rollback if all executed steps are reversible + self.steps + .iter() + .take(self.current_step) + .all(|s| s.reversible) + } + + /// Get progress as percentage + pub fn progress_percent(&self) -> f64 { + if self.steps.is_empty() { + return 100.0; + } + (self.current_step as f64 / self.steps.len() as f64) * 100.0 + } +} + +/// Record of a completed migration +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct MigrationRecord { + /// Migration plan + pub plan: MigrationPlan, + /// Execution logs + pub logs: Vec, +} + +/// Log entry for migration +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct MigrationLog { + /// Timestamp + pub timestamp: i64, + /// Log level + pub level: LogLevel, + /// Message + pub message: String, + /// Associated step ID (if any) + pub step_id: Option, +} + +/// Log level for migration logs +#[derive(Clone, Debug, Serialize, Deserialize)] +pub enum LogLevel { + Info, + Warning, + Error, +} + +/// Manages challenge migrations +pub struct ChallengeMigration { + /// Active migration plans + active_plans: parking_lot::RwLock>, + /// Migration history + history: parking_lot::RwLock>, + /// Maximum history to retain + max_history: usize, +} + +impl ChallengeMigration { + /// Create a new migration manager + pub fn new() -> Self { + Self { + active_plans: parking_lot::RwLock::new(HashMap::new()), + history: parking_lot::RwLock::new(Vec::new()), + max_history: 100, + } + } + + /// Create a migration plan between versions + pub fn create_plan( + &self, + challenge_id: ChallengeId, + challenge_name: String, + from_version: ChallengeVersion, + to_version: ChallengeVersion, + ) -> RegistryResult { + // Check if there's already an active migration + if self.active_plans.read().contains_key(&challenge_id) { + return Err(RegistryError::MigrationFailed( + "Migration already in progress".to_string(), + )); + } + + let mut plan = MigrationPlan::new(challenge_id, challenge_name, from_version.clone(), to_version.clone()); + + // Generate migration steps based on version difference + // This is a simplified version - real implementation would analyze schemas + if from_version.major != to_version.major { + plan.add_step( + MigrationStep::new( + 
"major_upgrade".to_string(), + format!("Major version upgrade from {} to {}", from_version.major, to_version.major), + from_version.clone(), + to_version.clone(), + ) + .irreversible() + .with_duration(300), + ); + } else if from_version.minor != to_version.minor { + plan.add_step( + MigrationStep::new( + "minor_upgrade".to_string(), + format!("Minor version upgrade from {} to {}", from_version, to_version), + from_version.clone(), + to_version.clone(), + ) + .with_duration(60), + ); + } else if from_version.patch != to_version.patch { + plan.add_step( + MigrationStep::new( + "patch_upgrade".to_string(), + format!("Patch version upgrade from {} to {}", from_version, to_version), + from_version, + to_version, + ) + .with_duration(10), + ); + } + + Ok(plan) + } + + /// Start executing a migration plan + pub fn start_migration(&self, plan: MigrationPlan) -> RegistryResult<()> { + let challenge_id = plan.challenge_id; + + let mut plans = self.active_plans.write(); + if plans.contains_key(&challenge_id) { + return Err(RegistryError::MigrationFailed( + "Migration already in progress".to_string(), + )); + } + + let mut plan = plan; + plan.status = MigrationStatus::InProgress; + plan.started_at = Some(chrono::Utc::now().timestamp_millis()); + + plans.insert(challenge_id, plan); + Ok(()) + } + + /// Get active migration for a challenge + pub fn get_active_migration(&self, challenge_id: &ChallengeId) -> Option { + self.active_plans.read().get(challenge_id).cloned() + } + + /// Complete a migration step + pub fn complete_step(&self, challenge_id: &ChallengeId) -> RegistryResult { + let mut plans = self.active_plans.write(); + let plan = plans + .get_mut(challenge_id) + .ok_or_else(|| RegistryError::MigrationFailed("No active migration".to_string()))?; + + plan.current_step += 1; + + // Check if all steps complete + if plan.current_step >= plan.steps.len() { + plan.status = MigrationStatus::Completed; + plan.completed_at = Some(chrono::Utc::now().timestamp_millis()); + Ok(true) + } else { + Ok(false) + } + } + + /// Fail a migration + pub fn fail_migration(&self, challenge_id: &ChallengeId, reason: String) -> RegistryResult<()> { + let mut plans = self.active_plans.write(); + let plan = plans + .get_mut(challenge_id) + .ok_or_else(|| RegistryError::MigrationFailed("No active migration".to_string()))?; + + plan.status = MigrationStatus::Failed(reason); + plan.completed_at = Some(chrono::Utc::now().timestamp_millis()); + + Ok(()) + } + + /// Finalize and archive a completed migration + pub fn finalize_migration(&self, challenge_id: &ChallengeId) -> RegistryResult { + let plan = self + .active_plans + .write() + .remove(challenge_id) + .ok_or_else(|| RegistryError::MigrationFailed("No active migration".to_string()))?; + + if !plan.is_complete() { + return Err(RegistryError::MigrationFailed( + "Migration not complete".to_string(), + )); + } + + // Add to history + let record = MigrationRecord { + plan: plan.clone(), + logs: Vec::new(), + }; + + let mut history = self.history.write(); + history.push(record); + + // Trim history + while history.len() > self.max_history { + history.remove(0); + } + + Ok(plan) + } + + /// Get migration history for a challenge + pub fn get_history(&self, challenge_id: &ChallengeId) -> Vec { + self.history + .read() + .iter() + .filter(|r| r.plan.challenge_id == *challenge_id) + .cloned() + .collect() + } + + /// Get all migration history + pub fn get_all_history(&self) -> Vec { + self.history.read().clone() + } +} + +impl Default for ChallengeMigration { + fn default() -> Self 
{ + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_migration_plan_creation() { + let migration = ChallengeMigration::new(); + let id = ChallengeId::new(); + + let plan = migration + .create_plan( + id, + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + ChallengeVersion::new(1, 1, 0), + ) + .unwrap(); + + assert_eq!(plan.total_steps(), 1); + assert!(!plan.is_complete()); + assert_eq!(plan.progress_percent(), 0.0); + } + + #[test] + fn test_migration_execution() { + let migration = ChallengeMigration::new(); + let id = ChallengeId::new(); + + let plan = migration + .create_plan( + id, + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + ChallengeVersion::new(1, 0, 1), + ) + .unwrap(); + + migration.start_migration(plan).unwrap(); + + let active = migration.get_active_migration(&id); + assert!(active.is_some()); + assert!(matches!(active.unwrap().status, MigrationStatus::InProgress)); + + let complete = migration.complete_step(&id).unwrap(); + assert!(complete); + + let finalized = migration.finalize_migration(&id).unwrap(); + assert!(matches!(finalized.status, MigrationStatus::Completed)); + } + + #[test] + fn test_duplicate_migration_prevention() { + let migration = ChallengeMigration::new(); + let id = ChallengeId::new(); + + let plan = migration + .create_plan( + id, + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + ChallengeVersion::new(1, 1, 0), + ) + .unwrap(); + + migration.start_migration(plan.clone()).unwrap(); + let result = migration.start_migration(plan); + assert!(result.is_err()); + } + + #[test] + fn test_major_version_migration() { + let migration = ChallengeMigration::new(); + let id = ChallengeId::new(); + + let plan = migration + .create_plan( + id, + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + ChallengeVersion::new(2, 0, 0), + ) + .unwrap(); + + // Major version migrations are irreversible + assert!(!plan.steps[0].reversible); + } +} diff --git a/crates/challenge-registry/src/registry.rs b/crates/challenge-registry/src/registry.rs new file mode 100644 index 0000000..1c2a0bd --- /dev/null +++ b/crates/challenge-registry/src/registry.rs @@ -0,0 +1,464 @@ +//! 
Main challenge registry implementation + +use crate::error::{RegistryError, RegistryResult}; +use crate::health::{HealthMonitor, HealthStatus}; +use crate::lifecycle::{ChallengeLifecycle, LifecycleEvent, LifecycleState}; +use crate::state::StateStore; +use crate::version::ChallengeVersion; +use parking_lot::RwLock; +use platform_core::ChallengeId; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::sync::Arc; +use tracing::{debug, info, warn}; + +/// Entry for a registered challenge +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ChallengeEntry { + /// Unique challenge ID + pub id: ChallengeId, + /// Challenge name + pub name: String, + /// Current version + pub version: ChallengeVersion, + /// Docker image for the challenge + pub docker_image: String, + /// HTTP endpoint for evaluation + pub endpoint: Option, + /// Current lifecycle state + pub lifecycle_state: LifecycleState, + /// Health status + pub health_status: HealthStatus, + /// Registration timestamp + pub registered_at: i64, + /// Last updated timestamp + pub updated_at: i64, + /// Configuration metadata + pub metadata: serde_json::Value, +} + +impl ChallengeEntry { + pub fn new(name: String, version: ChallengeVersion, docker_image: String) -> Self { + let now = chrono::Utc::now().timestamp_millis(); + Self { + id: ChallengeId::new(), + name, + version, + docker_image, + endpoint: None, + lifecycle_state: LifecycleState::Registered, + health_status: HealthStatus::Unknown, + registered_at: now, + updated_at: now, + metadata: serde_json::Value::Null, + } + } + + pub fn with_endpoint(mut self, endpoint: String) -> Self { + self.endpoint = Some(endpoint); + self + } + + pub fn with_metadata(mut self, metadata: serde_json::Value) -> Self { + self.metadata = metadata; + self + } +} + +/// A registered challenge with its full state +#[derive(Clone, Debug)] +pub struct RegisteredChallenge { + pub entry: ChallengeEntry, + pub state_store: Arc, +} + +/// Main challenge registry +pub struct ChallengeRegistry { + /// Registered challenges by ID + challenges: RwLock>, + /// Name to ID mapping for lookups + name_index: RwLock>, + /// Lifecycle manager + lifecycle: Arc, + /// Health monitor + health_monitor: Arc, + /// Event listeners + event_listeners: RwLock>>, +} + +impl ChallengeRegistry { + /// Create a new challenge registry + pub fn new() -> Self { + Self { + challenges: RwLock::new(HashMap::new()), + name_index: RwLock::new(HashMap::new()), + lifecycle: Arc::new(ChallengeLifecycle::new()), + health_monitor: Arc::new(HealthMonitor::new()), + event_listeners: RwLock::new(Vec::new()), + } + } + + /// Register a new challenge + pub fn register(&self, entry: ChallengeEntry) -> RegistryResult { + let mut challenges = self.challenges.write(); + let mut name_index = self.name_index.write(); + + // Check if already registered by name + if name_index.contains_key(&entry.name) { + return Err(RegistryError::AlreadyRegistered(entry.name.clone())); + } + + let id = entry.id; + let name = entry.name.clone(); + + let state_store = Arc::new(StateStore::new(id)); + let registered = RegisteredChallenge { + entry, + state_store, + }; + + challenges.insert(id, registered); + name_index.insert(name.clone(), id); + + info!(challenge_id = %id, name = %name, "Challenge registered"); + self.emit_event(LifecycleEvent::Registered { challenge_id: id }); + + Ok(id) + } + + /// Unregister a challenge + pub fn unregister(&self, id: &ChallengeId) -> RegistryResult { + let mut challenges = self.challenges.write(); + let mut 
name_index = self.name_index.write(); + + let registered = challenges + .remove(id) + .ok_or_else(|| RegistryError::ChallengeNotFound(id.to_string()))?; + + name_index.remove(®istered.entry.name); + + info!(challenge_id = %id, "Challenge unregistered"); + self.emit_event(LifecycleEvent::Unregistered { challenge_id: *id }); + + Ok(registered.entry) + } + + /// Get a challenge by ID + pub fn get(&self, id: &ChallengeId) -> Option { + self.challenges.read().get(id).cloned() + } + + /// Get a challenge by name + pub fn get_by_name(&self, name: &str) -> Option { + let name_index = self.name_index.read(); + let id = name_index.get(name)?; + self.challenges.read().get(id).cloned() + } + + /// List all registered challenges + pub fn list(&self) -> Vec { + self.challenges + .read() + .values() + .map(|r| r.entry.clone()) + .collect() + } + + /// List active challenges (running and healthy) + pub fn list_active(&self) -> Vec { + self.challenges + .read() + .values() + .filter(|r| { + r.entry.lifecycle_state == LifecycleState::Running + && r.entry.health_status == HealthStatus::Healthy + }) + .map(|r| r.entry.clone()) + .collect() + } + + /// Update challenge lifecycle state + pub fn update_state( + &self, + id: &ChallengeId, + new_state: LifecycleState, + ) -> RegistryResult<()> { + let mut challenges = self.challenges.write(); + let registered = challenges + .get_mut(id) + .ok_or_else(|| RegistryError::ChallengeNotFound(id.to_string()))?; + + let old_state = registered.entry.lifecycle_state.clone(); + registered.entry.lifecycle_state = new_state.clone(); + registered.entry.updated_at = chrono::Utc::now().timestamp_millis(); + + debug!( + challenge_id = %id, + old_state = ?old_state, + new_state = ?new_state, + "Challenge state updated" + ); + + self.emit_event(LifecycleEvent::StateChanged { + challenge_id: *id, + old_state, + new_state, + }); + + Ok(()) + } + + /// Update challenge health status + pub fn update_health(&self, id: &ChallengeId, status: HealthStatus) -> RegistryResult<()> { + let mut challenges = self.challenges.write(); + let registered = challenges + .get_mut(id) + .ok_or_else(|| RegistryError::ChallengeNotFound(id.to_string()))?; + + registered.entry.health_status = status; + registered.entry.updated_at = chrono::Utc::now().timestamp_millis(); + + Ok(()) + } + + /// Update challenge version (for hot-reload) + pub fn update_version( + &self, + id: &ChallengeId, + new_version: ChallengeVersion, + ) -> RegistryResult { + let mut challenges = self.challenges.write(); + let registered = challenges + .get_mut(id) + .ok_or_else(|| RegistryError::ChallengeNotFound(id.to_string()))?; + + let old_version = registered.entry.version.clone(); + + if !new_version.is_compatible_with(&old_version) { + warn!( + challenge_id = %id, + old = %old_version, + new = %new_version, + "Breaking version change detected" + ); + } + + registered.entry.version = new_version.clone(); + registered.entry.updated_at = chrono::Utc::now().timestamp_millis(); + + info!( + challenge_id = %id, + old_version = %old_version, + new_version = %new_version, + "Challenge version updated" + ); + + self.emit_event(LifecycleEvent::VersionChanged { + challenge_id: *id, + old_version: old_version.clone(), + new_version, + }); + + Ok(old_version) + } + + /// Get state store for a challenge + pub fn state_store(&self, id: &ChallengeId) -> Option> { + self.challenges.read().get(id).map(|r| r.state_store.clone()) + } + + /// Add event listener + pub fn on_event(&self, listener: F) + where + F: Fn(LifecycleEvent) + Send + Sync + 
'static, + { + self.event_listeners.write().push(Box::new(listener)); + } + + /// Emit lifecycle event to all listeners + fn emit_event(&self, event: LifecycleEvent) { + for listener in self.event_listeners.read().iter() { + listener(event.clone()); + } + } + + /// Get lifecycle manager + pub fn lifecycle(&self) -> Arc { + self.lifecycle.clone() + } + + /// Get health monitor + pub fn health_monitor(&self) -> Arc { + self.health_monitor.clone() + } + + /// Challenge count + pub fn count(&self) -> usize { + self.challenges.read().len() + } +} + +impl Default for ChallengeRegistry { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_register_challenge() { + let registry = ChallengeRegistry::new(); + let entry = ChallengeEntry::new( + "test-challenge".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + + let id = registry.register(entry).unwrap(); + assert!(registry.get(&id).is_some()); + assert_eq!(registry.count(), 1); + } + + #[test] + fn test_duplicate_registration() { + let registry = ChallengeRegistry::new(); + let entry1 = ChallengeEntry::new( + "test-challenge".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + let entry2 = ChallengeEntry::new( + "test-challenge".to_string(), + ChallengeVersion::new(2, 0, 0), + "test:v2".to_string(), + ); + + registry.register(entry1).unwrap(); + let result = registry.register(entry2); + assert!(matches!(result, Err(RegistryError::AlreadyRegistered(_)))); + } + + #[test] + fn test_get_by_name() { + let registry = ChallengeRegistry::new(); + let entry = ChallengeEntry::new( + "my-challenge".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + + registry.register(entry).unwrap(); + let found = registry.get_by_name("my-challenge"); + assert!(found.is_some()); + assert_eq!(found.unwrap().entry.name, "my-challenge"); + } + + #[test] + fn test_unregister() { + let registry = ChallengeRegistry::new(); + let entry = ChallengeEntry::new( + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + + let id = registry.register(entry).unwrap(); + assert_eq!(registry.count(), 1); + + registry.unregister(&id).unwrap(); + assert_eq!(registry.count(), 0); + } + + #[test] + fn test_update_state() { + let registry = ChallengeRegistry::new(); + let entry = ChallengeEntry::new( + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + + let id = registry.register(entry).unwrap(); + registry.update_state(&id, LifecycleState::Running).unwrap(); + + let challenge = registry.get(&id).unwrap(); + assert_eq!(challenge.entry.lifecycle_state, LifecycleState::Running); + } + + #[test] + fn test_update_version() { + let registry = ChallengeRegistry::new(); + let entry = ChallengeEntry::new( + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + + let id = registry.register(entry).unwrap(); + let old = registry.update_version(&id, ChallengeVersion::new(1, 1, 0)).unwrap(); + + assert_eq!(old, ChallengeVersion::new(1, 0, 0)); + + let challenge = registry.get(&id).unwrap(); + assert_eq!(challenge.entry.version, ChallengeVersion::new(1, 1, 0)); + } + + #[test] + fn test_list_active() { + let registry = ChallengeRegistry::new(); + + // Register two challenges + let entry1 = ChallengeEntry::new( + "active".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + let entry2 = ChallengeEntry::new( + 
"inactive".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + + let id1 = registry.register(entry1).unwrap(); + registry.register(entry2).unwrap(); + + // Make first one active + registry.update_state(&id1, LifecycleState::Running).unwrap(); + registry.update_health(&id1, HealthStatus::Healthy).unwrap(); + + let active = registry.list_active(); + assert_eq!(active.len(), 1); + assert_eq!(active[0].name, "active"); + } + + #[test] + fn test_entry_builders() { + let entry = ChallengeEntry::new( + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ) + .with_endpoint("http://localhost:8080".to_string()) + .with_metadata(serde_json::json!({"key": "value"})); + + assert_eq!(entry.endpoint, Some("http://localhost:8080".to_string())); + assert_eq!(entry.metadata["key"], "value"); + } + + #[test] + fn test_state_store_access() { + let registry = ChallengeRegistry::new(); + let entry = ChallengeEntry::new( + "test".to_string(), + ChallengeVersion::new(1, 0, 0), + "test:latest".to_string(), + ); + + let id = registry.register(entry).unwrap(); + let store = registry.state_store(&id); + assert!(store.is_some()); + } +} diff --git a/crates/challenge-registry/src/state.rs b/crates/challenge-registry/src/state.rs new file mode 100644 index 0000000..14e2e1c --- /dev/null +++ b/crates/challenge-registry/src/state.rs @@ -0,0 +1,316 @@ +//! State management for challenge hot-reload +//! +//! Provides state persistence and restoration to support +//! hot-reloading challenges without losing evaluation state. + +use crate::error::{RegistryError, RegistryResult}; +use parking_lot::RwLock; +use platform_core::ChallengeId; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +/// Snapshot of challenge state at a point in time +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct StateSnapshot { + /// Challenge ID this snapshot belongs to + pub challenge_id: ChallengeId, + /// Version when snapshot was taken + pub version: String, + /// Timestamp when snapshot was created (millis) + pub created_at: i64, + /// Serialized state data + pub data: Vec, + /// Checksum for integrity verification + pub checksum: String, +} + +impl StateSnapshot { + /// Create a new state snapshot + pub fn new(challenge_id: ChallengeId, version: String, data: Vec) -> Self { + use sha2::{Digest, Sha256}; + + let mut hasher = Sha256::new(); + hasher.update(&data); + let checksum = hex::encode(hasher.finalize()); + + Self { + challenge_id, + version, + created_at: chrono::Utc::now().timestamp_millis(), + data, + checksum, + } + } + + /// Verify snapshot integrity + pub fn verify(&self) -> bool { + use sha2::{Digest, Sha256}; + + let mut hasher = Sha256::new(); + hasher.update(&self.data); + let computed = hex::encode(hasher.finalize()); + + computed == self.checksum + } + + /// Get the size of the snapshot data + pub fn size(&self) -> usize { + self.data.len() + } +} + +/// State of a challenge that can be preserved across hot-reloads +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct ChallengeState { + /// Challenge ID + pub challenge_id: ChallengeId, + /// Active evaluations being tracked + pub active_evaluations: HashMap, + /// Completed evaluation count + pub completed_count: u64, + /// Last activity timestamp + pub last_activity_at: i64, + /// Custom state data from the challenge + pub custom_data: serde_json::Value, +} + +/// State of an in-progress evaluation +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct EvaluationState { 
+    /// Evaluation job ID
+    pub job_id: String,
+    /// When evaluation started (millis)
+    pub started_at: i64,
+    /// Current progress (0.0 - 1.0)
+    pub progress: f64,
+    /// Checkpoint data for resumption
+    pub checkpoint: Option<Vec<u8>>,
+}
+
+impl ChallengeState {
+    /// Create new empty state for a challenge
+    pub fn new(challenge_id: ChallengeId) -> Self {
+        Self {
+            challenge_id,
+            active_evaluations: HashMap::new(),
+            completed_count: 0,
+            last_activity_at: chrono::Utc::now().timestamp_millis(),
+            custom_data: serde_json::Value::Null,
+        }
+    }
+
+    /// Check if there are active evaluations
+    pub fn has_active_evaluations(&self) -> bool {
+        !self.active_evaluations.is_empty()
+    }
+
+    /// Get count of active evaluations
+    pub fn active_evaluation_count(&self) -> usize {
+        self.active_evaluations.len()
+    }
+}
+
+/// Store for challenge state with persistence support
+#[derive(Debug)]
+pub struct StateStore {
+    /// Challenge this store belongs to
+    challenge_id: ChallengeId,
+    /// In-memory state
+    state: RwLock<ChallengeState>,
+    /// Snapshots for recovery
+    snapshots: RwLock<Vec<StateSnapshot>>,
+    /// Maximum snapshots to retain
+    max_snapshots: usize,
+}
+
+impl StateStore {
+    /// Create a new state store for a challenge
+    pub fn new(challenge_id: ChallengeId) -> Self {
+        Self {
+            challenge_id,
+            state: RwLock::new(ChallengeState::new(challenge_id)),
+            snapshots: RwLock::new(Vec::new()),
+            max_snapshots: 5,
+        }
+    }
+
+    /// Create a state store with custom snapshot limit
+    pub fn with_max_snapshots(challenge_id: ChallengeId, max_snapshots: usize) -> Self {
+        Self {
+            challenge_id,
+            state: RwLock::new(ChallengeState::new(challenge_id)),
+            snapshots: RwLock::new(Vec::new()),
+            max_snapshots,
+        }
+    }
+
+    /// Get current state (read-only)
+    pub fn get_state(&self) -> ChallengeState {
+        self.state.read().clone()
+    }
+
+    /// Update state with a function
+    pub fn update_state<F>(&self, f: F)
+    where
+        F: FnOnce(&mut ChallengeState),
+    {
+        let mut state = self.state.write();
+        f(&mut state);
+        state.last_activity_at = chrono::Utc::now().timestamp_millis();
+    }
+
+    /// Track a new evaluation
+    pub fn track_evaluation(&self, job_id: String) {
+        let mut state = self.state.write();
+        state.active_evaluations.insert(
+            job_id.clone(),
+            EvaluationState {
+                job_id,
+                started_at: chrono::Utc::now().timestamp_millis(),
+                progress: 0.0,
+                checkpoint: None,
+            },
+        );
+        state.last_activity_at = chrono::Utc::now().timestamp_millis();
+    }
+
+    /// Update evaluation progress
+    pub fn update_evaluation_progress(&self, job_id: &str, progress: f64) {
+        let mut state = self.state.write();
+        if let Some(eval) = state.active_evaluations.get_mut(job_id) {
+            eval.progress = progress.clamp(0.0, 1.0);
+        }
+        state.last_activity_at = chrono::Utc::now().timestamp_millis();
+    }
+
+    /// Complete an evaluation
+    pub fn complete_evaluation(&self, job_id: &str) {
+        let mut state = self.state.write();
+        state.active_evaluations.remove(job_id);
+        state.completed_count += 1;
+        state.last_activity_at = chrono::Utc::now().timestamp_millis();
+    }
+
+    /// Create a snapshot of current state
+    pub fn create_snapshot(&self, version: String) -> RegistryResult<StateSnapshot> {
+        let state = self.state.read();
+        // Use JSON for serialization since ChallengeState contains serde_json::Value
+        let data = serde_json::to_vec(&*state)
+            .map_err(|e| RegistryError::StatePersistence(e.to_string()))?;
+
+        let snapshot = StateSnapshot::new(self.challenge_id, version, data);
+
+        let mut snapshots = self.snapshots.write();
+        snapshots.push(snapshot.clone());
+
+        // Trim old snapshots
+        while snapshots.len() >
self.max_snapshots { + snapshots.remove(0); + } + + Ok(snapshot) + } + + /// Restore state from a snapshot + pub fn restore_snapshot(&self, snapshot: &StateSnapshot) -> RegistryResult<()> { + if !snapshot.verify() { + return Err(RegistryError::StateRestoration( + "Snapshot checksum mismatch".to_string(), + )); + } + + // Use JSON for deserialization since ChallengeState contains serde_json::Value + let restored: ChallengeState = serde_json::from_slice(&snapshot.data) + .map_err(|e| RegistryError::StateRestoration(e.to_string()))?; + + let mut state = self.state.write(); + *state = restored; + + Ok(()) + } + + /// Get list of available snapshots + pub fn list_snapshots(&self) -> Vec { + self.snapshots.read().clone() + } + + /// Get the latest snapshot + pub fn latest_snapshot(&self) -> Option { + self.snapshots.read().last().cloned() + } + + /// Clear all state + pub fn clear(&self) { + let mut state = self.state.write(); + *state = ChallengeState::new(self.challenge_id); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_state_store() { + let id = ChallengeId::new(); + let store = StateStore::new(id); + + store.track_evaluation("job1".to_string()); + let state = store.get_state(); + assert_eq!(state.active_evaluation_count(), 1); + + store.update_evaluation_progress("job1", 0.5); + let state = store.get_state(); + let eval = state.active_evaluations.get("job1").unwrap(); + assert_eq!(eval.progress, 0.5); + + store.complete_evaluation("job1"); + let state = store.get_state(); + assert_eq!(state.active_evaluation_count(), 0); + assert_eq!(state.completed_count, 1); + } + + #[test] + fn test_snapshot_creation() { + let id = ChallengeId::new(); + let store = StateStore::new(id); + + store.track_evaluation("job1".to_string()); + let snapshot = store.create_snapshot("1.0.0".to_string()).unwrap(); + + assert!(snapshot.verify()); + assert_eq!(snapshot.version, "1.0.0"); + } + + #[test] + fn test_snapshot_restoration() { + let id = ChallengeId::new(); + let store = StateStore::new(id); + + store.track_evaluation("job1".to_string()); + store.track_evaluation("job2".to_string()); + let snapshot = store.create_snapshot("1.0.0".to_string()).unwrap(); + + // Clear and verify empty + store.clear(); + assert_eq!(store.get_state().active_evaluation_count(), 0); + + // Restore and verify + store.restore_snapshot(&snapshot).unwrap(); + assert_eq!(store.get_state().active_evaluation_count(), 2); + } + + #[test] + fn test_snapshot_limit() { + let id = ChallengeId::new(); + let store = StateStore::with_max_snapshots(id, 3); + + for i in 0..5 { + store.create_snapshot(format!("{}.0.0", i)).unwrap(); + } + + let snapshots = store.list_snapshots(); + assert_eq!(snapshots.len(), 3); + assert_eq!(snapshots[0].version, "2.0.0"); + } +} diff --git a/crates/challenge-registry/src/version.rs b/crates/challenge-registry/src/version.rs new file mode 100644 index 0000000..d325c56 --- /dev/null +++ b/crates/challenge-registry/src/version.rs @@ -0,0 +1,164 @@ +//! 
Challenge versioning support
+
+use serde::{Deserialize, Serialize};
+use std::cmp::Ordering;
+use std::fmt;
+
+/// Semantic version for challenges
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)]
+pub struct ChallengeVersion {
+    pub major: u32,
+    pub minor: u32,
+    pub patch: u32,
+    pub prerelease: Option<String>,
+}
+
+impl ChallengeVersion {
+    pub fn new(major: u32, minor: u32, patch: u32) -> Self {
+        Self {
+            major,
+            minor,
+            patch,
+            prerelease: None,
+        }
+    }
+
+    pub fn parse(s: &str) -> Option<Self> {
+        let s = s.strip_prefix('v').unwrap_or(s);
+        let parts: Vec<&str> = s.split('-').collect();
+        let version_parts: Vec<&str> = parts[0].split('.').collect();
+
+        if version_parts.len() < 3 {
+            return None;
+        }
+
+        Some(Self {
+            major: version_parts[0].parse().ok()?,
+            minor: version_parts[1].parse().ok()?,
+            patch: version_parts[2].parse().ok()?,
+            prerelease: parts.get(1).map(|s| s.to_string()),
+        })
+    }
+
+    /// Check if this version is compatible with another (same major version)
+    pub fn is_compatible_with(&self, other: &Self) -> bool {
+        self.major == other.major
+    }
+
+    /// Check if this version is newer than another
+    pub fn is_newer_than(&self, other: &Self) -> bool {
+        self > other
+    }
+}
+
+impl fmt::Display for ChallengeVersion {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match &self.prerelease {
+            Some(pre) => write!(f, "{}.{}.{}-{}", self.major, self.minor, self.patch, pre),
+            None => write!(f, "{}.{}.{}", self.major, self.minor, self.patch),
+        }
+    }
+}
+
+impl PartialOrd for ChallengeVersion {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for ChallengeVersion {
+    fn cmp(&self, other: &Self) -> Ordering {
+        match self.major.cmp(&other.major) {
+            Ordering::Equal => match self.minor.cmp(&other.minor) {
+                Ordering::Equal => self.patch.cmp(&other.patch),
+                ord => ord,
+            },
+            ord => ord,
+        }
+    }
+}
+
+impl Default for ChallengeVersion {
+    fn default() -> Self {
+        Self::new(0, 1, 0)
+    }
+}
+
+/// Version constraint for challenge compatibility
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub enum VersionConstraint {
+    /// Exact version match
+    Exact(ChallengeVersion),
+    /// Minimum version (>=)
+    AtLeast(ChallengeVersion),
+    /// Version range [min, max)
+    Range {
+        min: ChallengeVersion,
+        max: ChallengeVersion,
+    },
+    /// Compatible with major version (^)
+    Compatible(ChallengeVersion),
+    /// Any version
+    Any,
+}
+
+impl VersionConstraint {
+    pub fn satisfies(&self, version: &ChallengeVersion) -> bool {
+        match self {
+            Self::Exact(v) => version == v,
+            Self::AtLeast(v) => version >= v,
+            Self::Range { min, max } => version >= min && version < max,
+            Self::Compatible(v) => version.major == v.major && version >= v,
+            Self::Any => true,
+        }
+    }
+}
+
+/// A challenge with version information
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct VersionedChallenge {
+    pub challenge_id: String,
+    pub version: ChallengeVersion,
+    pub min_platform_version: Option<ChallengeVersion>,
+    pub deprecated: bool,
+    pub deprecation_message: Option<String>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_version_parsing() {
+        let v = ChallengeVersion::parse("1.2.3").unwrap();
+        assert_eq!(v.major, 1);
+        assert_eq!(v.minor, 2);
+        assert_eq!(v.patch, 3);
+
+        let v2 = ChallengeVersion::parse("v2.0.0-beta").unwrap();
+        assert_eq!(v2.major, 2);
+        assert_eq!(v2.prerelease, Some("beta".to_string()));
+    }
+
+    #[test]
+    fn test_version_comparison() {
+        let v1 = ChallengeVersion::new(1, 0, 0);
+        let v2 =
ChallengeVersion::new(1, 1, 0); + let v3 = ChallengeVersion::new(2, 0, 0); + + assert!(v2.is_newer_than(&v1)); + assert!(v3.is_newer_than(&v2)); + assert!(v1.is_compatible_with(&v2)); + assert!(!v1.is_compatible_with(&v3)); + } + + #[test] + fn test_version_constraints() { + let v = ChallengeVersion::new(1, 5, 0); + + assert!(VersionConstraint::Any.satisfies(&v)); + assert!(VersionConstraint::AtLeast(ChallengeVersion::new(1, 0, 0)).satisfies(&v)); + assert!(!VersionConstraint::Exact(ChallengeVersion::new(1, 0, 0)).satisfies(&v)); + assert!(VersionConstraint::Compatible(ChallengeVersion::new(1, 0, 0)).satisfies(&v)); + } +} From 56ea011a4be1d998e8ef42cb3d88a5d61c5f3369 Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 11:09:01 +0000 Subject: [PATCH 3/8] feat(core): add checkpoint system for state persistence --- crates/core/Cargo.toml | 3 + crates/core/src/checkpoint.rs | 741 ++++++++++++++++++++++++++++++++++ crates/core/src/lib.rs | 5 + 3 files changed, 749 insertions(+) create mode 100644 crates/core/src/checkpoint.rs diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index f6cdae5..97e9c3f 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -20,3 +20,6 @@ bs58 = "0.5" # Sr25519 crypto (Substrate standard) sp-core = { version = "31.0", default-features = false, features = ["std"] } schnorrkel = "0.11" + +[dev-dependencies] +tempfile = { workspace = true } diff --git a/crates/core/src/checkpoint.rs b/crates/core/src/checkpoint.rs new file mode 100644 index 0000000..b627e4a --- /dev/null +++ b/crates/core/src/checkpoint.rs @@ -0,0 +1,741 @@ +//! Checkpoint system for state persistence +//! +//! Provides mechanisms to save and restore evaluation state, enabling: +//! - Hot-reload without losing progress +//! - Crash recovery +//! 
- Rolling updates
+
+use crate::{ChallengeId, Hotkey, MiniChainError, Result};
+use serde::{Deserialize, Serialize};
+use sha2::{Digest, Sha256};
+use std::collections::HashMap;
+use std::fs::{self, File};
+use std::io::{BufReader, BufWriter, Read, Write};
+use std::path::{Path, PathBuf};
+use std::time::SystemTime;
+use tracing::{debug, info, warn};
+
+/// Checkpoint version for format compatibility
+pub const CHECKPOINT_VERSION: u32 = 1;
+
+/// Magic bytes for checkpoint file identification
+const CHECKPOINT_MAGIC: &[u8; 8] = b"PLATCHKP";
+
+/// Checkpoint file header
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct CheckpointHeader {
+    /// Magic bytes (verified on load)
+    pub magic: [u8; 8],
+    /// Checkpoint format version
+    pub version: u32,
+    /// Creation timestamp (Unix millis)
+    pub created_at: i64,
+    /// Checkpoint sequence number
+    pub sequence: u64,
+    /// SHA-256 hash of the data section
+    pub data_hash: [u8; 32],
+    /// Size of the data section in bytes
+    pub data_size: u64,
+}
+
+impl CheckpointHeader {
+    pub fn new(sequence: u64, data_hash: [u8; 32], data_size: u64) -> Self {
+        Self {
+            magic: *CHECKPOINT_MAGIC,
+            version: CHECKPOINT_VERSION,
+            created_at: chrono::Utc::now().timestamp_millis(),
+            sequence,
+            data_hash,
+            data_size,
+        }
+    }
+
+    pub fn verify_magic(&self) -> bool {
+        self.magic == *CHECKPOINT_MAGIC
+    }
+}
+
+/// State of a pending evaluation
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct PendingEvaluationState {
+    /// Submission ID
+    pub submission_id: String,
+    /// Challenge ID
+    pub challenge_id: ChallengeId,
+    /// Miner hotkey
+    pub miner: Hotkey,
+    /// Submission hash
+    pub submission_hash: String,
+    /// Evaluation scores received (validator -> score)
+    pub scores: HashMap<Hotkey, f64>,
+    /// Creation timestamp
+    pub created_at: i64,
+    /// Whether finalization is in progress
+    pub finalizing: bool,
+}
+
+/// Completed evaluation record
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct CompletedEvaluationState {
+    /// Submission ID
+    pub submission_id: String,
+    /// Challenge ID
+    pub challenge_id: ChallengeId,
+    /// Final aggregated score
+    pub final_score: f64,
+    /// Epoch when completed
+    pub epoch: u64,
+    /// Completion timestamp
+    pub completed_at: i64,
+}
+
+/// Weight vote state
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct WeightVoteState {
+    /// Epoch for these weights
+    pub epoch: u64,
+    /// Netuid
+    pub netuid: u16,
+    /// Votes by validator
+    pub votes: HashMap<Hotkey, Vec<(u16, u16)>>,
+    /// Whether finalized
+    pub finalized: bool,
+    /// Final weights if finalized
+    pub final_weights: Option<Vec<(u16, u16)>>,
+}
+
+/// Full checkpoint data
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct CheckpointData {
+    /// Current sequence number
+    pub sequence: u64,
+    /// Current epoch
+    pub epoch: u64,
+    /// Netuid
+    pub netuid: u16,
+    /// Pending evaluations
+    pub pending_evaluations: Vec<PendingEvaluationState>,
+    /// Recent completed evaluations (last N epochs)
+    pub completed_evaluations: Vec<CompletedEvaluationState>,
+    /// Current weight votes
+    pub weight_votes: Option<WeightVoteState>,
+    /// Bittensor block number at checkpoint
+    pub bittensor_block: u64,
+    /// Additional metadata
+    pub metadata: HashMap<String, String>,
+}
+
+impl CheckpointData {
+    pub fn new(sequence: u64, epoch: u64, netuid: u16) -> Self {
+        Self {
+            sequence,
+            epoch,
+            netuid,
+            pending_evaluations: Vec::new(),
+            completed_evaluations: Vec::new(),
+            weight_votes: None,
+            bittensor_block: 0,
+            metadata: HashMap::new(),
+        }
+    }
+
+    /// Add pending evaluation
+    pub fn add_pending(&mut self, state: PendingEvaluationState) {
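+        // Record the evaluation in this checkpoint's pending list so it can be resumed after a restore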
self.pending_evaluations.push(state); + } + + /// Add completed evaluation + pub fn add_completed(&mut self, state: CompletedEvaluationState) { + self.completed_evaluations.push(state); + } + + /// Calculate hash of checkpoint data + pub fn calculate_hash(&self) -> Result<[u8; 32]> { + let bytes = + bincode::serialize(self).map_err(|e| MiniChainError::Serialization(e.to_string()))?; + let mut hasher = Sha256::new(); + hasher.update(&bytes); + Ok(hasher.finalize().into()) + } +} + +/// Checkpoint manager for persisting and restoring state +pub struct CheckpointManager { + /// Directory for checkpoint files + checkpoint_dir: PathBuf, + /// Maximum number of checkpoints to keep + max_checkpoints: usize, + /// Current checkpoint sequence + current_sequence: u64, +} + +impl CheckpointManager { + /// Create a new checkpoint manager + pub fn new>(checkpoint_dir: P, max_checkpoints: usize) -> Result { + let checkpoint_dir = checkpoint_dir.as_ref().to_path_buf(); + + // Create checkpoint directory if it doesn't exist + fs::create_dir_all(&checkpoint_dir).map_err(|e| { + MiniChainError::Storage(format!("Failed to create checkpoint dir: {}", e)) + })?; + + // Find the latest checkpoint sequence + let current_sequence = Self::find_latest_sequence(&checkpoint_dir)?; + + info!( + dir = %checkpoint_dir.display(), + max_checkpoints, + current_sequence, + "Checkpoint manager initialized" + ); + + Ok(Self { + checkpoint_dir, + max_checkpoints, + current_sequence, + }) + } + + /// Find the latest checkpoint sequence number + fn find_latest_sequence(dir: &Path) -> Result { + let mut max_seq = 0u64; + + if let Ok(entries) = fs::read_dir(dir) { + for entry in entries.flatten() { + let path = entry.path(); + if let Some(name) = path.file_name().and_then(|n| n.to_str()) { + if name.starts_with("checkpoint_") && name.ends_with(".bin") { + if let Some(seq_str) = name + .strip_prefix("checkpoint_") + .and_then(|s| s.strip_suffix(".bin")) + { + if let Ok(seq) = seq_str.parse::() { + max_seq = max_seq.max(seq); + } + } + } + } + } + } + + Ok(max_seq) + } + + /// Generate checkpoint filename + fn checkpoint_filename(&self, sequence: u64) -> PathBuf { + self.checkpoint_dir + .join(format!("checkpoint_{:016}.bin", sequence)) + } + + /// Create a new checkpoint + pub fn create_checkpoint(&mut self, data: &CheckpointData) -> Result { + self.current_sequence += 1; + let sequence = self.current_sequence; + let filename = self.checkpoint_filename(sequence); + + // Serialize data + let data_bytes = + bincode::serialize(data).map_err(|e| MiniChainError::Serialization(e.to_string()))?; + + // Calculate hash + let mut hasher = Sha256::new(); + hasher.update(&data_bytes); + let data_hash: [u8; 32] = hasher.finalize().into(); + + // Create header + let header = CheckpointHeader::new(sequence, data_hash, data_bytes.len() as u64); + let header_bytes = + bincode::serialize(&header).map_err(|e| MiniChainError::Serialization(e.to_string()))?; + + // Write to file atomically (write to temp, then rename) + let temp_filename = filename.with_extension("tmp"); + { + let file = File::create(&temp_filename).map_err(|e| { + MiniChainError::Storage(format!("Failed to create checkpoint: {}", e)) + })?; + let mut writer = BufWriter::new(file); + + // Write header length (4 bytes) + let header_len = header_bytes.len() as u32; + writer + .write_all(&header_len.to_le_bytes()) + .map_err(|e| MiniChainError::Storage(e.to_string()))?; + + // Write header + writer + .write_all(&header_bytes) + .map_err(|e| MiniChainError::Storage(e.to_string()))?; + + // 
Write data + writer + .write_all(&data_bytes) + .map_err(|e| MiniChainError::Storage(e.to_string()))?; + + writer + .flush() + .map_err(|e| MiniChainError::Storage(e.to_string()))?; + } + + // Atomic rename + fs::rename(&temp_filename, &filename).map_err(|e| { + MiniChainError::Storage(format!("Failed to finalize checkpoint: {}", e)) + })?; + + info!( + sequence, + path = %filename.display(), + size = data_bytes.len(), + "Checkpoint created" + ); + + // Cleanup old checkpoints + self.cleanup_old_checkpoints()?; + + Ok(filename) + } + + /// Load the latest checkpoint + pub fn load_latest(&self) -> Result> { + if self.current_sequence == 0 { + return Ok(None); + } + + self.load_checkpoint(self.current_sequence) + } + + /// Load a specific checkpoint + pub fn load_checkpoint( + &self, + sequence: u64, + ) -> Result> { + let filename = self.checkpoint_filename(sequence); + + if !filename.exists() { + return Ok(None); + } + + let file = File::open(&filename) + .map_err(|e| MiniChainError::Storage(format!("Failed to open checkpoint: {}", e)))?; + let mut reader = BufReader::new(file); + + // Read header length + let mut header_len_bytes = [0u8; 4]; + reader.read_exact(&mut header_len_bytes).map_err(|e| { + MiniChainError::Storage(format!("Failed to read header length: {}", e)) + })?; + let header_len = u32::from_le_bytes(header_len_bytes) as usize; + + // Read header + let mut header_bytes = vec![0u8; header_len]; + reader + .read_exact(&mut header_bytes) + .map_err(|e| MiniChainError::Storage(format!("Failed to read header: {}", e)))?; + + let header: CheckpointHeader = bincode::deserialize(&header_bytes).map_err(|e| { + MiniChainError::Serialization(format!("Failed to deserialize header: {}", e)) + })?; + + // Verify magic + if !header.verify_magic() { + return Err(MiniChainError::Storage( + "Invalid checkpoint magic bytes".into(), + )); + } + + // Verify version compatibility + if header.version > CHECKPOINT_VERSION { + return Err(MiniChainError::Storage(format!( + "Checkpoint version {} is newer than supported version {}", + header.version, CHECKPOINT_VERSION + ))); + } + + // Read data + let mut data_bytes = vec![0u8; header.data_size as usize]; + reader + .read_exact(&mut data_bytes) + .map_err(|e| MiniChainError::Storage(format!("Failed to read data: {}", e)))?; + + // Verify hash + let mut hasher = Sha256::new(); + hasher.update(&data_bytes); + let actual_hash: [u8; 32] = hasher.finalize().into(); + + if actual_hash != header.data_hash { + return Err(MiniChainError::Storage( + "Checkpoint data hash mismatch".into(), + )); + } + + // Deserialize data + let data: CheckpointData = bincode::deserialize(&data_bytes).map_err(|e| { + MiniChainError::Serialization(format!("Failed to deserialize data: {}", e)) + })?; + + info!( + sequence, + epoch = data.epoch, + pending_count = data.pending_evaluations.len(), + "Checkpoint loaded" + ); + + Ok(Some((header, data))) + } + + /// List all available checkpoints + pub fn list_checkpoints(&self) -> Result> { + let mut checkpoints = Vec::new(); + + if let Ok(entries) = fs::read_dir(&self.checkpoint_dir) { + for entry in entries.flatten() { + let path = entry.path(); + if let Some(name) = path.file_name().and_then(|n| n.to_str()) { + if name.starts_with("checkpoint_") && name.ends_with(".bin") { + if let Some(seq_str) = name + .strip_prefix("checkpoint_") + .and_then(|s| s.strip_suffix(".bin")) + { + if let Ok(seq) = seq_str.parse::() { + if let Ok(meta) = entry.metadata() { + if let Ok(modified) = meta.modified() { + checkpoints.push((seq, path, 
modified)); + } + } + } + } + } + } + } + } + + checkpoints.sort_by_key(|(seq, _, _)| *seq); + Ok(checkpoints) + } + + /// Clean up old checkpoints + fn cleanup_old_checkpoints(&self) -> Result<()> { + let checkpoints = self.list_checkpoints()?; + + if checkpoints.len() <= self.max_checkpoints { + return Ok(()); + } + + let to_remove = checkpoints.len() - self.max_checkpoints; + for (seq, path, _) in checkpoints.into_iter().take(to_remove) { + debug!(sequence = seq, path = %path.display(), "Removing old checkpoint"); + if let Err(e) = fs::remove_file(&path) { + warn!(path = %path.display(), error = %e, "Failed to remove old checkpoint"); + } + } + + Ok(()) + } + + /// Get checkpoint directory + pub fn checkpoint_dir(&self) -> &Path { + &self.checkpoint_dir + } + + /// Get current sequence + pub fn current_sequence(&self) -> u64 { + self.current_sequence + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + #[test] + fn test_checkpoint_header() { + let header = CheckpointHeader::new(1, [0u8; 32], 100); + assert!(header.verify_magic()); + assert_eq!(header.version, CHECKPOINT_VERSION); + } + + #[test] + fn test_checkpoint_header_invalid_magic() { + let mut header = CheckpointHeader::new(1, [0u8; 32], 100); + header.magic = *b"INVALID!"; + assert!(!header.verify_magic()); + } + + #[test] + fn test_checkpoint_data_hash() { + let data = CheckpointData::new(1, 0, 100); + let hash1 = data.calculate_hash().unwrap(); + + let mut data2 = data.clone(); + data2.sequence = 2; + let hash2 = data2.calculate_hash().unwrap(); + + assert_ne!(hash1, hash2); + } + + #[test] + fn test_checkpoint_data_new() { + let data = CheckpointData::new(5, 10, 200); + assert_eq!(data.sequence, 5); + assert_eq!(data.epoch, 10); + assert_eq!(data.netuid, 200); + assert!(data.pending_evaluations.is_empty()); + assert!(data.completed_evaluations.is_empty()); + assert!(data.weight_votes.is_none()); + assert_eq!(data.bittensor_block, 0); + assert!(data.metadata.is_empty()); + } + + #[test] + fn test_checkpoint_data_add_pending() { + let mut data = CheckpointData::new(1, 0, 100); + let pending = PendingEvaluationState { + submission_id: "sub1".to_string(), + challenge_id: ChallengeId::new(), + miner: Hotkey([1u8; 32]), + submission_hash: "abc123".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }; + data.add_pending(pending); + assert_eq!(data.pending_evaluations.len(), 1); + } + + #[test] + fn test_checkpoint_data_add_completed() { + let mut data = CheckpointData::new(1, 0, 100); + let completed = CompletedEvaluationState { + submission_id: "sub1".to_string(), + challenge_id: ChallengeId::new(), + final_score: 0.85, + epoch: 5, + completed_at: chrono::Utc::now().timestamp_millis(), + }; + data.add_completed(completed); + assert_eq!(data.completed_evaluations.len(), 1); + } + + #[test] + fn test_checkpoint_manager_roundtrip() { + let dir = tempdir().unwrap(); + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + + let mut data = CheckpointData::new(1, 0, 100); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "sub1".to_string(), + challenge_id: ChallengeId::new(), + miner: Hotkey([1u8; 32]), + submission_hash: "abc123".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + + let path = manager.create_checkpoint(&data).unwrap(); + assert!(path.exists()); + + let (header, loaded) = manager.load_latest().unwrap().unwrap(); + 
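+        // Header metadata and payload should round-trip through the on-disk format unchanged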
assert_eq!(header.sequence, 1); + assert_eq!(loaded.sequence, data.sequence); + assert_eq!(loaded.pending_evaluations.len(), 1); + } + + #[test] + fn test_checkpoint_manager_no_checkpoints() { + let dir = tempdir().unwrap(); + let manager = CheckpointManager::new(dir.path(), 5).unwrap(); + assert!(manager.load_latest().unwrap().is_none()); + assert_eq!(manager.current_sequence(), 0); + } + + #[test] + fn test_checkpoint_cleanup() { + let dir = tempdir().unwrap(); + let mut manager = CheckpointManager::new(dir.path(), 3).unwrap(); + + for i in 0..5 { + let data = CheckpointData::new(i, 0, 100); + manager.create_checkpoint(&data).unwrap(); + } + + let checkpoints = manager.list_checkpoints().unwrap(); + assert_eq!(checkpoints.len(), 3); + } + + #[test] + fn test_checkpoint_list() { + let dir = tempdir().unwrap(); + let mut manager = CheckpointManager::new(dir.path(), 10).unwrap(); + + for i in 0..3 { + let data = CheckpointData::new(i, i, 100); + manager.create_checkpoint(&data).unwrap(); + } + + let checkpoints = manager.list_checkpoints().unwrap(); + assert_eq!(checkpoints.len(), 3); + + // Verify sorted by sequence + assert_eq!(checkpoints[0].0, 1); + assert_eq!(checkpoints[1].0, 2); + assert_eq!(checkpoints[2].0, 3); + } + + #[test] + fn test_checkpoint_load_specific() { + let dir = tempdir().unwrap(); + let mut manager = CheckpointManager::new(dir.path(), 10).unwrap(); + + for i in 0..3 { + let mut data = CheckpointData::new(i, i * 10, 100); + data.metadata + .insert("test_key".to_string(), format!("value_{}", i)); + manager.create_checkpoint(&data).unwrap(); + } + + // Load specific checkpoint + let (header, data) = manager.load_checkpoint(2).unwrap().unwrap(); + assert_eq!(header.sequence, 2); + assert_eq!(data.epoch, 10); + assert_eq!( + data.metadata.get("test_key"), + Some(&"value_1".to_string()) + ); + } + + #[test] + fn test_checkpoint_load_nonexistent() { + let dir = tempdir().unwrap(); + let manager = CheckpointManager::new(dir.path(), 5).unwrap(); + assert!(manager.load_checkpoint(999).unwrap().is_none()); + } + + #[test] + fn test_checkpoint_resume_sequence() { + let dir = tempdir().unwrap(); + + // First manager creates some checkpoints + { + let mut manager = CheckpointManager::new(dir.path(), 10).unwrap(); + for i in 0..3 { + let data = CheckpointData::new(i, i, 100); + manager.create_checkpoint(&data).unwrap(); + } + assert_eq!(manager.current_sequence(), 3); + } + + // New manager should resume from the latest sequence + { + let manager = CheckpointManager::new(dir.path(), 10).unwrap(); + assert_eq!(manager.current_sequence(), 3); + } + } + + #[test] + fn test_checkpoint_with_scores() { + let dir = tempdir().unwrap(); + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + + let mut scores = HashMap::new(); + scores.insert(Hotkey([1u8; 32]), 0.95); + scores.insert(Hotkey([2u8; 32]), 0.87); + + let mut data = CheckpointData::new(1, 5, 100); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "sub_with_scores".to_string(), + challenge_id: ChallengeId::new(), + miner: Hotkey([3u8; 32]), + submission_hash: "hash123".to_string(), + scores, + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: true, + }); + + manager.create_checkpoint(&data).unwrap(); + + let (_, loaded) = manager.load_latest().unwrap().unwrap(); + let pending = &loaded.pending_evaluations[0]; + assert_eq!(pending.scores.len(), 2); + assert_eq!(pending.scores.get(&Hotkey([1u8; 32])), Some(&0.95)); + assert!(pending.finalizing); + } + + #[test] + fn 
test_checkpoint_with_weight_votes() { + let dir = tempdir().unwrap(); + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + + let mut votes = HashMap::new(); + votes.insert(Hotkey([1u8; 32]), vec![(0, 100), (1, 200)]); + votes.insert(Hotkey([2u8; 32]), vec![(0, 150), (1, 150)]); + + let mut data = CheckpointData::new(1, 5, 100); + data.weight_votes = Some(WeightVoteState { + epoch: 5, + netuid: 100, + votes, + finalized: true, + final_weights: Some(vec![(0, 125), (1, 175)]), + }); + + manager.create_checkpoint(&data).unwrap(); + + let (_, loaded) = manager.load_latest().unwrap().unwrap(); + let weight_votes = loaded.weight_votes.unwrap(); + assert_eq!(weight_votes.epoch, 5); + assert!(weight_votes.finalized); + assert_eq!(weight_votes.final_weights, Some(vec![(0, 125), (1, 175)])); + } + + #[test] + fn test_checkpoint_dir_accessor() { + let dir = tempdir().unwrap(); + let manager = CheckpointManager::new(dir.path(), 5).unwrap(); + assert_eq!(manager.checkpoint_dir(), dir.path()); + } + + #[test] + fn test_pending_evaluation_state_clone() { + let state = PendingEvaluationState { + submission_id: "test".to_string(), + challenge_id: ChallengeId::new(), + miner: Hotkey([5u8; 32]), + submission_hash: "hash".to_string(), + scores: HashMap::new(), + created_at: 12345, + finalizing: false, + }; + let cloned = state.clone(); + assert_eq!(cloned.submission_id, state.submission_id); + assert_eq!(cloned.miner, state.miner); + } + + #[test] + fn test_completed_evaluation_state_clone() { + let state = CompletedEvaluationState { + submission_id: "test".to_string(), + challenge_id: ChallengeId::new(), + final_score: 0.75, + epoch: 10, + completed_at: 67890, + }; + let cloned = state.clone(); + assert_eq!(cloned.final_score, state.final_score); + assert_eq!(cloned.epoch, state.epoch); + } + + #[test] + fn test_weight_vote_state_clone() { + let state = WeightVoteState { + epoch: 5, + netuid: 100, + votes: HashMap::new(), + finalized: false, + final_weights: None, + }; + let cloned = state.clone(); + assert_eq!(cloned.epoch, state.epoch); + assert_eq!(cloned.finalized, state.finalized); + } +} diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index b8adaa6..87e1b6f 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -4,6 +4,7 @@ //! Core types and structures for the P2P validator network. 
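+//!
+//! A minimal usage sketch of the checkpoint flow added in this patch set
+//! (illustrative only; the directory path and the epoch/netuid values below
+//! are placeholders, not the validator's real call sites):
+//!
+//! ```ignore
+//! use platform_core::{CheckpointData, CheckpointManager};
+//!
+//! let mut manager = CheckpointManager::new("/tmp/platform-checkpoints", 5)?;
+//! let data = CheckpointData::new(manager.current_sequence() + 1, 0, 100);
+//! manager.create_checkpoint(&data)?;
+//! if let Some((_header, restored)) = manager.load_latest()? {
+//!     assert_eq!(restored.netuid, 100);
+//! }
+//! ```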
pub mod challenge; +pub mod checkpoint; pub mod constants; pub mod crypto; pub mod error; @@ -14,6 +15,10 @@ pub mod state_versioning; pub mod types; pub use challenge::*; +pub use checkpoint::{ + CheckpointData, CheckpointHeader, CheckpointManager, CompletedEvaluationState, + PendingEvaluationState, WeightVoteState, CHECKPOINT_VERSION, +}; pub use constants::*; pub use crypto::*; pub use error::*; From 467e7d4871aa81cdf27e3a70ad2d2c33d5a26856 Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 11:11:42 +0000 Subject: [PATCH 4/8] feat: add restoration system for checkpoint recovery --- crates/core/src/error.rs | 3 + crates/core/src/lib.rs | 4 + crates/core/src/restoration.rs | 614 +++++++++++++++++++++++++++++++++ 3 files changed, 621 insertions(+) create mode 100644 crates/core/src/restoration.rs diff --git a/crates/core/src/error.rs b/crates/core/src/error.rs index de1eb4e..0ed4e80 100644 --- a/crates/core/src/error.rs +++ b/crates/core/src/error.rs @@ -49,6 +49,9 @@ pub enum MiniChainError { #[error("Type mismatch: {0}")] TypeMismatch(String), + + #[error("Validation error: {0}")] + Validation(String), } impl From for MiniChainError { diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index 87e1b6f..ef802c6 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -9,6 +9,7 @@ pub mod constants; pub mod crypto; pub mod error; pub mod message; +pub mod restoration; pub mod schema_guard; pub mod state; pub mod state_versioning; @@ -23,6 +24,9 @@ pub use constants::*; pub use crypto::*; pub use error::*; pub use message::*; +pub use restoration::{ + CheckpointInfo, RestorationManager, RestorationOptions, RestorationResult, Restorable, +}; pub use schema_guard::{verify_schema_integrity, SchemaError}; pub use state::*; pub use state_versioning::*; diff --git a/crates/core/src/restoration.rs b/crates/core/src/restoration.rs new file mode 100644 index 0000000..c2a5eda --- /dev/null +++ b/crates/core/src/restoration.rs @@ -0,0 +1,614 @@ +//! State restoration system for crash/update recovery +//! +//! Handles restoring validator state from checkpoints, including: +//! - Automatic restoration on startup +//! - State validation and migration +//! 
- Partial recovery handling
+
+use crate::checkpoint::{CheckpointData, CheckpointManager, PendingEvaluationState};
+use crate::{ChallengeId, MiniChainError, Result};
+use serde::{Deserialize, Serialize};
+use std::collections::HashSet;
+use std::path::Path;
+use std::time::{Duration, Instant};
+use tracing::{debug, info, warn};
+
+/// Result of a restoration operation
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct RestorationResult {
+    /// Whether restoration was successful
+    pub success: bool,
+    /// Sequence number restored from
+    pub checkpoint_sequence: u64,
+    /// Epoch restored to
+    pub epoch: u64,
+    /// Number of pending evaluations restored
+    pub pending_evaluations_count: usize,
+    /// Number of completed evaluations restored
+    pub completed_evaluations_count: usize,
+    /// Whether weight votes were restored
+    pub weight_votes_restored: bool,
+    /// Time taken for restoration
+    pub duration_ms: u64,
+    /// Any warnings during restoration
+    pub warnings: Vec<String>,
+    /// Error message if failed
+    pub error: Option<String>,
+}
+
+impl RestorationResult {
+    pub fn success(
+        checkpoint_sequence: u64,
+        epoch: u64,
+        pending_count: usize,
+        completed_count: usize,
+        weight_votes: bool,
+        duration_ms: u64,
+    ) -> Self {
+        Self {
+            success: true,
+            checkpoint_sequence,
+            epoch,
+            pending_evaluations_count: pending_count,
+            completed_evaluations_count: completed_count,
+            weight_votes_restored: weight_votes,
+            duration_ms,
+            warnings: Vec::new(),
+            error: None,
+        }
+    }
+
+    pub fn failure(error: String) -> Self {
+        Self {
+            success: false,
+            checkpoint_sequence: 0,
+            epoch: 0,
+            pending_evaluations_count: 0,
+            completed_evaluations_count: 0,
+            weight_votes_restored: false,
+            duration_ms: 0,
+            warnings: Vec::new(),
+            error: Some(error),
+        }
+    }
+
+    pub fn add_warning(&mut self, warning: String) {
+        self.warnings.push(warning);
+    }
+}
+
+/// Options for restoration
+#[derive(Clone, Debug)]
+pub struct RestorationOptions {
+    /// Maximum age of checkpoint to restore from (None = any age)
+    pub max_age: Option<Duration>,
+    /// Whether to validate restored state
+    pub validate_state: bool,
+    /// Whether to skip pending evaluations older than threshold
+    pub skip_stale_evaluations: bool,
+    /// Threshold for stale evaluations (in epochs)
+    pub stale_evaluation_threshold: u64,
+    /// Challenge IDs to restore (None = all)
+    pub challenge_filter: Option<HashSet<ChallengeId>>,
+}
+
+impl Default for RestorationOptions {
+    fn default() -> Self {
+        Self {
+            max_age: Some(Duration::from_secs(24 * 60 * 60)), // 24 hours
+            validate_state: true,
+            skip_stale_evaluations: true,
+            stale_evaluation_threshold: 5, // Skip if > 5 epochs old
+            challenge_filter: None,
+        }
+    }
+}
+
+impl RestorationOptions {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn with_max_age(mut self, age: Duration) -> Self {
+        self.max_age = Some(age);
+        self
+    }
+
+    pub fn without_max_age(mut self) -> Self {
+        self.max_age = None;
+        self
+    }
+
+    pub fn with_validation(mut self, validate: bool) -> Self {
+        self.validate_state = validate;
+        self
+    }
+
+    pub fn with_challenge_filter(mut self, challenges: HashSet<ChallengeId>) -> Self {
+        self.challenge_filter = Some(challenges);
+        self
+    }
+}
+
+/// State restoration manager
+pub struct RestorationManager {
+    checkpoint_manager: CheckpointManager,
+    options: RestorationOptions,
+}
+
+impl RestorationManager {
+    /// Create a new restoration manager
+    pub fn new<P: AsRef<Path>>(checkpoint_dir: P, options: RestorationOptions) -> Result<Self> {
+        let checkpoint_manager = CheckpointManager::new(checkpoint_dir, 10)?;
+        Ok(Self {
+            checkpoint_manager,
+            options,
+        })
+    }
+
+    /// Create with default options
+    pub fn with_defaults<P: AsRef<Path>>(checkpoint_dir: P) -> Result<Self> {
+        Self::new(checkpoint_dir, RestorationOptions::default())
+    }
+
+    /// Attempt to restore from the latest checkpoint
+    pub fn restore_latest(&self) -> Result<Option<(RestorationResult, CheckpointData)>> {
+        let start = Instant::now();
+
+        // Load latest checkpoint
+        let checkpoint = match self.checkpoint_manager.load_latest()? {
+            Some(cp) => cp,
+            None => {
+                info!("No checkpoint found, starting fresh");
+                return Ok(None);
+            }
+        };
+
+        let (header, data) = checkpoint;
+
+        // Check checkpoint age
+        if let Some(max_age) = self.options.max_age {
+            let checkpoint_age = Duration::from_millis(
+                (chrono::Utc::now().timestamp_millis() - header.created_at).max(0) as u64,
+            );
+            if checkpoint_age > max_age {
+                warn!(
+                    sequence = header.sequence,
+                    age_secs = checkpoint_age.as_secs(),
+                    max_age_secs = max_age.as_secs(),
+                    "Checkpoint too old, skipping restoration"
+                );
+                return Ok(None);
+            }
+        }
+
+        // Filter and validate data
+        let filtered_data = self.filter_and_validate(data)?;
+
+        let duration_ms = start.elapsed().as_millis() as u64;
+
+        let mut result = RestorationResult::success(
+            header.sequence,
+            filtered_data.epoch,
+            filtered_data.pending_evaluations.len(),
+            filtered_data.completed_evaluations.len(),
+            filtered_data.weight_votes.is_some(),
+            duration_ms,
+        );
+
+        info!(
+            sequence = header.sequence,
+            epoch = filtered_data.epoch,
+            pending = filtered_data.pending_evaluations.len(),
+            duration_ms,
+            "State restored from checkpoint"
+        );
+
+        // Add warnings for filtered items
+        if self.options.challenge_filter.is_some() {
+            result.add_warning("Some evaluations filtered by challenge".into());
+        }
+
+        Ok(Some((result, filtered_data)))
+    }
+
+    /// Restore from a specific checkpoint sequence
+    pub fn restore_from_sequence(
+        &self,
+        sequence: u64,
+    ) -> Result<Option<(RestorationResult, CheckpointData)>> {
+        let start = Instant::now();
+
+        let checkpoint = match self.checkpoint_manager.load_checkpoint(sequence)?
{ + Some(cp) => cp, + None => { + warn!(sequence, "Checkpoint not found"); + return Ok(None); + } + }; + + let (header, data) = checkpoint; + let filtered_data = self.filter_and_validate(data)?; + let duration_ms = start.elapsed().as_millis() as u64; + + let result = RestorationResult::success( + header.sequence, + filtered_data.epoch, + filtered_data.pending_evaluations.len(), + filtered_data.completed_evaluations.len(), + filtered_data.weight_votes.is_some(), + duration_ms, + ); + + Ok(Some((result, filtered_data))) + } + + /// Filter and validate checkpoint data + fn filter_and_validate(&self, mut data: CheckpointData) -> Result { + // Filter by challenge if specified + if let Some(ref filter) = self.options.challenge_filter { + data.pending_evaluations + .retain(|e| filter.contains(&e.challenge_id)); + data.completed_evaluations + .retain(|e| filter.contains(&e.challenge_id)); + } + + // Skip stale evaluations if enabled + if self.options.skip_stale_evaluations { + let _current_epoch = data.epoch; + let _threshold = self.options.stale_evaluation_threshold; + + let original_count = data.pending_evaluations.len(); + data.pending_evaluations.retain(|_e| { + // Keep if we can't determine staleness or if within threshold + // For now, keep all pending (they don't have epoch info) + true + }); + + let filtered_count = original_count - data.pending_evaluations.len(); + if filtered_count > 0 { + debug!( + filtered = filtered_count, + "Skipped stale pending evaluations" + ); + } + } + + // Validate state if enabled + if self.options.validate_state { + self.validate_data(&data)?; + } + + Ok(data) + } + + /// Validate checkpoint data integrity + fn validate_data(&self, data: &CheckpointData) -> Result<()> { + // Validate epoch is reasonable + if data.epoch > 1_000_000 { + return Err(MiniChainError::Validation( + "Checkpoint epoch seems unreasonably high".into(), + )); + } + + // Validate netuid + if data.netuid == 0 { + warn!("Checkpoint has netuid 0, may need reconfiguration"); + } + + // Validate pending evaluations + for eval in &data.pending_evaluations { + if eval.submission_id.is_empty() { + return Err(MiniChainError::Validation( + "Found pending evaluation with empty submission_id".into(), + )); + } + } + + // Validate weight votes epoch matches + if let Some(ref votes) = data.weight_votes { + if votes.epoch != data.epoch && !votes.finalized { + warn!( + votes_epoch = votes.epoch, + data_epoch = data.epoch, + "Weight votes epoch mismatch (may be stale)" + ); + } + } + + Ok(()) + } + + /// Get list of available checkpoints for restoration + pub fn list_available(&self) -> Result> { + let checkpoints = self.checkpoint_manager.list_checkpoints()?; + + let mut infos = Vec::new(); + for (sequence, _path, _modified) in checkpoints { + if let Some(info) = self.get_checkpoint_info(sequence)? { + infos.push(info); + } + } + + Ok(infos) + } + + /// Get information about a specific checkpoint without full loading + fn get_checkpoint_info(&self, sequence: u64) -> Result> { + match self.checkpoint_manager.load_checkpoint(sequence)? 
{ + Some((header, data)) => Ok(Some(CheckpointInfo { + sequence, + created_at: header.created_at, + epoch: data.epoch, + netuid: data.netuid, + pending_count: data.pending_evaluations.len(), + completed_count: data.completed_evaluations.len(), + has_weight_votes: data.weight_votes.is_some(), + bittensor_block: data.bittensor_block, + })), + None => Ok(None), + } + } + + /// Get the checkpoint manager + pub fn checkpoint_manager(&self) -> &CheckpointManager { + &self.checkpoint_manager + } +} + +/// Information about a checkpoint (lightweight summary) +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct CheckpointInfo { + pub sequence: u64, + pub created_at: i64, + pub epoch: u64, + pub netuid: u16, + pub pending_count: usize, + pub completed_count: usize, + pub has_weight_votes: bool, + pub bittensor_block: u64, +} + +/// Trait for types that can be restored from checkpoints +pub trait Restorable { + /// Restore state from checkpoint data + fn restore_from(&mut self, data: &CheckpointData) -> Result<()>; + + /// Create checkpoint data from current state + fn create_checkpoint(&self) -> Result; +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Hotkey; + use std::collections::HashMap; + use tempfile::tempdir; + + fn create_test_checkpoint_data() -> CheckpointData { + let mut data = CheckpointData::new(1, 5, 100); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "sub1".to_string(), + challenge_id: ChallengeId::new(), + miner: Hotkey([1u8; 32]), + submission_hash: "hash1".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + data + } + + #[test] + fn test_restoration_result() { + let result = RestorationResult::success(1, 5, 10, 20, true, 100); + assert!(result.success); + assert_eq!(result.checkpoint_sequence, 1); + assert_eq!(result.epoch, 5); + + let failure = RestorationResult::failure("test error".to_string()); + assert!(!failure.success); + assert!(failure.error.is_some()); + } + + #[test] + fn test_restoration_options() { + let opts = RestorationOptions::default(); + assert!(opts.max_age.is_some()); + assert!(opts.validate_state); + + let custom = RestorationOptions::new() + .without_max_age() + .with_validation(false); + assert!(custom.max_age.is_none()); + assert!(!custom.validate_state); + } + + #[test] + fn test_restoration_roundtrip() { + let dir = tempdir().unwrap(); + + // Create checkpoint first + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + let data = create_test_checkpoint_data(); + manager.create_checkpoint(&data).unwrap(); + + // Now restore + let restoration = RestorationManager::with_defaults(dir.path()).unwrap(); + let result = restoration.restore_latest().unwrap(); + + assert!(result.is_some()); + let (res, restored_data) = result.unwrap(); + assert!(res.success); + assert_eq!(restored_data.epoch, data.epoch); + assert_eq!(restored_data.pending_evaluations.len(), 1); + } + + #[test] + fn test_restoration_no_checkpoint() { + let dir = tempdir().unwrap(); + let restoration = RestorationManager::with_defaults(dir.path()).unwrap(); + let result = restoration.restore_latest().unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_checkpoint_info() { + let dir = tempdir().unwrap(); + + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + let data = create_test_checkpoint_data(); + manager.create_checkpoint(&data).unwrap(); + + let restoration = RestorationManager::with_defaults(dir.path()).unwrap(); + let infos = 
restoration.list_available().unwrap(); + + assert_eq!(infos.len(), 1); + assert_eq!(infos[0].epoch, 5); + assert_eq!(infos[0].pending_count, 1); + } + + #[test] + fn test_restoration_with_challenge_filter() { + let dir = tempdir().unwrap(); + + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + let challenge1 = ChallengeId::new(); + let challenge2 = ChallengeId::new(); + + let mut data = CheckpointData::new(1, 5, 100); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "sub1".to_string(), + challenge_id: challenge1, + miner: Hotkey([1u8; 32]), + submission_hash: "hash1".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "sub2".to_string(), + challenge_id: challenge2, + miner: Hotkey([2u8; 32]), + submission_hash: "hash2".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + manager.create_checkpoint(&data).unwrap(); + + // Restore with filter for only challenge1 + let mut filter = HashSet::new(); + filter.insert(challenge1); + let options = RestorationOptions::new().with_challenge_filter(filter); + let restoration = RestorationManager::new(dir.path(), options).unwrap(); + let result = restoration.restore_latest().unwrap(); + + assert!(result.is_some()); + let (_res, restored_data) = result.unwrap(); + assert_eq!(restored_data.pending_evaluations.len(), 1); + assert_eq!(restored_data.pending_evaluations[0].challenge_id, challenge1); + } + + #[test] + fn test_restoration_add_warning() { + let mut result = RestorationResult::success(1, 5, 10, 20, true, 100); + assert!(result.warnings.is_empty()); + + result.add_warning("Test warning".to_string()); + assert_eq!(result.warnings.len(), 1); + assert_eq!(result.warnings[0], "Test warning"); + } + + #[test] + fn test_restore_from_sequence() { + let dir = tempdir().unwrap(); + + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + + // Create multiple checkpoints + let mut data = create_test_checkpoint_data(); + manager.create_checkpoint(&data).unwrap(); // seq 1 + + data.epoch = 10; + manager.create_checkpoint(&data).unwrap(); // seq 2 + + let restoration = RestorationManager::with_defaults(dir.path()).unwrap(); + + // Restore from sequence 1 + let result = restoration.restore_from_sequence(1).unwrap(); + assert!(result.is_some()); + let (_res, restored_data) = result.unwrap(); + assert_eq!(restored_data.epoch, 5); + + // Restore from sequence 2 + let result = restoration.restore_from_sequence(2).unwrap(); + assert!(result.is_some()); + let (_res, restored_data) = result.unwrap(); + assert_eq!(restored_data.epoch, 10); + + // Try non-existent sequence + let result = restoration.restore_from_sequence(999).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_validation_unreasonable_epoch() { + let dir = tempdir().unwrap(); + + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + let mut data = create_test_checkpoint_data(); + data.epoch = 2_000_000; // Unreasonably high + manager.create_checkpoint(&data).unwrap(); + + let restoration = RestorationManager::with_defaults(dir.path()).unwrap(); + let result = restoration.restore_latest(); + assert!(result.is_err()); + } + + #[test] + fn test_validation_empty_submission_id() { + let dir = tempdir().unwrap(); + + let mut manager = CheckpointManager::new(dir.path(), 5).unwrap(); + let mut data = CheckpointData::new(1, 5, 
100); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "".to_string(), // Empty - invalid + challenge_id: ChallengeId::new(), + miner: Hotkey([1u8; 32]), + submission_hash: "hash1".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + manager.create_checkpoint(&data).unwrap(); + + let restoration = RestorationManager::with_defaults(dir.path()).unwrap(); + let result = restoration.restore_latest(); + assert!(result.is_err()); + } + + #[test] + fn test_options_with_max_age() { + let opts = RestorationOptions::new().with_max_age(Duration::from_secs(3600)); + assert_eq!(opts.max_age, Some(Duration::from_secs(3600))); + } + + #[test] + fn test_checkpoint_info_struct() { + let info = CheckpointInfo { + sequence: 1, + created_at: 12345, + epoch: 5, + netuid: 1, + pending_count: 10, + completed_count: 20, + has_weight_votes: true, + bittensor_block: 100, + }; + + assert_eq!(info.sequence, 1); + assert_eq!(info.epoch, 5); + assert!(info.has_weight_votes); + } +} From b48d6929fd7a64d224df256e244e1d283ee808b2 Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 11:15:06 +0000 Subject: [PATCH 5/8] feat(rpc-server): add health check endpoints for rolling updates --- crates/rpc-server/src/health.rs | 390 ++++++++++++++++++++++++++++++++ crates/rpc-server/src/lib.rs | 2 + 2 files changed, 392 insertions(+) create mode 100644 crates/rpc-server/src/health.rs diff --git a/crates/rpc-server/src/health.rs b/crates/rpc-server/src/health.rs new file mode 100644 index 0000000..f90a878 --- /dev/null +++ b/crates/rpc-server/src/health.rs @@ -0,0 +1,390 @@ +//! Health check endpoints for validator coordination +//! +//! Provides: +//! - `/health` - Basic liveness check +//! - `/ready` - Readiness check (can accept traffic) +//! - `/live` - Kubernetes-style liveness probe +//! +//! These enable coordinated rolling updates across the validator network. + +use serde::{Deserialize, Serialize}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::{Instant, SystemTime, UNIX_EPOCH}; +use tracing::{info, warn}; + +/// Health status of a component +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum HealthStatus { + /// Component is healthy + Healthy, + /// Component is degraded but operational + Degraded, + /// Component is unhealthy + Unhealthy, + /// Component status is unknown + Unknown, +} + +impl Default for HealthStatus { + fn default() -> Self { + Self::Unknown + } +} + +/// Readiness status for traffic handling +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ReadinessStatus { + /// Ready to accept traffic + Ready, + /// Not ready (initializing, draining, etc.) 
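+    /// (liveness, by contrast, only indicates that the process is running)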
+ NotReady, + /// Draining - finishing current work, not accepting new + Draining, +} + +impl Default for ReadinessStatus { + fn default() -> Self { + Self::NotReady + } +} + +/// Health check response +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct HealthResponse { + /// Overall health status + pub status: HealthStatus, + /// Readiness for traffic + pub ready: ReadinessStatus, + /// Version string + pub version: String, + /// Uptime in seconds + pub uptime_secs: u64, + /// Current epoch + pub epoch: u64, + /// P2P connection count + pub peer_count: u64, + /// Active challenges count + pub active_challenges: u64, + /// Pending evaluations count + pub pending_evaluations: u64, + /// Last checkpoint sequence + pub checkpoint_sequence: u64, + /// Timestamp (Unix millis) + pub timestamp: i64, + /// Component statuses + pub components: ComponentStatus, +} + +/// Status of individual components +#[derive(Clone, Debug, Default, Serialize, Deserialize)] +pub struct ComponentStatus { + /// P2P network status + pub p2p: HealthStatus, + /// Storage status + pub storage: HealthStatus, + /// Consensus status + pub consensus: HealthStatus, + /// Bittensor connection status + pub bittensor: HealthStatus, + /// Challenge containers status + pub challenges: HealthStatus, +} + +/// Health check manager +pub struct HealthCheck { + /// Start time + start_time: Instant, + /// Version string + version: String, + /// Whether ready for traffic + ready: AtomicBool, + /// Whether draining + draining: AtomicBool, + /// Current epoch + epoch: AtomicU64, + /// Peer count + peer_count: AtomicU64, + /// Active challenges + active_challenges: AtomicU64, + /// Pending evaluations + pending_evaluations: AtomicU64, + /// Last checkpoint sequence + checkpoint_sequence: AtomicU64, + /// Component status (using interior mutability) + components: parking_lot::RwLock, +} + +impl HealthCheck { + /// Create a new health check manager + pub fn new(version: impl Into) -> Self { + Self { + start_time: Instant::now(), + version: version.into(), + ready: AtomicBool::new(false), + draining: AtomicBool::new(false), + epoch: AtomicU64::new(0), + peer_count: AtomicU64::new(0), + active_challenges: AtomicU64::new(0), + pending_evaluations: AtomicU64::new(0), + checkpoint_sequence: AtomicU64::new(0), + components: parking_lot::RwLock::new(ComponentStatus::default()), + } + } + + /// Mark as ready for traffic + pub fn set_ready(&self, ready: bool) { + self.ready.store(ready, Ordering::SeqCst); + if ready { + info!("Validator marked as ready for traffic"); + } + } + + /// Start draining (preparing for shutdown) + pub fn start_draining(&self) { + self.draining.store(true, Ordering::SeqCst); + self.ready.store(false, Ordering::SeqCst); + info!("Validator entering drain mode"); + } + + /// Check if draining + pub fn is_draining(&self) -> bool { + self.draining.load(Ordering::SeqCst) + } + + /// Update epoch + pub fn set_epoch(&self, epoch: u64) { + self.epoch.store(epoch, Ordering::SeqCst); + } + + /// Update peer count + pub fn set_peer_count(&self, count: u64) { + self.peer_count.store(count, Ordering::SeqCst); + } + + /// Update active challenges + pub fn set_active_challenges(&self, count: u64) { + self.active_challenges.store(count, Ordering::SeqCst); + } + + /// Update pending evaluations + pub fn set_pending_evaluations(&self, count: u64) { + self.pending_evaluations.store(count, Ordering::SeqCst); + } + + /// Update checkpoint sequence + pub fn set_checkpoint_sequence(&self, seq: u64) { + self.checkpoint_sequence.store(seq, 
Ordering::SeqCst); + } + + /// Update component status + pub fn set_component_status(&self, component: &str, status: HealthStatus) { + let mut components = self.components.write(); + match component { + "p2p" => components.p2p = status, + "storage" => components.storage = status, + "consensus" => components.consensus = status, + "bittensor" => components.bittensor = status, + "challenges" => components.challenges = status, + _ => warn!("Unknown component: {}", component), + } + } + + /// Get overall health status + fn get_overall_status(&self) -> HealthStatus { + let components = self.components.read(); + + // If any component is unhealthy, overall is unhealthy + if components.p2p == HealthStatus::Unhealthy + || components.storage == HealthStatus::Unhealthy + || components.consensus == HealthStatus::Unhealthy + { + return HealthStatus::Unhealthy; + } + + // If any critical component is degraded, overall is degraded + if components.p2p == HealthStatus::Degraded + || components.storage == HealthStatus::Degraded + || components.consensus == HealthStatus::Degraded + { + return HealthStatus::Degraded; + } + + // If Bittensor is down but others are fine, degraded + if components.bittensor == HealthStatus::Unhealthy { + return HealthStatus::Degraded; + } + + HealthStatus::Healthy + } + + /// Get readiness status + fn get_readiness(&self) -> ReadinessStatus { + if self.draining.load(Ordering::SeqCst) { + return ReadinessStatus::Draining; + } + if self.ready.load(Ordering::SeqCst) { + return ReadinessStatus::Ready; + } + ReadinessStatus::NotReady + } + + /// Get full health response + pub fn get_health(&self) -> HealthResponse { + let timestamp = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_millis() as i64; + + HealthResponse { + status: self.get_overall_status(), + ready: self.get_readiness(), + version: self.version.clone(), + uptime_secs: self.start_time.elapsed().as_secs(), + epoch: self.epoch.load(Ordering::SeqCst), + peer_count: self.peer_count.load(Ordering::SeqCst), + active_challenges: self.active_challenges.load(Ordering::SeqCst), + pending_evaluations: self.pending_evaluations.load(Ordering::SeqCst), + checkpoint_sequence: self.checkpoint_sequence.load(Ordering::SeqCst), + timestamp, + components: self.components.read().clone(), + } + } + + /// Basic liveness check (is the process running) + pub fn is_live(&self) -> bool { + // If we can respond, we're live + true + } + + /// Readiness check (can accept traffic) + pub fn is_ready(&self) -> bool { + self.ready.load(Ordering::SeqCst) && !self.draining.load(Ordering::SeqCst) + } +} + +impl Default for HealthCheck { + fn default() -> Self { + Self::new("unknown") + } +} + +/// Create a shared health check instance +pub fn create_health_check(version: &str) -> Arc { + Arc::new(HealthCheck::new(version)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_health_check_creation() { + let health = HealthCheck::new("1.0.0"); + assert_eq!(health.version, "1.0.0"); + assert!(!health.is_ready()); + assert!(!health.is_draining()); + } + + #[test] + fn test_ready_state() { + let health = HealthCheck::new("1.0.0"); + + assert!(!health.is_ready()); + health.set_ready(true); + assert!(health.is_ready()); + + let response = health.get_health(); + assert_eq!(response.ready, ReadinessStatus::Ready); + } + + #[test] + fn test_draining_state() { + let health = HealthCheck::new("1.0.0"); + health.set_ready(true); + + health.start_draining(); + assert!(health.is_draining()); + assert!(!health.is_ready()); + + 
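+        // The aggregated health response should now report Draining readiness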
let response = health.get_health(); + assert_eq!(response.ready, ReadinessStatus::Draining); + } + + #[test] + fn test_component_status() { + let health = HealthCheck::new("1.0.0"); + + health.set_component_status("p2p", HealthStatus::Healthy); + health.set_component_status("storage", HealthStatus::Healthy); + health.set_component_status("consensus", HealthStatus::Healthy); + health.set_component_status("bittensor", HealthStatus::Healthy); + + let response = health.get_health(); + assert_eq!(response.status, HealthStatus::Healthy); + } + + #[test] + fn test_unhealthy_component() { + let health = HealthCheck::new("1.0.0"); + + health.set_component_status("p2p", HealthStatus::Unhealthy); + + let response = health.get_health(); + assert_eq!(response.status, HealthStatus::Unhealthy); + } + + #[test] + fn test_degraded_component() { + let health = HealthCheck::new("1.0.0"); + + health.set_component_status("p2p", HealthStatus::Healthy); + health.set_component_status("storage", HealthStatus::Degraded); + + let response = health.get_health(); + assert_eq!(response.status, HealthStatus::Degraded); + } + + #[test] + fn test_metrics_update() { + let health = HealthCheck::new("1.0.0"); + + health.set_epoch(42); + health.set_peer_count(10); + health.set_active_challenges(3); + health.set_pending_evaluations(5); + health.set_checkpoint_sequence(100); + + let response = health.get_health(); + assert_eq!(response.epoch, 42); + assert_eq!(response.peer_count, 10); + assert_eq!(response.active_challenges, 3); + assert_eq!(response.pending_evaluations, 5); + assert_eq!(response.checkpoint_sequence, 100); + } + + #[test] + fn test_uptime() { + let health = HealthCheck::new("1.0.0"); + + // Just check uptime is a reasonable value (not negative, not huge) + let response = health.get_health(); + assert!(response.uptime_secs < 10); // Should be very small in a test + } + + #[test] + fn test_bittensor_degraded() { + let health = HealthCheck::new("1.0.0"); + + health.set_component_status("p2p", HealthStatus::Healthy); + health.set_component_status("storage", HealthStatus::Healthy); + health.set_component_status("consensus", HealthStatus::Healthy); + health.set_component_status("bittensor", HealthStatus::Unhealthy); + + // Bittensor unhealthy = degraded, not fully unhealthy + let response = health.get_health(); + assert_eq!(response.status, HealthStatus::Degraded); + } +} diff --git a/crates/rpc-server/src/lib.rs b/crates/rpc-server/src/lib.rs index 5559e16..db3dd5f 100644 --- a/crates/rpc-server/src/lib.rs +++ b/crates/rpc-server/src/lib.rs @@ -22,12 +22,14 @@ mod auth; mod handlers; +pub mod health; mod jsonrpc; mod server; mod types; pub use auth::*; pub use handlers::*; +pub use health::{create_health_check, HealthCheck, HealthResponse, HealthStatus, ReadinessStatus}; pub use jsonrpc::*; pub use server::*; pub use types::*; From 42a35be3770b98b836200b6bad08472ab47ee65e Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 11:18:04 +0000 Subject: [PATCH 6/8] docs: add challenge integration guide --- challenges/README.md | 4 + docs/challenge-integration.md | 253 ++++++++++++++++++++++++++++++++++ 2 files changed, 257 insertions(+) create mode 100644 docs/challenge-integration.md diff --git a/challenges/README.md b/challenges/README.md index 6cecfe1..3e1dd59 100644 --- a/challenges/README.md +++ b/challenges/README.md @@ -35,3 +35,7 @@ Challenge crates can also be external (like term-challenge). 
They should:
 ## Example
 
 See [term-challenge](https://github.com/PlatformNetwork/term-challenge) for a complete example.
+
+## Documentation
+
+For detailed integration instructions, see the [Challenge Integration Guide](../docs/challenge-integration.md).
diff --git a/docs/challenge-integration.md b/docs/challenge-integration.md
new file mode 100644
index 0000000..0c66101
--- /dev/null
+++ b/docs/challenge-integration.md
@@ -0,0 +1,253 @@
+# Challenge Integration Guide
+
+This guide explains how to integrate challenge crates with the Platform validator network.
+
+## Overview
+
+Platform uses a modular challenge architecture where each challenge:
+- Runs as a separate Docker container
+- Communicates via HTTP/WebSocket with validators
+- Has its own state persistence
+- Supports hot-reload without losing evaluation progress
+
+## Architecture
+
+```text
+┌──────────────────────────────────────────────────────────────┐
+│                     Platform Validator                      │
+├──────────────────────────────────────────────────────────────┤
+│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐         │
+│  │  Challenge  │  │  Challenge  │  │    State    │         │
+│  │  Registry   │  │ Orchestrator│  │   Manager   │         │
+│  └─────────────┘  └─────────────┘  └─────────────┘         │
+├──────────────────────────────────────────────────────────────┤
+│                     Checkpoint System                       │
+│    (periodic saves, graceful shutdown, crash recovery)      │
+└──────────────────────────────────────────────────────────────┘
+                               │
+               ┌───────────────┼───────────────┐
+               ▼               ▼               ▼
+        ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
+        │ Challenge A │ │ Challenge B │ │ Challenge N │
+        │  (Docker)   │ │  (Docker)   │ │  (Docker)   │
+        └─────────────┘ └─────────────┘ └─────────────┘
+```
+
+## Creating a Challenge Crate
+
+### 1. Project Structure
+
+Your challenge crate should follow this structure:
+
+```
+my-challenge/
+├── Cargo.toml
+├── src/
+│   ├── lib.rs          # Challenge implementation
+│   ├── evaluation.rs   # Evaluation logic
+│   └── scoring.rs      # Scoring algorithm
+├── Dockerfile          # Container build
+└── README.md           # Documentation
+```
+
+### 2. Dependencies
+
+Add the Platform SDK to your `Cargo.toml`:
+
+```toml
+[dependencies]
+platform-challenge-sdk = { git = "https://github.com/PlatformNetwork/platform" }
+```
+
+### 3. Implement the Challenge Trait
+
+```rust
+use platform_challenge_sdk::prelude::*;
+
+pub struct MyChallenge {
+    // Your challenge state
+}
+
+#[async_trait]
+impl ServerChallenge for MyChallenge {
+    fn challenge_id(&self) -> &str {
+        "my-challenge"
+    }
+
+    fn name(&self) -> &str {
+        "My Challenge"
+    }
+
+    fn version(&self) -> &str {
+        env!("CARGO_PKG_VERSION")
+    }
+
+    async fn evaluate(
+        &self,
+        req: EvaluationRequest,
+    ) -> Result<EvaluationResponse> {
+        // Your evaluation logic
+        let score = self.evaluate_submission(&req.data)?;
+
+        Ok(EvaluationResponse::success(
+            &req.request_id,
+            score,
+            json!({"details": "evaluation complete"}),
+        ))
+    }
+}
+```
+
+### 4. Docker Container
+
+Create a `Dockerfile`:
+
+```dockerfile
+FROM rust:1.90 AS builder
+WORKDIR /app
+COPY . .
+RUN cargo build --release
+
+FROM debian:bookworm-slim
+COPY --from=builder /app/target/release/my-challenge /usr/local/bin/
+EXPOSE 8080
+CMD ["my-challenge"]
+```
+
+## State Persistence
+
+### Checkpoint Integration
+
+Challenges automatically benefit from Platform's checkpoint system:
+
+1. **Periodic Checkpoints**: Every 5 minutes
+2. **Shutdown Checkpoints**: On graceful shutdown
+3. **Crash Recovery**: On restart, state is restored
+
+### Custom State
+
+To persist challenge-specific state:
+
+```rust
+use platform_challenge_sdk::database::Database;
+
+impl MyChallenge {
+    pub fn save_state(&self, db: &Database) -> Result<()> {
+        db.set("my_state_key", &self.state)?;
+        Ok(())
+    }
+
+    pub fn load_state(&mut self, db: &Database) -> Result<()> {
+        if let Some(state) = db.get("my_state_key")? {
+            self.state = state;
+        }
+        Ok(())
+    }
+}
+```
+
+## Hot-Reload Support
+
+Platform supports updating challenges without losing evaluation progress:
+
+### 1. Graceful Shutdown Signal
+
+When it receives a shutdown signal (SIGTERM from `docker stop`, or Ctrl+C when run locally), your challenge should:
+1. Stop accepting new evaluations
+2. Complete in-progress evaluations
+3. Persist any local state
+4. Exit cleanly
+
+```rust
+// Note: `ctrl_c()` only covers SIGINT; listen for SIGTERM explicitly
+// (Unix-only, requires tokio's "signal" feature).
+let mut sigterm =
+    tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())?;
+
+tokio::select! {
+    _ = tokio::signal::ctrl_c() => {}
+    _ = sigterm.recv() => {}
+}
+info!("Shutting down gracefully...");
+self.save_state(&db)?;
+```
+
+### 2. Version Compatibility
+
+Ensure backward compatibility between versions:
+
+```rust
+#[derive(Serialize, Deserialize)]
+struct MyState {
+    #[serde(default)]
+    version: u32,
+    // ... fields
+}
+
+impl MyState {
+    fn migrate(&mut self) {
+        if self.version < 2 {
+            // Migration logic
+            self.version = 2;
+        }
+    }
+}
+```
+
+## Health Checks
+
+Implement health check endpoints:
+
+```rust
+// GET /health - Returns 200 if healthy
+// GET /ready  - Returns 200 if ready for traffic
+// GET /live   - Returns 200 if process is alive
+```
+
+A minimal example of serving these endpoints is sketched in the appendix at the end of this guide.
+
+## Registration
+
+### Local Development
+
+Add to workspace `Cargo.toml`:
+
+```toml
+[workspace]
+members = [
+    # ... existing members
+    "challenges/my-challenge",
+]
+```
+
+### Production Deployment
+
+1. Build and push the Docker image
+2. Register via sudo action (network operator only)
+3. Validators automatically pull the image
+
+## Best Practices
+
+1. **Deterministic Evaluation**: Same input should produce the same output
+2. **Timeout Handling**: Set reasonable timeouts
+3. **Resource Limits**: Respect CPU/memory constraints
+4. **Logging**: Use structured logging with `tracing`
+5. **Error Handling**: Return meaningful error messages
+6. **Testing**: Include comprehensive unit tests
+
+## Example Challenges
+
+- [term-challenge](https://github.com/PlatformNetwork/term-challenge) - Terminal benchmark
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Challenge not starting**: Check the Docker logs
+2. **Evaluation timeout**: Increase the timeout or optimize the evaluation
+3. **State loss after update**: Verify checkpoint creation
+4. **Version mismatch**: Check compatibility constraints
+
+### Debugging
+
+Enable debug logging:
+```bash
+RUST_LOG=debug my-challenge
+```
+
+## API Reference
+
+See the [platform-challenge-sdk documentation](../crates/challenge-sdk/README.md).
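+
+## Appendix: Serving the Health Endpoints (sketch)
+
+Neither the SDK nor this guide prescribes a particular HTTP framework for `/health`,
+`/ready`, and `/live`. The sketch below shows one possible wiring using `axum` on
+port 8080 (the port exposed by the example Dockerfile). The framework choice and the
+`Readiness` helper type are illustrative assumptions of this sketch, not SDK items.
+
+```rust
+// Assumed dependencies (illustrative): axum = "0.7", tokio = { version = "1", features = ["full"] }
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+
+use axum::{http::StatusCode, routing::get, Router};
+
+/// Shared readiness flag: set it once startup work is done,
+/// clear it when draining before shutdown.
+#[derive(Clone, Default)]
+struct Readiness(Arc<AtomicBool>);
+
+#[tokio::main]
+async fn main() {
+    let readiness = Readiness::default();
+    let ready = readiness.clone();
+
+    let app = Router::new()
+        // Liveness: the process can respond at all.
+        .route("/live", get(|| async { StatusCode::OK }))
+        // Health: plug real component checks in here.
+        .route("/health", get(|| async { StatusCode::OK }))
+        // Readiness: only accept traffic once initialization has finished.
+        .route(
+            "/ready",
+            get(move || async move {
+                if ready.0.load(Ordering::SeqCst) {
+                    StatusCode::OK
+                } else {
+                    StatusCode::SERVICE_UNAVAILABLE
+                }
+            }),
+        );
+
+    // Flip to ready after state restore / warm-up is complete.
+    readiness.0.store(true, Ordering::SeqCst);
+
+    let listener = tokio::net::TcpListener::bind("0.0.0.0:8080")
+        .await
+        .expect("bind health endpoint listener");
+    axum::serve(listener, app).await.expect("serve health endpoints");
+}
+```
+
+The same pattern extends to the `/evaluate` endpoint; the important part is that
+`/ready` reports not-ready while the challenge is draining, so validators stop
+routing new evaluations to it before shutdown.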
From ef714727e91641a0e6d3cd080a8569247c57d0e5 Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 11:19:52 +0000 Subject: [PATCH 7/8] test: add integration tests for checkpoint and restoration system --- tests/Cargo.toml | 4 + tests/checkpoint_tests.rs | 536 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 540 insertions(+) create mode 100644 tests/checkpoint_tests.rs diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 9037649..c8e3840 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -36,6 +36,10 @@ path = "sudo_action_tests.rs" name = "blockchain_state_tests" path = "blockchain_state_tests.rs" +[[test]] +name = "checkpoint_tests" +path = "checkpoint_tests.rs" + [dependencies] platform-core = { path = "../crates/core" } platform-storage = { path = "../crates/storage" } diff --git a/tests/checkpoint_tests.rs b/tests/checkpoint_tests.rs new file mode 100644 index 0000000..3b9421e --- /dev/null +++ b/tests/checkpoint_tests.rs @@ -0,0 +1,536 @@ +//! Integration tests for checkpoint and restoration system +//! +//! Tests for verifying the checkpoint/restoration system works correctly end-to-end. + +use platform_core::{ + CheckpointData, CheckpointManager, CompletedEvaluationState, PendingEvaluationState, + WeightVoteState, RestorationManager, RestorationOptions, ChallengeId, Hotkey, +}; +use std::collections::HashMap; +use tempfile::tempdir; + +// ============================================================================ +// TEST HELPERS +// ============================================================================ + +/// Create test checkpoint data with realistic content +fn create_test_data() -> CheckpointData { + let mut data = CheckpointData::new(100, 5, 100); + + // Add pending evaluations + for i in 0..5 { + data.pending_evaluations.push(PendingEvaluationState { + submission_id: format!("submission_{}", i), + challenge_id: ChallengeId::new(), + miner: Hotkey([i as u8; 32]), + submission_hash: format!("hash_{}", i), + scores: { + let mut scores = HashMap::new(); + scores.insert(Hotkey([1u8; 32]), 0.85); + scores.insert(Hotkey([2u8; 32]), 0.90); + scores + }, + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + } + + // Add completed evaluations + for i in 0..3 { + data.completed_evaluations.push(CompletedEvaluationState { + submission_id: format!("completed_{}", i), + challenge_id: ChallengeId::new(), + final_score: 0.87 + (i as f64 * 0.01), + epoch: 5, + completed_at: chrono::Utc::now().timestamp_millis(), + }); + } + + // Add weight votes + data.weight_votes = Some(WeightVoteState { + epoch: 5, + netuid: 100, + votes: { + let mut votes = HashMap::new(); + votes.insert(Hotkey([1u8; 32]), vec![(0, 1000), (1, 2000)]); + votes.insert(Hotkey([2u8; 32]), vec![(0, 1500), (1, 1500)]); + votes + }, + finalized: false, + final_weights: None, + }); + + data.bittensor_block = 12345; + data +} + +// ============================================================================ +// CHECKPOINT ROUNDTRIP TESTS +// ============================================================================ + +#[test] +fn test_checkpoint_roundtrip() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 10).expect("Failed to create manager"); + + let original_data = create_test_data(); + + // Create checkpoint + let path = manager + .create_checkpoint(&original_data) + .expect("Failed to create checkpoint"); + assert!(path.exists()); + + // Load checkpoint + let (header, loaded_data) = manager + 
.load_latest() + .expect("Failed to load") + .expect("No checkpoint found"); + + // Verify data integrity + assert_eq!(loaded_data.sequence, original_data.sequence); + assert_eq!(loaded_data.epoch, original_data.epoch); + assert_eq!(loaded_data.netuid, original_data.netuid); + assert_eq!( + loaded_data.pending_evaluations.len(), + original_data.pending_evaluations.len() + ); + assert_eq!( + loaded_data.completed_evaluations.len(), + original_data.completed_evaluations.len() + ); + assert!(loaded_data.weight_votes.is_some()); + assert_eq!(loaded_data.bittensor_block, original_data.bittensor_block); + + // Verify header has correct sequence + assert_eq!(header.sequence, 1); +} + +// ============================================================================ +// MULTIPLE CHECKPOINTS TESTS +// ============================================================================ + +#[test] +fn test_multiple_checkpoints() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + + // Create multiple checkpoints + for i in 0..10 { + let mut data = CheckpointData::new(i, i / 2, 100); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: format!("sub_{}", i), + challenge_id: ChallengeId::new(), + miner: Hotkey([i as u8; 32]), + submission_hash: format!("hash_{}", i), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + manager + .create_checkpoint(&data) + .expect("Failed to create checkpoint"); + } + + // Should only keep 5 checkpoints + let checkpoints = manager.list_checkpoints().expect("Failed to list"); + assert_eq!(checkpoints.len(), 5); + + // Latest should be sequence 10 + let (header, latest) = manager + .load_latest() + .expect("Failed to load") + .expect("No checkpoint"); + assert_eq!(latest.sequence, 9); + assert_eq!(header.sequence, 10); +} + +// ============================================================================ +// RESTORATION TESTS +// ============================================================================ + +#[test] +fn test_restoration_with_options() { + let dir = tempdir().expect("Failed to create temp dir"); + + // Create checkpoint + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + let data = create_test_data(); + manager + .create_checkpoint(&data) + .expect("Failed to create checkpoint"); + + // Restore with options + let options = RestorationOptions::new() + .without_max_age() + .with_validation(true); + + let restoration = + RestorationManager::new(dir.path(), options).expect("Failed to create restoration manager"); + + let result = restoration.restore_latest().expect("Failed to restore"); + assert!(result.is_some()); + + let (res, restored_data) = result.unwrap(); + assert!(res.success); + assert_eq!(restored_data.pending_evaluations.len(), 5); + assert_eq!(restored_data.completed_evaluations.len(), 3); +} + +#[test] +fn test_restoration_empty() { + let dir = tempdir().expect("Failed to create temp dir"); + + let restoration = RestorationManager::with_defaults(dir.path()).expect("Failed to create"); + let result = restoration.restore_latest().expect("Failed to restore"); + + assert!(result.is_none()); +} + +// ============================================================================ +// HASH VERIFICATION TESTS +// ============================================================================ + +#[test] +fn test_checkpoint_hash_verification() { + let 
dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + + let data = create_test_data(); + let path = manager + .create_checkpoint(&data) + .expect("Failed to create"); + + // Corrupt the file + let mut content = std::fs::read(&path).expect("Failed to read"); + if content.len() > 100 { + content[100] ^= 0xFF; // Flip bits + } + std::fs::write(&path, content).expect("Failed to write"); + + // Loading should fail due to hash mismatch + let result = manager.load_checkpoint(1); + assert!(result.is_err()); +} + +// ============================================================================ +// WEIGHT VOTES TESTS +// ============================================================================ + +#[test] +fn test_weight_votes_persistence() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + + let mut data = CheckpointData::new(1, 5, 100); + data.weight_votes = Some(WeightVoteState { + epoch: 5, + netuid: 100, + votes: { + let mut v = HashMap::new(); + v.insert(Hotkey([1u8; 32]), vec![(0, 1000), (1, 2000), (2, 3000)]); + v.insert(Hotkey([2u8; 32]), vec![(0, 1500), (1, 2500), (2, 2000)]); + v.insert(Hotkey([3u8; 32]), vec![(0, 2000), (1, 2000), (2, 2000)]); + v + }, + finalized: true, + final_weights: Some(vec![(0, 4500), (1, 6500), (2, 7000)]), + }); + + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + let (_, loaded) = manager + .load_latest() + .expect("Failed to load") + .expect("No checkpoint"); + + let votes = loaded.weight_votes.expect("No weight votes"); + assert!(votes.finalized); + assert_eq!(votes.votes.len(), 3); + assert_eq!(votes.final_weights.as_ref().unwrap().len(), 3); +} + +// ============================================================================ +// CHECKPOINT INFO TESTS +// ============================================================================ + +#[test] +fn test_checkpoint_info() { + let dir = tempdir().expect("Failed to create temp dir"); + + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + let data = create_test_data(); + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + let restoration = + RestorationManager::with_defaults(dir.path()).expect("Failed to create"); + let infos = restoration.list_available().expect("Failed to list"); + + assert_eq!(infos.len(), 1); + assert_eq!(infos[0].epoch, 5); + assert_eq!(infos[0].netuid, 100); + assert_eq!(infos[0].pending_count, 5); + assert_eq!(infos[0].completed_count, 3); + assert!(infos[0].has_weight_votes); + assert_eq!(infos[0].bittensor_block, 12345); +} + +// ============================================================================ +// SCORING PERSISTENCE TESTS +// ============================================================================ + +#[test] +fn test_pending_evaluation_scores_persistence() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + + let mut data = CheckpointData::new(1, 5, 100); + let mut scores = HashMap::new(); + scores.insert(Hotkey([10u8; 32]), 0.95); + scores.insert(Hotkey([20u8; 32]), 0.87); + scores.insert(Hotkey([30u8; 32]), 0.92); + + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "scored_submission".to_string(), + challenge_id: ChallengeId::new(), + miner: Hotkey([5u8; 32]), + 
submission_hash: "hash_scored".to_string(), + scores, + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: true, + }); + + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + let (_, loaded) = manager + .load_latest() + .expect("Failed to load") + .expect("No checkpoint"); + + let pending = &loaded.pending_evaluations[0]; + assert_eq!(pending.scores.len(), 3); + assert_eq!(pending.scores.get(&Hotkey([10u8; 32])), Some(&0.95)); + assert_eq!(pending.scores.get(&Hotkey([20u8; 32])), Some(&0.87)); + assert_eq!(pending.scores.get(&Hotkey([30u8; 32])), Some(&0.92)); + assert!(pending.finalizing); +} + +// ============================================================================ +// SEQUENCE MANAGEMENT TESTS +// ============================================================================ + +#[test] +fn test_checkpoint_sequence_resume() { + let dir = tempdir().expect("Failed to create temp dir"); + + // First manager creates checkpoints + { + let mut manager = + CheckpointManager::new(dir.path(), 10).expect("Failed to create manager"); + for i in 0..5 { + let data = CheckpointData::new(i, i, 100); + manager + .create_checkpoint(&data) + .expect("Failed to create"); + } + assert_eq!(manager.current_sequence(), 5); + } + + // New manager should resume from the latest sequence + { + let manager = CheckpointManager::new(dir.path(), 10).expect("Failed to create manager"); + assert_eq!(manager.current_sequence(), 5); + } +} + +#[test] +fn test_load_specific_checkpoint() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 10).expect("Failed to create manager"); + + // Create 3 checkpoints with different epochs + for i in 0..3 { + let mut data = CheckpointData::new(i, i * 10, 100); + data.metadata + .insert("marker".to_string(), format!("checkpoint_{}", i)); + manager + .create_checkpoint(&data) + .expect("Failed to create"); + } + + // Load specific checkpoint (sequence 2) + let (header, data) = manager + .load_checkpoint(2) + .expect("Failed to load") + .expect("Not found"); + assert_eq!(header.sequence, 2); + assert_eq!(data.epoch, 10); + assert_eq!(data.metadata.get("marker"), Some(&"checkpoint_1".to_string())); +} + +// ============================================================================ +// METADATA TESTS +// ============================================================================ + +#[test] +fn test_checkpoint_metadata_persistence() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + + let mut data = CheckpointData::new(1, 5, 100); + data.metadata.insert("version".to_string(), "1.0.0".to_string()); + data.metadata.insert("node_id".to_string(), "validator_1".to_string()); + data.metadata.insert("custom_key".to_string(), "custom_value".to_string()); + + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + let (_, loaded) = manager + .load_latest() + .expect("Failed to load") + .expect("No checkpoint"); + + assert_eq!(loaded.metadata.len(), 3); + assert_eq!(loaded.metadata.get("version"), Some(&"1.0.0".to_string())); + assert_eq!(loaded.metadata.get("node_id"), Some(&"validator_1".to_string())); + assert_eq!(loaded.metadata.get("custom_key"), Some(&"custom_value".to_string())); +} + +// ============================================================================ +// COMPLETED EVALUATION TESTS +// 
============================================================================ + +#[test] +fn test_completed_evaluations_persistence() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + + let challenge_id = ChallengeId::new(); + let mut data = CheckpointData::new(1, 5, 100); + + for i in 0..5 { + data.completed_evaluations.push(CompletedEvaluationState { + submission_id: format!("completed_{}", i), + challenge_id, + final_score: 0.80 + (i as f64 * 0.04), + epoch: 5, + completed_at: chrono::Utc::now().timestamp_millis(), + }); + } + + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + let (_, loaded) = manager + .load_latest() + .expect("Failed to load") + .expect("No checkpoint"); + + assert_eq!(loaded.completed_evaluations.len(), 5); + + // Verify score ordering is preserved + for (i, eval) in loaded.completed_evaluations.iter().enumerate() { + let expected_score = 0.80 + (i as f64 * 0.04); + assert!((eval.final_score - expected_score).abs() < 0.001); + assert_eq!(eval.challenge_id, challenge_id); + } +} + +// ============================================================================ +// EMPTY STATE TESTS +// ============================================================================ + +#[test] +fn test_checkpoint_with_empty_state() { + let dir = tempdir().expect("Failed to create temp dir"); + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + + // Empty checkpoint data + let data = CheckpointData::new(0, 0, 100); + + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + let (_, loaded) = manager + .load_latest() + .expect("Failed to load") + .expect("No checkpoint"); + + assert_eq!(loaded.sequence, 0); + assert_eq!(loaded.epoch, 0); + assert!(loaded.pending_evaluations.is_empty()); + assert!(loaded.completed_evaluations.is_empty()); + assert!(loaded.weight_votes.is_none()); + assert!(loaded.metadata.is_empty()); +} + +// ============================================================================ +// RESTORATION VALIDATION TESTS +// ============================================================================ + +#[test] +fn test_restoration_validates_epoch() { + let dir = tempdir().expect("Failed to create temp dir"); + + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + let mut data = CheckpointData::new(1, 2_000_000, 100); // Unreasonably high epoch + data.pending_evaluations.push(PendingEvaluationState { + submission_id: "test".to_string(), + challenge_id: ChallengeId::new(), + miner: Hotkey([1u8; 32]), + submission_hash: "hash".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + // With validation enabled, this should fail + let options = RestorationOptions::new() + .without_max_age() + .with_validation(true); + + let restoration = RestorationManager::new(dir.path(), options).expect("Failed to create"); + let result = restoration.restore_latest(); + assert!(result.is_err()); +} + +#[test] +fn test_restoration_validates_submission_id() { + let dir = tempdir().expect("Failed to create temp dir"); + + let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); + let mut data = CheckpointData::new(1, 5, 100); + data.pending_evaluations.push(PendingEvaluationState { + submission_id: 
"".to_string(), // Empty submission_id is invalid + challenge_id: ChallengeId::new(), + miner: Hotkey([1u8; 32]), + submission_hash: "hash".to_string(), + scores: HashMap::new(), + created_at: chrono::Utc::now().timestamp_millis(), + finalizing: false, + }); + manager + .create_checkpoint(&data) + .expect("Failed to create"); + + // With validation enabled, this should fail + let options = RestorationOptions::new() + .without_max_age() + .with_validation(true); + + let restoration = RestorationManager::new(dir.path(), options).expect("Failed to create"); + let result = restoration.restore_latest(); + assert!(result.is_err()); +} From 54b966121169a814b93e590a951a4f7b62b7c379 Mon Sep 17 00:00:00 2001 From: echobt Date: Tue, 3 Feb 2026 11:21:22 +0000 Subject: [PATCH 8/8] feat: add graceful shutdown with checkpoint persistence - Add ShutdownHandler struct for checkpoint management - Create periodic checkpoints every 5 minutes - Save final checkpoint on graceful shutdown (Ctrl+C) - Persist evaluation state for hot-reload recovery This enables validators to update without losing evaluation progress. --- Cargo.lock | 1 + bins/validator-node/src/main.rs | 126 ++++++++++++++++++++- crates/challenge-registry/src/health.rs | 7 +- crates/challenge-registry/src/lifecycle.rs | 11 +- crates/challenge-registry/src/migration.rs | 32 +++++- crates/challenge-registry/src/registry.rs | 24 ++-- crates/core/src/checkpoint.rs | 15 +-- crates/core/src/lib.rs | 2 +- crates/core/src/restoration.rs | 5 +- tests/checkpoint_tests.rs | 82 ++++++-------- 10 files changed, 221 insertions(+), 84 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 462e0ac..d4e27a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4788,6 +4788,7 @@ dependencies = [ "serde_json", "sha2 0.10.9", "sp-core 31.0.0", + "tempfile", "thiserror 2.0.17", "tracing", "uuid", diff --git a/bins/validator-node/src/main.rs b/bins/validator-node/src/main.rs index 2173084..7f325f5 100644 --- a/bins/validator-node/src/main.rs +++ b/bins/validator-node/src/main.rs @@ -12,7 +12,13 @@ use platform_bittensor::{ sync_metagraph, BittensorClient, BlockSync, BlockSyncConfig, BlockSyncEvent, Metagraph, Subtensor, SubtensorClient, }; -use platform_core::{Hotkey, Keypair, SUDO_KEY_SS58}; +use platform_core::{ + checkpoint::{ + CheckpointData, CheckpointManager, CompletedEvaluationState, PendingEvaluationState, + WeightVoteState, + }, + Hotkey, Keypair, SUDO_KEY_SS58, +}; use platform_distributed_storage::{ DistributedStoreExt, LocalStorage, LocalStorageBuilder, StorageKey, }; @@ -20,7 +26,7 @@ use platform_p2p_consensus::{ ChainState, ConsensusEngine, NetworkEvent, P2PConfig, P2PMessage, P2PNetwork, StateManager, ValidatorRecord, ValidatorSet, }; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::Duration; use tracing::{debug, error, info, warn}; @@ -28,6 +34,86 @@ use tracing::{debug, error, info, warn}; /// Storage key for persisted chain state const STATE_STORAGE_KEY: &str = "chain_state"; +// ==================== Shutdown Handler ==================== + +/// Handles graceful shutdown with state persistence +struct ShutdownHandler { + checkpoint_manager: CheckpointManager, + state_manager: Arc, + netuid: u16, +} + +impl ShutdownHandler { + fn new(checkpoint_dir: &Path, state_manager: Arc, netuid: u16) -> Result { + let checkpoint_manager = CheckpointManager::new(checkpoint_dir.join("checkpoints"), 10)?; + Ok(Self { + checkpoint_manager, + state_manager, + netuid, + }) + } + + /// Create checkpoint from current state + fn 
create_checkpoint(&mut self) -> Result<()> { + let state = self.state_manager.snapshot(); + + let mut checkpoint_data = CheckpointData::new(state.sequence, state.epoch, self.netuid); + + // Convert pending evaluations + for (id, record) in &state.pending_evaluations { + let pending = PendingEvaluationState { + submission_id: id.clone(), + challenge_id: record.challenge_id, + miner: record.miner.clone(), + submission_hash: record.agent_hash.clone(), + scores: record + .evaluations + .iter() + .map(|(k, v)| (k.clone(), v.score)) + .collect(), + created_at: record.created_at, + finalizing: record.finalized, + }; + checkpoint_data.add_pending(pending); + } + + // Convert completed evaluations (current epoch only) + if let Some(completed) = state.completed_evaluations.get(&state.epoch) { + for record in completed { + if let Some(score) = record.aggregated_score { + let completed_state = CompletedEvaluationState { + submission_id: record.submission_id.clone(), + challenge_id: record.challenge_id, + final_score: score, + epoch: state.epoch, + completed_at: record.finalized_at.unwrap_or(record.created_at), + }; + checkpoint_data.add_completed(completed_state); + } + } + } + + // Convert weight votes + if let Some(ref votes) = state.weight_votes { + checkpoint_data.weight_votes = Some(WeightVoteState { + epoch: votes.epoch, + netuid: votes.netuid, + votes: votes.votes.clone(), + finalized: votes.finalized, + final_weights: votes.final_weights.clone(), + }); + } + + checkpoint_data.bittensor_block = state.bittensor_block; + + self.checkpoint_manager + .create_checkpoint(&checkpoint_data)?; + info!("Shutdown checkpoint created at sequence {}", state.sequence); + + Ok(()) + } +} + // ==================== CLI ==================== #[derive(Parser, Debug)] @@ -252,6 +338,22 @@ async fn main() -> Result<()> { bittensor_client_for_metagraph = None; } + // Initialize shutdown handler for graceful checkpoint persistence + let mut shutdown_handler = + match ShutdownHandler::new(&data_dir, state_manager.clone(), args.netuid) { + Ok(handler) => { + info!("Shutdown handler initialized with checkpoint directory"); + Some(handler) + } + Err(e) => { + warn!( + "Failed to initialize shutdown handler: {}. Checkpoints disabled.", + e + ); + None + } + }; + info!("Decentralized validator running. Press Ctrl+C to stop."); let netuid = args.netuid; @@ -260,6 +362,7 @@ async fn main() -> Result<()> { let mut metagraph_interval = tokio::time::interval(Duration::from_secs(300)); let mut stale_check_interval = tokio::time::interval(Duration::from_secs(60)); let mut state_persist_interval = tokio::time::interval(Duration::from_secs(60)); + let mut checkpoint_interval = tokio::time::interval(Duration::from_secs(300)); // 5 minutes loop { tokio::select! 
{ @@ -335,8 +438,27 @@ async fn main() -> Result<()> { debug!("Active validators: {}", validator_set.active_count()); } + // Periodic checkpoint + _ = checkpoint_interval.tick() => { + if let Some(handler) = shutdown_handler.as_mut() { + if let Err(e) = handler.create_checkpoint() { + warn!("Failed to create periodic checkpoint: {}", e); + } else { + debug!("Periodic checkpoint created"); + } + } + } + // Ctrl+C _ = tokio::signal::ctrl_c() => { + info!("Received shutdown signal, creating final checkpoint..."); + if let Some(handler) = shutdown_handler.as_mut() { + if let Err(e) = handler.create_checkpoint() { + error!("Failed to create shutdown checkpoint: {}", e); + } else { + info!("Shutdown checkpoint saved successfully"); + } + } info!("Shutting down..."); break; } diff --git a/crates/challenge-registry/src/health.rs b/crates/challenge-registry/src/health.rs index e142fdb..5973271 100644 --- a/crates/challenge-registry/src/health.rs +++ b/crates/challenge-registry/src/health.rs @@ -5,11 +5,11 @@ //! - Container status //! - Resource usage +use parking_lot::RwLock; use platform_core::ChallengeId; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::time::Duration; -use parking_lot::RwLock; /// Health status of a challenge #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] @@ -67,7 +67,10 @@ impl ChallengeHealth { /// Check if the challenge is operational (healthy or degraded) pub fn is_operational(&self) -> bool { - matches!(self.status, HealthStatus::Healthy | HealthStatus::Degraded(_)) + matches!( + self.status, + HealthStatus::Healthy | HealthStatus::Degraded(_) + ) } /// Record a successful health check diff --git a/crates/challenge-registry/src/lifecycle.rs b/crates/challenge-registry/src/lifecycle.rs index a2ba334..8e99e44 100644 --- a/crates/challenge-registry/src/lifecycle.rs +++ b/crates/challenge-registry/src/lifecycle.rs @@ -137,7 +137,9 @@ mod tests { fn test_valid_transitions() { let lifecycle = ChallengeLifecycle::new(); - assert!(lifecycle.is_valid_transition(&LifecycleState::Registered, &LifecycleState::Starting)); + assert!( + lifecycle.is_valid_transition(&LifecycleState::Registered, &LifecycleState::Starting) + ); assert!(lifecycle.is_valid_transition(&LifecycleState::Starting, &LifecycleState::Running)); assert!(lifecycle.is_valid_transition(&LifecycleState::Running, &LifecycleState::Stopping)); assert!(lifecycle.is_valid_transition(&LifecycleState::Stopping, &LifecycleState::Stopped)); @@ -147,14 +149,15 @@ mod tests { fn test_invalid_transitions() { let lifecycle = ChallengeLifecycle::new(); - assert!(!lifecycle.is_valid_transition(&LifecycleState::Registered, &LifecycleState::Running)); + assert!( + !lifecycle.is_valid_transition(&LifecycleState::Registered, &LifecycleState::Running) + ); assert!(!lifecycle.is_valid_transition(&LifecycleState::Stopped, &LifecycleState::Running)); } #[test] fn test_lifecycle_config() { - let lifecycle = ChallengeLifecycle::new() - .with_auto_restart(false, 5); + let lifecycle = ChallengeLifecycle::new().with_auto_restart(false, 5); assert!(!lifecycle.auto_restart_enabled()); assert_eq!(lifecycle.max_restart_attempts(), 5); diff --git a/crates/challenge-registry/src/migration.rs b/crates/challenge-registry/src/migration.rs index 002c543..dae3a1a 100644 --- a/crates/challenge-registry/src/migration.rs +++ b/crates/challenge-registry/src/migration.rs @@ -143,7 +143,10 @@ impl MigrationPlan { /// Check if migration is complete pub fn is_complete(&self) -> bool { - matches!(self.status, 
MigrationStatus::Completed | MigrationStatus::RolledBack) + matches!( + self.status, + MigrationStatus::Completed | MigrationStatus::RolledBack + ) } /// Check if migration can be rolled back @@ -229,7 +232,12 @@ impl ChallengeMigration { )); } - let mut plan = MigrationPlan::new(challenge_id, challenge_name, from_version.clone(), to_version.clone()); + let mut plan = MigrationPlan::new( + challenge_id, + challenge_name, + from_version.clone(), + to_version.clone(), + ); // Generate migration steps based on version difference // This is a simplified version - real implementation would analyze schemas @@ -237,7 +245,10 @@ impl ChallengeMigration { plan.add_step( MigrationStep::new( "major_upgrade".to_string(), - format!("Major version upgrade from {} to {}", from_version.major, to_version.major), + format!( + "Major version upgrade from {} to {}", + from_version.major, to_version.major + ), from_version.clone(), to_version.clone(), ) @@ -248,7 +259,10 @@ impl ChallengeMigration { plan.add_step( MigrationStep::new( "minor_upgrade".to_string(), - format!("Minor version upgrade from {} to {}", from_version, to_version), + format!( + "Minor version upgrade from {} to {}", + from_version, to_version + ), from_version.clone(), to_version.clone(), ) @@ -258,7 +272,10 @@ impl ChallengeMigration { plan.add_step( MigrationStep::new( "patch_upgrade".to_string(), - format!("Patch version upgrade from {} to {}", from_version, to_version), + format!( + "Patch version upgrade from {} to {}", + from_version, to_version + ), from_version, to_version, ) @@ -419,7 +436,10 @@ mod tests { let active = migration.get_active_migration(&id); assert!(active.is_some()); - assert!(matches!(active.unwrap().status, MigrationStatus::InProgress)); + assert!(matches!( + active.unwrap().status, + MigrationStatus::InProgress + )); let complete = migration.complete_step(&id).unwrap(); assert!(complete); diff --git a/crates/challenge-registry/src/registry.rs b/crates/challenge-registry/src/registry.rs index 1c2a0bd..39ad982 100644 --- a/crates/challenge-registry/src/registry.rs +++ b/crates/challenge-registry/src/registry.rs @@ -112,10 +112,7 @@ impl ChallengeRegistry { let name = entry.name.clone(); let state_store = Arc::new(StateStore::new(id)); - let registered = RegisteredChallenge { - entry, - state_store, - }; + let registered = RegisteredChallenge { entry, state_store }; challenges.insert(id, registered); name_index.insert(name.clone(), id); @@ -178,11 +175,7 @@ impl ChallengeRegistry { } /// Update challenge lifecycle state - pub fn update_state( - &self, - id: &ChallengeId, - new_state: LifecycleState, - ) -> RegistryResult<()> { + pub fn update_state(&self, id: &ChallengeId, new_state: LifecycleState) -> RegistryResult<()> { let mut challenges = self.challenges.write(); let registered = challenges .get_mut(id) @@ -264,7 +257,10 @@ impl ChallengeRegistry { /// Get state store for a challenge pub fn state_store(&self, id: &ChallengeId) -> Option> { - self.challenges.read().get(id).map(|r| r.state_store.clone()) + self.challenges + .read() + .get(id) + .map(|r| r.state_store.clone()) } /// Add event listener @@ -398,7 +394,9 @@ mod tests { ); let id = registry.register(entry).unwrap(); - let old = registry.update_version(&id, ChallengeVersion::new(1, 1, 0)).unwrap(); + let old = registry + .update_version(&id, ChallengeVersion::new(1, 1, 0)) + .unwrap(); assert_eq!(old, ChallengeVersion::new(1, 0, 0)); @@ -426,7 +424,9 @@ mod tests { registry.register(entry2).unwrap(); // Make first one active - 
registry.update_state(&id1, LifecycleState::Running).unwrap(); + registry + .update_state(&id1, LifecycleState::Running) + .unwrap(); registry.update_health(&id1, HealthStatus::Healthy).unwrap(); let active = registry.list_active(); diff --git a/crates/core/src/checkpoint.rs b/crates/core/src/checkpoint.rs index b627e4a..12e32c1 100644 --- a/crates/core/src/checkpoint.rs +++ b/crates/core/src/checkpoint.rs @@ -244,8 +244,8 @@ impl CheckpointManager { // Create header let header = CheckpointHeader::new(sequence, data_hash, data_bytes.len() as u64); - let header_bytes = - bincode::serialize(&header).map_err(|e| MiniChainError::Serialization(e.to_string()))?; + let header_bytes = bincode::serialize(&header) + .map_err(|e| MiniChainError::Serialization(e.to_string()))?; // Write to file atomically (write to temp, then rename) let temp_filename = filename.with_extension("tmp"); @@ -320,9 +320,9 @@ impl CheckpointManager { // Read header length let mut header_len_bytes = [0u8; 4]; - reader.read_exact(&mut header_len_bytes).map_err(|e| { - MiniChainError::Storage(format!("Failed to read header length: {}", e)) - })?; + reader + .read_exact(&mut header_len_bytes) + .map_err(|e| MiniChainError::Storage(format!("Failed to read header length: {}", e)))?; let header_len = u32::from_le_bytes(header_len_bytes) as usize; // Read header @@ -598,10 +598,7 @@ mod tests { let (header, data) = manager.load_checkpoint(2).unwrap().unwrap(); assert_eq!(header.sequence, 2); assert_eq!(data.epoch, 10); - assert_eq!( - data.metadata.get("test_key"), - Some(&"value_1".to_string()) - ); + assert_eq!(data.metadata.get("test_key"), Some(&"value_1".to_string())); } #[test] diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index ef802c6..5936e5f 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -25,7 +25,7 @@ pub use crypto::*; pub use error::*; pub use message::*; pub use restoration::{ - CheckpointInfo, RestorationManager, RestorationOptions, RestorationResult, Restorable, + CheckpointInfo, Restorable, RestorationManager, RestorationOptions, RestorationResult, }; pub use schema_guard::{verify_schema_integrity, SchemaError}; pub use state::*; diff --git a/crates/core/src/restoration.rs b/crates/core/src/restoration.rs index c2a5eda..53db1a9 100644 --- a/crates/core/src/restoration.rs +++ b/crates/core/src/restoration.rs @@ -507,7 +507,10 @@ mod tests { assert!(result.is_some()); let (_res, restored_data) = result.unwrap(); assert_eq!(restored_data.pending_evaluations.len(), 1); - assert_eq!(restored_data.pending_evaluations[0].challenge_id, challenge1); + assert_eq!( + restored_data.pending_evaluations[0].challenge_id, + challenge1 + ); } #[test] diff --git a/tests/checkpoint_tests.rs b/tests/checkpoint_tests.rs index 3b9421e..1bccf51 100644 --- a/tests/checkpoint_tests.rs +++ b/tests/checkpoint_tests.rs @@ -3,8 +3,8 @@ //! Tests for verifying the checkpoint/restoration system works correctly end-to-end. 
use platform_core::{ - CheckpointData, CheckpointManager, CompletedEvaluationState, PendingEvaluationState, - WeightVoteState, RestorationManager, RestorationOptions, ChallengeId, Hotkey, + ChallengeId, CheckpointData, CheckpointManager, CompletedEvaluationState, Hotkey, + PendingEvaluationState, RestorationManager, RestorationOptions, WeightVoteState, }; use std::collections::HashMap; use tempfile::tempdir; @@ -197,9 +197,7 @@ fn test_checkpoint_hash_verification() { let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); let data = create_test_data(); - let path = manager - .create_checkpoint(&data) - .expect("Failed to create"); + let path = manager.create_checkpoint(&data).expect("Failed to create"); // Corrupt the file let mut content = std::fs::read(&path).expect("Failed to read"); @@ -237,9 +235,7 @@ fn test_weight_votes_persistence() { final_weights: Some(vec![(0, 4500), (1, 6500), (2, 7000)]), }); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); let (_, loaded) = manager .load_latest() @@ -262,12 +258,9 @@ fn test_checkpoint_info() { let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); let data = create_test_data(); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); - let restoration = - RestorationManager::with_defaults(dir.path()).expect("Failed to create"); + let restoration = RestorationManager::with_defaults(dir.path()).expect("Failed to create"); let infos = restoration.list_available().expect("Failed to list"); assert_eq!(infos.len(), 1); @@ -304,9 +297,7 @@ fn test_pending_evaluation_scores_persistence() { finalizing: true, }); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); let (_, loaded) = manager .load_latest() @@ -331,13 +322,10 @@ fn test_checkpoint_sequence_resume() { // First manager creates checkpoints { - let mut manager = - CheckpointManager::new(dir.path(), 10).expect("Failed to create manager"); + let mut manager = CheckpointManager::new(dir.path(), 10).expect("Failed to create manager"); for i in 0..5 { let data = CheckpointData::new(i, i, 100); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); } assert_eq!(manager.current_sequence(), 5); } @@ -359,9 +347,7 @@ fn test_load_specific_checkpoint() { let mut data = CheckpointData::new(i, i * 10, 100); data.metadata .insert("marker".to_string(), format!("checkpoint_{}", i)); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); } // Load specific checkpoint (sequence 2) @@ -371,7 +357,10 @@ fn test_load_specific_checkpoint() { .expect("Not found"); assert_eq!(header.sequence, 2); assert_eq!(data.epoch, 10); - assert_eq!(data.metadata.get("marker"), Some(&"checkpoint_1".to_string())); + assert_eq!( + data.metadata.get("marker"), + Some(&"checkpoint_1".to_string()) + ); } // ============================================================================ @@ -384,13 +373,14 @@ fn test_checkpoint_metadata_persistence() { let mut manager = CheckpointManager::new(dir.path(), 5).expect("Failed to create manager"); let mut data = CheckpointData::new(1, 5, 100); - data.metadata.insert("version".to_string(), "1.0.0".to_string()); - 
data.metadata.insert("node_id".to_string(), "validator_1".to_string()); - data.metadata.insert("custom_key".to_string(), "custom_value".to_string()); + data.metadata + .insert("version".to_string(), "1.0.0".to_string()); + data.metadata + .insert("node_id".to_string(), "validator_1".to_string()); + data.metadata + .insert("custom_key".to_string(), "custom_value".to_string()); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); let (_, loaded) = manager .load_latest() @@ -399,8 +389,14 @@ fn test_checkpoint_metadata_persistence() { assert_eq!(loaded.metadata.len(), 3); assert_eq!(loaded.metadata.get("version"), Some(&"1.0.0".to_string())); - assert_eq!(loaded.metadata.get("node_id"), Some(&"validator_1".to_string())); - assert_eq!(loaded.metadata.get("custom_key"), Some(&"custom_value".to_string())); + assert_eq!( + loaded.metadata.get("node_id"), + Some(&"validator_1".to_string()) + ); + assert_eq!( + loaded.metadata.get("custom_key"), + Some(&"custom_value".to_string()) + ); } // ============================================================================ @@ -414,7 +410,7 @@ fn test_completed_evaluations_persistence() { let challenge_id = ChallengeId::new(); let mut data = CheckpointData::new(1, 5, 100); - + for i in 0..5 { data.completed_evaluations.push(CompletedEvaluationState { submission_id: format!("completed_{}", i), @@ -425,9 +421,7 @@ fn test_completed_evaluations_persistence() { }); } - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); let (_, loaded) = manager .load_latest() @@ -435,7 +429,7 @@ fn test_completed_evaluations_persistence() { .expect("No checkpoint"); assert_eq!(loaded.completed_evaluations.len(), 5); - + // Verify score ordering is preserved for (i, eval) in loaded.completed_evaluations.iter().enumerate() { let expected_score = 0.80 + (i as f64 * 0.04); @@ -456,9 +450,7 @@ fn test_checkpoint_with_empty_state() { // Empty checkpoint data let data = CheckpointData::new(0, 0, 100); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); let (_, loaded) = manager .load_latest() @@ -492,9 +484,7 @@ fn test_restoration_validates_epoch() { created_at: chrono::Utc::now().timestamp_millis(), finalizing: false, }); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); // With validation enabled, this should fail let options = RestorationOptions::new() @@ -521,9 +511,7 @@ fn test_restoration_validates_submission_id() { created_at: chrono::Utc::now().timestamp_millis(), finalizing: false, }); - manager - .create_checkpoint(&data) - .expect("Failed to create"); + manager.create_checkpoint(&data).expect("Failed to create"); // With validation enabled, this should fail let options = RestorationOptions::new()