From 741ee988004f0f7062965e4d31f9553be0692f86 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 22 Jul 2025 16:03:28 -0700
Subject: [PATCH 01/27] add nvbench kernel launch

---
 .gitignore                                    |   3 +
 .gitmodules                                   |   3 +
 examples/benchmarks/nvbench_kernel_launch.py  |  25 +++
 .../benchmarks/micro_benchmarks/__init__.py   |   2 +
 .../micro_benchmarks/nvbench/CMakeLists.txt   |  15 ++
 .../micro_benchmarks/nvbench/launch_bench.cu  |  11 ++
 .../micro_benchmarks/nvbench_kernel_launch.py | 173 ++++++++++++++++++
 .../test_nvbench_kernel_launch.py             | 105 +++++++++++
 tests/data/nvbench_kernel_launch.log          |  34 ++++
 third_party/Makefile                          |  13 ++
 10 files changed, 384 insertions(+)
 create mode 100644 examples/benchmarks/nvbench_kernel_launch.py
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench/launch_bench.cu
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
 create mode 100644 tests/data/nvbench_kernel_launch.log

diff --git a/.gitignore b/.gitignore
index 5888455a8..97b51dcd0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -151,6 +151,9 @@ cython_debug/
 *.userosscache
 *.sln.docstates
 
+# Build temporary files
+compile_commands.json
+
 # Build results
 [Dd]ebug/
 [Dd]ebugPublic/
diff --git a/.gitmodules b/.gitmodules
index 9be41b59b..f82383ae9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -33,3 +33,6 @@
 [submodule "third_party/nvbandwidth"]
 	path = third_party/nvbandwidth
 	url = https://github.com/NVIDIA/nvbandwidth.git
+[submodule "third_party/nvbench"]
+	path = third_party/nvbench
+	url = https://github.com/NVIDIA/nvbench.git
diff --git a/examples/benchmarks/nvbench_kernel_launch.py b/examples/benchmarks/nvbench_kernel_launch.py
new file mode 100644
index 000000000..73377084b
--- /dev/null
+++ b/examples/benchmarks/nvbench_kernel_launch.py
@@ -0,0 +1,25 @@
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    context = BenchmarkRegistry.create_benchmark_context(
+        'nvbench-kernel-launch',
+        platform=Platform.CUDA,
+        parameters=(
+            '--timeout 30 '
+            '--min-samples 10 '
+            '--min-time 1.0 '
+            '--max-noise 0.1 '
+            '--stopping-criterion stdrel '
+            '--throttle-threshold 80 '
+            '--throttle-recovery-delay 1.0'
+        )
+    )
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 978c2d385..9ae738444 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -39,6 +39,7 @@
 from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw
 from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
 from superbench.benchmarks.micro_benchmarks.nvbandwidth import NvBandwidthBenchmark
+from superbench.benchmarks.micro_benchmarks.nvbench_kernel_launch import NvbenchKernelLaunch
 
 __all__ = [
     'BlasLtBaseBenchmark',
@@ -77,4 +78,5 @@
     'DirectXGPUMemBw',
     'DirectXGPUCoreFlops',
     'NvBandwidthBenchmark',
+    'NvbenchKernelLaunch'
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
new file mode 100644
index 000000000..a23d722a3
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.20)
+project(nvbench_benchmarks LANGUAGES CUDA)
+
+find_package(CUDAToolkit QUIET)
+if(CUDAToolkit_FOUND)
+  include(../cuda_common.cmake)
+
+  find_package(nvbench CONFIG REQUIRED)
+  add_executable(nvbench_kernel_launch launch_bench.cu)
+  target_compile_features(nvbench_kernel_launch PUBLIC cuda_std_17)
+  target_link_libraries(nvbench_kernel_launch
+    PRIVATE nvbench::nvbench nvbench::main
+  )
+  install(TARGETS nvbench_kernel_launch RUNTIME DESTINATION bin)
+endif()
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/launch_bench.cu b/superbench/benchmarks/micro_benchmarks/nvbench/launch_bench.cu
new file mode 100644
index 000000000..f3e1bc7f0
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/launch_bench.cu
@@ -0,0 +1,11 @@
+#include <nvbench/nvbench.cuh>
+
+__global__ void empty_kernel() {}
+
+void launch_bench(nvbench::state &state) {
+  state.exec([](nvbench::launch &launch) {
+    empty_kernel<<<1, 1, 0, launch.get_stream()>>>();
+  });
+}
+
+NVBENCH_BENCH(launch_bench);
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
new file mode 100644
index 000000000..74cf94383
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -0,0 +1,173 @@
+import os
+import re
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+def parse_time_to_us(raw: str) -> float:
+    """Convert an nvbench time string ('123.45 us', '678.9 ns', '0.12 ms', '1.5 s') to microseconds."""
+    raw = raw.strip()
+    if raw.endswith('%'):    # percentage strings are returned as the plain number
+        return float(raw[:-1])
+    # Accept both "value unit" and "valueunit"; a missing unit is treated as microseconds.
+    m = re.match(r'([\d.]+)\s*([mun]?s)?', raw)
+    if not m:
+        return float(raw)    # no leading digits (e.g. a signed value): defer to float()
+    val, unit = float(m.group(1)), (m.group(2) or 'us')
+    if unit == 'ns':  return val / 1e3
+    if unit == 'ms':  return val * 1e3
+    return val * 1e6 if unit == 's' else val    # the regex also accepts plain seconds
+
+class NvbenchKernelLaunch(MicroBenchmarkWithInvoke):
+    """Nvbench benchmark wrapper for SuperBench."""
+    def __init__(self, name, parameters=None):
+        """Initialize the benchmark and set the binary name to nvbench_kernel_launch."""
+        super().__init__(name, parameters)
+        self._bin_name = "nvbench_kernel_launch"
+    
+    def add_parser_arguments(self):
+        """Add the NVBench CLI options exposed by this benchmark.
+        Covers (Queries, Device modification, Output, Benchmark / Axis Specification are excluded):
+          - Benchmark Properties (omitting --run-once, --disable-blocking-kernel, --profile)
+          - Stopping Criteria
+        """
+        super().add_parser_arguments()
+
+        # Benchmark Properties
+        self._parser.add_argument(
+            '--devices', '--device', '-d', type=str, default=None,
+            help='Limit execution to one or more device IDs (comma-separated or "all").'
+        )
+        self._parser.add_argument(
+            '--skip-time', type=float, default=-1.0,
+            help='Skip a measurement when a warmup run executes in less than this (seconds).'
+        )
+        # With a threshold > 0 and a recovery delay > 0, NVBench will automatically pause
+        # and "wait for the card to warm up" back to a stable high-clock state, giving you
+        # more consistent (and lower) timings that reflect the GPU's true peak performance.
+        self._parser.add_argument(
+            '--throttle-threshold', type=float, default=75.0,
+            help="GPU throttle threshold as percent of default clock rate. Disabled when nvbench::exec_tag::sync is used."
+        )
+        self._parser.add_argument(
+            '--throttle-recovery-delay', type=float, default=0.05,
+            help='Seconds to wait after throttle before resuming. '
+            'Disabled when nvbench::exec_tag::sync is used.'
+        )
+
+        # Stopping Criteria
+        self._parser.add_argument(
+            '--timeout', type=int, default=15,
+            help='Walltime timeout in seconds for each measurement.'
+        )
+        self._parser.add_argument(
+            '--min-samples', type=int, default=10,
+            help='Minimum number of samples per measurement before checking other criteria.'
+        )
+        self._parser.add_argument(
+            '--stopping-criterion', type=str, default='stdrel',
+            choices=['stdrel', 'entropy'],
+            help='Stopping criterion to use after --min-samples is satisfied: '
+            '"stdrel" or "entropy".'
+        )
+        # stdrel-specific
+        self._parser.add_argument(
+            '--min-time', type=float, default=0.5,
+            help='(stdrel) Minimum execution time accumulated per measurement (seconds).'
+        )
+        self._parser.add_argument(
+            '--max-noise', type=float, default=0.5,
+            help='(stdrel) Maximum relative standard deviation (%%) before stopping.'
+        )
+        # entropy-specific
+        self._parser.add_argument(
+            '--max-angle', type=float, default=0.048,
+            help='(entropy) Maximum linear regression angle of cumulative entropy.'
+        )
+        self._parser.add_argument(
+            '--min-r2', type=float, default=0.36,
+            help='(entropy) Minimum coefficient of determination (R²) for linear regression of cumulative entropy.'
+        )
+
+    def _preprocess(self):
+        """Build the nvbench_kernel_launch command line from the parsed arguments.
+        Return:
+            True if _preprocess() succeeds, False if the parent preprocessing fails.
+        """
+        if not super()._preprocess():
+            return False
+
+        bin_path = os.path.join(self._args.bin_dir, self._bin_name)
+        parts = [bin_path]
+
+        # Benchmark Properties: forwarded only when set (skip-time >= 0, throttle values > 0)
+        if self._args.devices:
+            parts.extend(['--devices', self._args.devices])
+        if self._args.skip_time >= 0:
+            parts.extend(['--skip-time', str(self._args.skip_time)])
+        if self._args.throttle_threshold > 0:
+            parts.extend(['--throttle-threshold', str(self._args.throttle_threshold)])
+        if self._args.throttle_recovery_delay > 0:
+            parts.extend(['--throttle-recovery-delay', str(self._args.throttle_recovery_delay)])
+
+        # Stopping Criteria: only the options of the selected criterion are forwarded
+        if self._args.timeout is not None:
+            parts.extend(['--timeout', str(self._args.timeout)])
+        if self._args.min_samples is not None:
+            parts.extend(['--min-samples', str(self._args.min_samples)])
+        if self._args.stopping_criterion:
+            parts.extend(['--stopping-criterion', self._args.stopping_criterion])
+            if self._args.stopping_criterion == 'stdrel':
+                if self._args.min_time is not None:
+                    parts.extend(['--min-time', str(self._args.min_time)])
+                if self._args.max_noise is not None:
+                    parts.extend(['--max-noise', str(self._args.max_noise)])
+            elif self._args.stopping_criterion == 'entropy':
+                if self._args.max_angle is not None:
+                    parts.extend(['--max-angle', str(self._args.max_angle)])
+                if self._args.min_r2 is not None:
+                    parts.extend(['--min-r2', str(self._args.min_r2)])
+
+        # A single command string: the binary path followed by the collected options
+        self._commands = [' '.join(parts)]
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Parse the nvbench result table in raw_output into per-GPU metrics; return False on parsing failure."""
+        self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
+        try:
+            gpu_section = r"### \[(\d+)\] NVIDIA"
+            row_pat = (  # table cells are padded to the column width, so allow spaces around every value
+                r"\|\s*(\d+)x\s*\|\s*([\d.]+ ?[mun]?s)\s*\|\s*([\d.]+%)\s*\|\s*"
+                r"([\d.]+ ?[mun]?s)\s*\|\s*([\d.]+%)\s*\|\s*(\d+)x\s*\|\s*([\d.]+ ?[mun]?s)\s*\|"
+            )
+            current, parsed_any = None, False  # current "gpu_<id>" prefix; whether any valid row was parsed
+            for line in raw_output.splitlines():
+                line = line.strip()
+                g = re.match(gpu_section, line)
+                if g:
+                    current = f"gpu_{g.group(1)}"
+                    continue
+                r = re.match(row_pat, line)
+                if r and current:
+                    self._result.add_result(f"{current}_samples", int(r.group(1)))
+                    self._result.add_result(f"{current}_cpu_time", parse_time_to_us(r.group(2)))
+                    self._result.add_result(f"{current}_cpu_noise", float(r.group(3)[:-1]))
+                    self._result.add_result(f"{current}_gpu_time", parse_time_to_us(r.group(4)))
+                    self._result.add_result(f"{current}_gpu_noise", float(r.group(5)[:-1]))
+                    self._result.add_result(f"{current}_batch_samples", int(r.group(6)))
+                    self._result.add_result(f"{current}_batch_gpu_time", parse_time_to_us(r.group(7)))
+                    parsed_any = True
+            if not parsed_any:
+                logger.error("No valid rows parsed from the raw output.")
+                raise RuntimeError("No valid rows parsed")
+        except Exception as e:
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+            logger.error(
+                f"Invalid result format - round:{self._curr_run_index}, bench:{self._name}, msg:{e}\n{raw_output}"
+            )
+            return False
+        return True
+
+# Register the benchmark
+BenchmarkRegistry.register_benchmark("nvbench-kernel-launch", NvbenchKernelLaunch, platform=Platform.CUDA)
\ No newline at end of file
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
new file mode 100644
index 000000000..7c131a6c8
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
@@ -0,0 +1,105 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for nvbench kernel launch benchmark."""
+
+import unittest
+
+from tests.helper import decorator
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform
+
+
+class TestNvbenchKernelLaunchBenchmark(BenchmarkTestCase, unittest.TestCase):
+    """Test class for NVBench Kernel Launch benchmark."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Hook method for setting up class fixture before running tests in the class."""
+        super().setUpClass()
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/nvbench_kernel_launch'])
+
+    def test_nvbench_kernel_launch_preprocess(self):
+        """Test NVBench Kernel Launch benchmark preprocess."""
+        benchmark_name = 'nvbench-kernel-launch'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test preprocess with default parameters
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Test preprocess with specified parameters
+        parameters = (
+            '--device 0 '
+            '--timeout 20 '
+            '--min-samples 300 '
+            '--stopping-criterion stdrel '
+            '--min-time 2.0 '
+            '--max-noise 0.5 '
+            '--throttle-threshold 80.0 '
+            '--throttle-recovery-delay 1.0'
+        )
+        benchmark = benchmark_class(benchmark_name, parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Check command
+        assert (1 == len(benchmark._commands))
+        assert ('--devices 0' in benchmark._commands[0])
+        assert ('--timeout 20' in benchmark._commands[0])
+        assert ('--min-samples 300' in benchmark._commands[0])
+        assert ('--stopping-criterion stdrel' in benchmark._commands[0])
+        assert ('--min-time 2.0' in benchmark._commands[0])
+        assert ('--max-noise 0.5' in benchmark._commands[0])
+        assert ('--throttle-threshold 80.0' in benchmark._commands[0])
+        assert ('--throttle-recovery-delay 1.0' in benchmark._commands[0])
+
+    @decorator.load_data('tests/data/nvbench_kernel_launch.log')
+    def test_nvbench_kernel_launch_result_parsing_real_output(self, results):
+        """Test NVBench Kernel Launch benchmark result parsing."""
+        benchmark_name = 'nvbench-kernel-launch'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+
+        # Preprocess and validate command
+        assert benchmark._preprocess()
+
+        # Parse the provided raw output
+        assert benchmark._process_raw_result(0, results)
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Validate parsed results
+        assert benchmark.result['gpu_0_samples'][0] == 120000
+        assert benchmark.result['gpu_0_cpu_time'][0] == 24.222
+        assert benchmark.result['gpu_0_cpu_noise'][0] == 30.44
+        assert benchmark.result['gpu_0_gpu_time'][0] == 7.808
+        assert benchmark.result['gpu_0_gpu_noise'][0] == 14.42
+        assert benchmark.result['gpu_0_batch_samples'][0] == 300000
+        assert benchmark.result['gpu_0_batch_gpu_time'][0] == 6.024
+
+    def test_nvbench_kernel_launch_process_raw_result_invalid_output(self):
+        """Test NVBench Kernel Launch benchmark result parsing with invalid output."""
+        benchmark_name = 'nvbench-kernel-launch'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+
+        # Preprocess and validate command
+        assert benchmark._preprocess()
+
+        # Mock raw output with invalid format
+        raw_output = "Invalid output format"
+
+        # Parse the provided raw output
+        assert not benchmark._process_raw_result(0, raw_output)
+        assert benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/tests/data/nvbench_kernel_launch.log b/tests/data/nvbench_kernel_launch.log
new file mode 100644
index 000000000..95ccbc065
--- /dev/null
+++ b/tests/data/nvbench_kernel_launch.log
@@ -0,0 +1,34 @@
+# Devices
+
+## [0] `NVIDIA GPU`
+* SM Version: 900 (PTX Version: 900)
+* Number of SMs: 100
+* SM Default Clock Rate: 800 MHz
+* Global Memory: 100000 MiB Free / 100000 MiB Total
+* Global Memory Bus Peak: 3000 GB/sec (3000-bit DDR @4000MHz)
+* Max Shared Memory: 100 KiB/SM, 20 KiB/Block
+* L2 Cache Size: 1000 KiB
+* Maximum Active Blocks: 10/SM
+* Maximum Active Threads: 500/SM, 300/Block
+* Available Registers: 500/SM, 500/Block
+* ECC Enabled: Yes
+
+# Log
+
+```
+Run:  [1/1] launch_bench [Device=0]
+Warn: Current measurement timed out (20.00s) while over noise threshold (14.42% > 0.50%)
+Warn: Current measurement timed out (20.00s) before accumulating min_time (0.94s < 2.00s)
+Pass: Cold: 0.005878ms GPU, 0.022181ms CPU, 0.94s total GPU, 20.00s total wall, 120000x 
+Pass: Batch: 0.004024ms GPU, 2.00s total GPU, 2.00s total wall, 300000x
+```
+
+# Benchmark Results
+
+## launch_bench
+
+### [0] NVIDIA GPU
+
+| Samples | CPU Time  | Noise  | GPU Time | Noise  | Samples | Batch GPU |
+|---------|-----------|--------|----------|--------|---------|-----------|
+| 120000x | 24.222 us | 30.44% | 7.808 us | 14.42% | 300000x |  6.024 us |
\ No newline at end of file
diff --git a/third_party/Makefile b/third_party/Makefile
index 667a46a47..465b5d6e4 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -25,6 +25,7 @@ cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcne
 rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
 cpu: common cpu_perftest
 common: fio cpu_stream
+nvbench_only: nvbench
 
 # non aarch64 specific targets
 ifneq ($(shell uname -m), aarch64)
@@ -301,3 +302,15 @@ endif
 nvbandwidth: sb_micro_path
 	cd ./nvbandwidth && cmake . && make && cd ..
 	cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin
+
+# Build nvbench
+nvbench: sb_micro_path
+	cd ./nvbench && mkdir -p build && cd build && \
+	cmake                                         \
+	-DCMAKE_INSTALL_PREFIX=$(SB_MICRO_PATH)       \
+	-DCMAKE_CUDA_ARCHITECTURES=100                \
+	-DNVBench_ENABLE_CUPTI=ON                     \
+	-DCMAKE_BUILD_TYPE=Release                    \
+	.. &&                                         \
+	make -j $(NUM_MAKE_JOBS) &&                   \
+	make install

From 0ae7864c52d914ac95c7852e0093ae48489f3e46 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 22 Jul 2025 23:44:24 +0000
Subject: [PATCH 02/27] submodule update

---
 third_party/nvbench | 1 +
 1 file changed, 1 insertion(+)
 create mode 160000 third_party/nvbench

diff --git a/third_party/nvbench b/third_party/nvbench
new file mode 160000
index 000000000..3de9dc95d
--- /dev/null
+++ b/third_party/nvbench
@@ -0,0 +1 @@
+Subproject commit 3de9dc95da3fc859900a799918a0c1d9091920ff

From 35bfb61c118dab3436577797977f806da7861382 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 30 Jul 2025 05:55:02 +0000
Subject: [PATCH 03/27] init sleep kernel

---
 examples/benchmarks/nvbench_sleep_kernel.py   |  28 ++
 .../benchmarks/micro_benchmarks/__init__.py   |   4 +-
 .../micro_benchmarks/nvbench/CMakeLists.txt   |  29 +-
 .../{launch_bench.cu => kernel_launch.cu}     |   4 +-
 .../micro_benchmarks/nvbench/sleep_kernel.cu  |  23 ++
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 262 ++++++++++++++++++
 .../test_nvbench_sleep_kernel.py              | 110 ++++++++
 tests/data/nvbench_sleep_kernel.log           |  43 +++
 third_party/Makefile                          |  25 +-
 9 files changed, 509 insertions(+), 19 deletions(-)
 create mode 100644 examples/benchmarks/nvbench_sleep_kernel.py
 rename superbench/benchmarks/micro_benchmarks/nvbench/{launch_bench.cu => kernel_launch.cu} (70%)
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
 create mode 100644 tests/data/nvbench_sleep_kernel.log

diff --git a/examples/benchmarks/nvbench_sleep_kernel.py b/examples/benchmarks/nvbench_sleep_kernel.py
new file mode 100644
index 000000000..083bd0a7c
--- /dev/null
+++ b/examples/benchmarks/nvbench_sleep_kernel.py
@@ -0,0 +1,28 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Example of NVBench Sleep Kernel benchmark."""
+
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.common.utils import logger
+
+
+def main():
+    """Main method to run the nvbench sleep kernel benchmark."""
+    context = BenchmarkRegistry.create_benchmark_context(
+        'nvbench-sleep-kernel', platform=Platform.CUDA, parameters='--duration_us "[25,50,75]" --timeout 10'
+    )
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
+    else:
+        logger.error('benchmark: nvbench-sleep-kernel launch failed.')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 9ae738444..ac50ce252 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -40,6 +40,7 @@
 from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
 from superbench.benchmarks.micro_benchmarks.nvbandwidth import NvBandwidthBenchmark
 from superbench.benchmarks.micro_benchmarks.nvbench_kernel_launch import NvbenchKernelLaunch
+from superbench.benchmarks.micro_benchmarks.nvbench_sleep_kernel import NvbenchSleepKernel
 
 __all__ = [
     'BlasLtBaseBenchmark',
@@ -78,5 +79,6 @@
     'DirectXGPUMemBw',
     'DirectXGPUCoreFlops',
     'NvBandwidthBenchmark',
-    'NvbenchKernelLaunch'
+    'NvbenchKernelLaunch',
+    'NvbenchSleepKernel'
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index a23d722a3..448e83800 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -2,14 +2,29 @@ cmake_minimum_required(VERSION 3.20)
 project(nvbench_benchmarks LANGUAGES CUDA)
 
 find_package(CUDAToolkit QUIET)
-if(CUDAToolkit_FOUND)
+if (CUDAToolkit_FOUND)
   include(../cuda_common.cmake)
-
   find_package(nvbench CONFIG REQUIRED)
-  add_executable(nvbench_kernel_launch launch_bench.cu)
-  target_compile_features(nvbench_kernel_launch PUBLIC cuda_std_17)
-  target_link_libraries(nvbench_kernel_launch
-    PRIVATE nvbench::nvbench nvbench::main
+
+  # list all your CUDA benchmark source files here
+  set(NVBENCH_SOURCES
+    kernel_launch.cu
+    sleep_kernel.cu
+    # add more *.cu as needed
   )
-  install(TARGETS nvbench_kernel_launch RUNTIME DESTINATION bin)
+
+  foreach(src ${NVBENCH_SOURCES})
+    # strip ".cu" → NAME_WE
+    get_filename_component(basename ${src} NAME_WE)
+    set(target nvbench_${basename})
+
+    add_executable(${target} ${src})
+    target_compile_features(${target} PUBLIC cuda_std_17)
+    target_link_libraries(${target}
+      PRIVATE nvbench::nvbench nvbench::main
+    )
+    install(TARGETS ${target} RUNTIME DESTINATION bin)
+  endforeach()
+else()
+  message(STATUS "CUDA not found, skipping nvbench benchmarks.")
 endif()
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/launch_bench.cu b/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
similarity index 70%
rename from superbench/benchmarks/micro_benchmarks/nvbench/launch_bench.cu
rename to superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
index f3e1bc7f0..f703c587a 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/launch_bench.cu
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
@@ -2,10 +2,10 @@
 
 __global__ void empty_kernel() {}
 
-void launch_bench(nvbench::state &state) {
+void kernel_launch(nvbench::state &state) {
   state.exec([](nvbench::launch &launch) {
     empty_kernel<<<1, 1, 0, launch.get_stream()>>>();
   });
 }
 
-NVBENCH_BENCH(launch_bench);
\ No newline at end of file
+NVBENCH_BENCH(kernel_launch);
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu b/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
new file mode 100644
index 000000000..d486d8ee7
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
@@ -0,0 +1,23 @@
+#include <nvbench/nvbench.cuh>
+#include <cuda/std/chrono>
+#include <cuda_runtime.h>
+
+__global__ void sleep_kernel(nvbench::int64_t microseconds) {
+  const auto start = cuda::std::chrono::high_resolution_clock::now();
+  const auto target_duration = cuda::std::chrono::microseconds(microseconds);
+  const auto finish = start + target_duration;
+
+  while (cuda::std::chrono::high_resolution_clock::now() < finish) {
+    // busy wait
+  }
+}
+
+void sleep_benchmark(nvbench::state &state) {
+  const auto duration_us = state.get_int64("Duration (us)");
+  state.exec([&duration_us](nvbench::launch &launch) {
+    sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration_us);
+  });
+}
+NVBENCH_BENCH(sleep_benchmark)
+    .add_int64_axis("Duration (us)", nvbench::range(0, 100, 5))
+    .set_timeout(1); // Limit to one second per measurement.
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
new file mode 100644
index 000000000..2171a8b6b
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -0,0 +1,262 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Module of the NVBench Sleep Kernel benchmark."""
+
+import re
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
+from superbench.benchmarks.micro_benchmarks.micro_base import MicroBenchmarkWithInvoke
+
+
+class NvbenchSleepKernel(MicroBenchmarkWithInvoke):
+    """The NVBench Sleep Kernel benchmark class."""
+
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'nvbench_sleep_kernel'
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--devices',
+            type=str,
+            default=None,
+            help='Device list to run the benchmark, e.g., "0,1,2,3" or "all".',
+        )
+        self._parser.add_argument(
+            '--duration_us',
+            type=str,
+            default='[0,25,50,75,100]',
+            help='Duration axis values in microseconds, e.g., "[0,25,50,75,100]".',
+        )
+        self._parser.add_argument(
+            '--skip_time',
+            type=float,
+            default=-1.0,
+            help='Skip time in seconds.',
+        )
+        self._parser.add_argument(
+            '--throttle_threshold',
+            type=float,
+            default=75.0,
+            help='Throttle threshold percentage.',
+        )
+        self._parser.add_argument(
+            '--throttle_recovery_delay',
+            type=float,
+            default=0.05,
+            help='Throttle recovery delay in seconds.',
+        )
+        self._parser.add_argument(
+            '--run_once',
+            action='store_true',
+            help='Run once flag.',
+        )
+        self._parser.add_argument(
+            '--disable_blocking_kernel',
+            action='store_true',
+            help='Disable blocking kernel flag.',
+        )
+        self._parser.add_argument(
+            '--profile',
+            action='store_true',
+            help='Enable profiling flag.',
+        )
+        self._parser.add_argument(
+            '--timeout',
+            type=int,
+            default=15,
+            help='Timeout in seconds.',
+        )
+        self._parser.add_argument(
+            '--min_samples',
+            type=int,
+            default=None,
+            help='Minimum number of samples.',
+        )
+        self._parser.add_argument(
+            '--stopping_criterion',
+            type=str,
+            default='stdrel',
+            choices=['stdrel', 'entropy'],
+            help='Stopping criterion.',
+        )
+        self._parser.add_argument(
+            '--min_time',
+            type=float,
+            default=None,
+            help='Minimum time for stdrel stopping criterion.',
+        )
+        self._parser.add_argument(
+            '--max_noise',
+            type=float,
+            default=None,
+            help='Maximum noise for stdrel stopping criterion.',
+        )
+        self._parser.add_argument(
+            '--max_angle',
+            type=float,
+            default=None,
+            help='Maximum angle for entropy stopping criterion.',
+        )
+        self._parser.add_argument(
+            '--min_r2',
+            type=float,
+            default=None,
+            help='Minimum R-squared for entropy stopping criterion.',
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        command = str(self._args.bin_dir / self._bin_name)
+        parts = [command]
+
+        # Basic configuration
+        if self._args.devices is not None:
+            if self._args.devices == 'all':
+                parts.extend(['--devices', 'all'])
+            else:
+                parts.extend(['--devices', self._args.devices])
+
+        # Duration axis
+        parts.extend(['--axis', f'"Duration (us)={self._args.duration_us}"'])
+
+        # Performance configuration
+        if self._args.skip_time >= 0:
+            parts.extend(['--skip-time', str(self._args.skip_time)])
+        parts.extend(['--throttle-threshold', str(self._args.throttle_threshold)])
+        parts.extend(['--throttle-recovery-delay', str(self._args.throttle_recovery_delay)])
+        if self._args.run_once:
+            parts.append('--run-once')
+        if self._args.disable_blocking_kernel:
+            parts.append('--disable-blocking-kernel')
+        if self._args.profile:
+            parts.append('--profile')
+
+        # Stopping criteria
+        if self._args.timeout is not None:
+            parts.extend(['--timeout', str(self._args.timeout)])
+        if self._args.min_samples is not None:
+            parts.extend(['--min-samples', str(self._args.min_samples)])
+        if self._args.stopping_criterion:
+            parts.extend(['--stopping-criterion', self._args.stopping_criterion])
+            if self._args.stopping_criterion == 'stdrel':
+                if self._args.min_time is not None:
+                    parts.extend(['--min-time', str(self._args.min_time)])
+                if self._args.max_noise is not None:
+                    parts.extend(['--max-noise', str(self._args.max_noise)])
+            elif self._args.stopping_criterion == 'entropy':
+                if self._args.max_angle is not None:
+                    parts.extend(['--max-angle', str(self._args.max_angle)])
+                if self._args.min_r2 is not None:
+                    parts.extend(['--min-r2', str(self._args.min_r2)])
+
+        # finalize command
+        self._commands = [' '.join(parts)]
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        logger.debug(f"Processing raw result for command index {cmd_idx}.")
+        logger.debug(f"Raw output:\n{raw_output}")
+
+        self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
+        try:
+            gpu_section = r"### \[(\d+)\] NVIDIA"
+            row_pat = (
+                r"\|\s*(\d+)\s*\|\s*(\d+)x\s*\|\s*([\d.]+ ?[mun]?s)\s*\|\s*([\d.]+%)\s*\|\s*"
+                r"([\d.]+ ?[mun]?s)\s*\|\s*([\d.]+%)\s*\|\s*(\d+)x\s*\|\s*([\d.]+ ?[mun]?s)\s*\|"
+            )
+            current = None
+            parsed_any = False
+            for line in raw_output.splitlines():
+                line = line.strip()
+                logger.debug(f"Processing line: {line}")
+                g = re.match(gpu_section, line)
+                if g:
+                    current = f"gpu_{g.group(1)}"
+                    logger.debug(f"Found GPU section: {current}")
+                    continue
+                r = re.match(row_pat, line)
+                if r and current:
+                    logger.debug(f"Matched row: {r.groups()}")
+                    duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_samples', int(samples))
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_cpu_time', self._parse_time_value(cpu_time))
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_batch_samples', int(batch_samples))
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_batch_gpu', self._parse_time_value(batch_gpu))
+                    parsed_any = True
+            if not parsed_any:
+                raise RuntimeError("No valid rows parsed")
+        except Exception as e:
+            logger.error(f"Error processing raw result: {e}")
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+            return False
+        return True
+
+    def _parse_time_value(self, time_str):
+        """Parse time string to microseconds.
+
+        Args:
+            time_str (str): Time string like "25.321 us", "1.234 ms", etc.
+
+        Returns:
+            float: Time value in microseconds.
+        """
+        time_str = time_str.strip()
+        if time_str.endswith('us'):
+            return float(time_str[:-2].strip())
+        elif time_str.endswith('ms'):
+            return float(time_str[:-2].strip()) * 1000
+        elif time_str.endswith('ns'):
+            return float(time_str[:-2].strip()) / 1000
+        elif time_str.endswith('s'):
+            return float(time_str[:-1].strip()) * 1000000
+        else:
+            # Assume microseconds if no unit
+            return float(time_str)
+
+    def _parse_percentage(self, percent_str):
+        """Parse percentage string to float.
+
+        Args:
+            percent_str (str): Percentage string like "0.93%".
+
+        Returns:
+            float: Percentage value as float.
+        """
+        return float(percent_str[:-1].strip())
+
+
+BenchmarkRegistry.register_benchmark('nvbench-sleep-kernel', NvbenchSleepKernel, platform=Platform.CUDA)
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
new file mode 100644
index 000000000..2784a007c
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -0,0 +1,110 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for nvbench sleep kernel benchmark."""
+
+import unittest
+
+from tests.helper import decorator
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform
+
+
+class TestNvbenchSleepKernelBenchmark(BenchmarkTestCase, unittest.TestCase):
+    """Test class for NVBench Sleep Kernel benchmark."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Hook method for setting up class fixture before running tests in the class."""
+        super().setUpClass()
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/nvbench_sleep_kernel'])
+
+    def test_nvbench_sleep_kernel_preprocess(self):
+        """Test NVBench Sleep Kernel benchmark preprocess."""
+        benchmark_name = 'nvbench-sleep-kernel'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test preprocess with default parameters
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Test preprocess with specified parameters
+        parameters = (
+            '--devices 0 '
+            '--duration_us "[10,25,50,75]" '
+            '--timeout 20 '
+            '--min_samples 300 '
+            '--stopping_criterion stdrel '
+            '--min_time 2.0 '
+            '--max_noise 0.5 '
+            '--throttle_threshold 80.0 '
+            '--throttle_recovery_delay 1.0'
+        )
+        benchmark = benchmark_class(benchmark_name, parameters=parameters)
+        assert benchmark._preprocess()
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Check command
+        assert (1 == len(benchmark._commands))
+        assert ('--devices 0' in benchmark._commands[0])
+        assert ('--axis "Duration (us)=[10,25,50,75]"' in benchmark._commands[0])
+        assert ('--timeout 20' in benchmark._commands[0])
+        assert ('--min-samples 300' in benchmark._commands[0])
+        assert ('--stopping-criterion stdrel' in benchmark._commands[0])
+        assert ('--min-time 2.0' in benchmark._commands[0])
+        assert ('--max-noise 0.5' in benchmark._commands[0])
+        assert ('--throttle-threshold 80.0' in benchmark._commands[0])
+        assert ('--throttle-recovery-delay 1.0' in benchmark._commands[0])
+
+    @decorator.load_data('tests/data/nvbench_sleep_kernel.log')
+    def test_nvbench_sleep_kernel_result_parsing_real_output(self, results):
+        """Test NVBench Sleep Kernel benchmark result parsing."""
+        benchmark_name = 'nvbench-sleep-kernel'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+
+        # Preprocess and validate command
+        assert benchmark._preprocess()
+
+        # Parse the provided raw output
+        assert benchmark._process_raw_result(0, results)
+        assert benchmark.return_code == ReturnCode.SUCCESS
+
+        # Validate parsed results
+        assert benchmark.result['gpu_0_duration_us_25_samples'][0] == 10175
+        assert benchmark.result['gpu_0_duration_us_25_cpu_time'][0] == 42.123
+        assert benchmark.result['gpu_0_duration_us_25_cpu_noise'][0] == 69.78
+        assert benchmark.result['gpu_0_duration_us_25_gpu_time'][0] == 25.321
+        assert benchmark.result['gpu_0_duration_us_25_gpu_noise'][0] == 0.93
+        assert benchmark.result['gpu_0_duration_us_25_batch_samples'][0] == 17448
+        assert benchmark.result['gpu_0_duration_us_25_batch_gpu'][0] == 23.456
+
+        assert benchmark.result['gpu_0_duration_us_50_samples'][0] == 8187
+        assert benchmark.result['gpu_0_duration_us_75_samples'][0] == 6279
+
+    def test_nvbench_sleep_kernel_process_raw_result_invalid_output(self):
+        """Test NVBench Sleep Kernel benchmark result parsing with invalid output."""
+        benchmark_name = 'nvbench-sleep-kernel'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+
+        # Preprocess and validate command
+        assert benchmark._preprocess()
+
+        # Mock raw output with invalid format
+        raw_output = "Invalid output format"
+
+        # Parse the provided raw output
+        assert not benchmark._process_raw_result(0, raw_output)
+        assert benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/data/nvbench_sleep_kernel.log b/tests/data/nvbench_sleep_kernel.log
new file mode 100644
index 000000000..b2c4037da
--- /dev/null
+++ b/tests/data/nvbench_sleep_kernel.log
@@ -0,0 +1,43 @@
+# Devices
+
+## [0] `NVIDIA GPU`
+* SM Version: 900 (PTX Version: 900)
+* Number of SMs: 100
+* SM Default Clock Rate: 800 MHz
+* Global Memory: 100000 MiB Free / 100000 MiB Total
+* Global Memory Bus Peak: 3000 GB/sec (3000-bit DDR @4000MHz)
+* Max Shared Memory: 100 KiB/SM, 20 KiB/Block
+* L2 Cache Size: 1000 KiB
+* Maximum Active Blocks: 10/SM
+* Maximum Active Threads: 500/SM, 300/Block
+* Available Registers: 500/SM, 500/Block
+* ECC Enabled: Yes
+
+# Log
+
+```
+Run:  [1/3] sleep_benchmark [Device=0 Duration (us)=25]
+Warn: Current measurement timed out (1.00s) while over noise threshold (0.93% > 0.50%)
+Warn: Current measurement timed out (1.00s) before accumulating min_time (0.31s < 0.50s)
+Pass: Cold: 0.030374ms GPU, 0.047379ms CPU, 0.31s total GPU, 1.00s total wall, 10175x 
+Pass: Batch: 0.028658ms GPU, 0.50s total GPU, 0.50s total wall, 17448x
+Run:  [2/3] sleep_benchmark [Device=0 Duration (us)=50]
+Warn: Current measurement timed out (1.00s) before accumulating min_time (0.45s < 0.50s)
+Pass: Cold: 0.055036ms GPU, 0.072054ms CPU, 0.45s total GPU, 1.00s total wall, 8187x 
+Pass: Batch: 0.053246ms GPU, 0.50s total GPU, 0.50s total wall, 9403x
+Run:  [3/3] sleep_benchmark [Device=0 Duration (us)=75]
+Pass: Cold: 0.079643ms GPU, 0.096788ms CPU, 0.50s total GPU, 0.92s total wall, 6279x 
+Pass: Batch: 0.077862ms GPU, 0.51s total GPU, 0.51s total wall, 6547x
+```
+
+# Benchmark Results
+
+## sleep_benchmark
+
+### [0] NVIDIA GPU
+
+| Duration (us) | Samples | CPU Time  | Noise  | GPU Time  | Noise | Samples | Batch GPU |
+|---------------|---------|-----------|--------|-----------|-------|---------|-----------|
+|            25 |  10175x | 42.123 us | 69.78% | 25.321 us | 0.93% |  17448x | 23.456 us |
+|            50 |   8187x | 68.456 us |  2.34% | 50.654 us | 0.45% |   9403x | 49.321 us |
+|            75 |   6279x | 90.789 us |  1.85% | 75.987 us | 0.33% |   6547x | 77.862 us |
\ No newline at end of file
diff --git a/third_party/Makefile b/third_party/Makefile
index 465b5d6e4..5f16bbce2 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -305,12 +305,19 @@ nvbandwidth: sb_micro_path
 
 # Build nvbench
 nvbench: sb_micro_path
-	cd ./nvbench && mkdir -p build && cd build && \
-	cmake                                         \
-	-DCMAKE_INSTALL_PREFIX=$(SB_MICRO_PATH)       \
-	-DCMAKE_CUDA_ARCHITECTURES=100                \
-	-DNVBench_ENABLE_CUPTI=ON                     \
-	-DCMAKE_BUILD_TYPE=Release                    \
-	.. &&                                         \
-	make -j $(NUM_MAKE_JOBS) &&                   \
-	make install
+ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
+    $(eval ARCHS := "90;100")
+else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
+    $(eval ARCHS := "70;75;80;86;89;90")
+else
+    $(eval ARCHS := "70;75;80;86")
+endif
+    cd ./nvbench && mkdir -p build && cd build && \
+    cmake                                         \
+    -DCMAKE_INSTALL_PREFIX=$(SB_MICRO_PATH)       \
+    -DCMAKE_CUDA_ARCHITECTURES=$(ARCHS)           \
+    -DNVBench_ENABLE_CUPTI=ON                     \
+    -DCMAKE_BUILD_TYPE=Release                    \
+    .. &&                                         \
+    make -j $(NUM_MAKE_JOBS) &&                   \
+    make install

From bd87f50d96b9accd6ed1b5005d9f3ab8a6771d6e Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 8 Oct 2025 02:49:48 +0000
Subject: [PATCH 04/27] test sleep kernel

---
 .../micro_benchmarks/nvbench_sleep_kernel.py  |  3 +-
 third_party/Makefile                          | 42 +++++++++----------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 2171a8b6b..a1a1d55ad 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -4,6 +4,7 @@
 """Module of the NVBench Sleep Kernel benchmark."""
 
 import re
+import os
 
 from superbench.common.utils import logger
 from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
@@ -126,7 +127,7 @@ def _preprocess(self):
         if not super()._preprocess():
             return False
 
-        command = str(self._args.bin_dir / self._bin_name)
+        command = os.path.join(self._args.bin_dir, self._bin_name)
         parts = [command]
 
         # Basic configuration
diff --git a/third_party/Makefile b/third_party/Makefile
index c73ff7f73..df077eb45 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -192,7 +192,7 @@ endif
 cpu_hpl: sb_micro_path
 ifneq (,$(wildcard hpl-tests/Makefile))
 	cd ./hpl-tests && \
-    wget https://netlib.org/benchmark/hpl/hpl-2.3.tar.gz && \
+	wget https://netlib.org/benchmark/hpl/hpl-2.3.tar.gz && \
 	tar xzf hpl-2.3.tar.gz && \
 	cp Make.Linux_zen3 hpl-2.3 && \
 	cp Make.Linux_zen4 hpl-2.3 && \
@@ -208,7 +208,7 @@ endif
 cpu_stream: sb_micro_path
 ifneq (,$(wildcard stream-tests/Makefile))
 	cd ./stream-tests && \
-    wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
+	wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
 	make all
 	cp -v ./stream-tests/stream* $(SB_MICRO_PATH)/bin/
 endif
@@ -243,10 +243,10 @@ rocm_megatron_lm:
 	fi
 	cp Megatron/rocm/Megatron-LM/examples/deepseek_v2/pretrain_deepseek.py Megatron/rocm/Megatron-LM/
 	git clone https://github.com/caaatch22/grouped_gemm.git &&\
-    	cd grouped_gemm &&\
-    	git checkout 8a9b438 &&\
-    	git submodule update --init --recursive &&\
-    	pip install .
+		cd grouped_gemm &&\
+		git checkout 8a9b438 &&\
+		git submodule update --init --recursive &&\
+		pip install .
 
 # Instal apex of ROCm due to dependency of Megatron
 apex_rocm:
@@ -264,7 +264,7 @@ apex_rocm:
 	elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 2)" -eq 1 ] && [ "$$(expr $(TORCH_MINOR_VERSION) == 0)" -eq 1 ]; then \
 		git checkout release/1.0.0 ; \
 	elif [ "$$(expr $(TORCH_MAJOR_VERSION) == 1)" -eq 1 ]; then \
-	    git checkout release/1.0.0 ; \
+		git checkout release/1.0.0 ; \
 	fi
 	pip install -v --disable-pip-version-check --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./apex
 
@@ -276,11 +276,11 @@ ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
 	git clone --single-branch --branch main https://github.com/Azure/msccl.git \
 	&& git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
 else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
-    # Get commit 87048bd from msscl to support updated nccl and sm_100
+	# Get commit 87048bd from msscl to support updated nccl and sm_100
 	$(eval ARCHS := 75 80 86 89 90 100)
 	if [ -d msccl ]; then rm -rf msccl; fi; \
 	git clone --single-branch --branch main https://github.com/Azure/msccl.git \
-    && git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
+	&& git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
 else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
 	$(eval ARCHS := 70 75 80 86 89 90)
 else
@@ -316,18 +316,18 @@ nvbandwidth: sb_micro_path
 # Build nvbench
 nvbench: sb_micro_path
 ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
-    $(eval ARCHS := "90;100")
+	$(eval ARCHS := "90;100")
 else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
-    $(eval ARCHS := "70;75;80;86;89;90")
+	$(eval ARCHS := "70;75;80;86;89;90")
 else
-    $(eval ARCHS := "70;75;80;86")
+	$(eval ARCHS := "70;75;80;86")
 endif
-    cd ./nvbench && mkdir -p build && cd build && \
-    cmake                                         \
-    -DCMAKE_INSTALL_PREFIX=$(SB_MICRO_PATH)       \
-    -DCMAKE_CUDA_ARCHITECTURES=$(ARCHS)           \
-    -DNVBench_ENABLE_CUPTI=ON                     \
-    -DCMAKE_BUILD_TYPE=Release                    \
-    .. &&                                         \
-    make -j $(NUM_MAKE_JOBS) &&                   \
-    make install
+	cd ./nvbench && mkdir -p build && cd build && \
+	cmake										 \
+	-DCMAKE_INSTALL_PREFIX=$(SB_MICRO_PATH)	   \
+	-DCMAKE_CUDA_ARCHITECTURES=$(ARCHS)		   \
+	-DNVBench_ENABLE_CUPTI=ON					 \
+	-DCMAKE_BUILD_TYPE=Release					\
+	.. &&										 \
+	make -j $(NUM_MAKE_JOBS) &&				   \
+	make install

From a663db6edecf4340a5ba5d8fcbbff750e81dffe9 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 8 Oct 2025 02:52:39 +0000
Subject: [PATCH 05/27] add sm 103

---
 third_party/Makefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/third_party/Makefile b/third_party/Makefile
index df077eb45..508faa88d 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -315,7 +315,9 @@ nvbandwidth: sb_micro_path
 
 # Build nvbench
 nvbench: sb_micro_path
-ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
+ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
+	$(eval ARCHS := "100;103")
+else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
 	$(eval ARCHS := "90;100")
 else ifeq ($(shell echo $(CUDA_VER)">=11.8" | bc -l), 1)
 	$(eval ARCHS := "70;75;80;86;89;90")

From 32fe19702041794c675e2374f04752143edcc234 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Wed, 8 Oct 2025 23:54:24 +0000
Subject: [PATCH 06/27] add arg parsing logic

---
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 32 +++++++++++++++++--
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index a1a1d55ad..2f4fa4222 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -39,7 +39,8 @@ def add_parser_arguments(self):
             '--duration_us',
             type=str,
             default='[0,25,50,75,100]',
-            help='Duration axis values in microseconds, e.g., "[0,25,50,75,100]".',
+            help='Duration axis values in microseconds. Supports multiple formats: '
+                 '"50" (single value), "[25,50,75]" (list), "[0:10]" (range), "[0:50:10]" (range with step).',
         )
         self._parser.add_argument(
             '--skip_time',
@@ -118,6 +119,30 @@ def add_parser_arguments(self):
             help='Minimum R-squared for entropy stopping criterion.',
         )
 
+    def _parse_duration_format(self, duration_str):
+        """Parse duration parameter to proper axis format.
+        
+        Args:
+            duration_str (str): Duration specification as string
+            
+        Returns:
+            str: Properly formatted duration string for --axis parameter
+        """
+        if not isinstance(duration_str, str):
+            return str(duration_str)
+            
+        # String format - could be various formats
+        duration_str = duration_str.strip()
+        
+        # Remove outer quotes if present
+        if duration_str.startswith('"') and duration_str.endswith('"'):
+            duration_str = duration_str[1:-1]
+        elif duration_str.startswith("'") and duration_str.endswith("'"):
+            duration_str = duration_str[1:-1]
+        
+        # Return as-is - should already be in correct format
+        return duration_str
+
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
 
@@ -137,8 +162,9 @@ def _preprocess(self):
             else:
                 parts.extend(['--devices', self._args.devices])
 
-        # Duration axis
-        parts.extend(['--axis', f'"Duration (us)={self._args.duration_us}"'])
+        # Duration axis - parse the format properly
+        duration_formatted = self._parse_duration_format(self._args.duration_us)
+        parts.extend(['--axis', f'"Duration (us)={duration_formatted}"'])
 
         # Performance configuration
         if self._args.skip_time >= 0:

From 3eb55253ee2f4194242ab419ddb1336f216635b0 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 9 Oct 2025 00:02:01 +0000
Subject: [PATCH 07/27] add arg parsing tests

---
 .../test_nvbench_sleep_kernel.py              | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
index 2784a007c..83f4702d5 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -87,6 +87,78 @@ def test_nvbench_sleep_kernel_result_parsing_real_output(self, results):
         assert benchmark.result['gpu_0_duration_us_50_samples'][0] == 8187
         assert benchmark.result['gpu_0_duration_us_75_samples'][0] == 6279
 
+    def test_nvbench_sleep_kernel_parse_duration_formats(self):
+        """Test NVBench Sleep Kernel duration format parsing."""
+        benchmark_name = 'nvbench-sleep-kernel'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        benchmark = benchmark_class(benchmark_name, parameters='')
+
+        # Test single value formats
+        assert benchmark._parse_duration_format("50") == "50"
+        assert benchmark._parse_duration_format("100") == "100"
+        
+        # Test quoted single values
+        assert benchmark._parse_duration_format('"50"') == "50"
+        assert benchmark._parse_duration_format("'100'") == "100"
+        
+        # Test list formats
+        assert benchmark._parse_duration_format("[25,50,75]") == "[25,50,75]"
+        assert benchmark._parse_duration_format("[10,20,30,40]") == "[10,20,30,40]"
+        
+        # Test quoted list formats
+        assert benchmark._parse_duration_format('"[25,50,75]"') == "[25,50,75]"
+        assert benchmark._parse_duration_format("'[10,20,30]'") == "[10,20,30]"
+        
+        # Test range formats
+        assert benchmark._parse_duration_format("[25:75]") == "[25:75]"
+        assert benchmark._parse_duration_format("[0:100]") == "[0:100]"
+        
+        # Test range with step formats
+        assert benchmark._parse_duration_format("[0:50:10]") == "[0:50:10]"
+        assert benchmark._parse_duration_format("[10:100:20]") == "[10:100:20]"
+        
+        # Test quoted range formats
+        assert benchmark._parse_duration_format('"[25:75]"') == "[25:75]"
+        assert benchmark._parse_duration_format("'[0:50:10]'") == "[0:50:10]"
+        
+        # Test with whitespace
+        assert benchmark._parse_duration_format(" 50 ") == "50"
+        assert benchmark._parse_duration_format(" [25,50,75] ") == "[25,50,75]"
+        assert benchmark._parse_duration_format(" [25:75] ") == "[25:75]"
+
+    def test_nvbench_sleep_kernel_preprocess_duration_formats(self):
+        """Test NVBench Sleep Kernel preprocess with different duration formats."""
+        benchmark_name = 'nvbench-sleep-kernel'
+        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Test single value
+        benchmark = benchmark_class(benchmark_name, parameters='--duration_us "50"')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=50"' in benchmark._commands[0]
+        
+        # Test list format
+        benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[25,50,75]"')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=[25,50,75]"' in benchmark._commands[0]
+        
+        # Test range format
+        benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[25:75]"')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=[25:75]"' in benchmark._commands[0]
+        
+        # Test range with step format
+        benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[0:50:10]"')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=[0:50:10]"' in benchmark._commands[0]
+        
+        # Test default format
+        benchmark = benchmark_class(benchmark_name, parameters='')
+        assert benchmark._preprocess()
+        assert '--axis "Duration (us)=[0,25,50,75,100]"' in benchmark._commands[0]
+
     def test_nvbench_sleep_kernel_process_raw_result_invalid_output(self):
         """Test NVBench Sleep Kernel benchmark result parsing with invalid output."""
         benchmark_name = 'nvbench-sleep-kernel'

From 4785fe6b6c6c91bb38fef0973c8e0ea3544a5527 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 9 Oct 2025 17:49:29 +0000
Subject: [PATCH 08/27] refactor

---
 .../micro_benchmarks/nvbench_base.py          | 241 ++++++++++++++++++
 .../micro_benchmarks/nvbench_kernel_launch.py | 176 +++----------
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 179 ++-----------
 .../test_nvbench_kernel_launch.py             |   8 +-
 .../test_nvbench_sleep_kernel.py              |  14 +-
 5 files changed, 306 insertions(+), 312 deletions(-)
 create mode 100644 superbench/benchmarks/micro_benchmarks/nvbench_base.py

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
new file mode 100644
index 000000000..3dba511b4
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -0,0 +1,241 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Base class for NVBench benchmarks."""
+
+import os
+import re
+from superbench.common.utils import logger
+from superbench.benchmarks import ReturnCode
+from superbench.benchmarks.micro_benchmarks.micro_base import MicroBenchmarkWithInvoke
+
+
+def parse_time_to_us(raw: str) -> float:
+    """Helper: parse '123.45 us', '678.9 ns', '0.12 ms' → float µs."""
+    raw = raw.strip()
+    if raw.endswith('%'):
+        return float(raw[:-1])
+    # split "value unit" or "valueunit"
+    m = re.match(r'([\d.]+)\s*([mun]?s)?', raw)
+    if not m:
+        return float(raw)
+    val, unit = float(m.group(1)), (m.group(2) or 'us')
+    if unit == 'ns':
+        return val / 1e3
+    if unit == 'ms':
+        return val * 1e3
+    return val
+
+
+class NvbenchBase(MicroBenchmarkWithInvoke):
+    """Base class for NVBench benchmarks with common functionality."""
+
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        # Subclasses should set this
+        self._bin_name = None
+
+    def add_parser_arguments(self):
+        """Add common NVBench arguments."""
+        super().add_parser_arguments()
+
+        # Device configuration
+        self._parser.add_argument(
+            '--devices',
+            type=str,
+            default=None,
+            help='Device list to run the benchmark, e.g., "0,1,2,3" or "all".',
+        )
+
+        # Benchmark Properties
+        self._parser.add_argument(
+            '--skip-time',
+            type=float,
+            default=-1.0,
+            help='Skip time in seconds.',
+        )
+        self._parser.add_argument(
+            '--throttle-threshold',
+            type=float,
+            default=75.0,
+            help='Throttle threshold percentage.',
+        )
+        self._parser.add_argument(
+            '--throttle-recovery-delay',
+            type=float,
+            default=0.05,
+            help='Throttle recovery delay in seconds.',
+        )
+        self._parser.add_argument(
+            '--run-once',
+            action='store_true',
+            help='Run once flag.',
+        )
+        self._parser.add_argument(
+            '--disable-blocking-kernel',
+            action='store_true',
+            help='Disable blocking kernel flag.',
+        )
+        self._parser.add_argument(
+            '--profile',
+            action='store_true',
+            help='Enable profiling flag.',
+        )
+
+        # Stopping Criteria
+        self._parser.add_argument(
+            '--timeout',
+            type=int,
+            default=15,
+            help='Timeout in seconds.',
+        )
+        self._parser.add_argument(
+            '--min-samples',
+            type=int,
+            default=10,
+            help='Minimum number of samples.',
+        )
+        self._parser.add_argument(
+            '--stopping-criterion',
+            type=str,
+            default='stdrel',
+            choices=['stdrel', 'entropy'],
+            help='Stopping criterion.',
+        )
+        # stdrel-specific
+        self._parser.add_argument(
+            '--min-time',
+            type=float,
+            default=0.5,
+            help='Minimum time for stdrel stopping criterion.',
+        )
+        self._parser.add_argument(
+            '--max-noise',
+            type=float,
+            default=0.5,
+            help='Maximum noise for stdrel stopping criterion.',
+        )
+        # entropy-specific
+        self._parser.add_argument(
+            '--max-angle',
+            type=float,
+            default=0.048,
+            help='Maximum angle for entropy stopping criterion.',
+        )
+        self._parser.add_argument(
+            '--min-r2',
+            type=float,
+            default=0.36,
+            help='Minimum R-squared for entropy stopping criterion.',
+        )
+
+    def _build_base_command(self):
+        """Build the base nvbench command with common arguments.
+        
+        Returns:
+            list: Command parts that can be extended by subclasses.
+        """
+        if not self._bin_name:
+            raise ValueError("Subclass must set _bin_name")
+            
+        command = os.path.join(self._args.bin_dir, self._bin_name)
+        parts = [command]
+
+        # Device configuration
+        if self._args.devices is not None:
+            if self._args.devices == 'all':
+                parts.extend(['--devices', 'all'])
+            else:
+                parts.extend(['--devices', self._args.devices])
+
+        # Benchmark Properties
+        if hasattr(self._args, 'skip_time') and self._args.skip_time >= 0:
+            parts.extend(['--skip-time', str(self._args.skip_time)])
+        if hasattr(self._args, 'throttle_threshold') and self._args.throttle_threshold > 0:
+            parts.extend(['--throttle-threshold', str(self._args.throttle_threshold)])
+        if hasattr(self._args, 'throttle_recovery_delay') and self._args.throttle_recovery_delay > 0:
+            parts.extend(['--throttle-recovery-delay', str(self._args.throttle_recovery_delay)])
+        if hasattr(self._args, 'run_once') and self._args.run_once:
+            parts.append('--run-once')
+        if hasattr(self._args, 'disable_blocking_kernel') and self._args.disable_blocking_kernel:
+            parts.append('--disable-blocking-kernel')
+        if hasattr(self._args, 'profile') and self._args.profile:
+            parts.append('--profile')
+
+        # Stopping criteria
+        if hasattr(self._args, 'timeout') and self._args.timeout is not None:
+            parts.extend(['--timeout', str(self._args.timeout)])
+        if hasattr(self._args, 'min_samples') and self._args.min_samples is not None:
+            parts.extend(['--min-samples', str(self._args.min_samples)])
+        if hasattr(self._args, 'stopping_criterion') and self._args.stopping_criterion:
+            parts.extend(['--stopping-criterion', self._args.stopping_criterion])
+            if self._args.stopping_criterion == 'stdrel':
+                if hasattr(self._args, 'min_time') and self._args.min_time is not None:
+                    parts.extend(['--min-time', str(self._args.min_time)])
+                if hasattr(self._args, 'max_noise') and self._args.max_noise is not None:
+                    parts.extend(['--max-noise', str(self._args.max_noise)])
+            elif self._args.stopping_criterion == 'entropy':
+                if hasattr(self._args, 'max_angle') and self._args.max_angle is not None:
+                    parts.extend(['--max-angle', str(self._args.max_angle)])
+                if hasattr(self._args, 'min_r2') and self._args.min_r2 is not None:
+                    parts.extend(['--min-r2', str(self._args.min_r2)])
+
+        return parts
+
+    def _preprocess(self):
+        """Default preprocess implementation. Can be overridden by subclasses.
+        
+        Returns:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        # Build base command - subclasses can override this method to add specific arguments
+        parts = self._build_base_command()
+        
+        # Finalize command
+        self._commands = [' '.join(parts)]
+        return True
+
+    def _parse_time_value(self, time_str):
+        """Parse time string to microseconds.
+        
+        Args:
+            time_str (str): Time string like "123.45 us", "678.9 ns", etc.
+            
+        Returns:
+            float: Time in microseconds.
+        """
+        return parse_time_to_us(time_str)
+
+    def _parse_percentage(self, percent_str):
+        """Parse percentage string to float.
+        
+        Args:
+            percent_str (str): Percentage string like "12.34%"
+            
+        Returns:
+            float: Percentage value as float.
+        """
+        if isinstance(percent_str, str) and percent_str.endswith('%'):
+            return float(percent_str[:-1])
+        return float(percent_str)
+
+    def _handle_parsing_error(self, error_msg, raw_output):
+        """Handle parsing errors consistently.
+        
+        Args:
+            error_msg (str): Error message to log.
+            raw_output (str): Raw output that failed to parse.
+        """
+        self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+        logger.error(
+            f"Invalid result format - round:{self._curr_run_index}, bench:{self._name}, msg:{error_msg}\n{raw_output}"
+        )
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
index 74cf94383..3ff8fdd07 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -1,139 +1,39 @@
-import os
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Module of the NVBench Kernel Launch benchmark."""
+
 import re
 from superbench.common.utils import logger
 from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform
-from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase
 
-def parse_time_to_us(raw: str) -> float:
-    """Helper: parse '123.45 us', '678.9 ns', '0.12 ms' → float µs."""
-    raw = raw.strip()
-    if raw.endswith('%'):
-        return float(raw[:-1])
-    # split “value unit” or “valueunit”
-    m = re.match(r'([\d.]+)\s*([mun]?s)?', raw)
-    if not m:
-        return float(raw)
-    val, unit = float(m.group(1)), (m.group(2) or 'us')
-    if unit == 'ns':  return val / 1e3
-    if unit == 'ms':  return val * 1e3
-    return val
 
-class NvbenchKernelLaunch(MicroBenchmarkWithInvoke):
-    """Nvbench benchmark wrapper for SuperBench."""
-    def __init__(self, name, parameters=None):
-        """Initialize the benchmark."""
-        super().__init__(name, parameters)
-        self._bin_name = "nvbench_kernel_launch"
-    
-    def add_parser_arguments(self):
-        """
-        Add NVBench CLI options (excluding Queries, Device modification, Output, Benchmark / Axis Specification):
-          - Benchmark Properties (Omit --run-once, --disable-blocking-kernel, --profile)
-          - Stopping Criteria
-        """
-        super().add_parser_arguments()
+class NvbenchKernelLaunch(NvbenchBase):
+    """The NVBench Kernel Launch benchmark class."""
 
-        # Benchmark Properties
-        self._parser.add_argument(
-            '--devices', '--device', '-d', type=str, default=None,
-            help='Limit execution to one or more device IDs (comma-separated or "all").'
-        )
-        self._parser.add_argument(
-            '--skip-time', type=float, default=-1.0,
-            help='Skip a measurement when a warmup run executes in less than this (seconds).'
-        )
-        # With a threshold >0 and a recovery delay >0, NVBench will automatically pause 
-        # and “wait for the card to warm up” back to a stable high‐clock state, giving you 
-        # more consistent (and lower) timings that reflect the GPU’s true peak performance.
-        self._parser.add_argument(
-            '--throttle-threshold', type=float, default=75.0,
-            help="GPU throttle threshold as percent of default clock rate. Disabled when nvbench::exec_tag::sync is used."
-        )
-        self._parser.add_argument(
-            '--throttle-recovery-delay', type=float, default=0.05,
-            help='Seconds to wait after throttle before resuming. '
-            'Disabled when nvbench::exec_tag::sync is used.'
-        )
+    def __init__(self, name, parameters=''):
+        """Constructor.
 
-        # Stopping Criteria
-        self._parser.add_argument(
-            '--timeout', type=int, default=15,
-            help='Walltime timeout in seconds for each measurement.'
-        )
-        self._parser.add_argument(
-            '--min-samples', type=int, default=10,
-            help='Minimum number of samples per measurement before checking other criteria.'
-        )
-        self._parser.add_argument(
-            '--stopping-criterion', type=str, default='stdrel',
-            choices=['stdrel', 'entropy'],
-            help='Stopping criterion to use after --min-samples is satisfied: '
-            '"stdrel" or "entropy".'
-        )
-        # stdrel-specific
-        self._parser.add_argument(
-            '--min-time', type=float, default=0.5, 
-            help='(stdrel) Minimum execution time accumulated per measurement (seconds).'
-        )
-        self._parser.add_argument(
-            '--max-noise', type=float, default=0.5,
-            help='(stdrel) Maximum relative standard deviation (%) before stopping.'
-        )
-        # entropy-specific
-        self._parser.add_argument(
-            '--max-angle', type=float, default=0.048,
-            help='(entropy) Maximum linear regression angle of cumulative entropy.'
-        )
-        self._parser.add_argument(
-            '--min-r2', type=float, default=0.36,
-            help='(entropy) Minimum coefficient of determination (R²) for linear regression of cumulative entropy.'
-        )
-
-    def _preprocess(self):
-        """Preprocess/preparation operations before the benchmarking.
-        Return:
-            True if _preprocess() succeed.
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
         """
-        if not super()._preprocess():
-            return False
-
-        bin_path = os.path.join(self._args.bin_dir, self._bin_name)
-        parts = [bin_path]
+        super().__init__(name, parameters)
+        self._bin_name = 'nvbench_kernel_launch'
 
-        # Benchmark Properties (all optional)
-        if self._args.devices:
-            parts.extend(['--devices', self._args.devices])
-        if self._args.skip_time >= 0:
-            parts.extend(['--skip-time', str(self._args.skip_time)])
-        if self._args.throttle_threshold > 0:
-            parts.extend(['--throttle-threshold', str(self._args.throttle_threshold)])
-        if self._args.throttle_recovery_delay > 0:
-            parts.extend(['--throttle-recovery-delay', str(self._args.throttle_recovery_delay)])
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
 
-        # Stopping Criteria (all optional)
-        if self._args.timeout is not None:
-            parts.extend(['--timeout', str(self._args.timeout)])
-        if self._args.min_samples is not None:
-            parts.extend(['--min-samples', str(self._args.min_samples)])
-        if self._args.stopping_criterion:
-            parts.extend(['--stopping-criterion', self._args.stopping_criterion])
-            if self._args.stopping_criterion == 'stdrel':
-                if self._args.min_time is not None:
-                    parts.extend(['--min-time', str(self._args.min_time)])
-                if self._args.max_noise is not None:
-                    parts.extend(['--max-noise', str(self._args.max_noise)])
-            elif self._args.stopping_criterion == 'entropy':
-                if self._args.max_angle is not None:
-                    parts.extend(['--max-angle', str(self._args.max_angle)])
-                if self._args.min_r2 is not None:
-                    parts.extend(['--min-r2', str(self._args.min_r2)])
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
 
-        # finalize command
-        self._commands = [' '.join(parts)]
-        return True
-
-    def _process_raw_result(self, cmd_idx, raw_output):
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
         self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
+        
         try:
             gpu_section = r"### \[(\d+)\] NVIDIA"
             row_pat = (
@@ -142,32 +42,34 @@ def _process_raw_result(self, cmd_idx, raw_output):
             )
             current = None
             parsed_any = False  # Track if any valid rows are parsed
+            
             for line in raw_output.splitlines():
                 line = line.strip()
                 g = re.match(gpu_section, line)
                 if g:
                     current = f"gpu_{g.group(1)}"
                     continue
+                    
                 r = re.match(row_pat, line)
                 if r and current:
-                    self._result.add_result(f"{current}_samples", int(r.group(1)))
-                    self._result.add_result(f"{current}_cpu_time", parse_time_to_us(r.group(2)))
-                    self._result.add_result(f"{current}_cpu_noise", float(r.group(3)[:-1]))
-                    self._result.add_result(f"{current}_gpu_time", parse_time_to_us(r.group(4)))
-                    self._result.add_result(f"{current}_gpu_noise", float(r.group(5)[:-1]))
-                    self._result.add_result(f"{current}_batch_samples", int(r.group(6)))
-                    self._result.add_result(f"{current}_batch_gpu_time", parse_time_to_us(r.group(7)))
+                    # self._result.add_result(f"{current}_samples", int(r.group(1)))
+                    self._result.add_result(f"{current}_cpu_time", self._parse_time_value(r.group(2)))
+                    # self._result.add_result(f"{current}_cpu_noise", float(r.group(3)[:-1]))
+                    self._result.add_result(f"{current}_gpu_time", self._parse_time_value(r.group(4)))
+                    # self._result.add_result(f"{current}_gpu_noise", float(r.group(5)[:-1]))
+                    # self._result.add_result(f"{current}_batch_samples", int(r.group(6)))
+                    self._result.add_result(f"{current}_batch_gpu_time", self._parse_time_value(r.group(7)))
                     parsed_any = True
+                    
             if not parsed_any:
                 logger.error("No valid rows parsed from the raw output.")
                 raise RuntimeError("No valid rows parsed")
+                
         except Exception as e:
-            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
-            logger.error(
-                f"Invalid result format - round:{self._curr_run_index}, bench:{self._name}, msg:{e}\n{raw_output}"
-            )
+            self._handle_parsing_error(str(e), raw_output)
             return False
+            
         return True
 
-# Register the benchmark
-BenchmarkRegistry.register_benchmark("nvbench-kernel-launch", NvbenchKernelLaunch, platform=Platform.CUDA)
\ No newline at end of file
+
+BenchmarkRegistry.register_benchmark('nvbench-kernel-launch', NvbenchKernelLaunch, platform=Platform.CUDA)
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 2f4fa4222..bc154d8b1 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -8,10 +8,10 @@
 
 from superbench.common.utils import logger
 from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
-from superbench.benchmarks.micro_benchmarks.micro_base import MicroBenchmarkWithInvoke
+from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase
 
 
-class NvbenchSleepKernel(MicroBenchmarkWithInvoke):
+class NvbenchSleepKernel(NvbenchBase):
     """The NVBench Sleep Kernel benchmark class."""
 
     def __init__(self, name, parameters=''):
@@ -26,97 +26,16 @@ def __init__(self, name, parameters=''):
         self._bin_name = 'nvbench_sleep_kernel'
 
     def add_parser_arguments(self):
-        """Add the specified arguments."""
+        """Add sleep-kernel specific arguments."""
         super().add_parser_arguments()
 
-        self._parser.add_argument(
-            '--devices',
-            type=str,
-            default=None,
-            help='Device list to run the benchmark, e.g., "0,1,2,3" or "all".',
-        )
+        # Sleep-kernel specific argument
         self._parser.add_argument(
             '--duration_us',
             type=str,
             default='[0,25,50,75,100]',
             help='Duration axis values in microseconds. Supports multiple formats: '
-                 '"50" (single value), "[25,50,75]" (list), "[0:10]" (range), "[0:50:10]" (range with step).',
-        )
-        self._parser.add_argument(
-            '--skip_time',
-            type=float,
-            default=-1.0,
-            help='Skip time in seconds.',
-        )
-        self._parser.add_argument(
-            '--throttle_threshold',
-            type=float,
-            default=75.0,
-            help='Throttle threshold percentage.',
-        )
-        self._parser.add_argument(
-            '--throttle_recovery_delay',
-            type=float,
-            default=0.05,
-            help='Throttle recovery delay in seconds.',
-        )
-        self._parser.add_argument(
-            '--run_once',
-            action='store_true',
-            help='Run once flag.',
-        )
-        self._parser.add_argument(
-            '--disable_blocking_kernel',
-            action='store_true',
-            help='Disable blocking kernel flag.',
-        )
-        self._parser.add_argument(
-            '--profile',
-            action='store_true',
-            help='Enable profiling flag.',
-        )
-        self._parser.add_argument(
-            '--timeout',
-            type=int,
-            default=15,
-            help='Timeout in seconds.',
-        )
-        self._parser.add_argument(
-            '--min_samples',
-            type=int,
-            default=None,
-            help='Minimum number of samples.',
-        )
-        self._parser.add_argument(
-            '--stopping_criterion',
-            type=str,
-            default='stdrel',
-            choices=['stdrel', 'entropy'],
-            help='Stopping criterion.',
-        )
-        self._parser.add_argument(
-            '--min_time',
-            type=float,
-            default=None,
-            help='Minimum time for stdrel stopping criterion.',
-        )
-        self._parser.add_argument(
-            '--max_noise',
-            type=float,
-            default=None,
-            help='Maximum noise for stdrel stopping criterion.',
-        )
-        self._parser.add_argument(
-            '--max_angle',
-            type=float,
-            default=None,
-            help='Maximum angle for entropy stopping criterion.',
-        )
-        self._parser.add_argument(
-            '--min_r2',
-            type=float,
-            default=None,
-            help='Minimum R-squared for entropy stopping criterion.',
+                 '"50" (single value), "[25,50,75]" (list), "[25:75]" (range), "[0:50:10]" (range with step).',
         )
 
     def _parse_duration_format(self, duration_str):
@@ -152,51 +71,14 @@ def _preprocess(self):
         if not super()._preprocess():
             return False
 
-        command = os.path.join(self._args.bin_dir, self._bin_name)
-        parts = [command]
-
-        # Basic configuration
-        if self._args.devices is not None:
-            if self._args.devices == 'all':
-                parts.extend(['--devices', 'all'])
-            else:
-                parts.extend(['--devices', self._args.devices])
-
-        # Duration axis - parse the format properly
+        # Build base command with common nvbench arguments
+        parts = self._build_base_command()
+        
+        # Add sleep-kernel specific arguments
         duration_formatted = self._parse_duration_format(self._args.duration_us)
         parts.extend(['--axis', f'"Duration (us)={duration_formatted}"'])
 
-        # Performance configuration
-        if self._args.skip_time >= 0:
-            parts.extend(['--skip-time', str(self._args.skip_time)])
-        parts.extend(['--throttle-threshold', str(self._args.throttle_threshold)])
-        parts.extend(['--throttle-recovery-delay', str(self._args.throttle_recovery_delay)])
-        if self._args.run_once:
-            parts.append('--run-once')
-        if self._args.disable_blocking_kernel:
-            parts.append('--disable-blocking-kernel')
-        if self._args.profile:
-            parts.append('--profile')
-
-        # Stopping criteria
-        if self._args.timeout is not None:
-            parts.extend(['--timeout', str(self._args.timeout)])
-        if self._args.min_samples is not None:
-            parts.extend(['--min-samples', str(self._args.min_samples)])
-        if self._args.stopping_criterion:
-            parts.extend(['--stopping-criterion', self._args.stopping_criterion])
-            if self._args.stopping_criterion == 'stdrel':
-                if self._args.min_time is not None:
-                    parts.extend(['--min-time', str(self._args.min_time)])
-                if self._args.max_noise is not None:
-                    parts.extend(['--max-noise', str(self._args.max_noise)])
-            elif self._args.stopping_criterion == 'entropy':
-                if self._args.max_angle is not None:
-                    parts.extend(['--max-angle', str(self._args.max_angle)])
-                if self._args.min_r2 is not None:
-                    parts.extend(['--min-r2', str(self._args.min_r2)])
-
-        # finalize command
+        # Finalize command
         self._commands = [' '.join(parts)]
         return True
 
@@ -236,13 +118,13 @@ def _process_raw_result(self, cmd_idx, raw_output):
                 if r and current:
                     logger.debug(f"Matched row: {r.groups()}")
                     duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_samples', int(samples))
+                    # self._result.add_result(f'{current}_duration_us_{duration_us}_samples', int(samples))
                     self._result.add_result(f'{current}_duration_us_{duration_us}_cpu_time', self._parse_time_value(cpu_time))
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
+                    # self._result.add_result(f'{current}_duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
                     self._result.add_result(f'{current}_duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_batch_samples', int(batch_samples))
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_batch_gpu', self._parse_time_value(batch_gpu))
+                    # self._result.add_result(f'{current}_duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
+                    # self._result.add_result(f'{current}_duration_us_{duration_us}_batch_samples', int(batch_samples))
+                    self._result.add_result(f'{current}_duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu))
                     parsed_any = True
             if not parsed_any:
                 raise RuntimeError("No valid rows parsed")
@@ -252,38 +134,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
             return False
         return True
 
-    def _parse_time_value(self, time_str):
-        """Parse time string to microseconds.
-
-        Args:
-            time_str (str): Time string like "25.321 us", "1.234 ms", etc.
-
-        Returns:
-            float: Time value in microseconds.
-        """
-        time_str = time_str.strip()
-        if time_str.endswith('us'):
-            return float(time_str[:-2].strip())
-        elif time_str.endswith('ms'):
-            return float(time_str[:-2].strip()) * 1000
-        elif time_str.endswith('ns'):
-            return float(time_str[:-2].strip()) / 1000
-        elif time_str.endswith('s'):
-            return float(time_str[:-1].strip()) * 1000000
-        else:
-            # Assume microseconds if no unit
-            return float(time_str)
 
-    def _parse_percentage(self, percent_str):
-        """Parse percentage string to float.
-
-        Args:
-            percent_str (str): Percentage string like "0.93%".
-
-        Returns:
-            float: Percentage value as float.
-        """
-        return float(percent_str[:-1].strip())
 
 
 BenchmarkRegistry.register_benchmark('nvbench-sleep-kernel', NvbenchSleepKernel, platform=Platform.CUDA)
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
index 7c131a6c8..63f2738d0 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
@@ -74,12 +74,12 @@ def test_nvbench_kernel_launch_result_parsing_real_output(self, results):
         assert benchmark.return_code == ReturnCode.SUCCESS
 
         # Validate parsed results
-        assert benchmark.result['gpu_0_samples'][0] == 120000
+        # assert benchmark.result['gpu_0_samples'][0] == 120000
         assert benchmark.result['gpu_0_cpu_time'][0] == 24.222
-        assert benchmark.result['gpu_0_cpu_noise'][0] == 30.44
+        # assert benchmark.result['gpu_0_cpu_noise'][0] == 30.44
         assert benchmark.result['gpu_0_gpu_time'][0] == 7.808
-        assert benchmark.result['gpu_0_gpu_noise'][0] == 14.42
-        assert benchmark.result['gpu_0_batch_samples'][0] == 300000
+        # assert benchmark.result['gpu_0_gpu_noise'][0] == 14.42
+        # assert benchmark.result['gpu_0_batch_samples'][0] == 300000
         assert benchmark.result['gpu_0_batch_gpu_time'][0] == 6.024
 
     def test_nvbench_kernel_launch_process_raw_result_invalid_output(self):
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
index 83f4702d5..4eeb366b3 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -76,16 +76,16 @@ def test_nvbench_sleep_kernel_result_parsing_real_output(self, results):
         assert benchmark.return_code == ReturnCode.SUCCESS
 
         # Validate parsed results
-        assert benchmark.result['gpu_0_duration_us_25_samples'][0] == 10175
+        # assert benchmark.result['gpu_0_duration_us_25_samples'][0] == 10175
         assert benchmark.result['gpu_0_duration_us_25_cpu_time'][0] == 42.123
-        assert benchmark.result['gpu_0_duration_us_25_cpu_noise'][0] == 69.78
+        # assert benchmark.result['gpu_0_duration_us_25_cpu_noise'][0] == 69.78
         assert benchmark.result['gpu_0_duration_us_25_gpu_time'][0] == 25.321
-        assert benchmark.result['gpu_0_duration_us_25_gpu_noise'][0] == 0.93
-        assert benchmark.result['gpu_0_duration_us_25_batch_samples'][0] == 17448
-        assert benchmark.result['gpu_0_duration_us_25_batch_gpu'][0] == 23.456
+        # assert benchmark.result['gpu_0_duration_us_25_gpu_noise'][0] == 0.93
+        # assert benchmark.result['gpu_0_duration_us_25_batch_samples'][0] == 17448
+        assert benchmark.result['gpu_0_duration_us_25_batch_gpu_time'][0] == 23.456
 
-        assert benchmark.result['gpu_0_duration_us_50_samples'][0] == 8187
-        assert benchmark.result['gpu_0_duration_us_75_samples'][0] == 6279
+        # assert benchmark.result['gpu_0_duration_us_50_samples'][0] == 8187
+        # assert benchmark.result['gpu_0_duration_us_75_samples'][0] == 6279
 
     def test_nvbench_sleep_kernel_parse_duration_formats(self):
         """Test NVBench Sleep Kernel duration format parsing."""

From 1fb7c05c6744d428c998e095deaccc03907f5903 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 9 Oct 2025 21:25:21 +0000
Subject: [PATCH 09/27] refine logic - remove gpu_id

---
 .../micro_benchmarks/nvbench_base.py          |  5 +-
 .../micro_benchmarks/nvbench_kernel_launch.py | 15 ++--
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 46 +++---------
 .../test_nvbench_kernel_launch.py             | 16 ++--
 .../test_nvbench_sleep_kernel.py              | 73 ++++---------------
 5 files changed, 43 insertions(+), 112 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
index 3dba511b4..916408304 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_base.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -147,8 +147,9 @@ def _build_base_command(self):
         command = os.path.join(self._args.bin_dir, self._bin_name)
         parts = [command]
 
-        # Device configuration
-        if self._args.devices is not None:
+        # Device configuration - in distributed mode, let SuperBench handle device assignment
+        # Only add --devices if explicitly specified
+        if hasattr(self._args, 'devices') and self._args.devices is not None:
             if self._args.devices == 'all':
                 parts.extend(['--devices', 'all'])
             else:
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
index 3ff8fdd07..1544586fa 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -52,13 +52,13 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     
                 r = re.match(row_pat, line)
                 if r and current:
-                    # self._result.add_result(f"{current}_samples", int(r.group(1)))
-                    self._result.add_result(f"{current}_cpu_time", self._parse_time_value(r.group(2)))
-                    # self._result.add_result(f"{current}_cpu_noise", float(r.group(3)[:-1]))
-                    self._result.add_result(f"{current}_gpu_time", self._parse_time_value(r.group(4)))
-                    # self._result.add_result(f"{current}_gpu_noise", float(r.group(5)[:-1]))
-                    # self._result.add_result(f"{current}_batch_samples", int(r.group(6)))
-                    self._result.add_result(f"{current}_batch_gpu_time", self._parse_time_value(r.group(7)))
+                    # self._result.add_result("samples", int(r.group(1)))
+                    self._result.add_result("cpu_time", self._parse_time_value(r.group(2)))
+                    # self._result.add_result("cpu_noise", self._parse_percentage(r.group(3)))
+                    self._result.add_result("gpu_time", self._parse_time_value(r.group(4)))
+                    # self._result.add_result("gpu_noise", self._parse_percentage(r.group(5)))
+                    # self._result.add_result("batch_samples", int(r.group(6)))
+                    self._result.add_result("batch_gpu_time", self._parse_time_value(r.group(7)))
                     parsed_any = True
                     
             if not parsed_any:
@@ -71,5 +71,4 @@ def _process_raw_result(self, cmd_idx, raw_output):
             
         return True
 
-
 BenchmarkRegistry.register_benchmark('nvbench-kernel-launch', NvbenchKernelLaunch, platform=Platform.CUDA)
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index bc154d8b1..75b54ab66 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -38,30 +38,6 @@ def add_parser_arguments(self):
                  '"50" (single value), "[25,50,75]" (list), "[25:75]" (range), "[0:50:10]" (range with step).',
         )
 
-    def _parse_duration_format(self, duration_str):
-        """Parse duration parameter to proper axis format.
-        
-        Args:
-            duration_str (str): Duration specification as string
-            
-        Returns:
-            str: Properly formatted duration string for --axis parameter
-        """
-        if not isinstance(duration_str, str):
-            return str(duration_str)
-            
-        # String format - could be various formats
-        duration_str = duration_str.strip()
-        
-        # Remove outer quotes if present
-        if duration_str.startswith('"') and duration_str.endswith('"'):
-            duration_str = duration_str[1:-1]
-        elif duration_str.startswith("'") and duration_str.endswith("'"):
-            duration_str = duration_str[1:-1]
-        
-        # Return as-is - should already be in correct format
-        return duration_str
-
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
 
@@ -75,8 +51,7 @@ def _preprocess(self):
         parts = self._build_base_command()
         
         # Add sleep-kernel specific arguments
-        duration_formatted = self._parse_duration_format(self._args.duration_us)
-        parts.extend(['--axis', f'"Duration (us)={duration_formatted}"'])
+        parts.extend(['--axis', f'"Duration (us)={self._args.duration_us.strip()}"'])
 
         # Finalize command
         self._commands = [' '.join(parts)]
@@ -118,13 +93,13 @@ def _process_raw_result(self, cmd_idx, raw_output):
                 if r and current:
                     logger.debug(f"Matched row: {r.groups()}")
                     duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
-                    # self._result.add_result(f'{current}_duration_us_{duration_us}_samples', int(samples))
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_cpu_time', self._parse_time_value(cpu_time))
-                    # self._result.add_result(f'{current}_duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
-                    # self._result.add_result(f'{current}_duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
-                    # self._result.add_result(f'{current}_duration_us_{duration_us}_batch_samples', int(batch_samples))
-                    self._result.add_result(f'{current}_duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu))
+                    # self._result.add_result(f'duration_us_{duration_us}_samples', int(samples))
+                    self._result.add_result(f'duration_us_{duration_us}_cpu_time', self._parse_time_value(cpu_time))
+                    # self._result.add_result(f'duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
+                    self._result.add_result(f'duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
+                    # self._result.add_result(f'duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
+                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples', int(batch_samples))
+                    self._result.add_result(f'duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu))
                     parsed_any = True
             if not parsed_any:
                 raise RuntimeError("No valid rows parsed")
@@ -134,7 +109,4 @@ def _process_raw_result(self, cmd_idx, raw_output):
             return False
         return True
 
-
-
-
-BenchmarkRegistry.register_benchmark('nvbench-sleep-kernel', NvbenchSleepKernel, platform=Platform.CUDA)
+BenchmarkRegistry.register_benchmark('nvbench-sleep-kernel', NvbenchSleepKernel, platform=Platform.CUDA)
\ No newline at end of file
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
index 63f2738d0..2923c7d3b 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
@@ -33,7 +33,7 @@ def test_nvbench_kernel_launch_preprocess(self):
 
         # Test preprocess with specified parameters
         parameters = (
-            '--device 0 '
+            '--devices 0 '
             '--timeout 20 '
             '--min-samples 300 '
             '--stopping-criterion stdrel '
@@ -74,13 +74,13 @@ def test_nvbench_kernel_launch_result_parsing_real_output(self, results):
         assert benchmark.return_code == ReturnCode.SUCCESS
 
         # Validate parsed results
-        # assert benchmark.result['gpu_0_samples'][0] == 120000
-        assert benchmark.result['gpu_0_cpu_time'][0] == 24.222
-        # assert benchmark.result['gpu_0_cpu_noise'][0] == 30.44
-        assert benchmark.result['gpu_0_gpu_time'][0] == 7.808
-        # assert benchmark.result['gpu_0_gpu_noise'][0] == 14.42
-        # assert benchmark.result['gpu_0_batch_samples'][0] == 300000
-        assert benchmark.result['gpu_0_batch_gpu_time'][0] == 6.024
+        # assert benchmark.result['samples'][0] == 120000
+        assert benchmark.result['cpu_time'][0] == 24.222
+        # assert benchmark.result['cpu_noise'][0] == 30.44
+        assert benchmark.result['gpu_time'][0] == 7.808
+        # assert benchmark.result['gpu_noise'][0] == 14.42
+        # assert benchmark.result['batch_samples'][0] == 300000
+        assert benchmark.result['batch_gpu_time'][0] == 6.024
 
     def test_nvbench_kernel_launch_process_raw_result_invalid_output(self):
         """Test NVBench Kernel Launch benchmark result parsing with invalid output."""
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
index 4eeb366b3..6d020814a 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -36,12 +36,12 @@ def test_nvbench_sleep_kernel_preprocess(self):
             '--devices 0 '
             '--duration_us "[10,25,50,75]" '
             '--timeout 20 '
-            '--min_samples 300 '
-            '--stopping_criterion stdrel '
-            '--min_time 2.0 '
-            '--max_noise 0.5 '
-            '--throttle_threshold 80.0 '
-            '--throttle_recovery_delay 1.0'
+            '--min-samples 300 '
+            '--stopping-criterion stdrel '
+            '--min-time 2.0 '
+            '--max-noise 0.5 '
+            '--throttle-threshold 80.0 '
+            '--throttle-recovery-delay 1.0'
         )
         benchmark = benchmark_class(benchmark_name, parameters=parameters)
         assert benchmark._preprocess()
@@ -76,57 +76,16 @@ def test_nvbench_sleep_kernel_result_parsing_real_output(self, results):
         assert benchmark.return_code == ReturnCode.SUCCESS
 
         # Validate parsed results
-        # assert benchmark.result['gpu_0_duration_us_25_samples'][0] == 10175
-        assert benchmark.result['gpu_0_duration_us_25_cpu_time'][0] == 42.123
-        # assert benchmark.result['gpu_0_duration_us_25_cpu_noise'][0] == 69.78
-        assert benchmark.result['gpu_0_duration_us_25_gpu_time'][0] == 25.321
-        # assert benchmark.result['gpu_0_duration_us_25_gpu_noise'][0] == 0.93
-        # assert benchmark.result['gpu_0_duration_us_25_batch_samples'][0] == 17448
-        assert benchmark.result['gpu_0_duration_us_25_batch_gpu_time'][0] == 23.456
-
-        # assert benchmark.result['gpu_0_duration_us_50_samples'][0] == 8187
-        # assert benchmark.result['gpu_0_duration_us_75_samples'][0] == 6279
-
-    def test_nvbench_sleep_kernel_parse_duration_formats(self):
-        """Test NVBench Sleep Kernel duration format parsing."""
-        benchmark_name = 'nvbench-sleep-kernel'
-        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
-        assert (benchmark_class)
-
-        benchmark = benchmark_class(benchmark_name, parameters='')
-
-        # Test single value formats
-        assert benchmark._parse_duration_format("50") == "50"
-        assert benchmark._parse_duration_format("100") == "100"
-        
-        # Test quoted single values
-        assert benchmark._parse_duration_format('"50"') == "50"
-        assert benchmark._parse_duration_format("'100'") == "100"
-        
-        # Test list formats
-        assert benchmark._parse_duration_format("[25,50,75]") == "[25,50,75]"
-        assert benchmark._parse_duration_format("[10,20,30,40]") == "[10,20,30,40]"
-        
-        # Test quoted list formats
-        assert benchmark._parse_duration_format('"[25,50,75]"') == "[25,50,75]"
-        assert benchmark._parse_duration_format("'[10,20,30]'") == "[10,20,30]"
-        
-        # Test range formats
-        assert benchmark._parse_duration_format("[25:75]") == "[25:75]"
-        assert benchmark._parse_duration_format("[0:100]") == "[0:100]"
-        
-        # Test range with step formats
-        assert benchmark._parse_duration_format("[0:50:10]") == "[0:50:10]"
-        assert benchmark._parse_duration_format("[10:100:20]") == "[10:100:20]"
-        
-        # Test quoted range formats
-        assert benchmark._parse_duration_format('"[25:75]"') == "[25:75]"
-        assert benchmark._parse_duration_format("'[0:50:10]'") == "[0:50:10]"
-        
-        # Test with whitespace
-        assert benchmark._parse_duration_format(" 50 ") == "50"
-        assert benchmark._parse_duration_format(" [25,50,75] ") == "[25,50,75]"
-        assert benchmark._parse_duration_format(" [25:75] ") == "[25:75]"
+        # assert benchmark.result['duration_us_25_samples'][0] == 10175
+        assert benchmark.result['duration_us_25_cpu_time'][0] == 42.123
+        # assert benchmark.result['duration_us_25_cpu_noise'][0] == 69.78
+        assert benchmark.result['duration_us_25_gpu_time'][0] == 25.321
+        # assert benchmark.result['duration_us_25_gpu_noise'][0] == 0.93
+        # assert benchmark.result['duration_us_25_batch_samples'][0] == 17448
+        assert benchmark.result['duration_us_25_batch_gpu_time'][0] == 23.456
+
+        # assert benchmark.result['duration_us_50_samples'][0] == 8187
+        # assert benchmark.result['duration_us_75_samples'][0] == 6279
 
     def test_nvbench_sleep_kernel_preprocess_duration_formats(self):
         """Test NVBench Sleep Kernel preprocess with different duration formats."""

From 83c442c7f77151558cf54198a93ee40369cee283 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 9 Oct 2025 21:32:57 +0000
Subject: [PATCH 10/27] add doc

---
 .../benchmarks/micro-benchmarks.md            | 45 +++++++++++++++++++
 third_party/Makefile                          |  3 +-
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index aa3aa965b..a5bc2fa5c 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -172,6 +172,51 @@ Supports the use of double unit types and the use of tensor cores.
 | gpu-burn/gpu_[0-9]_pass | yes/no   | The result of the gpu-burn test for each GPU (1: yes, 0: no).                      |
 | gpu-burn/abort          | yes/no   | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |
 
+### `nvbench-sleep-kernel`
+
+#### Introduction
+
+Measure GPU kernel execution time using NVBench's sleep kernel benchmark. This benchmark creates CUDA kernels that sleep for specified durations (in microseconds) and measures the actual execution time, providing insights into GPU scheduling overhead and timing accuracy.
+
+The benchmark supports multiple duration specification formats:
+- Single value: `"50"` - Test a single duration of 50μs
+- List format: `"[25,50,75]"` - Test multiple specific durations
+- Range format: `"[25:75]"` - Test all values from 25μs to 75μs
+- Range with step: `"[0:50:10]"` - Test from 0μs to 50μs in steps of 10μs
+
+Performed by [NVBench](https://github.com/NVIDIA/nvbench) sleep kernel benchmark.
+
+#### Metrics
+
+| Name                                                | Unit      | Description                                                      |
+|-----------------------------------------------------|-----------|------------------------------------------------------------------|
+| nvbench-sleep-kernel/duration_us_{X}_cpu_time       | time (μs) | CPU-measured time for a sleep duration of X microseconds.        |
+| nvbench-sleep-kernel/duration_us_{X}_gpu_time       | time (μs) | GPU-measured time for a sleep duration of X microseconds.        |
+| nvbench-sleep-kernel/duration_us_{X}_batch_gpu_time | time (μs) | GPU batch execution time for a sleep duration of X microseconds. |
+
+Where `{X}` is the sleep duration in microseconds (e.g., 25, 50, 75).
+
+### `nvbench-kernel-launch`
+
+#### Introduction
+
+Measure GPU kernel launch overhead and execution time using NVBench's kernel launch benchmark. This benchmark evaluates the time required to launch kernels on the GPU and measures both CPU-side and GPU-side timing for kernel execution.
+
+The benchmark provides insights into:
+- Kernel launch latency
+- CPU/GPU synchronization overhead
+- Batch execution performance
+
+Performed by [NVBench](https://github.com/NVIDIA/nvbench) kernel launch benchmark.
+
+#### Metrics
+
+| Name                                 | Unit      | Description                         |
+|--------------------------------------|-----------|-------------------------------------|
+| nvbench-kernel-launch/cpu_time       | time (μs) | CPU-measured kernel execution time. |
+| nvbench-kernel-launch/gpu_time       | time (μs) | GPU-measured kernel execution time. |
+| nvbench-kernel-launch/batch_gpu_time | time (μs) | GPU batch execution time.           |
+
 ### `cpu-hpl`
 
 #### Introduction
diff --git a/third_party/Makefile b/third_party/Makefile
index 508faa88d..2300cd2ac 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -22,11 +22,10 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
 all: cuda rocm
 # msccl: api change in cudaStreamUpdateCaptureDependencies
 cuda_with_msccl: cuda cuda_msccl
-cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
+cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth nvbench
 rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
 cpu: common cpu_perftest
 common: fio cpu_stream
-nvbench_only: nvbench
 
 # non aarch64 specific targets
 ifneq ($(shell uname -m), aarch64)

From 4b274c417c5b00c7c384c1ca9e8d659c94e0e233 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Thu, 9 Oct 2025 23:02:02 +0000
Subject: [PATCH 11/27] refine regex & update nvbench submodule

---
 .../micro_benchmarks/nvbench_kernel_launch.py | 25 ++++++++++++-------
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 13 +++++++---
 third_party/nvbench                           |  2 +-
 3 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
index 1544586fa..1a424589f 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -36,9 +36,15 @@ def _process_raw_result(self, cmd_idx, raw_output):
         
         try:
             gpu_section = r"### \[(\d+)\] NVIDIA"
+            # Regex pattern to handle different time units and flexible spacing
             row_pat = (
-                r"\| (\d+)x \| ([\d.]+ ?[mun]?s) \| ([\d.]+%) \| "
-                r"([\d.]+ ?[mun]?s) \| ([\d.]+%) \| (\d+)x \| *([\d.]+ ?[mun]?s) \|"
+                r"\|\s*([0-9]+)x\s*\|\s*"                            # Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # CPU Time (μs, ns, ms, us, s)
+                r"([\d.]+%)\s*\|\s*"                                 # CPU Noise percentage
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # GPU Time
+                r"([\d.]+%)\s*\|\s*"                                 # GPU Noise percentage
+                r"([0-9]+)x\s*\|\s*"                                 # Batch Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|"                          # Batch GPU Time
             )
             current = None
             parsed_any = False  # Track if any valid rows are parsed
@@ -52,13 +58,14 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     
                 r = re.match(row_pat, line)
                 if r and current:
-                    # self._result.add_result("samples", int(r.group(1)))
-                    self._result.add_result("cpu_time", self._parse_time_value(r.group(2)))
-                    # self._result.add_result("cpu_noise", self._parse_percentage(r.group(3)))
-                    self._result.add_result("gpu_time", self._parse_time_value(r.group(4)))
-                    # self._result.add_result("gpu_noise", self._parse_percentage(r.group(5)))
-                    # self._result.add_result("batch_samples", int(r.group(6)))
-                    self._result.add_result("batch_gpu_time", self._parse_time_value(r.group(7)))
+                    samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
+                    # self._result.add_result("samples", int(samples.replace('x', '')))
+                    self._result.add_result("cpu_time", self._parse_time_value(cpu_time))
+                    # self._result.add_result("cpu_noise", self._parse_percentage(cpu_noise))
+                    self._result.add_result("gpu_time", self._parse_time_value(gpu_time))
+                    # self._result.add_result("gpu_noise", self._parse_percentage(gpu_noise))
+                    # self._result.add_result("batch_samples", int(batch_samples.replace('x', '')))
+                    self._result.add_result("batch_gpu_time", self._parse_time_value(batch_gpu))
                     parsed_any = True
                     
             if not parsed_any:
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 75b54ab66..03801a666 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -75,9 +75,16 @@ def _process_raw_result(self, cmd_idx, raw_output):
         self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
         try:
             gpu_section = r"### \[(\d+)\] NVIDIA"
+            # Regex pattern to handle different time units and flexible spacing
             row_pat = (
-                r"\|\s*(\d+)\s*\|\s*(\d+)x\s*\|\s*([\d.]+ ?[mun]?s)\s*\|\s*([\d.]+%)\s*\|\s*"
-                r"([\d.]+ ?[mun]?s)\s*\|\s*([\d.]+%)\s*\|\s*(\d+)x\s*\|\s*([\d.]+ ?[mun]?s)\s*\|"
+                r"\|\s*([0-9]+)\s*\|\s*"                             # Duration (us)
+                r"([0-9]+)x\s*\|\s*"                                 # Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # CPU Time (μs, ns, ms, us, s)
+                r"([\d.]+%)\s*\|\s*"                                 # CPU Noise percentage
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # GPU Time
+                r"([\d.]+%)\s*\|\s*"                                 # GPU Noise percentage
+                r"([0-9]+)x\s*\|\s*"                                 # Batch Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|"                          # Batch GPU Time
             )
             current = None
             parsed_any = False
@@ -98,7 +105,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     # self._result.add_result(f'duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
                     self._result.add_result(f'duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
                     # self._result.add_result(f'duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
-                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples', int(batch_samples))
+                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples', int(batch_samples.replace('x', '')))
                     self._result.add_result(f'duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu))
                     parsed_any = True
             if not parsed_any:
diff --git a/third_party/nvbench b/third_party/nvbench
index 3de9dc95d..7feda2cf3 160000
--- a/third_party/nvbench
+++ b/third_party/nvbench
@@ -1 +1 @@
-Subproject commit 3de9dc95da3fc859900a799918a0c1d9091920ff
+Subproject commit 7feda2cf3ade88b3e73a0e0414ba543a4fbfbc43

From 0cf48bb57df6beff272f8e14fe40243770136de9 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 16:48:58 +0000
Subject: [PATCH 12/27] update cmake

---
 dockerfile/cuda12.8.dockerfile | 9 ++++++++-
 dockerfile/cuda12.9.dockerfile | 9 ++++++++-
 dockerfile/cuda13.0.dockerfile | 9 ++++++++-
 third_party/Makefile           | 4 ++--
 4 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/dockerfile/cuda12.8.dockerfile b/dockerfile/cuda12.8.dockerfile
index 47a35feba..b3bea186f 100644
--- a/dockerfile/cuda12.8.dockerfile
+++ b/dockerfile/cuda12.8.dockerfile
@@ -61,6 +61,13 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
+# Install CMake 3.30.4 for nvbench compatibility
+RUN cd /tmp && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
+    tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
+    cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
+    rm -rf cmake-3.30.4-linux-x86_64*
+
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
 ARG TARGETARCH
@@ -161,7 +168,7 @@ ADD dockerfile/etc /opt/microsoft/
 WORKDIR ${SB_HOME}
 
 ADD third_party third_party
-RUN make -C third_party cuda_with_msccl
+RUN make -C third_party cuda_with_msccl cuda_nvbench
 
 ADD . .
 RUN python3 -m pip install --upgrade setuptools==70.3.0 && \
diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index d823b1746..34d7a9900 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -62,6 +62,13 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
+# Install CMake 3.30.4 for nvbench compatibility
+RUN cd /tmp && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
+    tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
+    cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
+    rm -rf cmake-3.30.4-linux-x86_64*
+
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
 ARG TARGETARCH
@@ -162,7 +169,7 @@ ADD dockerfile/etc /opt/microsoft/
 WORKDIR ${SB_HOME}
 
 ADD third_party third_party
-RUN make -C third_party cuda_with_msccl
+RUN make -C third_party cuda_with_msccl cuda_nvbench
 
 ADD . .
 RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index 5ce97c1c9..d9b54d081 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -62,6 +62,13 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
+# Install CMake 3.30.4 for nvbench compatibility
+RUN cd /tmp && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
+    tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
+    cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
+    rm -rf cmake-3.30.4-linux-x86_64*
+
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
 ARG TARGETARCH
@@ -151,7 +158,7 @@ ADD dockerfile/etc /opt/microsoft/
 WORKDIR ${SB_HOME}
 
 ADD third_party third_party
-RUN make -C third_party cuda
+RUN make -C third_party cuda_with_msccl cuda_nvbench
 
 ADD . .
 RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
diff --git a/third_party/Makefile b/third_party/Makefile
index 2300cd2ac..b25fca042 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -22,7 +22,7 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
 all: cuda rocm
 # msccl: api change in cudaStreamUpdateCaptureDependencies
 cuda_with_msccl: cuda cuda_msccl
-cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth nvbench
+cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
 rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
 cpu: common cpu_perftest
 common: fio cpu_stream
@@ -313,7 +313,7 @@ nvbandwidth: sb_micro_path
 	cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin
 
 # Build nvbench
-nvbench: sb_micro_path
+cuda_nvbench: sb_micro_path
 ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
 	$(eval ARCHS := "100;103")
 else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)

From 5905647c49d1482da3e3a62c474ab5a7398e07e5 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 16:58:01 +0000
Subject: [PATCH 13/27] fix lint

---
 .../benchmarks/micro_benchmarks/__init__.py   | 46 ++++---------------
 .../micro_benchmarks/nvbench/CMakeLists.txt   | 40 ++++++----------
 .../micro_benchmarks/nvbench/kernel_launch.cu |  4 +-
 .../micro_benchmarks/nvbench/sleep_kernel.cu  | 21 ++++-----
 .../micro_benchmarks/nvbench_base.py          |  7 ++-
 .../micro_benchmarks/nvbench_kernel_launch.py | 32 ++++++-------
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 31 +++++++------
 .../test_nvbench_kernel_launch.py             |  3 +-
 .../test_nvbench_sleep_kernel.py              |  9 ++--
 9 files changed, 74 insertions(+), 119 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index ac50ce252..47a786f6d 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -43,42 +43,12 @@
 from superbench.benchmarks.micro_benchmarks.nvbench_sleep_kernel import NvbenchSleepKernel
 
 __all__ = [
-    'BlasLtBaseBenchmark',
-    'ComputationCommunicationOverlap',
-    'CpuMemBwLatencyBenchmark',
-    'CpuHplBenchmark',
-    'CpuStreamBenchmark',
-    'CublasBenchmark',
-    'CublasLtBenchmark',
-    'CudaGemmFlopsBenchmark',
-    'CudaMemBwBenchmark',
-    'CudaNcclBwBenchmark',
-    'CudnnBenchmark',
-    'DiskBenchmark',
-    'DistInference',
-    'HipBlasLtBenchmark',
-    'GPCNetBenchmark',
-    'GemmFlopsBenchmark',
-    'GpuBurnBenchmark',
-    'GpuCopyBwBenchmark',
-    'GpuStreamBenchmark',
-    'IBBenchmark',
-    'IBLoopbackBenchmark',
-    'KernelLaunch',
-    'MemBwBenchmark',
-    'MicroBenchmark',
-    'MicroBenchmarkWithInvoke',
-    'ORTInferenceBenchmark',
-    'RocmGemmFlopsBenchmark',
-    'RocmMemBwBenchmark',
-    'ShardingMatmul',
-    'TCPConnectivityBenchmark',
-    'TensorRTInferenceBenchmark',
-    'DirectXGPUEncodingLatency',
-    'DirectXGPUCopyBw',
-    'DirectXGPUMemBw',
-    'DirectXGPUCoreFlops',
-    'NvBandwidthBenchmark',
-    'NvbenchKernelLaunch',
-    'NvbenchSleepKernel'
+    'BlasLtBaseBenchmark', 'ComputationCommunicationOverlap', 'CpuMemBwLatencyBenchmark', 'CpuHplBenchmark',
+    'CpuStreamBenchmark', 'CublasBenchmark', 'CublasLtBenchmark', 'CudaGemmFlopsBenchmark', 'CudaMemBwBenchmark',
+    'CudaNcclBwBenchmark', 'CudnnBenchmark', 'DiskBenchmark', 'DistInference', 'HipBlasLtBenchmark', 'GPCNetBenchmark',
+    'GemmFlopsBenchmark', 'GpuBurnBenchmark', 'GpuCopyBwBenchmark', 'GpuStreamBenchmark', 'IBBenchmark',
+    'IBLoopbackBenchmark', 'KernelLaunch', 'MemBwBenchmark', 'MicroBenchmark', 'MicroBenchmarkWithInvoke',
+    'ORTInferenceBenchmark', 'RocmGemmFlopsBenchmark', 'RocmMemBwBenchmark', 'ShardingMatmul',
+    'TCPConnectivityBenchmark', 'TensorRTInferenceBenchmark', 'DirectXGPUEncodingLatency', 'DirectXGPUCopyBw',
+    'DirectXGPUMemBw', 'DirectXGPUCoreFlops', 'NvBandwidthBenchmark', 'NvbenchKernelLaunch', 'NvbenchSleepKernel'
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index 448e83800..f6c8508d4 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -1,30 +1,18 @@
-cmake_minimum_required(VERSION 3.20)
-project(nvbench_benchmarks LANGUAGES CUDA)
+cmake_minimum_required(VERSION 3.20) project(nvbench_benchmarks LANGUAGES CUDA)
 
-find_package(CUDAToolkit QUIET)
-if (CUDAToolkit_FOUND)
-  include(../cuda_common.cmake)
-  find_package(nvbench CONFIG REQUIRED)
+    find_package(CUDAToolkit QUIET) if (CUDAToolkit_FOUND) include(../ cuda_common.cmake)
+        find_package(nvbench CONFIG REQUIRED)
 
-  # list all your CUDA benchmark source files here
-  set(NVBENCH_SOURCES
-    kernel_launch.cu
-    sleep_kernel.cu
-    # add more *.cu as needed
-  )
+#list all your CUDA benchmark source files here
+            set(NVBENCH_SOURCES kernel_launch.cu sleep_kernel.cu
+#add more *.cu as needed
+                )
 
-  foreach(src ${NVBENCH_SOURCES})
-    # strip ".cu" → NAME_WE
-    get_filename_component(basename ${src} NAME_WE)
-    set(target nvbench_${basename})
+                foreach (src ${NVBENCH_SOURCES})
+#strip ".cu" → NAME_WE
+                    get_filename_component(basename ${src} NAME_WE) set(target nvbench_${basename})
 
-    add_executable(${target} ${src})
-    target_compile_features(${target} PUBLIC cuda_std_17)
-    target_link_libraries(${target}
-      PRIVATE nvbench::nvbench nvbench::main
-    )
-    install(TARGETS ${target} RUNTIME DESTINATION bin)
-  endforeach()
-else()
-  message(STATUS "CUDA not found, skipping nvbench benchmarks.")
-endif()
\ No newline at end of file
+                        add_executable(${target} ${src}) target_compile_features(${target} PUBLIC cuda_std_17)
+                            target_link_libraries(${target} PRIVATE nvbench::nvbench nvbench::main)
+                                install(TARGETS ${target} RUNTIME DESTINATION bin) endforeach() else()
+                                    message(STATUS "CUDA not found, skipping nvbench benchmarks.") endif()
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu b/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
index f703c587a..08dc40294 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/kernel_launch.cu
@@ -3,9 +3,7 @@
 __global__ void empty_kernel() {}
 
 void kernel_launch(nvbench::state &state) {
-  state.exec([](nvbench::launch &launch) {
-    empty_kernel<<<1, 1, 0, launch.get_stream()>>>();
-  });
+    state.exec([](nvbench::launch &launch) { empty_kernel<<<1, 1, 0, launch.get_stream()>>>(); });
 }
 
 NVBENCH_BENCH(kernel_launch);
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu b/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
index d486d8ee7..b4789377e 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/sleep_kernel.cu
@@ -1,22 +1,21 @@
-#include <nvbench/nvbench.cuh>
 #include <cuda/std/chrono>
 #include <cuda_runtime.h>
+#include <nvbench/nvbench.cuh>
 
 __global__ void sleep_kernel(nvbench::int64_t microseconds) {
-  const auto start = cuda::std::chrono::high_resolution_clock::now();
-  const auto target_duration = cuda::std::chrono::microseconds(microseconds);
-  const auto finish = start + target_duration;
+    const auto start = cuda::std::chrono::high_resolution_clock::now();
+    const auto target_duration = cuda::std::chrono::microseconds(microseconds);
+    const auto finish = start + target_duration;
 
-  while (cuda::std::chrono::high_resolution_clock::now() < finish) {
-    // busy wait
-  }
+    while (cuda::std::chrono::high_resolution_clock::now() < finish) {
+        // busy wait
+    }
 }
 
 void sleep_benchmark(nvbench::state &state) {
-  const auto duration_us = state.get_int64("Duration (us)");
-  state.exec([&duration_us](nvbench::launch &launch) {
-    sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration_us);
-  });
+    const auto duration_us = state.get_int64("Duration (us)");
+    state.exec(
+        [&duration_us](nvbench::launch &launch) { sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration_us); });
 }
 NVBENCH_BENCH(sleep_benchmark)
     .add_int64_axis("Duration (us)", nvbench::range(0, 100, 5))
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
index 916408304..8f472444f 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_base.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -29,7 +29,6 @@ def parse_time_to_us(raw: str) -> float:
 
 class NvbenchBase(MicroBenchmarkWithInvoke):
     """Base class for NVBench benchmarks with common functionality."""
-
     def __init__(self, name, parameters=''):
         """Constructor.
 
@@ -143,7 +142,7 @@ def _build_base_command(self):
         """
         if not self._bin_name:
             raise ValueError("Subclass must set _bin_name")
-            
+
         command = os.path.join(self._args.bin_dir, self._bin_name)
         parts = [command]
 
@@ -200,7 +199,7 @@ def _preprocess(self):
 
         # Build base command - subclasses can override this method to add specific arguments
         parts = self._build_base_command()
-        
+
         # Finalize command
         self._commands = [' '.join(parts)]
         return True
@@ -239,4 +238,4 @@ def _handle_parsing_error(self, error_msg, raw_output):
         self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
         logger.error(
             f"Invalid result format - round:{self._curr_run_index}, bench:{self._name}, msg:{error_msg}\n{raw_output}"
-        )
\ No newline at end of file
+        )
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
index 1a424589f..80cfd9378 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -11,7 +11,6 @@
 
 class NvbenchKernelLaunch(NvbenchBase):
     """The NVBench Kernel Launch benchmark class."""
-
     def __init__(self, name, parameters=''):
         """Constructor.
 
@@ -33,29 +32,29 @@ def _process_raw_result(self, cmd_idx, raw_output):
             True if the raw output string is valid and result can be extracted.
         """
         self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
-        
+
         try:
             gpu_section = r"### \[(\d+)\] NVIDIA"
             # Regex pattern to handle different time units and flexible spacing
             row_pat = (
-                r"\|\s*([0-9]+)x\s*\|\s*"                            # Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # CPU Time (μs, ns, ms, us, s)
-                r"([\d.]+%)\s*\|\s*"                                 # CPU Noise percentage
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # GPU Time
-                r"([\d.]+%)\s*\|\s*"                                 # GPU Noise percentage
-                r"([0-9]+)x\s*\|\s*"                                 # Batch Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|"                          # Batch GPU Time
+                r"\|\s*([0-9]+)x\s*\|\s*"    # Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # CPU Time (μs, ns, ms, us, s)
+                r"([\d.]+%)\s*\|\s*"    # CPU Noise percentage
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # GPU Time
+                r"([\d.]+%)\s*\|\s*"    # GPU Noise percentage
+                r"([0-9]+)x\s*\|\s*"    # Batch Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|"    # Batch GPU Time
             )
             current = None
-            parsed_any = False  # Track if any valid rows are parsed
-            
+            parsed_any = False    # Track if any valid rows are parsed
+
             for line in raw_output.splitlines():
                 line = line.strip()
                 g = re.match(gpu_section, line)
                 if g:
                     current = f"gpu_{g.group(1)}"
                     continue
-                    
+
                 r = re.match(row_pat, line)
                 if r and current:
                     samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
@@ -67,15 +66,16 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     # self._result.add_result("batch_samples", int(batch_samples.replace('x', '')))
                     self._result.add_result("batch_gpu_time", self._parse_time_value(batch_gpu))
                     parsed_any = True
-                    
+
             if not parsed_any:
                 logger.error("No valid rows parsed from the raw output.")
                 raise RuntimeError("No valid rows parsed")
-                
+
         except Exception as e:
             self._handle_parsing_error(str(e), raw_output)
             return False
-            
+
         return True
 
-BenchmarkRegistry.register_benchmark('nvbench-kernel-launch', NvbenchKernelLaunch, platform=Platform.CUDA)
\ No newline at end of file
+
+BenchmarkRegistry.register_benchmark('nvbench-kernel-launch', NvbenchKernelLaunch, platform=Platform.CUDA)
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 03801a666..f505280a2 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -13,7 +13,6 @@
 
 class NvbenchSleepKernel(NvbenchBase):
     """The NVBench Sleep Kernel benchmark class."""
-
     def __init__(self, name, parameters=''):
         """Constructor.
 
@@ -35,7 +34,7 @@ def add_parser_arguments(self):
             type=str,
             default='[0,25,50,75,100]',
             help='Duration axis values in microseconds. Supports multiple formats: '
-                 '"50" (single value), "[25,50,75]" (list), "[25:75]" (range), "[0:50:10]" (range with step).',
+            '"50" (single value), "[25,50,75]" (list), "[25:75]" (range), "[0:50:10]" (range with step).',
         )
 
     def _preprocess(self):
@@ -49,7 +48,7 @@ def _preprocess(self):
 
         # Build base command with common nvbench arguments
         parts = self._build_base_command()
-        
+
         # Add sleep-kernel specific arguments
         parts.extend(['--axis', f'"Duration (us)={self._args.duration_us.strip()}"'])
 
@@ -77,14 +76,14 @@ def _process_raw_result(self, cmd_idx, raw_output):
             gpu_section = r"### \[(\d+)\] NVIDIA"
             # Regex pattern to handle different time units and flexible spacing
             row_pat = (
-                r"\|\s*([0-9]+)\s*\|\s*"                             # Duration (us)
-                r"([0-9]+)x\s*\|\s*"                                 # Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # CPU Time (μs, ns, ms, us, s)
-                r"([\d.]+%)\s*\|\s*"                                 # CPU Noise percentage
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"                       # GPU Time
-                r"([\d.]+%)\s*\|\s*"                                 # GPU Noise percentage
-                r"([0-9]+)x\s*\|\s*"                                 # Batch Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|"                          # Batch GPU Time
+                r"\|\s*([0-9]+)\s*\|\s*"    # Duration (us)
+                r"([0-9]+)x\s*\|\s*"    # Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # CPU Time (μs, ns, ms, us, s)
+                r"([\d.]+%)\s*\|\s*"    # CPU Noise percentage
+                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # GPU Time
+                r"([\d.]+%)\s*\|\s*"    # GPU Noise percentage
+                r"([0-9]+)x\s*\|\s*"    # Batch Samples
+                r"([\d.]+\s*[μmun]?s)\s*\|"    # Batch GPU Time
             )
             current = None
             parsed_any = False
@@ -99,14 +98,17 @@ def _process_raw_result(self, cmd_idx, raw_output):
                 r = re.match(row_pat, line)
                 if r and current:
                     logger.debug(f"Matched row: {r.groups()}")
-                    duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
+                    duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups(
+                    )
                     # self._result.add_result(f'duration_us_{duration_us}_samples', int(samples))
                     self._result.add_result(f'duration_us_{duration_us}_cpu_time', self._parse_time_value(cpu_time))
                     # self._result.add_result(f'duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
                     self._result.add_result(f'duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
                     # self._result.add_result(f'duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
                     # self._result.add_result(f'duration_us_{duration_us}_batch_samples', int(batch_samples.replace('x', '')))
-                    self._result.add_result(f'duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu))
+                    self._result.add_result(
+                        f'duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu)
+                    )
                     parsed_any = True
             if not parsed_any:
                 raise RuntimeError("No valid rows parsed")
@@ -116,4 +118,5 @@ def _process_raw_result(self, cmd_idx, raw_output):
             return False
         return True
 
-BenchmarkRegistry.register_benchmark('nvbench-sleep-kernel', NvbenchSleepKernel, platform=Platform.CUDA)
\ No newline at end of file
+
+BenchmarkRegistry.register_benchmark('nvbench-sleep-kernel', NvbenchSleepKernel, platform=Platform.CUDA)
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
index 2923c7d3b..80072231c 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
@@ -12,7 +12,6 @@
 
 class TestNvbenchKernelLaunchBenchmark(BenchmarkTestCase, unittest.TestCase):
     """Test class for NVBench Kernel Launch benchmark."""
-
     @classmethod
     def setUpClass(cls):
         """Hook method for setting up class fixture before running tests in the class."""
@@ -102,4 +101,4 @@ def test_nvbench_kernel_launch_process_raw_result_invalid_output(self):
 
 
 if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
+    unittest.main()
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
index 6d020814a..434c842e7 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -12,7 +12,6 @@
 
 class TestNvbenchSleepKernelBenchmark(BenchmarkTestCase, unittest.TestCase):
     """Test class for NVBench Sleep Kernel benchmark."""
-
     @classmethod
     def setUpClass(cls):
         """Hook method for setting up class fixture before running tests in the class."""
@@ -97,22 +96,22 @@ def test_nvbench_sleep_kernel_preprocess_duration_formats(self):
         benchmark = benchmark_class(benchmark_name, parameters='--duration_us "50"')
         assert benchmark._preprocess()
         assert '--axis "Duration (us)=50"' in benchmark._commands[0]
-        
+
         # Test list format
         benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[25,50,75]"')
         assert benchmark._preprocess()
         assert '--axis "Duration (us)=[25,50,75]"' in benchmark._commands[0]
-        
+
         # Test range format
         benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[25:75]"')
         assert benchmark._preprocess()
         assert '--axis "Duration (us)=[25:75]"' in benchmark._commands[0]
-        
+
         # Test range with step format
         benchmark = benchmark_class(benchmark_name, parameters='--duration_us "[0:50:10]"')
         assert benchmark._preprocess()
         assert '--axis "Duration (us)=[0:50:10]"' in benchmark._commands[0]
-        
+
         # Test default format
         benchmark = benchmark_class(benchmark_name, parameters='')
         assert benchmark._preprocess()

From baa57c9ef72678513bf140ce9219b39b7c61fe1e Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 17:35:57 +0000
Subject: [PATCH 14/27] fix lint

---
 examples/benchmarks/nvbench_kernel_launch.py  |  9 ++++-
 .../micro_benchmarks/nvbench/CMakeLists.txt   | 40 ++++++++++++-------
 .../micro_benchmarks/nvbench_base.py          | 22 +++++-----
 .../micro_benchmarks/nvbench_kernel_launch.py | 22 +++++-----
 .../micro_benchmarks/nvbench_sleep_kernel.py  |  2 +-
 .../test_nvbench_kernel_launch.py             |  2 +-
 .../test_nvbench_sleep_kernel.py              |  2 +-
 7 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/examples/benchmarks/nvbench_kernel_launch.py b/examples/benchmarks/nvbench_kernel_launch.py
index 73377084b..eac2db602 100644
--- a/examples/benchmarks/nvbench_kernel_launch.py
+++ b/examples/benchmarks/nvbench_kernel_launch.py
@@ -1,5 +1,10 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Example of NVBench Kernel Launch benchmark."""
+
 from superbench.benchmarks import BenchmarkRegistry, Platform
-from superbench.common.utils import logger
+from superbench.common.utils importSlogger
 
 if __name__ == '__main__':
     context = BenchmarkRegistry.create_benchmark_context(
@@ -22,4 +27,4 @@
             'benchmark: {}, return code: {}, result: {}'.format(
                 benchmark.name, benchmark.return_code, benchmark.result
             )
-        )
\ No newline at end of file
+        )
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index f6c8508d4..448e83800 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -1,18 +1,30 @@
-cmake_minimum_required(VERSION 3.20) project(nvbench_benchmarks LANGUAGES CUDA)
+cmake_minimum_required(VERSION 3.20)
+project(nvbench_benchmarks LANGUAGES CUDA)
 
-    find_package(CUDAToolkit QUIET) if (CUDAToolkit_FOUND) include(../ cuda_common.cmake)
-        find_package(nvbench CONFIG REQUIRED)
+find_package(CUDAToolkit QUIET)
+if (CUDAToolkit_FOUND)
+  include(../cuda_common.cmake)
+  find_package(nvbench CONFIG REQUIRED)
 
-#list all your CUDA benchmark source files here
-            set(NVBENCH_SOURCES kernel_launch.cu sleep_kernel.cu
-#add more *.cu as needed
-                )
+  # list all your CUDA benchmark source files here
+  set(NVBENCH_SOURCES
+    kernel_launch.cu
+    sleep_kernel.cu
+    # add more *.cu as needed
+  )
 
-                foreach (src ${NVBENCH_SOURCES})
-#strip ".cu" → NAME_WE
-                    get_filename_component(basename ${src} NAME_WE) set(target nvbench_${basename})
+  foreach(src ${NVBENCH_SOURCES})
+    # strip ".cu" → NAME_WE
+    get_filename_component(basename ${src} NAME_WE)
+    set(target nvbench_${basename})
 
-                        add_executable(${target} ${src}) target_compile_features(${target} PUBLIC cuda_std_17)
-                            target_link_libraries(${target} PRIVATE nvbench::nvbench nvbench::main)
-                                install(TARGETS ${target} RUNTIME DESTINATION bin) endforeach() else()
-                                    message(STATUS "CUDA not found, skipping nvbench benchmarks.") endif()
\ No newline at end of file
+    add_executable(${target} ${src})
+    target_compile_features(${target} PUBLIC cuda_std_17)
+    target_link_libraries(${target}
+      PRIVATE nvbench::nvbench nvbench::main
+    )
+    install(TARGETS ${target} RUNTIME DESTINATION bin)
+  endforeach()
+else()
+  message(STATUS "CUDA not found, skipping nvbench benchmarks.")
+endif()
\ No newline at end of file
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
index 8f472444f..0be282cb4 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_base.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -136,12 +136,12 @@ def add_parser_arguments(self):
 
     def _build_base_command(self):
         """Build the base nvbench command with common arguments.
-        
+
         Returns:
             list: Command parts that can be extended by subclasses.
         """
         if not self._bin_name:
-            raise ValueError("Subclass must set _bin_name")
+            raise ValueError('Subclass must set _bin_name')
 
         command = os.path.join(self._args.bin_dir, self._bin_name)
         parts = [command]
@@ -190,7 +190,7 @@ def _build_base_command(self):
 
     def _preprocess(self):
         """Default preprocess implementation. Can be overridden by subclasses.
-        
+
         Returns:
             True if _preprocess() succeed.
         """
@@ -206,10 +206,10 @@ def _preprocess(self):
 
     def _parse_time_value(self, time_str):
         """Parse time string to microseconds.
-        
+
         Args:
-            time_str (str): Time string like "123.45 us", "678.9 ns", etc.
-            
+            time_str (str): Time string like '123.45 us', '678.9 ns', etc.
+
         Returns:
             float: Time in microseconds.
         """
@@ -217,10 +217,10 @@ def _parse_time_value(self, time_str):
 
     def _parse_percentage(self, percent_str):
         """Parse percentage string to float.
-        
+
         Args:
-            percent_str (str): Percentage string like "12.34%"
-            
+            percent_str (str): Percentage string like '12.34%'
+
         Returns:
             float: Percentage value as float.
         """
@@ -230,12 +230,12 @@ def _parse_percentage(self, percent_str):
 
     def _handle_parsing_error(self, error_msg, raw_output):
         """Handle parsing errors consistently.
-        
+
         Args:
             error_msg (str): Error message to log.
             raw_output (str): Raw output that failed to parse.
         """
         self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
         logger.error(
-            f"Invalid result format - round:{self._curr_run_index}, bench:{self._name}, msg:{error_msg}\n{raw_output}"
+            f'Invalid result format - round:{self._curr_run_index}, bench:{self._name}, msg:{error_msg}\n{raw_output}'
         )
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
index 80cfd9378..83233644f 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -5,7 +5,7 @@
 
 import re
 from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform
+from superbench.benchmarks import BenchmarkRegistry, Platform
 from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase
 
 
@@ -34,16 +34,16 @@ def _process_raw_result(self, cmd_idx, raw_output):
         self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
 
         try:
-            gpu_section = r"### \[(\d+)\] NVIDIA"
+            gpu_section = r'### \[(\d+)\] NVIDIA'
             # Regex pattern to handle different time units and flexible spacing
             row_pat = (
-                r"\|\s*([0-9]+)x\s*\|\s*"    # Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # CPU Time (μs, ns, ms, us, s)
-                r"([\d.]+%)\s*\|\s*"    # CPU Noise percentage
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # GPU Time
-                r"([\d.]+%)\s*\|\s*"    # GPU Noise percentage
-                r"([0-9]+)x\s*\|\s*"    # Batch Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|"    # Batch GPU Time
+                r'\|\s*([0-9]+)x\s*\|\s*'    # Samples
+                r'([\d.]+\s*[μmun]?s)\s*\|\s*'    # CPU Time (μs, ns, ms, us, s)
+                r'([\d.]+%)\s*\|\s*'    # CPU Noise percentage
+                r'([\d.]+\s*[μmun]?s)\s*\|\s*'    # GPU Time
+                r'([\d.]+%)\s*\|\s*'    # GPU Noise percentage
+                r'([0-9]+)x\s*\|\s*'    # Batch Samples
+                r'([\d.]+\s*[μmun]?s)\s*\|'    # Batch GPU Time
             )
             current = None
             parsed_any = False    # Track if any valid rows are parsed
@@ -68,8 +68,8 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     parsed_any = True
 
             if not parsed_any:
-                logger.error("No valid rows parsed from the raw output.")
-                raise RuntimeError("No valid rows parsed")
+                logger.error('No valid rows parsed from the raw output.')
+                raise RuntimeError('No valid rows parsed')
 
         except Exception as e:
             self._handle_parsing_error(str(e), raw_output)
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index f505280a2..64612bb45 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -111,7 +111,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     )
                     parsed_any = True
             if not parsed_any:
-                raise RuntimeError("No valid rows parsed")
+                raise RuntimeError('No valid rows parsed')
         except Exception as e:
             logger.error(f"Error processing raw result: {e}")
             self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
index 80072231c..02908e5eb 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_kernel_launch.py
@@ -93,7 +93,7 @@ def test_nvbench_kernel_launch_process_raw_result_invalid_output(self):
         assert benchmark._preprocess()
 
         # Mock raw output with invalid format
-        raw_output = "Invalid output format"
+        raw_output = 'Invalid output format'
 
         # Parse the provided raw output
         assert not benchmark._process_raw_result(0, raw_output)
diff --git a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
index 434c842e7..4606768ff 100644
--- a/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
+++ b/tests/benchmarks/micro_benchmarks/test_nvbench_sleep_kernel.py
@@ -129,7 +129,7 @@ def test_nvbench_sleep_kernel_process_raw_result_invalid_output(self):
         assert benchmark._preprocess()
 
         # Mock raw output with invalid format
-        raw_output = "Invalid output format"
+        raw_output = 'Invalid output format'
 
         # Parse the provided raw output
         assert not benchmark._process_raw_result(0, raw_output)

From ecce2d9fed5ef2452fa14518077a8b4c0a4bd6fb Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 17:38:06 +0000
Subject: [PATCH 15/27] fix import

---
 superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 64612bb45..b6163f86d 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -4,8 +4,6 @@
 """Module of the NVBench Sleep Kernel benchmark."""
 
 import re
-import os
-
 from superbench.common.utils import logger
 from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
 from superbench.benchmarks.micro_benchmarks.nvbench_base import NvbenchBase

From 3a58ead25ba7b91c65dd4f5e5f59fbd52753769f Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 19:54:38 +0000
Subject: [PATCH 16/27] fix

---
 examples/benchmarks/nvbench_kernel_launch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/benchmarks/nvbench_kernel_launch.py b/examples/benchmarks/nvbench_kernel_launch.py
index eac2db602..c0f74f55a 100644
--- a/examples/benchmarks/nvbench_kernel_launch.py
+++ b/examples/benchmarks/nvbench_kernel_launch.py
@@ -4,7 +4,7 @@
 """Example of NVBench Kernel Launch benchmark."""
 
 from superbench.benchmarks import BenchmarkRegistry, Platform
-from superbench.common.utils importSlogger
+from superbench.common.utils import logger
 
 if __name__ == '__main__':
     context = BenchmarkRegistry.create_benchmark_context(

From d0d8773e5148c203e8a74eb75d21c8778df01046 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 20:14:12 +0000
Subject: [PATCH 17/27] fix

---
 .../micro_benchmarks/nvbench_base.py          | 62 ++++++++++++-------
 .../micro_benchmarks/nvbench_kernel_launch.py | 16 ++---
 .../micro_benchmarks/nvbench_sleep_kernel.py  | 22 +++----
 3 files changed, 57 insertions(+), 43 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_base.py b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
index 0be282cb4..98e705b46 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_base.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_base.py
@@ -134,27 +134,16 @@ def add_parser_arguments(self):
             help='Minimum R-squared for entropy stopping criterion.',
         )
 
-    def _build_base_command(self):
-        """Build the base nvbench command with common arguments.
-
-        Returns:
-            list: Command parts that can be extended by subclasses.
-        """
-        if not self._bin_name:
-            raise ValueError('Subclass must set _bin_name')
-
-        command = os.path.join(self._args.bin_dir, self._bin_name)
-        parts = [command]
-
-        # Device configuration - in distributed mode, let SuperBench handle device assignment
-        # Only add --devices if explicitly specified
+    def _add_device_args(self, parts):
+        """Add device configuration arguments to command parts."""
         if hasattr(self._args, 'devices') and self._args.devices is not None:
             if self._args.devices == 'all':
                 parts.extend(['--devices', 'all'])
             else:
                 parts.extend(['--devices', self._args.devices])
 
-        # Benchmark Properties
+    def _add_benchmark_property_args(self, parts):
+        """Add benchmark property arguments to command parts."""
         if hasattr(self._args, 'skip_time') and self._args.skip_time >= 0:
             parts.extend(['--skip-time', str(self._args.skip_time)])
         if hasattr(self._args, 'throttle_threshold') and self._args.throttle_threshold > 0:
@@ -168,7 +157,8 @@ def _build_base_command(self):
         if hasattr(self._args, 'profile') and self._args.profile:
             parts.append('--profile')
 
-        # Stopping criteria
+    def _add_stopping_criteria_args(self, parts):
+        """Add stopping criteria arguments to command parts."""
         if hasattr(self._args, 'timeout') and self._args.timeout is not None:
             parts.extend(['--timeout', str(self._args.timeout)])
         if hasattr(self._args, 'min_samples') and self._args.min_samples is not None:
@@ -176,15 +166,39 @@ def _build_base_command(self):
         if hasattr(self._args, 'stopping_criterion') and self._args.stopping_criterion:
             parts.extend(['--stopping-criterion', self._args.stopping_criterion])
             if self._args.stopping_criterion == 'stdrel':
-                if hasattr(self._args, 'min_time') and self._args.min_time is not None:
-                    parts.extend(['--min-time', str(self._args.min_time)])
-                if hasattr(self._args, 'max_noise') and self._args.max_noise is not None:
-                    parts.extend(['--max-noise', str(self._args.max_noise)])
+                self._add_stdrel_args(parts)
             elif self._args.stopping_criterion == 'entropy':
-                if hasattr(self._args, 'max_angle') and self._args.max_angle is not None:
-                    parts.extend(['--max-angle', str(self._args.max_angle)])
-                if hasattr(self._args, 'min_r2') and self._args.min_r2 is not None:
-                    parts.extend(['--min-r2', str(self._args.min_r2)])
+                self._add_entropy_args(parts)
+
+    def _add_stdrel_args(self, parts):
+        """Add stdrel-specific stopping criterion arguments."""
+        if hasattr(self._args, 'min_time') and self._args.min_time is not None:
+            parts.extend(['--min-time', str(self._args.min_time)])
+        if hasattr(self._args, 'max_noise') and self._args.max_noise is not None:
+            parts.extend(['--max-noise', str(self._args.max_noise)])
+
+    def _add_entropy_args(self, parts):
+        """Add entropy-specific stopping criterion arguments."""
+        if hasattr(self._args, 'max_angle') and self._args.max_angle is not None:
+            parts.extend(['--max-angle', str(self._args.max_angle)])
+        if hasattr(self._args, 'min_r2') and self._args.min_r2 is not None:
+            parts.extend(['--min-r2', str(self._args.min_r2)])
+
+    def _build_base_command(self):
+        """Build the base nvbench command with common arguments.
+
+        Returns:
+            list: Command parts that can be extended by subclasses.
+        """
+        if not self._bin_name:
+            raise ValueError('Subclass must set _bin_name')
+
+        command = os.path.join(self._args.bin_dir, self._bin_name)
+        parts = [command]
+
+        self._add_device_args(parts)
+        self._add_benchmark_property_args(parts)
+        self._add_stopping_criteria_args(parts)
 
         return parts
 
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
index 83233644f..5120f1b51 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_kernel_launch.py
@@ -52,19 +52,19 @@ def _process_raw_result(self, cmd_idx, raw_output):
                 line = line.strip()
                 g = re.match(gpu_section, line)
                 if g:
-                    current = f"gpu_{g.group(1)}"
+                    current = f'gpu_{g.group(1)}'
                     continue
 
                 r = re.match(row_pat, line)
                 if r and current:
                     samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups()
-                    # self._result.add_result("samples", int(samples.replace('x', '')))
-                    self._result.add_result("cpu_time", self._parse_time_value(cpu_time))
-                    # self._result.add_result("cpu_noise", self._parse_percentage(cpu_noise))
-                    self._result.add_result("gpu_time", self._parse_time_value(gpu_time))
-                    # self._result.add_result("gpu_noise", self._parse_percentage(gpu_noise))
-                    # self._result.add_result("batch_samples", int(batch_samples.replace('x', '')))
-                    self._result.add_result("batch_gpu_time", self._parse_time_value(batch_gpu))
+                    # self._result.add_result('samples', int(samples.replace('x', '')))
+                    self._result.add_result('cpu_time', self._parse_time_value(cpu_time))
+                    # self._result.add_result('cpu_noise', self._parse_percentage(cpu_noise))
+                    self._result.add_result('gpu_time', self._parse_time_value(gpu_time))
+                    # self._result.add_result('gpu_noise', self._parse_percentage(gpu_noise))
+                    # self._result.add_result('batch_samples', int(batch_samples.replace('x', '')))
+                    self._result.add_result('batch_gpu_time', self._parse_time_value(batch_gpu))
                     parsed_any = True
 
             if not parsed_any:
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index b6163f86d..93465a0e8 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -66,22 +66,22 @@ def _process_raw_result(self, cmd_idx, raw_output):
         Return:
             True if the raw output string is valid and result can be extracted.
         """
-        logger.debug(f"Processing raw result for command index {cmd_idx}.")
-        logger.debug(f"Raw output:\n{raw_output}")
+        logger.debug(f'Processing raw result for command index {cmd_idx}.')
+        logger.debug(f'Raw output:\n{raw_output}')
 
         self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data)
         try:
-            gpu_section = r"### \[(\d+)\] NVIDIA"
+            gpu_section = r'### \[(\d+)\] NVIDIA'
             # Regex pattern to handle different time units and flexible spacing
             row_pat = (
-                r"\|\s*([0-9]+)\s*\|\s*"    # Duration (us)
-                r"([0-9]+)x\s*\|\s*"    # Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # CPU Time (μs, ns, ms, us, s)
-                r"([\d.]+%)\s*\|\s*"    # CPU Noise percentage
-                r"([\d.]+\s*[μmun]?s)\s*\|\s*"    # GPU Time
-                r"([\d.]+%)\s*\|\s*"    # GPU Noise percentage
-                r"([0-9]+)x\s*\|\s*"    # Batch Samples
-                r"([\d.]+\s*[μmun]?s)\s*\|"    # Batch GPU Time
+                r'\|\s*([0-9]+)\s*\|\s*'    # Duration (us)
+                r'([0-9]+)x\s*\|\s*'    # Samples
+                r'([\d.]+\s*[μmun]?s)\s*\|\s*'    # CPU Time (μs, ns, ms, us, s)
+                r'([\d.]+%)\s*\|\s*'    # CPU Noise percentage
+                r'([\d.]+\s*[μmun]?s)\s*\|\s*'    # GPU Time
+                r'([\d.]+%)\s*\|\s*'    # GPU Noise percentage
+                r'([0-9]+)x\s*\|\s*'    # Batch Samples
+                r'([\d.]+\s*[μmun]?s)\s*\|'    # Batch GPU Time
             )
             current = None
             parsed_any = False

From fbb5969388397ab5cd673534b2c4562ab013ad84 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 20:24:30 +0000
Subject: [PATCH 18/27] fix: use single quotes and wrap long comment in nvbench sleep kernel

---
 .../micro_benchmarks/nvbench_sleep_kernel.py        | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 93465a0e8..3a2ccd9a7 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -87,15 +87,15 @@ def _process_raw_result(self, cmd_idx, raw_output):
             parsed_any = False
             for line in raw_output.splitlines():
                 line = line.strip()
-                logger.debug(f"Processing line: {line}")
+                logger.debug(f'Processing line: {line}')
                 g = re.match(gpu_section, line)
                 if g:
-                    current = f"gpu_{g.group(1)}"
-                    logger.debug(f"Found GPU section: {current}")
+                    current = f'gpu_{g.group(1)}'
+                    logger.debug(f'Found GPU section: {current}')
                     continue
                 r = re.match(row_pat, line)
                 if r and current:
-                    logger.debug(f"Matched row: {r.groups()}")
+                    logger.debug(f'Matched row: {r.groups()}')
                     duration_us, samples, cpu_time, cpu_noise, gpu_time, gpu_noise, batch_samples, batch_gpu = r.groups(
                     )
                     # self._result.add_result(f'duration_us_{duration_us}_samples', int(samples))
@@ -103,7 +103,8 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     # self._result.add_result(f'duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
                     self._result.add_result(f'duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
                     # self._result.add_result(f'duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
-                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples', int(batch_samples.replace('x', '')))
+                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples', 
+                    # int(batch_samples.replace('x', '')))
                     self._result.add_result(
                         f'duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu)
                     )
@@ -111,7 +112,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
             if not parsed_any:
                 raise RuntimeError('No valid rows parsed')
         except Exception as e:
-            logger.error(f"Error processing raw result: {e}")
+            logger.error(f'Error processing raw result: {e}')
             self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
             return False
         return True

From f007745ec4d9e63094042360d44d7f8c2e8c7b02 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 20:34:54 +0000
Subject: [PATCH 19/27] fix: remove trailing whitespace in nvbench sleep kernel comment

---
 superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
index 3a2ccd9a7..e7bcb4322 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
+++ b/superbench/benchmarks/micro_benchmarks/nvbench_sleep_kernel.py
@@ -103,7 +103,7 @@ def _process_raw_result(self, cmd_idx, raw_output):
                     # self._result.add_result(f'duration_us_{duration_us}_cpu_noise', self._parse_percentage(cpu_noise))
                     self._result.add_result(f'duration_us_{duration_us}_gpu_time', self._parse_time_value(gpu_time))
                     # self._result.add_result(f'duration_us_{duration_us}_gpu_noise', self._parse_percentage(gpu_noise))
-                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples', 
+                    # self._result.add_result(f'duration_us_{duration_us}_batch_samples',
                     # int(batch_samples.replace('x', '')))
                     self._result.add_result(
                         f'duration_us_{duration_us}_batch_gpu_time', self._parse_time_value(batch_gpu)

From b6b6082f0016d497d77d98794c6faaf2a7bb1375 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 21:23:25 +0000
Subject: [PATCH 20/27] fix: upgrade CodeQL actions to v3, add CMake install step, make nvbench optional

---
 .github/workflows/codeql-analysis.yml         | 18 +++++--
 .../micro_benchmarks/nvbench/CMakeLists.txt   | 47 +++++++++++--------
 2 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index e53acebf6..4c9d43e56 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -29,14 +29,22 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v3
+      - name: Install CMake >= 3.20 and pthreads
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential
+          # Install newer CMake
+          sudo apt-get remove --purge --auto-remove cmake
+          wget https://github.com/Kitware/CMake/releases/download/v3.21.0/cmake-3.21.0-linux-x86_64.sh
+          sudo sh cmake-3.21.0-linux-x86_64.sh --skip-license --prefix=/usr/local
       - name: Initialize CodeQL
-        uses: github/codeql-action/init@v2
+        uses: github/codeql-action/init@v3
         with:
           languages: ${{ matrix.language }}
       - name: Autobuild
-        uses: github/codeql-action/autobuild@v2
+        uses: github/codeql-action/autobuild@v3
       - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@v2
+        uses: github/codeql-action/analyze@v3
   analyze-cpp:
     name: CodeQL analyze cpp
     runs-on: ubuntu-latest
@@ -54,10 +62,10 @@ jobs:
           DEBIAN_FRONTEND=noninteractive apt-get update
           DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo
       - name: Initialize CodeQL
-        uses: github/codeql-action/init@v2
+        uses: github/codeql-action/init@v3
         with:
           languages: cpp
       - name: Build
         run: make cppbuild -j
       - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@v2
+        uses: github/codeql-action/analyze@v3
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index 448e83800..527176774 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -4,27 +4,36 @@ project(nvbench_benchmarks LANGUAGES CUDA)
 find_package(CUDAToolkit QUIET)
 if (CUDAToolkit_FOUND)
   include(../cuda_common.cmake)
-  find_package(nvbench CONFIG REQUIRED)
-
-  # list all your CUDA benchmark source files here
-  set(NVBENCH_SOURCES
-    kernel_launch.cu
-    sleep_kernel.cu
-    # add more *.cu as needed
-  )
+  
+  # Try to find nvbench, but don't require it
+  find_package(nvbench CONFIG QUIET)
+  
+  if (nvbench_FOUND)
+    message(STATUS "Found nvbench, building nvbench benchmarks")
+    
+    # list all your CUDA benchmark source files here
+    set(NVBENCH_SOURCES
+      kernel_launch.cu
+      sleep_kernel.cu
+      # add more *.cu as needed
+    )
 
-  foreach(src ${NVBENCH_SOURCES})
-    # strip ".cu" → NAME_WE
-    get_filename_component(basename ${src} NAME_WE)
-    set(target nvbench_${basename})
+    foreach(src ${NVBENCH_SOURCES})
+      # strip ".cu" → NAME_WE
+      get_filename_component(basename ${src} NAME_WE)
+      set(target nvbench_${basename})
 
-    add_executable(${target} ${src})
-    target_compile_features(${target} PUBLIC cuda_std_17)
-    target_link_libraries(${target}
-      PRIVATE nvbench::nvbench nvbench::main
-    )
-    install(TARGETS ${target} RUNTIME DESTINATION bin)
-  endforeach()
+      add_executable(${target} ${src})
+      target_compile_features(${target} PUBLIC cuda_std_17)
+      target_link_libraries(${target}
+        PRIVATE nvbench::nvbench nvbench::main
+      )
+      install(TARGETS ${target} RUNTIME DESTINATION bin)
+    endforeach()
+  else()
+    message(STATUS "nvbench not found, skipping nvbench benchmarks.")
+    message(STATUS "To build nvbench benchmarks, first build the submodule in third_party/nvbench)
+  endif()
 else()
   message(STATUS "CUDA not found, skipping nvbench benchmarks.")
 endif()
\ No newline at end of file

From 0f2c838cd1174046121c0ae061ce9146ca54e717 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 21:40:50 +0000
Subject: [PATCH 21/27] fix: remove apt cmake in CUDA dockerfiles and close CMake message string

---
 dockerfile/cuda12.8.dockerfile                           | 9 +++++++--
 dockerfile/cuda12.9.dockerfile                           | 9 +++++++--
 dockerfile/cuda13.0.dockerfile                           | 9 +++++++--
 .../benchmarks/micro_benchmarks/nvbench/CMakeLists.txt   | 2 +-
 4 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/dockerfile/cuda12.8.dockerfile b/dockerfile/cuda12.8.dockerfile
index b3bea186f..03534f245 100644
--- a/dockerfile/cuda12.8.dockerfile
+++ b/dockerfile/cuda12.8.dockerfile
@@ -62,11 +62,16 @@ RUN apt-get update && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
 # Install CMake 3.30.4 for nvbench compatibility
-RUN cd /tmp && \
+RUN apt-get update && \
+    apt-get remove -y cmake cmake-data && \
+    apt-get autoremove -y && \
+    cd /tmp && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
     tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
     cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
-    rm -rf cmake-3.30.4-linux-x86_64*
+    rm -rf cmake-3.30.4-linux-x86_64* && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index 34d7a9900..eb47d252e 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -63,11 +63,16 @@ RUN apt-get update && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
 # Install CMake 3.30.4 for nvbench compatibility
-RUN cd /tmp && \
+RUN apt-get update && \
+    apt-get remove -y cmake cmake-data && \
+    apt-get autoremove -y && \
+    cd /tmp && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
     tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
     cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
-    rm -rf cmake-3.30.4-linux-x86_64*
+    rm -rf cmake-3.30.4-linux-x86_64* && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index d9b54d081..a33a39181 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -63,11 +63,16 @@ RUN apt-get update && \
     rm -rf /var/lib/apt/lists/* /tmp/*
 
 # Install CMake 3.30.4 for nvbench compatibility
-RUN cd /tmp && \
+RUN apt-get update && \
+    apt-get remove -y cmake cmake-data && \
+    apt-get autoremove -y && \
+    cd /tmp && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
     tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
     cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
-    rm -rf cmake-3.30.4-linux-x86_64*
+    rm -rf cmake-3.30.4-linux-x86_64* && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index 527176774..ceac53e56 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -32,7 +32,7 @@ if (CUDAToolkit_FOUND)
     endforeach()
   else()
     message(STATUS "nvbench not found, skipping nvbench benchmarks.")
-    message(STATUS "To build nvbench benchmarks, first build the submodule in third_party/nvbench)
+    message(STATUS "To build nvbench benchmarks, first build the submodule in third_party/nvbench")
   endif()
 else()
   message(STATUS "CUDA not found, skipping nvbench benchmarks.")

From 5bd20f6e599dbfaa7b19946fad915af0e04cdb3f Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 22:02:25 +0000
Subject: [PATCH 22/27] fix: move CMake install to CodeQL cpp job and install CMake under /opt/cmake

---
 .github/workflows/codeql-analysis.yml | 16 ++++++++--------
 dockerfile/cuda12.8.dockerfile        |  3 ++-
 dockerfile/cuda12.9.dockerfile        |  3 ++-
 dockerfile/cuda13.0.dockerfile        |  3 ++-
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 4c9d43e56..2960da08b 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -29,14 +29,6 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v3
-      - name: Install CMake >= 3.20 and pthreads
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential
-          # Install newer CMake
-          sudo apt-get remove --purge --auto-remove cmake
-          wget https://github.com/Kitware/CMake/releases/download/v3.21.0/cmake-3.21.0-linux-x86_64.sh
-          sudo sh cmake-3.21.0-linux-x86_64.sh --skip-license --prefix=/usr/local
       - name: Initialize CodeQL
         uses: github/codeql-action/init@v3
         with:
@@ -61,6 +53,14 @@ jobs:
         run: |
           DEBIAN_FRONTEND=noninteractive apt-get update
           DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo
+      - name: Install CMake >= 3.20 and pthreads
+        run: |
+          apt-get update
+          apt-get install -y build-essential
+          apt-get remove --purge --auto-remove cmake
+          wget https://github.com/Kitware/CMake/releases/download/v3.21.0/cmake-3.21.0-linux-x86_64.sh
+          sh cmake-3.21.0-linux-x86_64.sh --skip-license --prefix=/usr/local
+          cmake --version
       - name: Initialize CodeQL
         uses: github/codeql-action/init@v3
         with:
diff --git a/dockerfile/cuda12.8.dockerfile b/dockerfile/cuda12.8.dockerfile
index 03534f245..59cc15408 100644
--- a/dockerfile/cuda12.8.dockerfile
+++ b/dockerfile/cuda12.8.dockerfile
@@ -68,7 +68,8 @@ RUN apt-get update && \
     cd /tmp && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
     tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
-    cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
+    mv cmake-3.30.4-linux-x86_64 /opt/cmake && \
+    ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
     rm -rf cmake-3.30.4-linux-x86_64* && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index eb47d252e..6d1fe537f 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -69,7 +69,8 @@ RUN apt-get update && \
     cd /tmp && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
     tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
-    cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
+    mv cmake-3.30.4-linux-x86_64 /opt/cmake && \
+    ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
     rm -rf cmake-3.30.4-linux-x86_64* && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index a33a39181..649f236ad 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -69,7 +69,8 @@ RUN apt-get update && \
     cd /tmp && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
     tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
-    cp -r cmake-3.30.4-linux-x86_64/* /usr/local/ && \
+    mv cmake-3.30.4-linux-x86_64 /opt/cmake && \
+    ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
     rm -rf cmake-3.30.4-linux-x86_64* && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*

From ab88d254861d0f7e0214bc60695a93b807700739 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Fri, 10 Oct 2025 22:30:16 +0000
Subject: [PATCH 23/27] fix pipeline: use get-cmake action and select CMake arch from TARGETPLATFORM

---
 .github/workflows/codeql-analysis.yml | 14 +++++---------
 dockerfile/cuda12.8.dockerfile        | 13 +++++++++----
 dockerfile/cuda12.9.dockerfile        | 13 +++++++++----
 dockerfile/cuda13.0.dockerfile        | 13 +++++++++----
 4 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 2960da08b..e8d1ce862 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -52,15 +52,11 @@ jobs:
       - name: Install Dependency
         run: |
           DEBIAN_FRONTEND=noninteractive apt-get update
-          DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo
-      - name: Install CMake >= 3.20 and pthreads
-        run: |
-          apt-get update
-          apt-get install -y build-essential
-          apt-get remove --purge --auto-remove cmake
-          wget https://github.com/Kitware/CMake/releases/download/v3.21.0/cmake-3.21.0-linux-x86_64.sh
-          sh cmake-3.21.0-linux-x86_64.sh --skip-license --prefix=/usr/local
-          cmake --version
+          DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo build-essential
+      - name: Setup CMake
+        uses: lukka/get-cmake@latest
+        with:
+          cmakeVersion: '3.20.0'
       - name: Initialize CodeQL
         uses: github/codeql-action/init@v3
         with:
diff --git a/dockerfile/cuda12.8.dockerfile b/dockerfile/cuda12.8.dockerfile
index 59cc15408..d2eebd2b2 100644
--- a/dockerfile/cuda12.8.dockerfile
+++ b/dockerfile/cuda12.8.dockerfile
@@ -66,11 +66,16 @@ RUN apt-get update && \
     apt-get remove -y cmake cmake-data && \
     apt-get autoremove -y && \
     cd /tmp && \
-    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
-    tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
-    mv cmake-3.30.4-linux-x86_64 /opt/cmake && \
+    case ${TARGETPLATFORM} in \
+        "linux/arm64") CMAKE_ARCH="aarch64" ;; \
+        "linux/amd64") CMAKE_ARCH="x86_64" ;; \
+        *) CMAKE_ARCH="x86_64" ;; \
+    esac && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
     ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
-    rm -rf cmake-3.30.4-linux-x86_64* && \
+    rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index 6d1fe537f..f776f35c8 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -67,11 +67,16 @@ RUN apt-get update && \
     apt-get remove -y cmake cmake-data && \
     apt-get autoremove -y && \
     cd /tmp && \
-    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
-    tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
-    mv cmake-3.30.4-linux-x86_64 /opt/cmake && \
+    case ${TARGETPLATFORM} in \
+        "linux/arm64") CMAKE_ARCH="aarch64" ;; \
+        "linux/amd64") CMAKE_ARCH="x86_64" ;; \
+        *) CMAKE_ARCH="x86_64" ;; \
+    esac && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
     ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
-    rm -rf cmake-3.30.4-linux-x86_64* && \
+    rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index 649f236ad..858043176 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -67,11 +67,16 @@ RUN apt-get update && \
     apt-get remove -y cmake cmake-data && \
     apt-get autoremove -y && \
     cd /tmp && \
-    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.tar.gz && \
-    tar -xzf cmake-3.30.4-linux-x86_64.tar.gz && \
-    mv cmake-3.30.4-linux-x86_64 /opt/cmake && \
+    case ${TARGETPLATFORM} in \
+        "linux/arm64") CMAKE_ARCH="aarch64" ;; \
+        "linux/amd64") CMAKE_ARCH="x86_64" ;; \
+        *) CMAKE_ARCH="x86_64" ;; \
+    esac && \
+    wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
+    mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
     ln -sf /opt/cmake/bin/* /usr/local/bin/ && \
-    rm -rf cmake-3.30.4-linux-x86_64* && \
+    rm -rf cmake-3.30.4-linux-${CMAKE_ARCH}* && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 

From 3faaf60bae2ade90fff3d732a70270118e5dab9e Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Mon, 13 Oct 2025 23:51:45 +0000
Subject: [PATCH 24/27] fix cmake: detect download architecture with uname -m in CUDA dockerfiles

---
 dockerfile/cuda12.8.dockerfile |  9 ++++++---
 dockerfile/cuda12.9.dockerfile | 10 +++++++---
 dockerfile/cuda13.0.dockerfile |  9 ++++++---
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/dockerfile/cuda12.8.dockerfile b/dockerfile/cuda12.8.dockerfile
index d2eebd2b2..e3fd4d797 100644
--- a/dockerfile/cuda12.8.dockerfile
+++ b/dockerfile/cuda12.8.dockerfile
@@ -66,11 +66,14 @@ RUN apt-get update && \
     apt-get remove -y cmake cmake-data && \
     apt-get autoremove -y && \
     cd /tmp && \
-    case ${TARGETPLATFORM} in \
-        "linux/arm64") CMAKE_ARCH="aarch64" ;; \
-        "linux/amd64") CMAKE_ARCH="x86_64" ;; \
+    ARCH=$(uname -m) && \
+    case ${ARCH} in \
+        "aarch64") CMAKE_ARCH="aarch64" ;; \
+        "x86_64") CMAKE_ARCH="x86_64" ;; \
+        "arm64") CMAKE_ARCH="aarch64" ;; \
         *) CMAKE_ARCH="x86_64" ;; \
     esac && \
+    echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
     tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
     mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index f776f35c8..cac7a662d 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -67,11 +67,14 @@ RUN apt-get update && \
     apt-get remove -y cmake cmake-data && \
     apt-get autoremove -y && \
     cd /tmp && \
-    case ${TARGETPLATFORM} in \
-        "linux/arm64") CMAKE_ARCH="aarch64" ;; \
-        "linux/amd64") CMAKE_ARCH="x86_64" ;; \
+    ARCH=$(uname -m) && \
+    case ${ARCH} in \
+        "aarch64") CMAKE_ARCH="aarch64" ;; \
+        "x86_64") CMAKE_ARCH="x86_64" ;; \
+        "arm64") CMAKE_ARCH="aarch64" ;; \
         *) CMAKE_ARCH="x86_64" ;; \
     esac && \
+    echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
     tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
     mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \
@@ -80,6 +83,7 @@ RUN apt-get update && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
+
 ARG NUM_MAKE_JOBS=
 ARG TARGETPLATFORM
 ARG TARGETARCH
diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index 858043176..63e5eeeb8 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -67,11 +67,14 @@ RUN apt-get update && \
     apt-get remove -y cmake cmake-data && \
     apt-get autoremove -y && \
     cd /tmp && \
-    case ${TARGETPLATFORM} in \
-        "linux/arm64") CMAKE_ARCH="aarch64" ;; \
-        "linux/amd64") CMAKE_ARCH="x86_64" ;; \
+    ARCH=$(uname -m) && \
+    case ${ARCH} in \
+        "aarch64") CMAKE_ARCH="aarch64" ;; \
+        "x86_64") CMAKE_ARCH="x86_64" ;; \
+        "arm64") CMAKE_ARCH="aarch64" ;; \
         *) CMAKE_ARCH="x86_64" ;; \
     esac && \
+    echo "Detected architecture: ${ARCH}, using CMAKE_ARCH: ${CMAKE_ARCH}" && \
     wget -q https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
     tar -xzf cmake-3.30.4-linux-${CMAKE_ARCH}.tar.gz && \
     mv cmake-3.30.4-linux-${CMAKE_ARCH} /opt/cmake && \

From 896a46a61d259fc5c500de024f77a9b110625667 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 14 Oct 2025 17:02:55 +0000
Subject: [PATCH 25/27] fix pipeline: skip nvbench build when CMake is older than 3.20

---
 .../benchmarks/micro_benchmarks/nvbench/CMakeLists.txt      | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index ceac53e56..0aeb37730 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -1,3 +1,9 @@
+# skip the build for < cuda12.8
+if(CMAKE_VERSION VERSION_LESS "3.20")
+  message(STATUS "CMake version ${CMAKE_VERSION} is less than 3.20, skipping nvbench benchmarks")
+  return()
+endif()
+
 cmake_minimum_required(VERSION 3.20)
 project(nvbench_benchmarks LANGUAGES CUDA)
 

From 5d4986b67b62cf569f6bb3dc023ca427208e9646 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 14 Oct 2025 19:46:45 +0000
Subject: [PATCH 26/27] fix pipeline: build cuda instead of cuda_with_msccl in cuda13.0 image

---
 dockerfile/cuda13.0.dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index 63e5eeeb8..55ed300df 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -172,7 +172,7 @@ ADD dockerfile/etc /opt/microsoft/
 WORKDIR ${SB_HOME}
 
 ADD third_party third_party
-RUN make -C third_party cuda_with_msccl cuda_nvbench
+RUN make -C third_party cuda cuda_nvbench
 
 ADD . .
 RUN python3 -m pip install --upgrade setuptools==78.1.0 && \

From b246522a10992a35266feaac6d860d78564ea7c8 Mon Sep 17 00:00:00 2001
From: Wenqing Lan <wenqinglan@microsoft.com>
Date: Tue, 14 Oct 2025 23:42:18 +0000
Subject: [PATCH 27/27] fix pipeline (gate nvbench on CMake >= 3.30.4) & mlc version (bump Intel MLC to v3.12)

---
 dockerfile/cuda11.1.1.dockerfile                     |  2 +-
 dockerfile/cuda12.2.dockerfile                       |  2 +-
 dockerfile/cuda12.4.dockerfile                       |  2 +-
 dockerfile/cuda12.8.dockerfile                       |  2 +-
 dockerfile/cuda12.9.dockerfile                       |  2 +-
 dockerfile/cuda13.0.dockerfile                       |  2 +-
 dockerfile/rocm5.0.x.dockerfile                      |  2 +-
 dockerfile/rocm5.1.x.dockerfile                      |  2 +-
 dockerfile/rocm5.7.x.dockerfile                      |  2 +-
 dockerfile/rocm6.0.x.dockerfile                      |  2 +-
 .../micro_benchmarks/nvbench/CMakeLists.txt          | 12 ++++++------
 11 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile
index ece2f3a6c..16fd447a4 100644
--- a/dockerfile/cuda11.1.1.dockerfile
+++ b/dockerfile/cuda11.1.1.dockerfile
@@ -114,7 +114,7 @@ RUN cd /tmp && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/dockerfile/cuda12.2.dockerfile b/dockerfile/cuda12.2.dockerfile
index b53fe1c7c..d5f8a9873 100644
--- a/dockerfile/cuda12.2.dockerfile
+++ b/dockerfile/cuda12.2.dockerfile
@@ -95,7 +95,7 @@ RUN cd /opt && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/dockerfile/cuda12.4.dockerfile b/dockerfile/cuda12.4.dockerfile
index 560f0908a..db9841b45 100644
--- a/dockerfile/cuda12.4.dockerfile
+++ b/dockerfile/cuda12.4.dockerfile
@@ -104,7 +104,7 @@ RUN TARGETARCH_HW=$(uname -m) && \
 RUN if [ "$TARGETARCH" = "amd64" ]; then \
     # Install Intel MLC
     cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz && \
diff --git a/dockerfile/cuda12.8.dockerfile b/dockerfile/cuda12.8.dockerfile
index e3fd4d797..f2bbdf183 100644
--- a/dockerfile/cuda12.8.dockerfile
+++ b/dockerfile/cuda12.8.dockerfile
@@ -126,7 +126,7 @@ RUN TARGETARCH_HW=$(uname -m) && \
 RUN if [ "$TARGETARCH" = "amd64" ]; then \
     # Install Intel MLC
     cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz && \
diff --git a/dockerfile/cuda12.9.dockerfile b/dockerfile/cuda12.9.dockerfile
index cac7a662d..c66e27f9d 100644
--- a/dockerfile/cuda12.9.dockerfile
+++ b/dockerfile/cuda12.9.dockerfile
@@ -128,7 +128,7 @@ RUN TARGETARCH_HW=$(uname -m) && \
 RUN if [ "$TARGETARCH" = "amd64" ]; then \
     # Install Intel MLC
     cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz && \
diff --git a/dockerfile/cuda13.0.dockerfile b/dockerfile/cuda13.0.dockerfile
index 55ed300df..0baa8cc69 100644
--- a/dockerfile/cuda13.0.dockerfile
+++ b/dockerfile/cuda13.0.dockerfile
@@ -127,7 +127,7 @@ RUN TARGETARCH_HW=$(uname -m) && \
 RUN if [ "$TARGETARCH" = "amd64" ]; then \
     # Install Intel MLC
     cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz && \
diff --git a/dockerfile/rocm5.0.x.dockerfile b/dockerfile/rocm5.0.x.dockerfile
index 8bb718794..321590e8c 100644
--- a/dockerfile/rocm5.0.x.dockerfile
+++ b/dockerfile/rocm5.0.x.dockerfile
@@ -98,7 +98,7 @@ RUN cd /tmp && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/dockerfile/rocm5.1.x.dockerfile b/dockerfile/rocm5.1.x.dockerfile
index e56c35c27..8cfaa4f52 100644
--- a/dockerfile/rocm5.1.x.dockerfile
+++ b/dockerfile/rocm5.1.x.dockerfile
@@ -110,7 +110,7 @@ RUN cd /tmp && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/dockerfile/rocm5.7.x.dockerfile b/dockerfile/rocm5.7.x.dockerfile
index 85ba1919e..acff9f9e3 100644
--- a/dockerfile/rocm5.7.x.dockerfile
+++ b/dockerfile/rocm5.7.x.dockerfile
@@ -126,7 +126,7 @@ RUN cd /tmp && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/dockerfile/rocm6.0.x.dockerfile b/dockerfile/rocm6.0.x.dockerfile
index ce5736e29..436864c19 100644
--- a/dockerfile/rocm6.0.x.dockerfile
+++ b/dockerfile/rocm6.0.x.dockerfile
@@ -132,7 +132,7 @@ RUN cd /tmp && \
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
diff --git a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
index 0aeb37730..8415b10b4 100644
--- a/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
+++ b/superbench/benchmarks/micro_benchmarks/nvbench/CMakeLists.txt
@@ -1,12 +1,12 @@
-# skip the build for < cuda12.8
-if(CMAKE_VERSION VERSION_LESS "3.20")
-  message(STATUS "CMake version ${CMAKE_VERSION} is less than 3.20, skipping nvbench benchmarks")
+cmake_minimum_required(VERSION 3.18)
+project(nvbench_benchmarks LANGUAGES CUDA)
+
+# Check if we have a recent enough CMake for nvbench (which requires 3.30.4)
+if(CMAKE_VERSION VERSION_LESS "3.30.4")
+  message(STATUS "CMake version ${CMAKE_VERSION} is less than 3.30.4 (required by nvbench), skipping nvbench benchmarks")
   return()
 endif()
 
-cmake_minimum_required(VERSION 3.20)
-project(nvbench_benchmarks LANGUAGES CUDA)
-
 find_package(CUDAToolkit QUIET)
 if (CUDAToolkit_FOUND)
   include(../cuda_common.cmake)