From 05a89a6a53a68bcfcc7c910e7a39830708393747 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Tue, 6 Jan 2026 22:48:01 +0000 Subject: [PATCH 01/22] Anchor wandb version when python is 3.8. --- third_party/Megatron/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt index a41efd476..18a8df47d 100644 --- a/third_party/Megatron/requirements.txt +++ b/third_party/Megatron/requirements.txt @@ -9,7 +9,8 @@ black==25.1.0; python_version >= '3.12' isort>=5.5.4 tqdm sentencepiece -wandb +wandb==0.22.3; python_version == '3.8' +wandb; python_version > '3.8' einops typing_extensions==4.9.0; python_version < '3.12' typing_extensions==4.12.2; python_version >= '3.12' From af676fc1429b9580e9496e83c6271e693d785d73 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Wed, 7 Jan 2026 01:57:39 +0000 Subject: [PATCH 02/22] Adopt new api. --- superbench/benchmarks/base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py index 8e6e58bfe..ef3f354fe 100644 --- a/superbench/benchmarks/base.py +++ b/superbench/benchmarks/base.py @@ -293,9 +293,13 @@ def _process_percentile_result(self, metric, result, reduce_type=None): if len(result) > 0: percentile_list = ['50', '90', '95', '99', '99.9'] for percentile in percentile_list: + try: + val = np.percentile(result, float(percentile), method='nearest') + except TypeError: + val = np.percentile(result, float(percentile), interpolation='nearest') self._result.add_result( '{}_{}'.format(metric, percentile), - np.percentile(result, float(percentile), interpolation='nearest'), reduce_type + val, reduce_type ) def print_env_info(self): From 5a4d01e88230d267ff24a572c627ae71f125bd6f Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Wed, 7 Jan 2026 05:19:26 +0000 Subject: [PATCH 03/22] Fix lint issues. 
--- superbench/benchmarks/base.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py index ef3f354fe..ac190acee 100644 --- a/superbench/benchmarks/base.py +++ b/superbench/benchmarks/base.py @@ -21,6 +21,7 @@ class SortedMetavarTypeHelpFormatter(argparse.MetavarTypeHelpFormatter): """Custom HelpFormatter class for argparse which sorts option strings.""" + def add_arguments(self, actions): """Sort option strings before original add_arguments. @@ -32,6 +33,7 @@ def add_arguments(self, actions): class Benchmark(ABC): """The base class of all benchmarks.""" + def __init__(self, name, parameters=''): """Constructor. @@ -297,10 +299,7 @@ def _process_percentile_result(self, metric, result, reduce_type=None): val = np.percentile(result, float(percentile), method='nearest') except TypeError: val = np.percentile(result, float(percentile), interpolation='nearest') - self._result.add_result( - '{}_{}'.format(metric, percentile), - val, reduce_type - ) + self._result.add_result('{}_{}'.format(metric, percentile), val, reduce_type) def print_env_info(self): """Print environments or dependencies information.""" From 22fd5f518e7bfa19bb14eda120cac3bedc2a6e85 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Wed, 7 Jan 2026 06:18:15 +0000 Subject: [PATCH 04/22] Fix lint issues. --- superbench/benchmarks/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py index ac190acee..46886b4e7 100644 --- a/superbench/benchmarks/base.py +++ b/superbench/benchmarks/base.py @@ -21,7 +21,6 @@ class SortedMetavarTypeHelpFormatter(argparse.MetavarTypeHelpFormatter): """Custom HelpFormatter class for argparse which sorts option strings.""" - def add_arguments(self, actions): """Sort option strings before original add_arguments. 
@@ -33,7 +32,6 @@ def add_arguments(self, actions): class Benchmark(ABC): """The base class of all benchmarks.""" - def __init__(self, name, parameters=''): """Constructor. From 0080d92b1f2fdd3b76302ff8ac5a0fbce5b409f2 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Mon, 12 Jan 2026 21:25:08 +0000 Subject: [PATCH 05/22] Revert wandb changes. --- third_party/Megatron/requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt index 18a8df47d..a41efd476 100644 --- a/third_party/Megatron/requirements.txt +++ b/third_party/Megatron/requirements.txt @@ -9,8 +9,7 @@ black==25.1.0; python_version >= '3.12' isort>=5.5.4 tqdm sentencepiece -wandb==0.22.3; python_version == '3.8' -wandb; python_version > '3.8' +wandb einops typing_extensions==4.9.0; python_version < '3.12' typing_extensions==4.12.2; python_version >= '3.12' From 059daa9d62c92d3e9ade095fade780e0100fefb7 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Mon, 12 Jan 2026 21:39:26 +0000 Subject: [PATCH 06/22] Remove cuda11.1.1. 
--- .github/workflows/build-image.yml | 6 -- dockerfile/cuda11.1.1.dockerfile | 159 ------------------------------ 2 files changed, 165 deletions(-) delete mode 100644 dockerfile/cuda11.1.1.dockerfile diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index bf809cd43..dc6b63c21 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -74,12 +74,6 @@ jobs: platforms: linux/amd64 runner: [self-hosted, linux/amd64] build_args: "NUM_MAKE_JOBS=16" - - name: cuda11.1.1 - dockerfile: cuda11.1.1 - tags: superbench/main:cuda11.1.1,superbench/superbench:latest - platforms: linux/amd64 - runner: ubuntu-latest - build_args: "NUM_MAKE_JOBS=8" # - name: rocm6.2 # dockerfile: rocm6.2.x # tags: superbench/main:rocm6.2 diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile deleted file mode 100644 index 7ee352543..000000000 --- a/dockerfile/cuda11.1.1.dockerfile +++ /dev/null @@ -1,159 +0,0 @@ -FROM nvcr.io/nvidia/pytorch:20.12-py3 - -# OS: -# - Ubuntu: 20.04 -# - OpenMPI: 4.0.5 -# - Docker Client: 20.10.8 -# NVIDIA: -# - CUDA: 11.1.1 -# - cuDNN: 8.0.5 -# - NCCL: v2.10.3-1 -# Mellanox: -# - OFED: 5.2-2.2.3.0 -# - HPC-X: v2.8.3 -# - NCCL RDMA SHARP plugins: 7cccbc1 -# Intel: -# - mlc: v3.12 - -LABEL maintainer="SuperBench" - -ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - autoconf \ - automake \ - bc \ - build-essential \ - curl \ - dmidecode \ - ffmpeg \ - git \ - iproute2 \ - jq \ - libaio-dev \ - libavcodec-dev \ - libavformat-dev \ - libavutil-dev \ - libcap2 \ - libnuma-dev \ - libpci-dev \ - libswresample-dev \ - libtinfo5 \ - libtool \ - lshw \ - python3-mpi4py \ - net-tools \ - openssh-client \ - openssh-server \ - pciutils \ - sudo \ - util-linux \ - vim \ - wget \ - && \ - apt-get autoremove && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* /tmp/* /opt/cmake-3.14.6-Linux-x86_64 - -ARG NUM_MAKE_JOBS= - -# Install 
Docker -ENV DOCKER_VERSION=20.10.8 -RUN cd /tmp && \ - wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ - tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \ - rm docker.tgz - -# Update system config -RUN mkdir -p /root/.ssh && \ - touch /root/.ssh/authorized_keys && \ - mkdir -p /var/run/sshd && \ - sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ - sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \ - sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \ - echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \ - echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf - -# Install OFED -ENV OFED_VERSION=5.2-2.2.3.0 -RUN cd /tmp && \ - wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ - tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ - MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \ - rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* - -# Install HPC-X -ENV HPCX_VERSION=v2.9.0 -RUN cd /opt && \ - rm -rf hpcx && \ - wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-inbox-ubuntu20.04-x86_64.tbz -O hpcx.tbz && \ - tar xf hpcx.tbz && \ - mv hpcx-${HPCX_VERSION}-gcc-inbox-ubuntu20.04-x86_64 hpcx && \ - rm hpcx.tbz - -# Install NCCL RDMA SHARP plugins -RUN cd /tmp && \ - git clone https://github.com/Mellanox/nccl-rdma-sharp-plugins.git && \ - cd nccl-rdma-sharp-plugins && \ - git reset --hard 7cccbc1 && \ - ./autogen.sh && \ - ./configure --prefix=/usr/local --with-cuda=/usr/local/cuda && \ - make -j ${NUM_MAKE_JOBS} && \ - make install && \ - cd /tmp && \ - rm -rf nccl-rdma-sharp-plugins - -# Install NCCL patch -RUN cd 
/tmp && \ - git clone -b v2.10.3-1 https://github.com/NVIDIA/nccl.git && \ - cd nccl && \ - make -j ${NUM_MAKE_JOBS} src.build && \ - make install && \ - cd /tmp && \ - rm -rf nccl - -# Install Intel MLC -RUN cd /tmp && \ - wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \ - tar xzf mlc.tgz Linux/mlc && \ - cp ./Linux/mlc /usr/local/bin/ && \ - rm -rf ./Linux mlc.tgz - -ENV PATH="${PATH}" \ - LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \ - SB_HOME=/opt/superbench \ - SB_MICRO_PATH=/opt/superbench \ - ANSIBLE_DEPRECATION_WARNINGS=FALSE \ - ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections - -RUN echo PATH="$PATH" > /etc/environment && \ - echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ - echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment - -# Install AOCC compiler -RUN cd /tmp && \ - wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \ - apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \ - rm -rf aocc-compiler-4.0.0_1_amd64.deb - -# Install AMD BLIS -RUN cd /tmp && \ - wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \ - tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \ - mv amd-blis /opt/AMD && \ - rm -rf aocl-blis-linux-aocc-4.0.tar.gz - -# Add config files -ADD dockerfile/etc /opt/microsoft/ - -WORKDIR ${SB_HOME} - -ADD third_party third_party -RUN make -C third_party cuda -o nvbandwidth - -ADD . . -RUN python3 -m pip install --upgrade setuptools==65.7 importlib_metadata==6.8.0 && \ - python3 -m pip install --no-cache-dir .[nvworker] && \ - make cppbuild && \ - make postinstall && \ - rm -rf .git From 7f6de0ce4c47fc8c61bed9ae7e946edbbeaddf1b Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Tue, 13 Jan 2026 18:08:55 +0000 Subject: [PATCH 07/22] Revert "Remove cuda11.1.1." This reverts commit 059daa9d62c92d3e9ade095fade780e0100fefb7. 
modified: .github/workflows/build-image.yml new file: dockerfile/cuda11.1.1.dockerfile --- .github/workflows/build-image.yml | 6 ++ dockerfile/cuda11.1.1.dockerfile | 159 ++++++++++++++++++++++++++++++ 2 files changed, 165 insertions(+) create mode 100644 dockerfile/cuda11.1.1.dockerfile diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index dc6b63c21..bf809cd43 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -74,6 +74,12 @@ jobs: platforms: linux/amd64 runner: [self-hosted, linux/amd64] build_args: "NUM_MAKE_JOBS=16" + - name: cuda11.1.1 + dockerfile: cuda11.1.1 + tags: superbench/main:cuda11.1.1,superbench/superbench:latest + platforms: linux/amd64 + runner: ubuntu-latest + build_args: "NUM_MAKE_JOBS=8" # - name: rocm6.2 # dockerfile: rocm6.2.x # tags: superbench/main:rocm6.2 diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile new file mode 100644 index 000000000..7ee352543 --- /dev/null +++ b/dockerfile/cuda11.1.1.dockerfile @@ -0,0 +1,159 @@ +FROM nvcr.io/nvidia/pytorch:20.12-py3 + +# OS: +# - Ubuntu: 20.04 +# - OpenMPI: 4.0.5 +# - Docker Client: 20.10.8 +# NVIDIA: +# - CUDA: 11.1.1 +# - cuDNN: 8.0.5 +# - NCCL: v2.10.3-1 +# Mellanox: +# - OFED: 5.2-2.2.3.0 +# - HPC-X: v2.8.3 +# - NCCL RDMA SHARP plugins: 7cccbc1 +# Intel: +# - mlc: v3.12 + +LABEL maintainer="SuperBench" + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + autoconf \ + automake \ + bc \ + build-essential \ + curl \ + dmidecode \ + ffmpeg \ + git \ + iproute2 \ + jq \ + libaio-dev \ + libavcodec-dev \ + libavformat-dev \ + libavutil-dev \ + libcap2 \ + libnuma-dev \ + libpci-dev \ + libswresample-dev \ + libtinfo5 \ + libtool \ + lshw \ + python3-mpi4py \ + net-tools \ + openssh-client \ + openssh-server \ + pciutils \ + sudo \ + util-linux \ + vim \ + wget \ + && \ + apt-get autoremove && \ + apt-get clean && \ + rm -rf 
/var/lib/apt/lists/* /tmp/* /opt/cmake-3.14.6-Linux-x86_64 + +ARG NUM_MAKE_JOBS= + +# Install Docker +ENV DOCKER_VERSION=20.10.8 +RUN cd /tmp && \ + wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ + tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \ + rm docker.tgz + +# Update system config +RUN mkdir -p /root/.ssh && \ + touch /root/.ssh/authorized_keys && \ + mkdir -p /var/run/sshd && \ + sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \ + sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \ + echo "* soft nofile 1048576\n* hard nofile 1048576" >> /etc/security/limits.conf && \ + echo "root soft nofile 1048576\nroot hard nofile 1048576" >> /etc/security/limits.conf + +# Install OFED +ENV OFED_VERSION=5.2-2.2.3.0 +RUN cd /tmp && \ + wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ + tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ + MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \ + rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* + +# Install HPC-X +ENV HPCX_VERSION=v2.9.0 +RUN cd /opt && \ + rm -rf hpcx && \ + wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-inbox-ubuntu20.04-x86_64.tbz -O hpcx.tbz && \ + tar xf hpcx.tbz && \ + mv hpcx-${HPCX_VERSION}-gcc-inbox-ubuntu20.04-x86_64 hpcx && \ + rm hpcx.tbz + +# Install NCCL RDMA SHARP plugins +RUN cd /tmp && \ + git clone https://github.com/Mellanox/nccl-rdma-sharp-plugins.git && \ + cd nccl-rdma-sharp-plugins && \ + git reset --hard 7cccbc1 && \ + ./autogen.sh && \ + ./configure --prefix=/usr/local --with-cuda=/usr/local/cuda && \ + make -j ${NUM_MAKE_JOBS} && \ + make 
install && \ + cd /tmp && \ + rm -rf nccl-rdma-sharp-plugins + +# Install NCCL patch +RUN cd /tmp && \ + git clone -b v2.10.3-1 https://github.com/NVIDIA/nccl.git && \ + cd nccl && \ + make -j ${NUM_MAKE_JOBS} src.build && \ + make install && \ + cd /tmp && \ + rm -rf nccl + +# Install Intel MLC +RUN cd /tmp && \ + wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \ + tar xzf mlc.tgz Linux/mlc && \ + cp ./Linux/mlc /usr/local/bin/ && \ + rm -rf ./Linux mlc.tgz + +ENV PATH="${PATH}" \ + LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \ + SB_HOME=/opt/superbench \ + SB_MICRO_PATH=/opt/superbench \ + ANSIBLE_DEPRECATION_WARNINGS=FALSE \ + ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections + +RUN echo PATH="$PATH" > /etc/environment && \ + echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ + echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment + +# Install AOCC compiler +RUN cd /tmp && \ + wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \ + apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \ + rm -rf aocc-compiler-4.0.0_1_amd64.deb + +# Install AMD BLIS +RUN cd /tmp && \ + wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \ + tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \ + mv amd-blis /opt/AMD && \ + rm -rf aocl-blis-linux-aocc-4.0.tar.gz + +# Add config files +ADD dockerfile/etc /opt/microsoft/ + +WORKDIR ${SB_HOME} + +ADD third_party third_party +RUN make -C third_party cuda -o nvbandwidth + +ADD . . +RUN python3 -m pip install --upgrade setuptools==65.7 importlib_metadata==6.8.0 && \ + python3 -m pip install --no-cache-dir .[nvworker] && \ + make cppbuild && \ + make postinstall && \ + rm -rf .git From 4f2317388f35094dda53bb871617de8f1564eb78 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Tue, 13 Jan 2026 18:19:56 +0000 Subject: [PATCH 08/22] Install go in cuda11.1.1 dockerfile. 
--- dockerfile/cuda11.1.1.dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile index 7ee352543..b4c2ba1a6 100644 --- a/dockerfile/cuda11.1.1.dockerfile +++ b/dockerfile/cuda11.1.1.dockerfile @@ -28,6 +28,7 @@ RUN apt-get update && \ dmidecode \ ffmpeg \ git \ + golang \ iproute2 \ jq \ libaio-dev \ From 5bf14d4e5c5ba2727b2af0df72646a7d62df66d2 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Tue, 13 Jan 2026 19:52:34 +0000 Subject: [PATCH 09/22] Revert "Install go in cuda11.1.1 dockerfile." This reverts commit 4f2317388f35094dda53bb871617de8f1564eb78. modified: dockerfile/cuda11.1.1.dockerfile --- dockerfile/cuda11.1.1.dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile index b4c2ba1a6..7ee352543 100644 --- a/dockerfile/cuda11.1.1.dockerfile +++ b/dockerfile/cuda11.1.1.dockerfile @@ -28,7 +28,6 @@ RUN apt-get update && \ dmidecode \ ffmpeg \ git \ - golang \ iproute2 \ jq \ libaio-dev \ From 0687eec0b4bf72e79a49678783ec1a27766b937b Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Tue, 13 Jan 2026 19:59:19 +0000 Subject: [PATCH 10/22] Anchor wandb to 0.22.3 when python < 3.12. --- third_party/Megatron/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt index a41efd476..e0713d276 100644 --- a/third_party/Megatron/requirements.txt +++ b/third_party/Megatron/requirements.txt @@ -10,6 +10,7 @@ isort>=5.5.4 tqdm sentencepiece wandb +wandb==0.22.3; python_version < '3.12' einops typing_extensions==4.9.0; python_version < '3.12' typing_extensions==4.12.2; python_version >= '3.12' From ae05143b4cc1b8595c876ae9575613d79dc525ad Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Tue, 13 Jan 2026 21:54:32 +0000 Subject: [PATCH 11/22] Add wandb version control for python >= 3.12. 
--- third_party/Megatron/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt index e0713d276..aedc582da 100644 --- a/third_party/Megatron/requirements.txt +++ b/third_party/Megatron/requirements.txt @@ -9,8 +9,8 @@ black==25.1.0; python_version >= '3.12' isort>=5.5.4 tqdm sentencepiece -wandb wandb==0.22.3; python_version < '3.12' +wandb; python_version >= '3.12' einops typing_extensions==4.9.0; python_version < '3.12' typing_extensions==4.12.2; python_version >= '3.12' From 1ecbcdac24249b0f27db33cbf936a5d850e01fef Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Tue, 13 Jan 2026 23:01:01 +0000 Subject: [PATCH 12/22] Add build dependencies of wandb in 11.1.1 to build wandb, instead of using version control. --- dockerfile/cuda11.1.1.dockerfile | 20 ++++++++++++++++++++ third_party/Megatron/requirements.txt | 3 +-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile index 7ee352543..63e92f7b3 100644 --- a/dockerfile/cuda11.1.1.dockerfile +++ b/dockerfile/cuda11.1.1.dockerfile @@ -24,6 +24,7 @@ RUN apt-get update && \ automake \ bc \ build-essential \ + ca-certificates \ curl \ dmidecode \ ffmpeg \ @@ -46,6 +47,7 @@ RUN apt-get update && \ openssh-client \ openssh-server \ pciutils \ + software-properties-common \ sudo \ util-linux \ vim \ @@ -55,6 +57,24 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /tmp/* /opt/cmake-3.14.6-Linux-x86_64 +# Install Go (used for wandb build) +RUN add-apt-repository -y ppa:longsleep/golang-backports && \ + apt-get update && \ + apt-get install -y golang-1.24-go + +# Install Rust (used for wandb build) +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ + . 
/root/.cargo/env && \ + rustup update stable && \ + rustup default stable + +ENV PATH="/usr/lib/go-1.24/bin:/root/.cargo/bin:${PATH}" + +# sanity checks +RUN go version && \ + cargo --version && \ + which cargo + ARG NUM_MAKE_JOBS= # Install Docker diff --git a/third_party/Megatron/requirements.txt b/third_party/Megatron/requirements.txt index aedc582da..a41efd476 100644 --- a/third_party/Megatron/requirements.txt +++ b/third_party/Megatron/requirements.txt @@ -9,8 +9,7 @@ black==25.1.0; python_version >= '3.12' isort>=5.5.4 tqdm sentencepiece -wandb==0.22.3; python_version < '3.12' -wandb; python_version >= '3.12' +wandb einops typing_extensions==4.9.0; python_version < '3.12' typing_extensions==4.12.2; python_version >= '3.12' From 8e8d28656e96423dbdeab6ff5c4d0f837537a77a Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Thu, 15 Jan 2026 00:10:09 +0000 Subject: [PATCH 13/22] Fix comments. --- dockerfile/cuda11.1.1.dockerfile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile index 63e92f7b3..662c36156 100644 --- a/dockerfile/cuda11.1.1.dockerfile +++ b/dockerfile/cuda11.1.1.dockerfile @@ -70,11 +70,6 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ ENV PATH="/usr/lib/go-1.24/bin:/root/.cargo/bin:${PATH}" -# sanity checks -RUN go version && \ - cargo --version && \ - which cargo - ARG NUM_MAKE_JOBS= # Install Docker From 4d7d6c40da84bba07f508b84cf39fdcf882b1b5e Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Wed, 21 Jan 2026 23:08:49 +0000 Subject: [PATCH 14/22] Fix comments. 
--- dockerfile/cuda11.1.1.dockerfile | 12 ++++-------- superbench/benchmarks/base.py | 9 ++++----- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile index 662c36156..120fafcc6 100644 --- a/dockerfile/cuda11.1.1.dockerfile +++ b/dockerfile/cuda11.1.1.dockerfile @@ -53,23 +53,19 @@ RUN apt-get update && \ vim \ wget \ && \ + add-apt-repository -y ppa:longsleep/golang-backports && \ + apt-get update && \ + apt-get install -y golang-1.24-go && \ apt-get autoremove && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /tmp/* /opt/cmake-3.14.6-Linux-x86_64 -# Install Go (used for wandb build) -RUN add-apt-repository -y ppa:longsleep/golang-backports && \ - apt-get update && \ - apt-get install -y golang-1.24-go - # Install Rust (used for wandb build) RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ . /root/.cargo/env && \ rustup update stable && \ rustup default stable -ENV PATH="/usr/lib/go-1.24/bin:/root/.cargo/bin:${PATH}" - ARG NUM_MAKE_JOBS= # Install Docker @@ -134,7 +130,7 @@ RUN cd /tmp && \ cp ./Linux/mlc /usr/local/bin/ && \ rm -rf ./Linux mlc.tgz -ENV PATH="${PATH}" \ +ENV PATH="/usr/lib/go-1.24/bin:/root/.cargo/bin:${PATH}" \ LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \ SB_HOME=/opt/superbench \ SB_MICRO_PATH=/opt/superbench \ diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py index 46886b4e7..8e6e58bfe 100644 --- a/superbench/benchmarks/base.py +++ b/superbench/benchmarks/base.py @@ -293,11 +293,10 @@ def _process_percentile_result(self, metric, result, reduce_type=None): if len(result) > 0: percentile_list = ['50', '90', '95', '99', '99.9'] for percentile in percentile_list: - try: - val = np.percentile(result, float(percentile), method='nearest') - except TypeError: - val = np.percentile(result, float(percentile), interpolation='nearest') - self._result.add_result('{}_{}'.format(metric, percentile), val, reduce_type) + 
self._result.add_result( + '{}_{}'.format(metric, percentile), + np.percentile(result, float(percentile), interpolation='nearest'), reduce_type + ) def print_env_info(self): """Print environments or dependencies information.""" From 82734d4459e50cb425d6bb3913dd66c3604c52a7 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Thu, 22 Jan 2026 00:29:20 +0000 Subject: [PATCH 15/22] Fix comments. --- dockerfile/cuda11.1.1.dockerfile | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile index 120fafcc6..600c0155b 100644 --- a/dockerfile/cuda11.1.1.dockerfile +++ b/dockerfile/cuda11.1.1.dockerfile @@ -47,25 +47,21 @@ RUN apt-get update && \ openssh-client \ openssh-server \ pciutils \ - software-properties-common \ sudo \ util-linux \ vim \ wget \ + software-properties-common \ && \ add-apt-repository -y ppa:longsleep/golang-backports && \ apt-get update && \ - apt-get install -y golang-1.24-go && \ + apt-get install -y golang-1.24-go=1.24* && \ + update-alternatives --install /usr/bin/go go /usr/lib/go-1.24/bin/go 100 && \ + update-alternatives --install /usr/bin/gofmt gofmt /usr/lib/go-1.24/bin/gofmt 100 && \ apt-get autoremove && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /tmp/* /opt/cmake-3.14.6-Linux-x86_64 -# Install Rust (used for wandb build) -RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ - . /root/.cargo/env && \ - rustup update stable && \ - rustup default stable - ARG NUM_MAKE_JOBS= # Install Docker @@ -130,7 +126,7 @@ RUN cd /tmp && \ cp ./Linux/mlc /usr/local/bin/ && \ rm -rf ./Linux mlc.tgz -ENV PATH="/usr/lib/go-1.24/bin:/root/.cargo/bin:${PATH}" \ +ENV PATH="${PATH}" \ LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \ SB_HOME=/opt/superbench \ SB_MICRO_PATH=/opt/superbench \ @@ -163,8 +159,13 @@ ADD third_party third_party RUN make -C third_party cuda -o nvbandwidth ADD . . 
-RUN python3 -m pip install --upgrade setuptools==65.7 importlib_metadata==6.8.0 && \ +# Install Rust temporarily for wandb build, then remove to reduce image size +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ + . /root/.cargo/env && \ + python3 -m pip install --upgrade setuptools==65.7 importlib_metadata==6.8.0 && \ python3 -m pip install --no-cache-dir .[nvworker] && \ make cppbuild && \ make postinstall && \ - rm -rf .git + rm -rf .git && \ + rustup self uninstall -y && \ + rm -rf /root/.cargo /root/.rustup From db9e12bb3ea47a482a4d72e2f2f162a8cdd0f622 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Thu, 22 Jan 2026 01:57:49 +0000 Subject: [PATCH 16/22] Fix go install location. --- dockerfile/cuda11.1.1.dockerfile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile index 600c0155b..28cce3a5f 100644 --- a/dockerfile/cuda11.1.1.dockerfile +++ b/dockerfile/cuda11.1.1.dockerfile @@ -126,8 +126,7 @@ RUN cd /tmp && \ cp ./Linux/mlc /usr/local/bin/ && \ rm -rf ./Linux mlc.tgz -ENV PATH="${PATH}" \ - LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \ +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \ SB_HOME=/opt/superbench \ SB_MICRO_PATH=/opt/superbench \ ANSIBLE_DEPRECATION_WARNINGS=FALSE \ @@ -155,14 +154,15 @@ ADD dockerfile/etc /opt/microsoft/ WORKDIR ${SB_HOME} +# Install Rust for wandb build (required by megatron_lm target) +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y +ENV PATH="/root/.cargo/bin:${PATH}" + ADD third_party third_party RUN make -C third_party cuda -o nvbandwidth ADD . . -# Install Rust temporarily for wandb build, then remove to reduce image size -RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ - . 
/root/.cargo/env && \ - python3 -m pip install --upgrade setuptools==65.7 importlib_metadata==6.8.0 && \ +RUN python3 -m pip install --upgrade setuptools==65.7 importlib_metadata==6.8.0 && \ python3 -m pip install --no-cache-dir .[nvworker] && \ make cppbuild && \ make postinstall && \ From 709680bcae0ba93969b0556bc5dd0b75370ec1a4 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Thu, 22 Jan 2026 05:21:06 +0000 Subject: [PATCH 17/22] Reorder installation steps for proper dependency handling. --- dockerfile/cuda11.1.1.dockerfile | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile index 28cce3a5f..203cb78c1 100644 --- a/dockerfile/cuda11.1.1.dockerfile +++ b/dockerfile/cuda11.1.1.dockerfile @@ -154,18 +154,17 @@ ADD dockerfile/etc /opt/microsoft/ WORKDIR ${SB_HOME} -# Install Rust for wandb build (required by megatron_lm target) -RUN curl https://sh.rustup.rs -sSf | sh -s -- -y -ENV PATH="/root/.cargo/bin:${PATH}" - ADD third_party third_party -RUN make -C third_party cuda -o nvbandwidth +# Install Rust temporarily for wandb build (required by megatron_lm target), then remove +RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ + . /root/.cargo/env && \ + make -C third_party cuda -o nvbandwidth && \ + rustup self uninstall -y && \ + rm -rf /root/.cargo /root/.rustup ADD . . RUN python3 -m pip install --upgrade setuptools==65.7 importlib_metadata==6.8.0 && \ python3 -m pip install --no-cache-dir .[nvworker] && \ make cppbuild && \ make postinstall && \ - rm -rf .git && \ - rustup self uninstall -y && \ - rm -rf /root/.cargo /root/.rustup + rm -rf .git From c4213bf02f463c1a67b27b1a08a4f903f7acdf0f Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Thu, 22 Jan 2026 05:23:25 +0000 Subject: [PATCH 18/22] Revert PATH change. 
--- dockerfile/cuda11.1.1.dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dockerfile/cuda11.1.1.dockerfile b/dockerfile/cuda11.1.1.dockerfile index 203cb78c1..1e7a6de41 100644 --- a/dockerfile/cuda11.1.1.dockerfile +++ b/dockerfile/cuda11.1.1.dockerfile @@ -126,7 +126,8 @@ RUN cd /tmp && \ cp ./Linux/mlc /usr/local/bin/ && \ rm -rf ./Linux mlc.tgz -ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \ +ENV PATH="${PATH}" \ + LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \ SB_HOME=/opt/superbench \ SB_MICRO_PATH=/opt/superbench \ ANSIBLE_DEPRECATION_WARNINGS=FALSE \ From 6c91e2f526b4482793b69878359ad67eca16bc9a Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Thu, 22 Jan 2026 10:31:01 -0800 Subject: [PATCH 19/22] CI/CD - Update the NumPy/Pandas API usage (#774) **Description** - The 'interpolation' argument of np.percentile has been deprecated since NumPy v1.22.0 and was removed in v2.4.0. - Pandas to_excel() failed in unit test. Solution - Update the NumPy API usage to support NumPy 1.22.0+ while maintaining backward compatibility with earlier versions. - Fixed the pandas to_excel() API compatibility issue. 
In newer versions of pandas (2.0+), the 'sheet_name' parameter must be passed as a keyword argument rather than a positional argument --------- Co-authored-by: Hongtao Zhang Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- superbench/analyzer/file_handler.py | 4 ++-- superbench/analyzer/result_summary.py | 4 ++-- superbench/benchmarks/base.py | 12 ++++++++---- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/superbench/analyzer/file_handler.py b/superbench/analyzer/file_handler.py index f9f4065f9..e65ccfd01 100644 --- a/superbench/analyzer/file_handler.py +++ b/superbench/analyzer/file_handler.py @@ -94,7 +94,7 @@ def output_excel_raw_data(writer, raw_data_df, sheet_name): """ # Output the raw data if isinstance(raw_data_df, pd.DataFrame) and not raw_data_df.empty: - raw_data_df.to_excel(writer, sheet_name, index=True) + raw_data_df.to_excel(writer, sheet_name=sheet_name, index=True) else: logger.warning('FileHandler: excel_data_output - {} data_df is empty.'.format(sheet_name)) @@ -114,7 +114,7 @@ def output_excel_data_not_accept(writer, data_not_accept_df, rules): # Output the not accept if isinstance(data_not_accept_df, pd.DataFrame): - data_not_accept_df.to_excel(writer, 'Not Accept', index=True) + data_not_accept_df.to_excel(writer, sheet_name='Not Accept', index=True) if not data_not_accept_df.empty: row_start = 1 row_end = max(row_start, len(data_not_accept_df)) diff --git a/superbench/analyzer/result_summary.py b/superbench/analyzer/result_summary.py index 09954a8dc..016d51798 100644 --- a/superbench/analyzer/result_summary.py +++ b/superbench/analyzer/result_summary.py @@ -185,7 +185,7 @@ def generate_md_lines(self, summary): for category in summary: lines.append('## {}\n'.format(category)) summary_df = pd.DataFrame(summary[category]) - summary_df = summary_df.drop(columns=0, axis=1) + summary_df = summary_df.drop(columns=[0]) header = ['metric', 'statistics', 'values'] table_lines = 
file_handler.generate_md_table(summary_df, header) lines.extend(table_lines) @@ -210,7 +210,7 @@ def output_summary_in_excel(self, raw_data_df, summary, output_path): file_handler.output_excel_raw_data(writer, raw_data_df, 'Raw Data') # output the result summary in 'Summary' sheet if isinstance(summary, pd.DataFrame) and not summary.empty: - summary.to_excel(writer, 'Summary', index=False, header=False) + summary.to_excel(writer, sheet_name='Summary', index=False, header=False) worksheet = writer.sheets['Summary'] row = worksheet.max_row # merge cells in 'category' column with the same category diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py index 8e6e58bfe..ddfa5ce67 100644 --- a/superbench/benchmarks/base.py +++ b/superbench/benchmarks/base.py @@ -293,10 +293,14 @@ def _process_percentile_result(self, metric, result, reduce_type=None): if len(result) > 0: percentile_list = ['50', '90', '95', '99', '99.9'] for percentile in percentile_list: - self._result.add_result( - '{}_{}'.format(metric, percentile), - np.percentile(result, float(percentile), interpolation='nearest'), reduce_type - ) + try: + # Prefer the newer NumPy 'method' argument; fall back to 'interpolation' + # for older NumPy versions that don't support 'method'. + val = np.percentile(result, float(percentile), method='nearest') + except TypeError: + # If the 'method' argument is not supported (older NumPy), retry with 'interpolation'. 
+ val = np.percentile(result, float(percentile), interpolation='nearest') + self._result.add_result('{}_{}'.format(metric, percentile), val, reduce_type) def print_env_info(self): """Print environments or dependencies information.""" From abaf4108a18780d8e4fb279892bb6947038c5833 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Fri, 23 Jan 2026 05:23:48 +0000 Subject: [PATCH 20/22] Rerun CodeQL with baseline From fde24d4ec27d6d243f86f04ced5526b03bcf0ef6 Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Fri, 23 Jan 2026 15:24:02 -0800 Subject: [PATCH 21/22] CI/CD - Bump codeql action version to v3 (#777) **Description** image **Solution** Bump CodeQL action version to V3 Co-authored-by: Hongtao Zhang --- .github/workflows/codeql-analysis.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index e53acebf6..ef903240c 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -30,13 +30,13 @@ jobs: - name: Checkout uses: actions/checkout@v3 - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} - name: Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 analyze-cpp: name: CodeQL analyze cpp runs-on: ubuntu-latest @@ -54,10 +54,10 @@ jobs: DEBIAN_FRONTEND=noninteractive apt-get update DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev sudo - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: cpp - name: Build run: make cppbuild -j - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 From 
6bd68957dcc8c6fb217d88c2d41bf237d98cfbbc Mon Sep 17 00:00:00 2001 From: Hongtao Zhang Date: Sat, 24 Jan 2026 05:50:21 +0000 Subject: [PATCH 22/22] Rerun all pipelines.