From 5b8711eb0eab10d4c319c5205334d12b1b704fb3 Mon Sep 17 00:00:00 2001
From: Robert Clark <roclark@nvidia.com>
Date: Tue, 30 Mar 2021 16:24:20 -0500
Subject: [PATCH 1/2] Add SLURM support for multi-node tests

To make it easier to run on large clusters, Bobber should be able to run
on SLURM clusters with Pyxis and Enroot installed. This would replace the
need for mpirun and SSH keys/daemons inside the containers, making it
easier to run tests without copying images between nodes or synchronizing
SSH keys.

Signed-Off-By: Robert Clark <roclark@nvidia.com>
---
 .gitignore                             |   1 +
 bobber/bobber.py                       |  23 ++-
 bobber/lib/analysis/dali.py            |  40 +++++
 bobber/lib/exit_codes.py               |   2 +
 bobber/lib/system/__init__.py          |   4 +
 bobber/lib/system/slurm.py             | 228 +++++++++++++++++++++++++
 bobber/slurm_scripts/dali.sub          |  26 +++
 bobber/slurm_scripts/mdtest.sub        |  16 ++
 bobber/slurm_scripts/nccl.sub          |  19 +++
 bobber/test_scripts/call_dali_slurm.sh |  19 +++
 bobber/test_scripts/dali_cleanup.sh    |   3 +
 bobber/test_scripts/dali_setup.sh      |  23 +++
 bobber/test_scripts/dali_slurm.sh      |  16 ++
 bobber/test_scripts/mdtest_slurm.sh    |  12 ++
 bobber/test_scripts/nccl_slurm.sh      |  24 +++
 setup.py                               |   9 +
 16 files changed, 463 insertions(+), 2 deletions(-)
 create mode 100644 bobber/lib/system/slurm.py
 create mode 100644 bobber/slurm_scripts/dali.sub
 create mode 100644 bobber/slurm_scripts/mdtest.sub
 create mode 100644 bobber/slurm_scripts/nccl.sub
 create mode 100755 bobber/test_scripts/call_dali_slurm.sh
 create mode 100755 bobber/test_scripts/dali_cleanup.sh
 create mode 100755 bobber/test_scripts/dali_setup.sh
 create mode 100755 bobber/test_scripts/dali_slurm.sh
 create mode 100755 bobber/test_scripts/mdtest_slurm.sh
 create mode 100755 bobber/test_scripts/nccl_slurm.sh

diff --git a/.gitignore b/.gitignore
index 5eeb4c7..caf152a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ build/*
 dist/*
 env/*
 nvidia_bobber.egg-info/
+*.out
diff --git a/bobber/bobber.py b/bobber/bobber.py
index 38ef00a..9c365d3 100644
--- a/bobber/bobber.py
+++ b/bobber/bobber.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: MIT
 import bobber.lib.docker
 import json
+import sys
 from argparse import ArgumentParser, ArgumentTypeError, Namespace
 from copy import copy
 from bobber import __version__
@@ -88,11 +89,20 @@ def parse_args(version: str) -> Namespace:
 
     # More general options which apply to a majority of the running commands
     # Note that all arguments prepended with '--' are optional
+    commands_parent.add_argument('--slurm', help='Run a test on an existing '
+                                 'SLURM cluster with Pyxis/Enroot installed',
+                                 action='store_true')
+    commands_parent.add_argument('--storage-path', help='Path at which the '
+                                 'filesystem under test is mounted',
+                                 required='--slurm' in sys.argv)
     commands_parent.add_argument('log_path', metavar='log-path', help='Path '
                                  'used to store log files on the head node')
-    commands_parent.add_argument('hosts', help='Comma-separated list of '
+    commands_parent.add_argument('hosts', help='Number of hosts to queue a '
+                                 'job for in a SLURM cluster.' if '--slurm'
+                                 in sys.argv else 'Comma-separated list of '
                                  'hostnames or IP addresses',
-                                 type=unique_hosts)
+                                 type=int if '--slurm' in sys.argv
+                                 else unique_hosts)
     commands_parent.add_argument('--config-path', help='Read a JSON config '
                                  'file with expected parameters and use those '
                                  'values for testing. Ignores all other '
@@ -384,6 +394,15 @@ def execute_command(args: Namespace, version: str) -> NoReturn:
         bobber.lib.docker.cast(args.storage_path, args.ignore_gpu, version)
     elif args.command == LOAD:
         bobber.lib.docker.load(args.filename)
+    elif args.slurm and args.command == RUN_NCCL:
+        args = load_settings(args)
+        bobber.lib.system.slurm.run_nccl(args, version)
+    elif args.slurm and args.command == RUN_DALI:
+        args = load_settings(args)
+        bobber.lib.system.slurm.run_dali(args, version)
+    elif args.slurm and args.command == RUN_STG_META:
+        args = load_settings(args)
+        bobber.lib.system.slurm.run_meta(args, version)
     else:
         # Update the version to be used in filenames
         version_underscore = version.replace('.', '_')
diff --git a/bobber/lib/analysis/dali.py b/bobber/lib/analysis/dali.py
index c5cd736..5ac2ed2 100644
--- a/bobber/lib/analysis/dali.py
+++ b/bobber/lib/analysis/dali.py
@@ -139,6 +139,42 @@ def _update_results(image_type_match: dict, results: list) -> dict:
     return image_type_match
 
 
+def _slurm_test_sections(log_contents: str) -> list:
+    """
+    Parse the SLURM log test sections.
+
+    SLURM logs for DALI tests are laid out differently from Docker logs, so
+    each sub-section is captured from its own dataset marker up to the marker
+    of the next sub-section (the final one runs to the last ``OK``). Markers
+    which never match are dropped so that callers can detect a partial log.
+
+    Parameters
+    ----------
+    log_contents : str
+        A ``string`` of the complete contents from the log file.
+
+    Returns
+    -------
+    list
+        Returns a ``list`` of strings, one per test subsection found in the
+        log, ordered small JPG, large JPG, small TFRecord, large TFRecord.
+    """
+    # One pattern per sub-section, in the order they appear in the log.
+    patterns = [
+        r'800x600/file_read_pipeline.*?3840x2160/file_read_pipeline',
+        r'3840x2160/file_read_pipeline.*?800x600/tfrecord_pipeline',
+        r'800x600/tfrecord_pipeline.*?3840x2160/tfrecord_pipeline',
+        r'3840x2160/tfrecord_pipeline.*OK',
+    ]
+    matches = [re.findall(pattern, log_contents, re.DOTALL)
+               for pattern in patterns]
+    # An unmatched marker yields an empty list; keeping it as '' would make
+    # every log look like it holds four sections, defeating the caller's
+    # "Invalid number of results" check.
+    sections = ['\n'.join(found) for found in matches if found]
+    return sections
+
+
 def _result_parsing(log_contents: str, systems: int, image_results: dict,
                     log_file: str) -> dict:
     """
@@ -188,6 +224,10 @@ def _result_parsing(log_contents: str, systems: int, image_results: dict,
     ]
 
     test_sections = re.findall(r'RUN 1/1.*?OK', log_contents, re.DOTALL)
+    # The SLURM tests have a different layout and need to be grabbed
+    # appropriately
+    if '+ srun --nodes=' in log_contents:
+        test_sections = _slurm_test_sections(log_contents)
     if len(test_sections) != 4:
         print(f'Warning: Invalid number of results found in {log_file} log '
               'file. Skipping...')
diff --git a/bobber/lib/exit_codes.py b/bobber/lib/exit_codes.py
index dc65414..8a2ce49 100644
--- a/bobber/lib/exit_codes.py
+++ b/bobber/lib/exit_codes.py
@@ -8,3 +8,5 @@
 CONTAINER_NOT_RUNNING = 32  # Bobber container not running
 NVIDIA_RUNTIME_ERROR = 33  # NVIDIA container runtime not found
 CONTAINER_VERSION_MISMATCH = 34  # Container different from application
+SLURM_QUEUE_ERROR = 40  # Error queueing a SLURM job
+SBATCH_CALL_ERROR = 41  # Error running sbatch
diff --git a/bobber/lib/system/__init__.py b/bobber/lib/system/__init__.py
index 548d2d4..827c2a2 100644
--- a/bobber/lib/system/__init__.py
+++ b/bobber/lib/system/__init__.py
@@ -1 +1,5 @@
 # SPDX-License-Identifier: MIT
+from bobber.lib.system import slurm
+from bobber.lib.system.slurm import run_dali, run_meta, run_nccl
+# Re-export every SLURM launcher so callers can import them from here
+__all__ = ['slurm', 'run_dali', 'run_meta', 'run_nccl']
diff --git a/bobber/lib/system/slurm.py b/bobber/lib/system/slurm.py
new file mode 100644
index 0000000..791c28f
--- /dev/null
+++ b/bobber/lib/system/slurm.py
@@ -0,0 +1,228 @@
+# SPDX-License-Identifier: MIT
+import os
+import subprocess
+import sys
+from argparse import Namespace
+from bobber.lib.exit_codes import SBATCH_CALL_ERROR, SLURM_QUEUE_ERROR
+from typing import NoReturn
+
+
+def _slurm_scripts_path() -> str:
+    """
+    Locate the slurm_scripts directory shipped with Bobber.
+
+    The *.sub files used to launch tests through SLURM live in the
+    slurm_scripts directory. Where that directory ends up depends on how and
+    where Bobber was installed, but its position relative to this module is
+    fixed. The path is therefore built from the resolved directory of this
+    module joined with the relative hop to slurm_scripts. The relative
+    components are kept in the result as-is; no normalisation is applied
+    to the returned value.
+
+    Returns
+    -------
+    str
+        Returns a ``string`` of the path to the slurm_scripts directory,
+        rooted at this module's real location.
+    """
+    here = os.path.realpath(__file__)
+    parent = os.path.dirname(here)
+    return os.path.join(parent, '../../slurm_scripts')
+
+
+def _sbatch_path() -> str:
+    """
+    Find the full path to the sbatch executable.
+
+    The test commands below are launched without a shell and with a custom
+    environment, so the bare "sbatch" name cannot be relied upon to resolve.
+    The executable is looked up on the current PATH up front and its full
+    path is used to invoke it. If sbatch cannot be found, the application
+    exits with SBATCH_CALL_ERROR.
+
+    Returns
+    -------
+    str
+        Returns a ``string`` of the full local path to the sbatch executable.
+    """
+    # shutil.which searches PATH without spawning a shell
+    import shutil
+    sbatch = shutil.which('sbatch')
+    if sbatch is None:
+        print('sbatch command not found. Please ensure SLURM is installed and '
+              'functional.')
+        sys.exit(SBATCH_CALL_ERROR)
+    return sbatch
+
+
+def run_nccl(args: Namespace, version: str) -> NoReturn:
+    """
+    Launch a multi-node NCCL test via SLURM.
+
+    Launch a NCCL test for N-nodes managed by a SLURM cluster. One sbatch
+    command is started per node count and iteration; the singleton
+    dependency makes each job wait for earlier jobs with the same name.
+
+    Parameters
+    ----------
+    args : Namespace
+        A ``Namespace`` of all settings specified by the user for the test.
+    version : str
+        A ``string`` of the Bobber version.
+    """
+    # Update the version to be used in filenames
+    version_underscore = version.replace('.', '_')
+    # The script and sbatch locations do not change between queued jobs
+    nccl_path = os.path.join(_slurm_scripts_path(), 'nccl.sub')
+    sbatch = _sbatch_path()
+    # If not sweeping, set the range of nodes from N-hosts to N-hosts for a
+    # single iteration of tests.
+    lower_bound = 1 if args.sweep else args.hosts
+    for hosts in range(lower_bound, args.hosts + 1):
+        for iteration in range(1, args.iterations + 1):
+            nccl_log = os.path.join(args.log_path,
+                                    f'nccl_iteration_{iteration}_'
+                                    f'gpus_{args.gpus}_'
+                                    f'nccl_max_{args.nccl_max}_'
+                                    f'gid_{args.compute_gid}_'
+                                    f'nccl_tc_{args.nccl_tc}_'
+                                    f'systems_{hosts}_'
+                                    f'version_{version_underscore}.log')
+            env = {
+                'HOSTS': str(hosts),
+                'FS_PATH': args.storage_path,
+                'CONT_VERSION': f'nvcr.io/nvidian/bobber:{version}',
+                'NCCL_MAX': str(args.nccl_max),
+                'LOGDIR': args.log_path,
+                'LOGPATH': nccl_log,
+                'NCCL_IB_HCAS': args.nccl_ib_hcas or '',
+                'COMPUTE_GID': str(args.compute_gid),
+                'NCCL_TC': args.nccl_tc or ''
+            }
+            cmd = [f'{sbatch}',
+                   '-N',
+                   f'{hosts}',
+                   f'--gpus-per-node={args.gpus}',
+                   '--wait',
+                   '--dependency=singleton',
+                   f'{nccl_path}']
+            try:
+                print('Running:', cmd)
+                subprocess.Popen(cmd, env=env)
+            # Popen signals a failed launch with OSError, not CalledProcessError
+            except (OSError, subprocess.SubprocessError):
+                print('Error queueing SLURM job for NCCL tests. '
+                      'See output for errors.')
+                sys.exit(SLURM_QUEUE_ERROR)
+
+
+def run_dali(args: Namespace, version: str) -> NoReturn:
+    """
+    Launch a multi-node DALI test via SLURM.
+
+    Launch a DALI test for N-nodes managed by a SLURM cluster. One sbatch
+    command is started per node count and iteration; the singleton
+    dependency makes each job wait for earlier jobs with the same name.
+
+    Parameters
+    ----------
+    args : Namespace
+        A ``Namespace`` of all settings specified by the user for the test.
+    version : str
+        A ``string`` of the Bobber version.
+    """
+    # Update the version to be used in filenames
+    version_underscore = version.replace('.', '_')
+    # The script and sbatch locations do not change between queued jobs
+    dali_path = os.path.join(_slurm_scripts_path(), 'dali.sub')
+    sbatch = _sbatch_path()
+    # If not sweeping, set the range of nodes from N-hosts to N-hosts for a
+    # single iteration of tests.
+    lower_bound = 1 if args.sweep else args.hosts
+    for hosts in range(lower_bound, args.hosts + 1):
+        for iteration in range(1, args.iterations + 1):
+            dali_log = os.path.join(args.log_path,
+                                    f'dali_iteration_{iteration}_'
+                                    f'gpus_{args.gpus}_'
+                                    f'batch_size_lg_{args.batch_size_lg}_'
+                                    f'batch_size_sm_{args.batch_size_sm}_'
+                                    f'systems_{hosts}_'
+                                    f'version_{version_underscore}.log')
+            env = {
+                'HOSTS': str(hosts),
+                'FS_PATH': args.storage_path,
+                'CONT_VERSION': f'nvcr.io/nvidian/bobber:{version}',
+                'GPUS': str(args.gpus),
+                'LOGDIR': args.log_path,
+                'LOGPATH': dali_log,
+                'BATCH_SIZE_SM': str(args.batch_size_sm),
+                'BATCH_SIZE_LG': str(args.batch_size_lg)
+            }
+            cmd = [f'{sbatch}',
+                   '-N',
+                   f'{hosts}',
+                   f'--gpus-per-node={args.gpus}',
+                   '--wait',
+                   '--dependency=singleton',
+                   f'{dali_path}']
+            try:
+                print('Running:', cmd)
+                subprocess.Popen(cmd, env=env)
+            # Popen signals a failed launch with OSError, not CalledProcessError
+            except (OSError, subprocess.SubprocessError):
+                print('Error queueing SLURM job for DALI tests. '
+                      'See output for errors.')
+                sys.exit(SLURM_QUEUE_ERROR)
+
+
+def run_meta(args: Namespace, version: str) -> NoReturn:
+    """
+    Launch a multi-node metadata test via SLURM.
+
+    Launch a metadata test for N-nodes managed by a SLURM cluster. One sbatch
+    command is started per node count and iteration; the singleton
+    dependency makes each job wait for earlier jobs with the same name.
+
+    Parameters
+    ----------
+    args : Namespace
+        A ``Namespace`` of all settings specified by the user for the test.
+    version : str
+        A ``string`` of the Bobber version.
+    """
+    # Update the version to be used in filenames
+    version_underscore = version.replace('.', '_')
+    # The script and sbatch locations do not change between queued jobs
+    meta_path = os.path.join(_slurm_scripts_path(), 'mdtest.sub')
+    sbatch = _sbatch_path()
+    # If not sweeping, set the range of nodes from N-hosts to N-hosts for a
+    # single iteration of tests.
+    lower_bound = 1 if args.sweep else args.hosts
+    for hosts in range(lower_bound, args.hosts + 1):
+        for iteration in range(1, args.iterations + 1):
+            meta_log = os.path.join(args.log_path,
+                                    f'stg_meta_iteration_{iteration}_'
+                                    f'systems_{hosts}_'
+                                    f'version_{version_underscore}.log')
+            env = {
+                'HOSTS': str(hosts),
+                'FS_PATH': args.storage_path,
+                'CONT_VERSION': f'nvcr.io/nvidian/bobber:{version}',
+                'GPUS': str(args.gpus),
+                'LOGDIR': args.log_path,
+                'LOGPATH': meta_log
+            }
+            cmd = [f'{sbatch}',
+                   '-N',
+                   f'{hosts}',
+                   '--wait',
+                   '--dependency=singleton',
+                   f'{meta_path}']
+            try:
+                print('Running:', cmd)
+                subprocess.Popen(cmd, env=env)
+            # Popen signals a failed launch with OSError, not CalledProcessError
+            except (OSError, subprocess.SubprocessError):
+                print('Error queueing SLURM job for metadata tests. '
+                      'See output for errors.')
+                sys.exit(SLURM_QUEUE_ERROR)
diff --git a/bobber/slurm_scripts/dali.sub b/bobber/slurm_scripts/dali.sub
new file mode 100644
index 0000000..397e730
--- /dev/null
+++ b/bobber/slurm_scripts/dali.sub
@@ -0,0 +1,26 @@
+#!/bin/bash
+#SBATCH --job-name bobber_dali
+# SPDX-License-Identifier: MIT
+set -euxo pipefail
+
+# Required vars
+: "${HOSTS:=4}"
+: "${FS_PATH:=/mnt/fs}"
+: "${CONT_VERSION:=nvcr.io/nvidian/bobber:6.1.1}"
+: "${LOGDIR:=test_logs/}"
+: "${LOGPATH:=test_logs/dali.log}"
+: "${BATCH_SIZE_LG:=150}"
+: "${BATCH_SIZE_SM:=150}"
+
+mkdir -p ${LOGDIR}
+# The setup step starts a fresh log; every later step appends with "tee -a"
+srun --nodes=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/dali_setup.sh |& tee ${LOGPATH}
+BATCH_SIZE=${BATCH_SIZE_SM} DATASET_PATH="/mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/dali_slurm.sh |& tee -a ${LOGPATH}
+srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3
+BATCH_SIZE=${BATCH_SIZE_LG} DATASET_PATH="/mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/dali_slurm.sh |& tee -a ${LOGPATH}
+srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3
+BATCH_SIZE=${BATCH_SIZE_SM} DATASET_PATH="/mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline/tfrecord-*" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/dali_slurm.sh |& tee -a ${LOGPATH}
+srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3
+BATCH_SIZE=${BATCH_SIZE_LG} DATASET_PATH="/mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline/tfrecord-*" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/dali_slurm.sh |& tee -a ${LOGPATH}
+srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3
+srun --nodes=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/dali_cleanup.sh |& tee -a ${LOGPATH}
diff --git a/bobber/slurm_scripts/mdtest.sub b/bobber/slurm_scripts/mdtest.sub
new file mode 100644
index 0000000..8985cbc
--- /dev/null
+++ b/bobber/slurm_scripts/mdtest.sub
@@ -0,0 +1,16 @@
+#!/bin/bash
+#SBATCH --job-name bobber_mdtest
+# SPDX-License-Identifier: MIT
+set -euxo pipefail
+
+# Required vars
+: "${HOSTS:=4}"
+: "${FS_PATH:=/mnt/fs}"
+: "${CONT_VERSION:=nvcr.io/nvidian/bobber:6.1.1}"
+: "${LOGDIR:=test_logs/}"
+: "${LOGPATH:=test_logs/mdtest.log}"
+
+mkdir -p ${LOGDIR}
+
+# Default to 44 tasks per node (--ntasks-per-node) for known working config
+srun --nodes=${HOSTS} --ntasks-per-node=44 --mpi=pmix --exclusive --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/mdtest_slurm.sh |& tee ${LOGPATH}
diff --git a/bobber/slurm_scripts/nccl.sub b/bobber/slurm_scripts/nccl.sub
new file mode 100644
index 0000000..80a68dd
--- /dev/null
+++ b/bobber/slurm_scripts/nccl.sub
@@ -0,0 +1,19 @@
+#!/bin/bash
+#SBATCH --job-name bobber_nccl
+# SPDX-License-Identifier: MIT
+set -euxo pipefail
+
+# Settings read from the environment; unset or empty ones get these defaults
+: "${HOSTS:=4}"
+: "${FS_PATH:=/mnt/fs}"
+: "${CONT_VERSION:=nvcr.io/nvidian/bobber:6.1.1}"
+: "${NCCL_MAX:=1}"
+: "${LOGDIR:=test_logs/}"
+: "${LOGPATH:=test_logs/nccl.log}"
+: "${NCCL_IB_HCAS:=}"
+: "${COMPUTE_GID:=0}"
+: "${NCCL_TC:=}"
+
+mkdir -p ${LOGDIR}
+# Pass the NCCL settings in srun's environment; output is copied to LOGPATH
+NCCL_MAX=${NCCL_MAX} NCCL_IB_HCAS=${NCCL_IB_HCAS} COMPUTE_GID=${COMPUTE_GID} NCCL_TC=${NCCL_TC} srun --nodes=${HOSTS} --ntasks-per-node=8 --mpi=pmix --exclusive --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/nccl_slurm.sh |& tee ${LOGPATH}
diff --git a/bobber/test_scripts/call_dali_slurm.sh b/bobber/test_scripts/call_dali_slurm.sh
new file mode 100755
index 0000000..5355136
--- /dev/null
+++ b/bobber/test_scripts/call_dali_slurm.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# SPDX-License-Identifier: MIT
+if [ "x$GPUS" = "x" ]; then  # GPUS unset or empty: fall back to 8
+	GPUS=8
+fi
+
+if [ "x$BATCH_SIZE_SM" = "x" ]; then  # NOTE(review): not referenced below
+	BATCH_SIZE_SM=150
+fi
+
+if [ "x$BATCH_SIZE_LG" = "x" ]; then  # NOTE(review): not referenced below
+	BATCH_SIZE_LG=150
+fi
+
+if [[ "$DATASET" == *tfrecord* ]]; then  # NOTE(review): DATASET and BATCH_SIZE get no default in this script
+  python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --tfrecord_pipeline_paths "$DATASET"
+else
+  python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --file_read_pipeline_paths "$DATASET"
+fi
diff --git a/bobber/test_scripts/dali_cleanup.sh b/bobber/test_scripts/dali_cleanup.sh
new file mode 100755
index 0000000..dae1753
--- /dev/null
+++ b/bobber/test_scripts/dali_cleanup.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# SPDX-License-Identifier: MIT
+rm -r /mnt/fs_under_test/imageinary_data  # delete the imageinary_data tree on the mounted filesystem
diff --git a/bobber/test_scripts/dali_setup.sh b/bobber/test_scripts/dali_setup.sh
new file mode 100755
index 0000000..132c661
--- /dev/null
+++ b/bobber/test_scripts/dali_setup.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# SPDX-License-Identifier: MIT
+if [ "x$GPUS" = "x" ]; then  # GPUS unset or empty: fall back to 8
+	GPUS=8
+fi
+
+GPUS_ZERO_BASE=$(($GPUS-1))  # upper bound for the "seq 0 ..." index loops at the end
+
+mkdir -p /mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images/images
+mkdir -p /mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images/images
+mkdir -p /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline
+mkdir -p /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline
+mkdir -p /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline.idx
+mkdir -p /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline.idx
+# Generate GPUS*1000 JPGs at each resolution
+imagine create-images --path /mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images/images --name 4k_image_ --width 3840 --height 2160 --count $(($GPUS*1000)) --image_format jpg --size
+imagine create-images --path /mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images/images --name small_image_ --width 800 --height 600 --count $(($GPUS*1000)) --image_format jpg --size
+# Pack each image directory into TFRecord files of 1000 images
+imagine create-tfrecords --source_path /mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images/images --dest_path /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline --name tfrecord- --img_per_file 1000
+imagine create-tfrecords --source_path /mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images/images --dest_path /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline --name tfrecord- --img_per_file 1000
+# Run tfrecord2idx over tfrecord-0 .. tfrecord-(GPUS-1) at each resolution
+for i in $(seq 0 $GPUS_ZERO_BASE); do /dali/tools/tfrecord2idx /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline/tfrecord-$i /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline.idx/tfrecord-$i; done
+for i in $(seq 0 $GPUS_ZERO_BASE); do /dali/tools/tfrecord2idx /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline/tfrecord-$i /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline.idx/tfrecord-$i; done
diff --git a/bobber/test_scripts/dali_slurm.sh b/bobber/test_scripts/dali_slurm.sh
new file mode 100755
index 0000000..e4c90b7
--- /dev/null
+++ b/bobber/test_scripts/dali_slurm.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# SPDX-License-Identifier: MIT
+
+# Fall back to defaults for anything the caller left unset or empty
+: "${GPUS:=8}"
+: "${BATCH_SIZE:=150}"
+: "${DATASET_PATH:=/mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images}"
+
+# TFRecord datasets use the tfrecord pipeline flag; anything else is read as
+# plain image files. DATASET_PATH is quoted so a tfrecord glob is passed on
+# to the test script unexpanded.
+if [[ "$DATASET_PATH" == *tfrecord* ]]; then
+  python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --tfrecord_pipeline_paths "$DATASET_PATH"
+else
+  python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --file_read_pipeline_paths "$DATASET_PATH"
+fi
diff --git a/bobber/test_scripts/mdtest_slurm.sh b/bobber/test_scripts/mdtest_slurm.sh
new file mode 100755
index 0000000..17fd66e
--- /dev/null
+++ b/bobber/test_scripts/mdtest_slurm.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+#SBATCH --job-name bobber_mdtest
+# SPDX-License-Identifier: MIT
+set -euxo pipefail
+
+FSDIR=/mnt/fs_under_test
+mkdir -p $FSDIR/mdtest
+
+# Started once per task by mdtest.sub: N-hosts * 44 (default) processes
+/io-500-dev/bin/mdtest -i 3 -I 4 -z 3 -b 8 -u -d $FSDIR/mdtest
+
+rm -rf $FSDIR/mdtest
diff --git a/bobber/test_scripts/nccl_slurm.sh b/bobber/test_scripts/nccl_slurm.sh
new file mode 100755
index 0000000..4a29e86
--- /dev/null
+++ b/bobber/test_scripts/nccl_slurm.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# SPDX-License-Identifier: MIT
+
+if [ "x$NCCL_IB_HCAS" = "x" ]; then  # unset or empty: use the HCA list below
+	NCCL_IB_HCAS=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_6,mlx5_7,mlx5_8,mlx5_9
+fi
+
+if [ "x$NCCL_MAX" = "x" ]; then  # upper message size in GB, passed as "-e ${NCCL_MAX}G"
+	NCCL_MAX=1
+fi
+
+if [ "x$COMPUTE_GID" = "x" ]; then  # exported below as NCCL_IB_GID_INDEX
+	COMPUTE_GID=0
+fi
+
+if [ "x$NCCL_TC" = "x" ]; then  # no-op: an empty NCCL_TC stays empty
+	NCCL_TC=''
+fi
+
+export NCCL_IB_HCA=$NCCL_IB_HCAS && \
+  export NCCL_IB_TC=$NCCL_TC && \
+  export NCCL_IB_GID_INDEX=$COMPUTE_GID && \
+  export NCCL_IB_CUDA_SUPPORT=1 && \
+  /nccl-tests/build/all_reduce_perf -b 8 -e ${NCCL_MAX}G -f 2
diff --git a/setup.py b/setup.py
index ce61b05..d9832f4 100644
--- a/setup.py
+++ b/setup.py
@@ -18,12 +18,21 @@
               'bobber/lib/tests'],
     include_package_data=True,
     package_data={'': ['lib/docker/Dockerfile',
+                       'slurm_scripts/dali.sub',
+                       'slurm_scripts/mdtest.sub',
+                       'slurm_scripts/nccl.sub',
                        'test_scripts/call_dali_multi.sh',
+                       'test_scripts/call_dali_slurm.sh',
+                       'test_scripts/dali_cleanup.sh',
                        'test_scripts/dali_multi.sh',
+                       'test_scripts/dali_setup.sh',
+                       'test_scripts/dali_slurm.sh',
                        'test_scripts/fio_fill_single.sh',
                        'test_scripts/fio_multi.sh',
                        'test_scripts/mdtest_multi.sh',
+                       'test_scripts/mdtest_slurm.sh',
                        'test_scripts/nccl_multi.sh',
+                       'test_scripts/nccl_slurm.sh',
                        'test_scripts/setup_fio.sh']},
     license='MIT',
     python_requires='>=3.6',

From e3a0fcd635a67a55668937dc5574ed4b414c5f7b Mon Sep 17 00:00:00 2001
From: Robert Clark <roclark@nvidia.com>
Date: Tue, 9 Nov 2021 22:42:58 +0000
Subject: [PATCH 2/2] Verify if Docker is running

While using Slurm, it is entirely possible to still use Bobber but not
have Docker installed on the head node where the jobs will be launched.
In this case, Docker should be ignored unless one of the commands
directly needs the Docker runtime.

Signed-Off-By: Robert Clark <roclark@nvidia.com>
---
 bobber/lib/docker/management.py | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/bobber/lib/docker/management.py b/bobber/lib/docker/management.py
index b82e33a..d1ddeff 100644
--- a/bobber/lib/docker/management.py
+++ b/bobber/lib/docker/management.py
@@ -29,11 +29,23 @@ def __init__(self) -> NoReturn:
         try:
             self.client = docker.from_env()
             self.cli = docker.APIClient(timeout=600)
+            self.docker_running = True
         except docker.errors.DockerException as e:
             if 'error while fetching server api version' in str(e).lower():
-                print('Error: Could not communicate with the Docker daemon.')
-                print('Ensure Docker is running with "systemctl start docker"')
-                sys.exit(DOCKER_COMMUNICATION_ERROR)
+                self.docker_running = False
+
+    def _verify_docker_running(self, *args, **kwargs) -> None:
+        """
+        Exit with DOCKER_COMMUNICATION_ERROR when Docker isn't running.
+
+        Commands which need Docker call this first. ``__init__`` only sets
+        the flag on success or on the API-version failure, so it is read
+        defensively: a flag which was never set counts as "not running".
+        """
+        if not getattr(self, 'docker_running', False):
+            print('Error: Could not communicate with the Docker daemon.')
+            print('Ensure Docker is running with "systemctl start docker"')
+            sys.exit(DOCKER_COMMUNICATION_ERROR)
 
     def _build_if_not_built(self, tag: str, bobber_version: str) -> NoReturn:
         """
@@ -102,6 +114,7 @@ def cast(self, storage_path: str, ignore_gpu: bool,
         bobber_version : string
             A ``string`` of the local version of Bobber, such as '5.0.0'.
         """
+        self._verify_docker_running()
         tag = self.get_tag(bobber_version)
         self._build_if_not_built(tag, bobber_version)
         runtime = None
@@ -155,6 +168,7 @@ def export(self, bobber_version: str) -> NoReturn:
         bobber_version : string
             A ``string`` of the local version of Bobber, such as '5.0.0'.
         """
+        self._verify_docker_running()
         tag = self.get_tag(bobber_version)
         self._build_if_not_built(tag, bobber_version)
         filename = tag.replace('/', '_').replace(':', '_')
@@ -177,6 +191,7 @@ def build(self, bobber_version: str) -> NoReturn:
         bobber_version : string
             A ``string`` of the local version of Bobber, such as '5.0.0'.
         """
+        self._verify_docker_running()
         tag = self.get_tag(bobber_version)
         print('Building a new image. This may take a while...')
         # Set the path to the repository's parent directory.
@@ -208,6 +223,7 @@ def load(self, filename: str) -> NoReturn:
             A ``string`` of the filename for the local tarball to load, such as
             './nvidia_bobber_5.0.0.tar'.
         """
+        self._verify_docker_running()
         print(f'Importing {filename}. This may take a while...')
         with open(filename, 'rb') as image_file:
             self.client.images.load(image_file)
@@ -233,6 +249,7 @@ def execute(self, command: str, environment: Optional[dict] = None,
         log_file : string (Optional)
             A ``string`` of the path and filename to optionally save output to.
         """
+        self._verify_docker_running()
         if not self.running:
             print('Bobber container not running. Launch a container with '
                   '"bobber cast" prior to running any tests.')
@@ -281,6 +298,7 @@ def version_match(self, container: Container) -> bool:
         bool
             Returns `True` when the versions match and `False` when not.
         """
+        self._verify_docker_running()
         if f'nvidia/bobber:{version}' not in container.image.tags:
             return False
         return True