diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/CMakeLists.txt new file mode 100755 index 0000000000..cdba19e90b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/CMakeLists.txt @@ -0,0 +1,20 @@ +if(UNIX) + # Direct CMake to use dpcpp rather than the default C++ compiler/linker + set(CMAKE_CXX_COMPILER dpcpp) +else() # Windows + # Force CMake to use dpcpp rather than the default C++ compiler/linker + # (needed on Windows only) + include (CMakeForceCompiler) + CMAKE_FORCE_CXX_COMPILER (dpcpp IntelDPCPP) + include (Platform/Windows-Clang) +endif() + +cmake_minimum_required (VERSION 3.4) + +project(ACInt CXX) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/License.txt new file mode 100755 index 0000000000..7c8b8a36c6 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/License.txt @@ -0,0 +1,23 @@ +Copyright Intel Corporation + +SPDX-License-Identifier: MIT +https://opensource.org/licenses/MIT + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/README.md new file mode 100755 index 0000000000..b653ec815d --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/README.md @@ -0,0 +1,246 @@ +# Using the Algorithmic C Integer Data Type `ac_int` + +This FPGA tutorial demonstrates how to use the Algorithmic C (AC) data type `ac_int` and some best practices. + +***Documentation***: The [DPC++ FPGA Code Samples Guide](https://software.intel.com/content/www/us/en/develop/articles/explore-dpcpp-through-intel-fpga-code-samples.html) helps you to navigate the samples and build your knowledge of DPC++ for FPGA.
+The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) is the reference manual for targeting FPGAs through DPC++.
+The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04/20.04, RHEL*/CentOS* 8, SUSE* 15; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA
Intel® FPGA Programmable Acceleration Card (PAC) D5005 (with Intel Stratix® 10 SX)
Intel® FPGA 3rd party / custom platforms with oneAPI support
*__Note__: Intel® FPGA PAC hardware is only compatible with Ubuntu 18.04* +| Software | Intel® oneAPI DPC++ Compiler
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | Using the `ac_int` data type for basic operations
Efficiently using the left shift operation
Setting and reading certain bits of an `ac_int` number +| Time to complete | 20 minutes + + + +## Purpose + +This FPGA tutorial shows how to use the `ac_int` data type with some simple examples. + +This data type can be used in place of native integer types to generate area efficient and optimized designs for the FPGA. When you have a computation that does not require the full dynamic range of a 32-bit integer, you should replace your `int` variables with `ac_int` variables of the correct, reduced width. For example, if you know that a loop will iterate from 0 to 12, only 4 bits are required. + +Please refer to the [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide/top/optimize-your-design/resource-use/data-types-and-operations/var-prec-fp-sup/adv-disadv-ac-dt.html) to see advantages and limitations of `ac_int` data types. + +### Simple Code Example + +An `ac_int` number can be defined as follows: +```cpp +ac_int<W, S> a; +``` +Here `W` is the width in bits and `S` is a bool indicating if the number is signed. Signed numbers use the most significant bit (MSB) to store the sign bit. + +To use the `ac_int` type in your code, you must include the following header: + +```cpp +#include <sycl/ext/intel/ac_types/ac_int.hpp> +``` +Additionally, you must pass the flag `-qactypes` on Linux or `/Qactypes` on Windows to the `dpcpp` command when compiling your SYCL program in order to ensure that the headers are correctly included. In this tutorial, this is done in `src/CMakeLists.txt`. + +### Basic Operations and Promotion Rules + +When using `ac_int`, the results of addition, subtraction, multiplication, and division operations are automatically promoted to the number of bits needed to represent all possible results without overflowing. However, the data type you use to store the result may result in truncation. + +For example, the addition of two 8-bit integers results in a 9-bit result to support overflow. 
Internally, the result will be 9-bit. However, if the user attempts to store the result in an 8-bit container, `ac_int` will let the user do this, which leads to the most significant bit being discarded. The responsibility lies with the user to use the correct data type. + +These promotion rules are consistent across all architectures, so the behavior will be equivalent on x86 or on FPGA. + +### Shift Operations + +The behavior of shift operations of `ac_int` data types is slightly different from shift operations of native integer types. Some key points to remember are as follows: + - If the data type of the shift amount is not explicitly `unsigned` (either using `ac_int<W, false>` or using the `unsigned` keyword), then the compiler will generate a more complex shifter that allows negative shifts and positive shifts. A shift by a negative amount is equivalent to a positive shift in the opposite direction. Normally, you will not want to use negative shifting, so you should use an `unsigned` data type for the shift value to obtain a more resource efficient shifter. + - Shift values greater than the width of the data type are treated as a shift equal to the width of the data type. + - The shift operation can be done more efficiently by specifying the amount to shift with the smallest possible `ac_int`. + +### Bit Select Operator + +The bit select operator `[]` allows reading and modifying an individual bit in an `ac_int`. + +*Note:* You must initialize an `ac_int` variable before accessing it using the bit select operator `[]`. Using the `[]` operator on an uninitialized `ac_int` variable is undefined behavior and can give you unexpected results. Assigning each bit explicitly using the `[]` operator does not count as initializing the `ac_int` variable. + +### Bit Slice Operations + +The slice read operation `slc` and the slice write operation `set_slc` allow reading and modifying a slice in an `ac_int`. + +Slice read is provided with the template function `slc<W>(int lsb)`. 
The two arguments are defined as: +- `W` is the bit length of the slice. It must be known at compile time. +- `lsb` is the index of the least significant bit (LSB) of the slice being read. + +Slice write is provided with the function `set_slc(int lsb, const ac_int<W, S> &slc)`. The two arguments are defined as: +- `lsb` is the index of the LSB of the slice being written. +- `slc` is an `ac_int` slice that is to be written into the target `ac_int` starting at bit `lsb`. The bit length of the slice is inferred from the width `W` of `slc`. + +*Note:* An `ac_int` must be initialized before being accessed by bit slice operations `slc` and `set_slc`. Using the `slc` and `set_slc` functions on an uninitialized `ac_int` variable is undefined behavior and can give you unexpected results. + +### Understanding the Tutorial Design + +This tutorial consists of five kernels: + +Kernel `BasicOpsInt` contains native `int` type addition, multiplication, and division operations, while kernel `BasicOpsAcInt` contains `ac_int` type addition, multiplication, and division operations. By comparing these two kernels, you will find reduced width `ac_int` generates hardware that is more area efficient than native `int`. + +Kernel `ShiftOp` contains an `ac_int` left shifter and the data type of the shift amount is a large width signed `ac_int`. In contrast, kernel `EfficientShiftOp` also contains an `ac_int` left shifter, but the data type of the shift amount is a reduced width unsigned `ac_int`. By comparing these two kernels, you will find shift operations of `ac_int` can generate more efficient hardware if the amount to shift by is stored in a minimally sized unsigned `ac_int`. + +Kernel `BitAccess` demonstrates bit operations with bit select operator `[]` and bit slice operations `slc` and `set_slc`. + +## Key Concepts +* The `ac_int` data type can be used to generate hardware for only as many bits as are needed by your application. 
Native integer types must generate hardware for only 8, 16, 32, or 64 bits. +* Shift operations in `ac_int` can be implemented more efficiently when the amount to shift by is stored in a minimally sized unsigned `ac_int`. +* The `ac_int` data type provides several useful operations, including reading and modifying certain bits in an `ac_int`. + +## License + +Code samples are licensed under the MIT license. See +[License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details. + +## Building the `ac_int` Tutorial + +### Include Files + +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the type of compute node and whether to run in batch or interactive mode. Compiles to FPGA are only supported on fpga_compile nodes. Executing programs on FPGA hardware is only supported on fpga_runtime nodes of the appropriate type, such as fpga_runtime:arria10 or fpga_runtime:stratix10. Neither compiling nor executing programs on FPGA hardware are supported on the login nodes. For more information, see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/documentation/base-toolkit/](https://devcloud.intel.com/oneapi/documentation/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### Using Visual Studio Code* (Optional) + +You can use Visual Studio Code (VS Code) extensions to set your environment, create launch configurations, +and browse and download samples. + +The basic steps to build and run a sample using VS Code include: + - Download a sample using the extension **Code Sample Browser for Intel oneAPI Toolkits**. + - Configure the oneAPI environment with the extension **Environment Configurator for Intel oneAPI Toolkits**. 
+ - Open a Terminal in VS Code (**Terminal>New Terminal**). + - Run the sample in the VS Code terminal using the instructions below. + +To learn more about the extensions and how to configure the oneAPI environment, see +[Using Visual Studio Code with Intel® oneAPI Toolkits](https://software.intel.com/content/www/us/en/develop/documentation/using-vs-code-with-intel-oneapi/top.html). + +After learning how to use the extensions for Intel oneAPI Toolkits, return to this readme for instructions on how to build and run a sample. + +### On a Linux* System + +1. Install the design in the `build` directory from the design directory by running `cmake`: + + ```bash + mkdir build + cd build + ``` + + If you are compiling for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + + ```bash + cmake .. + ``` + + Alternatively, to compile for the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX), run `cmake` using the command: + + ```bash + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + You can also compile for a custom FPGA platform. Ensure that the board support package is installed on your system. Then run `cmake` using the command: + ```bash + cmake .. -DFPGA_BOARD=<board-support-package>:<board-variant> + ``` + +2. Compile the design using the generated `Makefile`. The following three build targets are provided that match the recommended development flow: + + * Compile and run for emulation (fast compile time, targets an emulated FPGA device) using: + + ```bash + make fpga_emu + ``` + + * Generate HTML optimization reports using: + + ```bash + make report + ``` + + * Compile and run on FPGA hardware (longer compile time, targets an FPGA device) using: + + ```bash + make fpga + ``` + +3. (Optional) As the above hardware compile may take several hours to complete, FPGA precompiled binaries (compatible with Linux* Ubuntu* 18.04) can be downloaded here. + +### On a Windows* System + +1. Generate the `Makefile` by running `cmake`. 
+ ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake -G "NMake Makefiles" .. + ``` + Alternatively, to compile for the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX), run `cmake` using the command: + + ``` + cmake -G "NMake Makefiles" .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + You can also compile for a custom FPGA platform. Ensure that the board support package is installed on your system. Then run `cmake` using the command: + ``` + cmake -G "NMake Makefiles" .. -DFPGA_BOARD=<board-support-package>:<board-variant> + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + nmake fpga_emu + ``` + * Generate the optimization report: + ``` + nmake report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + nmake fpga + ``` + +*Note:* The Intel® PAC with Intel Arria® 10 GX FPGA and Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX) do not yet support Windows*. Compiling to FPGA hardware on Windows* requires a third-party or custom Board Support Package (BSP) with Windows* support.
+*Note:* If you encounter any issues with long paths when compiling under Windows*, you may have to create your ‘build’ directory in a shorter path, for example c:\samples\build. You can then run cmake from that directory, and provide cmake with the full path to your sample directory. + +### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). +For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports + +Locate `report.html` in the `ac_int_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +On the main report page, scroll down to the section titled *Compile Estimated Kernel Resource Utilization Summary*. You can see the overall resource usage of kernel `BasicOpsAcInt` is less than kernel `BasicOpsInt`. Navigate to *Area Analysis of System* (*Area Analysis* > *Area Analysis of System*), you can find resource usage information of the individual addition, multiplication, and division operations, and you can verify each individual operation consumes fewer resources in kernel `BasicOpsAcInt` than in kernel `BasicOpsInt`. + +Navigate to *System Viewer* (*Views* > *System Viewer*) and find the cluster in kernel `ShiftOp` that contains the left-shifter node (`<<`). Similarly, locate the cluster that contains the left-shifter node in kernel `EfficientShiftOp`. Observe that the compiler generates an additional shifter in kernel `ShiftOp` to deal with the signedness of the shift amount `b`. You can verify that kernel `EfficientShiftOp` consumes fewer resources than kernel `ShiftOp` in *Compile Estimated Kernel Resource Utilization Summary* on the main report page and *Area Analysis of System*. + +## Running the Sample + +1. 
Run the sample on the FPGA emulator (the kernel executes on the CPU): + + ```bash + ./ac_int.fpga_emu (Linux) + ac_int.fpga_emu.exe (Windows) + ``` + +2. Run the sample on the FPGA device + + ```bash + ./ac_int.fpga (Linux) + ``` + +### Example of Output + +```txt +PASSED: all kernel results are correct. +``` + +### Discussion + +`ac_int` can help minimize the generated hardware and achieve the same numerical result as native integer types. This can be very useful when the logic does not need to utilize all the bits provided by the native integer type. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/ac_int.sln b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/ac_int.sln new file mode 100755 index 0000000000..6ad4ad3928 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/ac_int.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ac_int", "ac_int.vcxproj", "{73FCAD5C-4C93-4786-B662-A7273C515E22}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {73FCAD5C-4C93-4786-B662-A7273C515E22}.Debug|x64.ActiveCfg = Debug|x64 + {73FCAD5C-4C93-4786-B662-A7273C515E22}.Debug|x64.Build.0 = Debug|x64 + {73FCAD5C-4C93-4786-B662-A7273C515E22}.Release|x64.ActiveCfg = Release|x64 + {73FCAD5C-4C93-4786-B662-A7273C515E22}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {DE911CD1-4F98-4391-BD43-B02212357F5E} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/ac_int.vcxproj 
b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/ac_int.vcxproj new file mode 100755 index 0000000000..fad3f01df4 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/ac_int.vcxproj @@ -0,0 +1,160 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {73fcad5c-4c93-4786-b662-a7273c515e22} + Win32Proj + ac_int + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler 2022 + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler 2022 + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler 2022 + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler 2022 + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR /Qactypes %(AdditionalOptions) + $(IntDir)ac_int.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR /Qactypes %(AdditionalOptions) + $(IntDir)ac_int.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/sample.json new file mode 100755 index 0000000000..558cdd0e6d --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/sample.json @@ -0,0 +1,61 @@ +{ + "guid": "566433FF-0C04-4C99-A76E-B927F81872F8", + "name": "AC Int", + "categories": ["Toolkit/oneAPI Direct Programming/DPC++ FPGA/Tutorials/Features"], + "description": "An Intel® FPGA tutorial 
demonstrating how to use the Algorithmic C Integer (AC Int) ", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "dpcpp --version", + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./ac_int.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "dpcpp --version", + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "dpcpp --version", + "cd ../../..", + "mkdir build", + "cd build", + "cmake -G \"NMake Makefiles\" ../Tutorials/Features/ac_int", + "nmake fpga_emu", + "ac_int.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "dpcpp --version", + "cd ../../..", + "mkdir build", + "cd build", + "cmake -G \"NMake Makefiles\" ../Tutorials/Features/ac_int", + "nmake report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/src/CMakeLists.txt new file mode 100755 index 0000000000..a2e41f93dd --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/src/CMakeLists.txt @@ -0,0 +1,80 @@ +# To see a Makefile equivalent of this build system: +# https://github.com/oneapi-src/oneAPI-samples/blob/master/DirectProgramming/DPC++/ProjectTemplates/makefile-fpga + +set(SOURCE_FILE ac_int.cpp) +set(TARGET_NAME ac_int) +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# FPGA board selection +if(NOT DEFINED FPGA_BOARD) + set(FPGA_BOARD "intel_a10gx_pac:pac_a10") + message(STATUS "FPGA_BOARD was not specified.\ + \nConfiguring the design to run on the default FPGA board ${FPGA_BOARD} (Intel(R) PAC with Intel Arria(R) 10 GX FPGA). 
\ + \nPlease refer to the README for information on board selection.") +else() + message(STATUS "Configuring the design to run on FPGA board ${FPGA_BOARD}") +endif() + +# These are Windows-specific flags: +# 1. /EHsc This is a Windows-specific flag that enables exception handling in host code +# 2. /Qactypes Include ac_types headers and link against ac_types emulation libraries +if(WIN32) + set(WIN_FLAG "/EHsc") + set(AC_TYPES_FLAG "/Qactypes") +else() + set(AC_TYPES_FLAG "-qactypes") +endif() + +# A DPC++ ahead-of-time (AoT) compile processes the device code in two stages. +# 1. The "compile" stage compiles the device code to an intermediate representation (SPIR-V). +# 2. The "link" stage invokes the compiler's FPGA backend before linking. +# For this reason, FPGA backend flags must be passed as link flags in CMake. +set(EMULATOR_COMPILE_FLAGS "${WIN_FLAG} -fintelfpga ${AC_TYPES_FLAG} -DFPGA_EMULATOR -Wall") +set(EMULATOR_LINK_FLAGS "-fintelfpga ${AC_TYPES_FLAG}") +set(HARDWARE_COMPILE_FLAGS "${WIN_FLAG} -fintelfpga ${AC_TYPES_FLAG} -Wall") +set(HARDWARE_LINK_FLAGS "-fintelfpga ${AC_TYPES_FLAG} -Xshardware -Xsboard=${FPGA_BOARD} ${USER_HARDWARE_FLAGS}") +# We do not need to supply the AC_TYPES_FLAG for the 'report' target's linking stage. 
+set(REPORT_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${FPGA_BOARD} ${USER_HARDWARE_FLAGS}") +# use cmake -D USER_HARDWARE_FLAGS=<flags> to set extra flags for FPGA backend compilation + +############################################################################### +### FPGA Emulator +############################################################################### +# To compile in a single command: +# dpcpp -fintelfpga ${AC_TYPES_FLAG} -DFPGA_EMULATOR ac_int.cpp -o ac_int.fpga_emu +# CMake executes: +# [compile] dpcpp -fintelfpga ${AC_TYPES_FLAG} -DFPGA_EMULATOR -o ac_int.cpp.o -c ac_int.cpp +# [link] dpcpp -fintelfpga ${AC_TYPES_FLAG} ac_int.cpp.o -o ac_int.fpga_emu +add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) +set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS "${EMULATOR_COMPILE_FLAGS}") +set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS "${EMULATOR_LINK_FLAGS}") +add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + +############################################################################### +### Generate Report +############################################################################### +# To compile manually: +# dpcpp -fintelfpga ${AC_TYPES_FLAG} -Xshardware -Xsboard=<FPGA_BOARD> -fsycl-link=early ac_int.cpp -o ac_int_report.a +set(FPGA_EARLY_IMAGE ${TARGET_NAME}_report.a) +# The compile output is not an executable, but an intermediate compilation result unique to DPC++. 
+add_executable(${FPGA_EARLY_IMAGE} ${SOURCE_FILE}) +add_custom_target(report DEPENDS ${FPGA_EARLY_IMAGE}) +set_target_properties(${FPGA_EARLY_IMAGE} PROPERTIES COMPILE_FLAGS "${HARDWARE_COMPILE_FLAGS}") +set_target_properties(${FPGA_EARLY_IMAGE} PROPERTIES LINK_FLAGS "${REPORT_LINK_FLAGS} -fsycl-link=early") +# fsycl-link=early stops the compiler after RTL generation, before invoking Quartus® + +############################################################################### +### FPGA Hardware +############################################################################### +# To compile in a single command: +# dpcpp -fintelfpga ${AC_TYPES_FLAG} -Xshardware -Xsboard= ac_int.cpp -o ac_int.fpga +# CMake executes: +# [compile] dpcpp -fintelfpga ${AC_TYPES_FLAG} -o ac_int.cpp.o -c ac_int.cpp +# [link] dpcpp -fintelfpga ${AC_TYPES_FLAG} -Xshardware -Xsboard= ac_int.cpp.o -o ac_int.fpga +add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) +add_custom_target(fpga DEPENDS ${FPGA_TARGET}) +set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS "${HARDWARE_COMPILE_FLAGS}") +set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS "${HARDWARE_LINK_FLAGS} -reuse-exe=${CMAKE_BINARY_DIR}/${FPGA_TARGET}") +# The -reuse-exe flag enables rapid recompilation of host-only code changes. +# See DPC++FPGA/GettingStarted/fast_recompile for details. \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/src/ac_int.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/src/ac_int.cpp new file mode 100644 index 0000000000..78a7eb27f3 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/ac_int/src/ac_int.cpp @@ -0,0 +1,237 @@ +#include +#include +#include + +#include + +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities/include/dpc_common.hpp +#include "dpc_common.hpp" + +using namespace sycl; + +// Forward declare the kernel names in the global scope. 
+// This FPGA best practice reduces name mangling in the optimization reports. +class BasicOpsInt; +class BasicOpsAcInt; +class ShiftOp; +class EfficientShiftOp; +class BitAccess; + +using MyUInt2 = ac_int<2, false>; +using MyInt7 = ac_int<7, true>; +using MyInt14 = ac_int<14, true>; +using MyInt15 = ac_int<15, true>; +using MyInt28 = ac_int<28, true>; + +void TestBasicOpsInt(queue &q, const int &a, const int &b, int &c, int &d, + int &e) { + buffer a_buf(&a, 1); + buffer b_buf(&b, 1); + buffer c_buf(&c, 1); + buffer d_buf(&d, 1); + buffer e_buf(&e, 1); + + q.submit([&](handler &h) { + accessor a_acc(a_buf, h, read_only); + accessor b_acc(b_buf, h, read_only); + accessor c_acc(c_buf, h, write_only, no_init); + accessor d_acc(d_buf, h, write_only, no_init); + accessor e_acc(e_buf, h, write_only, no_init); + h.single_task([=]() [[intel::kernel_args_restrict]] { + c_acc[0] = a_acc[0] + b_acc[0]; + d_acc[0] = a_acc[0] * b_acc[0]; + e_acc[0] = a_acc[0] / b_acc[0]; + }); + }); +} + +void TestBasicOpsAcInt(queue &q, const MyInt14 &a, const MyInt14 &b, MyInt15 &c, + MyInt28 &d, MyInt15 &e) { + buffer a_buf(&a, 1); + buffer b_buf(&b, 1); + buffer c_buf(&c, 1); + buffer d_buf(&d, 1); + buffer e_buf(&e, 1); + + q.submit([&](handler &h) { + accessor a_acc(a_buf, h, read_only); + accessor b_acc(b_buf, h, read_only); + accessor c_acc(c_buf, h, write_only, no_init); + accessor d_acc(d_buf, h, write_only, no_init); + accessor e_acc(e_buf, h, write_only, no_init); + h.single_task([=]() [[intel::kernel_args_restrict]] { + c_acc[0] = a_acc[0] + b_acc[0]; + d_acc[0] = a_acc[0] * b_acc[0]; + e_acc[0] = a_acc[0] / b_acc[0]; + }); + }); +} + +void TestShiftOp(queue &q, const MyInt14 &a, const MyInt14 &b, MyInt14 &c) { + buffer a_buf(&a, 1); + buffer b_buf(&b, 1); + buffer c_buf(&c, 1); + + q.submit([&](handler &h) { + accessor a_acc(a_buf, h, read_only); + accessor b_acc(b_buf, h, read_only); + accessor c_acc(c_buf, h, write_only, no_init); + h.single_task([=]() 
[[intel::kernel_args_restrict]] { + MyInt14 temp = a_acc[0] << b_acc[0]; + c_acc[0] = temp >> b_acc[0]; + }); + }); +} + +void TestEfficientShiftOp(queue &q, const MyInt14 &a, const MyUInt2 &b, + MyInt14 &c) { + buffer a_buf(&a, 1); + buffer b_buf(&b, 1); + buffer c_buf(&c, 1); + + q.submit([&](handler &h) { + accessor a_acc(a_buf, h, read_only); + accessor b_acc(b_buf, h, read_only); + accessor c_acc(c_buf, h, write_only, no_init); + h.single_task([=]() [[intel::kernel_args_restrict]] { + MyInt14 temp = a_acc[0] << b_acc[0]; + c_acc[0] = temp >> b_acc[0]; + }); + }); +} + +MyInt14 TestBitAccess(queue &q, const MyInt14 &a) { + MyInt14 res; + buffer a_buf(&a, 1); + buffer res_buf(&res, 1); + + q.submit([&](handler &h) { + accessor a_acc(a_buf, h, read_only); + accessor res_acc(res_buf, h, write_only, no_init); + h.single_task([=]() [[intel::kernel_args_restrict]] { + // 0b1111101 + MyInt7 temp = a_acc[0].slc<7>(3); + + res_acc[0] = 0; // Must be initialized before being accessed by the bit + // select operator `[]`. Using the `[]` operator on an + // uninitialized `ac_int` variable is undefined behavior + // and can give you unexpected results. + + // 0 -> 0b1111101000 + res_acc[0].set_slc(3, temp); + + // 0b1111101000 -> 0b1111101111 + res_acc[0][2] = 1; + res_acc[0][1] = 1; + res_acc[0][0] = 1; + }); + }); + return res; +} + +int main() { +#if defined(FPGA_EMULATOR) + ext::intel::fpga_emulator_selector device_selector; +#else + ext::intel::fpga_selector device_selector; +#endif + + bool passed = true; + + try { + queue q(device_selector, dpc_common::exception_handler); + + constexpr int kVal1 = 1000, kVal2 = 2; + + // Kernel `BasicOpsInt` contains native `int` type addition, multiplication, + // and division operations, while kernel `BasicOpsAcInt` contains `ac_int` + // type addition, multiplication, and division operations. By comparing + // these two kernels, you will find reduced width `ac_int` generates more + // efficient hardware than native `int`. 
+ { + MyInt14 input_a = kVal1, input_b = kVal2; + MyInt15 output_c; + MyInt28 output_d; + MyInt15 output_e; + TestBasicOpsAcInt(q, input_a, input_b, output_c, output_d, output_e); + + int golden_c, golden_d, golden_e; + TestBasicOpsInt(q, input_a, input_b, golden_c, golden_d, golden_e); + + if (output_c != golden_c || output_d != golden_d || + output_e != golden_e) { + std::cout << "Result mismatch!\n" + << "Kernel BasicOpsInt: addition = " << golden_c + << ", multiplication = " << golden_d + << ", division = " << golden_e << "\n" + << "Kernel BasicOpsAcInt: addition = " << output_c + << ", multiplication = " << output_d + << ", division = " << output_e << "\n\n"; + passed = false; + } + } + + // Kernel `ShiftOp` contains an `ac_int` left shifter and the data type of + // the shift amount is a large width signed `ac_int`. In contrast, kernel + // `EfficientShiftOp` also contains an `ac_int` left shifter but the data + // type of the shift amount is a reduced width unsigned `ac_int`. By + // comparing these two kernels, you will find shift operations of `ac_int` + // can generate more efficient hardware if the amount to shift by is stored + // in a minimally sized unsigned `ac_int`. + { + MyInt14 input_a = kVal1, input_b = kVal2; + MyUInt2 input_efficient_b = kVal2; + MyInt14 output_c, output_efficient_c; + TestShiftOp(q, input_a, input_b, output_c); + TestEfficientShiftOp(q, input_a, input_efficient_b, output_efficient_c); + + if (output_c != output_efficient_c) { + std::cout << "Result mismatch!\n" + << "Kernel ShiftOp: result = " << output_c << "\n" + << "Kernel EfficientShiftOp: result = " << output_efficient_c + << "\n\n"; + passed = false; + } + } + + // Kernel `BitAccess` demonstrates bit access with bit select operator `[]` + // and the bit slice operations `slc` and `set_slc`. 
Note: An `ac_int` must be + // initialized before being accessed by the bit select operator `[]` and bit slice + // operations `slc` and `set_slc`, otherwise it is undefined behavior and + // will give you unexpected results. + { + MyInt14 input = kVal1; + MyInt14 output = TestBitAccess(q, input); + + constexpr int golden = 0b001111101111; + + if (output != golden) { + std::cout << "Kernel BitAccess result mismatch!\n" + << "result = 0b" << std::bitset<14>(output) << "\n" + << "golden = 0b" << std::bitset<14>(golden) << "\n\n"; + passed = false; + } + } + } catch (exception const &e) { + // Catches exceptions in the host code. + std::cerr << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.code().value() == CL_DEVICE_NOT_FOUND) { + std::cerr << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cerr << "Run sys_check in the oneAPI root directory to verify.\n"; + std::cerr << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + + if (passed) { + std::cout << "PASSED: all kernel results are correct.\n"; + } else { + std::cout << "FAILED\n"; + } + return passed ? 0 : 1; +} \ No newline at end of file