From 7f2e98a1406da6c5293f0c988df95edc246ef88d Mon Sep 17 00:00:00 2001 From: Tyler Sorensen Date: Wed, 11 Jul 2018 15:12:22 -0400 Subject: [PATCH 1/5] added inline ptx to support shuffle on Nvidia GPUs --- src/clpp11.hpp | 7 +++++ src/kernels/level3/xgemm_part1.opencl | 24 ++++++++++++--- src/kernels/level3/xgemm_part3.opencl | 42 +++++++++++++++++++++++++-- src/tuning/kernels/xgemm.cpp | 4 +-- src/tuning/kernels/xgemm.hpp | 20 ++++++++++++- src/utilities/compile.cpp | 22 ++++++++++++-- 6 files changed, 107 insertions(+), 12 deletions(-) diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 8d6a1127..690f8c49 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -44,6 +44,7 @@ #include // std::accumulate #include // std::strlen #include // fprintf, stderr +#include "assert.h" // OpenCL #define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings @@ -355,6 +356,12 @@ class Device { std::string{"."} + std::to_string(GetInfo(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV)); } + // Returns if the Nvidia chip is a Volta or later architecture (sm_70 or higher) + bool IsPostNVIDIAVolta() const { + assert(HasExtension("cl_nv_device_attribute_query")); + return GetInfo(CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV) >= 7; + } + // Retrieves the above extra information (if present) std::string GetExtraInfo() const { if (HasExtension("cl_amd_device_attribute_query")) { return AMDBoardName(); } diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index 99d64c91..9e483b3e 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -114,13 +114,29 @@ R"( #define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance #endif -// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt) -#ifndef USE_SUBGROUP_SHUFFLING - #define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs +#ifndef NVIDIA_WARPS_AS_SUBGROUPS + 
#define NVIDIA_WARPS_AS_SUBGROUPS 0 +#endif +#ifndef NVIDIA_POST_VOLTA + #define NVIDIA_POST_VOLTA 0 #endif -#if USE_SUBGROUP_SHUFFLING == 1 +#ifndef INTEL_SUBGROUP_EXTENSION + #define INTEL_SUBGROUP_EXTENSION 0 +#endif +//#ifndef USE_SUBGROUP_SHUFFLING + #define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs +//#endif + +// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt) +#if USE_SUBGROUP_SHUFFLING == 1 && INTEL_SUBGROUP_EXTENSION #define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs #endif + +// NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html) +#if USE_SUBGROUP_SHUFFLING == 1 && NVIDIA_WARPS_AS_SUBGROUPS + #define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on NVIDIA GPUs +#endif + #if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE #undef USE_SUBGROUP_SHUFFLING #define USE_SUBGROUP_SHUFFLING 0 // Disables subgroups in case the assumptions don't hold diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl index c3920cb5..8e20b1b8 100644 --- a/src/kernels/level3/xgemm_part3.opencl +++ b/src/kernels/level3/xgemm_part3.opencl @@ -17,6 +17,44 @@ R"( // ================================================================================================= +// A common interface for subgroup functions + +#if USE_SUBGROUP_SHUFFLING == 1 + +INLINE_FUNC int clblast_get_sub_group_local_id() { + + // Intel extension + #if INTEL_SUBGROUP_EXTENSION == 1 + return get_sub_group_local_id(); + + // Nvidia inline PTX + #elif NVIDIA_WARPS_AS_SUBGROUPS == 1 + int ret; + asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) ); + return ret; + #endif +} + +INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) { + + // Intel extension + #if INTEL_SUBGROUP_EXTENSION == 1 + return intel_sub_group_shuffle(reg, src); + + // Nvidia inline PTX + // Volta and later requires .sync shuffle 
instructions with an extra mask arg + #elif NVIDIA_WARPS_AS_SUBGROUPS == 1 + realN ret; + #if NVIDIA_POST_VOLTA == 1 + asm volatile("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret): "f"(reg), "r"(src)); + #else + asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src)); + #endif + return ret; + #endif +} +#endif + // Main body of the matrix-multiplication algorithm. It calls various (inlined) functions. INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, const __global realM* restrict agm, const __global realN* restrict bgm, @@ -130,7 +168,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #elif GEMMK == 1 // Loads data: 2D global --> 2D private (matrix A). Partly, shuffled later among subgroups #if USE_SUBGROUP_SHUFFLING == 1 - const int _ni = get_sub_group_local_id(); + const int _ni = clblast_get_sub_group_local_id(); #pragma unroll for (int _ki = 0; _ki < KREG/VWN; _ki += 1) { apm[_ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki); @@ -202,7 +240,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, for (int _ki = 0; _ki < KREG/VWN; _ki += 1) { const int index = _ni * (MWI/VWM) + _mi; #if USE_SUBGROUP_SHUFFLING == 1 - const realN aval = intel_sub_group_shuffle(apm[_ki], _ni); + const realN aval = clblast_sub_group_shuffle(apm[_ki], _ni); #else const realN aval = apm[_ni * (KREG/VWN) + _ki]; #endif diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 75e776e6..10164c41 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -33,8 +33,8 @@ void StartVariation(int argc, char *argv[]) { // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { - StartVariation<1>(argc, argv); - StartVariation<2>(argc, argv); + //StartVariation<1>(argc, argv); + //StartVariation<2>(argc, argv); StartVariation<11>(argc, argv); StartVariation<12>(argc, argv); return 
0; diff --git a/src/tuning/kernels/xgemm.hpp b/src/tuning/kernels/xgemm.hpp index 9a538c1b..c1b048b7 100644 --- a/src/tuning/kernels/xgemm.hpp +++ b/src/tuning/kernels/xgemm.hpp @@ -116,7 +116,7 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments &args) { }; } else if (V == 11) { // Kernel 1: limited subset of tuning parameters - but explorable exhaustively - settings.parameters = { + /*settings.parameters = { {"GEMMK", {1}}, {"MWG", {16, 32, 64}}, {"NWG", {16, 32, 64}}, @@ -133,6 +133,24 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments &args) { {"SA", {0}}, {"SB", {0}}, {"KREG", {1, 2, 4}} + };*/ + settings.parameters = { + { "GEMMK",{ 1 } }, + { "MWG",{ 16, 32, 64 } }, + { "NWG",{ 64 } }, // This divided by NDIMC needs to be 32 + { "KWG",{ 1 } }, + { "MDIMC",{ 64 } }, // This needs to be greater than 32 + { "NDIMC",{ 2 } }, + { "MDIMA",{ 64 } }, // This needs to be equal to MDIMC + { "NDIMB",{ 2 } }, // This needs to be equal to NDIMC + { "KWI",{ 1 } }, + { "VWM",{ 1, 2, 4, 8 } }, + { "VWN",{ 1, 2, 4 } }, + { "STRM",{ 0 } }, + { "STRN",{ 0 } }, + { "SA",{ 0 } }, + { "SB",{ 0 } }, + { "KREG",{ 1, 2, 4 } } }; } else if (V == 12) { // Kernel 1: a lot more tuning parameters - has to be sampled randomly, too much to test all diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp index 05c29944..cd0b3d2b 100644 --- a/src/utilities/compile.cpp +++ b/src/utilities/compile.cpp @@ -58,11 +58,27 @@ std::shared_ptr CompileFromSource( header_string += "#define GLOBAL_MEM_FENCE 1\n"; } - // For Intel GPUs with subgroup support, use subgroup shuffling. - if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) { + // For GPUs with subgroup support, use subgroup shuffling. 
+ // Currently these are Intel via an extension and Nvidia using inline PTX (restricted to 32 bit) + if (device.IsGPU() && (device.HasExtension(kKhronosIntelSubgroups) || + (device.IsNVIDIA() && static_cast(precision) == 32))) { header_string += "#define USE_SUBGROUP_SHUFFLING 1\n"; - } + // Define the flavor of subgroup + if (device.IsNVIDIA()) { + header_string += "#define NVIDIA_WARPS_AS_SUBGROUPS 1\n"; + + // Nvidia additionally needs to check pre or post volta due to new + // shuffle commands + if (device.IsPostNVIDIAVolta()) { + header_string += "#define NVIDIA_POST_VOLTA 1\n"; + } + } + else if (device.HasExtension(kKhronosIntelSubgroups)) { + header_string += "#define INTEL_SUBGROUP_EXTENSION 1\n"; + } + } + // Optionally adds a translation header from OpenCL kernels to CUDA kernels #ifdef CUDA_API header_string += From 36093429fd444d0a1fc7de25dfaf7f2f775cfabc Mon Sep 17 00:00:00 2001 From: Tyler Sorensen Date: Wed, 11 Jul 2018 15:31:51 -0400 Subject: [PATCH 2/5] restored some of the changed tuning files for xgemm --- src/tuning/kernels/xgemm.cpp | 4 ++-- src/tuning/kernels/xgemm.hpp | 20 +------------------- 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 10164c41..75e776e6 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -33,8 +33,8 @@ void StartVariation(int argc, char *argv[]) { // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { - //StartVariation<1>(argc, argv); - //StartVariation<2>(argc, argv); + StartVariation<1>(argc, argv); + StartVariation<2>(argc, argv); StartVariation<11>(argc, argv); StartVariation<12>(argc, argv); return 0; diff --git a/src/tuning/kernels/xgemm.hpp b/src/tuning/kernels/xgemm.hpp index c1b048b7..9a538c1b 100644 --- a/src/tuning/kernels/xgemm.hpp +++ b/src/tuning/kernels/xgemm.hpp @@ -116,7 +116,7 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments &args) { }; } else 
if (V == 11) { // Kernel 1: limited subset of tuning parameters - but explorable exhaustively - /*settings.parameters = { + settings.parameters = { {"GEMMK", {1}}, {"MWG", {16, 32, 64}}, {"NWG", {16, 32, 64}}, @@ -133,24 +133,6 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments &args) { {"SA", {0}}, {"SB", {0}}, {"KREG", {1, 2, 4}} - };*/ - settings.parameters = { - { "GEMMK",{ 1 } }, - { "MWG",{ 16, 32, 64 } }, - { "NWG",{ 64 } }, // This divided by NDIMC needs to be 32 - { "KWG",{ 1 } }, - { "MDIMC",{ 64 } }, // This needs to be greater than 32 - { "NDIMC",{ 2 } }, - { "MDIMA",{ 64 } }, // This needs to be equal to MDIMC - { "NDIMB",{ 2 } }, // This needs to be equal to NDIMC - { "KWI",{ 1 } }, - { "VWM",{ 1, 2, 4, 8 } }, - { "VWN",{ 1, 2, 4 } }, - { "STRM",{ 0 } }, - { "STRN",{ 0 } }, - { "SA",{ 0 } }, - { "SB",{ 0 } }, - { "KREG",{ 1, 2, 4 } } }; } else if (V == 12) { // Kernel 1: a lot more tuning parameters - has to be sampled randomly, too much to test all From 7709a7308bce5492e06d8867a4dd9dff5b2ba950 Mon Sep 17 00:00:00 2001 From: Tyler Sorensen Date: Sat, 14 Jul 2018 19:50:47 -0400 Subject: [PATCH 3/5] Applied feedback from Cedric from first pull request --- src/clpp11.hpp | 2 +- src/cupp11.hpp | 5 +++++ src/kernels/level3/xgemm_part1.opencl | 21 +++++++++++---------- src/kernels/level3/xgemm_part3.opencl | 10 +++++----- src/tuning/kernels/xgemm.cpp | 4 ++-- src/utilities/compile.cpp | 26 ++++++++++++-------------- 6 files changed, 36 insertions(+), 32 deletions(-) diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 690f8c49..8ac0523f 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -44,7 +44,7 @@ #include // std::accumulate #include // std::strlen #include // fprintf, stderr -#include "assert.h" +#include // OpenCL #define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings diff --git a/src/cupp11.hpp b/src/cupp11.hpp index a1cb1614..ce765844 100644 --- a/src/cupp11.hpp +++ b/src/cupp11.hpp @@ -327,6 +327,11 @@ class 
Device { std::string AMDBoardName() const { return ""; } std::string NVIDIAComputeCapability() const { return Capabilities(); } + // Returns if the Nvidia chip is a Volta or later architecture (major version 7 or higher) + bool IsPostNVIDIAVolta() const { + return GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 7; + } + // Retrieves the above extra information std::string GetExtraInfo() const { return NVIDIAComputeCapability(); } diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index 9e483b3e..32386312 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -114,26 +114,27 @@ R"( #define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance #endif -#ifndef NVIDIA_WARPS_AS_SUBGROUPS - #define NVIDIA_WARPS_AS_SUBGROUPS 0 +#ifndef SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA + #define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 0 #endif -#ifndef NVIDIA_POST_VOLTA - #define NVIDIA_POST_VOLTA 0 +#ifndef SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA + #define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 0 #endif -#ifndef INTEL_SUBGROUP_EXTENSION - #define INTEL_SUBGROUP_EXTENSION 0 +#ifndef SUBGROUP_SHUFFLING_INTEL + #define SUBGROUP_SHUFFLING_INTEL 0 #endif -//#ifndef USE_SUBGROUP_SHUFFLING +#ifndef USE_SUBGROUP_SHUFFLING #define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs -//#endif +#endif // Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt) -#if USE_SUBGROUP_SHUFFLING == 1 && INTEL_SUBGROUP_EXTENSION +#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_INTEL #define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs #endif // NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html) -#if USE_SUBGROUP_SHUFFLING == 1 && NVIDIA_WARPS_AS_SUBGROUPS +#if USE_SUBGROUP_SHUFFLING == 1 && (SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA || \ + 
SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA) #define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on NVIDIA GPUs #endif diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl index 8e20b1b8..35ec735c 100644 --- a/src/kernels/level3/xgemm_part3.opencl +++ b/src/kernels/level3/xgemm_part3.opencl @@ -24,11 +24,11 @@ R"( INLINE_FUNC int clblast_get_sub_group_local_id() { // Intel extension - #if INTEL_SUBGROUP_EXTENSION == 1 + #if SUBGROUP_SHUFFLING_INTEL == 1 return get_sub_group_local_id(); // Nvidia inline PTX - #elif NVIDIA_WARPS_AS_SUBGROUPS == 1 + #elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1 int ret; asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) ); return ret; @@ -38,14 +38,14 @@ INLINE_FUNC int clblast_get_sub_group_local_id() { INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) { // Intel extension - #if INTEL_SUBGROUP_EXTENSION == 1 + #if SUBGROUP_SHUFFLING_INTEL == 1 return intel_sub_group_shuffle(reg, src); // Nvidia inline PTX // Volta and later requires .sync shuffle instructions with an extra mask arg - #elif NVIDIA_WARPS_AS_SUBGROUPS == 1 + #elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1 realN ret; - #if NVIDIA_POST_VOLTA == 1 + #if SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1 asm volatile("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret): "f"(reg), "r"(src)); #else asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src)); diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 75e776e6..10164c41 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -33,8 +33,8 @@ void StartVariation(int argc, char *argv[]) { // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { - StartVariation<1>(argc, argv); - StartVariation<2>(argc, argv); + //StartVariation<1>(argc, argv); + //StartVariation<2>(argc, argv); 
StartVariation<11>(argc, argv); StartVariation<12>(argc, argv); return 0; diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp index cd0b3d2b..835f54b4 100644 --- a/src/utilities/compile.cpp +++ b/src/utilities/compile.cpp @@ -58,24 +58,22 @@ std::shared_ptr CompileFromSource( header_string += "#define GLOBAL_MEM_FENCE 1\n"; } - // For GPUs with subgroup support, use subgroup shuffling. - // Currently these are Intel via an extension and Nvidia using inline PTX (restricted to 32 bit) - if (device.IsGPU() && (device.HasExtension(kKhronosIntelSubgroups) || - (device.IsNVIDIA() && static_cast(precision) == 32))) { + // For Intel GPUs with subgroup support, use subgroup shuffling. + if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) { header_string += "#define USE_SUBGROUP_SHUFFLING 1\n"; + header_string += "#define SUBGROUP_SHUFFLING_INTEL 1\n"; + } - // Define the flavor of subgroup - if (device.IsNVIDIA()) { - header_string += "#define NVIDIA_WARPS_AS_SUBGROUPS 1\n"; + // For NVIDIA GPUs, inline PTX can provide subgroup support + if (device.IsGPU() && device.IsNVIDIA() && precision == Precision::kSingle) { + header_string += "#define USE_SUBGROUP_SHUFFLING 1\n"; - // Nvidia additionally needs to check pre or post volta due to new - // shuffle commands - if (device.IsPostNVIDIAVolta()) { - header_string += "#define NVIDIA_POST_VOLTA 1\n"; - } + // Nvidia needs to check pre or post volta due to new shuffle commands + if (device.IsPostNVIDIAVolta()) { + header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 1\n"; } - else if (device.HasExtension(kKhronosIntelSubgroups)) { - header_string += "#define INTEL_SUBGROUP_EXTENSION 1\n"; + else { + header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n"; } } From f4e5b1c14ce9b0ac3b769908912fb3422a0ea8e2 Mon Sep 17 00:00:00 2001 From: Tyler Sorensen Date: Sat, 14 Jul 2018 22:47:39 -0400 Subject: [PATCH 4/5] forgot to add test cases back in, oops --- src/tuning/kernels/xgemm.cpp | 
4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 10164c41..75e776e6 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -33,8 +33,8 @@ void StartVariation(int argc, char *argv[]) { // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { - //StartVariation<1>(argc, argv); - //StartVariation<2>(argc, argv); + StartVariation<1>(argc, argv); + StartVariation<2>(argc, argv); StartVariation<11>(argc, argv); StartVariation<12>(argc, argv); return 0; From 0772d63498c8eeddc380902ba6010a1a861763cc Mon Sep 17 00:00:00 2001 From: Tyler Sorensen Date: Mon, 16 Jul 2018 20:12:30 -0400 Subject: [PATCH 5/5] moved a two-line macro to a single line --- src/kernels/level3/xgemm_part1.opencl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index 32386312..3cfc5dfb 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -133,8 +133,7 @@ R"( #endif // NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html) -#if USE_SUBGROUP_SHUFFLING == 1 && (SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA || \ - SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA) +#if USE_SUBGROUP_SHUFFLING == 1 && (SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA) #define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on NVIDIA GPUs #endif