inline PTX to support subgroup shuffle for Nvidia GPUs #297

Merged · 5 commits · Jul 23, 2018
7 changes: 7 additions & 0 deletions src/clpp11.hpp
@@ -44,6 +44,7 @@
#include <numeric> // std::accumulate
#include <cstring> // std::strlen
#include <cstdio> // fprintf, stderr
#include <assert.h>

// OpenCL
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
@@ -355,6 +356,12 @@ class Device {
std::string{"."} + std::to_string(GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV));
}

// Returns whether the Nvidia chip is a Volta or later architecture (sm_70 or higher)
bool IsPostNVIDIAVolta() const {
Owner commented:

CLBlast also has a CUDA back-end, which works because every function in this file is also implemented in cupp11.h, so you'll have to mimic this behaviour with the same API in the other file as well to make the CUDA version still work.

Contributor (author) replied:

Ah, I didn't notice! I can do that.

assert(HasExtension("cl_nv_device_attribute_query"));
return GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV) >= 7;
}

// Retrieves the above extra information (if present)
std::string GetExtraInfo() const {
if (HasExtension("cl_amd_device_attribute_query")) { return AMDBoardName(); }
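
A minimal host-side usage sketch (hypothetical caller, not part of the diff): IsPostNVIDIAVolta() asserts that the cl_nv_device_attribute_query extension is present, so callers should first establish that the device is an NVIDIA GPU, as compile.cpp does further below; a belt-and-braces caller could also test the extension directly:

    // Hypothetical caller (sketch): guard the capability query behind the
    // checks that the assert above requires.
    if (device.IsGPU() && device.IsNVIDIA() &&
        device.HasExtension("cl_nv_device_attribute_query")) {
      const bool post_volta = device.IsPostNVIDIAVolta();  // sm_70 or higher
      // ... select the pre-/post-Volta shuffle variant accordingly ...
    }
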
5 changes: 5 additions & 0 deletions src/cupp11.hpp
@@ -327,6 +327,11 @@ class Device {
std::string AMDBoardName() const { return ""; }
std::string NVIDIAComputeCapability() const { return Capabilities(); }

// Returns whether the Nvidia chip is a Volta or later architecture (major version 7 or higher)
bool IsPostNVIDIAVolta() const {
return GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 7;
}

// Retrieves the above extra information
std::string GetExtraInfo() const { return NVIDIAComputeCapability(); }

20 changes: 18 additions & 2 deletions src/kernels/level3/xgemm_part1.opencl
@@ -114,13 +114,29 @@ R"(
#define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potentially better performance
#endif

#ifndef SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA
#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 0
#endif
#ifndef SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA
#define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 0
#endif
#ifndef SUBGROUP_SHUFFLING_INTEL
#define SUBGROUP_SHUFFLING_INTEL 0
#endif
#ifndef USE_SUBGROUP_SHUFFLING
#define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs
#endif

// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_INTEL
#define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs
#endif

// NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
#if USE_SUBGROUP_SHUFFLING == 1 && (SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA)
#define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on NVIDIA GPUs
#endif

#if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE
#undef USE_SUBGROUP_SHUFFLING
#define USE_SUBGROUP_SHUFFLING 0 // Disables subgroups in case the assumptions don't hold
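
To summarise how these defaults combine with the host-side defines (added in src/utilities/compile.cpp further below), here is an illustrative overview; the "on if" conditions are read off the NWI/MDIMC guard above:

    // Illustrative summary, not part of the diff:
    //   host defines                                        SUBGROUP_SIZE  shuffling
    //   (none)                                              (undefined)    off
    //   USE_SUBGROUP_SHUFFLING + SUBGROUP_SHUFFLING_INTEL   8              on if NWI == 8  && MDIMC >= 8
    //   USE_SUBGROUP_SHUFFLING + ..._NVIDIA_PRE_VOLTA       32             on if NWI == 32 && MDIMC >= 32
    //   USE_SUBGROUP_SHUFFLING + ..._NVIDIA_POST_VOLTA      32             on if NWI == 32 && MDIMC >= 32
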
42 changes: 40 additions & 2 deletions src/kernels/level3/xgemm_part3.opencl
@@ -17,6 +17,44 @@ R"(

// =================================================================================================

// A common interface for subgroup functions

#if USE_SUBGROUP_SHUFFLING == 1

INLINE_FUNC int clblast_get_sub_group_local_id() {

// Intel extension
#if SUBGROUP_SHUFFLING_INTEL == 1
return get_sub_group_local_id();

// Nvidia inline PTX
#elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
int ret;
asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) );
return ret;
#endif
}

INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {

// Intel extension
#if SUBGROUP_SHUFFLING_INTEL == 1
return intel_sub_group_shuffle(reg, src);

// Nvidia inline PTX
// Volta and later require the .sync shuffle instructions, which take an extra member-mask argument
#elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
realN ret;
#if SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
asm volatile("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret): "f"(reg), "r"(src));
#else
asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src));
#endif
return ret;
#endif
}
#endif

// Main body of the matrix-multiplication algorithm. It calls various (inlined) functions.
INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
const __global realM* restrict agm, const __global realN* restrict bgm,
@@ -130,7 +168,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
#elif GEMMK == 1
// Loads data: 2D global --> 2D private (matrix A). Partly, shuffled later among subgroups
#if USE_SUBGROUP_SHUFFLING == 1
const int _ni = clblast_get_sub_group_local_id();
#pragma unroll
for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
apm[_ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki);
@@ -202,7 +240,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
const int index = _ni * (MWI/VWM) + _mi;
#if USE_SUBGROUP_SHUFFLING == 1
const realN aval = clblast_sub_group_shuffle(apm[_ki], _ni);
#else
const realN aval = apm[_ni * (KREG/VWN) + _ki];
#endif
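
For readers unfamiliar with the inline PTX: shfl.idx.b32 %0, %1, %2, 0x1f writes into %0 the value of %1 held by the lane selected by %2 (0x1f is the clamp value for a full 32-lane warp), and the post-Volta shfl.sync form adds a 0xffffffff member mask naming all participating threads. Below is a minimal sketch of the broadcast pattern the wrappers enable, with hypothetical variable names and assuming single precision (the "=f" constraints only fit 32-bit float registers):

    // Sketch, not part of the diff: each lane holds one vector of A in a
    // register; shuffling lets every lane read any other lane's copy, so the
    // A tile is exchanged through registers instead of local memory.
    const int lane = clblast_get_sub_group_local_id();  // 0 .. SUBGROUP_SIZE-1
    const realN a_mine = apm[0];                        // this lane's A values
    for (int src = 0; src < SUBGROUP_SIZE; src += 1) {
      const realN aval = clblast_sub_group_shuffle(a_mine, src);
      // ... multiply-accumulate aval with this lane's B values ...
    }
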
14 changes: 14 additions & 0 deletions src/utilities/compile.cpp
@@ -61,8 +61,22 @@ std::shared_ptr<Program> CompileFromSource(
// For Intel GPUs with subgroup support, use subgroup shuffling.
if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) {
header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
header_string += "#define SUBGROUP_SHUFFLING_INTEL 1\n";
}

// For NVIDIA GPUs, inline PTX can provide subgroup support
if (device.IsGPU() && device.IsNVIDIA() && precision == Precision::kSingle) {
header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";

// Nvidia needs a pre-/post-Volta check because Volta introduced the new .sync shuffle instructions
if (device.IsPostNVIDIAVolta()) {
header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 1\n";
}
else {
header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n";
}
}

// Optionally adds a translation header from OpenCL kernels to CUDA kernels
#ifdef CUDA_API
header_string +=
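
End-to-end, an illustrative example (the device is hypothetical): compiling a single-precision GEMM for an sm_70+ NVIDIA GPU would prepend the lines below to the kernel source, which in turn select SUBGROUP_SIZE 32 in xgemm_part1.opencl and the shfl.sync path in xgemm_part3.opencl:

    // Illustrative header prefix for a post-Volta NVIDIA GPU with
    // Precision::kSingle (alongside whatever other defines CLBlast emits):
    #define USE_SUBGROUP_SHUFFLING 1
    #define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 1
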