-
-
Notifications
You must be signed in to change notification settings - Fork 204
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
inline PTX to support subgroup shuffle for Nvidia GPUs #297
Changes from 2 commits
7f2e98a
3609342
7709a73
f4e5b1c
0772d63
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,6 +44,7 @@ | |
#include <numeric> // std::accumulate | ||
#include <cstring> // std::strlen | ||
#include <cstdio> // fprintf, stderr | ||
#include "assert.h" | ||
|
||
// OpenCL | ||
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings | ||
|
@@ -355,6 +356,12 @@ class Device { | |
std::string{"."} + std::to_string(GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV)); | ||
} | ||
|
||
// Returns whether the Nvidia chip is a Volta or later architecture (sm_70 or higher) | ||
bool IsPostNVIDIAVolta() const { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. CLBlast also has a CUDA back-end, which works because every function in this file is also implemented in There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, I didn't notice! I can do that. |
||
assert(HasExtension("cl_nv_device_attribute_query")); | ||
return GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV) >= 7; | ||
} | ||
|
||
// Retrieves the above extra information (if present) | ||
std::string GetExtraInfo() const { | ||
if (HasExtension("cl_amd_device_attribute_query")) { return AMDBoardName(); } | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -114,13 +114,29 @@ R"( | |
#define GLOBAL_MEM_FENCE 0 // Global synchronisation barrier for potential better performance | ||
#endif | ||
|
||
// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt) | ||
#ifndef USE_SUBGROUP_SHUFFLING | ||
#define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs | ||
#ifndef NVIDIA_WARPS_AS_SUBGROUPS | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm afraid to get lost in the subgroup defines.... could we rename them with a common start, e.g.:
And perhaps treat the two NVIDIA ones as separate things, either one of them will be set. I think this makes the host code clearer and also the kernel code. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good idea. I'll work on this over the weekend. Thanks! |
||
#define NVIDIA_WARPS_AS_SUBGROUPS 0 | ||
#endif | ||
#ifndef NVIDIA_POST_VOLTA | ||
#define NVIDIA_POST_VOLTA 0 | ||
#endif | ||
#if USE_SUBGROUP_SHUFFLING == 1 | ||
#ifndef INTEL_SUBGROUP_EXTENSION | ||
#define INTEL_SUBGROUP_EXTENSION 0 | ||
#endif | ||
//#ifndef USE_SUBGROUP_SHUFFLING | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is this commented out? Now There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oops, I was doing that to get some performance numbers. I will add it back in. Good catch. |
||
#define USE_SUBGROUP_SHUFFLING 0 // Optionally enables subgroup shuffling for Intel GPUs | ||
//#endif | ||
|
||
// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt) | ||
#if USE_SUBGROUP_SHUFFLING == 1 && INTEL_SUBGROUP_EXTENSION | ||
#define SUBGROUP_SIZE 8 // Assumes subgroup size is always 8 on Intel GPUs | ||
#endif | ||
|
||
// NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html) | ||
#if USE_SUBGROUP_SHUFFLING == 1 && NVIDIA_WARPS_AS_SUBGROUPS | ||
#define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on NVIDIA GPUs | ||
#endif | ||
|
||
#if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE | ||
#undef USE_SUBGROUP_SHUFFLING | ||
#define USE_SUBGROUP_SHUFFLING 0 // Disables subgroups in case the assumptions don't hold | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,11 +58,27 @@ std::shared_ptr<Program> CompileFromSource( | |
header_string += "#define GLOBAL_MEM_FENCE 1\n"; | ||
} | ||
|
||
// For Intel GPUs with subgroup support, use subgroup shuffling. | ||
if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) { | ||
// For GPUs with subgroup support, use subgroup shuffling. | ||
// Currently these are Intel via an extension and Nvidia using inline PTX (restricted to 32 bit) | ||
if (device.IsGPU() && (device.HasExtension(kKhronosIntelSubgroups) || | ||
(device.IsNVIDIA() && static_cast<int>(precision) == 32))) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better to formulate |
||
header_string += "#define USE_SUBGROUP_SHUFFLING 1\n"; | ||
} | ||
|
||
// Define the flavor of subgroup | ||
if (device.IsNVIDIA()) { | ||
header_string += "#define NVIDIA_WARPS_AS_SUBGROUPS 1\n"; | ||
|
||
// Nvidia additionally needs to check pre or post volta due to new | ||
// shuffle commands | ||
if (device.IsPostNVIDIAVolta()) { | ||
header_string += "#define NVIDIA_POST_VOLTA 1\n"; | ||
} | ||
} | ||
else if (device.HasExtension(kKhronosIntelSubgroups)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This logic is a bit doubled now. What if we split it, i.e. keep the original code and just add something, e.g.:
But now the |
||
header_string += "#define INTEL_SUBGROUP_EXTENSION 1\n"; | ||
} | ||
} | ||
|
||
// Optionally adds a translation header from OpenCL kernels to CUDA kernels | ||
#ifdef CUDA_API | ||
header_string += | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"assert.h" --->
<assert.h>