diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index 9d46ab7e..31bd869a 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -138,6 +138,25 @@ R"( #endif #endif +#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_GCN == 1 + #define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on AMD Navi GPUs + #define NAVI_SHFL(s0, l) \ + { \ + __asm ( \ + "ds_bpermute_b32 %[d], %[l], %[s]\n" \ + "s_waitcnt lgkmcnt(0)\n" \ + : [d] "=&v" (s0) \ + : [l] "v" (l), \ + [s] "0" (s0)); \ + } + #define NAVI_LID() \ + if (get_work_dim() == 2) { \ + return (get_local_size(0) * get_local_id(1) + get_local_id(0)) % SUBGROUP_SIZE; \ + } else { \ + return (get_local_id(0)) % SUBGROUP_SIZE; \ + } +#endif + #if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE #undef USE_SUBGROUP_SHUFFLING #define USE_SUBGROUP_SHUFFLING 0 // Disables subgroups in case the assumptions don't hold diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl index 77964a94..f72b0185 100644 --- a/src/kernels/level3/xgemm_part3.opencl +++ b/src/kernels/level3/xgemm_part3.opencl @@ -30,6 +30,8 @@ INLINE_FUNC int clblast_get_sub_group_local_id() { int ret; asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) ); return ret; + #elif SUBGROUP_SHUFFLING_GCN == 1 + NAVI_LID() #endif } @@ -49,6 +51,10 @@ INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) { asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src)); #endif return ret; + #elif SUBGROUP_SHUFFLING_GCN == 1 + realN ret = reg; + NAVI_SHFL(ret, ((src) << 2)) + return ret; #endif } #endif diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp index 59aa6107..99273b3e 100644 --- a/src/utilities/compile.cpp +++ b/src/utilities/compile.cpp @@ -78,6 +78,12 @@ std::shared_ptr CompileFromSource( } } + if (device.IsGPU() && device.IsAMD() && device.Name().find("gfx1") != std::string::npos && + precision == Precision::kSingle) { // only for Navi cards (gfx1XXX) + header_string += "#define USE_SUBGROUP_SHUFFLING 1\n"; + header_string += "#define SUBGROUP_SHUFFLING_GCN 1\n"; + } + // For Qualcomm devices, specifying the OpenCL kernel attribute reqd_work_group_size reduces performance. // This option compiles without the workgroup size requirement and does not affect correctness. if (device.IsQualcomm()) {