Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Navi card subgroup shuffle support for gemm #512

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions src/kernels/level3/xgemm_part1.opencl
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,25 @@ R"(
#endif
#endif

#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_GCN == 1
#define SUBGROUP_SIZE 32 // Assumes subgroup size is always 32 on AMD Navi GPUs
#define NAVI_SHFL(s0, l) \
{ \
__asm ( \
"ds_bpermute_b32 %[d], %[l], %[s]\n" \
"s_waitcnt lgkmcnt(0)\n" \
: [d] "=&v" (s0) \
: [l] "v" (l), \
[s] "0" (s0)); \
}
#define NAVI_LID() \
if (get_work_dim() == 2) { \
return (get_local_size(0) * get_local_id(1) + get_local_id(0)) % SUBGROUP_SIZE; \
} else { \
return (get_local_id(0)) % SUBGROUP_SIZE; \
}
#endif

#if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE
#undef USE_SUBGROUP_SHUFFLING
#define USE_SUBGROUP_SHUFFLING 0 // Disables subgroups in case the assumptions don't hold
Expand Down
6 changes: 6 additions & 0 deletions src/kernels/level3/xgemm_part3.opencl
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ INLINE_FUNC int clblast_get_sub_group_local_id() {
int ret;
asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) );
return ret;
#elif SUBGROUP_SHUFFLING_GCN == 1
NAVI_LID()
#endif
}

Expand All @@ -49,6 +51,10 @@ INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {
asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src));
#endif
return ret;
#elif SUBGROUP_SHUFFLING_GCN == 1
realN ret = reg;
NAVI_SHFL(ret, ((src) << 2))
return ret;
#endif
}
#endif
Expand Down
6 changes: 6 additions & 0 deletions src/utilities/compile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,12 @@ std::shared_ptr<Program> CompileFromSource(
}
}

if (device.IsGPU() && device.IsAMD() && device.Name().find("gfx1") != std::string::npos &&
precision == Precision::kSingle) { // only for Navi cards (gfx1XXX)
header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
header_string += "#define SUBGROUP_SHUFFLING_GCN 1\n";
}

// For Qualcomm devices, specifying the OpenCL kernel attribute reqd_work_group_size reduces performance.
// This option compiles without the workgroup size requirement and does not affect correctness.
if (device.IsQualcomm()) {
Expand Down