diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 9d46ab7e..31bd869a 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -138,6 +138,25 @@ R"(
   #endif
 #endif
 
+#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_GCN == 1
+  #define SUBGROUP_SIZE 32    // Assumes subgroup size is always 32 on AMD Navi GPUs
+    #define NAVI_SHFL(s0, l)  \
+    { \
+		__asm ( \
+		  "ds_bpermute_b32  %[d], %[l], %[s]\n" \
+       "s_waitcnt lgkmcnt(0)\n" \
+		  : [d] "=&v" (s0) \
+		  : [l] "v" (l), \
+            [s] "0" (s0)); \
+	 }
+   #define NAVI_LID() \
+     if (get_work_dim() == 2) { \
+      return (get_local_size(0) * get_local_id(1) + get_local_id(0)) % SUBGROUP_SIZE; \
+    } else { \
+      return (get_local_id(0)) % SUBGROUP_SIZE; \
+    }
+#endif
+
 #if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE
   #undef USE_SUBGROUP_SHUFFLING
   #define USE_SUBGROUP_SHUFFLING 0     // Disables subgroups in case the assumptions don't hold
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index 77964a94..f72b0185 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -30,6 +30,8 @@ INLINE_FUNC int clblast_get_sub_group_local_id() {
   int ret;
   asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) );
   return ret;
+  #elif SUBGROUP_SHUFFLING_GCN == 1
+    NAVI_LID()
   #endif 
 }
 
@@ -49,6 +51,10 @@ INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {
     asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src));
     #endif
   return ret;
+  #elif SUBGROUP_SHUFFLING_GCN == 1
+  realN ret = reg;
+  NAVI_SHFL(ret, ((src) << 2))
+  return ret;
   #endif
 }
 #endif
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index 59aa6107..99273b3e 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -78,6 +78,12 @@ std::shared_ptr<Program> CompileFromSource(
     }
   }
 
+  if (device.IsGPU() && device.IsAMD() && device.Name().find("gfx1") != std::string::npos &&
+      precision == Precision::kSingle) { // only for Navi cards (gfx1XXX)
+    header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
+    header_string += "#define SUBGROUP_SHUFFLING_GCN 1\n";
+  }
+
   // For Qualcomm devices, specifying the OpenCL kernel attribute reqd_work_group_size reduces performance.
   // This option compiles without the workgroup size requirement and does not affect correctness.
   if (device.IsQualcomm()) {