From 7f2e98a1406da6c5293f0c988df95edc246ef88d Mon Sep 17 00:00:00 2001
From: Tyler Sorensen <tylersorensen3221@hotmail.com>
Date: Wed, 11 Jul 2018 15:12:22 -0400
Subject: [PATCH 1/5] added inline ptx to support shuffle on Nvidia GPUs

---
 src/clpp11.hpp                        |  7 +++++
 src/kernels/level3/xgemm_part1.opencl | 24 ++++++++++++---
 src/kernels/level3/xgemm_part3.opencl | 42 +++++++++++++++++++++++++--
 src/tuning/kernels/xgemm.cpp          |  4 +--
 src/tuning/kernels/xgemm.hpp          | 20 ++++++++++++-
 src/utilities/compile.cpp             | 22 ++++++++++++--
 6 files changed, 107 insertions(+), 12 deletions(-)
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index 8d6a1127..690f8c49 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -44,6 +44,7 @@
 #include <numeric>   // std::accumulate
 #include <cstring>   // std::strlen
 #include <cstdio>    // fprintf, stderr
+#include "assert.h"
 
 // OpenCL
 #define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
@@ -355,6 +356,12 @@ class Device {
            std::string{"."} + std::to_string(GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV));
   }
 
+  // Returns if the Nvidia chip is a Volta or later architecture (sm_70 or higher), else false
+  bool IsPostNVIDIAVolta() const {
+    if (!HasExtension("cl_nv_device_attribute_query")) { return false; } // guard kept under NDEBUG
+    return GetInfo<cl_uint>(CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV) >= 7;
+  }
+
   // Retrieves the above extra information (if present)
   std::string GetExtraInfo() const {
     if (HasExtension("cl_amd_device_attribute_query")) { return AMDBoardName(); }
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 99d64c91..9e483b3e 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -114,13 +114,29 @@ R"(
   #define GLOBAL_MEM_FENCE 0    // Global synchronisation barrier for potential better performance
 #endif
 
-// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
-#ifndef USE_SUBGROUP_SHUFFLING
-  #define USE_SUBGROUP_SHUFFLING 0     // Optionally enables subgroup shuffling for Intel GPUs
+#ifndef NVIDIA_WARPS_AS_SUBGROUPS
+  #define NVIDIA_WARPS_AS_SUBGROUPS 0
+#endif
+#ifndef NVIDIA_POST_VOLTA
+  #define NVIDIA_POST_VOLTA 0
 #endif
-#if USE_SUBGROUP_SHUFFLING == 1
+#ifndef INTEL_SUBGROUP_EXTENSION
+  #define INTEL_SUBGROUP_EXTENSION 0
+#endif
+//#ifndef USE_SUBGROUP_SHUFFLING
+  #define USE_SUBGROUP_SHUFFLING 0     // Optionally enables subgroup shuffling for Intel GPUs
+//#endif
+
+// Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
+#if USE_SUBGROUP_SHUFFLING == 1 && INTEL_SUBGROUP_EXTENSION
   #define SUBGROUP_SIZE 8              // Assumes subgroup size is always 8 on Intel GPUs
 #endif
+
+// NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
+#if USE_SUBGROUP_SHUFFLING == 1 && NVIDIA_WARPS_AS_SUBGROUPS
+  #define SUBGROUP_SIZE 32              // Assumes subgroup size is always 32 on NVIDIA GPUs
+#endif
+
 #if NWI != SUBGROUP_SIZE || MDIMC < SUBGROUP_SIZE
   #undef USE_SUBGROUP_SHUFFLING
   #define USE_SUBGROUP_SHUFFLING 0     // Disables subgroups in case the assumptions don't hold
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index c3920cb5..8e20b1b8 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -17,6 +17,44 @@ R"(
 
 // =================================================================================================
 
+// A common interface for subgroup functions (Intel subgroup extension or NVIDIA inline PTX)
+
+#if USE_SUBGROUP_SHUFFLING == 1
+// Returns the calling work-item's ID within its subgroup (read from %laneid on NVIDIA)
+INLINE_FUNC int clblast_get_sub_group_local_id() {
+  
+  // Intel extension 
+  #if INTEL_SUBGROUP_EXTENSION == 1
+  return get_sub_group_local_id();
+  
+  // Nvidia inline PTX
+  #elif NVIDIA_WARPS_AS_SUBGROUPS == 1
+  int ret;
+  asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) );
+  return ret;
+  #endif  // NOTE(review): no return statement is emitted if neither branch is selected - confirm
+}
+// Shuffles 'reg' across the subgroup; presumably yields the value held by member 'src' - confirm
+INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {
+  
+  // Intel extension 
+  #if INTEL_SUBGROUP_EXTENSION == 1
+  return intel_sub_group_shuffle(reg, src);
+  
+  // Nvidia inline PTX
+  // Volta and later requires .sync shuffle instructions with an extra mask arg
+  #elif NVIDIA_WARPS_AS_SUBGROUPS == 1
+  realN ret;
+    #if NVIDIA_POST_VOLTA == 1
+    asm volatile("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret): "f"(reg), "r"(src));
+    #else
+    asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src));
+    #endif
+  return ret;  // NOTE(review): "f"/.b32 operands look scalar-only; presumably needs realN == float
+  #endif
+}
+#endif  // USE_SUBGROUP_SHUFFLING == 1
+
 // Main body of the matrix-multiplication algorithm. It calls various (inlined) functions.
 INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
                            const __global realM* restrict agm, const __global realN* restrict bgm,
@@ -130,7 +168,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
         #elif GEMMK == 1
           // Loads data: 2D global --> 2D private (matrix A). Partly, shuffled later among subgroups
           #if USE_SUBGROUP_SHUFFLING == 1
-            const int _ni = get_sub_group_local_id();
+            const int _ni = clblast_get_sub_group_local_id();
             #pragma unroll
             for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
               apm[_ki] = GlobalToPrivateA2D(a_ptr, tid_y, _ni, kSizeK, idk, _ki);
@@ -202,7 +240,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
               for (int _ki = 0; _ki < KREG/VWN; _ki += 1) {
                 const int index =  _ni * (MWI/VWM) + _mi;
                 #if USE_SUBGROUP_SHUFFLING == 1
-                  const realN aval = intel_sub_group_shuffle(apm[_ki], _ni);
+                  const realN aval = clblast_sub_group_shuffle(apm[_ki], _ni);
                 #else
                   const realN aval = apm[_ni * (KREG/VWN) + _ki];
                 #endif
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 75e776e6..10164c41 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -33,8 +33,8 @@ void StartVariation(int argc, char *argv[]) {
 
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  StartVariation<1>(argc, argv);
-  StartVariation<2>(argc, argv);
+  //StartVariation<1>(argc, argv);
+  //StartVariation<2>(argc, argv);
   StartVariation<11>(argc, argv);
   StartVariation<12>(argc, argv);
   return 0;
diff --git a/src/tuning/kernels/xgemm.hpp b/src/tuning/kernels/xgemm.hpp
index 9a538c1b..c1b048b7 100644
--- a/src/tuning/kernels/xgemm.hpp
+++ b/src/tuning/kernels/xgemm.hpp
@@ -116,7 +116,7 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments<T> &args) {
     };
   }
   else if (V == 11) { // Kernel 1: limited subset of tuning parameters - but explorable exhaustively
-    settings.parameters = {
+    /*settings.parameters = {
       {"GEMMK", {1}},
       {"MWG", {16, 32, 64}},
       {"NWG", {16, 32, 64}},
@@ -133,6 +133,24 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments<T> &args) {
       {"SA", {0}},
       {"SB", {0}},
       {"KREG", {1, 2, 4}}
+    };*/
+    settings.parameters = {
+      { "GEMMK",{ 1 } },
+      { "MWG",{ 16, 32, 64 } },
+      { "NWG",{ 64 } }, // This divided by NDIMC needs to be 32
+      { "KWG",{ 1 } },
+      { "MDIMC",{ 64 } }, // This needs to be greater than 32
+      { "NDIMC",{ 2 } },
+      { "MDIMA",{ 64 } }, // This needs to be equal to MDIMC
+      { "NDIMB",{ 2 } }, // This needs to be equal to NDIMC
+      { "KWI",{ 1 } },
+      { "VWM",{ 1, 2, 4, 8 } },
+      { "VWN",{ 1, 2, 4 } },
+      { "STRM",{ 0 } },
+      { "STRN",{ 0 } },
+      { "SA",{ 0 } },
+      { "SB",{ 0 } },
+      { "KREG",{ 1, 2, 4 } }
     };
   }
   else if (V == 12) { // Kernel 1: a lot more tuning parameters - has to be sampled randomly, too much to test all
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index 05c29944..cd0b3d2b 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -58,11 +58,27 @@ std::shared_ptr<Program> CompileFromSource(
     header_string += "#define GLOBAL_MEM_FENCE 1\n";
   }
 
-  // For Intel GPUs with subgroup support, use subgroup shuffling.
-  if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) {
+  // For GPUs with subgroup support, use subgroup shuffling.
+  // Currently these are Intel via an extension and Nvidia using inline PTX (restricted to 32 bit)
+  if (device.IsGPU() && (device.HasExtension(kKhronosIntelSubgroups) ||
+                        (device.IsNVIDIA() && static_cast<int>(precision) == 32))) {
     header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
-  }
 
+    // Define the flavor of subgroup
+    if (device.IsNVIDIA()) {
+      header_string += "#define NVIDIA_WARPS_AS_SUBGROUPS 1\n";
+
+      // Nvidia additionally needs to check pre or post volta due to new
+      // shuffle commands
+      if (device.IsPostNVIDIAVolta()) {
+        header_string += "#define NVIDIA_POST_VOLTA 1\n";
+      }
+    }
+    else if (device.HasExtension(kKhronosIntelSubgroups)) {
+      header_string += "#define INTEL_SUBGROUP_EXTENSION 1\n";
+    }
+  }
+  
   // Optionally adds a translation header from OpenCL kernels to CUDA kernels
   #ifdef CUDA_API
     header_string +=

From 36093429fd444d0a1fc7de25dfaf7f2f775cfabc Mon Sep 17 00:00:00 2001
From: Tyler Sorensen <tylersorensen3221@hotmail.com>
Date: Wed, 11 Jul 2018 15:31:51 -0400
Subject: [PATCH 2/5] restored some of the changed tuning files for xgemm

---
 src/tuning/kernels/xgemm.cpp |  4 ++--
 src/tuning/kernels/xgemm.hpp | 20 +-------------------
 2 files changed, 3 insertions(+), 21 deletions(-)

diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 10164c41..75e776e6 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -33,8 +33,8 @@ void StartVariation(int argc, char *argv[]) {
 
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  //StartVariation<1>(argc, argv);
-  //StartVariation<2>(argc, argv);
+  StartVariation<1>(argc, argv);
+  StartVariation<2>(argc, argv);
   StartVariation<11>(argc, argv);
   StartVariation<12>(argc, argv);
   return 0;
diff --git a/src/tuning/kernels/xgemm.hpp b/src/tuning/kernels/xgemm.hpp
index c1b048b7..9a538c1b 100644
--- a/src/tuning/kernels/xgemm.hpp
+++ b/src/tuning/kernels/xgemm.hpp
@@ -116,7 +116,7 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments<T> &args) {
     };
   }
   else if (V == 11) { // Kernel 1: limited subset of tuning parameters - but explorable exhaustively
-    /*settings.parameters = {
+    settings.parameters = {
       {"GEMMK", {1}},
       {"MWG", {16, 32, 64}},
       {"NWG", {16, 32, 64}},
@@ -133,24 +133,6 @@ TunerSettings XgemmGetTunerSettings(const int V, const Arguments<T> &args) {
       {"SA", {0}},
       {"SB", {0}},
       {"KREG", {1, 2, 4}}
-    };*/
-    settings.parameters = {
-      { "GEMMK",{ 1 } },
-      { "MWG",{ 16, 32, 64 } },
-      { "NWG",{ 64 } }, // This divided by NDIMC needs to be 32
-      { "KWG",{ 1 } },
-      { "MDIMC",{ 64 } }, // This needs to be greater than 32
-      { "NDIMC",{ 2 } },
-      { "MDIMA",{ 64 } }, // This needs to be equal to MDIMC
-      { "NDIMB",{ 2 } }, // This needs to be equal to NDIMC
-      { "KWI",{ 1 } },
-      { "VWM",{ 1, 2, 4, 8 } },
-      { "VWN",{ 1, 2, 4 } },
-      { "STRM",{ 0 } },
-      { "STRN",{ 0 } },
-      { "SA",{ 0 } },
-      { "SB",{ 0 } },
-      { "KREG",{ 1, 2, 4 } }
     };
   }
   else if (V == 12) { // Kernel 1: a lot more tuning parameters - has to be sampled randomly, too much to test all

From 7709a7308bce5492e06d8867a4dd9dff5b2ba950 Mon Sep 17 00:00:00 2001
From: Tyler Sorensen <tylersorensen3221@hotmail.com>
Date: Sat, 14 Jul 2018 19:50:47 -0400
Subject: [PATCH 3/5] Applied feedback from Cedric from first pull request

---
 src/clpp11.hpp                        |  2 +-
 src/cupp11.hpp                        |  5 +++++
 src/kernels/level3/xgemm_part1.opencl | 21 +++++++++++----------
 src/kernels/level3/xgemm_part3.opencl | 10 +++++-----
 src/tuning/kernels/xgemm.cpp          |  4 ++--
 src/utilities/compile.cpp             | 26 ++++++++++++--------------
 6 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index 690f8c49..8ac0523f 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -44,7 +44,7 @@
 #include <numeric>   // std::accumulate
 #include <cstring>   // std::strlen
 #include <cstdio>    // fprintf, stderr
-#include "assert.h"
+#include <assert.h>
 
 // OpenCL
 #define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
diff --git a/src/cupp11.hpp b/src/cupp11.hpp
index a1cb1614..ce765844 100644
--- a/src/cupp11.hpp
+++ b/src/cupp11.hpp
@@ -327,6 +327,11 @@ class Device {
   std::string AMDBoardName() const { return ""; }
   std::string NVIDIAComputeCapability() const { return Capabilities(); }
 
+  // Returns if the Nvidia chip is a Volta or later architecture (major version 7 or higher)
+  bool IsPostNVIDIAVolta() const {
+    return GetInfo(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 7;
+  }
+
   // Retrieves the above extra information
   std::string GetExtraInfo() const { return NVIDIAComputeCapability(); }
 
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 9e483b3e..32386312 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -114,26 +114,27 @@ R"(
   #define GLOBAL_MEM_FENCE 0    // Global synchronisation barrier for potential better performance
 #endif
 
-#ifndef NVIDIA_WARPS_AS_SUBGROUPS
-  #define NVIDIA_WARPS_AS_SUBGROUPS 0
+#ifndef SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA
+  #define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 0
 #endif
-#ifndef NVIDIA_POST_VOLTA
-  #define NVIDIA_POST_VOLTA 0
+#ifndef SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA
+  #define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 0
 #endif
-#ifndef INTEL_SUBGROUP_EXTENSION
-  #define INTEL_SUBGROUP_EXTENSION 0
+#ifndef SUBGROUP_SHUFFLING_INTEL
+  #define SUBGROUP_SHUFFLING_INTEL 0
 #endif
-//#ifndef USE_SUBGROUP_SHUFFLING
+#ifndef USE_SUBGROUP_SHUFFLING
   #define USE_SUBGROUP_SHUFFLING 0     // Optionally enables subgroup shuffling for Intel GPUs
-//#endif
+#endif
 
 // Intel subgroups (https://www.khronos.org/registry/OpenCL/extensions/intel/cl_intel_subgroups.txt)
-#if USE_SUBGROUP_SHUFFLING == 1 && INTEL_SUBGROUP_EXTENSION
+#if USE_SUBGROUP_SHUFFLING == 1 && SUBGROUP_SHUFFLING_INTEL
   #define SUBGROUP_SIZE 8              // Assumes subgroup size is always 8 on Intel GPUs
 #endif
 
 // NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
-#if USE_SUBGROUP_SHUFFLING == 1 && NVIDIA_WARPS_AS_SUBGROUPS
+#if USE_SUBGROUP_SHUFFLING == 1 && (SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA || \ 
+                                    SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA)
   #define SUBGROUP_SIZE 32              // Assumes subgroup size is always 32 on NVIDIA GPUs
 #endif
 
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index 8e20b1b8..35ec735c 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -24,11 +24,11 @@ R"(
 INLINE_FUNC int clblast_get_sub_group_local_id() {
   
   // Intel extension 
-  #if INTEL_SUBGROUP_EXTENSION == 1
+  #if SUBGROUP_SHUFFLING_INTEL == 1
   return get_sub_group_local_id();
   
   // Nvidia inline PTX
-  #elif NVIDIA_WARPS_AS_SUBGROUPS == 1
+  #elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
   int ret;
   asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret) );
   return ret;
@@ -38,14 +38,14 @@ INLINE_FUNC int clblast_get_sub_group_local_id() {
 INLINE_FUNC realN clblast_sub_group_shuffle(realN reg, int src) {
   
   // Intel extension 
-  #if INTEL_SUBGROUP_EXTENSION == 1
+  #if SUBGROUP_SHUFFLING_INTEL == 1
   return intel_sub_group_shuffle(reg, src);
   
   // Nvidia inline PTX
   // Volta and later requires .sync shuffle instructions with an extra mask arg
-  #elif NVIDIA_WARPS_AS_SUBGROUPS == 1
+  #elif SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA == 1 || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
   realN ret;
-    #if NVIDIA_POST_VOLTA == 1
+    #if SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA == 1
     asm volatile("shfl.sync.idx.b32 %0, %1, %2, 0x1f, 0xffffffff;" : "=f"(ret): "f"(reg), "r"(src));
     #else
     asm volatile("shfl.idx.b32 %0, %1, %2, 0x1f;" : "=f"(ret): "f"(reg), "r"(src));
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 75e776e6..10164c41 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -33,8 +33,8 @@ void StartVariation(int argc, char *argv[]) {
 
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  StartVariation<1>(argc, argv);
-  StartVariation<2>(argc, argv);
+  //StartVariation<1>(argc, argv);
+  //StartVariation<2>(argc, argv);
   StartVariation<11>(argc, argv);
   StartVariation<12>(argc, argv);
   return 0;
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index cd0b3d2b..835f54b4 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -58,24 +58,22 @@ std::shared_ptr<Program> CompileFromSource(
     header_string += "#define GLOBAL_MEM_FENCE 1\n";
   }
 
-  // For GPUs with subgroup support, use subgroup shuffling.
-  // Currently these are Intel via an extension and Nvidia using inline PTX (restricted to 32 bit)
-  if (device.IsGPU() && (device.HasExtension(kKhronosIntelSubgroups) ||
-                        (device.IsNVIDIA() && static_cast<int>(precision) == 32))) {
+  // For Intel GPUs with subgroup support, use subgroup shuffling.
+  if (device.IsGPU() && device.HasExtension(kKhronosIntelSubgroups)) {
     header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
+    header_string += "#define SUBGROUP_SHUFFLING_INTEL 1\n";
+  }
 
-    // Define the flavor of subgroup
-    if (device.IsNVIDIA()) {
-      header_string += "#define NVIDIA_WARPS_AS_SUBGROUPS 1\n";
+  // For NVIDIA GPUs, inline PTX can provide subgroup support
+  if (device.IsGPU() && device.IsNVIDIA() && precision == Precision::kSingle) {
+    header_string += "#define USE_SUBGROUP_SHUFFLING 1\n";
 
-      // Nvidia additionally needs to check pre or post volta due to new
-      // shuffle commands
-      if (device.IsPostNVIDIAVolta()) {
-        header_string += "#define NVIDIA_POST_VOLTA 1\n";
-      }
+    // Nvidia needs to check pre or post volta due to new shuffle commands
+    if (device.IsPostNVIDIAVolta()) {
+      header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA 1\n";
     }
-    else if (device.HasExtension(kKhronosIntelSubgroups)) {
-      header_string += "#define INTEL_SUBGROUP_EXTENSION 1\n";
+    else {
+      header_string += "#define SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA 1\n";
     }
   }
   

From f4e5b1c14ce9b0ac3b769908912fb3422a0ea8e2 Mon Sep 17 00:00:00 2001
From: Tyler Sorensen <tylersorensen3221@hotmail.com>
Date: Sat, 14 Jul 2018 22:47:39 -0400
Subject: [PATCH 4/5] forgot to add test cases back in, oops

---
 src/tuning/kernels/xgemm.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 10164c41..75e776e6 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -33,8 +33,8 @@ void StartVariation(int argc, char *argv[]) {
 
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  //StartVariation<1>(argc, argv);
-  //StartVariation<2>(argc, argv);
+  StartVariation<1>(argc, argv);
+  StartVariation<2>(argc, argv);
   StartVariation<11>(argc, argv);
   StartVariation<12>(argc, argv);
   return 0;

From 0772d63498c8eeddc380902ba6010a1a861763cc Mon Sep 17 00:00:00 2001
From: Tyler Sorensen <tylersorensen3221@hotmail.com>
Date: Mon, 16 Jul 2018 20:12:30 -0400
Subject: [PATCH 5/5] moved a two-line macro to a single line

---
 src/kernels/level3/xgemm_part1.opencl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 32386312..3cfc5dfb 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -133,8 +133,7 @@ R"(
 #endif
 
 // NVIDIA warps as subgroups using inline PTX (https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html)
-#if USE_SUBGROUP_SHUFFLING == 1 && (SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA || \ 
-                                    SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA)
+#if USE_SUBGROUP_SHUFFLING == 1 && (SUBGROUP_SHUFFLING_NVIDIA_PRE_VOLTA || SUBGROUP_SHUFFLING_NVIDIA_POST_VOLTA)
   #define SUBGROUP_SIZE 32              // Assumes subgroup size is always 32 on NVIDIA GPUs
 #endif