From 5ba1755ec703bda880e876525e3effe72fd452ab Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Mon, 13 Jan 2025 09:28:01 -0600 Subject: [PATCH] give separate heuristics to IGEMM Signed-off-by: Nirvedh Meshram --- .../Dialect/GPU/TargetUtils/ConfigUtils.cpp | 20 ++++++++++--------- .../ROCDL/config_igemm_tile_and_fuse.mlir | 12 +++++------ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp index fe18a4bc6fad1..4f03ba8591ca4 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp @@ -118,7 +118,7 @@ LogicalResult setDataTiledMultiMmaLoweringConfig( /// problem based on the available mma intrinsics. static std::optional getMmaScheduleFromProblemAndTarget( IREE::GPU::TargetAttr target, GPUMatmulShapeType problem, - bool transposedLhs, bool transposedRhs, bool mustBeAligned = true, + bool transposedLhs, bool transposedRhs, bool isIGEMM, bool mustBeAligned = true, bool doCPromotion = false) { const int64_t targetSubgroupSize = target.getPreferredSubgroupSize(); SmallVector intrinsics; @@ -142,6 +142,8 @@ static std::optional getMmaScheduleFromProblemAndTarget( // See https://github.com/iree-org/iree/issues/16341 for details. int64_t mSize = ShapedType::getNumElements(problem.mSizes); int64_t nSize = ShapedType::getNumElements(problem.nSizes); + int64_t cacheLineSizeElements = kCacheLineSizeBits / inBitWidth; + int64_t bestKElementCountPerSubgroup = isIGEMM? cacheLineSizeElements / 2: cacheLineSizeElements; if (mSize * nSize <= 512 * 512) { // For matmuls with small M*N size, we want to distribute M*N onto more // workgroups to fill the GPU. 
Use a smaller bestMNTileCountPerSubgroup @@ -149,13 +151,13 @@ static std::optional getMmaScheduleFromProblemAndTarget( seeds = {/*bestSubgroupCountPerWorkgroup=*/4, /*bestMNTileCountPerSubgroup=*/4, /*bestKTileCountPerSubgroup=*/8, - /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits * 2 / - inBitWidth}; + bestKElementCountPerSubgroup*2}; } else { + int64_t bestKElementCountPerSubgroup = isIGEMM? cacheLineSizeElements /2 : cacheLineSizeElements; seeds = {/*bestSubgroupCountPerWorkgroup=*/4, /*bestMNTileCountPerSubgroup=*/16, /*bestKTileCountPerSubgroup=*/4, - /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / inBitWidth}; + bestKElementCountPerSubgroup}; } // We target slightly below the full available shared Memory to leave room for @@ -181,7 +183,7 @@ static FailureOr> getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, ArrayRef maps, ArrayRef operands, - IREE::GPU::TargetAttr target) { + IREE::GPU::TargetAttr target, bool isIGEMM) { if (target.getWgp().getMma().empty()) return failure(); @@ -249,7 +251,7 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, bool mustBeAligned = true; bool doCPromotion = false; std::optional schedule = getMmaScheduleFromProblemAndTarget( - target, problem, transposedLhs, transposedRhs); + target, problem, transposedLhs, transposedRhs, isIGEMM); // TODO (nirvedhmeshram, qedawkins): The performance with this will be bad if // the GEMM is accumulating (i.e doesnt have a zero fill dpsInit) as that @@ -260,7 +262,7 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, mustBeAligned = false; doCPromotion = true; schedule = getMmaScheduleFromProblemAndTarget(target, problem, - transposedLhs, transposedRhs, + transposedLhs, transposedRhs, isIGEMM, mustBeAligned, doCPromotion); } @@ -384,7 +386,7 @@ setIGEMMConvolutionLoweringConfig(IREE::GPU::TargetAttr target, SmallVector bounds = igemmLoopBounds.value(); FailureOr> configAndWgSize = getMatmulLoweringConfigAndWorkgroupSize( - bounds, 
igemmContractionMaps.value(), igemmOperands.value(), target); + bounds, igemmContractionMaps.value(), igemmOperands.value(), target, /*isIGEMM=*/true); if (failed(configAndWgSize)) { return failure(); } @@ -434,7 +436,7 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target, LDBG("Matmul TileAndFuse Config"); FailureOr> configAndWgSize = - getMatmulLoweringConfigAndWorkgroupSize(bounds, maps, operands, target); + getMatmulLoweringConfigAndWorkgroupSize(bounds, maps, operands, target, /*isIGEMM=*/false); if (failed(configAndWgSize)) { return failure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir index c1be57b949036..d8af22e58664c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir @@ -24,7 +24,7 @@ func.func @nhwc_conv_mfma() { // CHECK: linalg.conv_2d_nhwc_hwcf {{.*}}lowering_config = #iree_gpu.lowering_config // CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: promote_operands = [0, 1] -// CHECK-SAME: reduction = [0, 0, 0, 0, 16] +// CHECK-SAME: reduction = [0, 0, 0, 0, 8] // CHECK-SAME: subgroup = [1, 2, 2, 1, 0] // CHECK-SAME: workgroup = [1, 2, 32, 64, 0] @@ -53,7 +53,7 @@ func.func @nchw_conv_mfma() { // CHECK: linalg.conv_2d_nchw_fchw {{.*}}lowering_config = #iree_gpu.lowering_config // CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: promote_operands = [0, 1] -// CHECK-SAME: reduction = [0, 0, 0, 0, 16] +// CHECK-SAME: reduction = [0, 0, 0, 0, 8] // CHECK-SAME: subgroup = [1, 2, 2, 1, 0] // CHECK-SAME: workgroup = [1, 64, 2, 32, 0] @@ -81,9 +81,9 @@ func.func @nhwc_conv_unaligned_mfma() { // CHECK: linalg.conv_2d_nhwc_hwcf {{.*}}lowering_config = #iree_gpu.lowering_config // CHECK-SAME: mma_kind = #iree_gpu.mma_layout -// CHECK-SAME: padding = 
[2, 1, 32, 64, 64] +// CHECK-SAME: padding = [2, 1, 32, 64, 32] // CHECK-SAME: promote_operands = [0, 1, 2] -// CHECK-SAME: reduction = [0, 0, 0, 0, 16] +// CHECK-SAME: reduction = [0, 0, 0, 0, 8] // CHECK-SAME: subgroup = [2, 1, 2, 1, 0] // CHECK-SAME: workgroup = [2, 1, 32, 64, 0] @@ -111,8 +111,8 @@ func.func @nchw_conv_unaligned_mfma() { // CHECK: linalg.conv_2d_nchw_fchw {{.*}}lowering_config = #iree_gpu.lowering_config // CHECK-SAME: mma_kind = #iree_gpu.mma_layout -// CHECK-SAME: padding = [1, 64, 2, 32, 64] +// CHECK-SAME: padding = [1, 64, 2, 32, 32] // CHECK-SAME: promote_operands = [0, 1, 2] -// CHECK-SAME: reduction = [0, 0, 0, 0, 16] +// CHECK-SAME: reduction = [0, 0, 0, 0, 8] // CHECK-SAME: subgroup = [1, 2, 2, 1, 0] // CHECK-SAME: workgroup = [1, 64, 2, 32, 0]