[GPU] Use tile and fuse for matmul after vector distribute by default #19884

Merged (2 commits) on Feb 3, 2025
13 changes: 10 additions & 3 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -50,8 +50,8 @@
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
namespace mlir::iree_compiler {

-llvm::cl::opt<bool> clGPUTestTileAndFuseMatmul(
-    "iree-codegen-llvmgpu-test-tile-and-fuse-matmul",
+llvm::cl::opt<bool> clGPUEarlyTileAndFuseMatmul(
+    "iree-codegen-llvmgpu-early-tile-and-fuse-matmul",
    llvm::cl::desc("test the tile and fuse pipeline for matmul"),
    llvm::cl::init(false));
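The renamed flag keeps its opt-in semantics; only the spelling changes. A minimal invocation sketch, reusing the pass pipeline from the lit tests below (the input file name is illustrative):

  iree-opt --iree-gpu-test-target=gfx942 \
    --iree-codegen-llvmgpu-early-tile-and-fuse-matmul=true \
    --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \
    matmul.mlir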

@@ -2340,7 +2340,7 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
LDBG("Tile and fuse data tiled multi_mma config");
return success();
}
-  if (clGPUTestTileAndFuseMatmul) {
+  if (clGPUEarlyTileAndFuseMatmul) {
if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn,
computeOp))) {
LDBG("Tile and fuse matmul config");
@@ -2364,6 +2364,13 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
if (succeeded(setVectorDistributionConfig(target, entryPointFn, computeOp))) {
return success();
}
+  // TODO (nirvedhmeshram, qedawkins) : remove this when tile and fuse backend
+  // config becomes the default for matmul.
+  if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn,
+                                                   computeOp))) {
+    LDBG("Tile and fuse matmul config after no vector distribute config");
+    return success();
+  }

if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) {
if (succeeded(setContractConfig(target, entryPointFn, linalgOp))) {
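Taken together, the two hunks leave setRootConfig with the following strategy order for matmuls. This is a hedged paraphrase of the diff above, not the literal function body:

// Illustrative paraphrase of the post-PR selection order in setRootConfig.
if (clGPUEarlyTileAndFuseMatmul) {
  // Opt-in: try tile and fuse for matmul *before* vector distribute.
  if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn, computeOp)))
    return success();
}
if (succeeded(setVectorDistributionConfig(target, entryPointFn, computeOp)))
  return success();
// New default: matmuls that vector distribute rejects (e.g. unaligned shapes)
// now fall through to tile and fuse before the legacy contraction heuristics.
if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn, computeOp)))
  return success();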
@@ -1,7 +1,11 @@
// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
-// RUN: --iree-codegen-llvmgpu-test-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
+// RUN: --iree-codegen-llvmgpu-early-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
// RUN: --iree-codegen-llvmgpu-use-igemm=false \
-// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s
+// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s --check-prefix=CHECK
+//
+// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
+// RUN: --iree-codegen-llvmgpu-use-igemm=false \
+// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s --check-prefix=LATE
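// Schematically, the two RUN configurations map to the two check prefixes:
// CHECK lines assert the configs chosen when the early tile-and-fuse flag is
// set; LATE lines assert the default path, where vector distribute is tried
// first and the new fallback picks up whatever it rejects:
//   --iree-codegen-llvmgpu-early-tile-and-fuse-matmul=true  -> --check-prefix=CHECK
//   default flags (vector distribute first, then fallback)  -> --check-prefix=LATE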

// TODO: This test is still using the legacy LLVMGPU kernel config. This needs
// to be migrated to the rocdl heuristics, but for now is just physically
@@ -43,6 +47,8 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
// CHECK-SAME: subgroup = [1, 1, 4, 1, 0]
// CHECK-SAME: workgroup = [1, 1, 64, 64, 0]

+// LATE: LLVMGPUVectorDistribute

// -----

#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4, d5)>
@@ -78,6 +84,8 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
// CHECK-SAME: subgroup = [2, 2, 1, 1, 0, 0]
// CHECK-SAME: workgroup = [2, 2, 32, 32, 0, 0]

+// LATE: LLVMGPUVectorDistribute

// -----

#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d3, d5, d6)>
@@ -115,6 +123,8 @@ func.func @dynamic_multi_dim_mma_schedule(%lhs: tensor<?x6x16x?x16xf16>, %rhs: t
// CHECK-SAME: subgroup = [0, 1, 0, 1, 1, 0, 0]
// CHECK-SAME: workgroup = [1, 2, 1, 16, 32, 0, 0]

+// LATE: LLVMGPUVectorDistribute

// -----

func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<1024x1024xf16>) -> tensor<1024x1024xf32> {
@@ -140,6 +150,8 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
// CHECK-SAME: subgroup = [4, 4, 0]
// CHECK-SAME: workgroup = [128, 128, 0]

+// LATE: LLVMGPUVectorDistribute

// -----

module {
@@ -160,6 +172,8 @@ module {
// CHECK-SAME: thread = [1, 1, 1, 1, 0, 0, 0]
// CHECK-SAME: workgroup = [1, 1, 1, 64, 0, 0, 0]

+// LATE: LLVMGPUVectorDistribute

// -----

module {
@@ -182,6 +196,8 @@ module {
// CHECK-SAME: thread = [1, 4, 0]
// CHECK-SAME: workgroup = [1, 256, 0]

+// LATE: LLVMGPUWarpReduction

// -----

module {
@@ -275,15 +291,15 @@ func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>,
}
}

-// CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul
-// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
-// CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
-// CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: padding = [1, 16, 16, 4]
-// CHECK-SAME: promote_operands = [0, 1, 2]
-// CHECK-SAME: reduction = [0, 0, 0, 1]
-// CHECK-SAME: subgroup = [0, 1, 1, 0]
-// CHECK-SAME: workgroup = [1, 16, 16, 0]
+// LATE-LABEL: func.func @unaligned_to_intrinsic_batched_matmul
+// LATE-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
+// LATE-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
+// LATE: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
+// LATE-SAME: padding = [1, 16, 16, 4]
+// LATE-SAME: promote_operands = [0, 1, 2]
+// LATE-SAME: reduction = [0, 0, 0, 1]
+// LATE-SAME: subgroup = [0, 1, 1, 0]
+// LATE-SAME: workgroup = [1, 16, 16, 0]

// -----

@@ -302,15 +318,15 @@ func.func @unaligned_matmul_with_two_reduce_dim(%arg0: tensor<196x9x4xf32>, %arg
}
}

-// CHECK-LABEL: func.func @unaligned_matmul_with_two_reduce_dim
-// CHECK-SAME: {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
-// CHECK: linalg.generic
-// CHECK-SAME: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
-// CHECK-SAME: padding = [16, 1, 16, 4]
-// CHECK-SAME: promote_operands = [0, 1, 2]
-// CHECK-SAME: reduction = [0, 1, 0, 1],
-// CHECK-SAME: subgroup = [1, 0, 1, 0],
-// CHECK-SAME: workgroup = [16, 0, 16, 0]}
+// LATE-LABEL: func.func @unaligned_matmul_with_two_reduce_dim
+// LATE-SAME: {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
+// LATE: linalg.generic
+// LATE-SAME: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
+// LATE-SAME: padding = [16, 1, 16, 4]
+// LATE-SAME: promote_operands = [0, 1, 2]
+// LATE-SAME: reduction = [0, 1, 0, 1],
+// LATE-SAME: subgroup = [1, 0, 1, 0],
+// LATE-SAME: workgroup = [16, 0, 16, 0]}

// -----

@@ -331,15 +347,15 @@ func.func @unaligned_to_intrinsic_batched_matmul_tiling_check(%lhs : tensor<12x5
// In this unit test, if C promotion is not considered, it will deduce an MMA
// schedule with nTileSize of 16 while in reality it should be 8.
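// A rough shared-memory sanity check (illustrative numbers, not from the PR):
// assuming f32 elements, promotion of all three operands, the padded tile
// [1, 16, 512, 4] checked below, and a 64 KiB LDS budget,
//   A: 1*16*4*4 B = 256 B,  B: 1*4*512*4 B = 8 KiB,  C: 1*16*512*4 B = 32 KiB,
// about 40 KiB in total, which fits with nTileSize = 8. With nTileSize = 16
// the N extent doubles to 1024 and the C tile alone needs 64 KiB, so once C
// promotion is counted the schedule cannot fit.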

-// CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul_tiling_check
-// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
-// CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
-// CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: padding = [1, 16, 512, 4]
-// CHECK-SAME: promote_operands = [0, 1, 2]
-// CHECK-SAME: reduction = [0, 0, 0, 1]
-// CHECK-SAME: subgroup = [0, 1, 8, 0]
-// CHECK-SAME: workgroup = [1, 16, 512, 0]
+// LATE-LABEL: func.func @unaligned_to_intrinsic_batched_matmul_tiling_check
+// LATE-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
+// LATE-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
+// LATE: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
+// LATE-SAME: padding = [1, 16, 512, 4]
+// LATE-SAME: promote_operands = [0, 1, 2]
+// LATE-SAME: reduction = [0, 0, 0, 1]
+// LATE-SAME: subgroup = [0, 1, 8, 0]
+// LATE-SAME: workgroup = [1, 16, 512, 0]

// -----
