[GPU] Use tile and fuse for matmul after vector distribute by default
Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
nirvedhmeshram committed Feb 3, 2025
1 parent 84a1746 commit 8c18ba7
Showing 2 changed files with 57 additions and 32 deletions.
15 changes: 12 additions & 3 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -50,8 +50,8 @@
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
namespace mlir::iree_compiler {

llvm::cl::opt<bool> clGPUTestTileAndFuseMatmul(
"iree-codegen-llvmgpu-test-tile-and-fuse-matmul",
llvm::cl::opt<bool> clGPUEarlyTileAndFuseMatmul(
"iree-codegen-llvmgpu-early-tile-and-fuse-matmul",
llvm::cl::desc("test the tile and fuse pipeline for matmul"),
llvm::cl::init(false));

@@ -2340,7 +2340,7 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
LDBG("Tile and fuse data tiled multi_mma config");
return success();
}
if (clGPUTestTileAndFuseMatmul) {
if (clGPUEarlyTileAndFuseMatmul) {
if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn,
computeOp))) {
LDBG("Tile and fuse matmul config");
@@ -2364,6 +2364,15 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
if (succeeded(setVectorDistributionConfig(target, entryPointFn, computeOp))) {
return success();
}
// TODO: Remove this when the tile and fuse backend config becomes the default
// for matmul.
if (!clGPUEarlyTileAndFuseMatmul && isROCmBackend(target)) {
if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn,
computeOp))) {
LDBG("Tile and fuse matmul config after no vector distribute config");
return success();
}
}

if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) {
if (succeeded(setContractConfig(target, entryPointFn, linalgOp))) {
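For orientation, the net candidate order in setRootConfig after this change can be summarized as follows. This is a condensed sketch reconstructed from the hunks above, not the verbatim source: the names come straight from the diff, while earlier candidates (such as the data-tiled multi_mma branch) and the remaining fallbacks are elided.

// Condensed sketch of the config selection order after this commit
// (assembled from the hunks above; not verbatim, other candidates elided).
if (clGPUEarlyTileAndFuseMatmul) {
  // Opt-in flag: try the tile-and-fuse matmul config before anything else.
  if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn,
                                                   computeOp))) {
    return success();
  }
}
if (succeeded(setVectorDistributionConfig(target, entryPointFn, computeOp))) {
  // Vector distribute still wins whenever it produces a config.
  return success();
}
if (!clGPUEarlyTileAndFuseMatmul && isROCmBackend(target)) {
  // New default on ROCm: fall back to tile and fuse for matmuls that
  // vector distribute could not handle.
  if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn,
                                                   computeOp))) {
    return success();
  }
}
// Otherwise the legacy contraction heuristics take over.

The test file below exercises both paths: the CHECK prefix runs with the early flag enabled, while the new LATE prefix runs with the defaults and verifies that aligned matmuls keep LLVMGPUVectorDistribute while the unaligned cases now land on LLVMGPUTileAndFuse.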
@@ -1,7 +1,11 @@
// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
// RUN: --iree-codegen-llvmgpu-test-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
// RUN: --iree-codegen-llvmgpu-early-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
// RUN: --iree-codegen-llvmgpu-use-igemm=false \
// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s
// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s --check-prefix=CHECK
//
// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
// RUN: --iree-codegen-llvmgpu-use-igemm=false \
// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s --check-prefix=LATE

// TODO: This test is still using the legacy LLVMGPU kernel config. This needs
// to be migrated to the rocdl heuristics, but for now is just physically
@@ -43,6 +47,8 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
// CHECK-SAME: subgroup = [1, 1, 4, 1, 0]
// CHECK-SAME: workgroup = [1, 1, 64, 64, 0]

// LATE: LLVMGPUVectorDistribute

// -----

#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4, d5)>
@@ -78,6 +84,8 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
// CHECK-SAME: subgroup = [2, 2, 1, 1, 0, 0]
// CHECK-SAME: workgroup = [2, 2, 32, 32, 0, 0]

// LATE: LLVMGPUVectorDistribute

// -----

#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d3, d5, d6)>
@@ -115,6 +123,8 @@ func.func @dynamic_multi_dim_mma_schedule(%lhs: tensor<?x6x16x?x16xf16>, %rhs: t
// CHECK-SAME: subgroup = [0, 1, 0, 1, 1, 0, 0]
// CHECK-SAME: workgroup = [1, 2, 1, 16, 32, 0, 0]

// LATE: LLVMGPUVectorDistribute

// -----

func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<1024x1024xf16>) -> tensor<1024x1024xf32> {
@@ -140,6 +150,8 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
// CHECK-SAME: subgroup = [4, 4, 0]
// CHECK-SAME: workgroup = [128, 128, 0]

// LATE: LLVMGPUVectorDistribute

// -----

module {
@@ -160,6 +172,8 @@ module {
// CHECK-SAME: thread = [1, 1, 1, 1, 0, 0, 0]
// CHECK-SAME: workgroup = [1, 1, 1, 64, 0, 0, 0]

// LATE: LLVMGPUVectorDistribute

// -----

module {
@@ -182,6 +196,8 @@ module {
// CHECK-SAME: thread = [1, 4, 0]
// CHECK-SAME: workgroup = [1, 256, 0]

// LATE: LLVMGPUWarpReduction

// -----

module {
@@ -275,15 +291,15 @@ func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>,
}
}

// CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
// CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
// CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
// CHECK-SAME: padding = [1, 16, 16, 4]
// CHECK-SAME: promote_operands = [0, 1, 2]
// CHECK-SAME: reduction = [0, 0, 0, 1]
// CHECK-SAME: subgroup = [0, 1, 1, 0]
// CHECK-SAME: workgroup = [1, 16, 16, 0]
// LATE-LABEL: func.func @unaligned_to_intrinsic_batched_matmul
// LATE-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
// LATE-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
// LATE: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
// LATE-SAME: padding = [1, 16, 16, 4]
// LATE-SAME: promote_operands = [0, 1, 2]
// LATE-SAME: reduction = [0, 0, 0, 1]
// LATE-SAME: subgroup = [0, 1, 1, 0]
// LATE-SAME: workgroup = [1, 16, 16, 0]

// -----

@@ -302,15 +318,15 @@ func.func @unaligned_matmul_with_two_reduce_dim(%arg0: tensor<196x9x4xf32>, %arg
}
}

// CHECK-LABEL: func.func @unaligned_matmul_with_two_reduce_dim
// CHECK-SAME: {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
// CHECK: linalg.generic
// CHECK-SAME: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
// CHECK-SAME: padding = [16, 1, 16, 4]
// CHECK-SAME: promote_operands = [0, 1, 2]
// CHECK-SAME: reduction = [0, 1, 0, 1],
// CHECK-SAME: subgroup = [1, 0, 1, 0],
// CHECK-SAME: workgroup = [16, 0, 16, 0]}
// LATE-LABEL: func.func @unaligned_matmul_with_two_reduce_dim
// LATE-SAME: {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
// LATE: linalg.generic
// LATE-SAME: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
// LATE-SAME: padding = [16, 1, 16, 4]
// LATE-SAME: promote_operands = [0, 1, 2]
// LATE-SAME: reduction = [0, 1, 0, 1],
// LATE-SAME: subgroup = [1, 0, 1, 0],
// LATE-SAME: workgroup = [16, 0, 16, 0]}

// -----

@@ -331,15 +347,15 @@ func.func @unaligned_to_intrinsic_batched_matmul_tiling_check(%lhs : tensor<12x5
// In this unit test, if C promotion is not considered, it will deduce an MMA
// schedule with an nTileSize of 16 while in reality it should be 8.

// CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul_tiling_check
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
// CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
// CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
// CHECK-SAME: padding = [1, 16, 512, 4]
// CHECK-SAME: promote_operands = [0, 1, 2]
// CHECK-SAME: reduction = [0, 0, 0, 1]
// CHECK-SAME: subgroup = [0, 1, 8, 0]
// CHECK-SAME: workgroup = [1, 16, 512, 0]
// LATE-LABEL: func.func @unaligned_to_intrinsic_batched_matmul_tiling_check
// LATE-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
// LATE-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
// LATE: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
// LATE-SAME: padding = [1, 16, 512, 4]
// LATE-SAME: promote_operands = [0, 1, 2]
// LATE-SAME: reduction = [0, 0, 0, 1]
// LATE-SAME: subgroup = [0, 1, 8, 0]
// LATE-SAME: workgroup = [1, 16, 512, 0]

// -----
