diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 76865d6661d4..2ee7e8241c17 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -50,8 +50,8 @@
 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

 namespace mlir::iree_compiler {

-llvm::cl::opt<bool> clGPUTestTileAndFuseMatmul(
-    "iree-codegen-llvmgpu-test-tile-and-fuse-matmul",
+llvm::cl::opt<bool> clGPUEarlyTileAndFuseMatmul(
+    "iree-codegen-llvmgpu-early-tile-and-fuse-matmul",
     llvm::cl::desc("test the the tile and fuse pipeline for matmul"),
     llvm::cl::init(false));
@@ -2340,7 +2340,7 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
     LDBG("Tile and fuse data tiled multi_mma config");
     return success();
   }
-  if (clGPUTestTileAndFuseMatmul) {
+  if (clGPUEarlyTileAndFuseMatmul) {
     if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn,
                                                      computeOp))) {
       LDBG("Tile and fuse matmul config");
@@ -2364,6 +2364,13 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
   if (succeeded(setVectorDistributionConfig(target, entryPointFn, computeOp))) {
     return success();
   }
+  // TODO (nirvedhmeshram, qedawkins) : remove this when tile and fuse backend
+  // config becomes the default for matmul.
+  if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn,
+                                                   computeOp))) {
+    LDBG("Tile and fuse matmul config after no vector distribute config");
+    return success();
+  }

   if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) {
     if (succeeded(setContractConfig(target, entryPointFn, linalgOp))) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index fde9e940c977..910eca3c1768 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -1,7 +1,11 @@
 // RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
-// RUN: --iree-codegen-llvmgpu-test-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
+// RUN: --iree-codegen-llvmgpu-early-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
 // RUN: --iree-codegen-llvmgpu-use-igemm=false \
-// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s
+// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s --check-prefix=CHECK
+//
+// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
+// RUN: --iree-codegen-llvmgpu-use-igemm=false \
+// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s --check-prefix=LATE

 // TODO: This test is still using the legacy LLVMGPU kernel config. This needs
 // to be migrated to the rocdl heuristics, but for now is just physically
@@ -43,6 +47,8 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
 // CHECK-SAME: subgroup = [1, 1, 4, 1, 0]
 // CHECK-SAME: workgroup = [1, 1, 64, 64, 0]

+// LATE: LLVMGPUVectorDistribute
+
 // -----

 #map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4, d5)>
@@ -78,6 +84,8 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
 // CHECK-SAME: subgroup = [2, 2, 1, 1, 0, 0]
 // CHECK-SAME: workgroup = [2, 2, 32, 32, 0, 0]

+// LATE: LLVMGPUVectorDistribute
+
 // -----

 #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d3, d5, d6)>
@@ -115,6 +123,8 @@ func.func @dynamic_multi_dim_mma_schedule(%lhs: tensor, %rhs: t
 // CHECK-SAME: subgroup = [0, 1, 0, 1, 1, 0, 0]
 // CHECK-SAME: workgroup = [1, 2, 1, 16, 32, 0, 0]

+// LATE: LLVMGPUVectorDistribute
+
 // -----

 func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<1024x1024xf16>) -> tensor<1024x1024xf32> {
@@ -140,6 +150,8 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
 // CHECK-SAME: subgroup = [4, 4, 0]
 // CHECK-SAME: workgroup = [128, 128, 0]

+// LATE: LLVMGPUVectorDistribute
+
 // -----

 module {
@@ -160,6 +172,8 @@ module {
 // CHECK-SAME: thread = [1, 1, 1, 1, 0, 0, 0]
 // CHECK-SAME: workgroup = [1, 1, 1, 64, 0, 0, 0]

+// LATE: LLVMGPUVectorDistribute
+
 // -----

 module {
@@ -182,6 +196,8 @@ module {
 // CHECK-SAME: thread = [1, 4, 0]
 // CHECK-SAME: workgroup = [1, 256, 0]

+// LATE: LLVMGPUWarpReduction
+
 // -----

 module {
@@ -275,15 +291,15 @@ func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>,
   }
 }

-// CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul
-// CHECK-SAME: #iree_codegen.translation_info}
-// CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: padding = [1, 16, 16, 4]
-// CHECK-SAME: promote_operands = [0, 1, 2]
-// CHECK-SAME: reduction = [0, 0, 0, 1]
-// CHECK-SAME: subgroup = [0, 1, 1, 0]
-// CHECK-SAME: workgroup = [1, 16, 16, 0]
+// LATE-LABEL: func.func @unaligned_to_intrinsic_batched_matmul
+// LATE-SAME: #iree_codegen.translation_info}
+// LATE: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
+// LATE-SAME: padding = [1, 16, 16, 4]
+// LATE-SAME: promote_operands = [0, 1, 2]
+// LATE-SAME: reduction = [0, 0, 0, 1]
+// LATE-SAME: subgroup = [0, 1, 1, 0]
+// LATE-SAME: workgroup = [1, 16, 16, 0]

 // -----

@@ -302,15 +318,15 @@ func.func @unaligned_matmul_with_two_reduce_dim(%arg0: tensor<196x9x4xf32>, %arg
   }
 }

-// CHECK-LABEL: func.func @unaligned_matmul_with_two_reduce_dim
-// CHECK-SAME: {translation_info = #iree_codegen.translation_info
-// CHECK-SAME: padding = [16, 1, 16, 4]
-// CHECK-SAME: promote_operands = [0, 1, 2]
-// CHECK-SAME: reduction = [0, 1, 0, 1],
-// CHECK-SAME: subgroup = [1, 0, 1, 0],
-// CHECK-SAME: workgroup = [16, 0, 16, 0]}
+// LATE-LABEL: func.func @unaligned_matmul_with_two_reduce_dim
+// LATE-SAME: {translation_info = #iree_codegen.translation_info
+// LATE-SAME: padding = [16, 1, 16, 4]
+// LATE-SAME: promote_operands = [0, 1, 2]
+// LATE-SAME: reduction = [0, 1, 0, 1],
+// LATE-SAME: subgroup = [1, 0, 1, 0],
+// LATE-SAME: workgroup = [16, 0, 16, 0]}

 // -----

@@ -331,15 +347,15 @@ func.func @unaligned_to_intrinsic_batched_matmul_tiling_check(%lhs : tensor<12x5
 // In this unit test, if C promotion is not considered, it will deduce a MMA
 // schedule with nTileSize of 16 while in reality it should be 8.

-// CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul_tiling_check
-// CHECK-SAME: #iree_codegen.translation_info}
-// CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: padding = [1, 16, 512, 4]
-// CHECK-SAME: promote_operands = [0, 1, 2]
-// CHECK-SAME: reduction = [0, 0, 0, 1]
-// CHECK-SAME: subgroup = [0, 1, 8, 0]
-// CHECK-SAME: workgroup = [1, 16, 512, 0]
+// LATE-LABEL: func.func @unaligned_to_intrinsic_batched_matmul_tiling_check
+// LATE-SAME: #iree_codegen.translation_info}
+// LATE: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
+// LATE-SAME: padding = [1, 16, 512, 4]
+// LATE-SAME: promote_operands = [0, 1, 2]
+// LATE-SAME: reduction = [0, 0, 0, 1]
+// LATE-SAME: subgroup = [0, 1, 8, 0]
+// LATE-SAME: workgroup = [1, 16, 512, 0]

 // -----