Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GPU] Enable GEMMs to first attempt LLVMGPUTileAndFuse with intrinsic by default #19520

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion compiler/src/iree/compiler/Codegen/Common/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -654,7 +654,7 @@ def TileLargeTensorsPass :
];
let options = [
Option<"maxVectorSize", "max-vector-size", "int64_t",
/*default=*/"64",
/*default=*/"128",
"Maximum static size to tile to (i.e. all remaining ops will be smaller)">,
];
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ func.func @simple_generic(%3: tensor<64x256xf32>, %4: tensor<64x256xf32>, %5: te

// CHECK-LABEL: func.func @simple_generic
// CHECK: scf.for %{{.*}} = %c0 to %c64 step %c1
// CHECK: scf.for %{{.*}} = %c0 to %c256 step %c64
// CHECK: linalg.generic {{.*}} outs({{.*}}: tensor<1x64xf32>)
// CHECK: scf.for %{{.*}} = %c0 to %c256 step %c128
// CHECK: linalg.generic {{.*}} outs({{.*}}: tensor<1x128xf32>)

// -----

Expand Down Expand Up @@ -79,7 +79,7 @@ func.func @multiple_use_tilable_op(%3: tensor<64x256xf32>, %4: tensor<64x256xf32

// CHECK-LABEL: func.func @multiple_use_tilable_op
// CHECK: %[[ADD_TILING:.+]] = scf.for
// CHECK: linalg.add {{.*}} -> tensor<1x64xf32>
// CHECK: linalg.add {{.*}} -> tensor<1x128xf32>
// CHECK: %[[T_TILING:.+]] = scf.for
// CHECK: %[[FUSED_ADD:.+]] = linalg.add {{.*}} -> tensor<64x1xf32>
// CHECK: linalg.transpose ins(%[[FUSED_ADD]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,8 @@ LogicalResult setDataTiledMultiMmaLoweringConfig(
/// problem based on the available mma intrinsics.
static std::optional<GPUMMASchedule> getMmaScheduleFromProblemAndTarget(
IREE::GPU::TargetAttr target, GPUMatmulShapeType problem,
bool transposedLhs, bool transposedRhs, bool mustBeAligned = true,
bool doCPromotion = false) {
bool transposedLhs, bool transposedRhs, bool isIGEMM,
bool mustBeAligned = true, bool doCPromotion = false) {
const int64_t targetSubgroupSize = target.getPreferredSubgroupSize();
SmallVector<GPUMatmulShapeType> intrinsics;
for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) {
Expand All @@ -142,23 +142,28 @@ static std::optional<GPUMMASchedule> getMmaScheduleFromProblemAndTarget(
// See https://github.com/iree-org/iree/issues/16341 for details.
int64_t mSize = ShapedType::getNumElements(problem.mSizes);
int64_t nSize = ShapedType::getNumElements(problem.nSizes);
int64_t cacheLineSizeElements = kCacheLineSizeBits / inBitWidth;
int64_t bestKElementCountPerSubgroup =
isIGEMM ? cacheLineSizeElements / 2 : cacheLineSizeElements;
if (mSize * nSize <= 512 * 512) {
// For matmuls with small M*N size, we want to distribute M*N onto more
// workgroups to fill the GPU. Use a smaller bestMNTileCountPerSubgroup
// and a larger bestKTileCountPerSubgroup.
seeds = {/*bestSubgroupCountPerWorkgroup=*/4,
/*bestMNTileCountPerSubgroup=*/4,
/*bestKTileCountPerSubgroup=*/8,
/*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / inBitWidth};
/*bestKTileCountPerSubgroup=*/8, bestKElementCountPerSubgroup * 2};
} else {
seeds = {/*bestSubgroupCountPerWorkgroup=*/4,
/*bestMNTileCountPerSubgroup=*/16,
/*bestKTileCountPerSubgroup=*/4,
/*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / 2 /
inBitWidth};
/*bestMNTileCountPerSubgroup=*/8,
/*bestKTileCountPerSubgroup=*/4, bestKElementCountPerSubgroup};
}

int64_t maxSharedMemoryBytes = target.getWgp().getMaxWorkgroupMemoryBytes();
// We target slightly below the full available shared Memory to leave room for
// `GPUReduceBankConflictsPass` that will pad shared memory without keeping
// track of usage. We can drop this after solving
// https://github.com/iree-org/iree/issues/19675
int64_t maxSharedMemoryBytes =
target.getWgp().getMaxWorkgroupMemoryBytes() - 64 * inBitWidth;

// First try to find a schedule with an exactly matching intrinsic.
std::optional<GPUMMASchedule> schedule = deduceMMASchedule(
Expand All @@ -176,7 +181,8 @@ static FailureOr<std::pair<LoweringConfigAttr, int64_t>>
getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
ArrayRef<AffineMap> maps,
ArrayRef<Value> operands,
IREE::GPU::TargetAttr target) {
IREE::GPU::TargetAttr target,
bool isIGEMM) {
if (target.getWgp().getMma().empty())
return failure();

Expand Down Expand Up @@ -244,7 +250,7 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
bool mustBeAligned = true;
bool doCPromotion = false;
std::optional<GPUMMASchedule> schedule = getMmaScheduleFromProblemAndTarget(
target, problem, transposedLhs, transposedRhs);
target, problem, transposedLhs, transposedRhs, isIGEMM);

// TODO (nirvedhmeshram, qedawkins): The performance with this will be bad if
// the GEMM is accumulating (i.e doesnt have a zero fill dpsInit) as that
Expand All @@ -254,9 +260,9 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
LDBG("Attempting to deduce unaligned TileAndFuse MMA schedulee");
mustBeAligned = false;
doCPromotion = true;
schedule = getMmaScheduleFromProblemAndTarget(target, problem,
transposedLhs, transposedRhs,
mustBeAligned, doCPromotion);
schedule = getMmaScheduleFromProblemAndTarget(
target, problem, transposedLhs, transposedRhs, isIGEMM, mustBeAligned,
doCPromotion);
}

if (!schedule) {
Expand Down Expand Up @@ -379,7 +385,8 @@ setIGEMMConvolutionLoweringConfig(IREE::GPU::TargetAttr target,
SmallVector<int64_t> bounds = igemmLoopBounds.value();
FailureOr<std::pair<LoweringConfigAttr, int64_t>> configAndWgSize =
getMatmulLoweringConfigAndWorkgroupSize(
bounds, igemmContractionMaps.value(), igemmOperands.value(), target);
bounds, igemmContractionMaps.value(), igemmOperands.value(), target,
/*isIGEMM=*/true);
if (failed(configAndWgSize)) {
return failure();
}
Expand Down Expand Up @@ -422,7 +429,8 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
LDBG("Matmul TileAndFuse Config");

FailureOr<std::pair<LoweringConfigAttr, int64_t>> configAndWgSize =
getMatmulLoweringConfigAndWorkgroupSize(bounds, maps, operands, target);
getMatmulLoweringConfigAndWorkgroupSize(bounds, maps, operands, target,
/*isIGEMM=*/false);
if (failed(configAndWgSize)) {
return failure();
}
Expand Down
11 changes: 7 additions & 4 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
namespace mlir::iree_compiler {

llvm::cl::opt<bool> clGPUTestTileAndFuseMatmul(
"iree-codegen-llvmgpu-test-tile-and-fuse-matmul",
llvm::cl::opt<bool> clGPUEnableTileAndFuseMatmul(
"iree-codegen-llvmgpu-enable-tile-and-fuse-matmul",
llvm::cl::desc("test the the tile and fuse pipeline for matmul"),
llvm::cl::init(false));
llvm::cl::init(true));

llvm::cl::opt<bool> clGPUTestTileAndFuseVectorize(
"iree-codegen-llvmgpu-test-tile-and-fuse-vectorize",
Expand Down Expand Up @@ -620,6 +620,9 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
/*canUpcastAcc=*/true);
}

LDBG("transposedLhs: " << transposedLhs);
LDBG("transposedRhs: " << transposedRhs);

// Only batch_matmul is supported in the LLVMGPUPadAndVectorDistribute
// pipeline.
// TODO(hanchung): Support cases that there are fused producers.
Expand Down Expand Up @@ -2352,7 +2355,7 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
LDBG("Tile and fuse data tiled multi_mma config");
return success();
}
if (clGPUTestTileAndFuseMatmul) {
if (clGPUEnableTileAndFuseMatmul) {
if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn,
computeOp))) {
LDBG("Tile and fuse matmul config");
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
// RUN: --iree-codegen-llvmgpu-test-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
// RUN: --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
// RUN: --iree-codegen-llvmgpu-use-igemm=false \
// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s

Expand Down Expand Up @@ -39,7 +39,7 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
// CHECK-SAME: promote_operands = [0, 1]
// CHECK-SAME: reduction = [0, 0, 0, 0, 4]
// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
// CHECK-SAME: subgroup = [1, 1, 4, 1, 0]
// CHECK-SAME: workgroup = [1, 1, 64, 64, 0]

Expand Down Expand Up @@ -74,7 +74,7 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
// CHECK-SAME: promote_operands = [0, 1]
// CHECK-SAME: reduction = [0, 0, 0, 0, 4, 1]
// CHECK-SAME: reduction = [0, 0, 0, 0, 8, 1]
// CHECK-SAME: subgroup = [2, 2, 1, 1, 0, 0]
// CHECK-SAME: workgroup = [2, 2, 32, 32, 0, 0]

Expand Down Expand Up @@ -136,9 +136,9 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
// CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
// CHECK-SAME: promote_operands = [0, 1]
// CHECK-SAME: reduction = [0, 0, 2]
// CHECK-SAME: subgroup = [4, 4, 0]
// CHECK-SAME: workgroup = [128, 128, 0]
// CHECK-SAME: reduction = [0, 0, 4]
// CHECK-SAME: subgroup = [2, 4, 0]
// CHECK-SAME: workgroup = [64, 128, 0]

// -----

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --iree-codegen-llvmgpu-use-vector-distribution \
// RUN: --iree-codegen-llvmgpu-enable-tile-and-fuse-matmul=false \
// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s --check-prefix=WMMA

// TODO: This test is still using the legacy LLVMGPU kernel config. This needs
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --iree-codegen-llvmgpu-use-vector-distribution \
// RUN: --iree-codegen-llvmgpu-use-unaligned-gemm-vector-distribution --iree-codegen-llvmgpu-use-igemm=false \
// RUN: --iree-codegen-llvmgpu-enable-tile-and-fuse-matmul=false \
// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s

// TODO: This test is still using the legacy LLVMGPU kernel config. This needs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1013,10 +1013,8 @@ hal.executable public @main {
// CHECK: scf.yield %[[REDUCE]]

// CHECK: scf.for %{{.*}} = %{{.*}} to %c16 step %c1
// CHECK: scf.for
// CHECK-COUNT-4: arith.addf {{.*}} : vector<9xf32>
// CHECK: vector.transfer_write {{.*}} vector<9xi8>, memref<32x16x9x9xi8, #hal.descriptor_type<storage_buffer>>

// CHECK-COUNT-4: arith.addf {{.*}} : vector<9x9xf32>
// CHECK: vector.transfer_write {{.*}} vector<9x9xi8>, memref<32x16x9x9xi8, #hal.descriptor_type<storage_buffer>>
// -----

#pipeline_layout = #hal.pipeline.layout<bindings = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,14 @@ func.func @custom_op(%arg0 : tensor<384x512xf32>, %arg1 : tensor<512x128xf32>,
return %1 : tensor<384x128xf32>
}
// CHECK: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0]]>
// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64,
// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64,
// CHECK: func @custom_op
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: iree_linalg_ext.custom_op
// CHECK-SAME: lowering_config = #[[CONFIG]]
// CHECK: ^bb
// CHECK: linalg.matmul
// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>, promote_operands = [0, 1], reduction = [0, 0, 32], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>
// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>, promote_operands = [0, 1], reduction = [0, 0, 16], subgroup = [2, 2, 0], workgroup = [64, 64, 0]}>
// CHECK: iree_linalg_ext.yield

// -----
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline)" \
// RUN: --iree-gpu-test-target=sm_60 %s | FileCheck %s
// RUN: --iree-gpu-test-target=sm_60 --iree-codegen-llvmgpu-enable-tile-and-fuse-matmul=false %s | FileCheck %s
// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline)" \
// RUN: --iree-gpu-test-target=sm_80 %s | FileCheck %s --check-prefix=SM80
// RUN: --iree-gpu-test-target=sm_80 --iree-codegen-llvmgpu-enable-tile-and-fuse-matmul=false %s | FileCheck %s --check-prefix=SM80

// Transform dialect attributes are tested separately.

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-nvvm-pipeline)))' --iree-gpu-test-target=sm_80 -split-input-file %s -o - | FileCheck %s
// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant( \
// RUN: builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-nvvm-pipeline)))' \
// RUN: --iree-codegen-llvmgpu-enable-tile-and-fuse-matmul=false --iree-gpu-test-target=sm_80 -split-input-file %s -o - | FileCheck %s

// This test checks that the lowering of nvvm includes the extraction
// and optimization of address computations.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_80 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-nvvm-pipeline)))" -iree-codegen-llvmgpu-use-mma-sync %s | FileCheck %s
// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_80 \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant( \
// RUN: builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-nvvm-pipeline)))" \
// RUN: --iree-codegen-llvmgpu-enable-tile-and-fuse-matmul=false -iree-codegen-llvmgpu-use-mma-sync %s | FileCheck %s

// Verify that a simple element wise op gets lowered succefully all the way to
// nvvm/llvm dialect via mma.sync path.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-nvvm-pipeline)))" -iree-codegen-llvmgpu-use-wmma %s | FileCheck %s
// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_80 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-nvvm-pipeline)))" -iree-codegen-llvmgpu-use-wmma %s | FileCheck %s --check-prefix=SM80
// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant( \
// RUN: builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-nvvm-pipeline)))" \
// RUN: --iree-codegen-llvmgpu-enable-tile-and-fuse-matmul=false -iree-codegen-llvmgpu-use-wmma %s | FileCheck %s
// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_80 \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant( \
// RUN: builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-nvvm-pipeline)))" \
// RUN: --iree-codegen-llvmgpu-enable-tile-and-fuse-matmul=false -iree-codegen-llvmgpu-use-wmma %s | FileCheck %s --check-prefix=SM80

// Verify that a simple element wise op gets lowered succefully all the way to
// nvvm/llvm dialect.
Expand Down
Loading