From 5ba1755ec703bda880e876525e3effe72fd452ab Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Mon, 13 Jan 2025 09:28:01 -0600 Subject: [PATCH] give separate heuristics to IGEMM Signed-off-by: Nirvedh Meshram --- .../Dialect/GPU/TargetUtils/ConfigUtils.cpp | 20 ++++++++++--------- .../ROCDL/config_igemm_tile_and_fuse.mlir | 12 +++++------ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp index fe18a4bc6fad1..4f03ba8591ca4 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp @@ -118,7 +118,7 @@ LogicalResult setDataTiledMultiMmaLoweringConfig( /// problem based on the available mma intrinsics. static std::optional getMmaScheduleFromProblemAndTarget( IREE::GPU::TargetAttr target, GPUMatmulShapeType problem, - bool transposedLhs, bool transposedRhs, bool mustBeAligned = true, + bool transposedLhs, bool transposedRhs, bool isIGEMM, bool mustBeAligned = true, bool doCPromotion = false) { const int64_t targetSubgroupSize = target.getPreferredSubgroupSize(); SmallVector intrinsics; @@ -142,6 +142,8 @@ static std::optional getMmaScheduleFromProblemAndTarget( // See https://github.com/iree-org/iree/issues/16341 for details. int64_t mSize = ShapedType::getNumElements(problem.mSizes); int64_t nSize = ShapedType::getNumElements(problem.nSizes); + int64_t cacheLineSizeElements = kCacheLineSizeBits / inBitWidth; + int64_t bestKElementCountPerSubgroup = isIGEMM? cacheLineSizeElements / 2: cacheLineSizeElements; if (mSize * nSize <= 512 * 512) { // For matmuls with small M*N size, we want to distribute M*N onto more // workgroups to fill the GPU. 
Use a smaller bestMNTileCountPerSubgroup @@ -149,13 +151,13 @@ static std::optional getMmaScheduleFromProblemAndTarget( seeds = {/*bestSubgroupCountPerWorkgroup=*/4, /*bestMNTileCountPerSubgroup=*/4, /*bestKTileCountPerSubgroup=*/8, - /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits * 2 / - inBitWidth}; + bestKElementCountPerSubgroup*2}; } else { + int64_t bestKElementCountPerSubgroup = isIGEMM? cacheLineSizeElements /2 : cacheLineSizeElements; seeds = {/*bestSubgroupCountPerWorkgroup=*/4, /*bestMNTileCountPerSubgroup=*/16, /*bestKTileCountPerSubgroup=*/4, - /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / inBitWidth}; + bestKElementCountPerSubgroup}; } // We target slightly below the full available shared Memory to leave room for @@ -181,7 +183,7 @@ static FailureOr> getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, ArrayRef maps, ArrayRef operands, - IREE::GPU::TargetAttr target) { + IREE::GPU::TargetAttr target, bool isIGEMM) { if (target.getWgp().getMma().empty()) return failure(); @@ -249,7 +251,7 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, bool mustBeAligned = true; bool doCPromotion = false; std::optional schedule = getMmaScheduleFromProblemAndTarget( - target, problem, transposedLhs, transposedRhs); + target, problem, transposedLhs, transposedRhs, isIGEMM); // TODO (nirvedhmeshram, qedawkins): The performance with this will be bad if // the GEMM is accumulating (i.e doesnt have a zero fill dpsInit) as that @@ -260,7 +262,7 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, mustBeAligned = false; doCPromotion = true; schedule = getMmaScheduleFromProblemAndTarget(target, problem, - transposedLhs, transposedRhs, + transposedLhs, transposedRhs, isIGEMM, mustBeAligned, doCPromotion); } @@ -384,7 +386,7 @@ setIGEMMConvolutionLoweringConfig(IREE::GPU::TargetAttr target, SmallVector bounds = igemmLoopBounds.value(); FailureOr> configAndWgSize = getMatmulLoweringConfigAndWorkgroupSize( - bounds, 
igemmContractionMaps.value(), igemmOperands.value(), target); + bounds, igemmContractionMaps.value(), igemmOperands.value(), target, /*isIGEMM=*/true); if (failed(configAndWgSize)) { return failure(); } @@ -434,7 +436,7 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target, LDBG("Matmul TileAndFuse Config"); FailureOr> configAndWgSize = - getMatmulLoweringConfigAndWorkgroupSize(bounds, maps, operands, target); + getMatmulLoweringConfigAndWorkgroupSize(bounds, maps, operands, target, /*isIGEMM=*/false); if (failed(configAndWgSize)) { return failure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir index c1be57b949036..d8af22e58664c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir @@ -24,7 +24,7 @@ func.func @nhwc_conv_mfma() { // CHECK: linalg.conv_2d_nhwc_hwcf {{.*}}lowering_config = #iree_gpu.lowering_config // CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: promote_operands = [0, 1] -// CHECK-SAME: reduction = [0, 0, 0, 0, 16] +// CHECK-SAME: reduction = [0, 0, 0, 0, 8] // CHECK-SAME: subgroup = [1, 2, 2, 1, 0] // CHECK-SAME: workgroup = [1, 2, 32, 64, 0] @@ -53,7 +53,7 @@ func.func @nchw_conv_mfma() { // CHECK: linalg.conv_2d_nchw_fchw {{.*}}lowering_config = #iree_gpu.lowering_config // CHECK-SAME: mma_kind = #iree_gpu.mma_layout // CHECK-SAME: promote_operands = [0, 1] -// CHECK-SAME: reduction = [0, 0, 0, 0, 16] +// CHECK-SAME: reduction = [0, 0, 0, 0, 8] // CHECK-SAME: subgroup = [1, 2, 2, 1, 0] // CHECK-SAME: workgroup = [1, 64, 2, 32, 0] @@ -81,9 +81,9 @@ func.func @nhwc_conv_unaligned_mfma() { // CHECK: linalg.conv_2d_nhwc_hwcf {{.*}}lowering_config = #iree_gpu.lowering_config // CHECK-SAME: mma_kind = #iree_gpu.mma_layout -// CHECK-SAME: padding = 
[2, 1, 32, 64, 64] +// CHECK-SAME: padding = [2, 1, 32, 64, 32] // CHECK-SAME: promote_operands = [0, 1, 2] -// CHECK-SAME: reduction = [0, 0, 0, 0, 16] +// CHECK-SAME: reduction = [0, 0, 0, 0, 8] // CHECK-SAME: subgroup = [2, 1, 2, 1, 0] // CHECK-SAME: workgroup = [2, 1, 32, 64, 0] @@ -111,8 +111,8 @@ func.func @nchw_conv_unaligned_mfma() { // CHECK: linalg.conv_2d_nchw_fchw {{.*}}lowering_config = #iree_gpu.lowering_config // CHECK-SAME: mma_kind = #iree_gpu.mma_layout -// CHECK-SAME: padding = [1, 64, 2, 32, 64] +// CHECK-SAME: padding = [1, 64, 2, 32, 32] // CHECK-SAME: promote_operands = [0, 1, 2] -// CHECK-SAME: reduction = [0, 0, 0, 0, 16] +// CHECK-SAME: reduction = [0, 0, 0, 0, 8] // CHECK-SAME: subgroup = [1, 2, 2, 1, 0] // CHECK-SAME: workgroup = [1, 64, 2, 32, 0]