From 7a35663b3cedb92e61ea4c1311167828a329c7f0 Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Thu, 30 Jan 2025 11:49:07 -0600 Subject: [PATCH 1/7] [GPU] Only dont do padding for pure matvecs Signed-off-by: Nirvedh Meshram --- .../Codegen/Common/GPU/GPUHeuristics.cpp | 52 ++++++++++++------- .../Codegen/Common/GPU/GPUHeuristics.h | 18 ++++--- .../Dialect/GPU/TargetUtils/ConfigUtils.cpp | 15 ++++-- .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 12 ++--- .../test/ROCDL/config_tile_and_fuse.mlir | 10 ++-- .../compiler/Codegen/SPIRV/KernelConfig.cpp | 2 +- .../Preprocessing/Common/PadToIntrinsics.cpp | 6 +-- 7 files changed, 71 insertions(+), 44 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp index f8e30f31a961..ef1e01e1e45b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp @@ -21,6 +21,9 @@ using llvm::APIntOps::GreatestCommonDivisor; namespace mlir::iree_compiler { +// Threshold used to determine whether a matmul dimension is 'very skinny'. +constexpr int64_t kVerySkinnyDimThreshold = 4; + template static llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const llvm::SmallVectorImpl &vector) { @@ -77,17 +80,17 @@ calculateResultSharedMemoryUsedInBytes(const GPUMMASchedule &schedule, static bool isScheduleAligned(const GPUMatmulShapeType &problem, const GPUMMASchedule &schedule, bool mustBeAligned) { - SmallVector alignedMSizes(problem.mSizes); + SmallVector alignedMSizes(problem.mSizes); alignedMSizes.back() = mustBeAligned ? problem.mSizes.back() : llvm::divideCeil(problem.mSizes.back(), schedule.mSize) * schedule.mSize; - SmallVector alignedNSizes(problem.nSizes); + SmallVector alignedNSizes(problem.nSizes); alignedNSizes.back() = mustBeAligned ? problem.nSizes.back() : llvm::divideCeil(problem.nSizes.back(), schedule.nSize) * schedule.nSize; - SmallVector alignedKSizes(problem.kSizes); + SmallVector alignedKSizes(problem.kSizes); alignedKSizes.back() = mustBeAligned ? problem.kSizes.back() : llvm::divideCeil(problem.kSizes.back(), schedule.kSize) * @@ -106,7 +109,7 @@ static bool isScheduleAligned(const GPUMatmulShapeType &problem, }; // Checks whether the elements of `a` are evenly divisible by the // corresponding elements of `b`. - auto areAligned = [](SmallVector a, SmallVector b) { + auto areAligned = [](SmallVector a, SmallVector b) { for (auto [aVal, bVal] : llvm::zip_equal(a, b)) { if (aVal % bVal != 0) { return false; @@ -223,6 +226,7 @@ static FailureOr fitScheduleInSharedMemory( static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem, const GPUMatmulShapeType &intrinsic, + int64_t preferredSubgroupSize, bool canUpcastAcc, bool mustBeAligned) { assert(intrinsic.mSizes.size() == 1 && intrinsic.nSizes.size() == 1 && intrinsic.kSizes.size() == 1 && @@ -240,12 +244,19 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem, } } - if (mustBeAligned && (problem.mSizes.back() % intrinsic.mSizes[0] != 0 || - problem.nSizes.back() % intrinsic.nSizes[0] != 0 || - problem.kSizes.back() % intrinsic.kSizes[0] != 0)) { - return failure(); // Cannot use this intrinsic for misaligned cases. 
+ if (mustBeAligned) { + if ((problem.mSizes.back() % intrinsic.mSizes[0] != 0 || + problem.nSizes.back() % intrinsic.nSizes[0] != 0 || + problem.kSizes.back() % intrinsic.kSizes[0] != 0)) { + return failure(); + } + return success(); } + // Send very skinny, {2-4}xNxK and Mx{2-4}xK, matmuls to the vector reduction + // pipeline, similar to matvec. Note: Because of reassociation in the vector + // reduction pipeline, this may lead to precission loss. If this ever becomes + // an issue, we can hide this behind a flag. // TODO: Figure out what the precise cutoff is, this may be machine dependent. // In situation when alignment isn't required, we disallow intrinsics to be // picked if the tile size is too small. For example, this will force a matmul @@ -255,10 +266,15 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem, // established after we sweep the different tile sizes for a problem config. // Once a precise threshold is established, replace 4 with the threshold and // remove this todo. - if (!mustBeAligned && - (problem.mSizes.back() < 4 || problem.nSizes.back() < 4 || - problem.kSizes.back() < 4)) { - return failure(); + if (llvm::all_equal({problem.mSizes.size(), problem.nSizes.size(), + problem.kSizes.size(), size_t{1}}) && + problem.batchSizes.empty()) { + int64_t mSize = problem.mSizes.back(); + int64_t nSize = problem.nSizes.back(); + if ((mSize <= kVerySkinnyDimThreshold && (nSize > preferredSubgroupSize)) || + (nSize <= kVerySkinnyDimThreshold && (mSize > preferredSubgroupSize))) { + return failure(); + } } return success(); } @@ -279,8 +295,8 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem, // 16x16x16 intrinsic, then: // - mTotalTileCounts would be 4 * (16/16) = 4 // - nTotalTileCounts would be 2 * (32/16) = 4 - SmallVector mTotalTileCounts = problem.mSizes; - SmallVector nTotalTileCounts = problem.nSizes; + SmallVector mTotalTileCounts = problem.mSizes; + SmallVector nTotalTileCounts = problem.nSizes; mTotalTileCounts.back() = llvm::divideCeil(problem.mSizes.back(), intrinsic.mSizes[0]); nTotalTileCounts.back() = @@ -361,7 +377,7 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem, // For the problem described above {M:[4, 16], N:[2, 32], K[3, 128]} with a // 16x16x16 intrinsic, then: // - kTotalTileCounts would be 3 * (128/16) = 24 - SmallVector kTotalTileCounts = problem.kSizes; + SmallVector kTotalTileCounts = problem.kSizes; kTotalTileCounts.back() = llvm::divideCeil(problem.kSizes.back(), intrinsic.kSizes[0]); // Compute the ideal number of intrinsics along K per subgroup based on the @@ -396,7 +412,7 @@ FailureOr deduceMMASchedule( bool canUpcastAcc, bool mustBeAligned, bool doCPromotion) { for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) { if (failed(canTargetIntrinsic(problem, intrinsic, canUpcastAcc, - mustBeAligned))) { + subgroupSize, mustBeAligned))) { continue; } @@ -451,12 +467,12 @@ FailureOr deduceAttentionSchedule( "unimplemented: multi M/N/K attention schedule"); for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) { if (failed(canTargetIntrinsic(qkMatmul, intrinsic, canUpcastAcc, - mustBeAligned))) { + subgroupSize, mustBeAligned))) { continue; } if (failed(canTargetIntrinsic(pvMatmul, intrinsic, canUpcastAcc, - mustBeAligned))) { + subgroupSize, mustBeAligned))) { continue; } diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h index a35e2b464632..f09edd729952 100644 
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h @@ -10,18 +10,22 @@ namespace mlir::iree_compiler { /// Struct containing information about a matmul's shape and type. struct GPUMatmulShapeType { - SmallVector mSizes; - SmallVector nSizes; - SmallVector kSizes; + SmallVector mSizes; + SmallVector nSizes; + SmallVector kSizes; + SmallVector batchSizes; Type aType; Type bType; Type cType; GPUMatmulShapeType(int64_t m, int64_t n, int64_t k, Type a, Type b, Type c) - : mSizes({m}), nSizes({n}), kSizes({k}), aType(a), bType(b), cType(c) {} - GPUMatmulShapeType(SmallVector m, SmallVector n, - SmallVector k, Type a, Type b, Type c) - : mSizes(m), nSizes(n), kSizes(k), aType(a), bType(b), cType(c) {} + : mSizes({m}), nSizes({n}), kSizes({k}), batchSizes({}), aType(a), + bType(b), cType(c) {} + GPUMatmulShapeType(SmallVector m, SmallVector n, + SmallVector k, SmallVector batch, + Type a, Type b, Type c) + : mSizes(m), nSizes(n), kSizes(k), batchSizes(batch), aType(a), bType(b), + cType(c) {} }; /// Struct containing seed tile sizes for GPU MMA heuristics deduction logic. diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp index 7b62d6955b10..6533115e8b49 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp @@ -121,7 +121,7 @@ static std::optional getMmaScheduleFromProblemAndTarget( bool transposedLhs, bool transposedRhs, bool mustBeAligned = true, bool doCPromotion = false) { const int64_t targetSubgroupSize = target.getPreferredSubgroupSize(); - SmallVector intrinsics; + SmallVector intrinsics; for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { // Intrinsics that do not specify a scope cannot be distributed. if (failed(mma.getMmaScope())) @@ -202,7 +202,7 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, // Gather all static M, N, and K dimensions to deduce the MMASchedule. Dynamic // dimensions will be tiled to 1 in workgroup tiling, so they are ignored when // computing an MMA schedule. - SmallVector mDims, nDims, kDims; + SmallVector mDims, nDims, kDims, batchDims; for (auto mDim : contractionDims.m) { if (!ShapedType::isDynamic(bounds[mDim])) { mDims.push_back(mDim); @@ -219,6 +219,12 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, } } + for (auto batchDim : contractionDims.batch) { + if (!ShapedType::isDynamic(bounds[batchDim])) { + batchDims.push_back(batchDim); + } + } + auto getDimBounds = [&](SmallVector dims) -> SmallVector { return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; }); }; @@ -233,8 +239,9 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, Type initElemType = getElementTypeOrSelf(init); GPUMatmulShapeType problem{getDimBounds(mDims), getDimBounds(nDims), - getDimBounds(kDims), lhsElemType, - rhsElemType, initElemType}; + getDimBounds(kDims), getDimBounds(batchDims), + lhsElemType, rhsElemType, + initElemType}; // Infer if lhs or rhs is transposed to help generate better schedule. // TODO: Drop this. This is only a consideration for other pipelines. 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index 2ee7e8241c17..1a19b78677fd 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -339,7 +339,7 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target, // Helper fn to store mma information. auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma, - SmallVector &intrinsics, + SmallVector &intrinsics, SmallVector &mmaKinds) { auto [mSize, nSize, kSize] = mma.getMNKShape(); auto [aType, bType, cType] = mma.getABCElementTypes(); @@ -347,7 +347,7 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target, mmaKinds.emplace_back(mma); }; - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); SmallVector mmaKinds; MLIRContext *context = op.getContext(); @@ -546,7 +546,7 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, // Helper fn to store mma information. auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma, - SmallVector &intrinsics, + SmallVector &intrinsics, SmallVector &mmaKinds) { auto [mSize, nSize, kSize] = mma.getMNKShape(); auto [aType, bType, cType] = mma.getABCElementTypes(); @@ -554,7 +554,7 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, mmaKinds.emplace_back(mma); }; - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); SmallVector mmaKinds; MLIRContext *context = op.getContext(); @@ -766,7 +766,7 @@ static LogicalResult setAttentionIntrinsicBasedVectorDistributionConfig( // Helper fn to store mma information. auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma, - SmallVector &intrinsics, + SmallVector &intrinsics, SmallVector &mmaKinds) { auto [mSize, nSize, kSize] = mma.getMNKShape(); auto [aType, bType, cType] = mma.getABCElementTypes(); @@ -774,7 +774,7 @@ static LogicalResult setAttentionIntrinsicBasedVectorDistributionConfig( mmaKinds.emplace_back(mma); }; - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); SmallVector mmaKinds; MLIRContext *context = op.getContext(); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir index 910eca3c1768..ec6038f47dee 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir @@ -282,12 +282,12 @@ module { // ----- module { -func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> { +func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x2x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x2x577xf32> { %c0 = arith.constant 0.0 : f32 - %empty = tensor.empty() : tensor<12x577x577xf32> - %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> - %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x577x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> - return %mm : tensor<12x577x577xf32> + %empty = tensor.empty() : tensor<12x2x577xf32> + %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x2x577xf32>) -> tensor<12x2x577xf32> + %mm = linalg.batch_matmul ins(%lhs, %rhs : 
tensor<12x2x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x2x577xf32>) -> tensor<12x2x577xf32> + return %mm : tensor<12x2x577xf32> } } diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp index bbdec5c83f6d..1ef0e2ceb311 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp @@ -891,7 +891,7 @@ setCooperativeMatrixConfig(IREE::GPU::TargetAttr target, linalg::LinalgOp op, // just the first element. GPUMatmulShapeType problem(dimM, dimN, dimK, lhsElem, rhsElem, initElem); - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { auto [mSize, nSize, kSize] = mma.getMNKShape(); diff --git a/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp b/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp index 922e50882775..d4b485ac096c 100644 --- a/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp +++ b/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp @@ -145,7 +145,7 @@ expandMapsAndIterators(SmallVector &expandedMaps, } } -static SmallVector +static SmallVector getIntrinsics(linalg::LinalgOp linalgOp, ArrayRef executableTargets) { IREE::GPU::TargetAttr target; @@ -179,7 +179,7 @@ padConvOp(RewriterBase &rewriter, linalg::LinalgOp linalgOp, return; // Early exit if cannot find intrinsics or if multiple executable targets. - SmallVector intrinsics = + SmallVector intrinsics = getIntrinsics(linalgOp, executableTargets); if (intrinsics.empty()) return; @@ -326,7 +326,7 @@ static void padContractionLikeOp( } // Early exit if cannot find intrinsics or if multiple executable targets. - SmallVector intrinsics = + SmallVector intrinsics = getIntrinsics(linalgOp, executableTargets); if (intrinsics.empty()) return; From 9911ef44569060a9b1258a54b61b2f47f893956a Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Fri, 31 Jan 2025 14:15:56 -0600 Subject: [PATCH 2/7] add batch in vector distribute Signed-off-by: Nirvedh Meshram --- .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index 1a19b78677fd..b5d03c20a970 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -536,13 +536,24 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, rhsElemType = getElementTypeOrSelf(rhsOp.getDpsInputs()[0]); } + SmallVector batchDims; + for (auto batchDim : contractionDims->batch) { + if (!ShapedType::isDynamic(bounds[batchDim])) { + batchDims.push_back(batchDim); + } + } + auto getDimBounds = [&](SmallVector dims) -> SmallVector { + return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; }); + }; + // TODO(Max191): Support multiple M/N/K dimension problems for MMASchedules // once the pipeline is able to support it. After adding multiple dimensions, // all instances of schedule->m/nSubgroupCounts[0] and // schedule->m/n/kTileSizes[0] need to use the full list of sizes instead of // just the first element. 
- GPUMatmulShapeType problem{bounds[mDim], bounds[nDim], bounds[kDim], - lhsElemType, rhsElemType, initElemType}; + GPUMatmulShapeType problem{ + {bounds[mDim]}, {bounds[nDim]}, {bounds[kDim]}, getDimBounds(batchDims), + lhsElemType, rhsElemType, initElemType}; // Helper fn to store mma information. auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma, From 84cc41df6a29bfc6b05b0ff131496e49b1bcdded Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Mon, 3 Feb 2025 14:57:52 -0600 Subject: [PATCH 3/7] address reviwer comments Signed-off-by: Nirvedh Meshram --- .../src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp | 4 +--- .../src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp index ef1e01e1e45b..ff20a3eed216 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp @@ -254,9 +254,7 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem, } // Send very skinny, {2-4}xNxK and Mx{2-4}xK, matmuls to the vector reduction - // pipeline, similar to matvec. Note: Because of reassociation in the vector - // reduction pipeline, this may lead to precission loss. If this ever becomes - // an issue, we can hide this behind a flag. + // pipeline, similar to matvec. // TODO: Figure out what the precise cutoff is, this may be machine dependent. // In situation when alignment isn't required, we disallow intrinsics to be // picked if the tile size is too small. For example, this will force a matmul diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h index f09edd729952..6542d11ebf18 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h @@ -21,9 +21,9 @@ struct GPUMatmulShapeType { GPUMatmulShapeType(int64_t m, int64_t n, int64_t k, Type a, Type b, Type c) : mSizes({m}), nSizes({n}), kSizes({k}), batchSizes({}), aType(a), bType(b), cType(c) {} - GPUMatmulShapeType(SmallVector m, SmallVector n, - SmallVector k, SmallVector batch, - Type a, Type b, Type c) + GPUMatmulShapeType(ArrayRef m, ArrayRef n, + ArrayRef k, ArrayRef batch, Type a, + Type b, Type c) : mSizes(m), nSizes(n), kSizes(k), batchSizes(batch), aType(a), bType(b), cType(c) {} }; From 9e0462b7ed0e284e95176ff69e13d2ae727ce584 Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Mon, 3 Feb 2025 15:33:01 -0600 Subject: [PATCH 4/7] Reviwer comments Signed-off-by: Nirvedh Meshram --- .../Dialect/GPU/TargetUtils/ConfigUtils.cpp | 10 +++++----- .../iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp | 14 +++++++------- .../iree/compiler/Codegen/SPIRV/KernelConfig.cpp | 2 +- .../Preprocessing/Common/PadToIntrinsics.cpp | 6 +++--- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp index 6533115e8b49..48bfcc9a7c2a 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp @@ -121,7 +121,7 @@ static std::optional getMmaScheduleFromProblemAndTarget( bool transposedLhs, bool transposedRhs, bool mustBeAligned 
= true, bool doCPromotion = false) { const int64_t targetSubgroupSize = target.getPreferredSubgroupSize(); - SmallVector intrinsics; + SmallVector intrinsics; for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { // Intrinsics that do not specify a scope cannot be distributed. if (failed(mma.getMmaScope())) @@ -203,23 +203,23 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, // dimensions will be tiled to 1 in workgroup tiling, so they are ignored when // computing an MMA schedule. SmallVector mDims, nDims, kDims, batchDims; - for (auto mDim : contractionDims.m) { + for (int64_t mDim : contractionDims.m) { if (!ShapedType::isDynamic(bounds[mDim])) { mDims.push_back(mDim); } } - for (auto nDim : contractionDims.n) { + for (int64_t nDim : contractionDims.n) { if (!ShapedType::isDynamic(bounds[nDim])) { nDims.push_back(nDim); } } - for (auto kDim : contractionDims.k) { + for (int64_t kDim : contractionDims.k) { if (!ShapedType::isDynamic(bounds[kDim])) { kDims.push_back(kDim); } } - for (auto batchDim : contractionDims.batch) { + for (int64_t batchDim : contractionDims.batch) { if (!ShapedType::isDynamic(bounds[batchDim])) { batchDims.push_back(batchDim); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index b5d03c20a970..8833b4203156 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -339,7 +339,7 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target, // Helper fn to store mma information. auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma, - SmallVector &intrinsics, + SmallVector &intrinsics, SmallVector &mmaKinds) { auto [mSize, nSize, kSize] = mma.getMNKShape(); auto [aType, bType, cType] = mma.getABCElementTypes(); @@ -347,7 +347,7 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target, mmaKinds.emplace_back(mma); }; - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); SmallVector mmaKinds; MLIRContext *context = op.getContext(); @@ -537,7 +537,7 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, } SmallVector batchDims; - for (auto batchDim : contractionDims->batch) { + for (int64_t batchDim : contractionDims->batch) { if (!ShapedType::isDynamic(bounds[batchDim])) { batchDims.push_back(batchDim); } @@ -557,7 +557,7 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, // Helper fn to store mma information. auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma, - SmallVector &intrinsics, + SmallVector &intrinsics, SmallVector &mmaKinds) { auto [mSize, nSize, kSize] = mma.getMNKShape(); auto [aType, bType, cType] = mma.getABCElementTypes(); @@ -565,7 +565,7 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, mmaKinds.emplace_back(mma); }; - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); SmallVector mmaKinds; MLIRContext *context = op.getContext(); @@ -777,7 +777,7 @@ static LogicalResult setAttentionIntrinsicBasedVectorDistributionConfig( // Helper fn to store mma information. 
auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma, - SmallVector &intrinsics, + SmallVector &intrinsics, SmallVector &mmaKinds) { auto [mSize, nSize, kSize] = mma.getMNKShape(); auto [aType, bType, cType] = mma.getABCElementTypes(); @@ -785,7 +785,7 @@ static LogicalResult setAttentionIntrinsicBasedVectorDistributionConfig( mmaKinds.emplace_back(mma); }; - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); SmallVector mmaKinds; MLIRContext *context = op.getContext(); diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp index 1ef0e2ceb311..bbdec5c83f6d 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp @@ -891,7 +891,7 @@ setCooperativeMatrixConfig(IREE::GPU::TargetAttr target, linalg::LinalgOp op, // just the first element. GPUMatmulShapeType problem(dimM, dimN, dimK, lhsElem, rhsElem, initElem); - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { auto [mSize, nSize, kSize] = mma.getMNKShape(); diff --git a/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp b/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp index d4b485ac096c..922e50882775 100644 --- a/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp +++ b/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp @@ -145,7 +145,7 @@ expandMapsAndIterators(SmallVector &expandedMaps, } } -static SmallVector +static SmallVector getIntrinsics(linalg::LinalgOp linalgOp, ArrayRef executableTargets) { IREE::GPU::TargetAttr target; @@ -179,7 +179,7 @@ padConvOp(RewriterBase &rewriter, linalg::LinalgOp linalgOp, return; // Early exit if cannot find intrinsics or if multiple executable targets. - SmallVector intrinsics = + SmallVector intrinsics = getIntrinsics(linalgOp, executableTargets); if (intrinsics.empty()) return; @@ -326,7 +326,7 @@ static void padContractionLikeOp( } // Early exit if cannot find intrinsics or if multiple executable targets. 
- SmallVector intrinsics = + SmallVector intrinsics = getIntrinsics(linalgOp, executableTargets); if (intrinsics.empty()) return; From 97e9cb4c1f662ff0a7472c3f4a9c606edeaf09ec Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Mon, 3 Feb 2025 16:39:26 -0600 Subject: [PATCH 5/7] bump as CI seems stuck Signed-off-by: Nirvedh Meshram From f6136345b1e88de1ef6dbcec62c6a6c42ed2b1cc Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Mon, 3 Feb 2025 17:09:54 -0600 Subject: [PATCH 6/7] fix new test --- .../src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir index baaed8c1f81e..5c4fe8ac991c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir @@ -343,11 +343,11 @@ func.func @not_vmt() { return } -// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info}> +// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info}> // CHECK: func.func @not_vmt() // CHECK-SAME: translation_info = #[[$TRANSLATION]] // CHECK: linalg.generic -// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 8], thread = [1, 128, 0], workgroup = [1, 128, 1]}> +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, padding = [16, 256, 64], promote_operands = [0, 1, 2], reduction = [0, 0, 4], subgroup = [1, 4, 0], workgroup = [16, 256, 0]}> // ----- From e1c32173528e32e5e786b118d6ba3c12996e5214 Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Mon, 3 Feb 2025 17:47:42 -0600 Subject: [PATCH 7/7] fix bug in cantargetintrinisic ordering Signed-off-by: Nirvedh Meshram --- .../compiler/Codegen/Common/GPU/GPUHeuristics.cpp | 12 ++++++------ .../compiler/Codegen/LLVMGPU/test/config_matvec.mlir | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp index ff20a3eed216..669d1d1eb539 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp @@ -409,8 +409,8 @@ FailureOr deduceMMASchedule( int64_t subgroupSize, bool transposedLhs, bool transposedRhs, bool canUpcastAcc, bool mustBeAligned, bool doCPromotion) { for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) { - if (failed(canTargetIntrinsic(problem, intrinsic, canUpcastAcc, - subgroupSize, mustBeAligned))) { + if (failed(canTargetIntrinsic(problem, intrinsic, subgroupSize, + canUpcastAcc, mustBeAligned))) { continue; } @@ -464,13 +464,13 @@ FailureOr deduceAttentionSchedule( qkMatmul.nSizes.size() == 1 && qkMatmul.kSizes.size() == 1 && "unimplemented: multi M/N/K attention schedule"); for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) { - if (failed(canTargetIntrinsic(qkMatmul, intrinsic, canUpcastAcc, - subgroupSize, mustBeAligned))) { + if (failed(canTargetIntrinsic(qkMatmul, intrinsic, subgroupSize, + canUpcastAcc, mustBeAligned))) { continue; } - if (failed(canTargetIntrinsic(pvMatmul, intrinsic, canUpcastAcc, - subgroupSize, mustBeAligned))) { + if (failed(canTargetIntrinsic(pvMatmul, intrinsic, subgroupSize, + canUpcastAcc, mustBeAligned))) { continue; } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir 
b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir index 5c4fe8ac991c..baaed8c1f81e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir @@ -343,11 +343,11 @@ func.func @not_vmt() { return } -// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info}> +// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info}> // CHECK: func.func @not_vmt() // CHECK-SAME: translation_info = #[[$TRANSLATION]] // CHECK: linalg.generic -// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, padding = [16, 256, 64], promote_operands = [0, 1, 2], reduction = [0, 0, 4], subgroup = [1, 4, 0], workgroup = [16, 256, 0]}> +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 8], thread = [1, 128, 0], workgroup = [1, 128, 1]}> // -----
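
For reference, the gating logic these patches converge on can be illustrated with a small standalone sketch. The predicate below mirrors the unaligned-case check added to canTargetIntrinsic in patch 1, together with the kVerySkinnyDimThreshold constant it introduces; the sendToVectorReduction helper, the main() driver, and the sample shapes and subgroup size are illustrative only and are not part of the patch series. Note that batched problems (non-empty batchSizes) and problems with multiple M/N/K dimensions bypass this check entirely and remain eligible for MMA intrinsics with padding, which is why the 12x2x577x577 batched matmul test above stays on the tile-and-fuse path.

#include <cstdint>
#include <iostream>

// Mirrors the constant introduced in GPUHeuristics.cpp by patch 1.
constexpr int64_t kVerySkinnyDimThreshold = 4;

// Simplified form of the new check in canTargetIntrinsic for shapes that are
// not aligned to the intrinsic: a rank-1 M/N/K problem with no batch
// dimensions is rejected for MMA intrinsics (and thus routed to the vector
// reduction pipeline, like a matvec) when one of M/N is very skinny while the
// other exceeds the preferred subgroup size. The helper name and the driver
// below are illustrative, not IREE APIs.
bool sendToVectorReduction(int64_t mSize, int64_t nSize,
                           int64_t preferredSubgroupSize) {
  return (mSize <= kVerySkinnyDimThreshold && nSize > preferredSubgroupSize) ||
         (nSize <= kVerySkinnyDimThreshold && mSize > preferredSubgroupSize);
}

int main() {
  constexpr int64_t subgroupSize = 64; // example value only
  // 2x577x577 matmul (unaligned, no batch dims): skinny M, wide N
  // -> reject the intrinsic, use vector reduction.
  std::cout << sendToVectorReduction(2, 577, subgroupSize) << "\n";   // 1
  // 577x577x577 matmul (unaligned): neither parallel dim is skinny
  // -> the MMA intrinsic remains a candidate (with padding).
  std::cout << sendToVectorReduction(577, 577, subgroupSize) << "\n"; // 0
  // 2x4x577 matmul: both M and N are small, so the gate does not fire
  // because neither dimension exceeds the subgroup size.
  std::cout << sendToVectorReduction(2, 4, subgroupSize) << "\n";     // 0
  return 0;
}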