[GPU] Match Tile And Fuse skinny matmul bail-out to Vector Distribute (#19857)

This PR matches the Tile and Fuse failure criteria for bailing out on skinny
matmuls to what we use in SetContractConfig for the Vector Distribute
pipeline.

With this change, the dispatch in #19855 runs in 0.068 ms versus 1.64 ms on
the default path: this skinny matmul with multiple dims cannot currently be
supported by the vector reduction and warp reduction pipelines, but Tile and
Fuse can support it using padding.

This requires the flag `--iree-codegen-llvmgpu-test-tile-and-fuse-matmul=true`.
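For context, the bail-out condition being matched boils down to the check below (a minimal C++ sketch, not the exact IREE code; `isVerySkinnyMatmul` is a hypothetical helper name, and in the PR the check only fires for unaligned, non-batched problems with a single M/N/K dimension):

```cpp
#include <cstdint>

// Threshold introduced by this PR in GPUHeuristics.cpp.
constexpr int64_t kVerySkinnyDimThreshold = 4;

// A matmul counts as "very skinny" when one of M/N is at most the threshold
// while the other exceeds the preferred subgroup size; such shapes are left
// to the warp/vector reduction pipelines instead of an MMA intrinsic schedule.
static bool isVerySkinnyMatmul(int64_t mSize, int64_t nSize,
                               int64_t preferredSubgroupSize) {
  return (mSize <= kVerySkinnyDimThreshold && nSize > preferredSubgroupSize) ||
         (nSize <= kVerySkinnyDimThreshold && mSize > preferredSubgroupSize);
}
```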

Additionally, `GPUMatmulShapeType` was becoming too large because this PR adds
batch sizes to it, which triggered the following error:
```
 error: static_assert failed due to requirement 'sizeof(mlir::iree_compiler::GPUMatmulShapeType) <= 256' "You are trying to use a default number of inlined elements for SmallVector<T> but sizeof(T) is really big! Please use an explicit number of inlined elements with SmallVector<T, N> to make sure you really want that much inline storage."
```
This PR fixes that issue by explicitly specifying the number of inlined elements for the `SmallVector` struct members.
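
To illustrate why explicit inline sizes help (a standalone sketch assuming typical 64-bit layouts, not the actual `GPUMatmulShapeType` definition; `ShapeWithExplicitSizes` is a made-up name): a `SmallVector<int64_t>` with the default inline element count is roughly 64 bytes, so four of them plus three `Type` fields push the struct past the 256-byte limit enforced by the `static_assert` above, while `SmallVector<int64_t, 2>` keeps each member at a header plus two inline elements and the whole struct comfortably under the limit.

```cpp
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/Types.h"

// Explicit inline size of 2: each vector is its 16-byte header plus
// 2 * sizeof(int64_t) of inline storage, instead of the ~64 bytes the
// default inline element count would reserve.
struct ShapeWithExplicitSizes {
  llvm::SmallVector<int64_t, 2> mSizes, nSizes, kSizes, batchSizes;
  mlir::Type aType, bType, cType;
};

// Mirrors the 256-byte bound checked when this struct is itself placed in a
// default-sized SmallVector.
static_assert(sizeof(ShapeWithExplicitSizes) <= 256,
              "small enough for SmallVector's default inline-size heuristic");
```

The diff below applies the same idea directly to the `GPUMatmulShapeType` members.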

---------

Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
nirvedhmeshram authored Feb 4, 2025
1 parent 1ed6350 commit d96a3f0
Showing 5 changed files with 77 additions and 41 deletions.
56 changes: 35 additions & 21 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
@@ -21,6 +21,9 @@ using llvm::APIntOps::GreatestCommonDivisor;

namespace mlir::iree_compiler {

// Threshold used to determine whether a matmul dimension is 'very skinny'.
constexpr int64_t kVerySkinnyDimThreshold = 4;

template <typename T>
static llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
const llvm::SmallVectorImpl<T> &vector) {
@@ -77,17 +80,17 @@ calculateResultSharedMemoryUsedInBytes(const GPUMMASchedule &schedule,
static bool isScheduleAligned(const GPUMatmulShapeType &problem,
const GPUMMASchedule &schedule,
bool mustBeAligned) {
SmallVector<int64_t> alignedMSizes(problem.mSizes);
SmallVector<int64_t, 2> alignedMSizes(problem.mSizes);
alignedMSizes.back() =
mustBeAligned ? problem.mSizes.back()
: llvm::divideCeil(problem.mSizes.back(), schedule.mSize) *
schedule.mSize;
SmallVector<int64_t> alignedNSizes(problem.nSizes);
SmallVector<int64_t, 2> alignedNSizes(problem.nSizes);
alignedNSizes.back() =
mustBeAligned ? problem.nSizes.back()
: llvm::divideCeil(problem.nSizes.back(), schedule.nSize) *
schedule.nSize;
SmallVector<int64_t> alignedKSizes(problem.kSizes);
SmallVector<int64_t, 2> alignedKSizes(problem.kSizes);
alignedKSizes.back() =
mustBeAligned ? problem.kSizes.back()
: llvm::divideCeil(problem.kSizes.back(), schedule.kSize) *
@@ -106,7 +109,7 @@ static bool isScheduleAligned(const GPUMatmulShapeType &problem,
};
// Checks whether the elements of `a` are evenly divisible by the
// corresponding elements of `b`.
auto areAligned = [](SmallVector<int64_t> a, SmallVector<int64_t> b) {
auto areAligned = [](SmallVector<int64_t, 2> a, SmallVector<int64_t, 2> b) {
for (auto [aVal, bVal] : llvm::zip_equal(a, b)) {
if (aVal % bVal != 0) {
return false;
@@ -223,6 +226,7 @@ static FailureOr<GPUMMASchedule> fitScheduleInSharedMemory(

static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
const GPUMatmulShapeType &intrinsic,
int64_t preferredSubgroupSize,
bool canUpcastAcc, bool mustBeAligned) {
assert(intrinsic.mSizes.size() == 1 && intrinsic.nSizes.size() == 1 &&
intrinsic.kSizes.size() == 1 &&
@@ -240,12 +244,17 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
}
}

if (mustBeAligned && (problem.mSizes.back() % intrinsic.mSizes[0] != 0 ||
problem.nSizes.back() % intrinsic.nSizes[0] != 0 ||
problem.kSizes.back() % intrinsic.kSizes[0] != 0)) {
return failure(); // Cannot use this intrinsic for misaligned cases.
if (mustBeAligned) {
if ((problem.mSizes.back() % intrinsic.mSizes[0] != 0 ||
problem.nSizes.back() % intrinsic.nSizes[0] != 0 ||
problem.kSizes.back() % intrinsic.kSizes[0] != 0)) {
return failure();
}
return success();
}

// Send very skinny, {2-4}xNxK and Mx{2-4}xK, matmuls to the vector reduction
// pipeline, similar to matvec.
// TODO: Figure out what the precise cutoff is, this may be machine dependent.
// In situation when alignment isn't required, we disallow intrinsics to be
// picked if the tile size is too small. For example, this will force a matmul
@@ -255,10 +264,15 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
// established after we sweep the different tile sizes for a problem config.
// Once a precise threshold is established, replace 4 with the threshold and
// remove this todo.
if (!mustBeAligned &&
(problem.mSizes.back() < 4 || problem.nSizes.back() < 4 ||
problem.kSizes.back() < 4)) {
return failure();
if (llvm::all_equal({problem.mSizes.size(), problem.nSizes.size(),
problem.kSizes.size(), size_t{1}}) &&
problem.batchSizes.empty()) {
int64_t mSize = problem.mSizes.back();
int64_t nSize = problem.nSizes.back();
if ((mSize <= kVerySkinnyDimThreshold && (nSize > preferredSubgroupSize)) ||
(nSize <= kVerySkinnyDimThreshold && (mSize > preferredSubgroupSize))) {
return failure();
}
}
return success();
}
Expand All @@ -279,8 +293,8 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
// 16x16x16 intrinsic, then:
// - mTotalTileCounts would be 4 * (16/16) = 4
// - nTotalTileCounts would be 2 * (32/16) = 4
SmallVector<int64_t> mTotalTileCounts = problem.mSizes;
SmallVector<int64_t> nTotalTileCounts = problem.nSizes;
SmallVector<int64_t, 2> mTotalTileCounts = problem.mSizes;
SmallVector<int64_t, 2> nTotalTileCounts = problem.nSizes;
mTotalTileCounts.back() =
llvm::divideCeil(problem.mSizes.back(), intrinsic.mSizes[0]);
nTotalTileCounts.back() =
@@ -361,7 +375,7 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
// For the problem described above {M:[4, 16], N:[2, 32], K[3, 128]} with a
// 16x16x16 intrinsic, then:
// - kTotalTileCounts would be 3 * (128/16) = 24
SmallVector<int64_t> kTotalTileCounts = problem.kSizes;
SmallVector<int64_t, 2> kTotalTileCounts = problem.kSizes;
kTotalTileCounts.back() =
llvm::divideCeil(problem.kSizes.back(), intrinsic.kSizes[0]);
// Compute the ideal number of intrinsics along K per subgroup based on the
@@ -395,8 +409,8 @@ FailureOr<GPUMMASchedule> deduceMMASchedule(
int64_t subgroupSize, bool transposedLhs, bool transposedRhs,
bool canUpcastAcc, bool mustBeAligned, bool doCPromotion) {
for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
if (failed(canTargetIntrinsic(problem, intrinsic, canUpcastAcc,
mustBeAligned))) {
if (failed(canTargetIntrinsic(problem, intrinsic, subgroupSize,
canUpcastAcc, mustBeAligned))) {
continue;
}

@@ -450,13 +464,13 @@ FailureOr<GPUMMASchedule> deduceAttentionSchedule(
qkMatmul.nSizes.size() == 1 && qkMatmul.kSizes.size() == 1 &&
"unimplemented: multi M/N/K attention schedule");
for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
if (failed(canTargetIntrinsic(qkMatmul, intrinsic, canUpcastAcc,
mustBeAligned))) {
if (failed(canTargetIntrinsic(qkMatmul, intrinsic, subgroupSize,
canUpcastAcc, mustBeAligned))) {
continue;
}

if (failed(canTargetIntrinsic(pvMatmul, intrinsic, canUpcastAcc,
mustBeAligned))) {
if (failed(canTargetIntrinsic(pvMatmul, intrinsic, subgroupSize,
canUpcastAcc, mustBeAligned))) {
continue;
}

18 changes: 11 additions & 7 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
@@ -10,18 +10,22 @@ namespace mlir::iree_compiler {

/// Struct containing information about a matmul's shape and type.
struct GPUMatmulShapeType {
SmallVector<int64_t> mSizes;
SmallVector<int64_t> nSizes;
SmallVector<int64_t> kSizes;
SmallVector<int64_t, 2> mSizes;
SmallVector<int64_t, 2> nSizes;
SmallVector<int64_t, 2> kSizes;
SmallVector<int64_t, 2> batchSizes;
Type aType;
Type bType;
Type cType;

GPUMatmulShapeType(int64_t m, int64_t n, int64_t k, Type a, Type b, Type c)
: mSizes({m}), nSizes({n}), kSizes({k}), aType(a), bType(b), cType(c) {}
GPUMatmulShapeType(SmallVector<int64_t> m, SmallVector<int64_t> n,
SmallVector<int64_t> k, Type a, Type b, Type c)
: mSizes(m), nSizes(n), kSizes(k), aType(a), bType(b), cType(c) {}
: mSizes({m}), nSizes({n}), kSizes({k}), batchSizes({}), aType(a),
bType(b), cType(c) {}
GPUMatmulShapeType(ArrayRef<int64_t> m, ArrayRef<int64_t> n,
ArrayRef<int64_t> k, ArrayRef<int64_t> batch, Type a,
Type b, Type c)
: mSizes(m), nSizes(n), kSizes(k), batchSizes(batch), aType(a), bType(b),
cType(c) {}
};

/// Struct containing seed tile sizes for GPU MMA heuristics deduction logic.
@@ -202,23 +202,29 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
// Gather all static M, N, and K dimensions to deduce the MMASchedule. Dynamic
// dimensions will be tiled to 1 in workgroup tiling, so they are ignored when
// computing an MMA schedule.
SmallVector<int64_t> mDims, nDims, kDims;
for (auto mDim : contractionDims.m) {
SmallVector<int64_t> mDims, nDims, kDims, batchDims;
for (int64_t mDim : contractionDims.m) {
if (!ShapedType::isDynamic(bounds[mDim])) {
mDims.push_back(mDim);
}
}
for (auto nDim : contractionDims.n) {
for (int64_t nDim : contractionDims.n) {
if (!ShapedType::isDynamic(bounds[nDim])) {
nDims.push_back(nDim);
}
}
for (auto kDim : contractionDims.k) {
for (int64_t kDim : contractionDims.k) {
if (!ShapedType::isDynamic(bounds[kDim])) {
kDims.push_back(kDim);
}
}

for (int64_t batchDim : contractionDims.batch) {
if (!ShapedType::isDynamic(bounds[batchDim])) {
batchDims.push_back(batchDim);
}
}

auto getDimBounds = [&](SmallVector<int64_t> dims) -> SmallVector<int64_t> {
return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; });
};
@@ -233,8 +239,9 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
Type initElemType = getElementTypeOrSelf(init);

GPUMatmulShapeType problem{getDimBounds(mDims), getDimBounds(nDims),
getDimBounds(kDims), lhsElemType,
rhsElemType, initElemType};
getDimBounds(kDims), getDimBounds(batchDims),
lhsElemType, rhsElemType,
initElemType};

// Infer if lhs or rhs is transposed to help generate better schedule.
// TODO: Drop this. This is only a consideration for other pipelines.
15 changes: 13 additions & 2 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -536,13 +536,24 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
rhsElemType = getElementTypeOrSelf(rhsOp.getDpsInputs()[0]);
}

SmallVector<int64_t> batchDims;
for (int64_t batchDim : contractionDims->batch) {
if (!ShapedType::isDynamic(bounds[batchDim])) {
batchDims.push_back(batchDim);
}
}
auto getDimBounds = [&](SmallVector<int64_t> dims) -> SmallVector<int64_t> {
return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; });
};

// TODO(Max191): Support multiple M/N/K dimension problems for MMASchedules
// once the pipeline is able to support it. After adding multiple dimensions,
// all instances of schedule->m/nSubgroupCounts[0] and
// schedule->m/n/kTileSizes[0] need to use the full list of sizes instead of
// just the first element.
GPUMatmulShapeType problem{bounds[mDim], bounds[nDim], bounds[kDim],
lhsElemType, rhsElemType, initElemType};
GPUMatmulShapeType problem{
{bounds[mDim]}, {bounds[nDim]}, {bounds[kDim]}, getDimBounds(batchDims),
lhsElemType, rhsElemType, initElemType};

// Helper fn to store mma information.
auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma,
@@ -282,12 +282,12 @@ module {
// -----

module {
func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> {
func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x2x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x2x577xf32> {
%c0 = arith.constant 0.0 : f32
%empty = tensor.empty() : tensor<12x577x577xf32>
%fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x577x577xf32>) -> tensor<12x577x577xf32>
%mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x577x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x577x577xf32>) -> tensor<12x577x577xf32>
return %mm : tensor<12x577x577xf32>
%empty = tensor.empty() : tensor<12x2x577xf32>
%fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
%mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x2x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
return %mm : tensor<12x2x577xf32>
}
}

