[GPU] Match Tile And Fuse skinny matmul bail-out to Vector Distribute #19857

Merged
56 changes: 35 additions & 21 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
@@ -21,6 +21,9 @@ using llvm::APIntOps::GreatestCommonDivisor;

namespace mlir::iree_compiler {

// Threshold used to determine whether a matmul dimension is 'very skinny'.
constexpr int64_t kVerySkinnyDimThreshold = 4;

template <typename T>
static llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
const llvm::SmallVectorImpl<T> &vector) {
@@ -77,17 +80,17 @@ calculateResultSharedMemoryUsedInBytes(const GPUMMASchedule &schedule,
static bool isScheduleAligned(const GPUMatmulShapeType &problem,
const GPUMMASchedule &schedule,
bool mustBeAligned) {
SmallVector<int64_t> alignedMSizes(problem.mSizes);
SmallVector<int64_t, 2> alignedMSizes(problem.mSizes);
alignedMSizes.back() =
mustBeAligned ? problem.mSizes.back()
: llvm::divideCeil(problem.mSizes.back(), schedule.mSize) *
schedule.mSize;
SmallVector<int64_t> alignedNSizes(problem.nSizes);
SmallVector<int64_t, 2> alignedNSizes(problem.nSizes);
alignedNSizes.back() =
mustBeAligned ? problem.nSizes.back()
: llvm::divideCeil(problem.nSizes.back(), schedule.nSize) *
schedule.nSize;
SmallVector<int64_t> alignedKSizes(problem.kSizes);
SmallVector<int64_t, 2> alignedKSizes(problem.kSizes);
alignedKSizes.back() =
mustBeAligned ? problem.kSizes.back()
: llvm::divideCeil(problem.kSizes.back(), schedule.kSize) *
@@ -106,7 +109,7 @@ static bool isScheduleAligned(const GPUMatmulShapeType &problem,
};
// Checks whether the elements of `a` are evenly divisible by the
// corresponding elements of `b`.
auto areAligned = [](SmallVector<int64_t> a, SmallVector<int64_t> b) {
auto areAligned = [](SmallVector<int64_t, 2> a, SmallVector<int64_t, 2> b) {
for (auto [aVal, bVal] : llvm::zip_equal(a, b)) {
if (aVal % bVal != 0) {
return false;
@@ -223,6 +226,7 @@ static FailureOr<GPUMMASchedule> fitScheduleInSharedMemory(

static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
const GPUMatmulShapeType &intrinsic,
int64_t preferredSubgroupSize,
bool canUpcastAcc, bool mustBeAligned) {
assert(intrinsic.mSizes.size() == 1 && intrinsic.nSizes.size() == 1 &&
intrinsic.kSizes.size() == 1 &&
@@ -240,12 +244,17 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
}
}

if (mustBeAligned && (problem.mSizes.back() % intrinsic.mSizes[0] != 0 ||
problem.nSizes.back() % intrinsic.nSizes[0] != 0 ||
problem.kSizes.back() % intrinsic.kSizes[0] != 0)) {
return failure(); // Cannot use this intrinsic for misaligned cases.
if (mustBeAligned) {
if ((problem.mSizes.back() % intrinsic.mSizes[0] != 0 ||
problem.nSizes.back() % intrinsic.nSizes[0] != 0 ||
problem.kSizes.back() % intrinsic.kSizes[0] != 0)) {
return failure();
}
return success();
}

// Send very skinny, {2-4}xNxK and Mx{2-4}xK, matmuls to the vector reduction
// pipeline, similar to matvec.
// TODO: Figure out what the precise cutoff is, this may be machine dependent.
// In situation when alignment isn't required, we disallow intrinsics to be
// picked if the tile size is too small. For example, this will force a matmul
@@ -255,10 +264,15 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
// established after we sweep the different tile sizes for a problem config.
// Once a precise threshold is established, replace 4 with the threshold and
// remove this todo.
if (!mustBeAligned &&
(problem.mSizes.back() < 4 || problem.nSizes.back() < 4 ||
problem.kSizes.back() < 4)) {
return failure();
if (llvm::all_equal({problem.mSizes.size(), problem.nSizes.size(),
problem.kSizes.size(), size_t{1}}) &&
problem.batchSizes.empty()) {
int64_t mSize = problem.mSizes.back();
int64_t nSize = problem.nSizes.back();
if ((mSize <= kVerySkinnyDimThreshold && (nSize > preferredSubgroupSize)) ||
(nSize <= kVerySkinnyDimThreshold && (mSize > preferredSubgroupSize))) {
return failure();
}
}
return success();
}
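
For reference, a minimal standalone sketch of the new "very skinny" bail-out above (a hedged illustration, not part of the patch: plain C++ with std::vector standing in for SmallVector, and the function name and main() driver are invented):

#include <cstdint>
#include <vector>

// Threshold from the patch: dimensions of size <= 4 count as 'very skinny'.
constexpr int64_t kVerySkinnyDimThreshold = 4;

// Mirrors the unaligned-case check in canTargetIntrinsic: only plain matmuls
// (single M/N/K dimension, no batch dimensions) are considered, and the
// intrinsic is rejected when one of M/N is very skinny while the other is
// wider than the preferred subgroup size.
static bool bailOutToVectorReduction(const std::vector<int64_t> &mSizes,
                                     const std::vector<int64_t> &nSizes,
                                     const std::vector<int64_t> &kSizes,
                                     const std::vector<int64_t> &batchSizes,
                                     int64_t preferredSubgroupSize) {
  if (mSizes.size() != 1 || nSizes.size() != 1 || kSizes.size() != 1 ||
      !batchSizes.empty())
    return false;
  int64_t m = mSizes.back();
  int64_t n = nSizes.back();
  return (m <= kVerySkinnyDimThreshold && n > preferredSubgroupSize) ||
         (n <= kVerySkinnyDimThreshold && m > preferredSubgroupSize);
}

int main() {
  // 2x577x577: skinny in M, wide in N -> reject the intrinsic (bail out).
  bool skinny = bailOutToVectorReduction({2}, {577}, {577}, {}, /*preferredSubgroupSize=*/64);
  // 2x8x577: skinny in M, but N does not exceed the subgroup size -> keep the intrinsic.
  bool small = bailOutToVectorReduction({2}, {8}, {577}, {}, /*preferredSubgroupSize=*/64);
  return (skinny && !small) ? 0 : 1;
}

Note that, as in the patch, this check only applies when alignment is not required; aligned problems return early before reaching it.
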
@@ -279,8 +293,8 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
// 16x16x16 intrinsic, then:
// - mTotalTileCounts would be 4 * (16/16) = 4
// - nTotalTileCounts would be 2 * (32/16) = 4
SmallVector<int64_t> mTotalTileCounts = problem.mSizes;
SmallVector<int64_t> nTotalTileCounts = problem.nSizes;
SmallVector<int64_t, 2> mTotalTileCounts = problem.mSizes;
SmallVector<int64_t, 2> nTotalTileCounts = problem.nSizes;
mTotalTileCounts.back() =
llvm::divideCeil(problem.mSizes.back(), intrinsic.mSizes[0]);
nTotalTileCounts.back() =
@@ -361,7 +375,7 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
// For the problem described above {M:[4, 16], N:[2, 32], K[3, 128]} with a
// 16x16x16 intrinsic, then:
// - kTotalTileCounts would be 3 * (128/16) = 24
SmallVector<int64_t> kTotalTileCounts = problem.kSizes;
SmallVector<int64_t, 2> kTotalTileCounts = problem.kSizes;
kTotalTileCounts.back() =
llvm::divideCeil(problem.kSizes.back(), intrinsic.kSizes[0]);
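
To make the tile-count arithmetic in the comments above concrete, a small hedged sketch (the helper name and the use of std::vector/assert are mine; the numbers are the {M:[4, 16], N:[2, 32], K:[3, 128]} example with a 16x16x16 intrinsic):

#include <cassert>
#include <cstdint>
#include <vector>

// Replace the innermost problem size with its ceiling division by the
// intrinsic size, as done for m/n/kTotalTileCounts in getOptimalMMASchedule.
static std::vector<int64_t> totalTileCounts(std::vector<int64_t> problemSizes,
                                            int64_t intrinsicSize) {
  problemSizes.back() =
      (problemSizes.back() + intrinsicSize - 1) / intrinsicSize; // divideCeil
  return problemSizes;
}

int main() {
  // M: [4, 16] -> [4, 1], i.e. 4 * (16/16) = 4 total M tiles.
  assert((totalTileCounts({4, 16}, 16) == std::vector<int64_t>{4, 1}));
  // N: [2, 32] -> [2, 2], i.e. 2 * (32/16) = 4 total N tiles.
  assert((totalTileCounts({2, 32}, 16) == std::vector<int64_t>{2, 2}));
  // K: [3, 128] -> [3, 8], i.e. 3 * (128/16) = 24 total K tiles.
  assert((totalTileCounts({3, 128}, 16) == std::vector<int64_t>{3, 8}));
  return 0;
}
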
// Compute the ideal number of intrinsics along K per subgroup based on the
@@ -395,8 +409,8 @@ FailureOr<GPUMMASchedule> deduceMMASchedule(
int64_t subgroupSize, bool transposedLhs, bool transposedRhs,
bool canUpcastAcc, bool mustBeAligned, bool doCPromotion) {
for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
if (failed(canTargetIntrinsic(problem, intrinsic, canUpcastAcc,
mustBeAligned))) {
if (failed(canTargetIntrinsic(problem, intrinsic, subgroupSize,
canUpcastAcc, mustBeAligned))) {
continue;
}

@@ -450,13 +464,13 @@ FailureOr<GPUMMASchedule> deduceAttentionSchedule(
qkMatmul.nSizes.size() == 1 && qkMatmul.kSizes.size() == 1 &&
"unimplemented: multi M/N/K attention schedule");
for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
if (failed(canTargetIntrinsic(qkMatmul, intrinsic, canUpcastAcc,
mustBeAligned))) {
if (failed(canTargetIntrinsic(qkMatmul, intrinsic, subgroupSize,
canUpcastAcc, mustBeAligned))) {
continue;
}

if (failed(canTargetIntrinsic(pvMatmul, intrinsic, canUpcastAcc,
mustBeAligned))) {
if (failed(canTargetIntrinsic(pvMatmul, intrinsic, subgroupSize,
canUpcastAcc, mustBeAligned))) {
continue;
}

18 changes: 11 additions & 7 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
@@ -10,18 +10,22 @@ namespace mlir::iree_compiler {

/// Struct containing information about a matmul's shape and type.
struct GPUMatmulShapeType {
SmallVector<int64_t> mSizes;
SmallVector<int64_t> nSizes;
SmallVector<int64_t> kSizes;
SmallVector<int64_t, 2> mSizes;
SmallVector<int64_t, 2> nSizes;
SmallVector<int64_t, 2> kSizes;
SmallVector<int64_t, 2> batchSizes;
Type aType;
Type bType;
Type cType;

GPUMatmulShapeType(int64_t m, int64_t n, int64_t k, Type a, Type b, Type c)
: mSizes({m}), nSizes({n}), kSizes({k}), aType(a), bType(b), cType(c) {}
GPUMatmulShapeType(SmallVector<int64_t> m, SmallVector<int64_t> n,
SmallVector<int64_t> k, Type a, Type b, Type c)
: mSizes(m), nSizes(n), kSizes(k), aType(a), bType(b), cType(c) {}
: mSizes({m}), nSizes({n}), kSizes({k}), batchSizes({}), aType(a),
bType(b), cType(c) {}
GPUMatmulShapeType(ArrayRef<int64_t> m, ArrayRef<int64_t> n,
ArrayRef<int64_t> k, ArrayRef<int64_t> batch, Type a,
Type b, Type c)
: mSizes(m), nSizes(n), kSizes(k), batchSizes(batch), aType(a), bType(b),
cType(c) {}
};
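
For reference, a hedged usage sketch of the two constructors after this change (it assumes an IREE/MLIR build environment; the include path is inferred from the header location above, and the shapes and element types are made up for illustration):

#include "iree/compiler/Codegen/Common/GPU/GPUHeuristics.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/MLIRContext.h"

using namespace mlir;
using namespace mlir::iree_compiler;

// Builds one problem with each constructor; batchSizes stays empty in the
// scalar form and is passed explicitly in the ArrayRef form.
static void buildExampleProblems(MLIRContext *ctx) {
  Type f16 = Float16Type::get(ctx);
  Type f32 = Float32Type::get(ctx);

  // Scalar form: a single 128x256x64 matmul with no batch dimensions.
  GPUMatmulShapeType simple(/*m=*/128, /*n=*/256, /*k=*/64, f16, f16, f32);

  // ArrayRef form: multiple M/N/K dimensions plus one batch dimension of 12.
  GPUMatmulShapeType multi(/*m=*/{4, 16}, /*n=*/{2, 32}, /*k=*/{3, 128},
                           /*batch=*/{12}, f16, f16, f32);
  (void)simple;
  (void)multi;
}
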

/// Struct containing seed tile sizes for GPU MMA heuristics deduction logic.
@@ -202,23 +202,29 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
// Gather all static M, N, and K dimensions to deduce the MMASchedule. Dynamic
// dimensions will be tiled to 1 in workgroup tiling, so they are ignored when
// computing an MMA schedule.
SmallVector<int64_t> mDims, nDims, kDims;
for (auto mDim : contractionDims.m) {
SmallVector<int64_t> mDims, nDims, kDims, batchDims;
for (int64_t mDim : contractionDims.m) {
if (!ShapedType::isDynamic(bounds[mDim])) {
mDims.push_back(mDim);
}
}
for (auto nDim : contractionDims.n) {
for (int64_t nDim : contractionDims.n) {
if (!ShapedType::isDynamic(bounds[nDim])) {
nDims.push_back(nDim);
}
}
for (auto kDim : contractionDims.k) {
for (int64_t kDim : contractionDims.k) {
if (!ShapedType::isDynamic(bounds[kDim])) {
kDims.push_back(kDim);
}
}

for (int64_t batchDim : contractionDims.batch) {
if (!ShapedType::isDynamic(bounds[batchDim])) {
batchDims.push_back(batchDim);
}
}

auto getDimBounds = [&](SmallVector<int64_t> dims) -> SmallVector<int64_t> {
return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; });
};
@@ -233,8 +239,9 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
Type initElemType = getElementTypeOrSelf(init);

GPUMatmulShapeType problem{getDimBounds(mDims), getDimBounds(nDims),
getDimBounds(kDims), lhsElemType,
rhsElemType, initElemType};
getDimBounds(kDims), getDimBounds(batchDims),
lhsElemType, rhsElemType,
initElemType};

// Infer if lhs or rhs is transposed to help generate better schedule.
// TODO: Drop this. This is only a consideration for other pipelines.
15 changes: 13 additions & 2 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -536,13 +536,24 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
rhsElemType = getElementTypeOrSelf(rhsOp.getDpsInputs()[0]);
}

SmallVector<int64_t> batchDims;
for (int64_t batchDim : contractionDims->batch) {
if (!ShapedType::isDynamic(bounds[batchDim])) {
batchDims.push_back(batchDim);
}
}
auto getDimBounds = [&](SmallVector<int64_t> dims) -> SmallVector<int64_t> {
return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; });
};

// TODO(Max191): Support multiple M/N/K dimension problems for MMASchedules
// once the pipeline is able to support it. After adding multiple dimensions,
// all instances of schedule->m/nSubgroupCounts[0] and
// schedule->m/n/kTileSizes[0] need to use the full list of sizes instead of
// just the first element.
GPUMatmulShapeType problem{bounds[mDim], bounds[nDim], bounds[kDim],
lhsElemType, rhsElemType, initElemType};
GPUMatmulShapeType problem{
{bounds[mDim]}, {bounds[nDim]}, {bounds[kDim]}, getDimBounds(batchDims),
lhsElemType, rhsElemType, initElemType};

// Helper fn to store mma information.
auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma,
@@ -282,12 +282,12 @@ module {
// -----

module {
func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> {
func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x2x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x2x577xf32> {
%c0 = arith.constant 0.0 : f32
%empty = tensor.empty() : tensor<12x577x577xf32>
%fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x577x577xf32>) -> tensor<12x577x577xf32>
%mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x577x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x577x577xf32>) -> tensor<12x577x577xf32>
return %mm : tensor<12x577x577xf32>
%empty = tensor.empty() : tensor<12x2x577xf32>
%fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
%mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x2x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
return %mm : tensor<12x2x577xf32>
}
}
