[GPU] Match Tile And Fuse skinny matmul bail-out to Vector Distribute (#19857)

This PR matches the Tile and Fuse failure criteria for bailing out on skinny
matmuls to what we use in SetContractConfig for the Vector Distribute
pipeline.

With this change, the dispatch in #19855 runs in 0.068 ms versus 1.64 ms on
the default path: this skinny matmul with multiple dims cannot currently be
supported by the vector reduction and warp reduction pipelines, but Tile and
Fuse can support it using padding.

This requires the flag `--iree-codegen-llvmgpu-test-tile-and-fuse-matmul=true`.
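For context, the bail-out condition being matched boils down to the check below (a minimal C++ sketch, not the exact IREE code; `isVerySkinnyMatmul` is a hypothetical helper name, and in the PR the check only fires for unaligned, non-batched problems with a single M/N/K dimension):

```cpp
#include <cstdint>

// Threshold introduced by this PR in GPUHeuristics.cpp.
constexpr int64_t kVerySkinnyDimThreshold = 4;

// A matmul counts as "very skinny" when one of M/N is at most the threshold
// while the other exceeds the preferred subgroup size; such shapes are left
// to the warp/vector reduction pipelines instead of an MMA intrinsic schedule.
static bool isVerySkinnyMatmul(int64_t mSize, int64_t nSize,
                               int64_t preferredSubgroupSize) {
  return (mSize <= kVerySkinnyDimThreshold && nSize > preferredSubgroupSize) ||
         (nSize <= kVerySkinnyDimThreshold && mSize > preferredSubgroupSize);
}
```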

Additionally, `GPUMatmulShapeType` was becoming too large because this PR adds
batch sizes to it, which triggered the following error:
```
 error: static_assert failed due to requirement 'sizeof(mlir::iree_compiler::GPUMatmulShapeType) <= 256' "You are trying to use a default number of inlined elements for SmallVector<T> but sizeof(T) is really big! Please use an explicit number of inlined elements with SmallVector<T, N> to make sure you really want that much inline storage."
```
This PR fixes that issue by explicitly specifying the number of inlined elements for the `SmallVector` struct members.
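
To illustrate why explicit inline sizes help (a standalone sketch assuming typical 64-bit layouts, not the actual `GPUMatmulShapeType` definition; `ShapeWithExplicitSizes` is a made-up name): a `SmallVector<int64_t>` with the default inline element count is roughly 64 bytes, so four of them plus three `Type` fields push the struct past the 256-byte limit enforced by the `static_assert` above, while `SmallVector<int64_t, 2>` keeps each member at a header plus two inline elements and the whole struct comfortably under the limit.

```cpp
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/Types.h"

// Explicit inline size of 2: each vector is its 16-byte header plus
// 2 * sizeof(int64_t) of inline storage, instead of the ~64 bytes the
// default inline element count would reserve.
struct ShapeWithExplicitSizes {
  llvm::SmallVector<int64_t, 2> mSizes, nSizes, kSizes, batchSizes;
  mlir::Type aType, bType, cType;
};

// Mirrors the 256-byte bound checked when this struct is itself placed in a
// default-sized SmallVector.
static_assert(sizeof(ShapeWithExplicitSizes) <= 256,
              "small enough for SmallVector's default inline-size heuristic");
```

The diff below applies the same idea directly to the `GPUMatmulShapeType` members.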

---------

Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
nirvedhmeshram authored Feb 4, 2025
1 parent 1ed6350 commit d96a3f0
Showing 5 changed files with 77 additions and 41 deletions.
56 changes: 35 additions & 21 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
@@ -21,6 +21,9 @@ using llvm::APIntOps::GreatestCommonDivisor;

namespace mlir::iree_compiler {

// Threshold used to determine whether a matmul dimension is 'very skinny'.
constexpr int64_t kVerySkinnyDimThreshold = 4;

template <typename T>
static llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
const llvm::SmallVectorImpl<T> &vector) {
@@ -77,17 +80,17 @@ calculateResultSharedMemoryUsedInBytes(const GPUMMASchedule &schedule,
static bool isScheduleAligned(const GPUMatmulShapeType &problem,
const GPUMMASchedule &schedule,
bool mustBeAligned) {
SmallVector<int64_t> alignedMSizes(problem.mSizes);
SmallVector<int64_t, 2> alignedMSizes(problem.mSizes);
alignedMSizes.back() =
mustBeAligned ? problem.mSizes.back()
: llvm::divideCeil(problem.mSizes.back(), schedule.mSize) *
schedule.mSize;
SmallVector<int64_t> alignedNSizes(problem.nSizes);
SmallVector<int64_t, 2> alignedNSizes(problem.nSizes);
alignedNSizes.back() =
mustBeAligned ? problem.nSizes.back()
: llvm::divideCeil(problem.nSizes.back(), schedule.nSize) *
schedule.nSize;
SmallVector<int64_t> alignedKSizes(problem.kSizes);
SmallVector<int64_t, 2> alignedKSizes(problem.kSizes);
alignedKSizes.back() =
mustBeAligned ? problem.kSizes.back()
: llvm::divideCeil(problem.kSizes.back(), schedule.kSize) *
@@ -106,7 +109,7 @@ static bool isScheduleAligned(const GPUMatmulShapeType &problem,
};
// Checks whether the elements of `a` are evenly divisible by the
// corresponding elements of `b`.
auto areAligned = [](SmallVector<int64_t> a, SmallVector<int64_t> b) {
auto areAligned = [](SmallVector<int64_t, 2> a, SmallVector<int64_t, 2> b) {
for (auto [aVal, bVal] : llvm::zip_equal(a, b)) {
if (aVal % bVal != 0) {
return false;
@@ -223,6 +226,7 @@ static FailureOr<GPUMMASchedule> fitScheduleInSharedMemory(

static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
const GPUMatmulShapeType &intrinsic,
int64_t preferredSubgroupSize,
bool canUpcastAcc, bool mustBeAligned) {
assert(intrinsic.mSizes.size() == 1 && intrinsic.nSizes.size() == 1 &&
intrinsic.kSizes.size() == 1 &&
@@ -240,12 +244,17 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
}
}

if (mustBeAligned && (problem.mSizes.back() % intrinsic.mSizes[0] != 0 ||
problem.nSizes.back() % intrinsic.nSizes[0] != 0 ||
problem.kSizes.back() % intrinsic.kSizes[0] != 0)) {
return failure(); // Cannot use this intrinsic for misaligned cases.
if (mustBeAligned) {
if ((problem.mSizes.back() % intrinsic.mSizes[0] != 0 ||
problem.nSizes.back() % intrinsic.nSizes[0] != 0 ||
problem.kSizes.back() % intrinsic.kSizes[0] != 0)) {
return failure();
}
return success();
}

// Send very skinny, {2-4}xNxK and Mx{2-4}xK, matmuls to the vector reduction
// pipeline, similar to matvec.
// TODO: Figure out what the precise cutoff is, this may be machine dependent.
// In situation when alignment isn't required, we disallow intrinsics to be
// picked if the tile size is too small. For example, this will force a matmul
@@ -255,10 +264,15 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
// established after we sweep the different tile sizes for a problem config.
// Once a precise threshold is established, replace 4 with the threshold and
// remove this todo.
if (!mustBeAligned &&
(problem.mSizes.back() < 4 || problem.nSizes.back() < 4 ||
problem.kSizes.back() < 4)) {
return failure();
if (llvm::all_equal({problem.mSizes.size(), problem.nSizes.size(),
problem.kSizes.size(), size_t{1}}) &&
problem.batchSizes.empty()) {
int64_t mSize = problem.mSizes.back();
int64_t nSize = problem.nSizes.back();
if ((mSize <= kVerySkinnyDimThreshold && (nSize > preferredSubgroupSize)) ||
(nSize <= kVerySkinnyDimThreshold && (mSize > preferredSubgroupSize))) {
return failure();
}
}
return success();
}
Expand All @@ -279,8 +293,8 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
// 16x16x16 intrinsic, then:
// - mTotalTileCounts would be 4 * (16/16) = 4
// - nTotalTileCounts would be 2 * (32/16) = 4
SmallVector<int64_t> mTotalTileCounts = problem.mSizes;
SmallVector<int64_t> nTotalTileCounts = problem.nSizes;
SmallVector<int64_t, 2> mTotalTileCounts = problem.mSizes;
SmallVector<int64_t, 2> nTotalTileCounts = problem.nSizes;
mTotalTileCounts.back() =
llvm::divideCeil(problem.mSizes.back(), intrinsic.mSizes[0]);
nTotalTileCounts.back() =
@@ -361,7 +375,7 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
// For the problem described above {M:[4, 16], N:[2, 32], K[3, 128]} with a
// 16x16x16 intrinsic, then:
// - kTotalTileCounts would be 3 * (128/16) = 24
SmallVector<int64_t> kTotalTileCounts = problem.kSizes;
SmallVector<int64_t, 2> kTotalTileCounts = problem.kSizes;
kTotalTileCounts.back() =
llvm::divideCeil(problem.kSizes.back(), intrinsic.kSizes[0]);
// Compute the ideal number of intrinsics along K per subgroup based on the
@@ -395,8 +409,8 @@ FailureOr<GPUMMASchedule> deduceMMASchedule(
int64_t subgroupSize, bool transposedLhs, bool transposedRhs,
bool canUpcastAcc, bool mustBeAligned, bool doCPromotion) {
for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
if (failed(canTargetIntrinsic(problem, intrinsic, canUpcastAcc,
mustBeAligned))) {
if (failed(canTargetIntrinsic(problem, intrinsic, subgroupSize,
canUpcastAcc, mustBeAligned))) {
continue;
}

@@ -450,13 +464,13 @@ FailureOr<GPUMMASchedule> deduceAttentionSchedule(
qkMatmul.nSizes.size() == 1 && qkMatmul.kSizes.size() == 1 &&
"unimplemented: multi M/N/K attention schedule");
for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
if (failed(canTargetIntrinsic(qkMatmul, intrinsic, canUpcastAcc,
mustBeAligned))) {
if (failed(canTargetIntrinsic(qkMatmul, intrinsic, subgroupSize,
canUpcastAcc, mustBeAligned))) {
continue;
}

if (failed(canTargetIntrinsic(pvMatmul, intrinsic, canUpcastAcc,
mustBeAligned))) {
if (failed(canTargetIntrinsic(pvMatmul, intrinsic, subgroupSize,
canUpcastAcc, mustBeAligned))) {
continue;
}

18 changes: 11 additions & 7 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
@@ -10,18 +10,22 @@ namespace mlir::iree_compiler {

/// Struct containing information about a matmul's shape and type.
struct GPUMatmulShapeType {
SmallVector<int64_t> mSizes;
SmallVector<int64_t> nSizes;
SmallVector<int64_t> kSizes;
SmallVector<int64_t, 2> mSizes;
SmallVector<int64_t, 2> nSizes;
SmallVector<int64_t, 2> kSizes;
SmallVector<int64_t, 2> batchSizes;
Type aType;
Type bType;
Type cType;

GPUMatmulShapeType(int64_t m, int64_t n, int64_t k, Type a, Type b, Type c)
: mSizes({m}), nSizes({n}), kSizes({k}), aType(a), bType(b), cType(c) {}
GPUMatmulShapeType(SmallVector<int64_t> m, SmallVector<int64_t> n,
SmallVector<int64_t> k, Type a, Type b, Type c)
: mSizes(m), nSizes(n), kSizes(k), aType(a), bType(b), cType(c) {}
: mSizes({m}), nSizes({n}), kSizes({k}), batchSizes({}), aType(a),
bType(b), cType(c) {}
GPUMatmulShapeType(ArrayRef<int64_t> m, ArrayRef<int64_t> n,
ArrayRef<int64_t> k, ArrayRef<int64_t> batch, Type a,
Type b, Type c)
: mSizes(m), nSizes(n), kSizes(k), batchSizes(batch), aType(a), bType(b),
cType(c) {}
};

/// Struct containing seed tile sizes for GPU MMA heuristics deduction logic.
@@ -202,23 +202,29 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
// Gather all static M, N, and K dimensions to deduce the MMASchedule. Dynamic
// dimensions will be tiled to 1 in workgroup tiling, so they are ignored when
// computing an MMA schedule.
SmallVector<int64_t> mDims, nDims, kDims;
for (auto mDim : contractionDims.m) {
SmallVector<int64_t> mDims, nDims, kDims, batchDims;
for (int64_t mDim : contractionDims.m) {
if (!ShapedType::isDynamic(bounds[mDim])) {
mDims.push_back(mDim);
}
}
for (auto nDim : contractionDims.n) {
for (int64_t nDim : contractionDims.n) {
if (!ShapedType::isDynamic(bounds[nDim])) {
nDims.push_back(nDim);
}
}
for (auto kDim : contractionDims.k) {
for (int64_t kDim : contractionDims.k) {
if (!ShapedType::isDynamic(bounds[kDim])) {
kDims.push_back(kDim);
}
}

for (int64_t batchDim : contractionDims.batch) {
if (!ShapedType::isDynamic(bounds[batchDim])) {
batchDims.push_back(batchDim);
}
}

auto getDimBounds = [&](SmallVector<int64_t> dims) -> SmallVector<int64_t> {
return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; });
};
@@ -233,8 +239,9 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
Type initElemType = getElementTypeOrSelf(init);

GPUMatmulShapeType problem{getDimBounds(mDims), getDimBounds(nDims),
getDimBounds(kDims), lhsElemType,
rhsElemType, initElemType};
getDimBounds(kDims), getDimBounds(batchDims),
lhsElemType, rhsElemType,
initElemType};

// Infer if lhs or rhs is transposed to help generate better schedule.
// TODO: Drop this. This is only a consideration for other pipelines.
15 changes: 13 additions & 2 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -536,13 +536,24 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
rhsElemType = getElementTypeOrSelf(rhsOp.getDpsInputs()[0]);
}

SmallVector<int64_t> batchDims;
for (int64_t batchDim : contractionDims->batch) {
if (!ShapedType::isDynamic(bounds[batchDim])) {
batchDims.push_back(batchDim);
}
}
auto getDimBounds = [&](SmallVector<int64_t> dims) -> SmallVector<int64_t> {
return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; });
};

// TODO(Max191): Support multiple M/N/K dimension problems for MMASchedules
// once the pipeline is able to support it. After adding multiple dimensions,
// all instances of schedule->m/nSubgroupCounts[0] and
// schedule->m/n/kTileSizes[0] need to use the full list of sizes instead of
// just the first element.
GPUMatmulShapeType problem{bounds[mDim], bounds[nDim], bounds[kDim],
lhsElemType, rhsElemType, initElemType};
GPUMatmulShapeType problem{
{bounds[mDim]}, {bounds[nDim]}, {bounds[kDim]}, getDimBounds(batchDims),
lhsElemType, rhsElemType, initElemType};

// Helper fn to store mma information.
auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma,
@@ -282,12 +282,12 @@ module {
// -----

module {
func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> {
func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x2x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x2x577xf32> {
%c0 = arith.constant 0.0 : f32
%empty = tensor.empty() : tensor<12x577x577xf32>
%fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x577x577xf32>) -> tensor<12x577x577xf32>
%mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x577x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x577x577xf32>) -> tensor<12x577x577xf32>
return %mm : tensor<12x577x577xf32>
%empty = tensor.empty() : tensor<12x2x577xf32>
%fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
%mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x2x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
return %mm : tensor<12x2x577xf32>
}
}

