From 7a35663b3cedb92e61ea4c1311167828a329c7f0 Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Thu, 30 Jan 2025 11:49:07 -0600 Subject: [PATCH 1/7] [GPU] Only dont do padding for pure matvecs Signed-off-by: Nirvedh Meshram --- .../Codegen/Common/GPU/GPUHeuristics.cpp | 52 ++++++++++++------- .../Codegen/Common/GPU/GPUHeuristics.h | 18 ++++--- .../Dialect/GPU/TargetUtils/ConfigUtils.cpp | 15 ++++-- .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 12 ++--- .../test/ROCDL/config_tile_and_fuse.mlir | 10 ++-- .../compiler/Codegen/SPIRV/KernelConfig.cpp | 2 +- .../Preprocessing/Common/PadToIntrinsics.cpp | 6 +-- 7 files changed, 71 insertions(+), 44 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp index f8e30f31a961..ef1e01e1e45b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp @@ -21,6 +21,9 @@ using llvm::APIntOps::GreatestCommonDivisor; namespace mlir::iree_compiler { +// Threshold used to determine whether a matmul dimension is 'very skinny'. +constexpr int64_t kVerySkinnyDimThreshold = 4; + template static llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const llvm::SmallVectorImpl &vector) { @@ -77,17 +80,17 @@ calculateResultSharedMemoryUsedInBytes(const GPUMMASchedule &schedule, static bool isScheduleAligned(const GPUMatmulShapeType &problem, const GPUMMASchedule &schedule, bool mustBeAligned) { - SmallVector alignedMSizes(problem.mSizes); + SmallVector alignedMSizes(problem.mSizes); alignedMSizes.back() = mustBeAligned ? problem.mSizes.back() : llvm::divideCeil(problem.mSizes.back(), schedule.mSize) * schedule.mSize; - SmallVector alignedNSizes(problem.nSizes); + SmallVector alignedNSizes(problem.nSizes); alignedNSizes.back() = mustBeAligned ? problem.nSizes.back() : llvm::divideCeil(problem.nSizes.back(), schedule.nSize) * schedule.nSize; - SmallVector alignedKSizes(problem.kSizes); + SmallVector alignedKSizes(problem.kSizes); alignedKSizes.back() = mustBeAligned ? problem.kSizes.back() : llvm::divideCeil(problem.kSizes.back(), schedule.kSize) * @@ -106,7 +109,7 @@ static bool isScheduleAligned(const GPUMatmulShapeType &problem, }; // Checks whether the elements of `a` are evenly divisible by the // corresponding elements of `b`. - auto areAligned = [](SmallVector a, SmallVector b) { + auto areAligned = [](SmallVector a, SmallVector b) { for (auto [aVal, bVal] : llvm::zip_equal(a, b)) { if (aVal % bVal != 0) { return false; @@ -223,6 +226,7 @@ static FailureOr fitScheduleInSharedMemory( static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem, const GPUMatmulShapeType &intrinsic, + int64_t preferredSubgroupSize, bool canUpcastAcc, bool mustBeAligned) { assert(intrinsic.mSizes.size() == 1 && intrinsic.nSizes.size() == 1 && intrinsic.kSizes.size() == 1 && @@ -240,12 +244,19 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem, } } - if (mustBeAligned && (problem.mSizes.back() % intrinsic.mSizes[0] != 0 || - problem.nSizes.back() % intrinsic.nSizes[0] != 0 || - problem.kSizes.back() % intrinsic.kSizes[0] != 0)) { - return failure(); // Cannot use this intrinsic for misaligned cases. 
+ if (mustBeAligned) { + if ((problem.mSizes.back() % intrinsic.mSizes[0] != 0 || + problem.nSizes.back() % intrinsic.nSizes[0] != 0 || + problem.kSizes.back() % intrinsic.kSizes[0] != 0)) { + return failure(); + } + return success(); } + // Send very skinny, {2-4}xNxK and Mx{2-4}xK, matmuls to the vector reduction + // pipeline, similar to matvec. Note: Because of reassociation in the vector + // reduction pipeline, this may lead to precission loss. If this ever becomes + // an issue, we can hide this behind a flag. // TODO: Figure out what the precise cutoff is, this may be machine dependent. // In situation when alignment isn't required, we disallow intrinsics to be // picked if the tile size is too small. For example, this will force a matmul @@ -255,10 +266,15 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem, // established after we sweep the different tile sizes for a problem config. // Once a precise threshold is established, replace 4 with the threshold and // remove this todo. - if (!mustBeAligned && - (problem.mSizes.back() < 4 || problem.nSizes.back() < 4 || - problem.kSizes.back() < 4)) { - return failure(); + if (llvm::all_equal({problem.mSizes.size(), problem.nSizes.size(), + problem.kSizes.size(), size_t{1}}) && + problem.batchSizes.empty()) { + int64_t mSize = problem.mSizes.back(); + int64_t nSize = problem.nSizes.back(); + if ((mSize <= kVerySkinnyDimThreshold && (nSize > preferredSubgroupSize)) || + (nSize <= kVerySkinnyDimThreshold && (mSize > preferredSubgroupSize))) { + return failure(); + } } return success(); } @@ -279,8 +295,8 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem, // 16x16x16 intrinsic, then: // - mTotalTileCounts would be 4 * (16/16) = 4 // - nTotalTileCounts would be 2 * (32/16) = 4 - SmallVector mTotalTileCounts = problem.mSizes; - SmallVector nTotalTileCounts = problem.nSizes; + SmallVector mTotalTileCounts = problem.mSizes; + SmallVector nTotalTileCounts = problem.nSizes; mTotalTileCounts.back() = llvm::divideCeil(problem.mSizes.back(), intrinsic.mSizes[0]); nTotalTileCounts.back() = @@ -361,7 +377,7 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem, // For the problem described above {M:[4, 16], N:[2, 32], K[3, 128]} with a // 16x16x16 intrinsic, then: // - kTotalTileCounts would be 3 * (128/16) = 24 - SmallVector kTotalTileCounts = problem.kSizes; + SmallVector kTotalTileCounts = problem.kSizes; kTotalTileCounts.back() = llvm::divideCeil(problem.kSizes.back(), intrinsic.kSizes[0]); // Compute the ideal number of intrinsics along K per subgroup based on the @@ -396,7 +412,7 @@ FailureOr deduceMMASchedule( bool canUpcastAcc, bool mustBeAligned, bool doCPromotion) { for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) { if (failed(canTargetIntrinsic(problem, intrinsic, canUpcastAcc, - mustBeAligned))) { + subgroupSize, mustBeAligned))) { continue; } @@ -451,12 +467,12 @@ FailureOr deduceAttentionSchedule( "unimplemented: multi M/N/K attention schedule"); for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) { if (failed(canTargetIntrinsic(qkMatmul, intrinsic, canUpcastAcc, - mustBeAligned))) { + subgroupSize, mustBeAligned))) { continue; } if (failed(canTargetIntrinsic(pvMatmul, intrinsic, canUpcastAcc, - mustBeAligned))) { + subgroupSize, mustBeAligned))) { continue; } diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h index a35e2b464632..f09edd729952 100644 
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h @@ -10,18 +10,22 @@ namespace mlir::iree_compiler { /// Struct containing information about a matmul's shape and type. struct GPUMatmulShapeType { - SmallVector mSizes; - SmallVector nSizes; - SmallVector kSizes; + SmallVector mSizes; + SmallVector nSizes; + SmallVector kSizes; + SmallVector batchSizes; Type aType; Type bType; Type cType; GPUMatmulShapeType(int64_t m, int64_t n, int64_t k, Type a, Type b, Type c) - : mSizes({m}), nSizes({n}), kSizes({k}), aType(a), bType(b), cType(c) {} - GPUMatmulShapeType(SmallVector m, SmallVector n, - SmallVector k, Type a, Type b, Type c) - : mSizes(m), nSizes(n), kSizes(k), aType(a), bType(b), cType(c) {} + : mSizes({m}), nSizes({n}), kSizes({k}), batchSizes({}), aType(a), + bType(b), cType(c) {} + GPUMatmulShapeType(SmallVector m, SmallVector n, + SmallVector k, SmallVector batch, + Type a, Type b, Type c) + : mSizes(m), nSizes(n), kSizes(k), batchSizes(batch), aType(a), bType(b), + cType(c) {} }; /// Struct containing seed tile sizes for GPU MMA heuristics deduction logic. diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp index 7b62d6955b10..6533115e8b49 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp @@ -121,7 +121,7 @@ static std::optional getMmaScheduleFromProblemAndTarget( bool transposedLhs, bool transposedRhs, bool mustBeAligned = true, bool doCPromotion = false) { const int64_t targetSubgroupSize = target.getPreferredSubgroupSize(); - SmallVector intrinsics; + SmallVector intrinsics; for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { // Intrinsics that do not specify a scope cannot be distributed. if (failed(mma.getMmaScope())) @@ -202,7 +202,7 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, // Gather all static M, N, and K dimensions to deduce the MMASchedule. Dynamic // dimensions will be tiled to 1 in workgroup tiling, so they are ignored when // computing an MMA schedule. - SmallVector mDims, nDims, kDims; + SmallVector mDims, nDims, kDims, batchDims; for (auto mDim : contractionDims.m) { if (!ShapedType::isDynamic(bounds[mDim])) { mDims.push_back(mDim); @@ -219,6 +219,12 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, } } + for (auto batchDim : contractionDims.batch) { + if (!ShapedType::isDynamic(bounds[batchDim])) { + batchDims.push_back(batchDim); + } + } + auto getDimBounds = [&](SmallVector dims) -> SmallVector { return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; }); }; @@ -233,8 +239,9 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, Type initElemType = getElementTypeOrSelf(init); GPUMatmulShapeType problem{getDimBounds(mDims), getDimBounds(nDims), - getDimBounds(kDims), lhsElemType, - rhsElemType, initElemType}; + getDimBounds(kDims), getDimBounds(batchDims), + lhsElemType, rhsElemType, + initElemType}; // Infer if lhs or rhs is transposed to help generate better schedule. // TODO: Drop this. This is only a consideration for other pipelines. 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index 2ee7e8241c17..1a19b78677fd 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -339,7 +339,7 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target, // Helper fn to store mma information. auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma, - SmallVector &intrinsics, + SmallVector &intrinsics, SmallVector &mmaKinds) { auto [mSize, nSize, kSize] = mma.getMNKShape(); auto [aType, bType, cType] = mma.getABCElementTypes(); @@ -347,7 +347,7 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target, mmaKinds.emplace_back(mma); }; - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); SmallVector mmaKinds; MLIRContext *context = op.getContext(); @@ -546,7 +546,7 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, // Helper fn to store mma information. auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma, - SmallVector &intrinsics, + SmallVector &intrinsics, SmallVector &mmaKinds) { auto [mSize, nSize, kSize] = mma.getMNKShape(); auto [aType, bType, cType] = mma.getABCElementTypes(); @@ -554,7 +554,7 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, mmaKinds.emplace_back(mma); }; - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); SmallVector mmaKinds; MLIRContext *context = op.getContext(); @@ -766,7 +766,7 @@ static LogicalResult setAttentionIntrinsicBasedVectorDistributionConfig( // Helper fn to store mma information. auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma, - SmallVector &intrinsics, + SmallVector &intrinsics, SmallVector &mmaKinds) { auto [mSize, nSize, kSize] = mma.getMNKShape(); auto [aType, bType, cType] = mma.getABCElementTypes(); @@ -774,7 +774,7 @@ static LogicalResult setAttentionIntrinsicBasedVectorDistributionConfig( mmaKinds.emplace_back(mma); }; - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); SmallVector mmaKinds; MLIRContext *context = op.getContext(); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir index 910eca3c1768..ec6038f47dee 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir @@ -282,12 +282,12 @@ module { // ----- module { -func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> { +func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x2x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x2x577xf32> { %c0 = arith.constant 0.0 : f32 - %empty = tensor.empty() : tensor<12x577x577xf32> - %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> - %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x577x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> - return %mm : tensor<12x577x577xf32> + %empty = tensor.empty() : tensor<12x2x577xf32> + %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x2x577xf32>) -> tensor<12x2x577xf32> + %mm = linalg.batch_matmul ins(%lhs, %rhs : 
tensor<12x2x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x2x577xf32>) -> tensor<12x2x577xf32> + return %mm : tensor<12x2x577xf32> } } diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp index bbdec5c83f6d..1ef0e2ceb311 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp @@ -891,7 +891,7 @@ setCooperativeMatrixConfig(IREE::GPU::TargetAttr target, linalg::LinalgOp op, // just the first element. GPUMatmulShapeType problem(dimM, dimN, dimK, lhsElem, rhsElem, initElem); - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { auto [mSize, nSize, kSize] = mma.getMNKShape(); diff --git a/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp b/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp index 922e50882775..d4b485ac096c 100644 --- a/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp +++ b/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp @@ -145,7 +145,7 @@ expandMapsAndIterators(SmallVector &expandedMaps, } } -static SmallVector +static SmallVector getIntrinsics(linalg::LinalgOp linalgOp, ArrayRef executableTargets) { IREE::GPU::TargetAttr target; @@ -179,7 +179,7 @@ padConvOp(RewriterBase &rewriter, linalg::LinalgOp linalgOp, return; // Early exit if cannot find intrinsics or if multiple executable targets. - SmallVector intrinsics = + SmallVector intrinsics = getIntrinsics(linalgOp, executableTargets); if (intrinsics.empty()) return; @@ -326,7 +326,7 @@ static void padContractionLikeOp( } // Early exit if cannot find intrinsics or if multiple executable targets. - SmallVector intrinsics = + SmallVector intrinsics = getIntrinsics(linalgOp, executableTargets); if (intrinsics.empty()) return; From 9911ef44569060a9b1258a54b61b2f47f893956a Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Fri, 31 Jan 2025 14:15:56 -0600 Subject: [PATCH 2/7] add batch in vector distribute Signed-off-by: Nirvedh Meshram --- .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index 1a19b78677fd..b5d03c20a970 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -536,13 +536,24 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, rhsElemType = getElementTypeOrSelf(rhsOp.getDpsInputs()[0]); } + SmallVector batchDims; + for (auto batchDim : contractionDims->batch) { + if (!ShapedType::isDynamic(bounds[batchDim])) { + batchDims.push_back(batchDim); + } + } + auto getDimBounds = [&](SmallVector dims) -> SmallVector { + return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; }); + }; + // TODO(Max191): Support multiple M/N/K dimension problems for MMASchedules // once the pipeline is able to support it. After adding multiple dimensions, // all instances of schedule->m/nSubgroupCounts[0] and // schedule->m/n/kTileSizes[0] need to use the full list of sizes instead of // just the first element. 
- GPUMatmulShapeType problem{bounds[mDim], bounds[nDim], bounds[kDim], - lhsElemType, rhsElemType, initElemType}; + GPUMatmulShapeType problem{ + {bounds[mDim]}, {bounds[nDim]}, {bounds[kDim]}, getDimBounds(batchDims), + lhsElemType, rhsElemType, initElemType}; // Helper fn to store mma information. auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma, From 84cc41df6a29bfc6b05b0ff131496e49b1bcdded Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Mon, 3 Feb 2025 14:57:52 -0600 Subject: [PATCH 3/7] address reviwer comments Signed-off-by: Nirvedh Meshram --- .../src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp | 4 +--- .../src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp index ef1e01e1e45b..ff20a3eed216 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp @@ -254,9 +254,7 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem, } // Send very skinny, {2-4}xNxK and Mx{2-4}xK, matmuls to the vector reduction - // pipeline, similar to matvec. Note: Because of reassociation in the vector - // reduction pipeline, this may lead to precission loss. If this ever becomes - // an issue, we can hide this behind a flag. + // pipeline, similar to matvec. // TODO: Figure out what the precise cutoff is, this may be machine dependent. // In situation when alignment isn't required, we disallow intrinsics to be // picked if the tile size is too small. For example, this will force a matmul diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h index f09edd729952..6542d11ebf18 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h @@ -21,9 +21,9 @@ struct GPUMatmulShapeType { GPUMatmulShapeType(int64_t m, int64_t n, int64_t k, Type a, Type b, Type c) : mSizes({m}), nSizes({n}), kSizes({k}), batchSizes({}), aType(a), bType(b), cType(c) {} - GPUMatmulShapeType(SmallVector m, SmallVector n, - SmallVector k, SmallVector batch, - Type a, Type b, Type c) + GPUMatmulShapeType(ArrayRef m, ArrayRef n, + ArrayRef k, ArrayRef batch, Type a, + Type b, Type c) : mSizes(m), nSizes(n), kSizes(k), batchSizes(batch), aType(a), bType(b), cType(c) {} }; From 9e0462b7ed0e284e95176ff69e13d2ae727ce584 Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Mon, 3 Feb 2025 15:33:01 -0600 Subject: [PATCH 4/7] Reviwer comments Signed-off-by: Nirvedh Meshram --- .../Dialect/GPU/TargetUtils/ConfigUtils.cpp | 10 +++++----- .../iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp | 14 +++++++------- .../iree/compiler/Codegen/SPIRV/KernelConfig.cpp | 2 +- .../Preprocessing/Common/PadToIntrinsics.cpp | 6 +++--- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp index 6533115e8b49..48bfcc9a7c2a 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp @@ -121,7 +121,7 @@ static std::optional getMmaScheduleFromProblemAndTarget( bool transposedLhs, bool transposedRhs, bool mustBeAligned 
= true, bool doCPromotion = false) { const int64_t targetSubgroupSize = target.getPreferredSubgroupSize(); - SmallVector intrinsics; + SmallVector intrinsics; for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { // Intrinsics that do not specify a scope cannot be distributed. if (failed(mma.getMmaScope())) @@ -203,23 +203,23 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector bounds, // dimensions will be tiled to 1 in workgroup tiling, so they are ignored when // computing an MMA schedule. SmallVector mDims, nDims, kDims, batchDims; - for (auto mDim : contractionDims.m) { + for (int64_t mDim : contractionDims.m) { if (!ShapedType::isDynamic(bounds[mDim])) { mDims.push_back(mDim); } } - for (auto nDim : contractionDims.n) { + for (int64_t nDim : contractionDims.n) { if (!ShapedType::isDynamic(bounds[nDim])) { nDims.push_back(nDim); } } - for (auto kDim : contractionDims.k) { + for (int64_t kDim : contractionDims.k) { if (!ShapedType::isDynamic(bounds[kDim])) { kDims.push_back(kDim); } } - for (auto batchDim : contractionDims.batch) { + for (int64_t batchDim : contractionDims.batch) { if (!ShapedType::isDynamic(bounds[batchDim])) { batchDims.push_back(batchDim); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index b5d03c20a970..8833b4203156 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -339,7 +339,7 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target, // Helper fn to store mma information. auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma, - SmallVector &intrinsics, + SmallVector &intrinsics, SmallVector &mmaKinds) { auto [mSize, nSize, kSize] = mma.getMNKShape(); auto [aType, bType, cType] = mma.getABCElementTypes(); @@ -347,7 +347,7 @@ setConvolutionVectorDistributionConfig(IREE::GPU::TargetAttr target, mmaKinds.emplace_back(mma); }; - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); SmallVector mmaKinds; MLIRContext *context = op.getContext(); @@ -537,7 +537,7 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, } SmallVector batchDims; - for (auto batchDim : contractionDims->batch) { + for (int64_t batchDim : contractionDims->batch) { if (!ShapedType::isDynamic(bounds[batchDim])) { batchDims.push_back(batchDim); } @@ -557,7 +557,7 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, // Helper fn to store mma information. auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma, - SmallVector &intrinsics, + SmallVector &intrinsics, SmallVector &mmaKinds) { auto [mSize, nSize, kSize] = mma.getMNKShape(); auto [aType, bType, cType] = mma.getABCElementTypes(); @@ -565,7 +565,7 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target, mmaKinds.emplace_back(mma); }; - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); SmallVector mmaKinds; MLIRContext *context = op.getContext(); @@ -777,7 +777,7 @@ static LogicalResult setAttentionIntrinsicBasedVectorDistributionConfig( // Helper fn to store mma information. 
auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma, - SmallVector &intrinsics, + SmallVector &intrinsics, SmallVector &mmaKinds) { auto [mSize, nSize, kSize] = mma.getMNKShape(); auto [aType, bType, cType] = mma.getABCElementTypes(); @@ -785,7 +785,7 @@ static LogicalResult setAttentionIntrinsicBasedVectorDistributionConfig( mmaKinds.emplace_back(mma); }; - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); SmallVector mmaKinds; MLIRContext *context = op.getContext(); diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp index 1ef0e2ceb311..bbdec5c83f6d 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp @@ -891,7 +891,7 @@ setCooperativeMatrixConfig(IREE::GPU::TargetAttr target, linalg::LinalgOp op, // just the first element. GPUMatmulShapeType problem(dimM, dimN, dimK, lhsElem, rhsElem, initElem); - SmallVector intrinsics; + SmallVector intrinsics; intrinsics.reserve(target.getWgp().getMma().size()); for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) { auto [mSize, nSize, kSize] = mma.getMNKShape(); diff --git a/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp b/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp index d4b485ac096c..922e50882775 100644 --- a/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp +++ b/compiler/src/iree/compiler/Preprocessing/Common/PadToIntrinsics.cpp @@ -145,7 +145,7 @@ expandMapsAndIterators(SmallVector &expandedMaps, } } -static SmallVector +static SmallVector getIntrinsics(linalg::LinalgOp linalgOp, ArrayRef executableTargets) { IREE::GPU::TargetAttr target; @@ -179,7 +179,7 @@ padConvOp(RewriterBase &rewriter, linalg::LinalgOp linalgOp, return; // Early exit if cannot find intrinsics or if multiple executable targets. - SmallVector intrinsics = + SmallVector intrinsics = getIntrinsics(linalgOp, executableTargets); if (intrinsics.empty()) return; @@ -326,7 +326,7 @@ static void padContractionLikeOp( } // Early exit if cannot find intrinsics or if multiple executable targets. 
- SmallVector intrinsics = + SmallVector intrinsics = getIntrinsics(linalgOp, executableTargets); if (intrinsics.empty()) return; From 97e9cb4c1f662ff0a7472c3f4a9c606edeaf09ec Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Mon, 3 Feb 2025 16:39:26 -0600 Subject: [PATCH 5/7] bump as CI seems stuck Signed-off-by: Nirvedh Meshram From f6136345b1e88de1ef6dbcec62c6a6c42ed2b1cc Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Mon, 3 Feb 2025 17:09:54 -0600 Subject: [PATCH 6/7] fix new test --- .../src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir index baaed8c1f81e..5c4fe8ac991c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir @@ -343,11 +343,11 @@ func.func @not_vmt() { return } -// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info}> +// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info}> // CHECK: func.func @not_vmt() // CHECK-SAME: translation_info = #[[$TRANSLATION]] // CHECK: linalg.generic -// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 8], thread = [1, 128, 0], workgroup = [1, 128, 1]}> +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, padding = [16, 256, 64], promote_operands = [0, 1, 2], reduction = [0, 0, 4], subgroup = [1, 4, 0], workgroup = [16, 256, 0]}> // ----- From e1c32173528e32e5e786b118d6ba3c12996e5214 Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram Date: Mon, 3 Feb 2025 17:47:42 -0600 Subject: [PATCH 7/7] fix bug in cantargetintrinisic ordering Signed-off-by: Nirvedh Meshram --- .../compiler/Codegen/Common/GPU/GPUHeuristics.cpp | 12 ++++++------ .../compiler/Codegen/LLVMGPU/test/config_matvec.mlir | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp index ff20a3eed216..669d1d1eb539 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp @@ -409,8 +409,8 @@ FailureOr deduceMMASchedule( int64_t subgroupSize, bool transposedLhs, bool transposedRhs, bool canUpcastAcc, bool mustBeAligned, bool doCPromotion) { for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) { - if (failed(canTargetIntrinsic(problem, intrinsic, canUpcastAcc, - subgroupSize, mustBeAligned))) { + if (failed(canTargetIntrinsic(problem, intrinsic, subgroupSize, + canUpcastAcc, mustBeAligned))) { continue; } @@ -464,13 +464,13 @@ FailureOr deduceAttentionSchedule( qkMatmul.nSizes.size() == 1 && qkMatmul.kSizes.size() == 1 && "unimplemented: multi M/N/K attention schedule"); for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) { - if (failed(canTargetIntrinsic(qkMatmul, intrinsic, canUpcastAcc, - subgroupSize, mustBeAligned))) { + if (failed(canTargetIntrinsic(qkMatmul, intrinsic, subgroupSize, + canUpcastAcc, mustBeAligned))) { continue; } - if (failed(canTargetIntrinsic(pvMatmul, intrinsic, canUpcastAcc, - subgroupSize, mustBeAligned))) { + if (failed(canTargetIntrinsic(pvMatmul, intrinsic, subgroupSize, + canUpcastAcc, mustBeAligned))) { continue; } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir 
b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir index 5c4fe8ac991c..baaed8c1f81e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir @@ -343,11 +343,11 @@ func.func @not_vmt() { return } -// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info}> +// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info}> // CHECK: func.func @not_vmt() // CHECK-SAME: translation_info = #[[$TRANSLATION]] // CHECK: linalg.generic -// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout, padding = [16, 256, 64], promote_operands = [0, 1, 2], reduction = [0, 0, 4], subgroup = [1, 4, 0], workgroup = [16, 256, 0]}> +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 8], thread = [1, 128, 0], workgroup = [1, 128, 1]}> // -----
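
For reference, the gating logic these patches converge on can be illustrated with a small standalone sketch. The predicate below mirrors the unaligned-case check added to canTargetIntrinsic in patch 1, together with the kVerySkinnyDimThreshold constant it introduces; the sendToVectorReduction helper, the main() driver, and the sample shapes and subgroup size are illustrative only and are not part of the patch series. Note that batched problems (non-empty batchSizes) and problems with multiple M/N/K dimensions bypass this check entirely and remain eligible for MMA intrinsics with padding, which is why the 12x2x577x577 batched matmul test above stays on the tile-and-fuse path.

#include <cstdint>
#include <iostream>

// Mirrors the constant introduced in GPUHeuristics.cpp by patch 1.
constexpr int64_t kVerySkinnyDimThreshold = 4;

// Simplified form of the new check in canTargetIntrinsic for shapes that are
// not aligned to the intrinsic: a rank-1 M/N/K problem with no batch
// dimensions is rejected for MMA intrinsics (and thus routed to the vector
// reduction pipeline, like a matvec) when one of M/N is very skinny while the
// other exceeds the preferred subgroup size. The helper name and the driver
// below are illustrative, not IREE APIs.
bool sendToVectorReduction(int64_t mSize, int64_t nSize,
                           int64_t preferredSubgroupSize) {
  return (mSize <= kVerySkinnyDimThreshold && nSize > preferredSubgroupSize) ||
         (nSize <= kVerySkinnyDimThreshold && mSize > preferredSubgroupSize);
}

int main() {
  constexpr int64_t subgroupSize = 64; // example value only
  // 2x577x577 matmul (unaligned, no batch dims): skinny M, wide N
  // -> reject the intrinsic, use vector reduction.
  std::cout << sendToVectorReduction(2, 577, subgroupSize) << "\n";   // 1
  // 577x577x577 matmul (unaligned): neither parallel dim is skinny
  // -> the MMA intrinsic remains a candidate (with padding).
  std::cout << sendToVectorReduction(577, 577, subgroupSize) << "\n"; // 0
  // 2x4x577 matmul: both M and N are small, so the gate does not fire
  // because neither dimension exceeds the subgroup size.
  std::cout << sendToVectorReduction(2, 4, subgroupSize) << "\n";     // 0
  return 0;
}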