From 3a870c7dc45bd1d6e57fcf7ace14fb49bdef274f Mon Sep 17 00:00:00 2001
From: hanhanW
Date: Mon, 29 Apr 2024 23:04:48 +0000
Subject: [PATCH 1/4] Do not unroll a lot

---
 .../Codegen/LLVMCPU/KernelDispatch.cpp | 24 +++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 107540262cf4..6b4de3cf168f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -289,6 +289,8 @@ static int64_t getVectorSize(mlir::FunctionOpInterface entryPointFn,
   return getVectorSize(entryPointFn, byteWidth);
 }
 
+static bool isSupportedTransposeOp(linalg::GenericOp genericOp);
+
 /// Returns minimum tiling sizes for each dimension. One dimension is possible
 /// to access at different element types. It determines the tiling sizes by
 /// looking into all the operands.
@@ -319,10 +321,17 @@ getMinTilingSizesForEachDim(mlir::FunctionOpInterface entryPointFn,
         llvm::cast<ShapedType>(inputOutputOpOperands[index].get().getType());
     int64_t tileSize = getVectorSize(entryPointFn, operandType);
 
+    LLVM_DEBUG(KD_DBGS() << "fastestVaryingDim: " << fastestVaryingDim
+                         << ", tileSize: " << tileSize
+                         << ", operandType: " << operandType << "\n");
     minTileSizes[fastestVaryingDim] =
         std::max(minTileSizes[fastestVaryingDim], tileSize);
   }
 
+  for (auto [i, val] : llvm::enumerate(minTileSizes)) {
+    LLVM_DEBUG(KD_DBGS() << "minTileSizes #" << i << ": " << val << "\n");
+  }
+
   // Limit unroll factor. For now, we assume the rightmost non-one tiled
   // dimension is for vectorization and any other non-one dimension is for
   // unrolling.
@@ -333,13 +342,24 @@ getMinTilingSizesForEachDim(mlir::FunctionOpInterface entryPointFn,
         break;
       }
     }
+    bool seen = false;
     for (int unrollDim = vecDim - 1; unrollDim >= 0; --unrollDim) {
+      if (minTileSizes[unrollDim] <= 1) {
+        continue;
+      }
+      int64_t factor = seen ? 1LL : maxUnrollFactor;
+      seen = true;
+      LLVM_DEBUG(KD_DBGS() << "Adjusted min tile sizes: "
+                           << minTileSizes[unrollDim]
+                           << " with factor=" << factor << "\n");
       minTileSizes[unrollDim] =
-          std::min(minTileSizes[unrollDim], maxUnrollFactor);
+          std::min(minTileSizes[unrollDim], factor);
     }
   };
 
-  if (linalgOpInfo.isTranspose()) {
+  auto genericOp = dyn_cast<linalg::GenericOp>(op.getOperation());
+  if (linalgOpInfo.isTranspose() && genericOp &&
+      isSupportedTransposeOp(genericOp)) {
     // Limit unrolling on transpose operations.
     // TODO(dcaballe): Consider input and output transposes.
     limitUnrollFactor(targetMLTransInfo.defaultMaxTransposeUnrollFactor);

From 4ee8a8673508b5397e75810ed402645f231f7066 Mon Sep 17 00:00:00 2001
From: hanhanW
Date: Tue, 30 Apr 2024 22:38:24 +0000
Subject: [PATCH 2/4] [CPU] Limit unrolling factors for generic ops.

---
 .../Codegen/LLVMCPU/KernelDispatch.cpp        | 10 +--
 .../test/select_x86_64_lowering_strategy.mlir | 82 ++++++++-----
 2 files changed, 38 insertions(+), 54 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 6b4de3cf168f..8e5cc773c354 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -321,20 +321,14 @@ getMinTilingSizesForEachDim(mlir::FunctionOpInterface entryPointFn,
         llvm::cast<ShapedType>(inputOutputOpOperands[index].get().getType());
     int64_t tileSize = getVectorSize(entryPointFn, operandType);
 
-    LLVM_DEBUG(KD_DBGS() << "fastestVaryingDim: " << fastestVaryingDim
-                         << ", tileSize: " << tileSize
-                         << ", operandType: " << operandType << "\n");
     minTileSizes[fastestVaryingDim] =
         std::max(minTileSizes[fastestVaryingDim], tileSize);
   }
 
-  for (auto [i, val] : llvm::enumerate(minTileSizes)) {
-    LLVM_DEBUG(KD_DBGS() << "minTileSizes #" << i << ": " << val << "\n");
-  }
-
   // Limit unroll factor. For now, we assume the rightmost non-one tiled
   // dimension is for vectorization and any other non-one dimension is for
-  // unrolling.
+  // unrolling. The utility limits the second rightmost non-one tiled dimension
+  // to be at most `maxUnrollFactor` and sets all other tiled dimensions to 1.
   auto limitUnrollFactor = [&](int64_t maxUnrollFactor) {
     int vecDim;
     for (vecDim = minTileSizes.size() - 1; vecDim >= 0; --vecDim) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
index 01fa1a880850..e92a0de0aa06 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -1398,52 +1398,6 @@ module {
 
 // -----
 
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu", cpu_features = "+avx512f"}>
-#map = affine_map<(d0, d1, d2) -> (d1, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-#map3 = affine_map<(d0, d1, d2) -> (d0)>
-module {
-  func.func @i4_dequant_matvec_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
-    %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<86x128xf32>>
-    %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
-    %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
-    %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
-    %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
-    %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x128xf32>> -> tensor<86x128xf32>
-    %9 = tensor.empty() : tensor<4096xf32>
-    %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4096xf32>) -> tensor<4096xf32>
-    %11 = linalg.generic {indexing_maps = [#map, #map1, #map2, #map2, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %5, %6, %7 : tensor<86x128xf32>, tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%10 : tensor<4096xf32>) {
-    ^bb0(%in: f32, %in_0: i4, %in_1: f32, %in_2: f32, %out: f32):
-      %12 = arith.extui %in_0 : i4 to i32
-      %13 = arith.uitofp %12 : i32 to f32
-      %14 = arith.subf %13, %in_2 : f32
-      %15 = arith.mulf %14, %in_1 : f32
-      %16 = arith.mulf %in, %15 : f32
-      %17 = arith.addf %16, %out : f32
-      linalg.yield %17 : f32
-    } -> tensor<4096xf32>
-    flow.dispatch.tensor.store %11, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
-    return
-  }
-}
-
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
-// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
-// CHECK: func.func @i4_dequant_matvec_f32()
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: linalg.fill
-// CHECK-SAME: lowering_config = #[[CONFIG]]
-// CHECK: linalg.generic {{.*}} iterator_types = ["parallel", "reduction", "reduction"]
-// CHECK-SAME: lowering_config = #[[CONFIG1]]
-
-// -----
-
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}>
 module {
   func.func @batch_mmt4d() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
@@ -1641,3 +1595,39 @@ module {
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: iree_linalg_ext.attention
 // CHECK-SAME: {lowering_config = #[[CONFIG]]}
+
+// -----
+
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
+  cpu = "generic", cpu_features = "",
+  data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+  native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
+module {
+  func.func @elementwise_output_transposed() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<i64>>
+    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<768xi64>>
+    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32xi64>>
+    %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x32x768xf32>>
+    %4 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i64>> -> tensor<i64>
+    %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [768], strides = [1] : !flow.dispatch.tensor<readonly:tensor<768xi64>> -> tensor<768xi64>
+    %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xi64>> -> tensor<32xi64>
+    %7 = tensor.empty() : tensor<32x32x768xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d1)>, affine_map<(d0, d1, d2) -> (d1, d2, d0)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %5, %6 : tensor<i64>, tensor<768xi64>, tensor<32xi64>) outs(%7 : tensor<32x32x768xf32>) {
+    ^bb0(%in: i64, %in_0: i64, %in_1: i64, %out: f32):
+      %9 = arith.addi %in, %in_0 : i64
+      %10 = arith.addi %9, %in_1 : i64
+      %11 = arith.uitofp %10 : i64 to f32
+      linalg.yield %11 : f32
+    } -> tensor<32x32x768xf32>
+    flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [32, 32, 768], strides = [1, 1, 1] : tensor<32x32x768xf32> -> !flow.dispatch.tensor<writeonly:tensor<32x32x768xf32>>
+    return
+  }
+}
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
+// CHECK: func.func @elementwise_output_transposed()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK: linalg.generic
+// CHECK-SAME: {lowering_config = #[[CONFIG]]}
+

From f231dd81a24db1d1d36d6ae6eb22b773cefe1b3d Mon Sep 17 00:00:00 2001
From: hanhanW
Date: Tue, 30 Apr 2024 22:48:52 +0000
Subject: [PATCH 3/4] Bubble up the isSupportedTranspose method impl, so others can reuse it directly

---
 .../Codegen/LLVMCPU/KernelDispatch.cpp | 56 +++++++++----------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 8e5cc773c354..468e0f73f02f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -289,7 +289,33 @@ static int64_t getVectorSize(mlir::FunctionOpInterface entryPointFn,
   return getVectorSize(entryPointFn, byteWidth);
 }
 
-static bool isSupportedTransposeOp(linalg::GenericOp genericOp);
+/// Returns true if the operation is a GenericOp implementing a supported
+/// transposition.
+static bool isSupportedTransposeOp(linalg::GenericOp genericOp) {
+  // Check that the op has at least 2 dimensions.
+  if (genericOp.getNumLoops() < 2) {
+    return false;
+  }
+
+  // Check that the op has only one input and one output.
+  // TODO(diegocaballero): Generalize to multiple inputs.
+  if ((genericOp.getNumDpsInputs() != 1) || (genericOp.getNumDpsInits() != 1)) {
+    return false;
+  }
+
+  // Check that all the iterators are parallel.
+  if (genericOp.getNumParallelLoops() != genericOp.getNumLoops()) {
+    return false;
+  }
+
+  // Check that the two indexing maps are a permutation of each other.
+  auto indexingMaps = genericOp.getIndexingMapsArray();
+  return !indexingMaps[0].isEmpty() && !indexingMaps[1].isEmpty() &&
+         ((indexingMaps[0].isIdentity() && !indexingMaps[1].isIdentity() &&
+           indexingMaps[1].isPermutation()) ||
+          (!indexingMaps[0].isIdentity() && indexingMaps[0].isPermutation() &&
+           indexingMaps[1].isIdentity()));
+}
 
 /// Returns minimum tiling sizes for each dimension. One dimension is possible
 /// to access at different element types. It determines the tiling sizes by
@@ -1743,34 +1769,6 @@ static void setVectorTileSizes(linalg::LinalgOp op,
   }
 }
 
-/// Returns true if the operation is a GenericOp implementing a supported
-/// transposition.
-static bool isSupportedTransposeOp(linalg::GenericOp genericOp) {
-  // Check that the op has at least 2 dimensions.
-  if (genericOp.getNumLoops() < 2) {
-    return false;
-  }
-
-  // Check that the op has only one input and one output.
-  // TODO(diegocaballero): Generalize to multiple inputs.
-  if ((genericOp.getNumDpsInputs() != 1) || (genericOp.getNumDpsInits() != 1)) {
-    return false;
-  }
-
-  // Check that all the iterators are parallel.
-  if (genericOp.getNumParallelLoops() != genericOp.getNumLoops()) {
-    return false;
-  }
-
-  // Check that the two indexing maps are a permutation of each other.
-  auto indexingMaps = genericOp.getIndexingMapsArray();
-  return !indexingMaps[0].isEmpty() && !indexingMaps[1].isEmpty() &&
-         ((indexingMaps[0].isIdentity() && !indexingMaps[1].isIdentity() &&
-           indexingMaps[1].isPermutation()) ||
-          (!indexingMaps[0].isIdentity() && indexingMaps[0].isPermutation() &&
-           indexingMaps[1].isIdentity()));
-}
-
 /// Sets the default lowering configuration for a generic op to use
 /// CPUDoubleTilingExpert pipeline.
 static LogicalResult

From f178fc1fa6961290da384e2556dcee752b888f90 Mon Sep 17 00:00:00 2001
From: hanhanW
Date: Wed, 1 May 2024 22:38:27 +0000
Subject: [PATCH 4/4] update isSupportedTranspose naming and comments

---
 .../iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 468e0f73f02f..747f918585d0 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -290,8 +290,10 @@ static int64_t getVectorSize(mlir::FunctionOpInterface entryPointFn,
 }
 
 /// Returns true if the operation is a GenericOp implementing a supported
-/// transposition.
-static bool isSupportedTransposeOp(linalg::GenericOp genericOp) {
+/// transposition:
+/// 1. The op has a single input and a single output.
+/// 2. One of the indexing maps is identity and the other is a permutation.
+static bool x86TransposeLoweringPrecondition(linalg::GenericOp genericOp) {
   // Check that the op has at least 2 dimensions.
   if (genericOp.getNumLoops() < 2) {
     return false;
@@ -379,7 +381,7 @@ getMinTilingSizesForEachDim(mlir::FunctionOpInterface entryPointFn,
 
   auto genericOp = dyn_cast<linalg::GenericOp>(op.getOperation());
   if (linalgOpInfo.isTranspose() && genericOp &&
-      isSupportedTransposeOp(genericOp)) {
+      x86TransposeLoweringPrecondition(genericOp)) {
     // Limit unrolling on transpose operations.
     // TODO(dcaballe): Consider input and output transposes.
     limitUnrollFactor(targetMLTransInfo.defaultMaxTransposeUnrollFactor);
@@ -1877,7 +1879,8 @@ setTransposeLikeOpRootConfig(mlir::FunctionOpInterface entryPointFn,
   LLVM_DEBUG(KD_DBGS() << "Setting transpose-like op root configuration\n");
 
   auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(entryPointFn);
-  if (!hasAVX2Feature(targetAttr) || !isSupportedTransposeOp(genericOp)) {
+  if (!hasAVX2Feature(targetAttr) ||
+      !x86TransposeLoweringPrecondition(genericOp)) {
     return failure();
   }
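
Illustrative sketch (not part of the patches above): the unroll-limiting behavior that PATCH 1 and PATCH 2 introduce inside the `limitUnrollFactor` lambda of getMinTilingSizesForEachDim(), extracted into a small standalone C++ program so the effect on `minTileSizes` is easy to see. The free function, the main() driver, and the sample sizes are assumptions for illustration only; in IREE the same logic runs on the lambda's captured `minTileSizes` and is guarded by the generic-op transpose checks shown in the patches.

// Standalone sketch of the patched unroll-factor limiting, assuming
// minTileSizes was already seeded with per-dimension vector sizes.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// The rightmost non-one tile size is treated as the vectorization dimension
// and left untouched; the next non-one tile size to its left is capped at
// maxUnrollFactor; every other non-one tile size further left is forced to 1.
static void limitUnrollFactor(std::vector<int64_t> &minTileSizes,
                              int64_t maxUnrollFactor) {
  int vecDim;
  for (vecDim = static_cast<int>(minTileSizes.size()) - 1; vecDim >= 0;
       --vecDim) {
    if (minTileSizes[vecDim] != 1) {
      break;
    }
  }
  bool seen = false;
  for (int unrollDim = vecDim - 1; unrollDim >= 0; --unrollDim) {
    if (minTileSizes[unrollDim] <= 1) {
      continue;
    }
    // Only the first (rightmost) unrolled dimension keeps a bounded factor.
    int64_t factor = seen ? 1 : maxUnrollFactor;
    seen = true;
    minTileSizes[unrollDim] = std::min(minTileSizes[unrollDim], factor);
  }
}

int main() {
  // E.g. a 3-D generic op whose operand types suggest a vector size of 16 on
  // every dimension.
  std::vector<int64_t> minTileSizes = {16, 16, 16};
  limitUnrollFactor(minTileSizes, /*maxUnrollFactor=*/8);
  // Prints "1 8 16": the vector dimension stays at 16, one dimension keeps an
  // unroll factor capped at 8, and the remaining dimension collapses to 1.
  for (int64_t ts : minTileSizes) {
    std::cout << ts << " ";
  }
  std::cout << "\n";
  return 0;
}

With maxUnrollFactor = 8, an all-16 vector of tile sizes becomes {1, 8, 16}: exactly one dimension retains a bounded unroll factor and everything further left is no longer unrolled, which is the "do not unroll a lot" intent of the series.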