From 158c6369c7f7d47f7410c70e9093285da51cc48a Mon Sep 17 00:00:00 2001
From: Nirvedh Meshram <96096277+nirvedhmeshram@users.noreply.github.com>
Date: Mon, 13 Jan 2025 17:27:10 -0600
Subject: [PATCH] Revert "Increase default threshold of TileLargeTensor pass
 (#19671)" (#19693)

This reverts commit 3978ce6ffc652e3afd2ce479f3adb4edc7e6d680.

It may be causing a regression in MI250 SDXL that was not observed in pre-submit.
---
 .../iree/compiler/Codegen/Common/Passes.td    |  2 +-
 .../Common/test/tile_large_tensors.mlir       | 30 +++++++++----------
 .../test/ROCDL/pipeline_tile_and_fuse.mlir    |  5 ++--
 3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td
index 245b07f6deaa..7188de257ca8 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -654,7 +654,7 @@ def TileLargeTensorsPass :
   ];
   let options = [
     Option<"maxVectorSize", "max-vector-size", "int64_t",
-           /*default=*/"256",
+           /*default=*/"64",
            "Maximum static size to tile to (i.e. all remaining ops will be smaller)">,
   ];
 }
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_large_tensors.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_large_tensors.mlir
index 3bb51a2d6d0c..66c73da981c0 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/tile_large_tensors.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_large_tensors.mlir
@@ -3,22 +3,22 @@
 // RUN:   FileCheck %s
 
 #map = affine_map<(d0, d1) -> (d0, d1)>
-func.func @simple_generic(%3: tensor<64x512xf32>, %4: tensor<64x512xf32>, %5: tensor<64x512xf32>) -> tensor<64x512xf32> {
+func.func @simple_generic(%3: tensor<64x256xf32>, %4: tensor<64x256xf32>, %5: tensor<64x256xf32>) -> tensor<64x256xf32> {
   %6 = linalg.generic {
     indexing_maps = [#map, #map, #map],
     iterator_types = ["parallel", "parallel"]
-    } ins(%3, %4 : tensor<64x512xf32>, tensor<64x512xf32>) outs(%5 : tensor<64x512xf32>) {
+    } ins(%3, %4 : tensor<64x256xf32>, tensor<64x256xf32>) outs(%5 : tensor<64x256xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %7 = arith.addf %in, %in_0 : f32
     linalg.yield %7 : f32
-  } -> tensor<64x512xf32>
-  return %6 : tensor<64x512xf32>
+  } -> tensor<64x256xf32>
+  return %6 : tensor<64x256xf32>
 }
 
 // CHECK-LABEL: func.func @simple_generic
 //       CHECK:   scf.for %{{.*}} = %c0 to %c64 step %c1
-//       CHECK:     scf.for %{{.*}} = %c0 to %c512 step %c256
-//       CHECK:       linalg.generic {{.*}} outs({{.*}}: tensor<1x256xf32>)
+//       CHECK:     scf.for %{{.*}} = %c0 to %c256 step %c64
+//       CHECK:       linalg.generic {{.*}} outs({{.*}}: tensor<1x64xf32>)
 
 // -----
 
@@ -65,21 +65,21 @@ func.func @in_nested_region(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>, %5: te
 
 // -----
 
-func.func @multiple_use_tilable_op(%3: tensor<64x512xf32>, %4: tensor<64x512xf32>) -> (tensor<64x512xf32>, tensor<512x64xf32>) {
-  %add_empty = tensor.empty() : tensor<64x512xf32>
+func.func @multiple_use_tilable_op(%3: tensor<64x256xf32>, %4: tensor<64x256xf32>) -> (tensor<64x256xf32>, tensor<256x64xf32>) {
+  %add_empty = tensor.empty() : tensor<64x256xf32>
   %6 = linalg.add
-    ins(%3, %4 : tensor<64x512xf32>, tensor<64x512xf32>)
-    outs(%add_empty : tensor<64x512xf32>) -> tensor<64x512xf32>
-  %transpose_empty = tensor.empty() : tensor<512x64xf32>
+    ins(%3, %4 : tensor<64x256xf32>, tensor<64x256xf32>)
+    outs(%add_empty : tensor<64x256xf32>) -> tensor<64x256xf32>
+  %transpose_empty = tensor.empty() : tensor<256x64xf32>
   %7 = linalg.transpose
-    ins(%6 : tensor<64x512xf32>)
-    outs(%transpose_empty : tensor<512x64xf32>) permutation = [1, 0]
-  return %6, %7 : tensor<64x512xf32>, tensor<512x64xf32>
+    ins(%6 : tensor<64x256xf32>)
+    outs(%transpose_empty : tensor<256x64xf32>) permutation = [1, 0]
+  return %6, %7 : tensor<64x256xf32>, tensor<256x64xf32>
 }
 
 // CHECK-LABEL: func.func @multiple_use_tilable_op
 //       CHECK:   %[[ADD_TILING:.+]] = scf.for
-//       CHECK:     linalg.add {{.*}} -> tensor<1x256xf32>
+//       CHECK:     linalg.add {{.*}} -> tensor<1x64xf32>
 //       CHECK:   %[[T_TILING:.+]] = scf.for
 //       CHECK:     %[[FUSED_ADD:.+]] = linalg.add {{.*}} -> tensor<64x1xf32>
 //       CHECK:     linalg.transpose ins(%[[FUSED_ADD]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index 6c9e4d5f752d..1a521e61ebd9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -1013,8 +1013,9 @@ hal.executable public @main {
 //       CHECK:         scf.yield %[[REDUCE]]
 
 //       CHECK:   scf.for %{{.*}} = %{{.*}} to %c16 step %c1
-// CHECK-COUNT-4:     arith.addf {{.*}} : vector<9x9xf32>
-//       CHECK:       vector.transfer_write {{.*}} vector<9x9xi8>, memref<32x16x9x9xi8, #hal.descriptor_type<storage_buffer>>
+//       CHECK:     scf.for
+// CHECK-COUNT-4:     arith.addf {{.*}} : vector<9xf32>
+//       CHECK:       vector.transfer_write {{.*}} vector<9xi8>, memref<32x16x9x9xi8, #hal.descriptor_type<storage_buffer>>
 
 // -----