From 158c6369c7f7d47f7410c70e9093285da51cc48a Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram <96096277+nirvedhmeshram@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:27:10 -0600 Subject: [PATCH] Revert "Increase default threshold of TileLargeTensor pass (#19671)" (#19693) This reverts commit 3978ce6ffc652e3afd2ce479f3adb4edc7e6d680. It may be causing regression in MI250 SDXL not observed on pre-submit --- .../iree/compiler/Codegen/Common/Passes.td | 2 +- .../Common/test/tile_large_tensors.mlir | 30 +++++++++---------- .../test/ROCDL/pipeline_tile_and_fuse.mlir | 5 ++-- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td index 245b07f6deaa..7188de257ca8 100644 --- a/compiler/src/iree/compiler/Codegen/Common/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td @@ -654,7 +654,7 @@ def TileLargeTensorsPass : ]; let options = [ Option<"maxVectorSize", "max-vector-size", "int64_t", - /*default=*/"256", + /*default=*/"64", "Maximum static size to tile to (i.e. all remaining ops will be smaller)">, ]; } diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_large_tensors.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_large_tensors.mlir index 3bb51a2d6d0c..66c73da981c0 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/tile_large_tensors.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_large_tensors.mlir @@ -3,22 +3,22 @@ // RUN: FileCheck %s #map = affine_map<(d0, d1) -> (d0, d1)> -func.func @simple_generic(%3: tensor<64x512xf32>, %4: tensor<64x512xf32>, %5: tensor<64x512xf32>) -> tensor<64x512xf32> { +func.func @simple_generic(%3: tensor<64x256xf32>, %4: tensor<64x256xf32>, %5: tensor<64x256xf32>) -> tensor<64x256xf32> { %6 = linalg.generic { indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"] - } ins(%3, %4 : tensor<64x512xf32>, tensor<64x512xf32>) outs(%5 : tensor<64x512xf32>) { + } ins(%3, %4 : tensor<64x256xf32>, tensor<64x256xf32>) outs(%5 : tensor<64x256xf32>) { ^bb0(%in: f32, %in_0: f32, %out: f32): %7 = arith.addf %in, %in_0 : f32 linalg.yield %7 : f32 - } -> tensor<64x512xf32> - return %6 : tensor<64x512xf32> + } -> tensor<64x256xf32> + return %6 : tensor<64x256xf32> } // CHECK-LABEL: func.func @simple_generic // CHECK: scf.for %{{.*}} = %c0 to %c64 step %c1 -// CHECK: scf.for %{{.*}} = %c0 to %c512 step %c256 -// CHECK: linalg.generic {{.*}} outs({{.*}}: tensor<1x256xf32>) +// CHECK: scf.for %{{.*}} = %c0 to %c256 step %c64 +// CHECK: linalg.generic {{.*}} outs({{.*}}: tensor<1x64xf32>) // ----- @@ -65,21 +65,21 @@ func.func @in_nested_region(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>, %5: te // ----- -func.func @multiple_use_tilable_op(%3: tensor<64x512xf32>, %4: tensor<64x512xf32>) -> (tensor<64x512xf32>, tensor<512x64xf32>) { - %add_empty = tensor.empty() : tensor<64x512xf32> +func.func @multiple_use_tilable_op(%3: tensor<64x256xf32>, %4: tensor<64x256xf32>) -> (tensor<64x256xf32>, tensor<256x64xf32>) { + %add_empty = tensor.empty() : tensor<64x256xf32> %6 = linalg.add - ins(%3, %4 : tensor<64x512xf32>, tensor<64x512xf32>) - outs(%add_empty : tensor<64x512xf32>) -> tensor<64x512xf32> - %transpose_empty = tensor.empty() : tensor<512x64xf32> + ins(%3, %4 : tensor<64x256xf32>, tensor<64x256xf32>) + outs(%add_empty : tensor<64x256xf32>) -> tensor<64x256xf32> + %transpose_empty = tensor.empty() : tensor<256x64xf32> %7 = linalg.transpose - ins(%6 : tensor<64x512xf32>) - outs(%transpose_empty : tensor<512x64xf32>) permutation = [1, 0] - return %6, %7 : tensor<64x512xf32>, tensor<512x64xf32> + ins(%6 : tensor<64x256xf32>) + outs(%transpose_empty : tensor<256x64xf32>) permutation = [1, 0] + return %6, %7 : tensor<64x256xf32>, tensor<256x64xf32> } // CHECK-LABEL: func.func @multiple_use_tilable_op // CHECK: %[[ADD_TILING:.+]] = scf.for -// CHECK: linalg.add {{.*}} -> tensor<1x256xf32> +// CHECK: linalg.add {{.*}} -> tensor<1x64xf32> // CHECK: %[[T_TILING:.+]] = scf.for // CHECK: %[[FUSED_ADD:.+]] = linalg.add {{.*}} -> tensor<64x1xf32> // CHECK: linalg.transpose ins(%[[FUSED_ADD]] diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir index 6c9e4d5f752d..1a521e61ebd9 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir @@ -1013,8 +1013,9 @@ hal.executable public @main { // CHECK: scf.yield %[[REDUCE]] // CHECK: scf.for %{{.*}} = %{{.*}} to %c16 step %c1 -// CHECK-COUNT-4: arith.addf {{.*}} : vector<9x9xf32> -// CHECK: vector.transfer_write {{.*}} vector<9x9xi8>, memref<32x16x9x9xi8, #hal.descriptor_type> +// CHECK: scf.for +// CHECK-COUNT-4: arith.addf {{.*}} : vector<9xf32> +// CHECK: vector.transfer_write {{.*}} vector<9xi8>, memref<32x16x9x9xi8, #hal.descriptor_type> // -----