diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp index 3753700c6111..3d5ed20d145a 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp @@ -405,11 +405,11 @@ void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager, // SplitReductionPass takes care of banked-tiling. funcPassManager.addPass( createLLVMCPUSplitReductionPass(clEnableReassociateFpReductions)); - funcPassManager.addPass(createLLVMCPUTilePass(i)); + funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperands(i)); continue; } - funcPassManager.addPass(createLLVMCPUTilePass(i)); + funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperands(i)); } } diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir index e927b2fe8799..6936adef7e01 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/pipeline_tests.mlir @@ -395,3 +395,29 @@ func.func @dequant_matmul() attributes {hal.executable.target = #executable_targ // CHECK: scf.for // CHECK: arith.uitofp // CHECK: vector.fma + +// ----- + +#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+fma,+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}> +func.func @fuse_inputs_reduction() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0], sizes = [64, 1, 1, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<64x1x1x16x16xf32> + %3 = tensor.empty() : tensor<64x16x16xf32> + %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<64x16x16xf32>) -> tensor<64x16x16xf32> + %unpack = tensor.unpack %2 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %3 : tensor<64x1x1x16x16xf32> -> tensor<64x16x16xf32> + %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%unpack : tensor<64x16x16xf32>) outs(%4 : tensor<64x16x16xf32>) { + ^bb0(%in: f32, %out: f32): + %6 = arith.addf %out, %in : f32 + linalg.yield %6 : f32 + } -> tensor<64x16x16xf32> + flow.dispatch.tensor.store %5, %1, offsets = [0, 0, 0], sizes = [64, 16, 16], strides = [1, 1, 1] : tensor<64x16x16xf32> -> !flow.dispatch.tensor> + return +} +// CHECK-LABEL: func.func @fuse_inputs_reduction +// CHECK: scf.for +// CHECK: vector.load +// CHECK-NOT: scf.for +// CHECK: arith.addf