[LLVMCPU] Update the tile&fuse MultiTilingExpert pipeline (#19352)
The tiling step is updated: createLLVMCPUTilePass is replaced with createLLVMCPUTileRootAndFuseInputOperands, which tiles the root operation and fuses the producers of its input operands into the tiled loops.
pashu123 authored Dec 4, 2024
1 parent 29229df commit 939984c
Showing 2 changed files with 28 additions and 2 deletions.
4 changes: 2 additions & 2 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -405,11 +405,11 @@ void addMultiTilingExpertPassPipeline(OpPassManager &funcPassManager,
       // SplitReductionPass takes care of banked-tiling.
       funcPassManager.addPass(
           createLLVMCPUSplitReductionPass(clEnableReassociateFpReductions));
-      funcPassManager.addPass(createLLVMCPUTilePass(i));
+      funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperands(i));
       continue;
     }
 
-    funcPassManager.addPass(createLLVMCPUTilePass(i));
+    funcPassManager.addPass(createLLVMCPUTileRootAndFuseInputOperands(i));
   }
 }
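
For context on the new pass: unlike plain tiling, createLLVMCPUTileRootAndFuseInputOperands tiles the root operation of the dispatch and also fuses the producers of its input operands into the generated loop nest. The following is a hand-written sketch of the resulting structure for a reduction whose input comes from a tensor.unpack, with shapes and tile sizes simplified for illustration; it is not the literal output of the pipeline.

// Sketch: the reduction is tiled along its reduction dimension and the
// tensor.unpack producer is fused into the loop body, so one 1x1x1x16x16
// slice is unpacked per iteration instead of materializing the whole
// unpacked tensor up front.
func.func @tile_root_and_fuse_sketch(%in: tensor<64x1x1x16x16xf32>) -> tensor<16x16xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %cst = arith.constant 0.000000e+00 : f32
  %empty = tensor.empty() : tensor<16x16xf32>
  %init = linalg.fill ins(%cst : f32) outs(%empty : tensor<16x16xf32>) -> tensor<16x16xf32>
  %res = scf.for %iv = %c0 to %c64 step %c1 iter_args(%acc = %init) -> (tensor<16x16xf32>) {
    %slice = tensor.extract_slice %in[%iv, 0, 0, 0, 0] [1, 1, 1, 16, 16] [1, 1, 1, 1, 1] : tensor<64x1x1x16x16xf32> to tensor<1x1x1x16x16xf32>
    %dst = tensor.empty() : tensor<1x16x16xf32>
    // The fused producer: unpack is recomputed on the current slice inside the loop.
    %tile = tensor.unpack %slice inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %dst : tensor<1x1x1x16x16xf32> -> tensor<1x16x16xf32>
    %sum = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>], iterator_types = ["reduction", "parallel", "parallel"]} ins(%tile : tensor<1x16x16xf32>) outs(%acc : tensor<16x16xf32>) {
    ^bb0(%a: f32, %b: f32):
      %add = arith.addf %b, %a : f32
      linalg.yield %add : f32
    } -> tensor<16x16xf32>
    scf.yield %sum : tensor<16x16xf32>
  }
  return %res : tensor<16x16xf32>
}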

@@ -395,3 +395,29 @@ func.func @dequant_matmul() attributes {hal.executable.target = #executable_targ
 // CHECK: scf.for
 // CHECK: arith.uitofp
 // CHECK: vector.fma
+
+// -----
+
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "+fma,+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 64 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "all"}>
+func.func @fuse_inputs_reduction() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<64x1x1x16x16xf32>>
+  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<64x16x16xf32>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0], sizes = [64, 1, 1, 16, 16], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<64x1x1x16x16xf32>> -> tensor<64x1x1x16x16xf32>
+  %3 = tensor.empty() : tensor<64x16x16xf32>
+  %4 = linalg.fill ins(%cst : f32) outs(%3 : tensor<64x16x16xf32>) -> tensor<64x16x16xf32>
+  %unpack = tensor.unpack %2 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %3 : tensor<64x1x1x16x16xf32> -> tensor<64x16x16xf32>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%unpack : tensor<64x16x16xf32>) outs(%4 : tensor<64x16x16xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    %6 = arith.addf %out, %in : f32
+    linalg.yield %6 : f32
+  } -> tensor<64x16x16xf32>
+  flow.dispatch.tensor.store %5, %1, offsets = [0, 0, 0], sizes = [64, 16, 16], strides = [1, 1, 1] : tensor<64x16x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<64x16x16xf32>>
+  return
+}
+// CHECK-LABEL: func.func @fuse_inputs_reduction
+// CHECK: scf.for
+// CHECK: vector.load
+// CHECK-NOT: scf.for
+// CHECK: arith.addf
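
The CHECK lines encode the fusion: a single scf.for whose body contains both the vector.load of the unpacked input and the arith.addf of the reduction, with CHECK-NOT: scf.for ruling out a separate loop nest for the unpack in between. A minimal hand-written example of IR with the checked shape (illustrative names, shapes, and ops; not actual pipeline output):

// One loop: the load of the already-unpacked input and the reduction add
// live in the same body; no second scf.for appears between them.
func.func @checked_structure(%src: memref<64x16x16xf32>, %acc: memref<16x16xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  scf.for %iv = %c0 to %c64 step %c1 {
    %v = vector.load %src[%iv, %c0, %c0] : memref<64x16x16xf32>, vector<16xf32>
    %a = vector.load %acc[%c0, %c0] : memref<16x16xf32>, vector<16xf32>
    %s = arith.addf %a, %v : vector<16xf32>
    vector.store %s, %acc[%c0, %c0] : memref<16x16xf32>, vector<16xf32>
  }
  return
}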
