Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Codegen][CPU] Eliminate all-true vector masks after vectorization #18190

Merged
merged 3 commits into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,14 @@ class GenericVectorizationPass final
void runOnOperation() override;
};

/// Converts from iree_compiler::VscaleRange to vector::VscaleRange.
static std::optional<vector::VscaleRange>
toVectorVscaleRange(std::optional<iree_compiler::VscaleRange> vscaleRange) {
if (!vscaleRange.has_value())
return std::nullopt;
return vector::VscaleRange{vscaleRange->min, vscaleRange->max};
}
Comment on lines +328 to +334
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was confused that why can't we just use vector::VscaleRagnge, then I found that it's added to upstream recently (llvm/llvm-project@9b06e25). Can you help remove the IREE version, and switch to use upstream version? It can be done in a follow-up.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll do this in a follow up PR 👍


void GenericVectorizationPass::runOnOperation() {
MLIRContext *context = &getContext();
auto funcOp = getOperation();
Expand Down Expand Up @@ -377,6 +385,17 @@ void GenericVectorizationPass::runOnOperation() {
vectorizeGatherAccesses);
};

{
// Eliminate (all-true) vector masks as early as possible (to avoid missing
// optimizations/folds). This is particularly beneficial for scalable
// vectors that use dynamic tensor shapes.
auto targetAttr =
iree_compiler::IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
auto vscaleRange = iree_compiler::getDefaultVscaleRange(targetAttr);
vector::eliminateVectorMasks(rewriter, funcOp,
toVectorVscaleRange(vscaleRange));
}

{
// Canonicalize mask related ops before we lower them.
RewritePatternSet maskCanonPatterns(funcOp.getContext());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -445,3 +445,61 @@ func.func @dynamic_fill_with_scalable_tiling_infer_remainder_vector_size(%arg0:
// CHECK-MASK: scf.for
// CHECK-MASK: scf.for
// CHECK-MASK: vector.transfer_write %[[CST]], {{.*}} {in_bounds = [true, true, true, true]} : vector<1x1x4x[4]xf32>, tensor<1x1x4x?xf32>

// -----

#aarch64_sve = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", target_triple = "aarch64-none-elf"}>
#config = #iree_codegen.lowering_config<tile_sizes = [[0, 0, 0, 0], [1, 4, [4], 0], [0, 0, 0, 3], [0, 0, 0, 0]]>
#map = affine_map<()[s0] -> (-(96 mod s0) + 96)>
#map1 = affine_map<(d0) -> (d0 * 2)>

func.func @depthwise_conv_fold_away_masking(%arg0: tensor<1x68x120x96xf32>, %arg1: tensor<1x137x241x96xf32>, %arg2: tensor<3x3x96xf32>) -> tensor<1x68x120x96xf32>
attributes {hal.executable.target = #aarch64_sve}
{
%c3 = arith.constant 3 : index
%c120 = arith.constant 120 : index
%c68 = arith.constant 68 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%vscale = vector.vscale
%c4_vscale = arith.muli %vscale, %c4 : index
%0 = scf.for %arg3 = %c0 to %c68 step %c1 iter_args(%arg4 = %arg0) -> (tensor<1x68x120x96xf32>) {
%1 = scf.for %arg5 = %c0 to %c120 step %c4 iter_args(%arg6 = %arg4) -> (tensor<1x68x120x96xf32>) {
%2 = affine.apply #map()[%c4_vscale]
%3 = scf.for %arg7 = %c0 to %2 step %c4_vscale iter_args(%arg8 = %arg6) -> (tensor<1x68x120x96xf32>) {
%4 = affine.apply #map1(%arg3)
%5 = affine.apply #map1(%arg5)
%extracted_slice = tensor.extract_slice %arg1[0, %4, %5, %arg7] [1, 3, 9, %c4_vscale] [1, 1, 1, 1] : tensor<1x137x241x96xf32> to tensor<1x3x9x?xf32>
%extracted_slice_0 = tensor.extract_slice %arg2[0, 0, %arg7] [3, 3, %c4_vscale] [1, 1, 1] : tensor<3x3x96xf32> to tensor<3x3x?xf32>
%extracted_slice_1 = tensor.extract_slice %arg8[0, %arg3, %arg5, %arg7] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x68x120x96xf32> to tensor<1x1x4x?xf32>
%6 = linalg.fill ins(%cst : f32) outs(%extracted_slice_1 : tensor<1x1x4x?xf32>) -> tensor<1x1x4x?xf32>
%7 = scf.for %arg9 = %c0 to %c3 step %c1 iter_args(%arg10 = %6) -> (tensor<1x1x4x?xf32>) {
%extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg9, 0, 0] [1, 1, 9, %c4_vscale] [1, 1, 1, 1] : tensor<1x3x9x?xf32> to tensor<1x1x9x?xf32>
%extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg9, 0, 0] [1, 3, %c4_vscale] [1, 1, 1] : tensor<3x3x?xf32> to tensor<1x3x?xf32>
%extracted_slice_4 = tensor.extract_slice %arg10[0, 0, 0, 0] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x4x?xf32> to tensor<1x1x4x?xf32>
%extracted_slice_5 = tensor.extract_slice %extracted_slice_2[0, 0, 0, 0] [1, 1, 9, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x9x?xf32> to tensor<1x9x?xf32>
%extracted_slice_6 = tensor.extract_slice %extracted_slice_3[0, 0, 0] [1, 3, %c4_vscale] [1, 1, 1] : tensor<1x3x?xf32> to tensor<3x?xf32>
%extracted_slice_7 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x4x?xf32> to tensor<1x4x?xf32>
%8 = linalg.depthwise_conv_1d_nwc_wc {dilations = dense<1> : vector<1xi64>, lowering_config = #config, strides = dense<2> : vector<1xi64>} ins(%extracted_slice_5, %extracted_slice_6 : tensor<1x9x?xf32>, tensor<3x?xf32>) outs(%extracted_slice_7 : tensor<1x4x?xf32>) -> tensor<1x4x?xf32>
%inserted_slice_8 = tensor.insert_slice %8 into %extracted_slice_4[0, 0, 0, 0] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x4x?xf32> into tensor<1x1x4x?xf32>
%inserted_slice_9 = tensor.insert_slice %inserted_slice_8 into %arg10[0, 0, 0, 0] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x4x?xf32> into tensor<1x1x4x?xf32>
scf.yield %inserted_slice_9 : tensor<1x1x4x?xf32>
}
%inserted_slice = tensor.insert_slice %7 into %arg8[0, %arg3, %arg5, %arg7] [1, 1, 4, %c4_vscale] [1, 1, 1, 1] : tensor<1x1x4x?xf32> into tensor<1x68x120x96xf32>
scf.yield %inserted_slice : tensor<1x68x120x96xf32>
}
scf.yield %3 : tensor<1x68x120x96xf32>
}
scf.yield %1 : tensor<1x68x120x96xf32>
}
return %0 : tensor<1x68x120x96xf32>
}

/// This checks that the masks (introduced by the vectorizer) are eliminated by
/// the end of the iree-codegen-generic-vectorization pass.

// CHECK-MASK-LABEL: func.func @depthwise_conv_fold_away_masking
// CHECK-MASK-NOT: vector.create_mask
// CHECK-MASK-NOT: vector.constant_mask
Loading