From 3a870c7dc45bd1d6e57fcf7ace14fb49bdef274f Mon Sep 17 00:00:00 2001
From: hanhanW
Date: Mon, 29 Apr 2024 23:04:48 +0000
Subject: [PATCH 1/4] Do not unroll a lot

---
 .../Codegen/LLVMCPU/KernelDispatch.cpp | 24 +++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 107540262cf4..6b4de3cf168f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -289,6 +289,8 @@ static int64_t getVectorSize(mlir::FunctionOpInterface entryPointFn,
   return getVectorSize(entryPointFn, byteWidth);
 }
 
+static bool isSupportedTransposeOp(linalg::GenericOp genericOp);
+
 /// Returns minimum tiling sizes for each dimension. One dimension is possible
 /// to access at different element types. It determines the tiling sizes by
 /// looking into all the operands.
@@ -319,10 +321,17 @@ getMinTilingSizesForEachDim(mlir::FunctionOpInterface entryPointFn,
         llvm::cast<ShapedType>(inputOutputOpOperands[index].get().getType());
     int64_t tileSize = getVectorSize(entryPointFn, operandType);
 
+    LLVM_DEBUG(KD_DBGS() << "fastestVaryingDim: " << fastestVaryingDim
+                         << ", tileSize: " << tileSize
+                         << ", operandType: " << operandType << "\n");
     minTileSizes[fastestVaryingDim] =
         std::max(minTileSizes[fastestVaryingDim], tileSize);
   }
 
+  for (auto [i, val] : llvm::enumerate(minTileSizes)) {
+    LLVM_DEBUG(KD_DBGS() << "minTileSizes #" << i << ": " << val << "\n");
+  }
+
   // Limit unroll factor. For now, we assume the rightmost non-one tiled
   // dimension is for vectorization and any other non-one dimension is for
   // unrolling.
@@ -333,13 +342,24 @@ getMinTilingSizesForEachDim(mlir::FunctionOpInterface entryPointFn,
         break;
       }
     }
+    bool seen = false;
     for (int unrollDim = vecDim - 1; unrollDim >= 0; --unrollDim) {
+      if (minTileSizes[unrollDim] <= 1) {
+        continue;
+      }
+      int64_t factor = seen ? 1LL : maxUnrollFactor;
+      seen = true;
+      LLVM_DEBUG(KD_DBGS() << "Adjusted min tile sizes: "
+                           << minTileSizes[unrollDim]
+                           << " with factor=" << factor << "\n");
       minTileSizes[unrollDim] =
-          std::min(minTileSizes[unrollDim], maxUnrollFactor);
+          std::min(minTileSizes[unrollDim], factor);
     }
   };
 
-  if (linalgOpInfo.isTranspose()) {
+  auto genericOp = dyn_cast<linalg::GenericOp>(op.getOperation());
+  if (linalgOpInfo.isTranspose() && genericOp &&
+      isSupportedTransposeOp(genericOp)) {
     // Limit unrolling on transpose operations.
     // TODO(dcaballe): Consider input and output transposes.
     limitUnrollFactor(targetMLTransInfo.defaultMaxTransposeUnrollFactor);

From 4ee8a8673508b5397e75810ed402645f231f7066 Mon Sep 17 00:00:00 2001
From: hanhanW
Date: Tue, 30 Apr 2024 22:38:24 +0000
Subject: [PATCH 2/4] [CPU] Limit unrolling factors for generic ops.

---
 .../Codegen/LLVMCPU/KernelDispatch.cpp        | 10 +--
 .../test/select_x86_64_lowering_strategy.mlir | 82 ++++++++-----
 2 files changed, 38 insertions(+), 54 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 6b4de3cf168f..8e5cc773c354 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -321,20 +321,14 @@ getMinTilingSizesForEachDim(mlir::FunctionOpInterface entryPointFn,
         llvm::cast<ShapedType>(inputOutputOpOperands[index].get().getType());
     int64_t tileSize = getVectorSize(entryPointFn, operandType);
 
-    LLVM_DEBUG(KD_DBGS() << "fastestVaryingDim: " << fastestVaryingDim
-                         << ", tileSize: " << tileSize
-                         << ", operandType: " << operandType << "\n");
     minTileSizes[fastestVaryingDim] =
         std::max(minTileSizes[fastestVaryingDim], tileSize);
   }
 
-  for (auto [i, val] : llvm::enumerate(minTileSizes)) {
-    LLVM_DEBUG(KD_DBGS() << "minTileSizes #" << i << ": " << val << "\n");
-  }
-
   // Limit unroll factor. For now, we assume the rightmost non-one tiled
   // dimension is for vectorization and any other non-one dimension is for
-  // unrolling.
+  // unrolling. The utility limits the second rightmost non-one tiled dimension
+  // to be at most `maxUnrollFactor` and sets all other tiled dimensions to 1.
   auto limitUnrollFactor = [&](int64_t maxUnrollFactor) {
     int vecDim;
     for (vecDim = minTileSizes.size() - 1; vecDim >= 0; --vecDim) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
index 01fa1a880850..e92a0de0aa06 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -1398,52 +1398,6 @@ module {
 
 // -----
 
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu", cpu_features = "+avx512f"}>
-#map = affine_map<(d0, d1, d2) -> (d1, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-#map3 = affine_map<(d0, d1, d2) -> (d0)>
-module {
-  func.func @i4_dequant_matvec_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
-    %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<86x128xf32>>
-    %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
-    %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
-    %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
-    %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
-    %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x128xf32>> -> tensor<86x128xf32>
-    %9 = tensor.empty() : tensor<4096xf32>
-    %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4096xf32>) -> tensor<4096xf32>
-    %11 = linalg.generic {indexing_maps = [#map, #map1, #map2, #map2, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %5, %6, %7 : tensor<86x128xf32>, tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%10 : tensor<4096xf32>) {
-    ^bb0(%in: f32, %in_0: i4, %in_1: f32, %in_2: f32, %out: f32):
-      %12 = arith.extui %in_0 : i4 to i32
-      %13 = arith.uitofp %12 : i32 to f32
-      %14 = arith.subf %13, %in_2 : f32
-      %15 = arith.mulf %14, %in_1 : f32
-      %16 = arith.mulf %in, %15 : f32
-      %17 = arith.addf %16, %out : f32
-      linalg.yield %17 : f32
-    } -> tensor<4096xf32>
-    flow.dispatch.tensor.store %11, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
-    return
-  }
-}
-
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
-// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
-// CHECK: func.func @i4_dequant_matvec_f32()
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: linalg.fill
-// CHECK-SAME: lowering_config = #[[CONFIG]]
-// CHECK: linalg.generic {{.*}} iterator_types = ["parallel", "reduction", "reduction"]
-// CHECK-SAME: lowering_config = #[[CONFIG1]]
-
-// -----
-
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}>
 module {
   func.func @batch_mmt4d() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
@@ -1641,3 +1595,39 @@ module {
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: iree_linalg_ext.attention
 // CHECK-SAME: {lowering_config = #[[CONFIG]]}
+
+// -----
+
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
+  cpu = "generic", cpu_features = "",
+  data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+  native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
+module {
+  func.func @elementwise_output_transposed() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<i64>>
+    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<768xi64>>
+    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32xi64>>
+    %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x32x768xf32>>
+    %4 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i64>> -> tensor<i64>
+    %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [768], strides = [1] : !flow.dispatch.tensor<readonly:tensor<768xi64>> -> tensor<768xi64>
+    %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xi64>> -> tensor<32xi64>
+    %7 = tensor.empty() : tensor<32x32x768xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d1)>, affine_map<(d0, d1, d2) -> (d1, d2, d0)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %5, %6 : tensor<i64>, tensor<768xi64>, tensor<32xi64>) outs(%7 : tensor<32x32x768xf32>) {
+    ^bb0(%in: i64, %in_0: i64, %in_1: i64, %out: f32):
+      %9 = arith.addi %in, %in_0 : i64
+      %10 = arith.addi %9, %in_1 : i64
+      %11 = arith.uitofp %10 : i64 to f32
+      linalg.yield %11 : f32
+    } -> tensor<32x32x768xf32>
+    flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [32, 32, 768], strides = [1, 1, 1] : tensor<32x32x768xf32> -> !flow.dispatch.tensor<writeonly:tensor<32x32x768xf32>>
+    return
+  }
+}
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
+// CHECK: func.func @elementwise_output_transposed()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK: linalg.generic
+// CHECK-SAME: {lowering_config = #[[CONFIG]]}
+

From f231dd81a24db1d1d36d6ae6eb22b773cefe1b3d Mon Sep 17 00:00:00 2001
From: hanhanW
Date: Tue, 30 Apr 2024 22:48:52 +0000
Subject: [PATCH 3/4] Bubble up the isSupportedTranspose method impl, so others can reuse it directly

---
 .../Codegen/LLVMCPU/KernelDispatch.cpp | 56 +++++++++----------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 8e5cc773c354..468e0f73f02f 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -289,7 +289,33 @@ static int64_t getVectorSize(mlir::FunctionOpInterface entryPointFn,
   return getVectorSize(entryPointFn, byteWidth);
 }
 
-static bool isSupportedTransposeOp(linalg::GenericOp genericOp);
+/// Returns true if the operation is a GenericOp implementing a supported
+/// transposition.
+static bool isSupportedTransposeOp(linalg::GenericOp genericOp) {
+  // Check that the op has at least 2 dimensions.
+  if (genericOp.getNumLoops() < 2) {
+    return false;
+  }
+
+  // Check that the op has only one input and one output.
+  // TODO(diegocaballero): Generalize to multiple inputs.
+  if ((genericOp.getNumDpsInputs() != 1) || (genericOp.getNumDpsInits() != 1)) {
+    return false;
+  }
+
+  // Check that all the iterators are parallel.
+  if (genericOp.getNumParallelLoops() != genericOp.getNumLoops()) {
+    return false;
+  }
+
+  // Check that the two indexing maps are a permutation of each other.
+  auto indexingMaps = genericOp.getIndexingMapsArray();
+  return !indexingMaps[0].isEmpty() && !indexingMaps[1].isEmpty() &&
+         ((indexingMaps[0].isIdentity() && !indexingMaps[1].isIdentity() &&
+           indexingMaps[1].isPermutation()) ||
+          (!indexingMaps[0].isIdentity() && indexingMaps[0].isPermutation() &&
+           indexingMaps[1].isIdentity()));
+}
 
 /// Returns minimum tiling sizes for each dimension. One dimension is possible
 /// to access at different element types. It determines the tiling sizes by
@@ -1743,34 +1769,6 @@ static void setVectorTileSizes(linalg::LinalgOp op,
   }
 }
 
-/// Returns true if the operation is a GenericOp implementing a supported
-/// transposition.
-static bool isSupportedTransposeOp(linalg::GenericOp genericOp) {
-  // Check that the op has at least 2 dimensions.
-  if (genericOp.getNumLoops() < 2) {
-    return false;
-  }
-
-  // Check that the op has only one input and one output.
-  // TODO(diegocaballero): Generalize to multiple inputs.
-  if ((genericOp.getNumDpsInputs() != 1) || (genericOp.getNumDpsInits() != 1)) {
-    return false;
-  }
-
-  // Check that all the iterators are parallel.
-  if (genericOp.getNumParallelLoops() != genericOp.getNumLoops()) {
-    return false;
-  }
-
-  // Check that the two indexing maps are a permutation of each other.
-  auto indexingMaps = genericOp.getIndexingMapsArray();
-  return !indexingMaps[0].isEmpty() && !indexingMaps[1].isEmpty() &&
-         ((indexingMaps[0].isIdentity() && !indexingMaps[1].isIdentity() &&
-           indexingMaps[1].isPermutation()) ||
-          (!indexingMaps[0].isIdentity() && indexingMaps[0].isPermutation() &&
-           indexingMaps[1].isIdentity()));
-}
-
 /// Sets the default lowering configuration for a generic op to use
 /// CPUDoubleTilingExpert pipeline.
 static LogicalResult

From f178fc1fa6961290da384e2556dcee752b888f90 Mon Sep 17 00:00:00 2001
From: hanhanW
Date: Wed, 1 May 2024 22:38:27 +0000
Subject: [PATCH 4/4] update isSupportedTranspose naming and comments

---
 .../iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 468e0f73f02f..747f918585d0 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -290,8 +290,10 @@ static int64_t getVectorSize(mlir::FunctionOpInterface entryPointFn,
 }
 
 /// Returns true if the operation is a GenericOp implementing a supported
-/// transposition.
-static bool isSupportedTransposeOp(linalg::GenericOp genericOp) {
+/// transposition:
+/// 1. The op has a single input and a single output.
+/// 2. One of the indexing maps is identity and the other is a permutation.
+static bool x86TransposeLoweringPrecondition(linalg::GenericOp genericOp) {
   // Check that the op has at least 2 dimensions.
   if (genericOp.getNumLoops() < 2) {
     return false;
@@ -379,7 +381,7 @@ getMinTilingSizesForEachDim(mlir::FunctionOpInterface entryPointFn,
 
   auto genericOp = dyn_cast<linalg::GenericOp>(op.getOperation());
   if (linalgOpInfo.isTranspose() && genericOp &&
-      isSupportedTransposeOp(genericOp)) {
+      x86TransposeLoweringPrecondition(genericOp)) {
     // Limit unrolling on transpose operations.
     // TODO(dcaballe): Consider input and output transposes.
     limitUnrollFactor(targetMLTransInfo.defaultMaxTransposeUnrollFactor);
@@ -1877,7 +1879,8 @@ setTransposeLikeOpRootConfig(mlir::FunctionOpInterface entryPointFn,
   LLVM_DEBUG(KD_DBGS() << "Setting transpose-like op root configuration\n");
 
   auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(entryPointFn);
-  if (!hasAVX2Feature(targetAttr) || !isSupportedTransposeOp(genericOp)) {
+  if (!hasAVX2Feature(targetAttr) ||
+      !x86TransposeLoweringPrecondition(genericOp)) {
     return failure();
   }
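
Illustrative sketch (not part of the patches above): the unroll-limiting behavior that PATCH 1 and PATCH 2 introduce inside the `limitUnrollFactor` lambda of getMinTilingSizesForEachDim(), extracted into a small standalone C++ program so the effect on `minTileSizes` is easy to see. The free function, the main() driver, and the sample sizes are assumptions for illustration only; in IREE the same logic runs on the lambda's captured `minTileSizes` and is guarded by the generic-op transpose checks shown in the patches.

// Standalone sketch of the patched unroll-factor limiting, assuming
// minTileSizes was already seeded with per-dimension vector sizes.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// The rightmost non-one tile size is treated as the vectorization dimension
// and left untouched; the next non-one tile size to its left is capped at
// maxUnrollFactor; every other non-one tile size further left is forced to 1.
static void limitUnrollFactor(std::vector<int64_t> &minTileSizes,
                              int64_t maxUnrollFactor) {
  int vecDim;
  for (vecDim = static_cast<int>(minTileSizes.size()) - 1; vecDim >= 0;
       --vecDim) {
    if (minTileSizes[vecDim] != 1) {
      break;
    }
  }
  bool seen = false;
  for (int unrollDim = vecDim - 1; unrollDim >= 0; --unrollDim) {
    if (minTileSizes[unrollDim] <= 1) {
      continue;
    }
    // Only the first (rightmost) unrolled dimension keeps a bounded factor.
    int64_t factor = seen ? 1 : maxUnrollFactor;
    seen = true;
    minTileSizes[unrollDim] = std::min(minTileSizes[unrollDim], factor);
  }
}

int main() {
  // E.g. a 3-D generic op whose operand types suggest a vector size of 16 on
  // every dimension.
  std::vector<int64_t> minTileSizes = {16, 16, 16};
  limitUnrollFactor(minTileSizes, /*maxUnrollFactor=*/8);
  // Prints "1 8 16": the vector dimension stays at 16, one dimension keeps an
  // unroll factor capped at 8, and the remaining dimension collapses to 1.
  for (int64_t ts : minTileSizes) {
    std::cout << ts << " ";
  }
  std::cout << "\n";
  return 0;
}

With maxUnrollFactor = 8, an all-16 vector of tile sizes becomes {1, 8, 16}: exactly one dimension retains a bounded unroll factor and everything further left is no longer unrolled, which is the "do not unroll a lot" intent of the series.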