diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 107540262cf4..747f918585d0 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -289,6 +289,36 @@ static int64_t getVectorSize(mlir::FunctionOpInterface entryPointFn,
   return getVectorSize(entryPointFn, byteWidth);
 }
 
+/// Returns true if the operation is a GenericOp implementing a supported
+/// transposition:
+/// 1. The op has a single input and a single output.
+/// 2. One of the indexing_maps is the identity and the other is a permutation.
+static bool x86TransposeLoweringPrecondition(linalg::GenericOp genericOp) {
+  // Check that the op has at least 2 dimensions.
+  if (genericOp.getNumLoops() < 2) {
+    return false;
+  }
+
+  // Check that the op has only one input and one output.
+  // TODO(diegocaballero): Generalize to multiple inputs.
+  if ((genericOp.getNumDpsInputs() != 1) || (genericOp.getNumDpsInits() != 1)) {
+    return false;
+  }
+
+  // Check that all the iterators are parallel.
+  if (genericOp.getNumParallelLoops() != genericOp.getNumLoops()) {
+    return false;
+  }
+
+  // Check that the two indexing maps are a permutation of each other.
+  auto indexingMaps = genericOp.getIndexingMapsArray();
+  return !indexingMaps[0].isEmpty() && !indexingMaps[1].isEmpty() &&
+         ((indexingMaps[0].isIdentity() && !indexingMaps[1].isIdentity() &&
+           indexingMaps[1].isPermutation()) ||
+          (!indexingMaps[0].isIdentity() && indexingMaps[0].isPermutation() &&
+           indexingMaps[1].isIdentity()));
+}
+
 /// Returns minimum tiling sizes for each dimension. One dimension is possible
 /// to access at different element types. It determines the tiling sizes by
 /// looking into all the operands.
@@ -325,7 +355,8 @@ getMinTilingSizesForEachDim(mlir::FunctionOpInterface entryPointFn,
 
   // Limit unroll factor. For now, we assume the rightmost non-one tiled
   // dimension is for vectorization and any other non-one dimension is for
-  // unrolling.
+  // unrolling. The utility limits the second rightmost non-one tiled dimension
+  // to `maxUnrollFactor` and any remaining non-one tiled dimensions to 1.
   auto limitUnrollFactor = [&](int64_t maxUnrollFactor) {
     int vecDim;
     for (vecDim = minTileSizes.size() - 1; vecDim >= 0; --vecDim) {
@@ -333,13 +364,24 @@ getMinTilingSizesForEachDim(mlir::FunctionOpInterface entryPointFn,
         break;
       }
     }
+    bool seen = false;
     for (int unrollDim = vecDim - 1; unrollDim >= 0; --unrollDim) {
+      if (minTileSizes[unrollDim] <= 1) {
+        continue;
+      }
+      int64_t factor = seen ? 1LL : maxUnrollFactor;
+      seen = true;
+      LLVM_DEBUG(KD_DBGS() << "Adjusted min tile sizes: "
+                           << minTileSizes[unrollDim]
+                           << " with factor=" << factor << "\n");
       minTileSizes[unrollDim] =
-          std::min(minTileSizes[unrollDim], maxUnrollFactor);
+          std::min(minTileSizes[unrollDim], factor);
     }
   };
 
-  if (linalgOpInfo.isTranspose()) {
+  auto genericOp = dyn_cast<linalg::GenericOp>(op.getOperation());
+  if (linalgOpInfo.isTranspose() && genericOp &&
+      x86TransposeLoweringPrecondition(genericOp)) {
     // Limit unrolling on transpose operations.
     // TODO(dcaballe): Consider input and output transposes.
     limitUnrollFactor(targetMLTransInfo.defaultMaxTransposeUnrollFactor);
@@ -1729,34 +1771,6 @@ static void setVectorTileSizes(linalg::LinalgOp op,
   }
 }
 
-/// Returns true if the operation is a GenericOp implementing a supported
-/// transposition.
-static bool isSupportedTransposeOp(linalg::GenericOp genericOp) {
-  // Check that the op has at least 2 dimensions.
-  if (genericOp.getNumLoops() < 2) {
-    return false;
-  }
-
-  // Check that the op has only one input and one output.
-  // TODO(diegocaballero): Generalize to multiple inputs.
-  if ((genericOp.getNumDpsInputs() != 1) || (genericOp.getNumDpsInits() != 1)) {
-    return false;
-  }
-
-  // Check that all the iterators are parallel.
-  if (genericOp.getNumParallelLoops() != genericOp.getNumLoops()) {
-    return false;
-  }
-
-  // Check that the two indexing maps are a permutation of each other.
-  auto indexingMaps = genericOp.getIndexingMapsArray();
-  return !indexingMaps[0].isEmpty() && !indexingMaps[1].isEmpty() &&
-         ((indexingMaps[0].isIdentity() && !indexingMaps[1].isIdentity() &&
-           indexingMaps[1].isPermutation()) ||
-          (!indexingMaps[0].isIdentity() && indexingMaps[0].isPermutation() &&
-           indexingMaps[1].isIdentity()));
-}
-
 /// Sets the default lowering configuration for a generic op to use
 /// CPUDoubleTilingExpert pipeline.
 static LogicalResult
@@ -1865,7 +1879,8 @@ setTransposeLikeOpRootConfig(mlir::FunctionOpInterface entryPointFn,
   LLVM_DEBUG(KD_DBGS() << "Setting transpose-like op root configuration\n");
 
   auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(entryPointFn);
-  if (!hasAVX2Feature(targetAttr) || !isSupportedTransposeOp(genericOp)) {
+  if (!hasAVX2Feature(targetAttr) ||
+      !x86TransposeLoweringPrecondition(genericOp)) {
     return failure();
   }
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
index 01fa1a880850..e92a0de0aa06 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -1398,52 +1398,6 @@ module {
 
 // -----
 
-#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu", cpu_features = "+avx512f"}>
-#map = affine_map<(d0, d1, d2) -> (d1, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-#map3 = affine_map<(d0, d1, d2) -> (d0)>
-module {
-  func.func @i4_dequant_matvec_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
-    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
-    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
-    %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<86x128xf32>>
-    %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
-    %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
-    %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
-    %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
-    %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x128xf32>> -> tensor<86x128xf32>
-    %9 = tensor.empty() : tensor<4096xf32>
-    %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4096xf32>) -> tensor<4096xf32>
-    %11 = linalg.generic {indexing_maps = [#map, #map1, #map2, #map2, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %5, %6, %7 : tensor<86x128xf32>, tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%10 : tensor<4096xf32>) {
-    ^bb0(%in: f32, %in_0: i4, %in_1: f32, %in_2: f32, %out: f32):
-      %12 = arith.extui %in_0 : i4 to i32
-      %13 = arith.uitofp %12 : i32 to f32
-      %14 = arith.subf %13, %in_2 : f32
-      %15 = arith.mulf %14, %in_1 : f32
-      %16 = arith.mulf %in, %15 : f32
-      %17 = arith.addf %16, %out : f32
-      linalg.yield %17 : f32
-    } -> tensor<4096xf32>
-    flow.dispatch.tensor.store %11, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
-    return
-  }
-}
-
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
-// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config
-// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
-// CHECK: func.func @i4_dequant_matvec_f32()
-// CHECK-SAME: translation_info = #[[TRANSLATION]]
-// CHECK: linalg.fill
-// CHECK-SAME: lowering_config = #[[CONFIG]]
-// CHECK: linalg.generic {{.*}} iterator_types = ["parallel", "reduction", "reduction"]
-// CHECK-SAME: lowering_config = #[[CONFIG1]]
-
-// -----
-
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}>
 module {
   func.func @batch_mmt4d() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
@@ -1641,3 +1595,39 @@ module {
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: iree_linalg_ext.attention
 // CHECK-SAME: {lowering_config = #[[CONFIG]]}
+
+// -----
+
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
+  cpu = "generic", cpu_features = "",
+  data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+  native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
+module {
+  func.func @elementwise_output_transposed() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+    %c0 = arith.constant 0 : index
+    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<i64>>
+    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<768xi64>>
+    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32xi64>>
+    %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x32x768xf32>>
+    %4 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i64>> -> tensor<i64>
+    %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [768], strides = [1] : !flow.dispatch.tensor<readonly:tensor<768xi64>> -> tensor<768xi64>
+    %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xi64>> -> tensor<32xi64>
+    %7 = tensor.empty() : tensor<32x32x768xf32>
+    %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d1)>, affine_map<(d0, d1, d2) -> (d1, d2, d0)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %5, %6 : tensor<i64>, tensor<768xi64>, tensor<32xi64>) outs(%7 : tensor<32x32x768xf32>) {
+    ^bb0(%in: i64, %in_0: i64, %in_1: i64, %out: f32):
+      %9 = arith.addi %in, %in_0 : i64
+      %10 = arith.addi %9, %in_1 : i64
+      %11 = arith.uitofp %10 : i64 to f32
+      linalg.yield %11 : f32
+    } -> tensor<32x32x768xf32>
+    flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [32, 32, 768], strides = [1, 1, 1] : tensor<32x32x768xf32> -> !flow.dispatch.tensor<writeonly:tensor<32x32x768xf32>>
+    return
+  }
+}
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
+// CHECK: func.func @elementwise_output_transposed()
+// CHECK-SAME: translation_info = #[[TRANSLATION]]
+// CHECK: linalg.generic
+// CHECK-SAME: {lowering_config = #[[CONFIG]]}
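
For reference, and not part of the patch itself: a minimal sketch of a linalg.generic that does satisfy the new x86TransposeLoweringPrecondition (single input, single init, all-parallel iterators, and an identity/permutation pair of indexing maps). The function name, value names, and tensor shapes below are illustrative only.

#identity = affine_map<(d0, d1) -> (d0, d1)>
#transposed = affine_map<(d0, d1) -> (d1, d0)>
// A 2-D transpose written as a linalg.generic: the input map is the identity
// and the output map is a non-identity permutation, so the precondition above
// returns true and limitUnrollFactor applies to this op.
func.func @transpose_2d(%in: tensor<16x32xf32>, %init: tensor<32x16xf32>) -> tensor<32x16xf32> {
  %0 = linalg.generic {indexing_maps = [#identity, #transposed],
                       iterator_types = ["parallel", "parallel"]}
      ins(%in : tensor<16x32xf32>) outs(%init : tensor<32x16xf32>) {
  ^bb0(%a: f32, %out: f32):
    linalg.yield %a : f32
  } -> tensor<32x16xf32>
  return %0 : tensor<32x16xf32>
}

By contrast, the @elementwise_output_transposed test added above has three inputs, so it fails the single-input check and the transpose unroll limit is no longer applied to its tile sizes.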