[CPU] Limit unrolling factors for generic ops. (iree-org#17227)
The revision also deprecates an outdated lit test that is impacted by the change, and adds another lit test simplified from iree-org#16993.

Fixes iree-org#16993

Signed-off-by: Lubo Litchev <lubol@google.com>
hanhanW authored and LLITCHEV committed Jul 30, 2024
1 parent 73f62a9 commit 4098fca
Showing 2 changed files with 83 additions and 78 deletions.
79 changes: 47 additions & 32 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -294,6 +294,36 @@ static int64_t getVectorSize(mlir::FunctionOpInterface entryPointFn,
  return getVectorSize(entryPointFn, byteWidth);
}

/// Returns true if the operation is a GenericOp implementing a supported
/// transposition:
/// 1. The op has a single input and a single output.
/// 2. One of the indexing maps is the identity and the other is a permutation.
static bool x86TransposeLoweringPrecondition(linalg::GenericOp genericOp) {
  // Check that the op has at least 2 dimensions.
  if (genericOp.getNumLoops() < 2) {
    return false;
  }

  // Check that the op has only one input and one output.
  // TODO(diegocaballero): Generalize to multiple inputs.
  if ((genericOp.getNumDpsInputs() != 1) || (genericOp.getNumDpsInits() != 1)) {
    return false;
  }

  // Check that all the iterators are parallel.
  if (genericOp.getNumParallelLoops() != genericOp.getNumLoops()) {
    return false;
  }

  // Check that the two indexing maps are a permutation of each other.
  auto indexingMaps = genericOp.getIndexingMapsArray();
  return !indexingMaps[0].isEmpty() && !indexingMaps[1].isEmpty() &&
         ((indexingMaps[0].isIdentity() && !indexingMaps[1].isIdentity() &&
           indexingMaps[1].isPermutation()) ||
          (!indexingMaps[0].isIdentity() && indexingMaps[0].isPermutation() &&
           indexingMaps[1].isIdentity()));
}
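
For illustration only, not part of this commit: a minimal standalone sketch of the indexing-map pattern the precondition accepts, namely one identity map paired with a non-identity permutation map, as a plain 2-D transpose produces. It uses MLIR's AffineMap API; everything below is a hypothetical example rather than code from this change.

#include "mlir/IR/AffineMap.h"
#include "mlir/IR/MLIRContext.h"
#include <cassert>

int main() {
  mlir::MLIRContext ctx;
  // (d0, d1) -> (d0, d1): the identity map, e.g. on the single input.
  mlir::AffineMap id = mlir::AffineMap::getMultiDimIdentityMap(2, &ctx);
  // (d0, d1) -> (d1, d0): a non-identity permutation, e.g. on the single output.
  mlir::AffineMap perm = mlir::AffineMap::getPermutationMap({1, 0}, &ctx);
  // This mirrors the final check in x86TransposeLoweringPrecondition above.
  assert(id.isIdentity() && !perm.isIdentity() && perm.isPermutation());
  return 0;
}

Any other combination, such as two identity maps or a map that is not a permutation, makes the precondition return false.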

/// Returns minimum tiling sizes for each dimension. One dimension may be
/// accessed at different element types. It determines the tiling sizes by
/// looking into all the operands.
@@ -330,21 +360,33 @@ getMinTilingSizesForEachDim(mlir::FunctionOpInterface entryPointFn,

  // Limit unroll factor. For now, we assume the rightmost non-one tiled
  // dimension is for vectorization and any other non-one dimension is for
-  // unrolling.
+  // unrolling. The utility caps the second rightmost non-one tiled dimension
+  // at `maxUnrollFactor` and clamps any other non-one tiled dimensions to 1.
  auto limitUnrollFactor = [&](int64_t maxUnrollFactor) {
    int vecDim;
    for (vecDim = minTileSizes.size() - 1; vecDim >= 0; --vecDim) {
      if (minTileSizes[vecDim] > 1) {
        break;
      }
    }
    bool seen = false;
    for (int unrollDim = vecDim - 1; unrollDim >= 0; --unrollDim) {
      if (minTileSizes[unrollDim] <= 1) {
        continue;
      }
      int64_t factor = seen ? 1LL : maxUnrollFactor;
      seen = true;
      LLVM_DEBUG(KD_DBGS() << "Adjusted min tile sizes: "
                           << minTileSizes[unrollDim]
                           << " with factor=" << factor << "\n");
      minTileSizes[unrollDim] =
-          std::min<int64_t>(minTileSizes[unrollDim], maxUnrollFactor);
+          std::min<int64_t>(minTileSizes[unrollDim], factor);
    }
  };
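
To make the effect concrete, here is a standalone restatement of the lambda above with a worked example. The free function limitUnroll below is hypothetical; it only mirrors the logic shown in this hunk.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Sketch of the unroll-limiting logic: the rightmost non-one size is left
// alone (the vectorization dim), the next non-one size to its left is capped
// at maxUnrollFactor, and any further non-one sizes are clamped to 1.
static std::vector<int64_t> limitUnroll(std::vector<int64_t> sizes,
                                        int64_t maxUnrollFactor) {
  int vecDim;
  for (vecDim = static_cast<int>(sizes.size()) - 1; vecDim >= 0; --vecDim) {
    if (sizes[vecDim] > 1) break;
  }
  bool seen = false;
  for (int unrollDim = vecDim - 1; unrollDim >= 0; --unrollDim) {
    if (sizes[unrollDim] <= 1) continue;
    int64_t factor = seen ? 1 : maxUnrollFactor;
    seen = true;
    sizes[unrollDim] = std::min<int64_t>(sizes[unrollDim], factor);
  }
  return sizes;
}

int main() {
  // {4, 8, 1, 16} with maxUnrollFactor = 2: dim 3 (16) is the vector dim,
  // dim 1 (8) is capped to 2, and dim 0 (4) is clamped to 1.
  assert(limitUnroll({4, 8, 1, 16}, 2) == (std::vector<int64_t>{1, 2, 1, 16}));
  return 0;
}

In other words, at most one dimension besides the vectorization dimension keeps an unroll factor greater than 1. Before this change, every such dimension was capped at maxUnrollFactor individually, so unrolling could still multiply across several dimensions.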

-  if (linalgOpInfo.isTranspose()) {
+  auto genericOp = dyn_cast<linalg::GenericOp>(op.getOperation());
+  if (linalgOpInfo.isTranspose() && genericOp &&
+      x86TransposeLoweringPrecondition(genericOp)) {
    // Limit unrolling on transpose operations.
    // TODO(dcaballe): Consider input and output transposes.
    limitUnrollFactor(targetMLTransInfo.defaultMaxTransposeUnrollFactor);
@@ -1741,34 +1783,6 @@ static void setVectorTileSizes(linalg::LinalgOp op,
}
}

/// Returns true if the operation is a GenericOp implementing a supported
/// transposition.
static bool isSupportedTransposeOp(linalg::GenericOp genericOp) {
  // Check that the op has at least 2 dimensions.
  if (genericOp.getNumLoops() < 2) {
    return false;
  }

  // Check that the op has only one input and one output.
  // TODO(diegocaballero): Generalize to multiple inputs.
  if ((genericOp.getNumDpsInputs() != 1) || (genericOp.getNumDpsInits() != 1)) {
    return false;
  }

  // Check that all the iterators are parallel.
  if (genericOp.getNumParallelLoops() != genericOp.getNumLoops()) {
    return false;
  }

  // Check that the two indexing maps are a permutation of each other.
  auto indexingMaps = genericOp.getIndexingMapsArray();
  return !indexingMaps[0].isEmpty() && !indexingMaps[1].isEmpty() &&
         ((indexingMaps[0].isIdentity() && !indexingMaps[1].isIdentity() &&
           indexingMaps[1].isPermutation()) ||
          (!indexingMaps[0].isIdentity() && indexingMaps[0].isPermutation() &&
           indexingMaps[1].isIdentity()));
}

/// Sets the default lowering configuration for a generic op to use
/// CPUDoubleTilingExpert pipeline.
static LogicalResult
@@ -1877,7 +1891,8 @@ setTransposeLikeOpRootConfig(mlir::FunctionOpInterface entryPointFn,
  LLVM_DEBUG(KD_DBGS() << "Setting transpose-like op root configuration\n");

  auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(entryPointFn);
-  if (!hasAVX2Feature(targetAttr) || !isSupportedTransposeOp(genericOp)) {
+  if (!hasAVX2Feature(targetAttr) ||
+      !x86TransposeLoweringPrecondition(genericOp)) {
    return failure();
  }

82 changes: 36 additions & 46 deletions (second changed file, a lit test)
@@ -1422,52 +1422,6 @@ module {

// -----

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu", cpu_features = "+avx512f"}>
#map = affine_map<(d0, d1, d2) -> (d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2) -> (d0)>
module {
  func.func @i4_dequant_matvec_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
    %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<86x128xf32>>
    %4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
    %5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
    %6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
    %7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
    %8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x128xf32>> -> tensor<86x128xf32>
    %9 = tensor.empty() : tensor<4096xf32>
    %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4096xf32>) -> tensor<4096xf32>
    %11 = linalg.generic {indexing_maps = [#map, #map1, #map2, #map2, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %5, %6, %7 : tensor<86x128xf32>, tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%10 : tensor<4096xf32>) {
    ^bb0(%in: f32, %in_0: i4, %in_1: f32, %in_2: f32, %out: f32):
      %12 = arith.extui %in_0 : i4 to i32
      %13 = arith.uitofp %12 : i32 to f32
      %14 = arith.subf %13, %in_2 : f32
      %15 = arith.mulf %14, %in_1 : f32
      %16 = arith.mulf %in, %15 : f32
      %17 = arith.addf %16, %out : f32
      linalg.yield %17 : f32
    } -> tensor<4096xf32>
    flow.dispatch.tensor.store %11, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
    return
  }
}

// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32], [4], [0], [0]]>
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 0, 0], [4, 0, 0], [0, 4, 16], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: func.func @i4_dequant_matvec_f32()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG]]
// CHECK: linalg.generic {{.*}} iterator_types = ["parallel", "reduction", "reduction"]
// CHECK-SAME: lowering_config = #[[CONFIG1]]

// -----

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}>
module {
func.func @batch_mmt4d() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
@@ -1665,3 +1619,39 @@ module {
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: iree_linalg_ext.attention
// CHECK-SAME: {lowering_config = #[[CONFIG]]}

// -----

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
cpu = "generic", cpu_features = "",
data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
module {
  func.func @elementwise_output_transposed() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
    %c0 = arith.constant 0 : index
    %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<i64>>
    %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<768xi64>>
    %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32xi64>>
    %3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x32x768xf32>>
    %4 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i64>> -> tensor<i64>
    %5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [768], strides = [1] : !flow.dispatch.tensor<readonly:tensor<768xi64>> -> tensor<768xi64>
    %6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xi64>> -> tensor<32xi64>
    %7 = tensor.empty() : tensor<32x32x768xf32>
    %8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d1)>, affine_map<(d0, d1, d2) -> (d1, d2, d0)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %5, %6 : tensor<i64>, tensor<768xi64>, tensor<32xi64>) outs(%7 : tensor<32x32x768xf32>) {
    ^bb0(%in: i64, %in_0: i64, %in_1: i64, %out: f32):
      %9 = arith.addi %in, %in_0 : i64
      %10 = arith.addi %9, %in_1 : i64
      %11 = arith.uitofp %10 : i64 to f32
      linalg.yield %11 : f32
    } -> tensor<32x32x768xf32>
    flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [32, 32, 768], strides = [1, 1, 1] : tensor<32x32x768xf32> -> !flow.dispatch.tensor<writeonly:tensor<32x32x768xf32>>
    return
  }
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 32, 32], [8, 8, 1], [0, 0, 0], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: func.func @elementwise_output_transposed()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: {lowering_config = #[[CONFIG]]}
