Merge f178fc1 into b8ef25c
hanhanW authored May 1, 2024
2 parents (b8ef25c + f178fc1), commit 7787871
Showing 2 changed files with 83 additions and 78 deletions.
79 changes: 47 additions & 32 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -289,6 +289,36 @@ static int64_t getVectorSize(mlir::FunctionOpInterface entryPointFn,
return getVectorSize(entryPointFn, byteWidth);
}

/// Returns true if the operation is a GenericOp implementing a supported
/// transposition:
/// 1. The op has a single input and a single output.
/// 2. One of the indexing maps is the identity and the other is a non-identity permutation.
static bool x86TransposeLoweringPrecondition(linalg::GenericOp genericOp) {
// Check that the op has at least 2 dimensions.
if (genericOp.getNumLoops() < 2) {
return false;
}

// Check that the op has only one input and one output.
// TODO(diegocaballero): Generalize to multiple inputs.
if ((genericOp.getNumDpsInputs() != 1) || (genericOp.getNumDpsInits() != 1)) {
return false;
}

// Check that all the iterators are parallel.
if (genericOp.getNumParallelLoops() != genericOp.getNumLoops()) {
return false;
}

// Check that the two indexing maps are a permutation of each other.
auto indexingMaps = genericOp.getIndexingMapsArray();
return !indexingMaps[0].isEmpty() && !indexingMaps[1].isEmpty() &&
((indexingMaps[0].isIdentity() && !indexingMaps[1].isIdentity() &&
indexingMaps[1].isPermutation()) ||
(!indexingMaps[0].isIdentity() && indexingMaps[0].isPermutation() &&
indexingMaps[1].isIdentity()));
}
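For illustration, a minimal linalg.generic that would satisfy this precondition is sketched below (hypothetical function name and shapes, not part of this change): one input and one init, all-parallel iterators, an identity map on the input, and a non-identity permutation map on the output.

#map_id = affine_map<(d0, d1) -> (d0, d1)>
#map_perm = affine_map<(d0, d1) -> (d1, d0)>
func.func @transpose_2d_example(%src: tensor<16x32xf32>, %init: tensor<32x16xf32>) -> tensor<32x16xf32> {
  // Identity map on the input, permutation map on the output; both loops are parallel.
  %0 = linalg.generic {indexing_maps = [#map_id, #map_perm], iterator_types = ["parallel", "parallel"]}
      ins(%src : tensor<16x32xf32>) outs(%init : tensor<32x16xf32>) {
  ^bb0(%in: f32, %out: f32):
    linalg.yield %in : f32
  } -> tensor<32x16xf32>
  return %0 : tensor<32x16xf32>
}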

/// Returns minimum tiling sizes for each dimension. A dimension may be accessed
/// at different element types by different operands, so the tiling sizes are
/// determined by looking at all the operands.
@@ -325,21 +355,33 @@ getMinTilingSizesForEachDim(mlir::FunctionOpInterface entryPointFn,

// Limit unroll factor. For now, we assume the rightmost non-one tiled
// dimension is for vectorization and any other non-one dimension is for
- // unrolling.
+ // unrolling. The util limits the second rightmost non-one tiled dimension to
+ // at most `maxUnrollFactor` and the remaining non-one tiled dimensions to 1.
auto limitUnrollFactor = [&](int64_t maxUnrollFactor) {
int vecDim;
for (vecDim = minTileSizes.size() - 1; vecDim >= 0; --vecDim) {
if (minTileSizes[vecDim] > 1) {
break;
}
}
bool seen = false;
for (int unrollDim = vecDim - 1; unrollDim >= 0; --unrollDim) {
if (minTileSizes[unrollDim] <= 1) {
continue;
}
int64_t factor = seen ? 1LL : maxUnrollFactor;
seen = true;
LLVM_DEBUG(KD_DBGS() << "Adjusted min tile sizes: "
<< minTileSizes[unrollDim]
<< " with factor=" << factor << "\n");
minTileSizes[unrollDim] =
- std::min<int64_t>(minTileSizes[unrollDim], maxUnrollFactor);
+ std::min<int64_t>(minTileSizes[unrollDim], factor);
}
};

- if (linalgOpInfo.isTranspose()) {
+ auto genericOp = dyn_cast<linalg::GenericOp>(op.getOperation());
+ if (linalgOpInfo.isTranspose() && genericOp &&
+     x86TransposeLoweringPrecondition(genericOp)) {
// Limit unrolling on transpose operations.
// TODO(dcaballe): Consider input and output transposes.
limitUnrollFactor(targetMLTransInfo.defaultMaxTransposeUnrollFactor);
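For instance, assuming minTileSizes = [4, 8, 16] and maxUnrollFactor = 8: the rightmost non-one entry (16) is kept as the vectorization dimension, the second rightmost non-one entry is clamped to min(8, 8) = 8, and every remaining non-one entry is clamped to 1, giving [1, 8, 16]. The previous code clamped all of them to maxUnrollFactor, which would have left [4, 8, 16].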
@@ -1736,34 +1778,6 @@ static void setVectorTileSizes(linalg::LinalgOp op,
}
}

/// Returns true if the operation is a GenericOp implementing a supported
/// transposition.
static bool isSupportedTransposeOp(linalg::GenericOp genericOp) {
// Check that the op has at least 2 dimensions.
if (genericOp.getNumLoops() < 2) {
return false;
}

// Check that the op has only one input and one output.
// TODO(diegocaballero): Generalize to multiple inputs.
if ((genericOp.getNumDpsInputs() != 1) || (genericOp.getNumDpsInits() != 1)) {
return false;
}

// Check that all the iterators are parallel.
if (genericOp.getNumParallelLoops() != genericOp.getNumLoops()) {
return false;
}

// Check that the two indexing maps are a permutation of each other.
auto indexingMaps = genericOp.getIndexingMapsArray();
return !indexingMaps[0].isEmpty() && !indexingMaps[1].isEmpty() &&
((indexingMaps[0].isIdentity() && !indexingMaps[1].isIdentity() &&
indexingMaps[1].isPermutation()) ||
(!indexingMaps[0].isIdentity() && indexingMaps[0].isPermutation() &&
indexingMaps[1].isIdentity()));
}

/// Sets the default lowering configuration for a generic op to use
/// CPUDoubleTilingExpert pipeline.
static LogicalResult
@@ -1872,7 +1886,8 @@ setTransposeLikeOpRootConfig(mlir::FunctionOpInterface entryPointFn,
LLVM_DEBUG(KD_DBGS() << "Setting transpose-like op root configuration\n");

auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(entryPointFn);
- if (!hasAVX2Feature(targetAttr) || !isSupportedTransposeOp(genericOp)) {
+ if (!hasAVX2Feature(targetAttr) ||
+     !x86TransposeLoweringPrecondition(genericOp)) {
return failure();
}
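As a reference for the AVX2 gate above: hasAVX2Feature looks for "+avx2" in the target's cpu_features string, so a hypothetical executable target along the following lines (attribute values are illustrative only, not taken from this change) would pass it, while the other operand of the check is the same precondition helper used for the unroll-factor limiting above.

#executable_target_x86_64_avx2_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
  cpu = "generic", cpu_features = "+avx,+avx2,+fma",
  data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
  native_vector_size = 32 : index, target_triple = "x86_64-unknown-linux-gnu"}>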

36 additions & 46 deletions (second changed file: an LLVMCPU lowering-strategy MLIR test)
@@ -1422,52 +1422,6 @@ module {

// -----

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu", cpu_features = "+avx512f"}>
#map = affine_map<(d0, d1, d2) -> (d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2) -> (d0)>
module {
func.func @i4_dequant_matvec_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<86x128xf32>>
%4 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
%5 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
%6 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
%7 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
%8 = flow.dispatch.tensor.load %3, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x128xf32>> -> tensor<86x128xf32>
%9 = tensor.empty() : tensor<4096xf32>
%10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<4096xf32>) -> tensor<4096xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map2, #map2, #map3], iterator_types = ["parallel", "reduction", "reduction"]} ins(%8, %5, %6, %7 : tensor<86x128xf32>, tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%10 : tensor<4096xf32>) {
^bb0(%in: f32, %in_0: i4, %in_1: f32, %in_2: f32, %out: f32):
%12 = arith.extui %in_0 : i4 to i32
%13 = arith.uitofp %12 : i32 to f32
%14 = arith.subf %13, %in_2 : f32
%15 = arith.mulf %14, %in_1 : f32
%16 = arith.mulf %in, %15 : f32
%17 = arith.addf %16, %out : f32
linalg.yield %17 : f32
} -> tensor<4096xf32>
flow.dispatch.tensor.store %11, %4, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
return
}
}

// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32], [4], [0], [0]]>
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 0, 0], [4, 0, 0], [0, 4, 16], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: func.func @i4_dequant_matvec_f32()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.fill
// CHECK-SAME: lowering_config = #[[CONFIG]]
// CHECK: linalg.generic {{.*}} iterator_types = ["parallel", "reduction", "reduction"]
// CHECK-SAME: lowering_config = #[[CONFIG1]]

// -----

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "cascadelake", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 32 : index, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = true}>
module {
func.func @batch_mmt4d() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
@@ -1665,3 +1619,39 @@ module {
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: iree_linalg_ext.attention
// CHECK-SAME: {lowering_config = #[[CONFIG]]}

// -----

#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
cpu = "generic", cpu_features = "",
data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
native_vector_size = 64 : index, target_triple = "x86_64-none-elf"}>
module {
func.func @elementwise_output_transposed() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<i64>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<768xi64>>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<32xi64>>
%3 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<32x32x768xf32>>
%4 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<i64>> -> tensor<i64>
%5 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [768], strides = [1] : !flow.dispatch.tensor<readonly:tensor<768xi64>> -> tensor<768xi64>
%6 = flow.dispatch.tensor.load %2, offsets = [0], sizes = [32], strides = [1] : !flow.dispatch.tensor<readonly:tensor<32xi64>> -> tensor<32xi64>
%7 = tensor.empty() : tensor<32x32x768xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0)>, affine_map<(d0, d1, d2) -> (d1)>, affine_map<(d0, d1, d2) -> (d1, d2, d0)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4, %5, %6 : tensor<i64>, tensor<768xi64>, tensor<32xi64>) outs(%7 : tensor<32x32x768xf32>) {
^bb0(%in: i64, %in_0: i64, %in_1: i64, %out: f32):
%9 = arith.addi %in, %in_0 : i64
%10 = arith.addi %9, %in_1 : i64
%11 = arith.uitofp %10 : i64 to f32
linalg.yield %11 : f32
} -> tensor<32x32x768xf32>
flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [32, 32, 768], strides = [1, 1, 1] : tensor<32x32x768xf32> -> !flow.dispatch.tensor<writeonly:tensor<32x32x768xf32>>
return
}
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 32, 32], [8, 8, 1], [0, 0, 0], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: func.func @elementwise_output_transposed()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.generic
// CHECK-SAME: {lowering_config = #[[CONFIG]]}
