Skip to content

Commit

Permalink
Support i1 datatype with an experimental flag. (#18713)
Browse files Browse the repository at this point in the history
Enable packed i1 datatype storage

This commit introduces support for packed storage of the `i1` (bit)
datatype. When subbyte type packing is enabled via the
`--iree-experimental-packed-i1-storage` option, vectors of `i1` elements
will be stored in a compact packed representation.

For example, a `vector<6xi1>` will occupy a single byte of memory with
the 6 bit elements packed together and 2 padding bits. A
`vector<3x3xi1>` will take up 2 bytes, with the 9 bit elements packed
across the bytes and 7 padding bits.

Limitations:
- To ensure correct behavior, the tiling configuration aligns the
innermost dimension data loads with byte boundaries. This is
necessitated by the current lack of emulation for unaligned subbyte
vector loading/storing.
- Unaligned subbyte emulation support can be added in the future, though
it may incur some performance overhead.

This change requires corresponding updates in the frontend to utilize
the packed `i1` storage format.

Signed-off-by: Alan Li <me@alanli.org>
  • Loading branch information
lialan authored Nov 15, 2024
1 parent 8cb8743 commit c80fa3b
Show file tree
Hide file tree
Showing 10 changed files with 191 additions and 8 deletions.
20 changes: 20 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2915,6 +2915,26 @@ setLoweringConfigForComputeOps(mlir::FunctionOpInterface entryPointFn,
}
}

// Round the innermost vector tile size up so that one tile of the result
// spans a whole number of bytes, i.e. (tile size * element bit width) is a
// multiple of 8. This is required for now because sub-byte vector stores are
// not fully supported. Once they are, this adjustment can be removed; note
// that emulating sub-byte sized vector loads and stores will have a
// performance impact.
auto resultTypes = rootOperation->getResultTypes();
if (!commonVecTileSizes.empty() && !resultTypes.empty()) {
  // Use dyn_cast + isIntOrFloat() instead of cast + getIntOrFloatBitWidth():
  // the latter pair asserts on non-shaped results and on element types that
  // have no bit width (e.g. index), and neither case needs adjusting here.
  auto resultType = dyn_cast<ShapedType>(resultTypes.front());
  if (resultType && resultType.getElementType().isIntOrFloat()) {
    auto elementTypeSize = resultType.getElementType().getIntOrFloatBitWidth();
    // For now this is only enabled for i1.
    if (elementTypeSize == 1) {
      auto innermostTileSize = commonVecTileSizes.back();
      commonVecTileSizes.back() =
          llvm::alignTo(innermostTileSize * elementTypeSize, 8) /
          elementTypeSize;
    }
  }
}

// Set the lowering configs with new tile sizes.
for (auto op : computeOps) {
int numLoops = cast<TilingInterface>(op).getLoopIteratorTypes().size();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1958,3 +1958,28 @@ func.func @test_tiling_cpu_default(%arg0: tensor<256x256xi8>, %arg1: tensor<256x
// CHECK: func @test_tiling_cpu_default(
// CHECK-SAME: translation_info = #[[TRANSLATION_INFO]]
// CHECK: linalg.quantized_matmul {lowering_config = #[[CONFIG0]]}

// -----

// Lowering-config test input: an element-wise XOR (1-D, parallel) over two
// tensor<8xi1> operands, compiled for the embedded-elf-x86_64 llvm-cpu target
// with +avx512f.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
func.func @i1_type() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<8xi1>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<8xi1>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<8xi1>>
%3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xi1>> -> tensor<8xi1>
%4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [8], strides = [1] : !flow.dispatch.tensor<readonly:tensor<8xi1>> -> tensor<8xi1>
%5 = tensor.empty() : tensor<8xi1>
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%3, %4 : tensor<8xi1>, tensor<8xi1>) outs(%5 : tensor<8xi1>) {
^bb0(%in: i1, %in_0: i1, %out: i1):
%7 = arith.xori %in, %in_0 : i1
linalg.yield %7 : i1
} -> tensor<8xi1>
flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [8], strides = [1] : tensor<8xi1> -> !flow.dispatch.tensor<writeonly:tensor<8xi1>>
return
}

// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[8], [8], [0], [0]]>
// CHECK: func @i1_type()
// CHECK: linalg.generic {
// CHECK-SAME: {lowering_config = #[[CONFIG]]}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ iree_lit_test_suite(
"encode_device_tensors_packing.mlir",
"encode_host_tensors.mlir",
"encode_host_tensors_packing.mlir",
"encode_host_tensors_packing_i1.mlir",
"fold_globals.mlir",
"fold_uniform_operands.mlir",
"fuse_dispatch_bindings.mlir",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ iree_lit_test_suite(
"encode_device_tensors_packing.mlir"
"encode_host_tensors.mlir"
"encode_host_tensors_packing.mlir"
"encode_host_tensors_packing_i1.mlir"
"fold_globals.mlir"
"fold_uniform_operands.mlir"
"fuse_dispatch_bindings.mlir"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ util.func public @denseTensorSizeOfDynamic(%arg0: index) -> index {
// CHECK-DAG: %[[C5:.+]] = arith.constant 5 : index
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK: %[[MUL:.+]] = arith.muli %arg0, %[[C5]] : index
// CHECK: %[[DIV:.+]] = arith.divui %[[MUL]], %[[C2]] : index
// CHECK: %[[DIV:.+]] = arith.ceildivui %[[MUL]], %[[C2]] : index
%0 = stream.tensor.sizeof tensor<?x5xi4>{%arg0} : index
// CHECK: util.return %[[DIV]]
util.return %0 : index
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// RUN: iree-opt --split-input-file --iree-stream-encode-host-tensors --iree-experimental-packed-i1-storage %s | FileCheck %s

// 12 x i1 is not a whole number of bytes; with packed i1 storage the size is
// expected to round up to 2 bytes.
func.func @unaligned_i1_size() -> index {
%0 = stream.tensor.sizeof tensor<12xi1> : index
return %0 : index
}
// CHECK: func @unaligned_i1_size() -> index {
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK: return %[[C2]] : index

// -----

// 24 x i1 is exactly 3 bytes when packed, so no padding bits are expected.
func.func @aligned_i1_size() -> index {
%0 = stream.tensor.sizeof tensor<24xi1> : index
return %0 : index
}

// CHECK: func @aligned_i1_size() -> index {
// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
// CHECK: return %[[C3]] : index
23 changes: 16 additions & 7 deletions compiler/src/iree/compiler/Utils/ElementPackingUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,17 @@

namespace mlir::iree_compiler {

// Experimental command-line flag `--iree-experimental-packed-i1-storage`
// (off by default). When set, needToPackSubByteElementBitWidth() reports that
// 1-bit elements must be packed.
llvm::cl::opt<bool> clEnableI1Support(
"iree-experimental-packed-i1-storage",
llvm::cl::desc(
"Experimental feature: enable i1 data type support in codegen"),
llvm::cl::init(false));

bool needToPackSubByteElementBitWidth(unsigned bitWidth) {
// Enable i1 support if requested.
if (clEnableI1Support && bitWidth == 1) {
return true;
}
// Require the original bit width to be some power of two for now to avoid
// trickiness and weirdness of packing and cross-byte access.
// Also disallow boolean values for now--they may require separate interface
Expand Down Expand Up @@ -114,15 +124,14 @@ Value calculateStorageElementCountInBytes(Location loc,
if (needToPackSubByteElementBitWidth(elementBits)) {
assert(8 % elementBits == 0);
unsigned byteElements = 8 / elementBits;
// Perform some basic sanity check to make sure the total count is byte
// aligned for fully static shapes.
if (paddedDynamicDims.empty() && (staticCount * elementBits) % 8 != 0) {
return nullptr;
}
auto divisor = builder.create<arith::ConstantIndexOp>(loc, byteElements);
// TODO(antiagainst): We may want to emit runtime check to make sure this is
// divisible.
value = builder.createOrFold<arith::DivUIOp>(loc, value, divisor);
auto divisor = builder.create<arith::ConstantIndexOp>(loc, byteElements);
if (!clEnableI1Support && paddedDynamicDims.empty() &&
(staticCount * elementBits) % 8 != 0) {
return nullptr;
}
value = builder.createOrFold<arith::CeilDivUIOp>(loc, value, divisor);
}

return value;
Expand Down
50 changes: 50 additions & 0 deletions tests/e2e/subbyte_types/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# End-to-end tests of IREE support for sub-byte element types (currently packed
# `i1` storage, enabled via `--iree-experimental-packed-i1-storage`). Tests
# should be written using the IREE Check framework.
# See https://iree.dev/developers/general/testing-guide/#iree-core-end-to-end-e2e-tests.

load("//build_tools/bazel:enforce_glob.bzl", "enforce_glob")
load("//build_tools/bazel:iree_check_test.bzl", "iree_check_single_backend_test_suite")

package(
features = ["layering_check"],
licenses = ["notice"], # Apache 2.0
)

LLVM_SRCS = enforce_glob(
# keep sorted
[
"subbyte_types.mlir",
],
include = ["*.mlir"],
exclude = [],
)

iree_check_single_backend_test_suite(
name = "check_llvm-cpu_subbyte_emulation",
srcs = LLVM_SRCS,
compiler_flags = [
"--iree-llvmcpu-target-cpu=generic",
"--iree-experimental-packed-i1-storage",
],
driver = "local-task",
tags = [
# Sub-byte support for wasm is not a priority.
"nowasm",
],
target_backend = "llvm-cpu",
)

test_suite(
name = "check",
tests = [
":check_llvm-cpu_subbyte_emulation",
],
)
29 changes: 29 additions & 0 deletions tests/e2e/subbyte_types/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
################################################################################
# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from #
# tests/e2e/subbyte_types/BUILD.bazel #
# #
# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary #
# CMake-only content. #
# #
# To disable autogeneration for this file entirely, delete this header. #
################################################################################

iree_add_all_subdirs()

iree_check_single_backend_test_suite(
NAME
check_llvm-cpu_subbyte_emulation
SRCS
"subbyte_types.mlir"
TARGET_BACKEND
"llvm-cpu"
DRIVER
"local-task"
COMPILER_FLAGS
"--iree-llvmcpu-target-cpu=generic"
"--iree-experimental-packed-i1-storage"
LABELS
"nowasm"
)

### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###
28 changes: 28 additions & 0 deletions tests/e2e/subbyte_types/subbyte_types.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// XORs two packed 8 x i1 tensors, bitcast from the bytes 0b01010101 and
// 0b10101010, and expects every bit of the resulting byte to be set (255).
func.func @i1_type() {
  %input1 = util.unfoldable_constant dense<[85]> : tensor<1xi8> // 0b01010101
  %input2 = util.unfoldable_constant dense<[170]> : tensor<1xi8> // 0b10101010
  // Reinterpret each byte as eight i1 elements.
  %lhs = flow.tensor.bitcast %input1 : tensor<1xi8> -> tensor<8xi1>
  %rhs = flow.tensor.bitcast %input2 : tensor<1xi8> -> tensor<8xi1>
  %empty = tensor.empty() : tensor<8xi1>
  %res = linalg.generic
      {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]}
      ins(%lhs, %rhs : tensor<8xi1>, tensor<8xi1>) outs(%empty: tensor<8xi1>) {
  ^bb0(%inlhs: i1, %inrhs: i1, %out: i1):
    %inres = arith.xori %inlhs, %inrhs: i1
    linalg.yield %inres : i1
  } -> tensor<8xi1>
  // Reinterpret the eight result bits as one byte for comparison.
  %tensor_res = flow.tensor.bitcast %res : tensor<8xi1> -> tensor<1xi8>
  check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8>
  return
}

// Slices elements 8..15 (the middle byte) out of a 24 x i1 tensor bitcast from
// the bytes [0, 255, 0] and expects the slice to read back as the byte 255.
func.func @i1_type_slice() {
  %packed_bytes = util.unfoldable_constant dense<[0, 255, 0]> : tensor<3xi8>
  %all_bits = flow.tensor.bitcast %packed_bytes : tensor<3xi8> -> tensor<24xi1>
  %middle_bits = tensor.extract_slice %all_bits[8][8][1] : tensor<24xi1> to tensor<8xi1>
  %middle_byte = flow.tensor.bitcast %middle_bits : tensor<8xi1> -> tensor<1xi8>
  check.expect_eq_const(%middle_byte, dense<[255]> : tensor<1xi8>) : tensor<1xi8>
  return
}

0 comments on commit c80fa3b

Please sign in to comment.