diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp index e5d1b93cbf82..119b3291817b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp @@ -30,10 +30,13 @@ MaterializeEncodingTypeConverter::MaterializeEncodingTypeConverter( addConversion([](FloatType floatType) { return floatType; }); addConversion([](MemRefType memrefType) { return memrefType; }); addConversion([=](RankedTensorType type) -> RankedTensorType { + MaterializeEncodingInfo encodingInfo = getEncodingInfo(type); + if (IREE::Encoding::hasPackedStorageAttr(type)) { + return type; + } // For a given tensor type with an encoding, return the materialized // type to use for it. If no encoding is set, then return the tensor type // itself. - MaterializeEncodingInfo encodingInfo = getEncodingInfo(type); if (IREE::Codegen::isIdentityLayout(encodingInfo)) { return dropEncoding(type); } @@ -92,6 +95,14 @@ MaterializeEncodingTypeConverter::getEncodingInfo(RankedTensorType type) const { } RankedTensorType dropEncoding(RankedTensorType type) { + assert(!IREE::Encoding::hasPackedStorageAttr(type) && + "not expected `packed_storage` attribute."); + return RankedTensorType::get(type.getShape(), type.getElementType()); +} + +RankedTensorType dropPackedStorageEncodingIfAny(RankedTensorType type) { + if (!IREE::Encoding::hasPackedStorageAttr(type)) + return type; return RankedTensorType::get(type.getShape(), type.getElementType()); } diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h index 08a8a5aadbe6..ff07c37ae233 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h @@ -10,6 +10,7 @@ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h" #include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h" +#include "iree/compiler/Dialect/Encoding/IR/EncodingTypes.h" #include "iree/compiler/Dialect/HAL/IR/HALTypes.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Transforms/DialectConversion.h" @@ -77,6 +78,9 @@ class OpMaterializeEncodingPattern : public OpConversionPattern { /// Returns the RankedTensorType without encodings. RankedTensorType dropEncoding(RankedTensorType type); +/// Returns the RankedTensorType without packed storage encoding (if any). +RankedTensorType dropPackedStorageEncodingIfAny(RankedTensorType type); + /// Returns the deserialized MaterializeEncodingInfo if the `layouts` field is /// present in encodings and it only has a single layout. Otherwise, returns /// std::nullopt. 
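The EncodingUtils changes above make `#iree_encoding.packed_storage` a pass-through for the materialization type converter: a tensor type carrying the attribute is returned unchanged, and the new `dropPackedStorageEncodingIfAny` strips only that attribute when a plain tensor type is needed (e.g. for the buffer-view export path touched later in this patch). A minimal MLIR sketch of the intended type behavior, reusing the `#packed` alias from the tests below (the function name is illustrative and not part of the change):

#packed = #iree_encoding.packed_storage

// tensor<8xi1, #packed> survives encoding materialization as-is;
// dropPackedStorageEncodingIfAny maps it to a plain tensor<8xi1>.
func.func @packed_i1_bitcast_roundtrip(%arg0: tensor<1xi8>) -> tensor<1xi8> {
  // One i8 byte reinterpreted as eight packed i1 elements (same storage size).
  %bits = flow.tensor.bitcast %arg0 : tensor<1xi8> -> tensor<8xi1, #packed>
  // Bitcast back; packed storage is never widened to one byte per element.
  %bytes = flow.tensor.bitcast %bits : tensor<8xi1, #packed> -> tensor<1xi8>
  return %bytes : tensor<1xi8>
}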
diff --git a/compiler/src/iree/compiler/Codegen/Common/TypePropagationPass.cpp b/compiler/src/iree/compiler/Codegen/Common/TypePropagationPass.cpp index bd182bdd77fd..676a7c3ae005 100644 --- a/compiler/src/iree/compiler/Codegen/Common/TypePropagationPass.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/TypePropagationPass.cpp @@ -25,6 +25,7 @@ //===---------------------------------------------------------------------===// #include "iree/compiler/Codegen/Common/Passes.h" +#include "iree/compiler/Dialect/Encoding/IR/EncodingTypes.h" #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h" #include "iree/compiler/Dialect/Util/IR/UtilTypes.h" @@ -65,9 +66,8 @@ static Value convertElementType(OpBuilder &b, Location loc, Type targetType, /// std::nullopt. static std::optional getLegalizedType(Type t) { if (auto shapedType = llvm::dyn_cast(t)) { - Type elementType = shapedType.getElementType(); std::optional legalizedElementType = - legalizeStorageElementType(elementType); + legalizeTensorStorageElementType(shapedType); if (!legalizedElementType) return std::nullopt; return RankedTensorType::get(shapedType.getShape(), @@ -121,7 +121,7 @@ struct ConstantOpTypeConversion constantOp, "expected attribute type to be shaped type"); } std::optional legalizedElementType = - legalizeStorageElementType(attrType.getElementType()); + legalizeTensorStorageElementType(attrType); if (!legalizedElementType) { return rewriter.notifyMatchFailure(constantOp, "cannot legalize elementType"); @@ -227,8 +227,10 @@ struct GenericOpTypePropagation signatureConverter.addInputs(index, argType); continue; } + auto inputOperandType = + llvm::cast(genericOp->getOperandTypes()[index]); std::optional legalizedArgType = - legalizeStorageElementType(argType); + legalizeTensorStorageElementType(inputOperandType); if (!legalizedArgType) { return genericOp.emitOpError("failed to get legalized type for arg ") << index; @@ -258,8 +260,8 @@ struct GenericOpTypePropagation modifyYield = true; OpOperand *yieldOperand = modifiedOp.getMatchingYieldValue(modifiedOpOperand); - std::optional legalizedType = - legalizeStorageElementType(yieldOperand->get().getType()); + std::optional legalizedType = legalizeTensorStorageElementType( + modifiedOpOperand->get().getType()); if (!legalizedType) { return genericOp.emitOpError( "failed to get legalized type for yield value"); @@ -289,7 +291,7 @@ struct LinalgFillTypePropagation ConversionPatternRewriter &rewriter) const final { Value value = adaptor.getInputs().front(); std::optional legalizedElementType = - legalizeStorageElementType(value.getType()); + legalizeTensorStorageElementType(adaptor.getOutputs()[0].getType()); if (!legalizedElementType) { return fillOp.emitOpError("failed to get legalized type for value"); } @@ -355,8 +357,8 @@ struct IREELinalgExtScatterTypePropagation // type. 
TypeConverter::SignatureConversion signatureConverter( modifiedOpRegion.getNumArguments()); - Type argType = modifiedOpRegion.getArguments()[0].getType(); - std::optional legalizedArgType = legalizeStorageElementType(argType); + std::optional legalizedArgType = + legalizeTensorStorageElementType(inputType); if (!legalizedArgType) { return scatterOp.emitOpError("failed to get legalized type for argument"); } @@ -418,8 +420,12 @@ struct IREELinalgExtSortTypePropagation TypeConverter::SignatureConversion signatureConverter( modifiedOpRegion.getNumArguments()); for (auto [index, arg] : llvm::enumerate(modifiedOpRegion.getArguments())) { + // Refer to input types of the original operation to determine the + // corresponding legal arg type. + auto convertType = index % 2 == 0 ? sortOp->getOperandTypes()[index / 2] + : sortOp->getResultTypes()[index / 2]; std::optional legalizedArgType = - legalizeStorageElementType(arg.getType()); + legalizeTensorStorageElementType(convertType); if (!legalizedArgType) { return sortOp.emitOpError("failed to get legalized type for argument"); } diff --git a/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingAttrs.cpp b/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingAttrs.cpp index be1462cd30a4..e37589bf96ad 100644 --- a/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingAttrs.cpp +++ b/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingAttrs.cpp @@ -281,8 +281,12 @@ EncodingAttr getEncodingAttr(RankedTensorType type) { return dyn_cast_or_null(type.getEncoding()); } -bool hasPackedStorageAttr(RankedTensorType type) { - return dyn_cast_or_null(type.getEncoding()) != nullptr; +bool hasPackedStorageAttr(Type type) { + if (auto tensorType = dyn_cast(type)) { + return dyn_cast_or_null(tensorType.getEncoding()) != + nullptr; + } + return false; } FailureOr diff --git a/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingTypes.h b/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingTypes.h index 6edc666110d0..96812d63afe2 100644 --- a/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingTypes.h +++ b/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingTypes.h @@ -43,7 +43,7 @@ namespace mlir::iree_compiler::IREE::Encoding { EncodingAttr getEncodingAttr(RankedTensorType type); /// Returns true if the type contains packed_storage attribute. -bool hasPackedStorageAttr(RankedTensorType type); +bool hasPackedStorageAttr(Type type); /// Returns the ContractionDimensions for the encoding user_indexing_maps. 
FailureOr diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/BUILD.bazel b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/BUILD.bazel index 5c64fa1ce8a2..bb8606fec0af 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/BUILD.bazel +++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/BUILD.bazel @@ -22,6 +22,7 @@ iree_compiler_cc_library( ], deps = [ ":Utils", + "//compiler/src/iree/compiler/Codegen/Common", "//compiler/src/iree/compiler/Dialect/HAL/Analysis", "//compiler/src/iree/compiler/Dialect/HAL/Conversion", "//compiler/src/iree/compiler/Dialect/HAL/IR", diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/CMakeLists.txt index 92dc1b9dba64..de1892ce0b4b 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/CMakeLists.txt +++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/CMakeLists.txt @@ -28,6 +28,7 @@ iree_cc_library( MLIRSCFDialect MLIRTransformUtils MLIRTransforms + iree::compiler::Codegen::Common iree::compiler::Dialect::HAL::Analysis iree::compiler::Dialect::HAL::Conversion iree::compiler::Dialect::HAL::IR diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp index 748483349f98..e3da3683d4c6 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp +++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp @@ -6,6 +6,7 @@ #include "iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.h" +#include "iree/compiler/Codegen/Common/EncodingUtils.h" #include "iree/compiler/Dialect/HAL/Analysis/Captures.h" #include "iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Utils.h" #include "iree/compiler/Dialect/HAL/IR/HALDialect.h" @@ -478,7 +479,8 @@ struct TensorExportBufferViewOpPattern } auto loc = exportOp.getLoc(); - auto tensorType = llvm::cast(adaptor.getSourceEncoding()); + auto tensorType = dropPackedStorageEncodingIfAny( + llvm::cast(adaptor.getSourceEncoding())); auto dynamicDims = adaptor.getSourceEncodingDims(); // NOTE: we should have verified supported encodings/types at entry into the diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/IR/BUILD.bazel index 2fa22edf5eb6..ef727de312b6 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/BUILD.bazel +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/BUILD.bazel @@ -64,7 +64,7 @@ iree_compiler_cc_library( ":StreamInterfacesGen", ":StreamOpsGen", ":StreamTypesGen", - "//compiler/src/iree/compiler/Dialect/Util/IR", + "//compiler/src/iree/compiler/Dialect/Encoding/IR", "//compiler/src/iree/compiler/Utils", "@llvm-project//llvm:Support", "@llvm-project//mlir:ArithDialect", diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Stream/IR/CMakeLists.txt index 2f10910741ae..79e4e2c46539 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/CMakeLists.txt +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/CMakeLists.txt @@ -54,7 +54,7 @@ iree_cc_library( MLIRTensorDialect MLIRTransformUtils MLIRViewLikeInterface - iree::compiler::Dialect::Util::IR + iree::compiler::Dialect::Encoding::IR iree::compiler::Utils PUBLIC ) diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp 
b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp index 13988a999b2f..46f88edb7def 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp @@ -6,6 +6,7 @@ #include "iree/compiler/Dialect/Stream/IR/StreamOps.h" +#include "iree/compiler/Dialect/Encoding/IR/EncodingTypes.h" #include "iree/compiler/Dialect/Util/IR/ClosureOpUtils.h" #include "iree/compiler/Dialect/Util/IR/UtilOps.h" #include "iree/compiler/Dialect/Util/IR/UtilTypes.h" @@ -27,6 +28,10 @@ #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/RegionUtils.h" +namespace mlir::iree_compiler { +using IREE::Encoding::getEncodingAttr; +} + namespace mlir::iree_compiler::IREE::Stream { //===----------------------------------------------------------------------===// @@ -1903,7 +1908,7 @@ LogicalResult TensorCloneOp::verify() { // information. auto sourceEncoding = llvm::cast(op.getSourceEncoding()); auto resultEncoding = llvm::cast(op.getResultEncoding()); - if (sourceEncoding.getEncoding() != resultEncoding.getEncoding()) { + if (getEncodingAttr(sourceEncoding) != getEncodingAttr(resultEncoding)) { return op.emitOpError() << "clones changing tensor encoding from " << sourceEncoding.getEncoding() << " to " << resultEncoding.getEncoding() << "; not allowed"; diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ConvertToStream.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ConvertToStream.cpp index 501cbb83fbbb..4676fb4fc307 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ConvertToStream.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ConvertToStream.cpp @@ -4,6 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "iree/compiler/Dialect/Encoding/IR/EncodingTypes.h" #include "iree/compiler/Dialect/Flow/IR/FlowDialect.h" #include "iree/compiler/Dialect/Flow/IR/FlowTypes.h" #include "iree/compiler/Dialect/Stream/Analysis/Affinity.h" @@ -22,6 +23,7 @@ #include "iree/compiler/Dialect/Util/IR/UtilTypes.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "iree/compiler/Dialect/Util/Transforms/Patterns.h" +#include "llvm/Support/Casting.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" @@ -247,6 +249,12 @@ struct ConvertToStreamPass final if (llvm::isa(type)) { return IREE::Stream::ChannelType::get(context); } + if (auto rankedType = llvm::dyn_cast_or_null(type)) { + if (IREE::Encoding::hasPackedStorageAttr(rankedType)) { + return RankedTensorType::get(rankedType.getShape(), + rankedType.getElementType()); + } + } return !llvm::isa(type) ? type : Type{}; }); diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp index c3753aab0dfe..ec0d24e74eed 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp @@ -58,7 +58,7 @@ static LogicalResult checkEncoding(Operation *op, RankedTensorType encodingType, // Aligns the element type of a tensor<> to a byte-aligned power of 2 bit width. 
static RankedTensorType alignTensorType(RankedTensorType originalType) { Type elementType = originalType.getElementType(); - Type alignedType = legalizeStorageElementType(elementType); + Type alignedType = legalizeTensorStorageElementType(originalType); if (alignedType == elementType) return originalType; return RankedTensorType::get(originalType.getShape(), alignedType, @@ -168,7 +168,9 @@ static Value canonicalizeFillPattern(Value pattern, OpBuilder &builder) { // %i8_val = (%i8_val << 2) | %i2_val // %i8_val = (%i8_val << 2) | %i2_val // %i8_val = (%i8_val << 2) | %i2_val - if (needToPackSubByteElementBitWidth(elementBitWidth)) { + bool patternIsPacked = + IREE::Encoding::hasPackedStorageAttr(pattern.getType()); + if (!patternIsPacked && needToPackSubByteElementBitWidth(elementBitWidth)) { Type i8Type = builder.getI8Type(); Value bitwidth = builder.createOrFold( loc, i8Type, builder.getIntegerAttr(i8Type, elementBitWidth)); @@ -655,7 +657,8 @@ struct EncodeHostTensorsPass static IREE::Flow::DispatchTensorType alignDispatchTensorType(IREE::Flow::DispatchTensorType originalType) { Type elementType = originalType.getBoundElementType(); - Type alignedType = legalizeStorageElementType(elementType); + Type alignedType = + legalizeTensorStorageElementType(originalType.asRankedTensorType()); if (alignedType == elementType) return originalType; return IREE::Flow::DispatchTensorType::get( diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel index 138ba0be6689..3e1dacdb1eda 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel @@ -30,7 +30,6 @@ iree_lit_test_suite( "encode_host_tensors.mlir", "encode_host_tensors_encoding.mlir", "encode_host_tensors_packing.mlir", - "encode_host_tensors_packing_i1_experimental_clopt.mlir", "fold_globals.mlir", "fold_uniform_operands.mlir", "fuse_dispatch_bindings.mlir", diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt index 4c4cb93d80ef..75aaf654b8b8 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt @@ -28,7 +28,6 @@ iree_lit_test_suite( "encode_host_tensors.mlir" "encode_host_tensors_encoding.mlir" "encode_host_tensors_packing.mlir" - "encode_host_tensors_packing_i1_experimental_clopt.mlir" "fold_globals.mlir" "fold_uniform_operands.mlir" "fuse_dispatch_bindings.mlir" diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1_experimental_clopt.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1_experimental_clopt.mlir deleted file mode 100644 index c96e05270d12..000000000000 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1_experimental_clopt.mlir +++ /dev/null @@ -1,23 +0,0 @@ -// This is only used to test the experimental packing flag. When the default -// is changed the encode_host_tensors.mlir test should be updated and used -// instead and this file should be deleted. 
- -// RUN: iree-opt --split-input-file --iree-stream-encode-host-tensors --iree-experimental-packed-i1-storage %s | FileCheck %s - -// CHECK-LABEL: @tensorSizeOfUnalignedPackedI1 -util.func @tensorSizeOfUnalignedPackedI1() -> index { - // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index - %0 = stream.tensor.sizeof tensor<12xi1> : index - // CHECK: return %[[C2]] : index - util.return %0 : index -} - -// ----- - -// CHECK-LABEL: @tensorSizeOfAlignedPackedI1 -util.func @tensorSizeOfAlignedPackedI1() -> index { - // CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index - %0 = stream.tensor.sizeof tensor<24xi1> : index - // CHECK: util.return %[[C3]] : index - util.return %0 : index -} diff --git a/compiler/src/iree/compiler/Utils/ElementPackingUtils.cpp b/compiler/src/iree/compiler/Utils/ElementPackingUtils.cpp index d7b3258d5e45..f902874582cd 100644 --- a/compiler/src/iree/compiler/Utils/ElementPackingUtils.cpp +++ b/compiler/src/iree/compiler/Utils/ElementPackingUtils.cpp @@ -15,27 +15,9 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/IR/BuiltinTypes.h" -// TODO(lialan): remove cl options once frontend can emit packed i1 tensors. -llvm::cl::opt clEnableI1Support( - "iree-experimental-packed-i1-storage", - llvm::cl::desc( - "Experimental feature: force to use packed storage for i1 tensors." - "Turning on this option will see i1 tensors as if it has " - "#iree_encoding.packed_storage attribute." - "This is to allow an alternative way to test the packed storage " - "feature before frontend can emit packed i1 tensors." - "This option can be dropped once the frontend can emit packed i1 " - "tensors."), - llvm::cl::init(false)); - namespace mlir::iree_compiler { -static bool needToPackSubByteElementBitWidthImpl(unsigned bitWidth, - bool isPackedStorage) { - // Enable i1 support if requested. - if (isPackedStorage && bitWidth == 1) { - return true; - } +bool needToPackSubByteElementBitWidth(unsigned bitWidth) { // Require the original bit width to be some power of two for now to avoid // trickiness and weirdness of packing and cross-byte access. // Also disallow boolean values for now--they may require separate interface @@ -43,19 +25,13 @@ static bool needToPackSubByteElementBitWidthImpl(unsigned bitWidth, return bitWidth < 8 && llvm::isPowerOf2_32(bitWidth) && bitWidth != 1; } -bool needToPackSubByteElementBitWidth(unsigned bitWidth) { - return needToPackSubByteElementBitWidthImpl( - bitWidth, /*isPackedStorage=*/clEnableI1Support); -} - bool needToPackSubByteElements(RankedTensorType shapedType) { unsigned bitWidth = IREE::Util::getTypeBitWidth(shapedType.getElementType()); - // Two paths to enable packed storage for i1 tensors: the attribute or cl - // option. The cl option will be dropped once frontend supports emitting - // tensors with attributes. - bool isPackedStorage = - IREE::Encoding::hasPackedStorageAttr(shapedType) || clEnableI1Support; - return needToPackSubByteElementBitWidthImpl(bitWidth, isPackedStorage); + // i1 with packed memory layout does not need to be extended. + if (bitWidth == 1 && IREE::Encoding::hasPackedStorageAttr(shapedType)) { + return true; + } + return needToPackSubByteElementBitWidth(bitWidth); } static Type legalizeStorageElementTypeImpl(Type elementType, @@ -65,9 +41,13 @@ static Type legalizeStorageElementTypeImpl(Type elementType, if (!intType) return elementType; - // For sub-byte elements, default to pack them into bytes. 
unsigned bitWidth = intType.getWidth(); - if (needToPackSubByteElementBitWidthImpl(bitWidth, isPackedStorage)) + if (bitWidth == 1 && isPackedStorage) { + return elementType; + } + + // For sub-byte elements, default to pack them into bytes. + if (needToPackSubByteElementBitWidth(bitWidth)) return elementType; // Otherwise, extend them to the next power-of-two bit width. @@ -79,10 +59,10 @@ static Type legalizeStorageElementTypeImpl(Type elementType, intType.getSignedness()); } -Type legalizeStorageElementType(Type elementType) { - // Consider packed storage for i1 tensors if cl opt is set. - return legalizeStorageElementTypeImpl(elementType, - /*isPackedStorage=*/clEnableI1Support); +Type legalizeTensorStorageElementType(Type type) { + auto tensorType = llvm::cast(type); + return legalizeStorageElementTypeImpl( + tensorType.getElementType(), IREE::Encoding::hasPackedStorageAttr(type)); } Value calculateStorageElementCountInBytes(Location loc, @@ -96,15 +76,16 @@ Value calculateStorageElementCountInBytes(Location loc, loc, builder, shapedType, dynamicDims); } - bool isPackedStorage = - IREE::Encoding::hasPackedStorageAttr(shapedType) || clEnableI1Support; - Type alignedElementType = legalizeStorageElementTypeImpl( - shapedType.getElementType(), isPackedStorage); + Type alignedElementType = legalizeTensorStorageElementType(shapedType); unsigned elementBits = IREE::Util::getTypeBitWidth(alignedElementType); + bool isPackedStorage = IREE::Encoding::hasPackedStorageAttr(shapedType); + bool isI1WithPackedStorage = elementBits == 1 && isPackedStorage; + // Calculate all static dims first, if any. int64_t staticCount = 1; - if (!needToPackSubByteElementBitWidthImpl(elementBits, isPackedStorage)) { + if (!isI1WithPackedStorage && + !needToPackSubByteElementBitWidth(elementBits)) { staticCount *= IREE::Util::getRoundedElementByteWidth(alignedElementType); } @@ -119,7 +100,7 @@ Value calculateStorageElementCountInBytes(Location loc, value = builder.createOrFold(loc, value, dim); } // Sub-byte packing requires putting multiple elements in the same byte. - if (needToPackSubByteElementBitWidthImpl(elementBits, isPackedStorage)) { + if (isI1WithPackedStorage || needToPackSubByteElementBitWidth(elementBits)) { assert(8 % elementBits == 0); unsigned byteElements = 8 / elementBits; // TODO(antiagainst): We may want to emit runtime check to make sure this is @@ -139,14 +120,14 @@ Value calculateStorageElementOffsetInBytes(Location loc, RankedTensorType originalType, Value linearizedIndex, OpBuilder &builder) { - bool isPackedStorage = - IREE::Encoding::hasPackedStorageAttr(originalType) || clEnableI1Support; - Type alignedElementType = legalizeStorageElementTypeImpl( - originalType.getElementType(), isPackedStorage); + Type alignedElementType = legalizeTensorStorageElementType(originalType); unsigned elementBits = IREE::Util::getTypeBitWidth(alignedElementType); + bool isPackedStorage = IREE::Encoding::hasPackedStorageAttr(originalType); + bool isI1WithPackedStorage = elementBits == 1 && isPackedStorage; + // Sub-byte packing requires putting multiple elements in the same byte. 
- if (needToPackSubByteElementBitWidthImpl(elementBits, isPackedStorage)) { + if (isI1WithPackedStorage || needToPackSubByteElementBitWidth(elementBits)) { Value byteElements = builder.create(loc, 8 / elementBits); // TODO(antiagainst): We may want to emit runtime check to make sure this is diff --git a/compiler/src/iree/compiler/Utils/ElementPackingUtils.h b/compiler/src/iree/compiler/Utils/ElementPackingUtils.h index 9de6ea70c26a..38fff78df2b8 100644 --- a/compiler/src/iree/compiler/Utils/ElementPackingUtils.h +++ b/compiler/src/iree/compiler/Utils/ElementPackingUtils.h @@ -30,6 +30,8 @@ bool needToPackSubByteElements(RankedTensorType shapedType); /// cases. Type legalizeStorageElementType(Type elementType); +Type legalizeTensorStorageElementType(Type tensorType); + /// Emits IR with the given |builder| to calculate the total number of bytes /// required for the given |shapedType| in storage. Returns the value for the /// final count on success; returns nullptr on failure. Dynamic dimensions in diff --git a/tests/e2e/linalg_ext_ops/BUILD.bazel b/tests/e2e/linalg_ext_ops/BUILD.bazel index acc14d1c61f9..2c81be1a66c2 100644 --- a/tests/e2e/linalg_ext_ops/BUILD.bazel +++ b/tests/e2e/linalg_ext_ops/BUILD.bazel @@ -42,24 +42,6 @@ iree_check_single_backend_test_suite( target_backend = "llvm-cpu", ) -iree_check_single_backend_test_suite( - name = "check_llvm-cpu_local-task_i1", - srcs = [ - "attention_i1_mask.mlir", - ], - compiler_flags = [ - "--iree-llvmcpu-target-cpu=generic", - "--iree-experimental-packed-i1-storage", - ], - driver = "local-task", - tags = [ - # attention fails with a wasm target, just disable the tests there for now - # error: Yield operand #2 is not equivalent to the corresponding iter bbArg - "nowasm", - ], - target_backend = "llvm-cpu", -) - VMVX_SRCS = enforce_glob( # keep sorted [ diff --git a/tests/e2e/linalg_ext_ops/CMakeLists.txt b/tests/e2e/linalg_ext_ops/CMakeLists.txt index 37377670dd20..5bc968c6e9b7 100644 --- a/tests/e2e/linalg_ext_ops/CMakeLists.txt +++ b/tests/e2e/linalg_ext_ops/CMakeLists.txt @@ -31,22 +31,6 @@ iree_check_single_backend_test_suite( "nowasm" ) -iree_check_single_backend_test_suite( - NAME - check_llvm-cpu_local-task_i1 - SRCS - "attention_i1_mask.mlir" - TARGET_BACKEND - "llvm-cpu" - DRIVER - "local-task" - COMPILER_FLAGS - "--iree-llvmcpu-target-cpu=generic" - "--iree-experimental-packed-i1-storage" - LABELS - "nowasm" -) - iree_check_single_backend_test_suite( NAME check_vmvx_local-task diff --git a/tests/e2e/linalg_ext_ops/attention_i1_mask.mlir b/tests/e2e/linalg_ext_ops/attention_i1_mask.mlir index e4a4631c1cd0..07d1852e6f56 100644 --- a/tests/e2e/linalg_ext_ops/attention_i1_mask.mlir +++ b/tests/e2e/linalg_ext_ops/attention_i1_mask.mlir @@ -1,3 +1,4 @@ +#packed = #iree_encoding.packed_storage func.func @attention1x4x4_i1_mask() { %init = tensor.empty() : tensor<1x4x4xf32> %query = util.unfoldable_constant dense<[[[0.1, 0.2, 0.3, 0.4], @@ -15,7 +16,7 @@ func.func @attention1x4x4_i1_mask() { [1.3, 1.4, 1.5, 1.6]]]> : tensor<1x4x4xf32> %i8mask = util.unfoldable_constant dense<[165, 165]> : tensor<2xi8> - %mask = flow.tensor.bitcast %i8mask : tensor<2xi8> -> tensor<1x4x4xi1> + %mask = flow.tensor.bitcast %i8mask : tensor<2xi8> -> tensor<1x4x4xi1, #packed> %scale = arith.constant 0.5 : f32 %1 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, @@ -25,7 +26,7 @@ func.func @attention1x4x4_i1_mask() { affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} 
ins(%query, %key, %value, %scale, %mask : tensor<1x4x4xf32>, - tensor<1x4x4xf32>, tensor<1x4x4xf32>, f32, tensor<1x4x4xi1>) outs(%init : tensor<1x4x4xf32>) { + tensor<1x4x4xf32>, tensor<1x4x4xf32>, f32, tensor<1x4x4xi1, #packed>) outs(%init : tensor<1x4x4xf32>) { ^bb0(%arg0: f32): iree_linalg_ext.yield %arg0 : f32 } -> tensor<1x4x4xf32> @@ -56,7 +57,7 @@ func.func @attention1x4x4_i1_mask_all_ones() { [1.3, 1.4, 1.5, 1.6]]]> : tensor<1x4x4xf32> %i8mask = util.unfoldable_constant dense<[255, 255]> : tensor<2xi8> - %mask = flow.tensor.bitcast %i8mask : tensor<2xi8> -> tensor<1x4x4xi1> + %mask = flow.tensor.bitcast %i8mask : tensor<2xi8> -> tensor<1x4x4xi1, #packed> %scale = arith.constant 0.5 : f32 %1 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, @@ -66,7 +67,7 @@ func.func @attention1x4x4_i1_mask_all_ones() { affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%query, %key, %value, %scale, %mask : tensor<1x4x4xf32>, - tensor<1x4x4xf32>, tensor<1x4x4xf32>, f32, tensor<1x4x4xi1>) outs(%init : tensor<1x4x4xf32>) { + tensor<1x4x4xf32>, tensor<1x4x4xf32>, f32, tensor<1x4x4xi1, #packed>) outs(%init : tensor<1x4x4xf32>) { ^bb0(%arg0: f32): iree_linalg_ext.yield %arg0 : f32 } -> tensor<1x4x4xf32> @@ -97,7 +98,7 @@ func.func @attention1x4x4_i1_mask_tril() { [1.3, 1.4, 1.5, 1.6]]]> : tensor<1x4x4xf32> %i8mask = util.unfoldable_constant dense<[140, 239]> : tensor<2xi8> - %mask = flow.tensor.bitcast %i8mask : tensor<2xi8> -> tensor<1x4x4xi1> + %mask = flow.tensor.bitcast %i8mask : tensor<2xi8> -> tensor<1x4x4xi1, #packed> %scale = arith.constant 0.5 : f32 %1 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, @@ -107,7 +108,7 @@ func.func @attention1x4x4_i1_mask_tril() { affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%query, %key, %value, %scale, %mask : tensor<1x4x4xf32>, - tensor<1x4x4xf32>, tensor<1x4x4xf32>, f32, tensor<1x4x4xi1>) outs(%init : tensor<1x4x4xf32>) { + tensor<1x4x4xf32>, tensor<1x4x4xf32>, f32, tensor<1x4x4xi1, #packed>) outs(%init : tensor<1x4x4xf32>) { ^bb0(%arg0: f32): iree_linalg_ext.yield %arg0 : f32 } -> tensor<1x4x4xf32> diff --git a/tests/e2e/subbyte_types/BUILD.bazel b/tests/e2e/subbyte_types/BUILD.bazel index ff1b1f3ea643..adee37a39294 100644 --- a/tests/e2e/subbyte_types/BUILD.bazel +++ b/tests/e2e/subbyte_types/BUILD.bazel @@ -19,16 +19,15 @@ package( ) iree_check_single_backend_test_suite( - name = "check_llvm-cpu_subbyte_emulation", + name = "check_llvm-cpu_subbyte_emulation_attr", srcs = enforce_glob( [ - "subbyte_types.mlir", + "subbyte_types_attr.mlir", ], include = ["*.mlir"], ), compiler_flags = [ "--iree-llvmcpu-target-cpu=generic", - "--iree-experimental-packed-i1-storage", ], driver = "local-task", tags = [ @@ -41,6 +40,6 @@ iree_check_single_backend_test_suite( test_suite( name = "check", tests = [ - ":check_llvm-cpu_subbyte_emulation", + ":check_llvm-cpu_subbyte_emulation_attr", ], ) diff --git a/tests/e2e/subbyte_types/CMakeLists.txt b/tests/e2e/subbyte_types/CMakeLists.txt index 8077be696478..27bfdd20337f 100644 --- a/tests/e2e/subbyte_types/CMakeLists.txt +++ b/tests/e2e/subbyte_types/CMakeLists.txt @@ -12,16 +12,15 @@ iree_add_all_subdirs() iree_check_single_backend_test_suite( NAME - check_llvm-cpu_subbyte_emulation + check_llvm-cpu_subbyte_emulation_attr SRCS - "subbyte_types.mlir" + "subbyte_types_attr.mlir" TARGET_BACKEND "llvm-cpu" DRIVER "local-task" 
COMPILER_FLAGS "--iree-llvmcpu-target-cpu=generic" - "--iree-experimental-packed-i1-storage" LABELS "nowasm" ) diff --git a/tests/e2e/subbyte_types/subbyte_types.mlir b/tests/e2e/subbyte_types/subbyte_types.mlir deleted file mode 100644 index a1fa90bf4446..000000000000 --- a/tests/e2e/subbyte_types/subbyte_types.mlir +++ /dev/null @@ -1,99 +0,0 @@ -func.func @i1_type() { - %c0 = arith.constant 0 : index - %c255 = arith.constant 255 : i8 - %input1 = util.unfoldable_constant dense<[85]> : tensor<1xi8> // b01010101 - %input2 = util.unfoldable_constant dense<[170]> : tensor<1xi8> // b10101010 - %lhs = flow.tensor.bitcast %input1 : tensor<1xi8> -> tensor<8xi1> - %rhs = flow.tensor.bitcast %input2 : tensor<1xi8> -> tensor<8xi1> - %empty = tensor.empty() : tensor<8xi1> - %res = linalg.generic - {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} - ins(%lhs, %rhs : tensor<8xi1>, tensor<8xi1>) outs(%empty: tensor<8xi1>) { - ^bb0(%inlhs: i1, %inrhs: i1, %out: i1): - %inres = arith.xori %inlhs, %inrhs: i1 - linalg.yield %inres : i1 - } -> tensor<8xi1> - %tensor_res = flow.tensor.bitcast %res : tensor<8xi1> -> tensor<1xi8> - check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8> - return -} - -func.func @i1_type_slice() { - %input = util.unfoldable_constant dense<[0, 255, 0]> : tensor<3xi8> - %flat_input_all = flow.tensor.bitcast %input : tensor<3xi8> -> tensor<24xi1> - %slice = tensor.extract_slice %flat_input_all[8][8][1] : tensor<24xi1> to tensor<8xi1> - %tensor_res = flow.tensor.bitcast %slice : tensor<8xi1> -> tensor<1xi8> - check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8> - return -} - -func.func @i1_representation() { - %mask = util.unfoldable_constant dense<[140]> : tensor<1xi8> - %casted = flow.tensor.bitcast %mask : tensor<1xi8> -> tensor<2x4xi1> - %bar = util.optimization_barrier %casted : tensor<2x4xi1> - %tensor_res = flow.tensor.bitcast %bar : tensor<2x4xi1> -> tensor<1xi8> - check.expect_eq_const(%tensor_res, dense<[140]> : tensor<1xi8>) : tensor<1xi8> - return -} - -func.func @i1_representation_2() { - %mask = util.unfoldable_constant dense<[140, 77]> : tensor<2xi8> - %casted = flow.tensor.bitcast %mask : tensor<2xi8> -> tensor<2x8xi1> - %bar = util.optimization_barrier %casted : tensor<2x8xi1> - %tensor_res = flow.tensor.bitcast %bar : tensor<2x8xi1> -> tensor<2xi8> - check.expect_eq_const(%tensor_res, dense<[140, 77]> : tensor<2xi8>) : tensor<2xi8> - return -} - -func.func @i1_representation_3() { - %mask = util.unfoldable_constant dense<[140, 77]> : tensor<2xi8> - %casted = flow.tensor.bitcast %mask : tensor<2xi8> -> tensor<4x4xi1> - %bar = util.optimization_barrier %casted : tensor<4x4xi1> - %tensor_res = flow.tensor.bitcast %bar : tensor<4x4xi1> -> tensor<2xi8> - check.expect_eq_const(%tensor_res, dense<[140, 77]> : tensor<2xi8>) : tensor<2xi8> - return -} - -func.func @truncate_i1() { - %mask = util.unfoldable_constant dense<[1, 1, 0, 0, - 0, 0, 1, 1]> : tensor<8xi8> - %nm = tensor.empty() : tensor<8xi1> - %truncm = linalg.generic - {indexing_maps = [ - affine_map<(d0) -> (d0)>, - affine_map<(d0) -> (d0)>], - iterator_types = ["parallel"]} - ins(%mask: tensor<8xi8>) - outs(%nm: tensor<8xi1>) { - ^bb0(%in: i8, %out: i1): - %zero = arith.constant 0 : i8 - %truncated = arith.cmpi "sgt", %in, %zero : i8 - linalg.yield %truncated : i1 - } -> tensor<8xi1> - %tensor_res = flow.tensor.bitcast %truncm : tensor<8xi1> -> tensor<1xi8> - 
check.expect_eq_const(%tensor_res, dense<[195]> : tensor<1xi8>) : tensor<1xi8> - return -} - -func.func @truncate_i1_2() { - %mask = util.unfoldable_constant dense<[[0, 0, 1, 1], - [1, 1, 0, 0], - [1, 1, 0, 0], - [0, 0, 1, 1]]> : tensor<4x4xi8> - %nm = tensor.empty() : tensor<4x4xi1> - %truncm = linalg.generic - {indexing_maps = [ - affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%mask: tensor<4x4xi8>) - outs(%nm: tensor<4x4xi1>) { - ^bb0(%in: i8, %out: i1): - %zero = arith.constant 0 : i8 - %truncated = arith.cmpi "sgt", %in, %zero : i8 - linalg.yield %truncated : i1 - } -> tensor<4x4xi1> - %tensor_res = flow.tensor.bitcast %truncm : tensor<4x4xi1> -> tensor<2xi8> - check.expect_eq_const(%tensor_res, dense<[60, 195]> : tensor<2xi8>) : tensor<2xi8> - return -} diff --git a/tests/e2e/subbyte_types/subbyte_types_attr.mlir b/tests/e2e/subbyte_types/subbyte_types_attr.mlir new file mode 100644 index 000000000000..d6b5fd3cb793 --- /dev/null +++ b/tests/e2e/subbyte_types/subbyte_types_attr.mlir @@ -0,0 +1,56 @@ +#packed = #iree_encoding.packed_storage +func.func @i1_type_slice() { + %input = util.unfoldable_constant dense<[0, 255, 0]> : tensor<3xi8> + %flat_input_all = flow.tensor.bitcast %input : tensor<3xi8> -> tensor<24xi1, #packed> + %slice = tensor.extract_slice %flat_input_all[8][8][1] : tensor<24xi1, #packed> to tensor<8xi1, #packed> + %tensor_res = flow.tensor.bitcast %slice : tensor<8xi1, #packed> -> tensor<1xi8> + check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8> + return +} + +func.func @i1_representation() { + %mask = util.unfoldable_constant dense<[140]> : tensor<1xi8> + %casted = flow.tensor.bitcast %mask : tensor<1xi8> -> tensor<2x4xi1, #packed> + %bar = util.optimization_barrier %casted : tensor<2x4xi1, #packed> + %tensor_res = flow.tensor.bitcast %bar : tensor<2x4xi1, #packed> -> tensor<1xi8> + check.expect_eq_const(%tensor_res, dense<[140]> : tensor<1xi8>) : tensor<1xi8> + return +} + +func.func @i1_representation_2() { + %mask = util.unfoldable_constant dense<[140, 77]> : tensor<2xi8> + %casted = flow.tensor.bitcast %mask : tensor<2xi8> -> tensor<2x8xi1, #packed> + %bar = util.optimization_barrier %casted : tensor<2x8xi1, #packed> + %tensor_res = flow.tensor.bitcast %bar : tensor<2x8xi1, #packed> -> tensor<2xi8> + check.expect_eq_const(%tensor_res, dense<[140, 77]> : tensor<2xi8>) : tensor<2xi8> + return +} + +func.func @i1_representation_3() { + %mask = util.unfoldable_constant dense<[140, 77]> : tensor<2xi8> + %casted = flow.tensor.bitcast %mask : tensor<2xi8> -> tensor<4x4xi1, #packed> + %bar = util.optimization_barrier %casted : tensor<4x4xi1, #packed> + %tensor_res = flow.tensor.bitcast %bar : tensor<4x4xi1, #packed> -> tensor<2xi8> + check.expect_eq_const(%tensor_res, dense<[140, 77]> : tensor<2xi8>) : tensor<2xi8> + return +} + +func.func @i1_type() { + %c0 = arith.constant 0 : index + %c255 = arith.constant 255 : i8 + %input1 = util.unfoldable_constant dense<[85]> : tensor<1xi8> // b01010101 + %input2 = util.unfoldable_constant dense<[170]> : tensor<1xi8> // b10101010 + %lhs = flow.tensor.bitcast %input1 : tensor<1xi8> -> tensor<8xi1, #packed> + %rhs = flow.tensor.bitcast %input2 : tensor<1xi8> -> tensor<8xi1, #packed> + %empty = tensor.empty() : tensor<8xi1, #packed> + %res = linalg.generic + {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} + ins(%lhs, %rhs : tensor<8xi1, #packed>, 
tensor<8xi1, #packed>) outs(%empty: tensor<8xi1, #packed>) { + ^bb0(%inlhs: i1, %inrhs: i1, %out: i1): + %inres = arith.xori %inlhs, %inrhs: i1 + linalg.yield %inres : i1 + } -> tensor<8xi1, #packed> + %tensor_res = flow.tensor.bitcast %res : tensor<8xi1, #packed> -> tensor<1xi8> + check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8> + return +}
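For completeness, the size computation exercised by the deleted `--iree-experimental-packed-i1-storage` test carries over to the attribute form. A hedged sketch of how those checks could be written with `#iree_encoding.packed_storage` (the expected byte counts follow the sub-byte packing math in `calculateStorageElementCountInBytes`; these functions mirror the removed test and are not files added by this patch):

#packed = #iree_encoding.packed_storage

// 12 packed i1 elements round up to ceil(12/8) = 2 bytes.
util.func @tensorSizeOfUnalignedPackedI1() -> index {
  %0 = stream.tensor.sizeof tensor<12xi1, #packed> : index
  util.return %0 : index
}

// 24 packed i1 elements occupy exactly 3 bytes.
util.func @tensorSizeOfAlignedPackedI1() -> index {
  %0 = stream.tensor.sizeof tensor<24xi1, #packed> : index
  util.return %0 : index
}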