diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp index e5d1b93cbf82..119b3291817b 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp @@ -30,10 +30,13 @@ MaterializeEncodingTypeConverter::MaterializeEncodingTypeConverter( addConversion([](FloatType floatType) { return floatType; }); addConversion([](MemRefType memrefType) { return memrefType; }); addConversion([=](RankedTensorType type) -> RankedTensorType { + MaterializeEncodingInfo encodingInfo = getEncodingInfo(type); + if (IREE::Encoding::hasPackedStorageAttr(type)) { + return type; + } // For a given tensor type with an encoding, return the materialized // type to use for it. If no encoding is set, then return the tensor type // itself. - MaterializeEncodingInfo encodingInfo = getEncodingInfo(type); if (IREE::Codegen::isIdentityLayout(encodingInfo)) { return dropEncoding(type); } @@ -92,6 +95,14 @@ MaterializeEncodingTypeConverter::getEncodingInfo(RankedTensorType type) const { } RankedTensorType dropEncoding(RankedTensorType type) { + assert(!IREE::Encoding::hasPackedStorageAttr(type) && + "not expected `packed_storage` attribute."); + return RankedTensorType::get(type.getShape(), type.getElementType()); +} + +RankedTensorType dropPackedStorageEncodingIfAny(RankedTensorType type) { + if (!IREE::Encoding::hasPackedStorageAttr(type)) + return type; return RankedTensorType::get(type.getShape(), type.getElementType()); } diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h index 08a8a5aadbe6..ff07c37ae233 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h @@ -10,6 +10,7 @@ #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenTypes.h" #include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h" +#include "iree/compiler/Dialect/Encoding/IR/EncodingTypes.h" #include "iree/compiler/Dialect/HAL/IR/HALTypes.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Transforms/DialectConversion.h" @@ -77,6 +78,9 @@ class OpMaterializeEncodingPattern : public OpConversionPattern { /// Returns the RankedTensorType without encodings. RankedTensorType dropEncoding(RankedTensorType type); +/// Returns the RankedTensorType without packed storage encoding (if any). +RankedTensorType dropPackedStorageEncodingIfAny(RankedTensorType type); + /// Returns the deserialized MaterializeEncodingInfo if the `layouts` field is /// present in encodings and it only has a single layout. Otherwise, returns /// std::nullopt. 
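The EncodingUtils changes above make `#iree_encoding.packed_storage` a pass-through for the materialization type converter: a tensor type carrying the attribute is returned unchanged, and the new `dropPackedStorageEncodingIfAny` strips only that attribute when a plain tensor type is needed (e.g. for the buffer-view export path touched later in this patch). A minimal MLIR sketch of the intended type behavior, reusing the `#packed` alias from the tests below (the function name is illustrative and not part of the change):

#packed = #iree_encoding.packed_storage

// tensor<8xi1, #packed> survives encoding materialization as-is;
// dropPackedStorageEncodingIfAny maps it to a plain tensor<8xi1>.
func.func @packed_i1_bitcast_roundtrip(%arg0: tensor<1xi8>) -> tensor<1xi8> {
  // One i8 byte reinterpreted as eight packed i1 elements (same storage size).
  %bits = flow.tensor.bitcast %arg0 : tensor<1xi8> -> tensor<8xi1, #packed>
  // Bitcast back; packed storage is never widened to one byte per element.
  %bytes = flow.tensor.bitcast %bits : tensor<8xi1, #packed> -> tensor<1xi8>
  return %bytes : tensor<1xi8>
}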
diff --git a/compiler/src/iree/compiler/Codegen/Common/TypePropagationPass.cpp b/compiler/src/iree/compiler/Codegen/Common/TypePropagationPass.cpp index bd182bdd77fd..676a7c3ae005 100644 --- a/compiler/src/iree/compiler/Codegen/Common/TypePropagationPass.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/TypePropagationPass.cpp @@ -25,6 +25,7 @@ //===---------------------------------------------------------------------===// #include "iree/compiler/Codegen/Common/Passes.h" +#include "iree/compiler/Dialect/Encoding/IR/EncodingTypes.h" #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h" #include "iree/compiler/Dialect/Util/IR/UtilTypes.h" @@ -65,9 +66,8 @@ static Value convertElementType(OpBuilder &b, Location loc, Type targetType, /// std::nullopt. static std::optional getLegalizedType(Type t) { if (auto shapedType = llvm::dyn_cast(t)) { - Type elementType = shapedType.getElementType(); std::optional legalizedElementType = - legalizeStorageElementType(elementType); + legalizeTensorStorageElementType(shapedType); if (!legalizedElementType) return std::nullopt; return RankedTensorType::get(shapedType.getShape(), @@ -121,7 +121,7 @@ struct ConstantOpTypeConversion constantOp, "expected attribute type to be shaped type"); } std::optional legalizedElementType = - legalizeStorageElementType(attrType.getElementType()); + legalizeTensorStorageElementType(attrType); if (!legalizedElementType) { return rewriter.notifyMatchFailure(constantOp, "cannot legalize elementType"); @@ -227,8 +227,10 @@ struct GenericOpTypePropagation signatureConverter.addInputs(index, argType); continue; } + auto inputOperandType = + llvm::cast(genericOp->getOperandTypes()[index]); std::optional legalizedArgType = - legalizeStorageElementType(argType); + legalizeTensorStorageElementType(inputOperandType); if (!legalizedArgType) { return genericOp.emitOpError("failed to get legalized type for arg ") << index; @@ -258,8 +260,8 @@ struct GenericOpTypePropagation modifyYield = true; OpOperand *yieldOperand = modifiedOp.getMatchingYieldValue(modifiedOpOperand); - std::optional legalizedType = - legalizeStorageElementType(yieldOperand->get().getType()); + std::optional legalizedType = legalizeTensorStorageElementType( + modifiedOpOperand->get().getType()); if (!legalizedType) { return genericOp.emitOpError( "failed to get legalized type for yield value"); @@ -289,7 +291,7 @@ struct LinalgFillTypePropagation ConversionPatternRewriter &rewriter) const final { Value value = adaptor.getInputs().front(); std::optional legalizedElementType = - legalizeStorageElementType(value.getType()); + legalizeTensorStorageElementType(adaptor.getOutputs()[0].getType()); if (!legalizedElementType) { return fillOp.emitOpError("failed to get legalized type for value"); } @@ -355,8 +357,8 @@ struct IREELinalgExtScatterTypePropagation // type. 
TypeConverter::SignatureConversion signatureConverter( modifiedOpRegion.getNumArguments()); - Type argType = modifiedOpRegion.getArguments()[0].getType(); - std::optional legalizedArgType = legalizeStorageElementType(argType); + std::optional legalizedArgType = + legalizeTensorStorageElementType(inputType); if (!legalizedArgType) { return scatterOp.emitOpError("failed to get legalized type for argument"); } @@ -418,8 +420,12 @@ struct IREELinalgExtSortTypePropagation TypeConverter::SignatureConversion signatureConverter( modifiedOpRegion.getNumArguments()); for (auto [index, arg] : llvm::enumerate(modifiedOpRegion.getArguments())) { + // Refer to input types of the original operation to determine the + // corresponding legal arg type. + auto convertType = index % 2 == 0 ? sortOp->getOperandTypes()[index / 2] + : sortOp->getResultTypes()[index / 2]; std::optional legalizedArgType = - legalizeStorageElementType(arg.getType()); + legalizeTensorStorageElementType(convertType); if (!legalizedArgType) { return sortOp.emitOpError("failed to get legalized type for argument"); } diff --git a/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingAttrs.cpp b/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingAttrs.cpp index be1462cd30a4..e37589bf96ad 100644 --- a/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingAttrs.cpp +++ b/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingAttrs.cpp @@ -281,8 +281,12 @@ EncodingAttr getEncodingAttr(RankedTensorType type) { return dyn_cast_or_null(type.getEncoding()); } -bool hasPackedStorageAttr(RankedTensorType type) { - return dyn_cast_or_null(type.getEncoding()) != nullptr; +bool hasPackedStorageAttr(Type type) { + if (auto tensorType = dyn_cast(type)) { + return dyn_cast_or_null(tensorType.getEncoding()) != + nullptr; + } + return false; } FailureOr diff --git a/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingTypes.h b/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingTypes.h index 6edc666110d0..96812d63afe2 100644 --- a/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingTypes.h +++ b/compiler/src/iree/compiler/Dialect/Encoding/IR/EncodingTypes.h @@ -43,7 +43,7 @@ namespace mlir::iree_compiler::IREE::Encoding { EncodingAttr getEncodingAttr(RankedTensorType type); /// Returns true if the type contains packed_storage attribute. -bool hasPackedStorageAttr(RankedTensorType type); +bool hasPackedStorageAttr(Type type); /// Returns the ContractionDimensions for the encoding user_indexing_maps. 
FailureOr diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/BUILD.bazel b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/BUILD.bazel index 5c64fa1ce8a2..bb8606fec0af 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/BUILD.bazel +++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/BUILD.bazel @@ -22,6 +22,7 @@ iree_compiler_cc_library( ], deps = [ ":Utils", + "//compiler/src/iree/compiler/Codegen/Common", "//compiler/src/iree/compiler/Dialect/HAL/Analysis", "//compiler/src/iree/compiler/Dialect/HAL/Conversion", "//compiler/src/iree/compiler/Dialect/HAL/IR", diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/CMakeLists.txt index 92dc1b9dba64..de1892ce0b4b 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/CMakeLists.txt +++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/CMakeLists.txt @@ -28,6 +28,7 @@ iree_cc_library( MLIRSCFDialect MLIRTransformUtils MLIRTransforms + iree::compiler::Codegen::Common iree::compiler::Dialect::HAL::Analysis iree::compiler::Dialect::HAL::Conversion iree::compiler::Dialect::HAL::IR diff --git a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp index 748483349f98..e3da3683d4c6 100644 --- a/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp +++ b/compiler/src/iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.cpp @@ -6,6 +6,7 @@ #include "iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Patterns.h" +#include "iree/compiler/Codegen/Common/EncodingUtils.h" #include "iree/compiler/Dialect/HAL/Analysis/Captures.h" #include "iree/compiler/Dialect/HAL/Conversion/StreamToHAL/Utils.h" #include "iree/compiler/Dialect/HAL/IR/HALDialect.h" @@ -478,7 +479,8 @@ struct TensorExportBufferViewOpPattern } auto loc = exportOp.getLoc(); - auto tensorType = llvm::cast(adaptor.getSourceEncoding()); + auto tensorType = dropPackedStorageEncodingIfAny( + llvm::cast(adaptor.getSourceEncoding())); auto dynamicDims = adaptor.getSourceEncodingDims(); // NOTE: we should have verified supported encodings/types at entry into the diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/IR/BUILD.bazel index 2fa22edf5eb6..ef727de312b6 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/BUILD.bazel +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/BUILD.bazel @@ -64,7 +64,7 @@ iree_compiler_cc_library( ":StreamInterfacesGen", ":StreamOpsGen", ":StreamTypesGen", - "//compiler/src/iree/compiler/Dialect/Util/IR", + "//compiler/src/iree/compiler/Dialect/Encoding/IR", "//compiler/src/iree/compiler/Utils", "@llvm-project//llvm:Support", "@llvm-project//mlir:ArithDialect", diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Stream/IR/CMakeLists.txt index 2f10910741ae..79e4e2c46539 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/CMakeLists.txt +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/CMakeLists.txt @@ -54,7 +54,7 @@ iree_cc_library( MLIRTensorDialect MLIRTransformUtils MLIRViewLikeInterface - iree::compiler::Dialect::Util::IR + iree::compiler::Dialect::Encoding::IR iree::compiler::Utils PUBLIC ) diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp 
b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp index 13988a999b2f..46f88edb7def 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp @@ -6,6 +6,7 @@ #include "iree/compiler/Dialect/Stream/IR/StreamOps.h" +#include "iree/compiler/Dialect/Encoding/IR/EncodingTypes.h" #include "iree/compiler/Dialect/Util/IR/ClosureOpUtils.h" #include "iree/compiler/Dialect/Util/IR/UtilOps.h" #include "iree/compiler/Dialect/Util/IR/UtilTypes.h" @@ -27,6 +28,10 @@ #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/RegionUtils.h" +namespace mlir::iree_compiler { +using IREE::Encoding::getEncodingAttr; +} + namespace mlir::iree_compiler::IREE::Stream { //===----------------------------------------------------------------------===// @@ -1903,7 +1908,7 @@ LogicalResult TensorCloneOp::verify() { // information. auto sourceEncoding = llvm::cast(op.getSourceEncoding()); auto resultEncoding = llvm::cast(op.getResultEncoding()); - if (sourceEncoding.getEncoding() != resultEncoding.getEncoding()) { + if (getEncodingAttr(sourceEncoding) != getEncodingAttr(resultEncoding)) { return op.emitOpError() << "clones changing tensor encoding from " << sourceEncoding.getEncoding() << " to " << resultEncoding.getEncoding() << "; not allowed"; diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ConvertToStream.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ConvertToStream.cpp index 501cbb83fbbb..4676fb4fc307 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ConvertToStream.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ConvertToStream.cpp @@ -4,6 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "iree/compiler/Dialect/Encoding/IR/EncodingTypes.h" #include "iree/compiler/Dialect/Flow/IR/FlowDialect.h" #include "iree/compiler/Dialect/Flow/IR/FlowTypes.h" #include "iree/compiler/Dialect/Stream/Analysis/Affinity.h" @@ -22,6 +23,7 @@ #include "iree/compiler/Dialect/Util/IR/UtilTypes.h" #include "iree/compiler/Dialect/Util/Transforms/Passes.h" #include "iree/compiler/Dialect/Util/Transforms/Patterns.h" +#include "llvm/Support/Casting.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" @@ -247,6 +249,12 @@ struct ConvertToStreamPass final if (llvm::isa(type)) { return IREE::Stream::ChannelType::get(context); } + if (auto rankedType = llvm::dyn_cast_or_null(type)) { + if (IREE::Encoding::hasPackedStorageAttr(rankedType)) { + return RankedTensorType::get(rankedType.getShape(), + rankedType.getElementType()); + } + } return !llvm::isa(type) ? type : Type{}; }); diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp index c3753aab0dfe..ec0d24e74eed 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/EncodeTensors.cpp @@ -58,7 +58,7 @@ static LogicalResult checkEncoding(Operation *op, RankedTensorType encodingType, // Aligns the element type of a tensor<> to a byte-aligned power of 2 bit width. 
static RankedTensorType alignTensorType(RankedTensorType originalType) { Type elementType = originalType.getElementType(); - Type alignedType = legalizeStorageElementType(elementType); + Type alignedType = legalizeTensorStorageElementType(originalType); if (alignedType == elementType) return originalType; return RankedTensorType::get(originalType.getShape(), alignedType, @@ -168,7 +168,9 @@ static Value canonicalizeFillPattern(Value pattern, OpBuilder &builder) { // %i8_val = (%i8_val << 2) | %i2_val // %i8_val = (%i8_val << 2) | %i2_val // %i8_val = (%i8_val << 2) | %i2_val - if (needToPackSubByteElementBitWidth(elementBitWidth)) { + bool patternIsPacked = + IREE::Encoding::hasPackedStorageAttr(pattern.getType()); + if (!patternIsPacked && needToPackSubByteElementBitWidth(elementBitWidth)) { Type i8Type = builder.getI8Type(); Value bitwidth = builder.createOrFold( loc, i8Type, builder.getIntegerAttr(i8Type, elementBitWidth)); @@ -655,7 +657,8 @@ struct EncodeHostTensorsPass static IREE::Flow::DispatchTensorType alignDispatchTensorType(IREE::Flow::DispatchTensorType originalType) { Type elementType = originalType.getBoundElementType(); - Type alignedType = legalizeStorageElementType(elementType); + Type alignedType = + legalizeTensorStorageElementType(originalType.asRankedTensorType()); if (alignedType == elementType) return originalType; return IREE::Flow::DispatchTensorType::get( diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel index 138ba0be6689..3e1dacdb1eda 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/BUILD.bazel @@ -30,7 +30,6 @@ iree_lit_test_suite( "encode_host_tensors.mlir", "encode_host_tensors_encoding.mlir", "encode_host_tensors_packing.mlir", - "encode_host_tensors_packing_i1_experimental_clopt.mlir", "fold_globals.mlir", "fold_uniform_operands.mlir", "fuse_dispatch_bindings.mlir", diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt index 4c4cb93d80ef..75aaf654b8b8 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/CMakeLists.txt @@ -28,7 +28,6 @@ iree_lit_test_suite( "encode_host_tensors.mlir" "encode_host_tensors_encoding.mlir" "encode_host_tensors_packing.mlir" - "encode_host_tensors_packing_i1_experimental_clopt.mlir" "fold_globals.mlir" "fold_uniform_operands.mlir" "fuse_dispatch_bindings.mlir" diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1_experimental_clopt.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1_experimental_clopt.mlir deleted file mode 100644 index c96e05270d12..000000000000 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors_packing_i1_experimental_clopt.mlir +++ /dev/null @@ -1,23 +0,0 @@ -// This is only used to test the experimental packing flag. When the default -// is changed the encode_host_tensors.mlir test should be updated and used -// instead and this file should be deleted. 
- -// RUN: iree-opt --split-input-file --iree-stream-encode-host-tensors --iree-experimental-packed-i1-storage %s | FileCheck %s - -// CHECK-LABEL: @tensorSizeOfUnalignedPackedI1 -util.func @tensorSizeOfUnalignedPackedI1() -> index { - // CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index - %0 = stream.tensor.sizeof tensor<12xi1> : index - // CHECK: return %[[C2]] : index - util.return %0 : index -} - -// ----- - -// CHECK-LABEL: @tensorSizeOfAlignedPackedI1 -util.func @tensorSizeOfAlignedPackedI1() -> index { - // CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index - %0 = stream.tensor.sizeof tensor<24xi1> : index - // CHECK: util.return %[[C3]] : index - util.return %0 : index -} diff --git a/compiler/src/iree/compiler/Utils/ElementPackingUtils.cpp b/compiler/src/iree/compiler/Utils/ElementPackingUtils.cpp index d7b3258d5e45..f902874582cd 100644 --- a/compiler/src/iree/compiler/Utils/ElementPackingUtils.cpp +++ b/compiler/src/iree/compiler/Utils/ElementPackingUtils.cpp @@ -15,27 +15,9 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/IR/BuiltinTypes.h" -// TODO(lialan): remove cl options once frontend can emit packed i1 tensors. -llvm::cl::opt clEnableI1Support( - "iree-experimental-packed-i1-storage", - llvm::cl::desc( - "Experimental feature: force to use packed storage for i1 tensors." - "Turning on this option will see i1 tensors as if it has " - "#iree_encoding.packed_storage attribute." - "This is to allow an alternative way to test the packed storage " - "feature before frontend can emit packed i1 tensors." - "This option can be dropped once the frontend can emit packed i1 " - "tensors."), - llvm::cl::init(false)); - namespace mlir::iree_compiler { -static bool needToPackSubByteElementBitWidthImpl(unsigned bitWidth, - bool isPackedStorage) { - // Enable i1 support if requested. - if (isPackedStorage && bitWidth == 1) { - return true; - } +bool needToPackSubByteElementBitWidth(unsigned bitWidth) { // Require the original bit width to be some power of two for now to avoid // trickiness and weirdness of packing and cross-byte access. // Also disallow boolean values for now--they may require separate interface @@ -43,19 +25,13 @@ static bool needToPackSubByteElementBitWidthImpl(unsigned bitWidth, return bitWidth < 8 && llvm::isPowerOf2_32(bitWidth) && bitWidth != 1; } -bool needToPackSubByteElementBitWidth(unsigned bitWidth) { - return needToPackSubByteElementBitWidthImpl( - bitWidth, /*isPackedStorage=*/clEnableI1Support); -} - bool needToPackSubByteElements(RankedTensorType shapedType) { unsigned bitWidth = IREE::Util::getTypeBitWidth(shapedType.getElementType()); - // Two paths to enable packed storage for i1 tensors: the attribute or cl - // option. The cl option will be dropped once frontend supports emitting - // tensors with attributes. - bool isPackedStorage = - IREE::Encoding::hasPackedStorageAttr(shapedType) || clEnableI1Support; - return needToPackSubByteElementBitWidthImpl(bitWidth, isPackedStorage); + // i1 with packed memory layout does not need to be extended. + if (bitWidth == 1 && IREE::Encoding::hasPackedStorageAttr(shapedType)) { + return true; + } + return needToPackSubByteElementBitWidth(bitWidth); } static Type legalizeStorageElementTypeImpl(Type elementType, @@ -65,9 +41,13 @@ static Type legalizeStorageElementTypeImpl(Type elementType, if (!intType) return elementType; - // For sub-byte elements, default to pack them into bytes. 
unsigned bitWidth = intType.getWidth(); - if (needToPackSubByteElementBitWidthImpl(bitWidth, isPackedStorage)) + if (bitWidth == 1 && isPackedStorage) { + return elementType; + } + + // For sub-byte elements, default to pack them into bytes. + if (needToPackSubByteElementBitWidth(bitWidth)) return elementType; // Otherwise, extend them to the next power-of-two bit width. @@ -79,10 +59,10 @@ static Type legalizeStorageElementTypeImpl(Type elementType, intType.getSignedness()); } -Type legalizeStorageElementType(Type elementType) { - // Consider packed storage for i1 tensors if cl opt is set. - return legalizeStorageElementTypeImpl(elementType, - /*isPackedStorage=*/clEnableI1Support); +Type legalizeTensorStorageElementType(Type type) { + auto tensorType = llvm::cast(type); + return legalizeStorageElementTypeImpl( + tensorType.getElementType(), IREE::Encoding::hasPackedStorageAttr(type)); } Value calculateStorageElementCountInBytes(Location loc, @@ -96,15 +76,16 @@ Value calculateStorageElementCountInBytes(Location loc, loc, builder, shapedType, dynamicDims); } - bool isPackedStorage = - IREE::Encoding::hasPackedStorageAttr(shapedType) || clEnableI1Support; - Type alignedElementType = legalizeStorageElementTypeImpl( - shapedType.getElementType(), isPackedStorage); + Type alignedElementType = legalizeTensorStorageElementType(shapedType); unsigned elementBits = IREE::Util::getTypeBitWidth(alignedElementType); + bool isPackedStorage = IREE::Encoding::hasPackedStorageAttr(shapedType); + bool isI1WithPackedStorage = elementBits == 1 && isPackedStorage; + // Calculate all static dims first, if any. int64_t staticCount = 1; - if (!needToPackSubByteElementBitWidthImpl(elementBits, isPackedStorage)) { + if (!isI1WithPackedStorage && + !needToPackSubByteElementBitWidth(elementBits)) { staticCount *= IREE::Util::getRoundedElementByteWidth(alignedElementType); } @@ -119,7 +100,7 @@ Value calculateStorageElementCountInBytes(Location loc, value = builder.createOrFold(loc, value, dim); } // Sub-byte packing requires putting multiple elements in the same byte. - if (needToPackSubByteElementBitWidthImpl(elementBits, isPackedStorage)) { + if (isI1WithPackedStorage || needToPackSubByteElementBitWidth(elementBits)) { assert(8 % elementBits == 0); unsigned byteElements = 8 / elementBits; // TODO(antiagainst): We may want to emit runtime check to make sure this is @@ -139,14 +120,14 @@ Value calculateStorageElementOffsetInBytes(Location loc, RankedTensorType originalType, Value linearizedIndex, OpBuilder &builder) { - bool isPackedStorage = - IREE::Encoding::hasPackedStorageAttr(originalType) || clEnableI1Support; - Type alignedElementType = legalizeStorageElementTypeImpl( - originalType.getElementType(), isPackedStorage); + Type alignedElementType = legalizeTensorStorageElementType(originalType); unsigned elementBits = IREE::Util::getTypeBitWidth(alignedElementType); + bool isPackedStorage = IREE::Encoding::hasPackedStorageAttr(originalType); + bool isI1WithPackedStorage = elementBits == 1 && isPackedStorage; + // Sub-byte packing requires putting multiple elements in the same byte. 
- if (needToPackSubByteElementBitWidthImpl(elementBits, isPackedStorage)) { + if (isI1WithPackedStorage || needToPackSubByteElementBitWidth(elementBits)) { Value byteElements = builder.create(loc, 8 / elementBits); // TODO(antiagainst): We may want to emit runtime check to make sure this is diff --git a/compiler/src/iree/compiler/Utils/ElementPackingUtils.h b/compiler/src/iree/compiler/Utils/ElementPackingUtils.h index 9de6ea70c26a..38fff78df2b8 100644 --- a/compiler/src/iree/compiler/Utils/ElementPackingUtils.h +++ b/compiler/src/iree/compiler/Utils/ElementPackingUtils.h @@ -30,6 +30,8 @@ bool needToPackSubByteElements(RankedTensorType shapedType); /// cases. Type legalizeStorageElementType(Type elementType); +Type legalizeTensorStorageElementType(Type tensorType); + /// Emits IR with the given |builder| to calculate the total number of bytes /// required for the given |shapedType| in storage. Returns the value for the /// final count on success; returns nullptr on failure. Dynamic dimensions in diff --git a/tests/e2e/linalg_ext_ops/BUILD.bazel b/tests/e2e/linalg_ext_ops/BUILD.bazel index acc14d1c61f9..2c81be1a66c2 100644 --- a/tests/e2e/linalg_ext_ops/BUILD.bazel +++ b/tests/e2e/linalg_ext_ops/BUILD.bazel @@ -42,24 +42,6 @@ iree_check_single_backend_test_suite( target_backend = "llvm-cpu", ) -iree_check_single_backend_test_suite( - name = "check_llvm-cpu_local-task_i1", - srcs = [ - "attention_i1_mask.mlir", - ], - compiler_flags = [ - "--iree-llvmcpu-target-cpu=generic", - "--iree-experimental-packed-i1-storage", - ], - driver = "local-task", - tags = [ - # attention fails with a wasm target, just disable the tests there for now - # error: Yield operand #2 is not equivalent to the corresponding iter bbArg - "nowasm", - ], - target_backend = "llvm-cpu", -) - VMVX_SRCS = enforce_glob( # keep sorted [ diff --git a/tests/e2e/linalg_ext_ops/CMakeLists.txt b/tests/e2e/linalg_ext_ops/CMakeLists.txt index 37377670dd20..5bc968c6e9b7 100644 --- a/tests/e2e/linalg_ext_ops/CMakeLists.txt +++ b/tests/e2e/linalg_ext_ops/CMakeLists.txt @@ -31,22 +31,6 @@ iree_check_single_backend_test_suite( "nowasm" ) -iree_check_single_backend_test_suite( - NAME - check_llvm-cpu_local-task_i1 - SRCS - "attention_i1_mask.mlir" - TARGET_BACKEND - "llvm-cpu" - DRIVER - "local-task" - COMPILER_FLAGS - "--iree-llvmcpu-target-cpu=generic" - "--iree-experimental-packed-i1-storage" - LABELS - "nowasm" -) - iree_check_single_backend_test_suite( NAME check_vmvx_local-task diff --git a/tests/e2e/linalg_ext_ops/attention_i1_mask.mlir b/tests/e2e/linalg_ext_ops/attention_i1_mask.mlir index e4a4631c1cd0..07d1852e6f56 100644 --- a/tests/e2e/linalg_ext_ops/attention_i1_mask.mlir +++ b/tests/e2e/linalg_ext_ops/attention_i1_mask.mlir @@ -1,3 +1,4 @@ +#packed = #iree_encoding.packed_storage func.func @attention1x4x4_i1_mask() { %init = tensor.empty() : tensor<1x4x4xf32> %query = util.unfoldable_constant dense<[[[0.1, 0.2, 0.3, 0.4], @@ -15,7 +16,7 @@ func.func @attention1x4x4_i1_mask() { [1.3, 1.4, 1.5, 1.6]]]> : tensor<1x4x4xf32> %i8mask = util.unfoldable_constant dense<[165, 165]> : tensor<2xi8> - %mask = flow.tensor.bitcast %i8mask : tensor<2xi8> -> tensor<1x4x4xi1> + %mask = flow.tensor.bitcast %i8mask : tensor<2xi8> -> tensor<1x4x4xi1, #packed> %scale = arith.constant 0.5 : f32 %1 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, @@ -25,7 +26,7 @@ func.func @attention1x4x4_i1_mask() { affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} 
ins(%query, %key, %value, %scale, %mask : tensor<1x4x4xf32>, - tensor<1x4x4xf32>, tensor<1x4x4xf32>, f32, tensor<1x4x4xi1>) outs(%init : tensor<1x4x4xf32>) { + tensor<1x4x4xf32>, tensor<1x4x4xf32>, f32, tensor<1x4x4xi1, #packed>) outs(%init : tensor<1x4x4xf32>) { ^bb0(%arg0: f32): iree_linalg_ext.yield %arg0 : f32 } -> tensor<1x4x4xf32> @@ -56,7 +57,7 @@ func.func @attention1x4x4_i1_mask_all_ones() { [1.3, 1.4, 1.5, 1.6]]]> : tensor<1x4x4xf32> %i8mask = util.unfoldable_constant dense<[255, 255]> : tensor<2xi8> - %mask = flow.tensor.bitcast %i8mask : tensor<2xi8> -> tensor<1x4x4xi1> + %mask = flow.tensor.bitcast %i8mask : tensor<2xi8> -> tensor<1x4x4xi1, #packed> %scale = arith.constant 0.5 : f32 %1 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, @@ -66,7 +67,7 @@ func.func @attention1x4x4_i1_mask_all_ones() { affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%query, %key, %value, %scale, %mask : tensor<1x4x4xf32>, - tensor<1x4x4xf32>, tensor<1x4x4xf32>, f32, tensor<1x4x4xi1>) outs(%init : tensor<1x4x4xf32>) { + tensor<1x4x4xf32>, tensor<1x4x4xf32>, f32, tensor<1x4x4xi1, #packed>) outs(%init : tensor<1x4x4xf32>) { ^bb0(%arg0: f32): iree_linalg_ext.yield %arg0 : f32 } -> tensor<1x4x4xf32> @@ -97,7 +98,7 @@ func.func @attention1x4x4_i1_mask_tril() { [1.3, 1.4, 1.5, 1.6]]]> : tensor<1x4x4xf32> %i8mask = util.unfoldable_constant dense<[140, 239]> : tensor<2xi8> - %mask = flow.tensor.bitcast %i8mask : tensor<2xi8> -> tensor<1x4x4xi1> + %mask = flow.tensor.bitcast %i8mask : tensor<2xi8> -> tensor<1x4x4xi1, #packed> %scale = arith.constant 0.5 : f32 %1 = iree_linalg_ext.attention {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>, @@ -107,7 +108,7 @@ func.func @attention1x4x4_i1_mask_tril() { affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d4)>]} ins(%query, %key, %value, %scale, %mask : tensor<1x4x4xf32>, - tensor<1x4x4xf32>, tensor<1x4x4xf32>, f32, tensor<1x4x4xi1>) outs(%init : tensor<1x4x4xf32>) { + tensor<1x4x4xf32>, tensor<1x4x4xf32>, f32, tensor<1x4x4xi1, #packed>) outs(%init : tensor<1x4x4xf32>) { ^bb0(%arg0: f32): iree_linalg_ext.yield %arg0 : f32 } -> tensor<1x4x4xf32> diff --git a/tests/e2e/subbyte_types/BUILD.bazel b/tests/e2e/subbyte_types/BUILD.bazel index ff1b1f3ea643..adee37a39294 100644 --- a/tests/e2e/subbyte_types/BUILD.bazel +++ b/tests/e2e/subbyte_types/BUILD.bazel @@ -19,16 +19,15 @@ package( ) iree_check_single_backend_test_suite( - name = "check_llvm-cpu_subbyte_emulation", + name = "check_llvm-cpu_subbyte_emulation_attr", srcs = enforce_glob( [ - "subbyte_types.mlir", + "subbyte_types_attr.mlir", ], include = ["*.mlir"], ), compiler_flags = [ "--iree-llvmcpu-target-cpu=generic", - "--iree-experimental-packed-i1-storage", ], driver = "local-task", tags = [ @@ -41,6 +40,6 @@ iree_check_single_backend_test_suite( test_suite( name = "check", tests = [ - ":check_llvm-cpu_subbyte_emulation", + ":check_llvm-cpu_subbyte_emulation_attr", ], ) diff --git a/tests/e2e/subbyte_types/CMakeLists.txt b/tests/e2e/subbyte_types/CMakeLists.txt index 8077be696478..27bfdd20337f 100644 --- a/tests/e2e/subbyte_types/CMakeLists.txt +++ b/tests/e2e/subbyte_types/CMakeLists.txt @@ -12,16 +12,15 @@ iree_add_all_subdirs() iree_check_single_backend_test_suite( NAME - check_llvm-cpu_subbyte_emulation + check_llvm-cpu_subbyte_emulation_attr SRCS - "subbyte_types.mlir" + "subbyte_types_attr.mlir" TARGET_BACKEND "llvm-cpu" DRIVER "local-task" 
COMPILER_FLAGS "--iree-llvmcpu-target-cpu=generic" - "--iree-experimental-packed-i1-storage" LABELS "nowasm" ) diff --git a/tests/e2e/subbyte_types/subbyte_types.mlir b/tests/e2e/subbyte_types/subbyte_types.mlir deleted file mode 100644 index a1fa90bf4446..000000000000 --- a/tests/e2e/subbyte_types/subbyte_types.mlir +++ /dev/null @@ -1,99 +0,0 @@ -func.func @i1_type() { - %c0 = arith.constant 0 : index - %c255 = arith.constant 255 : i8 - %input1 = util.unfoldable_constant dense<[85]> : tensor<1xi8> // b01010101 - %input2 = util.unfoldable_constant dense<[170]> : tensor<1xi8> // b10101010 - %lhs = flow.tensor.bitcast %input1 : tensor<1xi8> -> tensor<8xi1> - %rhs = flow.tensor.bitcast %input2 : tensor<1xi8> -> tensor<8xi1> - %empty = tensor.empty() : tensor<8xi1> - %res = linalg.generic - {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} - ins(%lhs, %rhs : tensor<8xi1>, tensor<8xi1>) outs(%empty: tensor<8xi1>) { - ^bb0(%inlhs: i1, %inrhs: i1, %out: i1): - %inres = arith.xori %inlhs, %inrhs: i1 - linalg.yield %inres : i1 - } -> tensor<8xi1> - %tensor_res = flow.tensor.bitcast %res : tensor<8xi1> -> tensor<1xi8> - check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8> - return -} - -func.func @i1_type_slice() { - %input = util.unfoldable_constant dense<[0, 255, 0]> : tensor<3xi8> - %flat_input_all = flow.tensor.bitcast %input : tensor<3xi8> -> tensor<24xi1> - %slice = tensor.extract_slice %flat_input_all[8][8][1] : tensor<24xi1> to tensor<8xi1> - %tensor_res = flow.tensor.bitcast %slice : tensor<8xi1> -> tensor<1xi8> - check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8> - return -} - -func.func @i1_representation() { - %mask = util.unfoldable_constant dense<[140]> : tensor<1xi8> - %casted = flow.tensor.bitcast %mask : tensor<1xi8> -> tensor<2x4xi1> - %bar = util.optimization_barrier %casted : tensor<2x4xi1> - %tensor_res = flow.tensor.bitcast %bar : tensor<2x4xi1> -> tensor<1xi8> - check.expect_eq_const(%tensor_res, dense<[140]> : tensor<1xi8>) : tensor<1xi8> - return -} - -func.func @i1_representation_2() { - %mask = util.unfoldable_constant dense<[140, 77]> : tensor<2xi8> - %casted = flow.tensor.bitcast %mask : tensor<2xi8> -> tensor<2x8xi1> - %bar = util.optimization_barrier %casted : tensor<2x8xi1> - %tensor_res = flow.tensor.bitcast %bar : tensor<2x8xi1> -> tensor<2xi8> - check.expect_eq_const(%tensor_res, dense<[140, 77]> : tensor<2xi8>) : tensor<2xi8> - return -} - -func.func @i1_representation_3() { - %mask = util.unfoldable_constant dense<[140, 77]> : tensor<2xi8> - %casted = flow.tensor.bitcast %mask : tensor<2xi8> -> tensor<4x4xi1> - %bar = util.optimization_barrier %casted : tensor<4x4xi1> - %tensor_res = flow.tensor.bitcast %bar : tensor<4x4xi1> -> tensor<2xi8> - check.expect_eq_const(%tensor_res, dense<[140, 77]> : tensor<2xi8>) : tensor<2xi8> - return -} - -func.func @truncate_i1() { - %mask = util.unfoldable_constant dense<[1, 1, 0, 0, - 0, 0, 1, 1]> : tensor<8xi8> - %nm = tensor.empty() : tensor<8xi1> - %truncm = linalg.generic - {indexing_maps = [ - affine_map<(d0) -> (d0)>, - affine_map<(d0) -> (d0)>], - iterator_types = ["parallel"]} - ins(%mask: tensor<8xi8>) - outs(%nm: tensor<8xi1>) { - ^bb0(%in: i8, %out: i1): - %zero = arith.constant 0 : i8 - %truncated = arith.cmpi "sgt", %in, %zero : i8 - linalg.yield %truncated : i1 - } -> tensor<8xi1> - %tensor_res = flow.tensor.bitcast %truncm : tensor<8xi1> -> tensor<1xi8> - 
check.expect_eq_const(%tensor_res, dense<[195]> : tensor<1xi8>) : tensor<1xi8> - return -} - -func.func @truncate_i1_2() { - %mask = util.unfoldable_constant dense<[[0, 0, 1, 1], - [1, 1, 0, 0], - [1, 1, 0, 0], - [0, 0, 1, 1]]> : tensor<4x4xi8> - %nm = tensor.empty() : tensor<4x4xi1> - %truncm = linalg.generic - {indexing_maps = [ - affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%mask: tensor<4x4xi8>) - outs(%nm: tensor<4x4xi1>) { - ^bb0(%in: i8, %out: i1): - %zero = arith.constant 0 : i8 - %truncated = arith.cmpi "sgt", %in, %zero : i8 - linalg.yield %truncated : i1 - } -> tensor<4x4xi1> - %tensor_res = flow.tensor.bitcast %truncm : tensor<4x4xi1> -> tensor<2xi8> - check.expect_eq_const(%tensor_res, dense<[60, 195]> : tensor<2xi8>) : tensor<2xi8> - return -} diff --git a/tests/e2e/subbyte_types/subbyte_types_attr.mlir b/tests/e2e/subbyte_types/subbyte_types_attr.mlir new file mode 100644 index 000000000000..d6b5fd3cb793 --- /dev/null +++ b/tests/e2e/subbyte_types/subbyte_types_attr.mlir @@ -0,0 +1,56 @@ +#packed = #iree_encoding.packed_storage +func.func @i1_type_slice() { + %input = util.unfoldable_constant dense<[0, 255, 0]> : tensor<3xi8> + %flat_input_all = flow.tensor.bitcast %input : tensor<3xi8> -> tensor<24xi1, #packed> + %slice = tensor.extract_slice %flat_input_all[8][8][1] : tensor<24xi1, #packed> to tensor<8xi1, #packed> + %tensor_res = flow.tensor.bitcast %slice : tensor<8xi1, #packed> -> tensor<1xi8> + check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8> + return +} + +func.func @i1_representation() { + %mask = util.unfoldable_constant dense<[140]> : tensor<1xi8> + %casted = flow.tensor.bitcast %mask : tensor<1xi8> -> tensor<2x4xi1, #packed> + %bar = util.optimization_barrier %casted : tensor<2x4xi1, #packed> + %tensor_res = flow.tensor.bitcast %bar : tensor<2x4xi1, #packed> -> tensor<1xi8> + check.expect_eq_const(%tensor_res, dense<[140]> : tensor<1xi8>) : tensor<1xi8> + return +} + +func.func @i1_representation_2() { + %mask = util.unfoldable_constant dense<[140, 77]> : tensor<2xi8> + %casted = flow.tensor.bitcast %mask : tensor<2xi8> -> tensor<2x8xi1, #packed> + %bar = util.optimization_barrier %casted : tensor<2x8xi1, #packed> + %tensor_res = flow.tensor.bitcast %bar : tensor<2x8xi1, #packed> -> tensor<2xi8> + check.expect_eq_const(%tensor_res, dense<[140, 77]> : tensor<2xi8>) : tensor<2xi8> + return +} + +func.func @i1_representation_3() { + %mask = util.unfoldable_constant dense<[140, 77]> : tensor<2xi8> + %casted = flow.tensor.bitcast %mask : tensor<2xi8> -> tensor<4x4xi1, #packed> + %bar = util.optimization_barrier %casted : tensor<4x4xi1, #packed> + %tensor_res = flow.tensor.bitcast %bar : tensor<4x4xi1, #packed> -> tensor<2xi8> + check.expect_eq_const(%tensor_res, dense<[140, 77]> : tensor<2xi8>) : tensor<2xi8> + return +} + +func.func @i1_type() { + %c0 = arith.constant 0 : index + %c255 = arith.constant 255 : i8 + %input1 = util.unfoldable_constant dense<[85]> : tensor<1xi8> // b01010101 + %input2 = util.unfoldable_constant dense<[170]> : tensor<1xi8> // b10101010 + %lhs = flow.tensor.bitcast %input1 : tensor<1xi8> -> tensor<8xi1, #packed> + %rhs = flow.tensor.bitcast %input2 : tensor<1xi8> -> tensor<8xi1, #packed> + %empty = tensor.empty() : tensor<8xi1, #packed> + %res = linalg.generic + {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} + ins(%lhs, %rhs : tensor<8xi1, #packed>, 
tensor<8xi1, #packed>) outs(%empty: tensor<8xi1, #packed>) { + ^bb0(%inlhs: i1, %inrhs: i1, %out: i1): + %inres = arith.xori %inlhs, %inrhs: i1 + linalg.yield %inres : i1 + } -> tensor<8xi1, #packed> + %tensor_res = flow.tensor.bitcast %res : tensor<8xi1, #packed> -> tensor<1xi8> + check.expect_eq_const(%tensor_res, dense<[255]> : tensor<1xi8>) : tensor<1xi8> + return +}
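For completeness, the size computation exercised by the deleted `--iree-experimental-packed-i1-storage` test carries over to the attribute form. A hedged sketch of how those checks could be written with `#iree_encoding.packed_storage` (the expected byte counts follow the sub-byte packing math in `calculateStorageElementCountInBytes`; these functions mirror the removed test and are not files added by this patch):

#packed = #iree_encoding.packed_storage

// 12 packed i1 elements round up to ceil(12/8) = 2 bytes.
util.func @tensorSizeOfUnalignedPackedI1() -> index {
  %0 = stream.tensor.sizeof tensor<12xi1, #packed> : index
  util.return %0 : index
}

// 24 packed i1 elements occupy exactly 3 bytes.
util.func @tensorSizeOfAlignedPackedI1() -> index {
  %0 = stream.tensor.sizeof tensor<24xi1, #packed> : index
  util.return %0 : index
}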