From 2d3bd370eb51d4ccc95be7d7c28695d7be85a0fb Mon Sep 17 00:00:00 2001
From: Ian Wood
Date: Fri, 15 Nov 2024 13:04:26 -0800
Subject: [PATCH 1/7] Check if type is int or float

Signed-off-by: Ian Wood
---
 .../Codegen/LLVMCPU/KernelDispatch.cpp        | 22 ++++++++++---------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index bd3c1f4614ac..d3d3ec6cf0be 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -2922,16 +2922,18 @@ setLoweringConfigForComputeOps(mlir::FunctionOpInterface entryPointFn,
   // loads and stores will have a performance impact.
   auto resultTypes = rootOperation->getResultTypes();
   if (commonVecTileSizes.size() != 0 && !resultTypes.empty()) {
-    auto elementTypeSize =
-        cast<ShapedType>(rootOperation->getResultTypes().front())
-            .getElementType()
-            .getIntOrFloatBitWidth();
-    // for now just enable for i1
-    if (elementTypeSize == 1) {
-      auto innermostTileSize = commonVecTileSizes.back();
-      commonVecTileSizes.back() =
-          llvm::alignTo(innermostTileSize * elementTypeSize, 8) /
-          elementTypeSize;
+    auto resultType = cast<ShapedType>(rootOperation->getResultTypes().front())
+                          .getElementType();
+    if (resultType.isIntOrFloat()) {
+      auto elementTypeSize = resultType.getIntOrFloatBitWidth();
+
+      // for now just enable for i1
+      if (elementTypeSize == 1) {
+        auto innermostTileSize = commonVecTileSizes.back();
+        commonVecTileSizes.back() =
+            llvm::alignTo(innermostTileSize * elementTypeSize, 8) /
+            elementTypeSize;
+      }
     }
   }

From 55dc55ba029c63133f2bb400ead4a73d152cfadc Mon Sep 17 00:00:00 2001
From: Ian Wood
Date: Mon, 18 Nov 2024 12:59:49 -0800
Subject: [PATCH 2/7] Revert "Check if type is int or float"

This reverts commit 2d3bd370eb51d4ccc95be7d7c28695d7be85a0fb.

Signed-off-by: Ian Wood
---
 .../Codegen/LLVMCPU/KernelDispatch.cpp        | 22 +++++++++----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index d3d3ec6cf0be..bd3c1f4614ac 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -2922,18 +2922,16 @@ setLoweringConfigForComputeOps(mlir::FunctionOpInterface entryPointFn,
   // loads and stores will have a performance impact.
   auto resultTypes = rootOperation->getResultTypes();
   if (commonVecTileSizes.size() != 0 && !resultTypes.empty()) {
-    auto resultType = cast<ShapedType>(rootOperation->getResultTypes().front())
-                          .getElementType();
-    if (resultType.isIntOrFloat()) {
-      auto elementTypeSize = resultType.getIntOrFloatBitWidth();
-
-      // for now just enable for i1
-      if (elementTypeSize == 1) {
-        auto innermostTileSize = commonVecTileSizes.back();
-        commonVecTileSizes.back() =
-            llvm::alignTo(innermostTileSize * elementTypeSize, 8) /
-            elementTypeSize;
-      }
+    auto elementTypeSize =
+        cast<ShapedType>(rootOperation->getResultTypes().front())
+            .getElementType()
+            .getIntOrFloatBitWidth();
+    // for now just enable for i1
+    if (elementTypeSize == 1) {
+      auto innermostTileSize = commonVecTileSizes.back();
+      commonVecTileSizes.back() =
+          llvm::alignTo(innermostTileSize * elementTypeSize, 8) /
+          elementTypeSize;
     }
   }

From 88dd7af6ff3899d47b4ad5137acf88d63c7dded6 Mon Sep 17 00:00:00 2001
From: giacs-epic <179146510+giacs-epic@users.noreply.github.com>
Date: Mon, 18 Nov 2024 16:17:05 +0000
Subject: [PATCH 3/7] Fix crash due to call to Type::getIntOrFloatBitWidth()
 asserting on complex numbers

Signed-off-by: Ian Wood
---
 .../compiler/Codegen/LLVMCPU/KernelDispatch.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index bd3c1f4614ac..e394844dd5df 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -2922,10 +2922,15 @@ setLoweringConfigForComputeOps(mlir::FunctionOpInterface entryPointFn,
   // loads and stores will have a performance impact.
   auto resultTypes = rootOperation->getResultTypes();
   if (commonVecTileSizes.size() != 0 && !resultTypes.empty()) {
-    auto elementTypeSize =
-        cast<ShapedType>(rootOperation->getResultTypes().front())
-            .getElementType()
-            .getIntOrFloatBitWidth();
+    Type elementType = cast<ShapedType>(rootOperation->getResultTypes().front())
+                           .getElementType();
+    unsigned int elementTypeSize;
+    if (auto complexType = llvm::dyn_cast<ComplexType>(elementType)) {
+      elementTypeSize =
+          2 * complexType.getElementType().getIntOrFloatBitWidth();
+    } else {
+      elementTypeSize = elementType.getIntOrFloatBitWidth();
+    }
     // for now just enable for i1
     if (elementTypeSize == 1) {
       auto innermostTileSize = commonVecTileSizes.back();

From 8d95db220361d8db0888af07e0ae98034c8f36ce Mon Sep 17 00:00:00 2001
From: Ian Wood
Date: Mon, 18 Nov 2024 13:41:12 -0800
Subject: [PATCH 4/7] Add select lowering strat test

Signed-off-by: Ian Wood
---
 .../test/select_x86_64_lowering_strategy.mlir | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
index 9161c810aa23..c2b5d84b6435 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -1983,3 +1983,47 @@ func.func @i1_type() attributes {hal.executable.target = #executable_target_emb
 // CHECK: func @i1_type()
 // CHECK: linalg.generic {
 // CHECK-SAME: {lowering_config = #[[CONFIG]]}
+
+// -----
+
+#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
+#map = affine_map<(d0, d1, d2) -> (d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+func.func @complex_view_as_real() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>>
+  %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor> -> tensor<1xi32>
+  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0], sizes = [1, 1, 32, 50, 2], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1x32x50x2xf32>
+  %6 = tensor.empty() : tensor<32x50x2xf32>
+  %extracted = tensor.extract %4[%c0] : tensor<1xi32>
+  %7 = arith.extsi %extracted : i32 to i64
+  %8 = arith.index_cast %7 : i64 to index
+  %9 = flow.dispatch.tensor.load %1, offsets = [%8, 0], sizes = [1, 50], strides = [1, 1] : !flow.dispatch.tensor>> -> tensor<50xcomplex<f32>>
+  %10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%9 : tensor<50xcomplex<f32>>) outs(%6 : tensor<32x50x2xf32>) {
+  ^bb0(%in: complex<f32>, %out: f32):
+    %11 = linalg.index 0 : index
+    %12 = linalg.index 1 : index
+    %extracted_0 = tensor.extract %5[%c0, %c0, %11, %12, %c0] : tensor<1x1x32x50x2xf32>
+    %extracted_1 = tensor.extract %5[%c0, %c0, %11, %12, %c1] : tensor<1x1x32x50x2xf32>
+    %13 = complex.create %extracted_0, %extracted_1 : complex<f32>
+    %14 = complex.mul %13, %in : complex<f32>
+    %15 = complex.re %14 : complex<f32>
+    %16 = complex.im %14 : complex<f32>
+    %17 = linalg.index 2 : index
+    %18 = arith.cmpi eq, %17, %c0 : index
+    %19 = arith.select %18, %15, %16 : f32
+    linalg.yield %19 : f32
+  } -> tensor<32x50x2xf32>
+  flow.dispatch.tensor.store %10, %3, offsets = [0, 0, 0], sizes = [32, 50, 2], strides = [1, 1, 1] : tensor<32x50x2xf32> -> !flow.dispatch.tensor>
+  return
+}
+
+// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
+// CHECK: func.func @complex_view_as_real()
+// CHECK: linalg.generic
+// CHECK-SAME: lowering_config = #[[CONFIG]]
+

From 965fd0b2fd8a46977c05995b137b8d5362b63293 Mon Sep 17 00:00:00 2001
From: Ian Wood
Date: Mon, 18 Nov 2024 13:53:33 -0800
Subject: [PATCH 5/7] Address comments

Signed-off-by: Ian Wood
---
 compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp | 3 +--
 .../Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index e394844dd5df..77b99a902cc9 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -2922,8 +2922,7 @@ setLoweringConfigForComputeOps(mlir::FunctionOpInterface entryPointFn,
   // loads and stores will have a performance impact.
   auto resultTypes = rootOperation->getResultTypes();
   if (commonVecTileSizes.size() != 0 && !resultTypes.empty()) {
-    Type elementType = cast<ShapedType>(rootOperation->getResultTypes().front())
-                           .getElementType();
+    Type elementType = cast<ShapedType>(resultTypes[0]).getElementType();
     unsigned int elementTypeSize;
     if (auto complexType = llvm::dyn_cast<ComplexType>(elementType)) {
       elementTypeSize =
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
index c2b5d84b6435..cfaed32bd62e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -2026,4 +2026,3 @@ func.func @complex_view_as_real() attributes {hal.executable.target = #executabl
 // CHECK: func.func @complex_view_as_real()
 // CHECK: linalg.generic
 // CHECK-SAME: lowering_config = #[[CONFIG]]
-

From d59dbafcdae24d50f985454eb2df42a01aecc83a Mon Sep 17 00:00:00 2001
From: Ian Wood
Date: Mon, 18 Nov 2024 14:25:46 -0800
Subject: [PATCH 6/7] Change ordinal num

Signed-off-by: Ian Wood
---
 .../Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
index cfaed32bd62e..cb1b9effc8a7 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -1995,7 +1995,7 @@ func.func @complex_view_as_real() attributes {hal.executable.target = #executabl
   %0 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
   %1 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>>
   %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-  %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
   %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor> -> tensor<1xi32>
   %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0], sizes = [1, 1, 32, 50, 2], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1x32x50x2xf32>
   %6 = tensor.empty() : tensor<32x50x2xf32>

From 0e0610f129416fedabddf9761fa7588d55d11c8a Mon Sep 17 00:00:00 2001
From: Ian Wood
Date: Tue, 19 Nov 2024 03:13:46 -0800
Subject: [PATCH 7/7] Hoist pipeline layout and fix ordinal

Signed-off-by: Ian Wood
---
 .../test/select_x86_64_lowering_strategy.mlir | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
index cb1b9effc8a7..22a288062bc2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_x86_64_lowering_strategy.mlir
@@ -1985,6 +1985,12 @@ func.func @i1_type() attributes {hal.executable.target = #executable_target_emb
 // CHECK-SAME: {lowering_config = #[[CONFIG]]}

 // -----

+#pipeline_layout = #hal.pipeline.layout,
+  #hal.pipeline.binding,
+  #hal.pipeline.binding,
+  #hal.pipeline.binding
+]>
 #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu_features = "+avx512f", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "x86_64-unknown-linux-gnu"}>
 #map = affine_map<(d0, d1, d2) -> (d1)>
@@ -1992,10 +1998,10 @@ func.func @complex_view_as_real() attributes {hal.executable.target = #executabl
 func.func @complex_view_as_real() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64_} {
   %c1 = arith.constant 1 : index
   %c0 = arith.constant 0 : index
-  %0 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-  %1 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>>
-  %2 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
-  %3 = hal.interface.binding.subspan layout(, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>
+  %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) : !flow.dispatch.tensor>
   %4 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor> -> tensor<1xi32>
   %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0], sizes = [1, 1, 32, 50, 2], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x1x32x50x2xf32>
   %6 = tensor.empty() : tensor<32x50x2xf32>