fuse generics from gather

iree-org · IanWood1 · May 15, 2024 · May 10, 2024 · May 13, 2024 · May 13, 2024
commit 7908f4ff4d873fe0ca38b865af2294468b334749
@@ -11,11 +11,21 @@
 //===----------------------------------------------------------------------===//
 
 #include "iree/compiler/Dialect/Flow/Transforms/Passes.h"
+#include "iree/compiler/Dialect/Flow/Transforms/RegionOpUtils.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Tensor/Transforms/Transforms.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 namespace mlir::iree_compiler::IREE::Flow {
@@ -131,14 +141,63 @@ struct FoldSuccessiveTensorInsertSliceOps
   }
 };
 
+struct GatherFusionPattern : public OpRewritePattern<linalg::YieldOp> {
+  using OpRewritePattern<linalg::YieldOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(linalg::YieldOp yieldOp,
+                                PatternRewriter &rewriter) const override {
+    // Specific case. The linalg generic implementation of "gather" cannot be
+    // fused because there is no producer-consumer relationship between the two
+    // generics: the indexing is not affine (index values come from a tensor).
+    if (yieldOp->getNumOperands() != 1) {
+      return failure();
+    }
+    // The extract's element type changes below, so only the yield may use it.
+    auto extractOp = yieldOp->getOperand(0).getDefiningOp<tensor::ExtractOp>();
+    if (!extractOp || !extractOp->hasOneUse()) {
+      return failure();
+    }
+    // The producer is erased below, so the extract must be its only user.
+    auto definingOp = extractOp.getTensor().getDefiningOp<linalg::GenericOp>();
+    if (!definingOp || !definingOp->hasOneUse()) {
+      return rewriter.notifyMatchFailure(
+          yieldOp, "expected extract operand to be a single-use generic op");
+    }
+    // One input, one init, identity maps: the same indices address the input.
+    if (definingOp.getNumDpsInputs() != 1 || definingOp.getNumDpsInits() != 1 ||
+        !llvm::all_of(definingOp.getIndexingMapsArray(),
+                      [](AffineMap map) { return map.isIdentity(); })) {
+      return rewriter.notifyMatchFailure(
+          yieldOp, "expected an identity-mapped single-input generic op");
+    }
+    // generic body should contain only an arith.extf and a linalg.yield
+    auto &ops = definingOp->getRegion(0).front().getOperations();
+    if (ops.size() != 2 || !isa<arith::ExtFOp>(ops.front()) ||
+        !isa<linalg::YieldOp>(ops.back())) {
+      return rewriter.notifyMatchFailure(yieldOp,
+                                         "expected generic op to have 2 ops");
+    }
+    // Inline the producer body before the root yield; extract from its input.
+    rewriter.inlineBlockBefore(
+        &definingOp.getRegion().front(), yieldOp,
+        {extractOp->getResult(0), yieldOp->getOperand(0)});
+    rewriter.setInsertionPoint(extractOp);
+    auto newExtractOp = rewriter.create<tensor::ExtractOp>(
+        extractOp->getLoc(), definingOp->getOperand(0), extractOp.getIndices());
+    rewriter.replaceOp(extractOp, newExtractOp);
+    rewriter.eraseOp(yieldOp);
+    rewriter.eraseOp(definingOp);
+    return success();
+  }
+};
+
 struct FusionPreprocessingPass
     : public IREE::Flow::impl::FusionPreprocessingPassBase<
           FusionPreprocessingPass> {
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
-    patterns
-        .add<FoldSuccessiveTensorInsertSliceOps, GenericOpInterchangePattern>(
-            &getContext());
+    patterns.add<FoldSuccessiveTensorInsertSliceOps,
+                 GenericOpInterchangePattern, GatherFusionPattern>(
+        &getContext());
 
     // Fold away `tensor.dim` operations that can be resolved in terms of its
     // operand shapes.

@@ -54,3 +54,42 @@ util.func public @fold_insert_slices(%source : tensor<?x?xf32>,
 //      CHECK:   %[[RETURN:.+]] = tensor.insert_slice %[[SOURCE]] into %[[FILL]]
 // CHECK-SAME:       [%[[NEW_OFFSET0]], %[[NEW_OFFSET1]]] [%[[SIZE0]], %[[SIZE1]]]
 //      CHECK:   util.return %[[RETURN]]
+
+
+// -----
+
+util.func public @fuse_generic_gather(
+    %source : tensor<128256x4096xf16>, %indices : tensor<4x?xi64>,
+    %gather_init : tensor<4x?x4096xf32>, %ext_init : tensor<128256x4096xf32>)
+    -> tensor<4x?x4096xf32> {
+
+  %extended = linalg.generic {  // producer: elementwise f16 -> f32 extension
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                     affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]}
+    ins(%source : tensor<128256x4096xf16>)
+    outs(%ext_init : tensor<128256x4096xf32>) {
+      ^bb0(%in: f16, %out: f32):
+        %wide = arith.extf %in : f16 to f32
+        linalg.yield %wide : f32
+    } -> tensor<128256x4096xf32>
+  %gathered = linalg.generic {  // consumer: gather rows selected by %indices
+    indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1)>,
+                     affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
+    iterator_types = ["parallel", "parallel", "parallel"]}
+    ins(%indices : tensor<4x?xi64>)
+    outs(%gather_init : tensor<4x?x4096xf32>) {
+      ^bb0(%in: i64, %out: f32):
+        %row = arith.index_cast %in : i64 to index
+        %col = linalg.index 2 : index
+        %extracted = tensor.extract %extended[%row, %col] : tensor<128256x4096xf32>
+        linalg.yield %extracted : f32
+    } -> tensor<4x?x4096xf32>
+  util.return %gathered : tensor<4x?x4096xf32>
+}
+
+// CHECK:         %[[INDEX0:[a-zA-Z0-9]+]] = arith.index_cast %in : i64 to index
+// CHECK:         %[[INDEX1:[a-zA-Z0-9]+]] = linalg.index 2 : index
+// CHECK-NEXT:    %[[EXTRACTED:.*]] = tensor.extract %[[TENSOR0:.+]][%[[INDEX0]], %[[INDEX1]]] : tensor<128256x4096xf16>
+// CHECK-NEXT:    %[[RES:[a-zA-Z0-9]+]] = arith.extf %[[EXTRACTED]] : f16 to f32
+// CHECK-NEXT:    linalg.yield %[[RES]] : f32