diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUNestedLayoutDistributionPatterns.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUNestedLayoutDistributionPatterns.cpp
index 075d18193a83..282b65196428 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUNestedLayoutDistributionPatterns.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUNestedLayoutDistributionPatterns.cpp
@@ -32,42 +32,6 @@ namespace mlir::iree_compiler {
 using namespace mlir::iree_compiler::IREE::VectorExt;
 using VectorValue = TypedValue<VectorType>;
 
-/// Helper to linearize the given |ids| with maximum values given as |sizes|.
-/// Gets the element ID in terms of |elementCount| and adds the element
-/// |offset|. For example,
-///
-///   IDs = [d0, d1, d2, d3]
-///   sizes = [s0, s1, s2, s3]
-///   linear_index = d0 * (s1 * s2 * s3)
-///                + d1 * (s2 * s3)
-///                + d2 * (s3)
-///                + d3
-///   return element_index = linear_index * |elementCount| + |offset|;
-static Value linearizeIndex(OpBuilder &builder, Value offset,
-                            ArrayRef<OpFoldResult> ids, ArrayRef<int64_t> sizes,
-                            int64_t elementCount) {
-  SmallVector<AffineExpr> exprs(ids.size() + 1);
-  bindSymbolsList(builder.getContext(), MutableArrayRef{exprs});
-  AffineExpr idExpr = builder.getAffineConstantExpr(0);
-
-  for (int i = 0, e = ids.size(); i < e; ++i) {
-    if (sizes[i] > 1) {
-      // Multiply by the residual threads along this dimension (which must be
-      // faster changing than all previous dimensions) and add the id for this
-      // dimension.
-      idExpr = idExpr * builder.getAffineConstantExpr(sizes[i]) + exprs[i];
-    }
-  }
-  idExpr = idExpr * builder.getAffineConstantExpr(elementCount);
-  idExpr = idExpr + exprs.back();
-  SmallVector<OpFoldResult> mapArgs(ids);
-  mapArgs.push_back(offset);
-  return affine::makeComposedAffineApply(
-             builder, offset.getLoc(),
-             AffineMap::get(0, mapArgs.size(), idExpr), mapArgs)
-      .getResult();
-}
-
 /// Given a set of base transfer |indices|, |offsets| for the batch/outer
 /// dimensions, and distributed warp and thread indices, computes the indices
 /// of the distributed transfer operation based on the |vectorLayout|.
@@ -94,16 +58,28 @@ static SmallVector<Value> getTransferIndicesFromNestedLayout(
       continue;
     }
     unsigned pos = cast<AffineDimExpr>(dim).getPosition();
-    SmallVector<OpFoldResult> ids = {
-        warpIndices[i], b.getIndexAttr(batchOffsets[i]),
-        b.getIndexAttr(outerVectorOffsets[i]), threadIndices[i]};
+    Value offset = indices[pos];
+    int64_t elementCount = vectorLayout.getElementTile()[i];
+    Location loc = offset.getLoc();
+    SmallVector<Value> ids = {
+        warpIndices[i], b.create<arith::ConstantIndexOp>(loc, batchOffsets[i]),
+        b.create<arith::ConstantIndexOp>(loc, outerVectorOffsets[i]),
+        threadIndices[i], offset};
     // The order in which a vector dimension is "tiled" is
     // subgroups -> batches -> outer vectors -> threads -> elements
     SmallVector<int64_t> sizes = {
         vectorLayout.getSubgroupTile()[i], vectorLayout.getBatchTile()[i],
-        vectorLayout.getOuterTile()[i], vectorLayout.getThreadTile()[i]};
-    slicedIndices[pos] = linearizeIndex(b, indices[pos], ids, sizes,
-                                        vectorLayout.getElementTile()[i]);
+        vectorLayout.getOuterTile()[i], vectorLayout.getThreadTile()[i],
+        elementCount};
+    // The offset is often not an offset within `elementCount`, so, in general,
+    // we can't mark this `disjoint`. However, if `offset` is known to be
+    // a constant less than `elementCount`, we can do this, unlocking
+    // potential optimizations.
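+    // For example, a constant base index of 0 is always below `elementCount`
+    // and allows the `disjoint` marking, whereas a dynamic base index cannot
+    // be proven in-bounds and leaves `disjoint` false.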
+    bool disjoint = false;
+    if (std::optional<int64_t> offsetConst = getConstantIntValue(offset))
+      disjoint = *offsetConst < elementCount;
+    slicedIndices[pos] =
+        b.create<affine::AffineLinearizeIndexOp>(loc, ids, sizes, disjoint);
   }
   return slicedIndices;
 }
 
@@ -123,19 +99,21 @@ getElementVectorTileShape(NestedLayoutAttr vectorLayout) {
 
 /// Computes the warp and thread indices for the given vector layout from a
 /// single linearized thread ID.
-static void populateWarpAndThreadIndices(RewriterBase &rewriter, Value threadId,
-                                         int64_t subgroupSize,
-                                         NestedLayoutAttr vectorLayout,
-                                         SmallVector<Value> &warpIndices,
-                                         SmallVector<Value> &threadIndices) {
+static LogicalResult populateWarpAndThreadIndices(
+    RewriterBase &rewriter, Value threadId, int64_t subgroupSize,
+    NestedLayoutAttr vectorLayout, SmallVector<Value> &warpIndices,
+    SmallVector<Value> &threadIndices) {
   // The delinearized thread IDs are returned from outer most to inner most,
   // i.e. before applying the layout described dimensions ordering.
   int64_t rank = vectorLayout.getRank();
   SmallVector<Value> threadIds =
       vectorLayout.computeThreadIds(threadId, subgroupSize, rewriter);
+  if (threadIds.empty() && rank != 0)
+    return failure();
   warpIndices = SmallVector<Value>(threadIds.begin(), threadIds.begin() + rank);
   threadIndices = SmallVector<Value>(threadIds.begin() + rank,
                                      threadIds.begin() + 2 * rank);
+  return success();
 }
 
 namespace {
@@ -189,8 +167,12 @@ struct DistributeTransferRead final
     VectorValue acc = cast<VectorValue>(zero);
 
     SmallVector<Value> warpIndices, threadIndices;
-    populateWarpAndThreadIndices(rewriter, threadId, subgroupSize, vectorLayout,
-                                 warpIndices, threadIndices);
+    if (failed(populateWarpAndThreadIndices(rewriter, threadId, subgroupSize,
+                                            vectorLayout, warpIndices,
+                                            threadIndices))) {
+      return rewriter.notifyMatchFailure(
+          readOp, "warp or thread tiles have overlapping strides");
+    }
 
     ValueRange indices = readOp.getIndices();
     SmallVector<int64_t> strides(rank, 1);
@@ -259,8 +241,12 @@ struct DistributeTransferWrite final
     int64_t rank = vectorLayout.getRank();
 
     SmallVector<Value> warpIndices, threadIndices;
-    populateWarpAndThreadIndices(rewriter, threadId, subgroupSize, vectorLayout,
-                                 warpIndices, threadIndices);
+    if (failed(populateWarpAndThreadIndices(rewriter, threadId, subgroupSize,
+                                            vectorLayout, warpIndices,
+                                            threadIndices))) {
+      return rewriter.notifyMatchFailure(
+          writeOp, "warp or thread tiles have overlapping strides");
+    }
 
     Value distributedVector =
         getDistributed(rewriter, writeOp.getVector(), vectorLayout);
@@ -1282,8 +1268,12 @@ struct DistributeStep final : OpDistributionPattern<vector::StepOp> {
           stepOp, "missing nested layout for step op result");
     }
     SmallVector<Value> subgroupIndices, threadIndices;
-    populateWarpAndThreadIndices(rewriter, threadId, subgroupSize, resultLayout,
-                                 subgroupIndices, threadIndices);
+    if (failed(populateWarpAndThreadIndices(rewriter, threadId, subgroupSize,
+                                            resultLayout, subgroupIndices,
+                                            threadIndices))) {
+      return rewriter.notifyMatchFailure(
+          stepOp, "warp or thread tiles have overlapping strides");
+    }
 
     SmallVector<int64_t> undistributedShape =
         resultLayout.getUndistributedPackedShape();
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir
index 3e9f15513df2..e85153675aff 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir
@@ -11,9 +11,6 @@
   thread_strides = [8, 1]
 >
 
-// CHECK: #[[$MAP:.+]] =
affine_map<()[s0] -> ((s0 floordiv 8) * 4 - ((s0 floordiv 8) floordiv 4) * 16)> -// CHECK: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 mod 8)> -// CHECK: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 mod 8 + 8)> // CHECK-LABEL: @distribute_transfer_read_col_major func.func @distribute_transfer_read_col_major(%arg0: memref<32x32xf16>) -> vector<16x16xf16> { %c0 = arith.constant 0 : index @@ -33,12 +30,12 @@ builtin.module attributes { transform.with_named_sequence } { } // CHECK: %[[IDX:.+]] = gpu.thread_id x -// CHECK: %[[X:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]] -// CHECK: %[[Y:.+]] = affine.apply #[[$MAP1]]()[%[[IDX]]] -// CHECK: %[[RD00:.+]] = vector.transfer_read %arg0[%[[X]], %[[Y]]], {{.*}} : memref<32x32xf16>, vector<4x1xf16> +// CHECK: %[[YX:.+]]:3 = affine.delinearize_index %[[IDX]] into (4, 8) +// CHECK: %[[Y_SCALED:.+]] = affine.linearize_index disjoint [%[[YX]]#1, %c0] by (4, 4) +// CHECK: %[[RD00:.+]] = vector.transfer_read %arg0[%[[Y_SCALED]], %[[YX]]#2], {{.*}} : memref<32x32xf16>, vector<4x1xf16> // CHECK: vector.insert_strided_slice %[[RD00]], %{{.*}} {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<1x2x1x1x4x1xf16> -// CHECK: %[[X_PLUS_BATCH:.+]] = affine.apply #[[$MAP2]]()[%[[IDX]]] -// CHECK: vector.transfer_read %arg0[%[[X]], %[[X_PLUS_BATCH]]], %{{.*}} {in_bounds = [true, true]} : memref<32x32xf16>, vector<4x1xf16> +// CHECK: %[[X_PLUS_BATCH:.+]] = affine.linearize_index disjoint [%c1, %[[YX]]#2] by (2, 8) +// CHECK: vector.transfer_read %arg0[%[[Y_SCALED]], %[[X_PLUS_BATCH]]], %{{.*}} {in_bounds = [true, true]} : memref<32x32xf16>, vector<4x1xf16> // CHECK: vector.insert_strided_slice {{.*}} {offsets = [0, 1, 0, 0, 0, 0] // CHECK: iree_vector_ext.to_simd %{{.*}} : vector<1x2x1x1x4x1xf16> -> vector<16x16xf16> @@ -55,10 +52,6 @@ builtin.module attributes { transform.with_named_sequence } { thread_strides = [1, 1] > -// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 - (s1 floordiv 8) * 8)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 + 8)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0, s1] -> (s0 + s1 - (s1 floordiv 8) * 8 + 8)> - func.func @distribute_transfer_read_row_major_with_nontrivial_index(%a: index, %b: index, %arg0: memref<32x32x32x32xf16>) -> vector<16x16xf16> { %c0 = arith.constant 0 : index %cst = arith.constant 0.0 : f16 @@ -81,11 +74,12 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK-SAME: %[[I0:.+]]: index, %[[I1:.+]]: index // CHECK: %[[IDX:.+]] = gpu.thread_id x -// CHECK: %[[OFF0:.+]] = affine.apply #[[$MAP]]()[%[[I0]], %[[IDX]]] +// CHECK: %[[X:.+]]:2 = affine.delinearize_index %[[IDX]] into (8) : index, index +// CHECK: %[[OFF0:.+]] = affine.linearize_index [%[[X]]#1, %[[I0]]] by (8, 1) // CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[OFF0]], %[[I1]]] -// CHECK: %[[OFF1:.+]] = affine.apply #[[$MAP1]]()[%[[I1]]] +// CHECK: %[[OFF1:.+]] = affine.linearize_index [%c1, %[[I1]]] by (2, 8) // CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[OFF0]], %[[OFF1]]] -// CHECK: %[[OFF2:.+]] = affine.apply #[[$MAP2]]()[%[[I0]], %[[IDX]]] +// CHECK: %[[OFF2:.+]] = affine.linearize_index [%c1, %[[X]]#1, %[[I0]]] by (2, 8, 1) // CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[OFF2]], %[[I1]]] // CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[OFF2]], %[[OFF1]]] @@ -143,9 +137,6 @@ builtin.module attributes { transform.with_named_sequence } { thread_strides = [1, 1] > -// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 - (s1 floordiv 8) * 8)> -// CHECK-DAG: #[[$MAP1:.+]] = 
affine_map<()[s0] -> (s0 + 8)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0, s1] -> (s0 + s1 - (s1 floordiv 8) * 8 + 8)> // CHECK-DAG: #[[$PERM:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)> func.func @distribute_transfer_read_row_major_transpose(%a: index, %b: index, %arg0: memref<32x32x32x32xf16>) -> vector<16x16xf16> { @@ -171,11 +162,12 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK-SAME: %[[I0:.+]]: index, %[[I1:.+]]: index // CHECK: %[[IDX:.+]] = gpu.thread_id x -// CHECK: %[[LIN_ID0:.+]] = affine.apply #[[$MAP:.+]]()[%[[I1]], %[[IDX]]] +// CHECK: %[[X:.+]]:2 = affine.delinearize_index %[[IDX]] into (8) : index, index +// CHECK: %[[LIN_ID0:.+]] = affine.linearize_index [%[[X]]#1, %[[I1]]] by (8, 1) // CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[I0]], %[[LIN_ID0]]], {{.*}} permutation_map = #[[$PERM]] -// CHECK: %[[I0_PLUS_8:.+]] = affine.apply #[[$MAP1]]()[%[[I0]]] +// CHECK: %[[I0_PLUS_8:.+]] = affine.linearize_index [%c1, %[[I0]]] by (2, 8) // CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[I0_PLUS_8]], %[[LIN_ID0]]], {{.*}} permutation_map = #[[$PERM]] -// CHECK: %[[LIN_ID1:.+]] = affine.apply #[[$MAP2]]()[%[[I1]], %[[IDX]]] +// CHECK: %[[LIN_ID1:.+]] = affine.linearize_index [%c1, %[[X]]#1, %[[I1]]] by (2, 8, 1) // CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[I0]], %[[LIN_ID1]]], {{.*}} permutation_map = #[[$PERM]] // CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[I0_PLUS_8]], %[[LIN_ID1]]], %cst_0 {in_bounds = [true, true], permutation_map = #[[$PERM]]} : memref<32x32x32x32xf16>, vector<1x8xf16> @@ -268,8 +260,6 @@ builtin.module attributes { transform.with_named_sequence } { thread_strides = [16] > -// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)> - // CHECK-LABEL: @distribute_transfer_read_broadcast func.func @distribute_transfer_read_broadcast(%arg0: memref<32x32xf16>) -> vector<16xf16> { %c0 = arith.constant 0 : index @@ -289,7 +279,8 @@ builtin.module attributes { transform.with_named_sequence } { } // CHECK: %[[IDX:.+]] = gpu.thread_id x -// CHECK: %[[LANEY:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]] +// CHECK: %[[YX:.+]]:3 = affine.delinearize_index %[[IDX]] into (4, 16) +// CHECK: %[[LANEY:.+]] = affine.linearize_index disjoint [%[[YX]]#1, %c0] by (4, 4) // CHECK: %[[RD:.+]] = vector.transfer_read %{{.*}}[%c0, %[[LANEY:.+]]], {{.*}} : memref<32x32xf16>, vector<4xf16> // ----- @@ -305,8 +296,6 @@ builtin.module attributes { transform.with_named_sequence } { thread_strides = [1] > -// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 4 + (s0 floordiv 64) * 64 - ((s0 floordiv 64) floordiv 2) * 128 - (s0 floordiv 16) * 64)> - // CHECK-LABEL: @distribute_transfer_read_broadcast2 func.func @distribute_transfer_read_broadcast2(%arg0: memref<32x128xf16>) -> vector<128xf16> { %c0 = arith.constant 0 : index @@ -326,7 +315,9 @@ builtin.module attributes { transform.with_named_sequence } { } // CHECK: %[[IDX:.+]] = gpu.thread_id x -// CHECK: %[[LANEY:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]] +// CHECK: %[[YX:.+]]:3 = affine.delinearize_index %[[IDX]] into (2, 64) +// CHECK: %[[SUBGROUP:.+]]:2 = affine.delinearize_index %[[IDX]] into (16) +// CHECK: %[[LANEY:.+]] = affine.linearize_index disjoint [%[[YX]]#1, %[[SUBGROUP]]#1, %c0] by (2, 16, 4) // CHECK: %[[RD:.+]] = vector.transfer_read %{{.*}}[%c0, %[[LANEY:.+]]], {{.*}} : memref<32x128xf16>, vector<4xf16> // ----- @@ -378,9 +369,6 @@ builtin.module attributes { transform.with_named_sequence } { thread_strides = [1, 1] > -// 
CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 mod 8)> -// CHECK: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 mod 8 + 8)> - // CHECK-LABEL: @distribute_transfer_write_row_major func.func @distribute_transfer_write_row_major(%root: vector<16x16xf16>, %alloc: memref<64x64xf16>) { %c0 = arith.constant 0 : index @@ -400,12 +388,12 @@ builtin.module attributes { transform.with_named_sequence } { } // CHECK: %[[IDX:.+]] = gpu.thread_id x -// CHECK: %[[LANEX:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]] +// CHECK: %[[LANEX:.+]]:2 = affine.delinearize_index %[[IDX]] into (8) // CHECK: %[[SLICE:.+]] = vector.extract %{{.*}}[0, 0, 0, 0] : vector<1x8xf16> from vector<2x2x1x1x1x8xf16> -// CHECK: vector.transfer_write %[[SLICE]], %{{.*}}[%[[LANEX]], %c0] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16> +// CHECK: vector.transfer_write %[[SLICE]], %{{.*}}[%[[LANEX]]#1, %c0] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16> // CHECK: vector.extract %{{.*}}[0, 1, 0, 0] -// CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[LANEX]], %c8] -// CHECK: %[[LANEX_PLUS_VECDIMX:.+]] = affine.apply #[[$MAP1]]()[%[[IDX]]] +// CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[LANEX]]#1, %c8] +// CHECK: %[[LANEX_PLUS_VECDIMX:.+]] = affine.linearize_index disjoint [%c1, %[[LANEX]]#1] by (2, 8) // CHECK: vector.extract %{{.*}}[1, 0, 0, 0] // CHECK: vector.transfer_write %{{.*}}[%[[LANEX_PLUS_VECDIMX]], %c0] // CHECK: vector.extract %{{.*}}[1, 1, 0, 0] @@ -424,10 +412,6 @@ builtin.module attributes { transform.with_named_sequence } { thread_strides = [8, 1] > -// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> ((s0 floordiv 8) * 4 - ((s0 floordiv 8) floordiv 4) * 16)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 mod 8)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 mod 8 + 8)> - // CHECK-LABEL: @distribute_transfer_write_col_major func.func @distribute_transfer_write_col_major(%root: vector<16x16xf16>, %alloc: memref<64x64xf16>) { %c0 = arith.constant 0 : index @@ -447,11 +431,11 @@ builtin.module attributes { transform.with_named_sequence } { } // CHECK: %[[IDX:.+]] = gpu.thread_id x -// CHECK: %[[LANEY:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]] -// CHECK: %[[LANEY2:.+]] = affine.apply #[[$MAP1]]()[%[[IDX]]] +// CHECK: %[[YX:.+]]:3 = affine.delinearize_index %[[IDX]] into (4, 8) +// CHECK: %[[LANEY:.+]] = affine.linearize_index disjoint [%[[YX]]#1, %c0] by (4, 4) // CHECK: vector.extract %{{.*}}[0, 0, 0, 0] -// CHECK: vector.transfer_write %{{.*}}[%[[LANEY]], %[[LANEY2]]] -// CHECK: %[[LANEX:.+]] = affine.apply #[[$MAP2]]()[%[[IDX]]] +// CHECK: vector.transfer_write %{{.*}}[%[[LANEY]], %[[YX]]#2] +// CHECK: %[[LANEX:.+]] = affine.linearize_index disjoint [%c1, %[[YX]]#2] by (2, 8) // CHECK: vector.extract %{{.*}}[0, 1, 0, 0] // CHECK: vector.transfer_write {{.*}}[%[[LANEY]], %[[LANEX]]] @@ -468,10 +452,7 @@ builtin.module attributes { transform.with_named_sequence } { thread_strides = [1, 1] > -// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 - (s1 floordiv 8) * 8)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 + 8)> -// CHECK-DAG: #[[$MAP3:.+]] = affine_map<()[s0, s1] -> (s0 + s1 - (s1 floordiv 8) * 8 + 8)> +// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)> func.func @distribute_transfer_write_row_major_with_nontrivial_index(%root: vector<16x16xf16>, %a: index, %b: index, %alloc: memref<32x32x32x32xf16>) { %c0 = arith.constant 0 : index @@ -495,17 +476,18 @@ 
builtin.module attributes { transform.with_named_sequence } { // CHECK-SAME: vector<16x16xf16>, %[[I0:.+]]: index, %[[I1:.+]]: index // CHECK: %[[IDX:.+]] = gpu.thread_id x -// CHECK: %[[LIN_ID0:.+]] = affine.apply #[[$MAP]]()[%[[I1]], %[[IDX]]] +// CHECK: %[[LANE:.+]]:2 = affine.delinearize_index %[[IDX]] into (8) +// CHECK: %[[LIN_ID0:.+]] = affine.linearize_index [%[[LANE]]#1, %[[I1]]] by (8, 1) // CHECK: vector.extract %{{.*}}[0, 0, 0, 0] -// CHECK: vector.transfer_write %{{.*}}[%c0, %c0, %[[I0]], %[[LIN_ID0]]] {{.*}} permutation_map = #[[$MAP1]] -// CHECK: %[[LIN_ID1:.+]] = affine.apply #[[$MAP2]]()[%[[I0]]] +// CHECK: vector.transfer_write %{{.*}}[%c0, %c0, %[[I0]], %[[LIN_ID0]]] {{.*}} permutation_map = #[[$MAP]] +// CHECK: %[[LIN_ID1:.+]] = affine.linearize_index [%c1, %[[I0]]] by (2, 8) // CHECK: vector.extract %{{.*}}[0, 1, 0, 0] -// CHECK: vector.transfer_write %{{.*}}[%c0, %c0, %[[LIN_ID1]], %[[LIN_ID0]]] {{.*}} permutation_map = #[[$MAP1]] -// CHECK: %[[LIN_ID2:.+]] = affine.apply #[[$MAP3]]()[%[[I1]], %[[IDX]]] +// CHECK: vector.transfer_write %{{.*}}[%c0, %c0, %[[LIN_ID1]], %[[LIN_ID0]]] {{.*}} permutation_map = #[[$MAP]] +// CHECK: %[[LIN_ID2:.+]] = affine.linearize_index [%c1, %[[LANE]]#1, %[[I1]]] // CHECK: vector.extract %{{.*}}[1, 0, 0, 0] -// CHECK: vector.transfer_write %{{.*}}[%c0, %c0, %[[I0]], %[[LIN_ID2]]] {{.*}} permutation_map = #[[$MAP1]] +// CHECK: vector.transfer_write %{{.*}}[%c0, %c0, %[[I0]], %[[LIN_ID2]]] {{.*}} permutation_map = #[[$MAP]] // CHECK: vector.extract %{{.*}}[1, 1, 0, 0] -// CHECK: vector.transfer_write %{{.*}}[%c0, %c0, %[[LIN_ID1]], %[[LIN_ID2]]] {{.*}} permutation_map = #[[$MAP1]] +// CHECK: vector.transfer_write %{{.*}}[%c0, %c0, %[[LIN_ID1]], %[[LIN_ID2]]] {{.*}} permutation_map = #[[$MAP]] // ----- @@ -594,13 +576,6 @@ builtin.module attributes { transform.with_named_sequence } { thread_strides = [32, 1] > -// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 4) * 128 - (s0 floordiv 32) * 32)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> ((s0 floordiv 32) * 4 - ((s0 floordiv 32) floordiv 2) * 8)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 32) * 32)> -// CHECK-DAG: #[[$MAP3:.+]] = affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 4) * 128 + (s0 floordiv 32) * 4 - ((s0 floordiv 32) floordiv 2) * 8)> -// CHECK-DAG: #[[$MAP4:.+]] = affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 4) * 128 + (s0 floordiv 32) * 4 - ((s0 floordiv 32) floordiv 2) * 8 + 8)> -// CHECK-DAG: #[[$MAP5:.+]] = affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 4) * 128 + (s0 floordiv 32) * 4 - ((s0 floordiv 32) floordiv 2) * 8 + 16)> -// CHECK-DAG: #[[$MAP6:.+]] = affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 4) * 128 + (s0 floordiv 32) * 4 - ((s0 floordiv 32) floordiv 2) * 8 + 24)> // CHECK-LABEL: @mfma_64x128x8_read func.func @mfma_64x128x8_read(%mem: memref<128x8xf16>, %mem1: memref<8x64xf16>, @@ -611,23 +586,27 @@ func.func @mfma_64x128x8_read(%mem: memref<128x8xf16>, %cst = arith.constant 0.0 : f16 // CHECK: %[[IDX:.+]] = gpu.thread_id x - - // CHECK-DAG: %[[LHSM:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]] + // CHECK-DAG: %[[WG:.+]]:4 = affine.delinearize_index %[[IDX]] into (4, 2, 64) + // CHECK-DAG: %[[LANE:.+]]:3 = affine.delinearize_index %[[IDX]] into (2, 32) + // This doesn't canonicalize away currently, but 
could be equivalent to %WG + // CHECK-DAG: %[[WG_N:.+]]:3 = affine.delinearize_index %[[IDX]] into (2, 64) + // CHECK-DAG: %[[LHSM:.+]] = affine.linearize_index disjoint [%[[WG]]#1, %[[LANE]]#2] // LHSK = RHSK - // CHECK-DAG: %[[LHSK:.+]] = affine.apply #[[$MAP1]]()[%[[IDX]]] + // CHECK-DAG: %[[LHSK:.+]] = affine.linearize_index disjoint [%[[LANE]]#1, %c0] by (2, 4) // ACCN = RHSN - // CHECK-DAG: %[[RHSN:.+]] = affine.apply #[[$MAP2]]()[%[[IDX]]] + // CHECK-DAG: %[[RHSN_DUP_WG:.+]] = affine.linearize_index disjoint [%[[WG_N]]#1, %[[LANE]]#2] by (2, 32) + // CHECK-DAG: %[[RHSN:.+]] = affine.linearize_index disjoint [%[[WG]]#2, %[[LANE]]#2] by (2, 32) // M is unrolled 4 times. - // CHECK-DAG: %[[ACCM0:.+]] = affine.apply #[[$MAP3]]()[%[[IDX]]] - // CHECK-DAG: %[[ACCM1:.+]] = affine.apply #[[$MAP4]]()[%[[IDX]]] - // CHECK-DAG: %[[ACCM2:.+]] = affine.apply #[[$MAP5]]()[%[[IDX]]] - // CHECK-DAG: %[[ACCM3:.+]] = affine.apply #[[$MAP6]]()[%[[IDX]]] + // CHECK-DAG: %[[ACCM0:.+]] = affine.linearize_index disjoint [%[[WG]]#1, %c0, %[[LANE]]#1, %c0] by (4, 4, 2, 4) + // CHECK-DAG: %[[ACCM1:.+]] = affine.linearize_index disjoint [%[[WG]]#1, %c1, %[[LANE]]#1, %c0] by (4, 4, 2, 4) + // CHECK-DAG: %[[ACCM2:.+]] = affine.linearize_index disjoint [%[[WG]]#1, %c2, %[[LANE]]#1, %c0] by (4, 4, 2, 4) + // CHECK-DAG: %[[ACCM3:.+]] = affine.linearize_index disjoint [%[[WG]]#1, %c3, %[[LANE]]#1, %c0] by (4, 4, 2, 4) // M, K // CHECK-DAG: transfer_read %{{.*}}[%[[LHSM]], %[[LHSK]]] // K, N - // CHECK-DAG: transfer_read %{{.*}}[%[[LHSK]], %[[RHSN]]] + // CHECK-DAG: transfer_read %{{.*}}[%[[LHSK]], %[[RHSN_DUP_WG]]] // M, N // CHECK-DAG: transfer_read %{{.*}}[%[[ACCM0]], %[[RHSN]]] // CHECK-DAG: transfer_read %{{.*}}[%[[ACCM1]], %[[RHSN]]] @@ -694,14 +673,13 @@ builtin.module attributes { transform.with_named_sequence } { } } -// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 32) * 32)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> ((s0 floordiv 32) * 4 - ((s0 floordiv 32) floordiv 2) * 8)> - // CHECK-LABEL: @transposed_read_64x8 // CHECK: %[[IDX:.+]] = gpu.thread_id x -// CHECK-DAG: %[[M:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]] -// CHECK-DAG: %[[N:.+]] = affine.apply #[[$MAP1]]()[%[[IDX]]] +// CHECK-DAG: %[[WG:.+]]:4 = affine.delinearize_index %[[IDX]] into (2, 2, 64) +// CHECK-DAG: %[[LANE:.+]]:3 = affine.delinearize_index %[[IDX]] into (2, 32) +// CHECK-DAG: %[[M:.+]] = affine.linearize_index disjoint [%[[WG]]#1, %[[LANE]]#2] by (2, 32) +// CHECK-DAG: %[[N:.+]] = affine.linearize_index disjoint [%[[LANE]]#1, %c0] by (2, 4) // CHECK: transfer_read %{{.*}}[%[[N]], %[[M]] // ----- @@ -995,28 +973,21 @@ builtin.module attributes { transform.with_named_sequence } { } } -// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> ((s0 floordiv 64) * 16 - ((s0 floordiv 64) floordiv 2) * 32 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> ((s0 floordiv 2) mod 8)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 2) * 4)> -// CHECK-DAG: #[[$MAP3:.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 2) * 4 + 4)> -// CHECK-DAG: #[[$MAP4:.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 2) * 4 + 8)> -// CHECK-DAG: #[[$MAP5:.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 2) * 4 + 12)> -// CHECK-DAG: #[[$MAP6:.+]] = affine_map<()[s0] -> ((s0 floordiv 2) mod 8 + 8)> - // CHECK-LABEL: func @transpose_3d // CHECK-DAG: %[[IDX:.+]] = gpu.thread_id x - -// 
CHECK-DAG: %[[DIM:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]] -// CHECK-DAG: %[[DIM1:.+]] = affine.apply #[[$MAP1]]()[%[[IDX]]] -// CHECK-DAG: %[[DIM2:.+]] = affine.apply #[[$MAP2]]()[%[[IDX]]] -// CHECK-DAG: %[[DIM3:.+]] = affine.apply #[[$MAP3]]()[%[[IDX]]] -// CHECK-DAG: %[[DIM4:.+]] = affine.apply #[[$MAP4]]()[%[[IDX]]] -// CHECK-DAG: %[[DIM5:.+]] = affine.apply #[[$MAP5]]()[%[[IDX]]] -// CHECK-DAG: %[[DIM6:.+]] = affine.apply #[[$MAP6]]()[%[[IDX]]] -// CHECK-DAG: %[[RD0:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM1]], %[[DIM2]]], {{.*}} : memref<32x32x32xf16>, vector<4x1x2xf16> -// CHECK-DAG: %[[RD1:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM1]], %[[DIM3]]] -// CHECK-DAG: %[[RD2:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM1]], %[[DIM4]]] -// CHECK-DAG: %[[RD3:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM1]], %[[DIM5]]] +// CHECK-DAG: %[[WG:.+]]:3 = affine.delinearize_index %[[IDX]] into (2, 64) +// CHECK-DAG: %[[LANE:.+]]:4 = affine.delinearize_index %[[IDX]] into (4, 8, 2) +// CHECK-DAG: %[[DIM:.+]] = affine.linearize_index disjoint [%[[WG]]#1, %[[LANE]]#1, %c0] by (2, 4, 4) +// COM: DIM1 == LANE#2 +// CHECK-DAG: %[[DIM2:.+]] = affine.linearize_index disjoint [%[[LANE]]#3, %c0] by (2, 2) +// CHECK-DAG: %[[DIM3:.+]] = affine.linearize_index disjoint [%c1, %[[LANE]]#3, %c0] by (4, 2, 2) +// CHECK-DAG: %[[DIM4:.+]] = affine.linearize_index disjoint [%c2, %[[LANE]]#3, %c0] by (4, 2, 2) +// CHECK-DAG: %[[DIM5:.+]] = affine.linearize_index disjoint [%c3, %[[LANE]]#3, %c0] by (4, 2, 2) +// CHECK-DAG: %[[DIM6:.+]] = affine.linearize_index disjoint [%c1, %[[LANE]]#2] by (2, 8) +// CHECK-DAG: %[[RD0:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[LANE]]#2, %[[DIM2]]], {{.*}} : memref<32x32x32xf16>, vector<4x1x2xf16> +// CHECK-DAG: %[[RD1:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[LANE]]#2, %[[DIM3]]] +// CHECK-DAG: %[[RD2:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[LANE]]#2, %[[DIM4]]] +// CHECK-DAG: %[[RD3:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[LANE]]#2, %[[DIM5]]] // CHECK-DAG: %[[RD4:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM6]], %[[DIM2]]] // CHECK-DAG: %[[RD5:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM6]], %[[DIM3]]] // CHECK-DAG: %[[RD6:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM6]], %[[DIM4]]] @@ -1024,10 +995,10 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK: vector.transpose %{{.*}}, [1, 2, 0, 4, 5, 3, 7, 8, 6] : vector<1x2x4x1x1x1x4x1x2xf16> to vector<2x4x1x1x1x1x1x2x4xf16> -// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM1]], %[[DIM2]], %[[DIM]]] {{.*}} : vector<1x2x4xf16>, memref<32x32x32xf16> -// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM1]], %[[DIM3]], %[[DIM]]] -// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM1]], %[[DIM4]], %[[DIM]]] -// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM1]], %[[DIM5]], %[[DIM]]] +// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[LANE]]#2, %[[DIM2]], %[[DIM]]] {{.*}} : vector<1x2x4xf16>, memref<32x32x32xf16> +// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[LANE]]#2, %[[DIM3]], %[[DIM]]] +// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[LANE]]#2, %[[DIM4]], %[[DIM]]] +// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[LANE]]#2, %[[DIM5]], %[[DIM]]] // CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM6]], %[[DIM2]], %[[DIM]]] // CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM6]], %[[DIM3]], %[[DIM]]] // CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM6]], %[[DIM4]], %[[DIM]]] diff --git 
a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution_multi_reduce.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution_multi_reduce.mlir index aa3ba7752541..41c108dfdc67 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution_multi_reduce.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution_multi_reduce.mlir @@ -163,15 +163,15 @@ builtin.module attributes { transform.with_named_sequence } { // Subgroup reduction // CHECK-DAG: %[[ALLOC:.+]] = memref.alloc() : memref<32x2xf32, #gpu.address_space> // CHECK: gpu.barrier -// CHECK-DAG: %[[TIDX0:.+]] = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%thread_id_x] -// CHECK-DAG: %[[TIDX1:.+]] = affine.apply affine_map<()[s0] -> (s0 mod 16 + 16)>()[%thread_id_x] -// CHECK-DAG: %[[SGIDX:.+]] = affine.apply affine_map<()[s0] -> ((s0 floordiv 64) mod 2)>()[%thread_id_x] +// CHECK-DAG: %[[SGID:.+]]:3 = affine.delinearize_index %thread_id_x into (2, 64) +// CHECK-DAG: %[[TIDX:.+]]:2 = affine.delinearize_index %thread_id_x into (16) // CHECK-DAG: %[[EXTRACT0:.+]] = vector.extract %[[THREAD_RED4]][0] : vector<1x1xf32> from vector<2x1x1xf32> // CHECK-DAG: %[[EXTRACT1:.+]] = vector.extract %[[THREAD_RED4]][1] : vector<1x1xf32> from vector<2x1x1xf32> -// CHECK-DAG: vector.transfer_write %[[EXTRACT0]], %[[ALLOC]][%[[TIDX0]], %[[SGIDX]]] -// CHECK-DAG: vector.transfer_write %[[EXTRACT1]], %[[ALLOC]][%[[TIDX1]], %[[SGIDX]]] +// CHECK-DAG: %[[TIDX1:.+]] = affine.linearize_index disjoint [%c1, %[[TIDX]]#1] by (2, 16) : index +// CHECK-DAG: vector.transfer_write %[[EXTRACT0]], %[[ALLOC]][%[[TIDX]]#1, %[[SGID]]#1] +// CHECK-DAG: vector.transfer_write %[[EXTRACT1]], %[[ALLOC]][%[[TIDX1]], %[[SGID]]#1] // CHECK: gpu.barrier -// CHECK-DAG: %[[READ0:.+]] = vector.transfer_read %alloc[%[[TIDX0]], %c0], {{.*}} {in_bounds = [true, true]} : memref<32x2xf32, #gpu.address_space>, vector<1x2xf32> +// CHECK-DAG: %[[READ0:.+]] = vector.transfer_read %alloc[%[[TIDX]]#1, %c0], {{.*}} {in_bounds = [true, true]} : memref<32x2xf32, #gpu.address_space>, vector<1x2xf32> // CHECK-DAG: %[[GATHER0:.+]] = vector.insert_strided_slice %[[READ0]], %[[CST]] {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x2xf32> into vector<2x1x1x1x1x2xf32> // CHECK-DAG: %[[READ1:.+]] = vector.transfer_read %alloc[%[[TIDX1]], %c0], %cst_0 {in_bounds = [true, true]} : memref<32x2xf32, #gpu.address_space>, vector<1x2xf32> // CHECK-DAG: %[[GATHER1:.+]] = vector.insert_strided_slice %[[READ1]], %[[GATHER0]] {offsets = [1, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<1x2xf32> into vector<2x1x1x1x1x2xf32> diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution_step.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution_step.mlir index 76c33e4d31e2..6c020733edd9 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution_step.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution_step.mlir @@ -27,8 +27,9 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK-LABEL: func @step_1 // CHECK: %[[CST:.+]] = arith.constant dense<[0, 4, 8, 12]> : vector<4xindex> -// CHECK: %[[TID:.+]] = affine.apply affine_map<()[s0] -> ((s0 floordiv 16) mod 4)>()[%thread_id_x] -// CHECK: %[[TIDB:.+]] = vector.broadcast %[[TID]] : index to vector<4xindex> +// CHECK: 
%[[IDX:.+]] = gpu.thread_id x +// CHECK: %[[TID:.+]]:3 = affine.delinearize_index %[[IDX]] into (4, 16) +// CHECK: %[[TIDB:.+]] = vector.broadcast %[[TID]]#1 : index to vector<4xindex> // CHECK: %[[OFFSET:.+]] = arith.addi %[[TIDB]], %[[CST]] : vector<4xindex> // ----- @@ -60,8 +61,9 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK-LABEL: func @step_2 // CHECK: %[[CST:.+]] = arith.constant dense<[0, 1, 8, 9, 16, 17]> : vector<6xindex> -// CHECK: %[[TID:.+]] = affine.apply affine_map<()[s0] -> ((s0 floordiv 2) mod 4)>()[%thread_id_x] -// CHECK: %[[TID_STRIDE:.+]] = arith.muli %[[TID]], %c2 : index +// CHECK: %[[IDX:.+]] = gpu.thread_id x +// CHECK: %[[TID:.+]]:3 = affine.delinearize_index %[[IDX]] into (4, 2) +// CHECK: %[[TID_STRIDE:.+]] = arith.muli %[[TID]]#1, %c2 : index // CHECK: %[[TID_STRIDEV:.+]] = vector.broadcast %[[TID_STRIDE]] : index to vector<6xindex> // CHECK: %[[OFFSET:.+]] = arith.addi %[[TID_STRIDEV]], %[[CST]] : vector<6xindex> @@ -94,12 +96,13 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK-LABEL: func @step_3 // CHECK: %[[CST:.+]] = arith.constant dense<[0, 1, 8, 9]> : vector<4xindex> -// CHECK: %[[WID:.+]] = affine.apply affine_map<()[s0] -> ((s0 floordiv 512) mod 3)>()[%thread_id_x] -// CHECK: %[[TID:.+]] = affine.apply affine_map<()[s0] -> ((s0 floordiv 2) mod 4)>()[%thread_id_x] -// CHECK: %[[WID_STRIDE:.+]] = arith.muli %[[WID]], %c16 : index +// CHECK: %[[IDX:.+]] = gpu.thread_id x +// CHECK: %[[WID:.+]]:4 = affine.delinearize_index %[[IDX]] into (3, 8, 64) +// CHECK: %[[TID:.+]]:3 = affine.delinearize_index %[[IDX]] into (4, 2) +// CHECK: %[[WID_STRIDE:.+]] = arith.muli %[[WID]]#1, %c16 : index // CHECK: %[[WID_STRIDEV:.+]] = vector.broadcast %[[WID_STRIDE]] : index to vector<4xindex> // CHECK: %[[OFFSET0:.+]] = arith.addi %[[WID_STRIDEV]], %[[CST]] : vector<4xindex> -// CHECK: %[[TID_STRIDE:.+]] = arith.muli %[[TID]], %c2 : index +// CHECK: %[[TID_STRIDE:.+]] = arith.muli %[[TID]]#1, %c2 : index // CHECK: %[[TID_STRIDEV:.+]] = vector.broadcast %[[TID_STRIDE]] : index to vector<4xindex> // CHECK: %[[OFFSET1:.+]] = arith.addi %[[OFFSET0]], %[[TID_STRIDEV]] : vector<4xindex> @@ -132,7 +135,8 @@ builtin.module attributes { transform.with_named_sequence } { // CHECK-LABEL: func @step_4 // CHECK: %[[CST:.+]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex> -// CHECK: %[[TID:.+]] = affine.apply affine_map<()[s0] -> (s0 mod 16)>()[%thread_id_x] -// CHECK: %[[TID_STRIDE:.+]] = arith.muli %[[TID]], %c8 : index +// CHECK: %[[IDX:.+]] = gpu.thread_id x +// CHECK: %[[TID:.+]]:2 = affine.delinearize_index %[[IDX]] into (16) +// CHECK: %[[TID_STRIDE:.+]] = arith.muli %[[TID]]#1, %c8 : index // CHECK: %[[TID_STRIDEV:.+]] = vector.broadcast %[[TID_STRIDE]] : index to vector<8xindex> // CHECK: %[[OFFSET:.+]] = arith.addi %[[TID_STRIDEV]], %[[CST]] : vector<8xindex> diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/BUILD.bazel index bdd29befadd7..4a28bcbb4cf3 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/BUILD.bazel @@ -90,6 +90,7 @@ iree_compiler_cc_library( "//compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR:IREEVectorExtDialect", "//compiler/src/iree/compiler/Codegen/Utils:VectorOpUtils", "//compiler/src/iree/compiler/Dialect/LinalgExt/IR", + "//compiler/src/iree/compiler/Utils", "@llvm-project//llvm:Support", 
"@llvm-project//mlir:AMDGPUDialect", "@llvm-project//mlir:AffineDialect", diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/CMakeLists.txt index adaa901e4dfb..e63aeb33b82d 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/CMakeLists.txt @@ -70,6 +70,7 @@ iree_cc_library( iree::compiler::Codegen::Dialect::VectorExt::IR::IREEVectorExtDialect iree::compiler::Codegen::Utils::VectorOpUtils iree::compiler::Dialect::LinalgExt::IR + iree::compiler::Utils iree::compiler::bindings::c::headers PUBLIC ) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp index 5c0f6a3f9d10..05878facba3c 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp @@ -13,6 +13,7 @@ #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h" #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h" #include "iree/compiler/Codegen/Utils/VectorOpUtils.h" +#include "iree/compiler/Utils/Indexing.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/STLForwardCompat.h" #include "llvm/ADT/SmallVector.h" @@ -476,28 +477,38 @@ static LogicalResult populateCanonicalOffsetsSizesAndStrides( OpFoldResult zero = builder.getIndexAttr(0); OpFoldResult one = builder.getIndexAttr(1); + Value cZero = builder.create(loc, 0); canonicalStrides.append(rankReducedShape.size(), one); + SmallVector vtids; + SmallVector vtidBasis; + SmallVector dimToVtid; + if (failed(basisFromSizesStrides(subgroupLayout.thread, + subgroupLayout.tstrides, vtidBasis, + dimToVtid))) { + return failure(); + } + auto splitLaneId = builder.create( + loc, laneId, vtidBasis, /*hasOuterBound=*/false); + // Each thread grabs `element` contiguous data, so the vtid needs to be // multiplied by `element` to get the next bunch of data. // vtid: virtual thread id // tid: lane id // vtid = ((tid floordiv stride_i) mod size_i) * element_i. - SmallVector vtids; - for (auto [dimSize, dimStride, element] : - llvm::zip_equal(subgroupLayout.thread, subgroupLayout.tstrides, - subgroupLayout.element)) { - if (dimSize == 1) { - vtids.push_back(zero); - continue; + // + // Instead of computing those maps, we use one big `delinearize` expression + // in order to prevent unwanted "simplifications" on affine maps that + // worsen the generated code quality. + for (auto [splitResultIdx, element] : + llvm::zip_equal(dimToVtid, subgroupLayout.element)) { + Value vtid = splitLaneId.getResult(splitResultIdx); + int64_t vtidLen = vtidBasis[splitResultIdx - 1]; + if (element != 1) { + vtid = builder.create( + loc, ValueRange{vtid, cZero}, ArrayRef{vtidLen, element}, + /*disjoint=*/true); } - - // ((tid floordiv stride) mod size) * element. 
- AffineExpr tidExpr = builder.getAffineDimExpr(0); - AffineMap vtidMap = AffineMap::get( - /*dims=*/1, /*syms=*/0, - (tidExpr.floorDiv(dimStride) % dimSize) * element); - Value vtid = builder.create(loc, vtidMap, laneId); vtids.push_back(vtid); } diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/distribute_multi_mma.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/distribute_multi_mma.mlir index 8d00f6f764db..aaebe6cee6c3 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/distribute_multi_mma.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/distribute_multi_mma.mlir @@ -28,25 +28,23 @@ module attributes { transform.with_named_sequence } { } } -// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0) -> (d0 mod 16)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)> // CHECK-LABEL: func @distribute_multi_mma_F16_16x16x16_F32 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x2x16x16xf16> // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x16x16xf16> // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x16x16xf32> // CHECK: scf.forall (%[[LANE_ID:.+]]) in (64) shared_outs(%[[ITER_ARG:.+]] = %[[ACC]]) -> (tensor<2x2x16x16xf32>) -// CHECK: %[[ID:.+]] = affine.apply #[[$MAP]](%[[LANE_ID]]) -// CHECK: %[[ID1:.+]] = affine.apply #[[$MAP1]](%[[LANE_ID]]) -// CHECK: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]], %[[ID1]]] +// CHECK: %[[ID:.+]]:3 = affine.delinearize_index %[[LANE_ID]] into (4, 16) +// CHECK: %[[ID1:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (4, 4) +// CHECK: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]]#2, %[[ID1]]] // CHECK-SAME: [2, 2, 1, 4] [1, 1, 1, 1] : tensor<2x2x16x16xf16> to tensor<2x2x1x4xf16> -// CHECK: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID1]], %[[ID]]] +// CHECK: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID1]], %[[ID]]#2] // CHECK-SAME: [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x16x16xf16> to tensor<2x2x4x1xf16> -// CHECK: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ITER_ARG]][0, 0, %[[ID1]], %[[ID]]] +// CHECK: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ITER_ARG]][0, 0, %[[ID1]], %[[ID]]#2] // CHECK-SAME: [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x16x16xf32> to tensor<2x2x4x1xf32> // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] // CHECK-SAME: : tensor<2x2x1x4xf16>, tensor<2x2x4x1xf16> into tensor<2x2x4x1xf32> // CHECK: scf.forall.in_parallel -// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ITER_ARG]][0, 0, %[[ID1]], %[[ID]]] +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ITER_ARG]][0, 0, %[[ID1]], %[[ID]]#2] // CHECK-SAME: [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x4x1xf32> into tensor<2x2x16x16xf32> // CHECK: mapping = [#iree_gpu.lane_id<0>] @@ -80,33 +78,27 @@ module attributes { transform.with_named_sequence } { transform.yield } } -#map = affine_map<(d0) -> (d0 mod 16)> -#map1 = affine_map<(d0) -> ((d0 floordiv 16) * 8 - ((d0 floordiv 16) floordiv 4) * 32)> -#map2 = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)> -#map3 = affine_map<(d0, d1, d2) -> (d0, d2)> -#map4 = affine_map<(d0, d1, d2) -> (d1, d2)> -#map5 = affine_map<(d0, d1, d2) -> (d0, d1)> +#map = affine_map<(d0, d1, d2) -> (d0, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d1, d2)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0) 
-> (d0 mod 16)> -// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 8 - ((d0 floordiv 16) floordiv 4) * 32)> -// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)> // CHECK-LABEL: func @distribute_multi_mma_I8_16x16x32_I32 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x2x16x32xi8> // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x16x32xi8> // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x16x16xi32> // CHECK: scf.forall (%[[LANE_ID:.+]]) in (64) shared_outs(%[[ITER_ARG:.+]] = %[[ACC]]) -> (tensor<2x2x16x16xi32>) -// CHECK: %[[ID:.+]] = affine.apply #[[$MAP]](%[[LANE_ID]]) -// CHECK: %[[ID1:.+]] = affine.apply #[[$MAP1]](%[[LANE_ID]]) -// CHECK: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]], %[[ID1]]] +// CHECK: %[[ID:.+]]:3 = affine.delinearize_index %[[LANE_ID]] into (4, 16) +// CHECK: %[[ID1:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (4, 8) +// CHECK: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]]#2, %[[ID1]]] // CHECK-SAME: [2, 2, 1, 8] [1, 1, 1, 1] : tensor<2x2x16x32xi8> to tensor<2x2x1x8xi8> -// CHECK: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID]], %[[ID1]]] +// CHECK: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID]]#2, %[[ID1]]] // CHECK-SAME: [2, 2, 1, 8] [1, 1, 1, 1] : tensor<2x2x16x32xi8> to tensor<2x2x1x8xi8> -// CHECK: %[[ID2:.+]] = affine.apply #[[$MAP2]](%[[LANE_ID]]) -// CHECK: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ITER_ARG]][0, 0, %[[ID2]], %[[ID]]] +// CHECK: %[[ID1_2:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (4, 4) +// CHECK: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ITER_ARG]][0, 0, %[[ID1_2]], %[[ID]]#2] // CHECK-SAME: [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x16x16xi32> to tensor<2x2x4x1xi32> // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] // CHECK-SAME: : tensor<2x2x1x8xi8>, tensor<2x2x1x8xi8> into tensor<2x2x4x1xi32> // CHECK: scf.forall.in_parallel -// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ITER_ARG]][0, 0, %[[ID2]], %[[ID]]] +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ITER_ARG]][0, 0, %[[ID1_2]], %[[ID]]#2] // CHECK-SAME: [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x4x1xi32> into tensor<2x2x16x16xi32> // CHECK: mapping = [#iree_gpu.lane_id<0>] diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir index aa818d6d4d43..d8af13ab5916 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir @@ -85,8 +85,6 @@ module { } } -// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 32)> -// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 32) * 4 - ((d0 floordiv 32) floordiv 2) * 8)> // CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> // CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> // CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -95,16 +93,16 @@ module { // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x8x32x8xf16> // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<8x2x32x8xf16> // CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<2x2x4x8x32xf32>) -// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) -// CHECK-DAG: %[[IDY:.+]] = affine.apply 
#[[$YMAP]](%[[LANEID]]) -// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[IDX]], %[[IDY]]] [2, 8, 1, 4] -// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[IDX]], %[[IDY]]] [8, 2, 1, 4] -// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 4, 4, 1] +// CHECK-DAG: %[[ID:.+]]:3 = affine.delinearize_index %[[LANEID]] into (2, 32) +// CHECK-DAG: %[[IDY:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (2, 4) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]]#2, %[[IDY]]] [2, 8, 1, 4] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID]]#2, %[[IDY]]] [8, 2, 1, 4] +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, 0, %[[IDY]], %[[ID]]#2] [2, 2, 4, 4, 1] // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : tensor<2x8x1x4xf16>, tensor<8x2x1x4xf16> into tensor<2x2x4x4x1xf32> -// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 4, 4, 1] +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, 0, %[[IDY]], %[[ID]]#2] [2, 2, 4, 4, 1] // CHECK: mapping = [#iree_gpu.lane_id<0>] // ----- @@ -126,8 +124,6 @@ module { } } -// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 32)> -// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 32) * 4 - ((d0 floordiv 32) floordiv 2) * 8)> // CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> // CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> // CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -136,16 +132,16 @@ module { // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x8x32x8xi8> // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<8x2x32x8xi8> // CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<2x2x4x8x32xi32>) -// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) -// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) -// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[IDX]], %[[IDY]]] [2, 8, 1, 4] -// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[IDX]], %[[IDY]]] [8, 2, 1, 4] -// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 4, 4, 1] +// CHECK-DAG: %[[ID:.+]]:3 = affine.delinearize_index %[[LANEID]] into (2, 32) +// CHECK-DAG: %[[IDY:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (2, 4) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]]#2, %[[IDY]]] [2, 8, 1, 4] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID]]#2, %[[IDY]]] [8, 2, 1, 4] +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, 0, %[[IDY]], %[[ID]]#2] [2, 2, 4, 4, 1] // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : tensor<2x8x1x4xi8>, tensor<8x2x1x4xi8> into tensor<2x2x4x4x1xi32> -// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 4, 4, 1] +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, 0, %[[IDY]], %[[ID]]#2] [2, 2, 4, 4, 1] // CHECK: mapping = [#iree_gpu.lane_id<0>] // ----- @@ -167,8 +163,6 
@@ module { } } -// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 16)> -// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) mod 2)> // CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> // CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> // CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -177,16 +171,17 @@ module { // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x8x16x16xf16> // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<8x2x16x16xf16> // CHECK: scf.forall (%[[LANEID:.+]]) in (32) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<2x2x8x2x16xf32>) -// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) -// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[IDX]], 0] [2, 8, 1, 16] -// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[IDX]], 0] [8, 2, 1, 16] -// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) -// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 8, 1, 1] +// CHECK-DAG: %[[ID_1:.+]]:2 = affine.delinearize_index %[[LANEID]] into (16) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID_1]]#1, 0] [2, 8, 1, 16] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID_1]]#1, 0] [8, 2, 1, 16] +// CHECK-DAG: %[[ID_2:.+]]:3 = affine.delinearize_index %[[LANEID]] into (2, 16) +// Note: ID_2#1 and I_2#2 should not be delinearize outputs once we move to linearized indexing +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, 0, %[[ID_2]]#1, %[[ID_2]]#2] [2, 2, 8, 1, 1] // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : tensor<2x8x1x16xf16>, tensor<8x2x1x16xf16> into tensor<2x2x8x1x1xf32> -// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 8, 1, 1] +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, 0, %[[ID_2]]#1, %[[ID_2]]#2] [2, 2, 8, 1, 1] // CHECK: mapping = [#iree_gpu.lane_id<0>] // ----- @@ -205,24 +200,19 @@ func.func @distribute_MFMA_F32_16x16x4_F32(%lhs: tensor<16x4xf32>, %rhs: tensor< return %0 : tensor<16x16xf32> } -// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 16)> -// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) mod 4)> -// CHECK-DAG: #[[$ZMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)> - // CHECK-LABEL: func @distribute_MFMA_F32_16x16x4_F32 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<16x4xf32> // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<4x16xf32> // CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<16x16xf32>) -// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) -// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) -// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[IDX]], %[[IDY]]] [1, 1] -// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][%[[IDY]], %[[IDX]]] [1, 1] -// CHECK-DAG: %[[IDZ:.+]] = affine.apply #[[$ZMAP]](%[[LANEID]]) -// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][%[[IDZ]], %[[IDX]]] [4, 1] +// CHECK-DAG: %[[ID:.+]]:3 = affine.delinearize_index %[[LANEID]] into (4, 16) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[ID]]#2, %[[ID]]#1] [1, 1] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice 
%[[RHS]][%[[ID]]#1, %[[ID]]#2] [1, 1] +// CHECK-DAG: %[[IDZ:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (4, 4) +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][%[[IDZ]], %[[ID]]#2] [4, 1] // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] // CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : tensor<1x1xf32>, tensor<1x1xf32> into tensor<4x1xf32> -// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][%[[IDZ]], %[[IDX]]] [4, 1] +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][%[[IDZ]], %[[ID]]#2] [4, 1] // CHECK: mapping = [#iree_gpu.lane_id<0>] // ----- @@ -241,24 +231,20 @@ func.func @distribute_F32_16x16x32_F8E4M3FNUZ(%lhs: tensor<16x32xf8E4M3FNUZ>, %r return %0 : tensor<16x16xf32> } -// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 16)> -// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 8 - ((d0 floordiv 16) floordiv 4) * 32)> -// CHECK-DAG: #[[$ZMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)> - // CHECK-LABEL: func @distribute_F32_16x16x32_F8E4M3FNUZ // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<16x32xf8E4M3FNUZ> // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<32x16xf8E4M3FNUZ> // CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<16x16xf32>) -// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) -// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) -// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[IDX]], %[[IDY]]] [1, 8] -// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][%[[IDY]], %[[IDX]]] [8, 1] -// CHECK-DAG: %[[IDZ:.+]] = affine.apply #[[$ZMAP]](%[[LANEID]]) -// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][%[[IDZ]], %[[IDX]]] [4, 1] +// CHECK-DAG: %[[ID:.+]]:3 = affine.delinearize_index %[[LANEID]] into (4, 16) +// CHECK-DAG: %[[IDY:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (4, 8) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[ID]]#2, %[[IDY]]] [1, 8] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][%[[IDY]], %[[ID]]#2] [8, 1] +// CHECK-DAG: %[[IDZ:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (4, 4) +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][%[[IDZ]], %[[ID]]#2] [4, 1] // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] // CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : tensor<1x8xf8E4M3FNUZ>, tensor<8x1xf8E4M3FNUZ> into tensor<4x1xf32> -// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][%[[IDZ]], %[[IDX]]] [4, 1] +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][%[[IDZ]], %[[ID]]#2] [4, 1] // CHECK: mapping = [#iree_gpu.lane_id<0>] // ----- @@ -277,24 +263,20 @@ func.func @distribute_I32_32x32x16_I8(%lhs: tensor<32x16xi8>, %rhs: tensor<16x32 return %0 : tensor<4x8x32xi32> } -// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 32)> -// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 32) * 8 - ((d0 floordiv 32) floordiv 2) * 16)> -// CHECK-DAG: #[[$ZMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 32) * 4 - ((d0 floordiv 32) floordiv 2) * 8)> - // CHECK-LABEL: func @distribute_I32_32x32x16_I8 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<32x16xi8> // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<16x32xi8> // CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<4x8x32xi32>) -// CHECK-DAG: %[[IDX:.+]] = affine.apply 
#[[$XMAP]](%[[LANEID]]) -// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) -// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[IDX]], %[[IDY]]] [1, 8] -// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][%[[IDY]], %[[IDX]]] [8, 1] -// CHECK-DAG: %[[IDZ:.+]] = affine.apply #[[$ZMAP]](%[[LANEID]]) -// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, %[[IDZ]], %[[IDX]]] [4, 4, 1] +// CHECK-DAG: %[[ID:.+]]:3 = affine.delinearize_index %[[LANEID]] into (2, 32) +// CHECK-DAG: %[[IDY:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (2, 8) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[ID]]#2, %[[IDY]]] [1, 8] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][%[[IDY]], %[[ID]]#2] [8, 1] +// CHECK-DAG: %[[IDZ:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (2, 4) +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, %[[IDZ]], %[[ID]]#2] [4, 4, 1] // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] // CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : tensor<1x8xi8>, tensor<8x1xi8> into tensor<4x4x1xi32> -// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, %[[IDZ]], %[[IDX]]] [4, 4, 1] +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, %[[IDZ]], %[[ID]]#2] [4, 4, 1] // CHECK: mapping = [#iree_gpu.lane_id<0>] // ----- @@ -313,20 +295,18 @@ func.func @distribute_WMMA_F16_16x16x16_F16(%lhs: tensor<16x16xf16>, %rhs: tenso return %0 : tensor<8x2x16xf16> } -// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 16)> - // CHECK-LABEL: func @distribute_WMMA_F16_16x16x16_F16 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<16x16xf16> // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<16x16xf16> // CHECK: scf.forall (%[[LANEID:.+]]) in (32) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<8x2x16xf16>) -// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) -// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[IDX]], 0] [1, 16] -// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, %[[IDX]]] [16, 1] -// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, %[[IDX]]] [16, 1, 1] +// CHECK-DAG: %[[ID:.+]]:2 = affine.delinearize_index %[[LANEID]] into (16) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][%[[ID]]#1, 0] [1, 16] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, %[[ID]]#1] [16, 1] +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, %[[ID]]#1] [16, 1, 1] // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] // CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : tensor<1x16xf16>, tensor<16x1xf16> into tensor<16x1x1xf16> -// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, %[[IDX]]] [16, 1, 1] +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, %[[ID]]#1] [16, 1, 1] // CHECK: mapping = [#iree_gpu.lane_id<0>] // ----- @@ -348,8 +328,6 @@ module { } } -// CHECK-DAG: #[[$XMAP:.+]] = affine_map<(d0) -> (d0 mod 16)> -// CHECK-DAG: #[[$YMAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) mod 2)> // CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> // CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> // CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -358,16 +336,16 @@ module { // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x8x16x16xi8> // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<8x2x16x16xi8> // CHECK: 
scf.forall (%[[LANEID:.+]]) in (32) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<2x2x8x2x16xi32>) -// CHECK-DAG: %[[IDX:.+]] = affine.apply #[[$XMAP]](%[[LANEID]]) -// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[IDX]], 0] [2, 8, 1, 16] -// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[IDX]], 0] [8, 2, 1, 16] -// CHECK-DAG: %[[IDY:.+]] = affine.apply #[[$YMAP]](%[[LANEID]]) -// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 8, 1, 1] +// CHECK-DAG: %[[ID:.+]]:2 = affine.delinearize_index %[[LANEID]] into (16) +// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]]#1, 0] [2, 8, 1, 16] +// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID]]#1, 0] [8, 2, 1, 16] +// CHECK-DAG: %[[ID_ACC:.+]]:3 = affine.delinearize_index %[[LANEID]] into (2, 16) +// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, 0, %[[ID_ACC]]#1, %[[ID_ACC]]#2] [2, 2, 8, 1, 1] // CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]] // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]] // CHECK-SAME: kind = #iree_gpu.mma_layout // CHECK-SAME: : tensor<2x8x1x16xi8>, tensor<8x2x1x16xi8> into tensor<2x2x8x1x1xi32> -// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, 0, %[[IDY]], %[[IDX]]] [2, 2, 8, 1, 1] +// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, 0, %[[ID_ACC]]#1, %[[ID_ACC]]#2] [2, 2, 8, 1, 1] // CHECK: mapping = [#iree_gpu.lane_id<0>] // ----- diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/BUILD.bazel index 1d32b4747aed..b0ca13961855 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/BUILD.bazel @@ -70,6 +70,7 @@ iree_compiler_cc_library( ":IREEVectorExtInterfacesGen", ":IREEVectorExtOpsGen", "//compiler/src/iree/compiler/Codegen/Utils:VectorOpUtils", + "//compiler/src/iree/compiler/Utils", "@llvm-project//llvm:Support", "@llvm-project//mlir:AffineDialect", "@llvm-project//mlir:DialectUtils", diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/CMakeLists.txt index 03fbfcfc7003..2eb395c724d6 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/CMakeLists.txt @@ -46,6 +46,7 @@ iree_cc_library( MLIRTensorDialect MLIRVectorDialect iree::compiler::Codegen::Utils::VectorOpUtils + iree::compiler::Utils PUBLIC ) diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.cpp index 732282c3f6db..d0d86702da5d 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.cpp @@ -8,6 +8,7 @@ #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h" #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h" +#include "iree/compiler/Utils/Indexing.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -286,51 +287,28 @@ NestedLayoutAttr::computeThreadIds(Value threadId, int64_t subgroupSize, Location loc = threadId.getLoc(); - AffineExpr tidExpr, size, stride; - 
bindDims(rewriter.getContext(), tidExpr); - bindSymbols(rewriter.getContext(), size, stride); - - // (tid floordiv stride) mod size - AffineMap threadTidMap = - AffineMap::get(/*dims=*/1, /*syms=*/2, tidExpr.floorDiv(stride) % size); - - // (tid floordiv (stride * subgroup_size)) mod size - AffineMap subgroupTidMap = AffineMap::get( - /*dims=*/1, /*syms=*/2, tidExpr.floorDiv(stride * subgroupSize) % size); - - for (auto [dimSize, dimStride] : - llvm::zip_equal(getSubgroupTile(), getSubgroupStrides())) { - // Dimension is not distributed. - if (dimStride == 0) { - virtualTids.push_back(rewriter.create( - loc, rewriter.getIndexAttr(dimStride))); - continue; - } + SmallVector subgroupBasis, threadBasis; + SmallVector subgroupDimToResult, threadDimToResult; - auto sizeVal = - rewriter.create(loc, rewriter.getIndexAttr(dimSize)); - auto strideVal = rewriter.create( - loc, rewriter.getIndexAttr(dimStride)); - virtualTids.push_back(rewriter.create( - loc, subgroupTidMap, ValueRange{threadId, sizeVal, strideVal})); - } + if (failed(basisFromSizesStrides(getSubgroupTile(), getSubgroupStrides(), + subgroupBasis, subgroupDimToResult))) + return {}; + if (failed(basisFromSizesStrides(getThreadTile(), getThreadStrides(), + threadBasis, threadDimToResult))) + return {}; - for (auto [dimSize, dimStride] : - llvm::zip_equal(getThreadTile(), getThreadStrides())) { - // Dimension is not distributed. - if (dimStride == 0) { - virtualTids.push_back(rewriter.create( - loc, rewriter.getIndexAttr(dimStride))); - continue; - } + // Add the subgroup_size to the end of the subgroup delinearization basis. + subgroupBasis.push_back(subgroupSize); - auto sizeVal = - rewriter.create(loc, rewriter.getIndexAttr(dimSize)); - auto strideVal = rewriter.create( - loc, rewriter.getIndexAttr(dimStride)); - virtualTids.push_back(rewriter.create( - loc, threadTidMap, ValueRange{threadId, sizeVal, strideVal})); - } + auto subgroupSplit = rewriter.create( + loc, threadId, subgroupBasis, /*hasOuterBound=*/false); + auto threadSplit = rewriter.create( + loc, threadId, threadBasis, /*hasOuterBound=*/false); + + llvm::transform(subgroupDimToResult, std::back_inserter(virtualTids), + [&](size_t idx) { return subgroupSplit.getResult(idx); }); + llvm::transform(threadDimToResult, std::back_inserter(virtualTids), + [&](size_t idx) { return threadSplit.getResult(idx); }); return virtualTids; } diff --git a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td index 16bad7f23bc2..8b375478f80e 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.td @@ -75,6 +75,12 @@ def NestedLayoutAttr : IREEVectorExt_Attr<"NestedLayout", 0, 4, 1, 5, 2, 6, 3, 7 ``` + The subgroups are placed contiguously with their shape and ordering + determined by: + - `subgroup_tile`: Sizes of this level of tiling + - `subgroup_strides`: Stride of this level of tiling. 0 if not distributed. + Tiling levels must not overlap. + The total number of subgroups used (computed by multiplying each dim in subgroup_tile) should be a multiple of number of subgroups in the harware. If the total number of subgroups used exceeds the number of @@ -231,7 +237,7 @@ def NestedLayoutAttr : IREEVectorExt_Attr<"NestedLayout", let extraClassDeclaration = [{ // Returns the subgroup/lane ids delinearized from a single linearized - // thread ID. + // thread ID. 
Returns the empty vector on failure. SmallVector computeThreadIds(Value threadId, int64_t subgroupSize, RewriterBase &rewriter) const; // Get the undistributed shape that is subgroup x batch x outer x thread x element diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir index e230d0a80996..514a913283b1 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir +++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/tile_and_vectorize_to_cooperative_ops.mlir @@ -77,8 +77,7 @@ func.func @matmul_256x1024x128_div_add() attributes {translation_info = #transla return } -// CHECK: #[[$MAP_Y:.+]] = affine_map<()[s0] -> (s0 * 16)> -// CHECK: #[[$MAP_X:.+]] = affine_map<()[s0] -> ((s0 floordiv 32) * 16)> +// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 16)> // CHECK-LABEL: func.func @matmul_256x1024x128_div_add() @@ -94,8 +93,9 @@ func.func @matmul_256x1024x128_div_add() attributes {translation_info = #transla // CHECK-DAG: %[[LHS_ALLOC:.+]] = memref.alloc() : memref<32x32xf16, 3> // CHECK-DAG: %[[RHS_ALLOC:.+]] = memref.alloc() : memref<32x32xf16, 3> -// CHECK: %[[OFFSET_Y:.+]] = affine.apply #[[$MAP_Y]]()[%[[ID_Y]]] -// CHECK: %[[OFFSET_X:.+]] = affine.apply #[[$MAP_X]]()[%[[ID_X]]] +// CHECK: %[[IDS_X:.+]]:2 = affine.delinearize_index %[[ID_X]] into (2, 32) : index, index +// CHECK: %[[OFFSET_Y:.+]] = affine.apply #[[$MAP]]()[%[[ID_Y]]] +// CHECK: %[[OFFSET_X:.+]] = affine.apply #[[$MAP]]()[%[[IDS_X]]#0] // CHECK: scf.for %{{.+}} = %[[OFFSET_Y]] to %[[C32]] step %[[C32]] // CHECK: scf.for %{{.+}} = %[[OFFSET_X]] to %[[C32]] step %[[C32]] @@ -109,10 +109,10 @@ func.func @matmul_256x1024x128_div_add() attributes {translation_info = #transla // CHECK: gpu.barrier // CHECK: scf.for %[[IV_Y:.+]] = %[[OFFSET_Y]] to %[[C32]] step %[[C32]] // CHECK: %[[LHS_VIEW:.+]] = memref.subview %[[LHS_ALLOC]][%[[IV_Y]], 0] +// CHECK-DAG: %[[READ0:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C0]]] +// CHECK-DAG: %[[READ1:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C16]]] // CHECK: scf.for %[[IV_X:.+]] = %[[OFFSET_X]] to %[[C32]] step %[[C32]] // CHECK: %[[RHS_VIEW:.+]] = memref.subview %[[RHS_ALLOC]][0, %[[IV_X]]] -// CHECK-DAG: %[[READ0:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C0]]] -// CHECK-DAG: %[[READ1:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C16]]] // CHECK-DAG: %[[READ2:.+]] = vector.transfer_read %[[RHS_VIEW]][%[[C0]], %[[C0]]] // CHECK-DAG: %[[READ3:.+]] = vector.transfer_read %[[RHS_VIEW]][%[[C16]], %[[C0]]] // CHECK-DAG: %[[READ4:.+]] = vector.transfer_read %{{.+}}[%[[C0]], %[[C0]]] @@ -209,8 +209,7 @@ func.func @matmul_256x1024x128_div_add() attributes {translation_info = #transla return } -// CHECK: #[[$MAP_Y:.+]] = affine_map<()[s0] -> (s0 * 16)> -// CHECK: #[[$MAP_X:.+]] = affine_map<()[s0] -> ((s0 floordiv 32) * 16)> +// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 16)> // CHECK-LABEL: func.func @matmul_256x1024x128_div_add() @@ -228,8 +227,9 @@ func.func @matmul_256x1024x128_div_add() attributes {translation_info = #transla // CHECK: %[[LHS_ALLOC:.+]] = memref.alloc() : memref<1x32x32xf16, 3> // CHECK: %[[RHS_ALLOC:.+]] = memref.alloc() : memref<1x32x32xf16, 3> -// CHECK: %[[OFFSET_Y:.+]] = affine.apply #[[$MAP_Y]]()[%[[ID_Y]]] -// CHECK: %[[OFFSET_X:.+]] = affine.apply #[[$MAP_X]]()[%[[ID_X]]] +// CHECK: %[[IDS_X:.+]]:2 = affine.delinearize_index %[[ID_X]] into (2, 32) 
+// CHECK: %[[OFFSET_Y:.+]] = affine.apply #[[$MAP]]()[%[[ID_Y]]] +// CHECK: %[[OFFSET_X:.+]] = affine.apply #[[$MAP]]()[%[[IDS_X]]#0] // CHECK: scf.for %{{.+}} = %[[ID_Z]] to %[[C1]] step %[[C1]] // CHECK: scf.for %{{.+}} = %[[OFFSET_Y]] to %[[C32]] step %[[C32]] @@ -246,10 +246,10 @@ func.func @matmul_256x1024x128_div_add() attributes {translation_info = #transla // CHECK: scf.for %[[IV_Z:.+]] = %[[ID_Z]] to %[[C1]] step %[[C1]] // CHECK: scf.for %[[IV_Y:.+]] = %[[OFFSET_Y]] to %[[C32]] step %[[C32]] // CHECK: %[[LHS_VIEW:.+]] = memref.subview %[[LHS_ALLOC]][%[[IV_Z]], %[[IV_Y]], 0] [1, 16, 32] +// CHECK-DAG: %[[READ0:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C0]], %[[C0]]] +// CHECK-DAG: %[[READ1:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C0]], %[[C16]]] // CHECK: scf.for %[[IV_X:.+]] = %[[OFFSET_X]] to %[[C32]] step %[[C32]] { // CHECK: %[[RHS_VIEW:.+]] = memref.subview %[[RHS_ALLOC]][%[[IV_Z]], 0, %[[IV_X]]] [1, 32, 16] -// CHECK-DAG: %[[READ0:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C0]], %[[C0]]] -// CHECK-DAG: %[[READ1:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C0]], %[[C16]]] // CHECK-DAG: %[[READ2:.+]] = vector.transfer_read %[[RHS_VIEW]][%[[C0]], %[[C0]], %[[C0]]] // CHECK-DAG: %[[READ3:.+]] = vector.transfer_read %[[RHS_VIEW]][%[[C0]], %[[C16]], %[[C0]]] // CHECK-DAG: %[[READ4:.+]] = vector.transfer_read %{{.+}}[%[[C0]], %[[C0]], %[[C0]]] @@ -337,8 +337,7 @@ func.func @matmul_256x1024x128_mixed_signedness_int8() { return } -// CHECK: #[[$MAP_Y:.+]] = affine_map<()[s0] -> (s0 * 16)> -// CHECK: #[[$MAP_X:.+]] = affine_map<()[s0] -> ((s0 floordiv 32) * 16)> +// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 16)> // CHECK-LABEL: func.func @matmul_256x1024x128_mixed_signedness_int8() @@ -354,8 +353,9 @@ func.func @matmul_256x1024x128_mixed_signedness_int8() { // CHECK-DAG: %[[LHS_ALLOC:.+]] = memref.alloc() : memref<32x32xi8, 3> // CHECK-DAG: %[[RHS_ALLOC:.+]] = memref.alloc() : memref<32x32xi8, 3> -// CHECK: %[[OFFSET_Y:.+]] = affine.apply #[[$MAP_Y]]()[%[[ID_Y]]] -// CHECK: %[[OFFSET_X:.+]] = affine.apply #[[$MAP_X]]()[%[[ID_X]]] +// CHECK: %[[IDS_X:.+]]:2 = affine.delinearize_index %[[ID_X]] into (2, 32) +// CHECK: %[[OFFSET_Y:.+]] = affine.apply #[[$MAP]]()[%[[ID_Y]]] +// CHECK: %[[OFFSET_X:.+]] = affine.apply #[[$MAP]]()[%[[IDS_X]]#0] // CHECK: scf.for %{{.+}} = %[[OFFSET_Y]] to %[[C32]] step %[[C32]] // CHECK: scf.for %{{.+}} = %[[OFFSET_X]] to %[[C32]] step %[[C32]] @@ -369,10 +369,10 @@ func.func @matmul_256x1024x128_mixed_signedness_int8() { // CHECK: gpu.barrier // CHECK: scf.for %[[IV_Y:.+]] = %[[OFFSET_Y]] to %[[C32]] step %[[C32]] // CHECK: %[[LHS_VIEW:.+]] = memref.subview %[[LHS_ALLOC]][%[[IV_Y]], 0] +// CHECK-DAG: %[[READ0:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C0]]] +// CHECK-DAG: %[[READ1:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C16]]] // CHECK: scf.for %[[IV_X:.+]] = %[[OFFSET_X]] to %[[C32]] step %[[C32]] // CHECK: %[[RHS_VIEW:.+]] = memref.subview %[[RHS_ALLOC]][0, %[[IV_X]]] -// CHECK-DAG: %[[READ0:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C0]]] -// CHECK-DAG: %[[READ1:.+]] = vector.transfer_read %[[LHS_VIEW]][%[[C0]], %[[C16]]] // CHECK-DAG: %[[READ2:.+]] = vector.transfer_read %[[RHS_VIEW]][%[[C0]], %[[C0]]] // CHECK-DAG: %[[READ3:.+]] = vector.transfer_read %[[RHS_VIEW]][%[[C16]], %[[C0]]] // CHECK-DAG: %[[READ4:.+]] = vector.transfer_read %{{.+}}[%[[C0]], %[[C0]]] diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp 
b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
index 8f09f6f932f8..8fe633e51b71 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp
@@ -82,10 +82,11 @@ getSubgroupIdsAndCounts(mlir::OpBuilder &builder, mlir::Location loc,
     mlir::Value subgroupId =
         builder.create(loc, indexType, dimAttr[i]);
     if (i == 0) {
-      mlir::AffineExpr d0 = builder.getAffineDimExpr(0);
-      subgroupId = mlir::affine::makeComposedAffineApply(
-          builder, loc, d0.floorDiv(builder.getAffineConstantExpr(warpSize)),
-          {subgroupId});
+      subgroupId =
+          builder
+              .create(
+                  loc, subgroupId, ArrayRef{numSubgroups[i], warpSize})
+              .getResult(0);
     }
     procInfo[numDims - 1 - i] = {
         subgroupId,
diff --git a/compiler/src/iree/compiler/Utils/BUILD.bazel b/compiler/src/iree/compiler/Utils/BUILD.bazel
index c7c2acc2a8fd..29966395e300 100644
--- a/compiler/src/iree/compiler/Utils/BUILD.bazel
+++ b/compiler/src/iree/compiler/Utils/BUILD.bazel
@@ -21,6 +21,7 @@ iree_compiler_cc_library(
         "ElementPackingUtils.cpp",
         "EquivalenceUtils.cpp",
         "FlatbufferUtils.cpp",
+        "Indexing.cpp",
         "ModuleUtils.cpp",
         "OptionUtils.cpp",
         "PassUtils.cpp",
diff --git a/compiler/src/iree/compiler/Utils/CMakeLists.txt b/compiler/src/iree/compiler/Utils/CMakeLists.txt
index 84be0745bbf6..74dd2bbb3ed7 100644
--- a/compiler/src/iree/compiler/Utils/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Utils/CMakeLists.txt
@@ -37,6 +37,7 @@ iree_cc_library(
     "ElementPackingUtils.cpp"
     "EquivalenceUtils.cpp"
     "FlatbufferUtils.cpp"
+    "Indexing.cpp"
    "ModuleUtils.cpp"
     "OptionUtils.cpp"
     "PassUtils.cpp"
diff --git a/compiler/src/iree/compiler/Utils/Indexing.cpp b/compiler/src/iree/compiler/Utils/Indexing.cpp
new file mode 100644
index 000000000000..ad36ea83304d
--- /dev/null
+++ b/compiler/src/iree/compiler/Utils/Indexing.cpp
@@ -0,0 +1,67 @@
+// Copyright 2025 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Utils/Indexing.h"
+
+using namespace mlir;
+
+namespace mlir::iree_compiler {
+LogicalResult basisFromSizesStrides(ArrayRef sizes,
+                                    ArrayRef strides,
+                                    SmallVectorImpl &basis,
+                                    SmallVectorImpl &dimToResult) {
+  assert(sizes.size() == strides.size());
+  size_t numDims = sizes.size();
+  basis.reserve(numDims);
+
+  SmallVector> terms =
+      llvm::map_to_vector(llvm::enumerate(strides, sizes), [&](auto tuple) {
+        auto [dim, stride, size] = tuple;
+        return std::make_tuple(stride, dim, size);
+      });
+  llvm::sort(terms);
+
+  int64_t previousSizes = 1;
+  SmallVector> basisEntryToDim;
+  basisEntryToDim.reserve(numDims);
+  for (auto [stride, dim, size] : terms) {
+    if (stride == 0) {
+      stride = 1;
+      size = 1;
+    }
+    if (stride % previousSizes != 0)
+      return failure();
+
+    // Handle cases like threads = {4, 8}, strides = {1, 16}, which need an
+    // extra basis element.
+    if (stride != previousSizes) {
+      int64_t jumpSize = stride / previousSizes;
+      basisEntryToDim.push_back(std::nullopt);
+      basis.push_back(jumpSize);
+      previousSizes *= jumpSize;
+    }
+
+    basisEntryToDim.push_back(dim);
+    basis.push_back(size);
+    previousSizes *= size;
+  }
+
+  // Post-process. The basis is backwards and the permutation
+  // we've constructed is the inverse of what we need.
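+  // For example, for sizes = {4, 16} and strides = {1, 4}, the loop above
+  // produces basis = {4, 16} (fastest-varying dimension first) and
+  // basisEntryToDim = {0, 1}. Reversing yields basis = {16, 4}; dim 0 then
+  // maps to delinearize result 2 and dim 1 to result 1 (result 0 being the
+  // unbounded overflow term), matching the documented example.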
+  std::reverse(basis.begin(), basis.end());
+  size_t basisLength = basis.size();
+  dimToResult.assign(numDims, ~0);
+  for (auto [reverseBasisPos, dimPos] : llvm::enumerate(basisEntryToDim)) {
+    if (!dimPos)
+      continue;
+    // There's an extra overflow term at the front of the delinearize results,
+    // so this subtraction lands in the [1, basisLength] range we need it
+    // to be in.
+    dimToResult[*dimPos] = basisLength - reverseBasisPos;
+  }
+  return success();
+}
+} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Utils/Indexing.h b/compiler/src/iree/compiler/Utils/Indexing.h
index 58484e8bd775..f92776ed44d1 100644
--- a/compiler/src/iree/compiler/Utils/Indexing.h
+++ b/compiler/src/iree/compiler/Utils/Indexing.h
@@ -49,6 +49,38 @@ inline OpFoldResult linearIndexFromShape(ArrayRef multiIndex,
       builder, builder.getLoc(), linearIndexExpr, multiIndexAndStrides);
 }
 
+/// Given a set of dimension `sizes` and `strides`, compute a `basis` - a list
+/// of sizes suitable for passing to an `affine.delinearize_index` op without
+/// outer bound that would produce the same effects as a `(x / strides[i]) %
+/// sizes[i]` delinearization. The permutation mapping each dimension in
+/// `sizes` to its corresponding delinearization result is in `dimToResult`.
+///
+/// That is, if there are `N` elements in the shape, after one builds
+///
+///   %r:(N+1) = affine.delinearize_index %x into (basis) : index, index, ...
+///
+/// then, for all `i`
+///
+///    %r#(dimToResult[i]) == (%x floordiv strides[i]) mod sizes[i]
+///
+/// For example, sizes = {4, 16}, strides = {1, 4} will return basis = {16, 4}
+/// and dimToResult = {2, 1}.
+///
+/// This function does handle the case where the strides "skip over" elements.
+/// For example, sizes = {16, 4}, strides = {8, 1} will yield
+/// basis = {16, 2, 4} and dimToResult = {1, 3}.
+///
+/// If a basis can't be found - for instance, for sizes = {4, 4} and
+/// strides = {3, 1} - this function returns failure().
+///
+/// As a special case, dimensions with stride 0 are treated as size-1
+/// dimensions that are placed at the end of the delinearization, from where
+/// they will canonicalize to 0.
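+///
+/// As a concrete sketch of the IR for the first example above
+/// (sizes = {4, 16}, strides = {1, 4}, hence basis = {16, 4}):
+///
+///   %r:3 = affine.delinearize_index %x into (16, 4) : index, index, index
+///
+/// where %r#0 is the unbounded overflow term, %r#1 == (%x floordiv 4) mod 16
+/// (dimension 1), and %r#2 == %x mod 4 (dimension 0), i.e.
+/// dimToResult = {2, 1}.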
+LogicalResult basisFromSizesStrides(ArrayRef sizes, + ArrayRef strides, + SmallVectorImpl &basis, + SmallVectorImpl &dimToResult); + } // namespace mlir::iree_compiler #endif // IREE_COMPILER_UTILS_INDEXING_H_ diff --git a/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp b/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp index 17a0c606155c..db757a195332 100644 --- a/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp +++ b/compiler/src/iree/compiler/Utils/unittests/UtilsTest.cpp @@ -9,9 +9,11 @@ #include #include "iree/compiler/Utils/EmbeddedDataDirectory.h" +#include "iree/compiler/Utils/Indexing.h" #include "iree/compiler/Utils/Permutation.h" #include "llvm/Support/FormatVariadic.h" +using namespace mlir; using namespace mlir::iree_compiler; using namespace testing; @@ -66,3 +68,41 @@ TEST(EmbeddedDataDirectory, GetMap) { } EXPECT_THAT(keys, UnorderedElementsAre("filename1", "filename2")); } + +TEST(BasisFromSizeStrides, SimpleCase) { + SmallVector basis; + SmallVector dimToResult; + + EXPECT_TRUE( + succeeded(basisFromSizesStrides({4, 16}, {1, 4}, basis, dimToResult))); + EXPECT_THAT(basis, ElementsAre(16, 4)); + EXPECT_THAT(dimToResult, ElementsAre(2, 1)); +} + +TEST(BasisFromSizeStrides, ZeroStride) { + SmallVector basis; + SmallVector dimToResult; + + EXPECT_TRUE(succeeded( + basisFromSizesStrides({16, 4, 4}, {1, 0, 16}, basis, dimToResult))); + EXPECT_THAT(basis, ElementsAre(4, 16, 1)); + EXPECT_THAT(dimToResult, ElementsAre(2, 3, 1)); +} + +TEST(BasisFromSizeStrides, JumpsInStrides) { + SmallVector basis; + SmallVector dimToResult; + + EXPECT_TRUE( + succeeded(basisFromSizesStrides({8, 4}, {8, 1}, basis, dimToResult))); + EXPECT_THAT(basis, ElementsAre(8, 2, 4)); + EXPECT_THAT(dimToResult, ElementsAre(1, 3)); +} + +TEST(BasisFromSizeStrides, OverlappingStrides) { + SmallVector basis; + SmallVector dimToResult; + + EXPECT_FALSE( + succeeded(basisFromSizesStrides({8, 4}, {6, 1}, basis, dimToResult))); +}
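+
+// Illustrative extra case (a sketch following the form of the tests above,
+// not part of the original change): a single dimension whose stride skips
+// over elements still gets a leading "jump" term in the basis.
+TEST(BasisFromSizeStrides, SingleDimWithJump) {
+  SmallVector basis;
+  SmallVector dimToResult;
+
+  EXPECT_TRUE(
+      succeeded(basisFromSizesStrides({4}, {2}, basis, dimToResult)));
+  EXPECT_THAT(basis, ElementsAre(4, 2));
+  EXPECT_THAT(dimToResult, ElementsAre(1));
+}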