GPU ukernel lowering config for data-tiled multi_mma, and a simple ukernel. (#19504)

This PR adds the KernelConfig logic to generate a lowering_config that
selects a ukernel for multi_mma. To be able to test it, this PR also adds
a very simple `multi_mma` ukernel; it isn't actually exercised yet beyond
compiling successfully to bitcode, as the compiler logic only cares about
the existence of the resulting bitcode file. The actual lowering to a
ukernel op will come in the next PR.

---------

Signed-off-by: Benoit Jacob <jacob.benoit.1@gmail.com>
bjacob authored Dec 17, 2024
1 parent a31da1f commit 72d98bc
Showing 12 changed files with 177 additions and 43 deletions.
16 changes: 14 additions & 2 deletions compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel
@@ -46,8 +46,8 @@ argmax_types = [
[iree_amdgpu_bitcode_library(
name = "iree_uk_amdgpu_argmax_%s_%s" % (type, gpu_arch),
srcs = [
"iree_uk_amdgpu_argmax_%s.c" % type,
"common.h",
"iree_uk_amdgpu_argmax_%s.c" % type,
],
out = "iree_uk_amdgpu_argmax_%s.%s.bc" % (type, gpu_arch),
gpu_arch = gpu_arch,
@@ -59,9 +59,21 @@ argmax_bc_files = [
for gpu_arch in gpu_archs
]

iree_amdgpu_bitcode_library(
name = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4_gfx942",
srcs = [
"common.h",
"iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c",
],
out = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc",
gpu_arch = "gfx942",
)

iree_c_embed_data(
name = "iree_uk_amdgpu_bitcode",
srcs = argmax_bc_files,
srcs = argmax_bc_files + [
"iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc",
],
c_file_output = "iree_uk_amdgpu_bitcode.c",
flatten = True,
h_file_output = "iree_uk_amdgpu_bitcode.h",
13 changes: 13 additions & 0 deletions compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt
@@ -206,6 +206,18 @@ iree_amdgpu_bitcode_library(
"iree_uk_amdgpu_argmax_f32i64.gfx1100.bc"
)

iree_amdgpu_bitcode_library(
NAME
iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4_gfx942
GPU_ARCH
gfx942
SRCS
"common.h"
"iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c"
OUT
"iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
)

iree_c_embed_data(
NAME
iree_uk_amdgpu_bitcode
@@ -226,6 +238,7 @@ iree_c_embed_data(
"iree_uk_amdgpu_argmax_f32i64.gfx1100.bc"
"iree_uk_amdgpu_argmax_f32i64.gfx90a.bc"
"iree_uk_amdgpu_argmax_f32i64.gfx942.bc"
"iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
C_FILE_OUTPUT
"iree_uk_amdgpu_bitcode.c"
H_FILE_OUTPUT
7 changes: 7 additions & 0 deletions compiler/plugins/target/ROCM/builtins/ukernel/common.h
@@ -57,6 +57,13 @@ typedef __UINT64_TYPE__ uint64_t;
#define FLT_MIN __FLT_MIN__
#define FLT_MAX __FLT_MAX__

//===----------------------------------------------------------------------===//
// Vector typedefs
//===----------------------------------------------------------------------===//

typedef __attribute__((__vector_size__(8 * 2))) int64_t int64x2_t;
typedef __attribute__((__vector_size__(4 * 4))) int32_t int32x4_t;

//===----------------------------------------------------------------------===//
// Declarations for Clangd, which may be slightly older than actual clang.
// Drop these as clangd versions used in practice gain these builtins.
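A note on the two vector typedefs added to common.h above: they line up with the operand and accumulator types of the `__builtin_amdgcn_mfma_i32_16x16x32_i8` builtin used by the new ukernel below. A minimal size-check sketch (not part of the commit; the typedefs are restated so it stands alone):

```cpp
#include <cstdint>

// Same typedefs as in common.h, restated so this snippet stands alone.
typedef __attribute__((__vector_size__(8 * 2))) int64_t int64x2_t;
typedef __attribute__((__vector_size__(4 * 4))) int32_t int32x4_t;

// int64x2_t packs 2 x (8 x i8) values: the mfma_i32_16x16x32_i8 builtin
// takes i64 A/B operands (8 packed i8 each), so indexing one int64x2_t
// element by [k] yields one operand. int32x4_t is the builtin's per-lane
// 4 x i32 accumulator.
static_assert(sizeof(int64x2_t) == 16, "two 8-byte packed-i8 operand lanes");
static_assert(sizeof(int32x4_t) == 16, "four i32 accumulator lanes");
```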
53 changes: 53 additions & 0 deletions compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c
@@ -0,0 +1,53 @@
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"

// Very naive kernel. TODO(bjacob):
// 1. Shared memory: can't allocate it within the microkernel (which is just a
// helper device function, not the actual amdgpu_kernel). Need to get it
// passed down here as a `T [[clang::address_space(3)]] *` parameter.
// 2. Better scheduling via either barrier intrinsics or inline assembly.
// 3. Subgroups1x4 being asymmetric is a historical accident... should be 2x2.
[[clang::always_inline]] void
iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4(
const int8_t *a_buffer, int64_t a_offset, const int8_t *b_buffer,
int64_t b_offset, int32_t *c_buffer, int64_t c_offset, int64_t k_size) {
int tid = __builtin_amdgcn_workitem_id_x();

// Load existing accumulators.
int32x4_t acc[8][2] = {{0}};
int32x4_t *c_global = (int32x4_t *)(c_buffer + c_offset);
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 2; ++j) {
acc[i][j] = c_global[256 * (2 * i + j) + tid];
}
}

// Arithmetic loop.
const int64x2_t *a_global =
(const int64x2_t *)(a_buffer + a_offset) + (tid % 64);
const int64x2_t *b_global = (const int64x2_t *)(b_buffer + b_offset) + tid;
for (int k_outer = 0; k_outer < k_size; ++k_outer) {
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 2; ++j) {
for (int k = 0; k < 2; ++k) {
acc[i][j] = __builtin_amdgcn_mfma_i32_16x16x32_i8(
a_global[64 * i][k], b_global[256 * j][k], acc[i][j], 0, 0, 0);
}
}
}
a_global += 512;
b_global += 512;
}

// Store accumulators.
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 2; ++j) {
c_global[256 * (2 * i + j) + tid] = acc[i][j];
}
}
}
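As a back-of-the-envelope check on the indexing above (not part of the commit, and assuming one 256-thread workgroup, i.e. 4 subgroups of 64 lanes, as the `256 * (2 * i + j) + tid` accumulator indexing suggests), the pointer strides match the tile implied by the intrinsic shape and the unroll8x2x2/subgroups1x4 factors:

```cpp
#include <cstdint>

// Per k_outer iteration, a_global and b_global each advance by 512
// 16-byte int64x2_t elements; c_global spans 4096 int32x4_t elements.
constexpr int64_t kVecBytes = 16;  // sizeof(int64x2_t) == sizeof(int32x4_t)

// A tile per step: unroll_m(8) x 16 rows x unroll_k(2) x 32 i8 k-elements.
static_assert(512 * kVecBytes == 8 * 16 * 2 * 32, "A stride matches A tile");
// B tile per step: subgroups_n(4) x unroll_n(2) x 16 cols x 2 x 32 i8.
static_assert(512 * kVecBytes == 4 * 2 * 16 * 2 * 32, "B stride matches B tile");
// C tile: (8 * 16) x (4 * 2 * 16) = 128 x 128 i32 accumulators.
static_assert(4096 * 4 == (8 * 16) * (4 * 2 * 16), "C span matches C tile");
```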
1 change: 1 addition & 0 deletions compiler/plugins/target/ROCM/test/BUILD.bazel
@@ -17,6 +17,7 @@ iree_lit_test_suite(
srcs = [
"config_ukernel_argmax_gfx908.mlir",
"config_ukernel_argmax_gfx942.mlir",
"config_ukernel_multi_mma_gfx942.mlir",
"default_tuning_specs_amdgpu.mlir",
"lowering_strategy_from_tuning_spec.mlir",
"ukernel_pipeline_transform.mlir",
1 change: 1 addition & 0 deletions compiler/plugins/target/ROCM/test/CMakeLists.txt
@@ -16,6 +16,7 @@ iree_lit_test_suite(
SRCS
"config_ukernel_argmax_gfx908.mlir"
"config_ukernel_argmax_gfx942.mlir"
"config_ukernel_multi_mma_gfx942.mlir"
"default_tuning_specs_amdgpu.mlir"
"lowering_strategy_from_tuning_spec.mlir"
"ukernel_pipeline_transform.mlir"
29 changes: 29 additions & 0 deletions compiler/plugins/target/ROCM/test/config_ukernel_multi_mma_gfx942.mlir
@@ -0,0 +1,29 @@
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s

func.func @multi_mma_mfma_i32_16x16x32_i8(%a : tensor<1x2x8x4x16x2x8xi8>,
%b : tensor<1x2x4x2x4x16x2x8xi8>,
%c : tensor<1x1x8x4x2x4x16x4xi32>)
-> tensor<1x1x8x4x2x4x16x4xi32> attributes {
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "multi_mma"}>
} {
%d = iree_gpu.multi_mma %a, %b, %c {indexing_maps = [
affine_map<(d0, d1, d2) -> (d0, d2)>,
affine_map<(d0, d1, d2) -> (d1, d2)>,
affine_map<(d0, d1, d2) -> (d0, d1)>
], iterator_types = [
#iree_gpu.iterator_type<parallel>,
#iree_gpu.iterator_type<parallel>,
#iree_gpu.iterator_type<reduction>
], kind = #iree_gpu.data_tiled_mma_layout<
intrinsic = MFMA_I32_16x16x32_I8,
unroll_m = 8, unroll_n = 2, subgroups_n = 4, unroll_k = 2
>} : tensor<1x2x8x4x16x2x8xi8>, tensor<1x2x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
return %d : tensor<1x1x8x4x2x4x16x4xi32>
}

// CHECK-LABEL: @multi_mma_mfma_i32_16x16x32_i8
// CHECK: iree_gpu.multi_mma
// CHECK-SAME: #hal.executable.object<{path = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
// CHECK-NOT: promote_operands
// CHECK-SAME: reduction = [0, 0, 0]
// CHECK-SAME: #iree_gpu.ukernel_config<name = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4"
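One way to see that this test's data-tiled shapes are consistent with the kernel: they flatten to the 128x128x128 (M x N x K) tile implied by the 16x16x32 intrinsic with unroll_m=8, unroll_n=2, subgroups_n=4, unroll_k=2 and the outer k=2 dimension. A sanity check of the element counts (not asserted anywhere in the commit):

```cpp
// M = 8 * 16 = 128, N = 4 * 2 * 16 = 128, K = 2 * 2 * 32 = 128.
static_assert(1 * 2 * 8 * 4 * 16 * 2 * 8 == 128 * 128, "A has M*K i8 elements");
static_assert(1 * 2 * 4 * 2 * 4 * 16 * 2 * 8 == 128 * 128, "B has N*K i8 elements");
static_assert(1 * 1 * 8 * 4 * 2 * 4 * 16 * 4 == 128 * 128, "C has M*N i32 elements");
```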
compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -33,10 +33,9 @@ namespace mlir::iree_compiler::IREE::GPU {

constexpr int64_t kCacheLineSizeBits = 128 * 8;

LogicalResult
setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
mlir::FunctionOpInterface entryPoint,
Operation *op) {
LogicalResult setDataTiledMultiMmaLoweringConfig(
IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
Operation *op, IREE::GPU::UKernelConfigAttr ukernelConfig) {
auto multiMmaOp = dyn_cast<IREE::GPU::MultiMmaOp>(op);
if (!multiMmaOp) {
return failure();
Expand Down Expand Up @@ -70,7 +69,7 @@ setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
SmallVector<int64_t> reductionTileSizes(iterationRank, 0);
for (int64_t kDim : contractionDims.k) {
workgroupTileSizes[kDim] = 0;
reductionTileSizes[kDim] = 1;
reductionTileSizes[kDim] = ukernelConfig ? 0 : 1;
}

// Set tile sizes.
@@ -81,8 +80,16 @@ setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
b.getI64ArrayAttr(workgroupTileSizes));
attrs.emplace_back(b.getStringAttr("reduction"),
b.getI64ArrayAttr(reductionTileSizes));
// Promote operands to use shared memory for LHS and RHS.
GPU::setPromotedOperandList(context, attrs, {0, 1});
if (ukernelConfig) {
attrs.emplace_back(b.getStringAttr("ukernel"), ukernelConfig);
} else {
// Promote operands to use shared memory for LHS and RHS.
// Don't do that with ukernels: their untiled reduction dimension is too
// large to fit in shared memory, so they just want global memory and they
// will take care of moving small chunks at a time into a shared memory
// operand that will be created together with the ukernel op.
GPU::setPromotedOperandList(context, attrs, {0, 1});
}
auto configDict = b.getDictionaryAttr(attrs);
auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict);

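To make the `reductionTileSizes` change concrete: for the rank-3 (m, n, k) iteration space of the test above, the two paths produce the arrays below. This is a simplified restatement of the diff's logic, not IREE code:

```cpp
#include <array>
#include <cstdint>

// kDim = 2 is the single contraction (K) dimension of the (m, n, k) space.
std::array<int64_t, 3> reductionTileSizes(bool hasUkernelConfig) {
  std::array<int64_t, 3> tiles = {0, 0, 0};
  constexpr int kDim = 2;
  // With a ukernel, the whole K loop runs inside the ukernel, so K is not
  // tiled for reduction (0); otherwise K is tiled by 1 as before.
  tiles[kDim] = hasUkernelConfig ? 0 : 1;
  return tiles;
}
// reductionTileSizes(true)  -> {0, 0, 0}  // the test's "reduction = [0, 0, 0]"
// reductionTileSizes(false) -> {0, 0, 1}
```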
compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h
@@ -16,10 +16,9 @@ namespace mlir::iree_compiler::IREE::GPU {

/// Helper for setting up a data tiled multi_mma config based on the specified
/// target.
LogicalResult
setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
mlir::FunctionOpInterface entryPoint,
Operation *op);
LogicalResult setDataTiledMultiMmaLoweringConfig(
IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
Operation *op, IREE::GPU::UKernelConfigAttr ukernelConfig);

/// Helper for setting up a convolution config using IGEMM based on the
/// specified target.
33 changes: 8 additions & 25 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -2099,15 +2099,9 @@ static LogicalResult setTransposeConfig(mlir::FunctionOpInterface entryPoint,
/// Set the configuration for argmax when ukernels are enabled.
/// Distribute all parallel dim across different workgroups, and only use single
/// subgroup per workgroup.
static LogicalResult
setArgmaxUkernelConfig(IREE::GPU::TargetAttr target,
mlir::FunctionOpInterface entryPoint,
linalg::GenericOp op) {
IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(op);
if (!ukernelConfig) {
return failure();
}

static LogicalResult setArgmaxUkernelConfig(
IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
linalg::GenericOp op, IREE::GPU::UKernelConfigAttr ukernelConfig) {
SmallVector<unsigned> parallelDims;
SmallVector<unsigned> reductionDims;
op.getParallelDims(parallelDims);
@@ -2170,15 +2164,6 @@ setArgmaxUkernelConfig(IREE::GPU::TargetAttr target,
return success();
}

/// Make UKernels take the LLVMGPUDefault lowering pipeline.
static LogicalResult
setUKernelConfig(mlir::FunctionOpInterface entryPoint,
IREE::Codegen::UKernelOpInterface ukernelOp) {
auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
entryPoint->getContext(), CodeGenPipeline::LLVMGPUDefault);
return setTranslationInfo(entryPoint, translationInfo);
}

/// Decides the tiling and distribution parameters for one convolution
/// dimension. Returns true if we can succesfully deduce.
///
@@ -2358,13 +2343,14 @@ static LogicalResult setConvolutionConfig(
static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
mlir::FunctionOpInterface entryPointFn,
Operation *computeOp) {
IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(computeOp);
LLVM_DEBUG({
DBGS() << "Selecting root config for: ";
computeOp->print(llvm::dbgs(), OpPrintingFlags().skipRegions());
llvm::dbgs() << "\n";
});
if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn,
computeOp))) {
computeOp, ukernelConfig))) {
LDBG("Tile and fuse data tiled multi_mma config");
return success();
}
@@ -2410,8 +2396,9 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
if (genericOp && succeeded(setTransposeConfig(entryPointFn, genericOp))) {
LDBG("Transpose Config");
return success();
} else if (genericOp && succeeded(setArgmaxUkernelConfig(
target, entryPointFn, genericOp))) {
} else if (genericOp && ukernelConfig &&
succeeded(setArgmaxUkernelConfig(target, entryPointFn, genericOp,
ukernelConfig))) {
LDBG("Argmax Ukernel Config");
return success();
}
@@ -2435,10 +2422,6 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
LDBG("Pack Config");
return setPackConfig(target, entryPointFn, packOp);
})
.Case<IREE::Codegen::UKernelOpInterface>([&](auto ukernelOp) {
LDBG("Ukernel Config");
return setUKernelConfig(entryPointFn, ukernelOp);
})
.Case<IREE::LinalgExt::CustomOp>([&](auto customOp) {
LDBG("CustomOp Config");
return setDefaultCustomOpLoweringConfig(entryPointFn, customOp,
compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp
@@ -6,6 +6,7 @@

#include "iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.h"

#include "compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
@@ -272,8 +273,9 @@ setWarpReductionConfig(IREE::GPU::TargetAttr target,
static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
mlir::FunctionOpInterface entryPointFn,
Operation *computeOp) {
IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(computeOp);
if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn,
computeOp))) {
computeOp, ukernelConfig))) {
return success();
}
if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) {
compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp
@@ -5,6 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Utils/EmbeddedDataDirectory.h"
@@ -18,8 +19,13 @@ namespace mlir::iree_compiler {

namespace {

struct UKernelNameAndSuffix {
std::string name;
std::string suffix;
};

// Returns ukernel name and suffix for argmax. Empty name = no ukernel.
static std::tuple<std::string, std::string>
static UKernelNameAndSuffix
getUKernelNameAndSuffixForArgmax(linalg::GenericOp op) {
Value input = op.getDpsInputOperand(0)->get();
auto inputType = cast<ShapedType>(input.getType());
@@ -29,13 +35,34 @@ getUKernelNameAndSuffixForArgmax(linalg::GenericOp op) {
indexType.getElementType())};
}

// Returns ukernel name and suffix for multi_mma. Empty name = no ukernel.
static UKernelNameAndSuffix
getUKernelNameAndSuffixForMultiMma(IREE::GPU::MultiMmaOp op) {
auto mma = dyn_cast<IREE::GPU::DataTiledMMAAttr>(op.getKind());
if (!mma) {
return {}; // Only handling DataTiledMMAAttr for now.
}
std::string suffix{
stringifyMMAIntrinsic(mma.getIntrinsic().getValue()).lower()};
if (mma.getUnrollM() != 1 || mma.getUnrollN() != 1 || mma.getUnrollK() != 1) {
suffix += llvm::formatv("_unroll{}x{}x{}", mma.getUnrollM(),
mma.getUnrollN(), mma.getUnrollK());
}
if (mma.getSubgroupsM() != 1 || mma.getSubgroupsN() != 1) {
suffix += llvm::formatv("_subgroups{}x{}", mma.getSubgroupsM(),
mma.getSubgroupsN());
}
return {"multi_mma", suffix};
}

// Returns ukernel name and suffix for any op. Empty name = no ukernel.
static std::tuple<std::string, std::string>
getUKernelNameAndSuffix(Operation *op) {
static UKernelNameAndSuffix getUKernelNameAndSuffix(Operation *op) {
if (auto genericOp = dyn_cast<linalg::GenericOp>(op)) {
if (succeeded(isArgmaxOp(genericOp))) {
return getUKernelNameAndSuffixForArgmax(genericOp);
}
} else if (auto multiMmaOp = dyn_cast<IREE::GPU::MultiMmaOp>(op)) {
return getUKernelNameAndSuffixForMultiMma(multiMmaOp);
}
return {};
}
@@ -44,7 +71,7 @@ getUKernelNameAndSuffix(Operation *op) {
static IREE::GPU::UKernelConfigAttr getUKernelConfig(Operation *op) {
MLIRContext *context = op->getContext();
auto [name, suffix] = getUKernelNameAndSuffix(op);
if (name.empty() || suffix.empty()) {
if (name.empty()) {
return {};
}
auto target = IREE::HAL::ExecutableTargetAttr::lookup(op);
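Tracing the suffix construction for the attributes in the lit test: the snippet below replays it with hard-coded strings (the final `iree_uk_amdgpu_` prefix and `.gfx942.bc` assembly happen outside this hunk, so that part is inferred from the test's CHECK lines):

```cpp
#include <iostream>
#include <string>

int main() {
  // stringifyMMAIntrinsic(MFMA_I32_16x16x32_I8).lower():
  std::string suffix = "mfma_i32_16x16x32_i8";
  suffix += "_unroll8x2x2";   // appended since unroll_m/n/k = 8/2/2, not all 1
  suffix += "_subgroups1x4";  // appended since subgroups_m/n = 1/4, not both 1
  // Matches the bitcode path checked by the lit test:
  std::cout << "iree_uk_amdgpu_multi_mma_" << suffix << ".gfx942.bc\n";
  return 0;
}
```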
