Skip to content

Commit

Permalink
[NFC] GPU ukernels cleanups (#19503)
Browse files Browse the repository at this point in the history
1. Rename `UKernelSpec` to `UKernelConfig`. I was grappling for the
right word, but now that it's part of `LoweringConfig`, it's clearer.
2. Drop unused `KernelConfig` case for ukernel ops. The lowering to
ukernel ops happens after `KernelConfig`.
3. To stringify types, instead of using a stringstream, we can actually
just use `llvm::formatv`.
4. Reorganize LLVMGPUSelectUKernels.cpp to make it easier to add logic
for other ukernels.

Signed-off-by: Benoit Jacob <jacob.benoit.1@gmail.com>
  • Loading branch information
bjacob authored Dec 17, 2024
1 parent 1894af3 commit a5cf548
Show file tree
Hide file tree
Showing 9 changed files with 90 additions and 66 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ func.func @argmax_2d_f32i64(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes
// CHECK: linalg.generic
// CHECK-SAME: hal.executable.objects = [
// CHECK-SAME: #hal.executable.object<{path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc", data = dense_resource<iree_uk_amdgpu_argmax_f32i64.gfx942.bc> : vector<{{[0-9]+}}xi8>}>]
// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_spec<name = "iree_uk_amdgpu_argmax_f32i64", def_attrs = {vm.import.module = "rocm"}>
// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_config<name = "iree_uk_amdgpu_argmax_f32i64", def_attrs = {vm.import.module = "rocm"}>

// -----

Expand Down Expand Up @@ -54,7 +54,7 @@ func.func @argmax_4d_unit_parallel_f32i64(%arg0 : tensor<1x1x1x?xf32>) -> tensor
// CHECK: linalg.generic
// CHECK-SAME: hal.executable.objects = [
// CHECK-SAME: #hal.executable.object<{path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc", data = dense_resource<iree_uk_amdgpu_argmax_f32i64.gfx942.bc> : vector<{{[0-9]+}}xi8>}>]
// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_spec<name = "iree_uk_amdgpu_argmax_f32i64", def_attrs = {vm.import.module = "rocm"}>
// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_config<name = "iree_uk_amdgpu_argmax_f32i64", def_attrs = {vm.import.module = "rocm"}>

// -----

Expand Down Expand Up @@ -82,7 +82,7 @@ func.func @argmax_none_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1xi64>
// CHECK-LABEL: func @argmax_none_ukernel_enabled(
// CHECK: linalg.generic
// CHECK-NOT: hal.executable.objects
// CHECK-NOT: iree_gpu.ukernel_spec
// CHECK-NOT: iree_gpu.ukernel_config

// -----

Expand Down Expand Up @@ -111,7 +111,7 @@ func.func @argmax_only_argmax_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor
// CHECK: linalg.generic
// CHECK-SAME: hal.executable.objects = [
// CHECK-SAME: #hal.executable.object<{path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc", data = dense_resource<iree_uk_amdgpu_argmax_f32i64.gfx942.bc> : vector<{{[0-9]+}}xi8>}>]
// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_spec<name = "iree_uk_amdgpu_argmax_f32i64", def_attrs = {vm.import.module = "rocm"}>
// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_config<name = "iree_uk_amdgpu_argmax_f32i64", def_attrs = {vm.import.module = "rocm"}>

// -----

Expand Down Expand Up @@ -140,7 +140,7 @@ func.func @argmax_only_foo_argmax_bar_ukernel_enabled(%arg0 : tensor<1x?xf32>) -
// CHECK: linalg.generic
// CHECK-SAME: hal.executable.objects = [
// CHECK-SAME: #hal.executable.object<{path = "iree_uk_amdgpu_argmax_f32i64.gfx942.bc", data = dense_resource<iree_uk_amdgpu_argmax_f32i64.gfx942.bc> : vector<{{[0-9]+}}xi8>}>]
// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_spec<name = "iree_uk_amdgpu_argmax_f32i64", def_attrs = {vm.import.module = "rocm"}>
// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_config<name = "iree_uk_amdgpu_argmax_f32i64", def_attrs = {vm.import.module = "rocm"}>

// -----

Expand Down Expand Up @@ -168,7 +168,7 @@ func.func @argmax_only_foo_ukernel_enabled(%arg0 : tensor<1x?xf32>) -> tensor<1x
// CHECK-LABEL: func @argmax_only_foo_ukernel_enabled(
// CHECK: linalg.generic
// CHECK-NOT: hal.executable.objects
// CHECK-NOT: iree_gpu.ukernel_spec
// CHECK-NOT: iree_gpu.ukernel_config

// -----

Expand Down Expand Up @@ -239,4 +239,4 @@ func.func @argmax_2d_f32i64_custom_bitcode(%arg0 : tensor<1x?xf32>) -> tensor<1x
// CHECK-SAME: data = dense<[66, 67, -64, -34, 1, 35, 69, 103, -119, -85, -51, -17]> : tensor<12xi8>
// CHECK-SAME: }>
// CHECK-SAME: ]
// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_spec<name = "iree_uk_amdgpu_argmax_f32i64", def_attrs = {vm.import.module = "rocm"}>
// CHECK-SAME: #iree_gpu.lowering_config<{{.*}}ukernel = #iree_gpu.ukernel_config<name = "iree_uk_amdgpu_argmax_f32i64", def_attrs = {vm.import.module = "rocm"}>
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ matchArgmaxDAGForUKernel(RewriterBase &rewriter, linalg::GenericOp op) {
if (!loweringConfig) {
return rewriter.notifyMatchFailure(op, "no lowering_config on this op");
}
IREE::GPU::UKernelSpecAttr ukernelAttr =
IREE::GPU::UKernelConfigAttr ukernelAttr =
IREE::GPU::getUkernelSpec(loweringConfig);
if (!ukernelAttr) {
return rewriter.notifyMatchFailure(op, "no ukernel selected for this op");
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-lower-to-ukernels,cse,canonicalize))" %s | FileCheck %s

#config = #iree_gpu.lowering_config<{ukernel = #iree_gpu.ukernel_spec<name = "some_ukernel", def_attrs = {vm.import.module = "rocm"}>}>
#config = #iree_gpu.lowering_config<{ukernel = #iree_gpu.ukernel_config<name = "some_ukernel", def_attrs = {vm.import.module = "rocm"}>}>
func.func @argmax_f32i64_with_selected_ukernel(%arg0 : tensor<1x?xf32>) -> tensor<1xi64> attributes {
hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "all"}>
} {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,9 +145,9 @@ std::optional<SmallVector<int64_t>> getPaddingList(LoweringConfigAttr config) {
return getIntegerVector(array);
}

IREE::GPU::UKernelSpecAttr
IREE::GPU::UKernelConfigAttr
getUkernelSpec(IREE::GPU::LoweringConfigAttr config) {
return config.getAttributes().getAs<IREE::GPU::UKernelSpecAttr>("ukernel");
return config.getAttributes().getAs<IREE::GPU::UKernelConfigAttr>("ukernel");
}

} // namespace mlir::iree_compiler::IREE::GPU
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ void setPromotedOperandList(MLIRContext *context,
/// Helper to retrieve list of operand to pad.
std::optional<SmallVector<int64_t>> getPaddingList(LoweringConfigAttr config);

IREE::GPU::UKernelSpecAttr getUkernelSpec(IREE::GPU::LoweringConfigAttr config);
IREE::GPU::UKernelConfigAttr
getUkernelSpec(IREE::GPU::LoweringConfigAttr config);

} // namespace mlir::iree_compiler::IREE::GPU

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -521,12 +521,12 @@ def IREEGPU_LaneIdAttr : AttrDef<IREEGPU_Dialect, "LaneId", [
}

//===---------------------------------------------------------------------===//
// iree_gpu.ukernel_spec
// iree_gpu.ukernel_config
//===---------------------------------------------------------------------===//

def IREEGPU_UKernelSpecAttr :
AttrDef<IREEGPU_Dialect, "UKernelSpec", []> {
let mnemonic = "ukernel_spec";
def IREEGPU_UKernelConfigAttr :
AttrDef<IREEGPU_Dialect, "UKernelConfig", []> {
let mnemonic = "ukernel_config";
let summary = "An attribute specifying a ukernel that an op can lower to.";
let description = [{
An attribute that can be applied to any operation to specify that it has
Expand Down
9 changes: 3 additions & 6 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2103,14 +2103,11 @@ static LogicalResult
setArgmaxUkernelConfig(IREE::GPU::TargetAttr target,
mlir::FunctionOpInterface entryPoint,
linalg::GenericOp op) {
// Checks if UKernels are enabled.
IREE::GPU::UKernelSpecAttr ukernelSpec = selectUKernelForArgmax(op);
if (!ukernelSpec) {
IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(op);
if (!ukernelConfig) {
return failure();
}

if (failed(isArgmaxOp(op)))
return failure();
SmallVector<unsigned> parallelDims;
SmallVector<unsigned> reductionDims;
op.getParallelDims(parallelDims);
Expand Down Expand Up @@ -2161,7 +2158,7 @@ setArgmaxUkernelConfig(IREE::GPU::TargetAttr target,
b.getI64ArrayAttr(workgroupTileSizes));
attrs.emplace_back(StringAttr::get(context, "reduction"),
b.getI64ArrayAttr(reductionTileSizes));
attrs.emplace_back(StringAttr::get(context, "ukernel"), ukernelSpec);
attrs.emplace_back(StringAttr::get(context, "ukernel"), ukernelConfig);
IREE::GPU::setPromotedOperandList(context, attrs, {0, 1});
auto configDict = DictionaryAttr::get(context, attrs);
auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,49 @@ namespace mlir::iree_compiler {

namespace {

constexpr StringLiteral executableObjectsAttrName = "hal.executable.objects";
// Builds the (name, suffix) pair describing the argmax ukernel for `op`.
// The name is always "argmax". The suffix is the element type of DPS input
// operand 0 immediately followed by the element type of DPS init operand 1,
// both printed through llvm::formatv. Empty name = no ukernel.
static std::tuple<std::string, std::string>
getUKernelNameAndSuffixForArgmax(linalg::GenericOp op) {
  auto inputShapedType =
      cast<ShapedType>(op.getDpsInputOperand(0)->get().getType());
  auto indexShapedType =
      cast<ShapedType>(op.getDpsInitOperand(1)->get().getType());
  // formatv yields a lazily-formatted object; materialize it into a
  // std::string here, while both shaped types are still alive.
  std::string typeSuffix =
      llvm::formatv("{}{}", inputShapedType.getElementType(),
                    indexShapedType.getElementType());
  return {"argmax", std::move(typeSuffix)};
}

// Dispatches on the kind of `op` to find its ukernel (name, suffix) pair.
// Only linalg.generic ops accepted by isArgmaxOp are currently recognized;
// every other op yields a default-constructed tuple, i.e. two empty strings.
// Empty name = no ukernel.
static std::tuple<std::string, std::string>
getUKernelNameAndSuffix(Operation *op) {
  auto genericOp = dyn_cast<linalg::GenericOp>(op);
  // Short-circuit keeps isArgmaxOp from ever seeing a null GenericOp.
  if (!genericOp || failed(isArgmaxOp(genericOp))) {
    return {};
  }
  return getUKernelNameAndSuffixForArgmax(genericOp);
}

// Produces the UKernelConfigAttr to attach to `op`, or a null attribute ({})
// when no ukernel applies. A null attribute is returned when:
//   - getUKernelNameAndSuffix reports an empty name or an empty suffix,
//   - hasUkernel rejects the name for the op's executable target, or
//   - the target is not a ROCM backend (the only backend handled here).
// On ROCM the ukernel is named "iree_uk_amdgpu_<name>_<suffix>" and carries
// def_attrs = {vm.import.module = "rocm"}.
static IREE::GPU::UKernelConfigAttr getUKernelConfig(Operation *op) {
  auto [ukernelName, typeSuffix] = getUKernelNameAndSuffix(op);
  if (ukernelName.empty() || typeSuffix.empty()) {
    return {};
  }

  auto target = IREE::HAL::ExecutableTargetAttr::lookup(op);
  // Evaluated left to right: the backend is only queried once the ukernel is
  // known to be enabled for this target.
  if (!hasUkernel(target, ukernelName) || !isROCMBackend(target)) {
    return {};
  }

  MLIRContext *context = op->getContext();
  auto nameAttr = StringAttr::get(
      context,
      llvm::formatv("iree_uk_amdgpu_{}_{}", ukernelName, typeSuffix));
  auto importModuleKey = StringAttr::get(context, "vm.import.module");
  auto importModuleValue = StringAttr::get(context, "rocm");
  auto defsAttr =
      DictionaryAttr::get(context, {{importModuleKey, importModuleValue}});
  return IREE::GPU::UKernelConfigAttr::get(context, nameAttr, defsAttr);
}

// Returns a ExecutableObjectAttr carrying the bitcode for the given ukernel.
//
Expand Down Expand Up @@ -77,7 +119,8 @@ getUKernelBitcode(MLIRContext *context,
// array attribute. If the parent hal.executable.variant is reached, its objects
// attribute is returned.
// Adapted from ExecutableTargetAttr::lookup.
static ArrayAttr lookUpExecutableObjects(Operation *op) {
static ArrayAttr lookUpExecutableObjects(Operation *op,
StringRef executableObjectsAttrName) {
MLIRContext *context = op->getContext();
auto attrId = StringAttr::get(context, executableObjectsAttrName);
while (op) {
Expand All @@ -97,56 +140,39 @@ static ArrayAttr lookUpExecutableObjects(Operation *op) {
return {};
}

/// Returns the function name and attributes to use for a ukernel with given
/// `name` and `suffix` on the target described by `targetAttr`.
static IREE::GPU::UKernelSpecAttr
getUKernelSpec(StringRef name, StringRef suffix, MLIRContext *context,
IREE::HAL::ExecutableTargetAttr targetAttr) {
if (isROCMBackend(targetAttr)) {
auto nameAttr = StringAttr::get(
context, llvm::formatv("iree_uk_amdgpu_{}_{}", name, suffix));
auto defsAttr = DictionaryAttr::get(
context, {{StringAttr::get(context, "vm.import.module"),
StringAttr::get(context, "rocm")}});
return IREE::GPU::UKernelSpecAttr::get(context, nameAttr, defsAttr);
// Ensures that the op has ukernel bitcode as a hal.executable.object, stored
// as a hal.executable.objects attribute on the op itself, ready to be hoisted
// by the HoistExecutableObjects pass.
// Returns failure if no bitcode was found for the configured ukernel.
static LogicalResult
ensureUKernelBitcode(Operation *op,
IREE::GPU::UKernelConfigAttr ukernelConfig) {
constexpr StringLiteral executableObjectsAttrName = "hal.executable.objects";
auto target = IREE::HAL::ExecutableTargetAttr::lookup(op);
ArrayAttr sourceExecutableObjects =
lookUpExecutableObjects(op, executableObjectsAttrName);
MLIRContext *context = op->getContext();
IREE::HAL::ExecutableObjectAttr bitcodeObject = getUKernelBitcode(
context, target, sourceExecutableObjects, ukernelConfig.getName());
if (!bitcodeObject) {
return failure();
}
return {};
op->setAttr(executableObjectsAttrName,
ArrayAttr::get(context, bitcodeObject));
return success();
}

} // namespace

IREE::GPU::UKernelSpecAttr selectUKernelForArgmax(linalg::GenericOp op) {
if (failed(isArgmaxOp(op))) {
return {};
}
auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(op);
const char ukernelName[] = "argmax";
if (!hasUkernel(targetAttr, ukernelName)) {
return {};
}
Value input = op.getDpsInputOperand(0)->get();
auto inputType = cast<ShapedType>(input.getType());
Value index = op.getDpsInitOperand(1)->get();
auto indexType = cast<ShapedType>(index.getType());
std::string suffix;
llvm::raw_string_ostream(suffix)
<< inputType.getElementType() << indexType.getElementType();
MLIRContext *context = op->getContext();
IREE::GPU::UKernelSpecAttr ukernelSpec =
getUKernelSpec(ukernelName, suffix, context, targetAttr);
if (!ukernelSpec) {
IREE::GPU::UKernelConfigAttr selectUKernel(Operation *op) {
IREE::GPU::UKernelConfigAttr ukernelConfig = getUKernelConfig(op);
if (!ukernelConfig) {
return {};
}
auto execTarget = IREE::HAL::ExecutableTargetAttr::lookup(op);
ArrayAttr sourceExecutableObjects = lookUpExecutableObjects(op);
IREE::HAL::ExecutableObjectAttr bitcodeObject = getUKernelBitcode(
context, execTarget, sourceExecutableObjects, ukernelSpec.getName());
if (!bitcodeObject) {
if (failed(ensureUKernelBitcode(op, ukernelConfig))) {
return {};
}
op->setAttr(executableObjectsAttrName,
ArrayAttr::get(context, bitcodeObject));
return ukernelSpec;
return ukernelConfig;
}

} // namespace mlir::iree_compiler
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@

namespace mlir::iree_compiler {

IREE::GPU::UKernelSpecAttr selectUKernelForArgmax(linalg::GenericOp op);
IREE::GPU::UKernelConfigAttr selectUKernel(Operation *op);

} // namespace mlir::iree_compiler

0 comments on commit a5cf548

Please sign in to comment.