Skip to content

Commit

Permalink
[Codegen][GPU] Add a pass for basic distribution verification (#18236)
Browse files Browse the repository at this point in the history
After various levels of tiling to warps/threads and then bufferization
in GPU codegen pipelines, but before resolving distributed loops like
`scf.forall` ops, we have an opportunity for additional verification
that all operations were properly mapped to threads. In particular, any
operation that vectorized/bufferized to an operation with a write effect
must now be within a *thread* distributed context or else there is
almost certainly a write race. Such cases mean something went wrong in
earlier passes and indicate a compiler failure.

Note: this is only added for the LLVMGPUTileAndFuse pipeline because
other pipelines allow for write effecting ops like
`memref.copy` to persist past `scf.forall` resolution.
  • Loading branch information
qedawkins authored Aug 15, 2024
1 parent b6602e8 commit 78f54c2
Show file tree
Hide file tree
Showing 8 changed files with 130 additions and 0 deletions.
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ iree_compiler_cc_library(
"GPUTileReduction.cpp",
"GPUVectorAlloc.cpp",
"GPUVectorDistribution.cpp",
"GPUVerifyDistribution.cpp",
"Passes.cpp",
"VectorReductionToGPU.cpp",
"WorkgroupReordering.cpp",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ iree_cc_library(
"GPUTileReduction.cpp"
"GPUVectorAlloc.cpp"
"GPUVectorDistribution.cpp"
"GPUVerifyDistribution.cpp"
"Passes.cpp"
"VectorReductionToGPU.cpp"
"WorkgroupReordering.cpp"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/Common/GPU/Passes.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/Visitors.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"

namespace mlir::iree_compiler {

#define GEN_PASS_DEF_GPUVERIFYDISTRIBUTIONPASS
#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"

namespace {

/// Returns true when `forallOp` has a non-empty mapping attribute whose first
/// entry is an instance of one of `Type...`. Only the first mapping entry is
/// inspected. A missing or empty mapping yields false.
template <typename... Type>
bool forallOpHasMappingType(scf::ForallOp forallOp) {
  std::optional<ArrayAttr> maybeMapping = forallOp.getMapping();
  return maybeMapping.has_value() && !maybeMapping->empty() &&
         isa<Type...>(*maybeMapping->begin());
}

/// Returns true when any `scf.forall` ancestor of `op` has a mapping whose
/// first entry is an instance of one of `Type...`. `op` itself is never
/// considered, only the ops that enclose it.
template <typename... Type>
bool operationHasParentForallOfMappingType(Operation *op) {
  // Walk outwards through every enclosing scf.forall until one matches.
  for (auto enclosing = op->getParentOfType<scf::ForallOp>(); enclosing;
       enclosing = enclosing->getParentOfType<scf::ForallOp>()) {
    if (forallOpHasMappingType<Type...>(enclosing)) {
      return true;
    }
  }
  return false;
}

/// Pass to verify that writes only happen in distributed contexts. Code in
/// shared contexts is executed uniformly across all threads after resolution
/// of distributed contexts (i.e. scf.forall), thus operations with write
/// memory effects outside of a thread or lane distributed context are
/// inherently racy.
///
/// The pass walks the function and fails on the first violation of:
///   * every `scf.forall` must carry a non-empty mapping attribute;
///   * a lane mapped `scf.forall` must be nested in a warp mapped
///     `scf.forall`;
///   * every op reporting a write memory effect must be nested in a thread or
///     lane mapped `scf.forall`.
struct GPUVerifyDistributionPass final
    : impl::GPUVerifyDistributionPassBase<GPUVerifyDistributionPass> {

  void runOnOperation() override {
    FunctionOpInterface funcOp = getOperation();

    WalkResult walkResult = funcOp.walk([](Operation *op) -> WalkResult {
      // Distributed loops: validate the mapping itself and its nesting.
      if (auto forallOp = dyn_cast<scf::ForallOp>(op)) {
        std::optional<ArrayAttr> maybeMapping = forallOp.getMapping();
        if (!maybeMapping.has_value() || maybeMapping->empty()) {
          forallOp->emitOpError("requires a mapping attribute.");
          return WalkResult::interrupt();
        }

        // Only the first mapping entry decides the kind of distribution.
        if (isa<IREE::GPU::LaneIdAttr>(*maybeMapping->begin())) {
          bool hasWarpParent = operationHasParentForallOfMappingType<
              mlir::gpu::GPUWarpMappingAttr>(forallOp);
          if (!hasWarpParent) {
            forallOp->emitOpError("lane distributed scf.forall must have a "
                                  "parent subgroup distributed loop.");
            return WalkResult::interrupt();
          }
        }
        return WalkResult::advance();
      }

      // Everything else: only ops that report a write effect are of interest.
      auto effectOp = dyn_cast<MemoryEffectOpInterface>(op);
      if (!effectOp || !effectOp.hasEffect<MemoryEffects::Write>()) {
        return WalkResult::advance();
      }

      // A write is fine as long as some enclosing scf.forall is mapped to
      // threads or lanes.
      if (operationHasParentForallOfMappingType<mlir::gpu::GPUThreadMappingAttr,
                                                IREE::GPU::LaneIdAttr>(op)) {
        return WalkResult::advance();
      }

      op->emitOpError("write affecting operations are restricted to lane "
                      "or thread distributed contexts.");
      return WalkResult::interrupt();
    });

    if (walkResult.wasInterrupted()) {
      signalPassFailure();
    }
  }
};

} // namespace

} // namespace mlir::iree_compiler
5 changes: 5 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,11 @@ def GPUTileReductionPass :
let dependentDialects = ["::mlir::scf::SCFDialect"];
}

// Verification-only pass declared on `mlir::FunctionOpInterface` and exposed
// under the `iree-codegen-gpu-verify-distribution` flag. It declares no
// dependent dialects.
def GPUVerifyDistributionPass :
InterfacePass<"iree-codegen-gpu-verify-distribution", "mlir::FunctionOpInterface"> {
let summary = "Pass to verify writes before resolving distributed contexts.";
}

def GPUVectorAllocPass :
InterfacePass<"iree-codegen-gpu-vector-alloc", "mlir::FunctionOpInterface"> {
let summary = "Pass to create allocations for contraction inputs to copy "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ iree_lit_test_suite(
"gpu_tile_reduction.mlir",
"gpu_vector_alloc.mlir",
"gpu_vector_distribution.mlir",
"gpu_verify_distribution.mlir",
"reduce_bank_conflicts.mlir",
"transform_gpu_distribute_shared_memory.mlir",
"transform_gpu_reduce_bank_conflicts.mlir",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ iree_lit_test_suite(
"gpu_tile_reduction.mlir"
"gpu_vector_alloc.mlir"
"gpu_vector_distribution.mlir"
"gpu_verify_distribution.mlir"
"reduce_bank_conflicts.mlir"
"transform_gpu_distribute_shared_memory.mlir"
"transform_gpu_reduce_bank_conflicts.mlir"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// RUN: iree-opt %s --split-input-file --verify-diagnostics \
// RUN: --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-verify-distribution))"

// An scf.forall without any mapping attribute is expected to be diagnosed.
func.func @unmapped_forall(%out : memref<32xi32>) {
// expected-error @+1 {{requires a mapping attribute}}
scf.forall (%arg0) in (32) {
}
return
}

// -----

// A store nested only in a warp mapped scf.forall, with no thread or lane
// mapped ancestor, is expected to be diagnosed.
func.func @write_in_warp_forall(%out : memref<32xi32>) {
%c0 = arith.constant 0 : i32
scf.forall (%arg0) in (32) {
// expected-error@+1 {{write affecting operations are restricted to lane or thread distributed contexts}}
memref.store %c0, %out[%arg0] : memref<32xi32>
} {mapping = [#gpu.warp<x>]}
return
}

// -----

// A lane mapped scf.forall that is not nested inside a warp mapped scf.forall
// is expected to be diagnosed.
func.func @lane_forall_no_warp_parent(%out : memref<32xi32>) {
// expected-error@+1 {{lane distributed scf.forall must have a parent subgroup distributed loop}}
scf.forall (%arg0) in (32) {
} {mapping = [#iree_gpu.lane_id<0>]}
return
}
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,7 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager) {
addBufferizePasses(funcPassManager, /*allowPrivateAllocations=*/false);

// Step 8. Resolve remaining parallel loops.
funcPassManager.addPass(createGPUVerifyDistributionPass());
funcPassManager.addPass(createGPUDistributePass());

// Vectorize copies that came out of bufferization.
Expand Down

0 comments on commit 78f54c2

Please sign in to comment.