diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
index da454087781c..4a0b879b94ee 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel
@@ -72,6 +72,7 @@ iree_compiler_cc_library(
         "GPUTileReduction.cpp",
         "GPUVectorAlloc.cpp",
         "GPUVectorDistribution.cpp",
+        "GPUVerifyDistribution.cpp",
         "Passes.cpp",
         "VectorReductionToGPU.cpp",
         "WorkgroupReordering.cpp",
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
index e22fa0306556..eb51b3ec2408 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt
@@ -70,6 +70,7 @@ iree_cc_library(
     "GPUTileReduction.cpp"
     "GPUVectorAlloc.cpp"
     "GPUVectorDistribution.cpp"
+    "GPUVerifyDistribution.cpp"
     "Passes.cpp"
     "VectorReductionToGPU.cpp"
     "WorkgroupReordering.cpp"
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVerifyDistribution.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVerifyDistribution.cpp
new file mode 100644
index 000000000000..273cadfff5a8
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVerifyDistribution.cpp
@@ -0,0 +1,102 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/Common/GPU/Passes.h"
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/IR/Visitors.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+
+namespace mlir::iree_compiler {
+
+#define GEN_PASS_DEF_GPUVERIFYDISTRIBUTIONPASS
+#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"
+
+namespace {
+
+/// Returns true when `forallOp` carries a non-empty mapping whose first
+/// attribute is one of `Type...`. Only the first mapping entry is inspected.
+template <typename... Type>
+bool forallOpHasMappingType(scf::ForallOp forallOp) {
+  std::optional<ArrayAttr> mapping = forallOp.getMapping();
+  if (!mapping || mapping.value().empty()) {
+    return false;
+  }
+
+  return isa<Type...>(*mapping.value().begin());
+}
+
+/// Returns true when some scf.forall enclosing `op`, at any nesting depth, has
+/// a mapping whose first attribute is one of `Type...`.
+template <typename... Type>
+bool operationHasParentForallOfMappingType(Operation *op) {
+  auto parentForallOp = op->getParentOfType<scf::ForallOp>();
+  while (parentForallOp) {
+    if (forallOpHasMappingType<Type...>(parentForallOp)) {
+      return true;
+    }
+    parentForallOp = parentForallOp->getParentOfType<scf::ForallOp>();
+  }
+  return false;
+}
+
+/// Pass to verify that writes only happen in distributed contexts. Code in
+/// shared contexts is executed uniformly across all threads after resolution
+/// of distributed contexts (i.e. scf.forall), thus operations with write
+/// memory effects are inherently unsafe outside of a distributed context.
+///
+/// The walk stops at the first op that violates one of these rules:
+///   - every scf.forall must carry a non-empty mapping attribute;
+///   - a lane mapped scf.forall must be nested in a warp mapped scf.forall;
+///   - an op with a write memory effect must be nested in a thread or lane
+///     mapped scf.forall.
+struct GPUVerifyDistributionPass final
+    : impl::GPUVerifyDistributionPassBase<GPUVerifyDistributionPass> {
+
+  void runOnOperation() override {
+    FunctionOpInterface funcOp = getOperation();
+
+    WalkResult res = funcOp.walk([](Operation *op) {
+      if (auto forallOp = dyn_cast<scf::ForallOp>(op)) {
+        std::optional<ArrayAttr> mapping = forallOp.getMapping();
+        if (!mapping || mapping.value().empty()) {
+          forallOp->emitOpError("requires a mapping attribute.");
+          return WalkResult::interrupt();
+        }
+
+        // Lane mapped loops must be nested in a warp (subgroup) mapped loop.
+        if (isa<IREE::GPU::LaneIdAttr>(*mapping.value().begin()) &&
+            !operationHasParentForallOfMappingType<
+                mlir::gpu::GPUWarpMappingAttr>(forallOp)) {
+          forallOp->emitOpError("lane distributed scf.forall must have a "
+                                "parent subgroup distributed loop.");
+          return WalkResult::interrupt();
+        }
+        return WalkResult::advance();
+      }
+      if (auto memoryEffectOp = dyn_cast<MemoryEffectOpInterface>(op)) {
+        if (memoryEffectOp.hasEffect<MemoryEffects::Write>() &&
+            !operationHasParentForallOfMappingType<
+                mlir::gpu::GPUThreadMappingAttr, IREE::GPU::LaneIdAttr>(op)) {
+          op->emitOpError("write affecting operations are restricted to lane "
+                          "or thread distributed contexts.");
+          return WalkResult::interrupt();
+        }
+      }
+      return WalkResult::advance();
+    });
+
+    if (res.wasInterrupted()) {
+      return signalPassFailure();
+    }
+  }
+};
+
+} // namespace
+
+} // namespace mlir::iree_compiler
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
index 36507fdd0a41..cec8ba43a030 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
@@ -184,6 +184,11 @@ def GPUTileReductionPass :
   let dependentDialects = ["::mlir::scf::SCFDialect"];
 }
 
+def GPUVerifyDistributionPass :
+    InterfacePass<"iree-codegen-gpu-verify-distribution", "mlir::FunctionOpInterface"> {
+  let summary = "Pass to verify writes before resolving distributed contexts.";
+}
+
 def GPUVectorAllocPass :
     InterfacePass<"iree-codegen-gpu-vector-alloc", "mlir::FunctionOpInterface"> {
   let summary = "Pass to create allocations for contraction inputs to copy "
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
index 14dbfda1848b..5854bd5ca932 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
@@ -38,6 +38,7 @@ iree_lit_test_suite(
             "gpu_tile_reduction.mlir",
             "gpu_vector_alloc.mlir",
             "gpu_vector_distribution.mlir",
+            "gpu_verify_distribution.mlir",
             "reduce_bank_conflicts.mlir",
             "transform_gpu_distribute_shared_memory.mlir",
             "transform_gpu_reduce_bank_conflicts.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
index 9ccc268d0d79..a61138be693a 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
@@ -34,6 +34,7 @@ iree_lit_test_suite(
     "gpu_tile_reduction.mlir"
     "gpu_vector_alloc.mlir"
     "gpu_vector_distribution.mlir"
+    "gpu_verify_distribution.mlir"
     "reduce_bank_conflicts.mlir"
     "transform_gpu_distribute_shared_memory.mlir"
     "transform_gpu_reduce_bank_conflicts.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_verify_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_verify_distribution.mlir
new file mode 100644
index 000000000000..cf65a02d05c0
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_verify_distribution.mlir
@@ -0,0 +1,32 @@
+// RUN: iree-opt %s --split-input-file --verify-diagnostics \
+// RUN:   --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-verify-distribution))"
+
+// An scf.forall without a mapping attribute is rejected.
+func.func @unmapped_forall(%out : memref<32xi32>) {
+  // expected-error @+1 {{requires a mapping attribute}}
+  scf.forall (%arg0) in (32) {
+  }
+  return
+}
+
+// -----
+
+// A write nested only in a warp mapped scf.forall is rejected.
+func.func @write_in_warp_forall(%out : memref<32xi32>) {
+  %c0 = arith.constant 0 : i32
+  scf.forall (%arg0) in (32) {
+    // expected-error@+1 {{write affecting operations are restricted to lane or thread distributed contexts}}
+    memref.store %c0, %out[%arg0] : memref<32xi32>
+  } {mapping = [#gpu.warp<linear_dim_0>]}
+  return
+}
+
+// -----
+
+// A lane mapped scf.forall without an enclosing warp mapped loop is rejected.
+func.func @lane_forall_no_warp_parent(%out : memref<32xi32>) {
+  // expected-error@+1 {{lane distributed scf.forall must have a parent subgroup distributed loop}}
+  scf.forall (%arg0) in (32) {
+  } {mapping = [#iree_gpu.lane_id<0>]}
+  return
+}
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 02d03428dad8..0fd672b97c49 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -382,6 +382,7 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager) {
   addBufferizePasses(funcPassManager, /*allowPrivateAllocations=*/false);
 
   // Step 8. Resolve remaining parallel loops.
+  funcPassManager.addPass(createGPUVerifyDistributionPass());
   funcPassManager.addPass(createGPUDistributePass());
 
   // Vectorize copies that came out of bufferization.