diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp index 42043adc37b715..74ce6a9fc4ac08 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp @@ -34,19 +34,18 @@ void NVPTXSubtarget::anchor() {} NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { - // Provide the default CPU if we don't have one. - TargetName = std::string(CPU.empty() ? "sm_30" : CPU); + TargetName = std::string(CPU); - ParseSubtargetFeatures(TargetName, /*TuneCPU*/ TargetName, FS); + ParseSubtargetFeatures(getTargetName(), /*TuneCPU=*/getTargetName(), FS); - // Re-map SM version numbers, SmVersion carries the regular SMs which do - // have relative order, while FullSmVersion allows distinguishing sm_90 from - // sm_90a, which would *not* be a subset of sm_91. - SmVersion = getSmVersion(); + // Re-map SM version numbers, SmVersion carries the regular SMs which do + // have relative order, while FullSmVersion allows distinguishing sm_90 from + // sm_90a, which would *not* be a subset of sm_91. + SmVersion = getSmVersion(); - // Set default to PTX 6.0 (CUDA 9.0) - if (PTXVersion == 0) { - PTXVersion = 60; + // Set default to PTX 6.0 (CUDA 9.0) + if (PTXVersion == 0) { + PTXVersion = 60; } return *this; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 7555a2368ec963..bbc1cca7c12d85 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -111,7 +111,12 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { // - 0 represents base GPU model, // - non-zero value identifies particular architecture-accelerated variant. bool hasAAFeatures() const { return getFullSmVersion() % 10; } - std::string getTargetName() const { return TargetName; } + + // If the user did not provide a target we default to the `sm_30` target. + std::string getTargetName() const { + return TargetName.empty() ? "sm_30" : TargetName; + } + bool hasTargetName() const { return !TargetName.empty(); } // Get maximum value of required alignments among the supported data types. // From the PTX ISA doc, section 8.2.3: diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index b3b2880588cc59..6d4b82aa54a2b8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -255,7 +255,10 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerPipelineStartEPCallback( [this](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; - FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion())); + // We do not want to fold out calls to nvvm.reflect early if the user + // has not provided a target architecture just yet. + if (Subtarget.hasTargetName()) + FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion())); // Note: NVVMIntrRangePass was causing numerical discrepancies at one // point, if issues crop up, consider disabling. FPM.addPass(NVVMIntrRangePass()); diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 56525a1edc7614..a0e897584a9d32 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -21,6 +21,7 @@ #include "NVPTX.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/CodeGen/CommandFlags.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -219,7 +220,12 @@ bool NVVMReflect::runOnFunction(Function &F) { return runNVVMReflect(F, SmVersion); } -NVVMReflectPass::NVVMReflectPass() : NVVMReflectPass(0) {} +NVVMReflectPass::NVVMReflectPass() { + // Get the CPU string from the command line if not provided. + StringRef SM = codegen::getMCPU(); + if (!SM.consume_front("sm_") || SM.consumeInteger(10, SmVersion)) + SmVersion = 0; +} PreservedAnalyses NVVMReflectPass::run(Function &F, FunctionAnalysisManager &AM) { diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll index ac5875c6ab1043..83cb3cde48de18 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch.ll @@ -1,9 +1,9 @@ ; Libdevice in recent CUDA versions relies on __CUDA_ARCH reflecting GPU type. ; Verify that __nvvm_reflect() is replaced with an appropriate value. ; -; RUN: opt %s -S -passes='default' -mtriple=nvptx64 -mcpu=sm_20 \ +; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_20 \ ; RUN: | FileCheck %s --check-prefixes=COMMON,SM20 -; RUN: opt %s -S -passes='default' -mtriple=nvptx64 -mcpu=sm_35 \ +; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_35 \ ; RUN: | FileCheck %s --check-prefixes=COMMON,SM35 @"$str" = private addrspace(1) constant [12 x i8] c"__CUDA_ARCH\00" diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll index 9d383218dce86a..bf8d6e2cca3071 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-ocl.ll @@ -1,8 +1,8 @@ ; Verify that __nvvm_reflect_ocl() is replaced with an appropriate value ; -; RUN: opt %s -S -passes='default' -mtriple=nvptx64 -mcpu=sm_20 \ +; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_20 \ ; RUN: | FileCheck %s --check-prefixes=COMMON,SM20 -; RUN: opt %s -S -passes='default' -mtriple=nvptx64 -mcpu=sm_35 \ +; RUN: opt %s -S -passes='nvvm-reflect' -mtriple=nvptx64 -mcpu=sm_35 \ ; RUN: | FileCheck %s --check-prefixes=COMMON,SM35 @"$str" = private addrspace(4) constant [12 x i8] c"__CUDA_ARCH\00" diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll index 46ab79d9858cad..19c74df3037028 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-opaque.ll @@ -3,12 +3,12 @@ ; RUN: cat %s > %t.noftz ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz -; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='default' \ +; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \ ; RUN: | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK ; RUN: cat %s > %t.ftz ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz -; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='default' \ +; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \ ; RUN: | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK @str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00" @@ -43,7 +43,7 @@ exit: declare i32 @llvm.nvvm.reflect(ptr) -; CHECK-LABEL: define noundef i32 @intrinsic +; CHECK-LABEL: define i32 @intrinsic define i32 @intrinsic() { ; CHECK-NOT: call i32 @llvm.nvvm.reflect ; USE_FTZ_0: ret i32 0 diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll index 2ed9f7c11bcf9b..244b44fea9b83c 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect.ll @@ -3,12 +3,12 @@ ; RUN: cat %s > %t.noftz ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 0}' >> %t.noftz -; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='default' \ +; RUN: opt %t.noftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \ ; RUN: | FileCheck %s --check-prefix=USE_FTZ_0 --check-prefix=CHECK ; RUN: cat %s > %t.ftz ; RUN: echo '!0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}' >> %t.ftz -; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='default' \ +; RUN: opt %t.ftz -S -mtriple=nvptx-nvidia-cuda -passes='nvvm-reflect,simplifycfg' \ ; RUN: | FileCheck %s --check-prefix=USE_FTZ_1 --check-prefix=CHECK @str = private unnamed_addr addrspace(4) constant [11 x i8] c"__CUDA_FTZ\00" @@ -43,7 +43,8 @@ exit: declare i32 @llvm.nvvm.reflect(ptr) -; CHECK-LABEL: define noundef i32 @intrinsic +; CHECK-LABEL: define i32 @intrinsic + define i32 @intrinsic() { ; CHECK-NOT: call i32 @llvm.nvvm.reflect ; USE_FTZ_0: ret i32 0