-
Notifications
You must be signed in to change notification settings - Fork 755
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SYCL] Add support for -foffload-fp32-prec-div/sqrt options. #15836
base: sycl
Are you sure you want to change the base?
Changes from 19 commits
f8caf83
00ffb5a
795dd38
78a9005
50e71c0
54f2409
bdf78d7
8cd6d8b
755d630
27011c8
ff2b3d9
07752e2
aa909d2
fcc4786
24711fd
56314b7
e643027
b25e5ac
ce00296
bc01759
c5fffc5
f2fb8b2
83c9b31
0efc825
e1de775
34f07cc
e18930f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1879,25 +1879,44 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes( | |
// the 'FPAccuracyFuncMap'; if no accuracy is mapped to Name (FuncAttrs | ||
// is empty), then set its accuracy from the TU's accuracy value. | ||
if (!getLangOpts().FPAccuracyFuncMap.empty()) { | ||
StringRef FPAccuracyVal; | ||
auto FuncMapIt = getLangOpts().FPAccuracyFuncMap.find(Name.str()); | ||
if (FuncMapIt != getLangOpts().FPAccuracyFuncMap.end()) { | ||
StringRef FPAccuracyVal = llvm::fp::getAccuracyForFPBuiltin( | ||
ID, FuncType, convertFPAccuracy(FuncMapIt->second)); | ||
if (!getLangOpts().OffloadFP32PrecDiv && Name == "fdiv") | ||
FPAccuracyVal = "2.5"; | ||
else if (!getLangOpts().OffloadFP32PrecSqrt && Name == "sqrt") | ||
FPAccuracyVal = "3.0"; | ||
else | ||
FPAccuracyVal = llvm::fp::getAccuracyForFPBuiltin( | ||
ID, FuncType, convertFPAccuracy(FuncMapIt->second)); | ||
assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected"); | ||
FuncAttrs.addAttribute("fpbuiltin-max-error", FPAccuracyVal); | ||
MD = llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( | ||
Int32Ty, convertFPAccuracyToAspect(FuncMapIt->second))); | ||
} | ||
} | ||
if (FuncAttrs.attrs().size() == 0) | ||
if (FuncAttrs.attrs().size() == 0) { | ||
if (!getLangOpts().FPAccuracyVal.empty()) { | ||
StringRef FPAccuracyVal = llvm::fp::getAccuracyForFPBuiltin( | ||
ID, FuncType, convertFPAccuracy(getLangOpts().FPAccuracyVal)); | ||
StringRef FPAccuracyVal; | ||
if (!getLangOpts().OffloadFP32PrecDiv && Name == "fdiv") | ||
FPAccuracyVal = "2.5"; | ||
else if (!getLangOpts().OffloadFP32PrecSqrt && Name == "sqrt") | ||
FPAccuracyVal = "3.0"; | ||
else | ||
FPAccuracyVal = llvm::fp::getAccuracyForFPBuiltin( | ||
ID, FuncType, convertFPAccuracy(getLangOpts().FPAccuracyVal)); | ||
assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected"); | ||
FuncAttrs.addAttribute("fpbuiltin-max-error", FPAccuracyVal); | ||
MD = llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( | ||
Int32Ty, convertFPAccuracyToAspect(getLangOpts().FPAccuracyVal))); | ||
} else { | ||
if (!getLangOpts().OffloadFP32PrecDiv && Name == "fdiv") { | ||
FuncAttrs.addAttribute("fpbuiltin-max-error", "2.5"); | ||
} else if (!getLangOpts().OffloadFP32PrecSqrt && Name == "sqrt") { | ||
FuncAttrs.addAttribute("fpbuiltin-max-error", "3.0"); | ||
} | ||
} | ||
} | ||
} | ||
|
||
/// Add denormal-fp-math and denormal-fp-math-f32 as appropriate for the | ||
|
@@ -5790,10 +5809,20 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, | |
// Emit the actual call/invoke instruction. | ||
llvm::CallBase *CI; | ||
if (!InvokeDest) { | ||
if (!getLangOpts().FPAccuracyFuncMap.empty() || | ||
!getLangOpts().FPAccuracyVal.empty()) { | ||
const auto *FD = dyn_cast_if_present<FunctionDecl>(TargetDecl); | ||
if (FD && FD->getNameInfo().getName().isIdentifier()) { | ||
const auto *FD = dyn_cast_if_present<FunctionDecl>(TargetDecl); | ||
if (FD && FD->getNameInfo().getName().isIdentifier()) { | ||
StringRef FuncName = FD->getName(); | ||
const bool IsFloat32Type = FD->getReturnType()->isFloat32Type(); | ||
bool hasFPAccuracyFuncMap = hasAccuracyRequirement(FuncName); | ||
bool hasFPAccuracyVal = !getLangOpts().FPAccuracyVal.empty(); | ||
bool isFp32SqrtFunction = | ||
(FuncName == "sqrt" && !getLangOpts().OffloadFP32PrecSqrt && | ||
IsFloat32Type); | ||
bool isFP32FdivFunction = | ||
(FuncName == "fdiv" && !getLangOpts().OffloadFP32PrecDiv && | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I actually thought that the request was to replace the fdiv instruction with the intrinsic, not an fdiv function. Do we know if users actually use such a function? I don't see any mention of it in the SYCL or OpenCL specifications. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @gmlueck could you please comment on that? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The intent of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. AFAIK there is no standard function for floating-point division. There is std::div, but it works only on integers. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is no C/C++ fdiv function. |
||
IsFloat32Type); | ||
if (hasFPAccuracyFuncMap || hasFPAccuracyVal || isFp32SqrtFunction || | ||
isFP32FdivFunction) { | ||
CI = MaybeEmitFPBuiltinofFD(IRFuncTy, IRCallArgs, CalleePtr, | ||
FD->getName(), FD->getBuiltinID()); | ||
if (CI) | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -2950,10 +2950,32 @@ RenderComplexRangeOption(LangOptions::ComplexRangeKind Range) { | |||||||||||
return ComplexRangeStr; | ||||||||||||
} | ||||||||||||
|
||||||||||||
static void EmitAccuracyDiag(const Driver &D, const JobAction &JA, | ||||||||||||
StringRef AccuracValStr, StringRef TargetPrecStr) { | ||||||||||||
if (JA.isDeviceOffloading(Action::OFK_SYCL)) { | ||||||||||||
D.Diag(clang::diag:: | ||||||||||||
warn_acuracy_conflicts_with_explicit_offload_fp32_prec_option) | ||||||||||||
<< AccuracValStr << TargetPrecStr; | ||||||||||||
} | ||||||||||||
} | ||||||||||||
|
||||||||||||
/// Split \p Val on ':' and return the second field broken up on ','.
///
/// Returns an empty vector when \p Val contains no ':' separator. The returned
/// StringRefs are pieces of \p Val, not copies.
static SmallVector<StringRef, 8> SplitFPAccuracyVal(StringRef Val) {
  SmallVector<StringRef, 8> Fields;
  Val.split(Fields, ":");

  SmallVector<StringRef, 8> Funcs;
  // Only the field directly after the first ':' carries the function list.
  if (Fields.size() > 1)
    Fields[1].split(Funcs, ",");
  return Funcs;
}
|
||||||||||||
static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, | ||||||||||||
bool OFastEnabled, const ArgList &Args, | ||||||||||||
ArgStringList &CmdArgs, | ||||||||||||
const JobAction &JA) { | ||||||||||||
const JobAction &JA, | ||||||||||||
bool &NoOffloadFP32PrecDiv, | ||||||||||||
bool &NoOffloadFP32PrecSqrt) { | ||||||||||||
// Handle various floating point optimization flags, mapping them to the | ||||||||||||
// appropriate LLVM code generation flags. This is complicated by several | ||||||||||||
// "umbrella" flags, so we do this by stepping through the flags incrementally | ||||||||||||
|
@@ -2998,6 +3020,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, | |||||||||||
LangOptions::ComplexRangeKind Range = LangOptions::ComplexRangeKind::CX_None; | ||||||||||||
std::string ComplexRangeStr = ""; | ||||||||||||
std::string GccRangeComplexOption = ""; | ||||||||||||
bool IsFp32PrecDivSqrtAllowed = JA.isDeviceOffloading(Action::OFK_SYCL) && | ||||||||||||
!JA.isDeviceOffloading(Action::OFK_Cuda) && | ||||||||||||
!JA.isOffloading(Action::OFK_HIP); | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As discussed offline, something like:
Suggested change
|
||||||||||||
|
||||||||||||
// Lambda to set fast-math options. This is also used by -ffp-model=fast | ||||||||||||
auto applyFastMath = [&]() { | ||||||||||||
|
@@ -3027,6 +3052,12 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, | |||||||||||
: ComplexArithmeticStr(LangOptions::ComplexRangeKind::CX_Basic)); | ||||||||||||
Range = LangOptions::ComplexRangeKind::CX_Basic; | ||||||||||||
SeenUnsafeMathModeOption = true; | ||||||||||||
if (IsFp32PrecDivSqrtAllowed) { | ||||||||||||
// when fp-model=fast is used the default precision for division and | ||||||||||||
// sqrt is not precise. | ||||||||||||
NoOffloadFP32PrecDiv = true; | ||||||||||||
NoOffloadFP32PrecSqrt = true; | ||||||||||||
} | ||||||||||||
}; | ||||||||||||
|
||||||||||||
// Lambda to consolidate common handling for fp-contract | ||||||||||||
|
@@ -3055,11 +3086,48 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, | |||||||||||
CmdArgs.push_back(A->getValue()); | ||||||||||||
} | ||||||||||||
|
||||||||||||
auto addSPIRVArgs = [&](StringRef SPIRVArg) { | ||||||||||||
if (IsFp32PrecDivSqrtAllowed) { | ||||||||||||
if (!FPAccuracy.empty()) | ||||||||||||
EmitAccuracyDiag(D, JA, FPAccuracy, SPIRVArg); | ||||||||||||
if (SPIRVArg == "-fno-offload-fp32-prec-div") | ||||||||||||
NoOffloadFP32PrecDiv = true; | ||||||||||||
else if (SPIRVArg == "-fno-offload-fp32-prec-sqrt") | ||||||||||||
NoOffloadFP32PrecSqrt = true; | ||||||||||||
else if (SPIRVArg == "-foffload-fp32-prec-sqrt") | ||||||||||||
NoOffloadFP32PrecSqrt = false; | ||||||||||||
else if (SPIRVArg == "-foffload-fp32-prec-div") | ||||||||||||
NoOffloadFP32PrecDiv = false; | ||||||||||||
} | ||||||||||||
}; | ||||||||||||
|
||||||||||||
auto parseFPAccOption = [&](StringRef Val, bool &NoOffloadFlag) { | ||||||||||||
SmallVector<StringRef, 8> FuncsArr = SplitFPAccuracyVal(Val); | ||||||||||||
for (const auto &V : FuncsArr) { | ||||||||||||
if (V == "fdiv") | ||||||||||||
NoOffloadFlag = false; | ||||||||||||
else if (V == "sqrt") | ||||||||||||
NoOffloadFlag = false; | ||||||||||||
} | ||||||||||||
}; | ||||||||||||
|
||||||||||||
for (const Arg *A : Args) { | ||||||||||||
switch (A->getOption().getID()) { | ||||||||||||
// If this isn't an FP option skip the claim below | ||||||||||||
default: continue; | ||||||||||||
|
||||||||||||
case options::OPT_foffload_fp32_prec_div: | ||||||||||||
addSPIRVArgs("-foffload-fp32-prec-div"); | ||||||||||||
break; | ||||||||||||
case options::OPT_foffload_fp32_prec_sqrt: | ||||||||||||
addSPIRVArgs("-foffload-fp32-prec-sqrt"); | ||||||||||||
break; | ||||||||||||
case options::OPT_fno_offload_fp32_prec_div: | ||||||||||||
addSPIRVArgs("-fno-offload-fp32-prec-div"); | ||||||||||||
break; | ||||||||||||
case options::OPT_fno_offload_fp32_prec_sqrt: | ||||||||||||
addSPIRVArgs("-fno-offload-fp32-prec-sqrt"); | ||||||||||||
break; | ||||||||||||
case options::OPT_fcx_limited_range: | ||||||||||||
if (GccRangeComplexOption.empty()) { | ||||||||||||
if (Range != LangOptions::ComplexRangeKind::CX_Basic) | ||||||||||||
|
@@ -3144,6 +3212,14 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, | |||||||||||
case options::OPT_ffp_accuracy_EQ: { | ||||||||||||
StringRef Val = A->getValue(); | ||||||||||||
FPAccuracy = Val; | ||||||||||||
if (NoOffloadFP32PrecDiv) { | ||||||||||||
EmitAccuracyDiag(D, JA, FPAccuracy, "-fno-offload-fp32-prec-div"); | ||||||||||||
parseFPAccOption(Val, NoOffloadFP32PrecDiv); | ||||||||||||
} | ||||||||||||
if (NoOffloadFP32PrecSqrt) { | ||||||||||||
EmitAccuracyDiag(D, JA, FPAccuracy, "-fno-offload-fp32-prec-sqrt"); | ||||||||||||
parseFPAccOption(Val, NoOffloadFP32PrecSqrt); | ||||||||||||
} | ||||||||||||
break; | ||||||||||||
} | ||||||||||||
case options::OPT_ffp_model_EQ: { | ||||||||||||
|
@@ -3557,6 +3633,12 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, | |||||||||||
CmdArgs.push_back("-fno-cx-limited-range"); | ||||||||||||
if (Args.hasArg(options::OPT_fno_cx_fortran_rules)) | ||||||||||||
CmdArgs.push_back("-fno-cx-fortran-rules"); | ||||||||||||
if (IsFp32PrecDivSqrtAllowed) { | ||||||||||||
if (NoOffloadFP32PrecDiv) | ||||||||||||
CmdArgs.push_back("-fno-offload-fp32-prec-div"); | ||||||||||||
if (NoOffloadFP32PrecSqrt) | ||||||||||||
CmdArgs.push_back("-fno-offload-fp32-prec-sqrt"); | ||||||||||||
} | ||||||||||||
} | ||||||||||||
|
||||||||||||
static void RenderAnalyzerOptions(const ArgList &Args, ArgStringList &CmdArgs, | ||||||||||||
|
@@ -5311,6 +5393,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, | |||||||||||
Args.hasArg(options::OPT_mkernel, options::OPT_fapple_kext); | ||||||||||||
const Driver &D = TC.getDriver(); | ||||||||||||
ArgStringList CmdArgs; | ||||||||||||
bool NoOffloadFP32PrecDiv = false; | ||||||||||||
bool NoOffloadFP32PrecSqrt = false; | ||||||||||||
|
||||||||||||
assert(Inputs.size() >= 1 && "Must have at least one input."); | ||||||||||||
// CUDA/HIP compilation may have multiple inputs (source file + results of | ||||||||||||
|
@@ -6119,7 +6203,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, | |||||||||||
options::OPT_fno_optimize_sibling_calls); | ||||||||||||
|
||||||||||||
RenderFloatingPointOptions(TC, D, isOptimizationLevelFast(Args), Args, | ||||||||||||
CmdArgs, JA); | ||||||||||||
CmdArgs, JA, NoOffloadFP32PrecDiv, | ||||||||||||
NoOffloadFP32PrecSqrt); | ||||||||||||
|
||||||||||||
// Render ABI arguments | ||||||||||||
switch (TC.getArch()) { | ||||||||||||
|
@@ -6593,7 +6678,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, | |||||||||||
options::OPT_fno_protect_parens, false)) | ||||||||||||
CmdArgs.push_back("-fprotect-parens"); | ||||||||||||
|
||||||||||||
RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs, JA); | ||||||||||||
RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs, JA, | ||||||||||||
NoOffloadFP32PrecDiv, NoOffloadFP32PrecSqrt); | ||||||||||||
|
||||||||||||
if (Arg *A = Args.getLastArg(options::OPT_fextend_args_EQ)) { | ||||||||||||
const llvm::Triple::ArchType Arch = TC.getArch(); | ||||||||||||
|
@@ -6644,8 +6730,18 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, | |||||||||||
FpAccuracyAttr += OptStr.str(); | ||||||||||||
} | ||||||||||||
}; | ||||||||||||
for (StringRef A : Args.getAllArgValues(options::OPT_ffp_accuracy_EQ)) | ||||||||||||
RenderFPAccuracyOptions(A); | ||||||||||||
auto shouldAddFpAccuracyOption = [&](StringRef Val, StringRef Func) { | ||||||||||||
SmallVector<StringRef, 8> FuncsArr = SplitFPAccuracyVal(Val); | ||||||||||||
for (const auto &V : FuncsArr) | ||||||||||||
return (V == Func); | ||||||||||||
return false; | ||||||||||||
}; | ||||||||||||
|
||||||||||||
for (StringRef A : Args.getAllArgValues(options::OPT_ffp_accuracy_EQ)) { | ||||||||||||
if (!(NoOffloadFP32PrecDiv && shouldAddFpAccuracyOption(A, "fdiv")) && | ||||||||||||
!(NoOffloadFP32PrecSqrt && shouldAddFpAccuracyOption(A, "sqrt"))) | ||||||||||||
RenderFPAccuracyOptions(A); | ||||||||||||
} | ||||||||||||
if (!FpAccuracyAttr.empty()) | ||||||||||||
CmdArgs.push_back(Args.MakeArgString(FpAccuracyAttr)); | ||||||||||||
|
||||||||||||
|
@@ -10603,8 +10699,22 @@ static void getTripleBasedSPIRVTransOpts(Compilation &C, | |||||||||||
",+SPV_KHR_non_semantic_info" | ||||||||||||
",+SPV_KHR_cooperative_matrix" | ||||||||||||
",+SPV_EXT_shader_atomic_float16_add"; | ||||||||||||
if (IsCPU) | ||||||||||||
// True when neither -foffload-fp32-prec-sqrt nor -foffload-fp32-prec-div is
// in effect according to hasFlag (each defaults to false when the option and
// its -fno- counterpart are both absent).
auto hasNoOffloadFP32PrecOption = [](const llvm::opt::ArgList &TCArgs) {
  // Same evaluation order and short-circuiting as "!sqrt && !div".
  return !(TCArgs.hasFlag(options::OPT_foffload_fp32_prec_sqrt,
                          options::OPT_fno_offload_fp32_prec_sqrt, false) ||
           TCArgs.hasFlag(options::OPT_foffload_fp32_prec_div,
                          options::OPT_fno_offload_fp32_prec_div, false));
};
// True when -fno-offload-fp32-prec-sqrt or -fno-offload-fp32-prec-div is in
// effect according to hasFlag (each defaults to false when the option and its
// positive counterpart are both absent).
auto shouldUseOffloadFP32PrecOption = [](const llvm::opt::ArgList &TCArgs) {
  // The sqrt pair is queried first; the div pair is only consulted when the
  // sqrt query yields false.
  if (TCArgs.hasFlag(options::OPT_fno_offload_fp32_prec_sqrt,
                     options::OPT_foffload_fp32_prec_sqrt, false))
    return true;
  return TCArgs.hasFlag(options::OPT_fno_offload_fp32_prec_div,
                        options::OPT_foffload_fp32_prec_div, false);
};
if ((IsCPU && hasNoOffloadFP32PrecOption(TCArgs)) || | ||||||||||||
shouldUseOffloadFP32PrecOption(TCArgs)) { | ||||||||||||
ExtArg += ",+SPV_INTEL_fp_max_error"; | ||||||||||||
} | ||||||||||||
|
||||||||||||
TranslatorArgs.push_back(TCArgs.MakeArgString(ExtArg)); | ||||||||||||
} | ||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we compare with un-mangled sqrt?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FuncName is the output of FD->getName() which returns a simple identifier. https://github.com/intel/llvm/blob/sycl/clang/include/clang/AST/Decl.h#L280
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So clang/test/CodeGenSYCL/offload-fp32-div-sqrt.cpp will pass even with
extern "C"
removed from the sqrt
function declaration?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What if the user has a function in their own namespace that happens to be named "sqrt"?