[Clang] Add float type support to __builtin_reduce_add and __builtin_reduce_multipy #120367

farzonl · 2024-12-18T06:49:25Z

update __builtin_reduce_add and __builtin_reduce_multiply in clang/lib/CodeGen/CGBuiltin.cpp to emit the floating point intrinsics for add and multiply vector reductions.
update clang/lib/Sema/SemaChecking.cpp so that the add\multiply builtins can accept floating point vectors.
update clang/lib/AST/ByteCode/InterpBuiltin.cpp so that const floating point vector passed to add\multiply reductions are not computed at compile time.

llvmbot · 2024-12-18T06:49:57Z

@llvm/pr-subscribers-clang

Author: Farzon Lotfi (farzonl)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/120367.diff

8 Files Affected:

(modified) clang/include/clang/Basic/DiagnosticSemaKinds.td (+2-1)
(modified) clang/lib/AST/ByteCode/InterpBuiltin.cpp (+12)
(modified) clang/lib/CodeGen/CGBuiltin.cpp (+27-2)
(modified) clang/lib/Sema/SemaChecking.cpp (+23-3)
(modified) clang/test/AST/ByteCode/builtin-functions.cpp (+15)
(modified) clang/test/CodeGen/builtins-reduction-math.c (+21)
(modified) clang/test/Sema/builtins-reduction-math.c (+8-8)
(modified) clang/test/Sema/constant_builtins_vector.cpp (+12)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 9344b620779b84..0e227c5a3d8179 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12338,7 +12338,8 @@ def err_builtin_invalid_arg_type: Error <
   "a vector of integers|"
   "an unsigned integer|"
   "an 'int'|"
-  "a vector of floating points}1 (was %2)">;
+  "a vector of floating points|"
+  "a vector of integers or floating points}1 (was %2)">;
 
 def err_builtin_matrix_disabled: Error<
   "matrix types extension is disabled. Pass -fenable-matrix to enable it">;
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index d6b33c8aeeaac3..be34b4bb04e3c2 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 #include "../ExprConstShared.h"
 #include "Boolean.h"
+#include "ByteCode/Floating.h"
 #include "Compiler.h"
 #include "EvalEmitter.h"
 #include "Interp.h"
@@ -1754,6 +1755,17 @@ static bool interp__builtin_vector_reduce(InterpState &S, CodePtr OpPC,
   PrimType ElemT = *S.getContext().classify(ElemType);
   unsigned NumElems = Arg.getNumElems();
 
+  if (ElemType->isRealFloatingType()) {
+    if (ID != Builtin::BI__builtin_reduce_add &&
+        ID != Builtin::BI__builtin_reduce_mul)
+      llvm_unreachable("Only reduce_add and reduce_mul are supported for "
+                       "floating-point types.");
+    // Floating-point arithmetic is not valid for constant expression
+    // initialization. Returning false defers checks to integral constant
+    // expression validation, preventing a bad deref of Floating as an integer.
+    return false;
+  }
+
   INT_TYPE_SWITCH_NO_BOOL(ElemT, {
     T Result = Arg.atIndex(0).deref<T>();
     unsigned BitWidth = Result.bitWidth();
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4d4b7428abd505..12e3cb18bdb89d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4274,12 +4274,37 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         *this, E, GetIntrinsicID(E->getArg(0)->getType()), "rdx.min"));
   }
 
-  case Builtin::BI__builtin_reduce_add:
+  case Builtin::BI__builtin_reduce_add: {
+    // Note: vector_reduce_fadd takes two arguments a
+    // scalar start value and a vector. That would mean to
+    // correctly call it we would need emitBuiltinWithOneOverloadedType<2>
+    // To keep the  builtin sema behavior the same despite type we will
+    // popululate vector_reduce_fadd scalar value with a 0.
+    if (E->getArg(0)->getType()->hasFloatingRepresentation()) {
+      Value *X = EmitScalarExpr(E->getArg(0));
+      auto EltTy = X->getType()->getScalarType();
+      Value *Seed = ConstantFP::get(EltTy, 0);
+      return RValue::get(Builder.CreateIntrinsic(
+          /*ReturnType=*/EltTy, llvm::Intrinsic::vector_reduce_fadd,
+          ArrayRef<Value *>{Seed, X}, nullptr, "rdx.fadd"));
+    }
+    assert(E->getArg(0)->getType()->hasIntegerRepresentation());
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::vector_reduce_add, "rdx.add"));
-  case Builtin::BI__builtin_reduce_mul:
+  }
+  case Builtin::BI__builtin_reduce_mul: {
+    if (E->getArg(0)->getType()->hasFloatingRepresentation()) {
+      Value *X = EmitScalarExpr(E->getArg(0));
+      auto EltTy = X->getType()->getScalarType();
+      Value *Seed = ConstantFP::get(EltTy, 0);
+      return RValue::get(Builder.CreateIntrinsic(
+          /*ReturnType=*/EltTy, llvm::Intrinsic::vector_reduce_fmul,
+          ArrayRef<Value *>{Seed, X}, nullptr, "rdx.fmul"));
+    }
+    assert(E->getArg(0)->getType()->hasIntegerRepresentation());
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::vector_reduce_mul, "rdx.mul"));
+  }
   case Builtin::BI__builtin_reduce_xor:
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::vector_reduce_xor, "rdx.xor"));
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index a248a6b53b0d06..a13c25eb2b6f6b 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2883,11 +2883,31 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     TheCall->setType(ElTy);
     break;
   }
+  case Builtin::BI__builtin_reduce_add:
+  case Builtin::BI__builtin_reduce_mul: {
+    if (PrepareBuiltinReduceMathOneArgCall(TheCall))
+      return ExprError();
+
+    const Expr *Arg = TheCall->getArg(0);
+    const auto *TyA = Arg->getType()->getAs<VectorType>();
+
+    QualType ElTy;
+    if (TyA)
+      ElTy = TyA->getElementType();
+    else if (Arg->getType()->isSizelessVectorType())
+      ElTy = Arg->getType()->getSizelessVectorEltType(Context);
+
+    if (ElTy.isNull()) {
+      Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
+          << 1 << /* vector of integers or floating points */ 10
+          << Arg->getType();
+      return ExprError();
+    }
+    TheCall->setType(ElTy);
+    break;
+  }
 
   // These builtins support vectors of integers only.
-  // TODO: ADD/MUL should support floating-point types.
-  case Builtin::BI__builtin_reduce_add:
-  case Builtin::BI__builtin_reduce_mul:
   case Builtin::BI__builtin_reduce_xor:
   case Builtin::BI__builtin_reduce_or:
   case Builtin::BI__builtin_reduce_and: {
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 5906cb970f06c4..1a3dbd5e37a709 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -1056,6 +1056,14 @@ namespace RecuceAdd {
   static_assert(__builtin_reduce_add((vector4uint){~0U, 0, 0, 1}) == 0);
   static_assert(__builtin_reduce_add((vector4ulong){~0ULL, 0, 0, 1}) == 0);
 
+  static_assert(__builtin_reduce_add((vector4float){}) == 0.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+  static_assert(__builtin_reduce_add((vector4float){1.1, 2.2, 3.3, 4.4}) == 11.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+  static_assert(__builtin_reduce_add((vector4double){100.1, 200.2, 300.3, 400.4}) == 1001.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+
+
 
 #ifdef __SIZEOF_INT128__
   typedef __int128 v4i128 __attribute__((__vector_size__(128 * 2)));
@@ -1091,6 +1099,13 @@ namespace ReduceMul {
       (~0U - 1));
 #endif
   static_assert(__builtin_reduce_mul((vector4ulong){~0ULL, 1, 1, 2}) == ~0ULL - 1);
+
+    static_assert(__builtin_reduce_mul((vector4float){}) == 0.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+  static_assert(__builtin_reduce_mul((vector4float){1.0, 2.0, 3.0, 1.0}) == 6.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+  static_assert(__builtin_reduce_mul((vector4double){3.0, 4.0, 1.0, 1.0}) == 12.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
 }
 
 namespace ReduceAnd {
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index e12fd729c84c0b..35f12ca710e3e3 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -4,6 +4,7 @@
 // RUN: %clang_cc1 -O1 -triple aarch64 -target-feature +sve  %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=SVE   %s
 
 typedef float float4 __attribute__((ext_vector_type(4)));
+typedef double double4 __attribute__((ext_vector_type(4)));
 typedef short int si8 __attribute__((ext_vector_type(8)));
 typedef unsigned int u4 __attribute__((ext_vector_type(4)));
 
@@ -61,6 +62,16 @@ void test_builtin_reduce_min(float4 vf1, si8 vi1, u4 vu1) {
   unsigned long long r5 = __builtin_reduce_min(cvi1);
 }
 
+void test_builtin_reduce_addf(float4 vf4, double4 vd4) {          
+  // CHECK:      [[VF4:%.+]] = load <4 x float>, ptr %vf4.addr, align 16
+  // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[VF4]])
+  float r2 = __builtin_reduce_add(vf4);
+
+  // CHECK:      [[VD4:%.+]] = load <4 x double>, ptr %vd4.addr, align 16
+  // CHECK-NEXT: call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[VD4]])
+  double r3 = __builtin_reduce_add(vd4);
+}
+
 void test_builtin_reduce_add(si8 vi1, u4 vu1) {
   // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
   // CHECK-NEXT: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VI1]])
@@ -83,6 +94,16 @@ void test_builtin_reduce_add(si8 vi1, u4 vu1) {
   unsigned long long r5 = __builtin_reduce_add(cvu1);
 }
 
+void test_builtin_reduce_mulf(float4 vf4, double4 vd4) {          
+  // CHECK:      [[VF4:%.+]] = load <4 x float>, ptr %vf4.addr, align 16
+  // CHECK-NEXT: call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> [[VF4]])
+  float r2 = __builtin_reduce_mul(vf4);
+
+  // CHECK:      [[VD4:%.+]] = load <4 x double>, ptr %vd4.addr, align 16
+  // CHECK-NEXT: call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> [[VD4]])
+  double r3 = __builtin_reduce_mul(vd4);
+}
+
 void test_builtin_reduce_mul(si8 vi1, u4 vu1) {
   // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
   // CHECK-NEXT: call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> [[VI1]])
diff --git a/clang/test/Sema/builtins-reduction-math.c b/clang/test/Sema/builtins-reduction-math.c
index 9b0d91bfd6e3d2..9e2dac7ebbe6f6 100644
--- a/clang/test/Sema/builtins-reduction-math.c
+++ b/clang/test/Sema/builtins-reduction-math.c
@@ -36,7 +36,7 @@ void test_builtin_reduce_min(int i, float4 v, int3 iv) {
   // expected-error@-1 {{1st argument must be a vector type (was 'int')}}
 }
 
-void test_builtin_reduce_add(int i, float4 v, int3 iv) {
+void test_builtin_reduce_add(int i, float f, int3 iv) {
   struct Foo s = __builtin_reduce_add(iv);
   // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'int'}}
 
@@ -47,13 +47,13 @@ void test_builtin_reduce_add(int i, float4 v, int3 iv) {
   // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
 
   i = __builtin_reduce_add(i);
-  // expected-error@-1 {{1st argument must be a vector of integers (was 'int')}}
+  // expected-error@-1 {{1st argument must be a vector of integers or floating points (was 'int')}}
 
-  i = __builtin_reduce_add(v);
-  // expected-error@-1 {{1st argument must be a vector of integers (was 'float4' (vector of 4 'float' values))}}
+  f = __builtin_reduce_add(f);
+  // expected-error@-1 {{1st argument must be a vector of integers or floating points (was 'float')}}
 }
 
-void test_builtin_reduce_mul(int i, float4 v, int3 iv) {
+void test_builtin_reduce_mul(int i, float f, int3 iv) {
   struct Foo s = __builtin_reduce_mul(iv);
   // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'int'}}
 
@@ -64,10 +64,10 @@ void test_builtin_reduce_mul(int i, float4 v, int3 iv) {
   // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
 
   i = __builtin_reduce_mul(i);
-  // expected-error@-1 {{1st argument must be a vector of integers (was 'int')}}
+  // expected-error@-1 {{1st argument must be a vector of integers or floating points (was 'int')}}
 
-  i = __builtin_reduce_mul(v);
-  // expected-error@-1 {{1st argument must be a vector of integers (was 'float4' (vector of 4 'float' values))}}
+  f = __builtin_reduce_mul(f);
+  // expected-error@-1 {{1st argument must be a vector of integers or floating points (was 'float')}}
 }
 
 void test_builtin_reduce_xor(int i, float4 v, int3 iv) {
diff --git a/clang/test/Sema/constant_builtins_vector.cpp b/clang/test/Sema/constant_builtins_vector.cpp
index b2f56e5a87ab1a..58a05ec2dd406a 100644
--- a/clang/test/Sema/constant_builtins_vector.cpp
+++ b/clang/test/Sema/constant_builtins_vector.cpp
@@ -746,6 +746,12 @@ constexpr long long reduceAddLong2 = __builtin_reduce_add((vector4long){(1LL <<
 static_assert(__builtin_reduce_add((vector4uint){~0U, 0, 0, 1}) == 0);
 static_assert(__builtin_reduce_add((vector4ulong){~0ULL, 0, 0, 1}) == 0);
 
+constexpr float reduceAddFloat = __builtin_reduce_add((vector4float){1.0, 2.0, 3.0, 4.0});
+// expected-error@-1 {{must be initialized by a constant expression}}
+
+constexpr double reduceAddDouble = __builtin_reduce_add((vector4double){-1.0, 2.0, -3.0, 4.0});
+// expected-error@-1 {{must be initialized by a constant expression}}
+
 static_assert(__builtin_reduce_mul((vector4char){}) == 0);
 static_assert(__builtin_reduce_mul((vector4char){1, 2, 3, 4}) == 24);
 static_assert(__builtin_reduce_mul((vector4short){1, 2, 30, 40}) == 2400);
@@ -766,6 +772,12 @@ constexpr long long reduceMulLong2 = __builtin_reduce_mul((vector4long){(1LL <<
 static_assert(__builtin_reduce_mul((vector4uint){~0U, 1, 1, 2}) == ~0U - 1);
 static_assert(__builtin_reduce_mul((vector4ulong){~0ULL, 1, 1, 2}) == ~0ULL - 1);
 
+constexpr float reduceMulFloat = __builtin_reduce_mul((vector4float){1.0, 2.0, 3.0, 1.0});
+// expected-error@-1 {{must be initialized by a constant expression}}
+
+constexpr double reduceMulDouble = __builtin_reduce_mul((vector4double){3.0, 4.0, 1.0, 1.0});
+// expected-error@-1 {{must be initialized by a constant expression}}
+
 static_assert(__builtin_reduce_and((vector4char){}) == 0);
 static_assert(__builtin_reduce_and((vector4char){(char)0x11, (char)0x22, (char)0x44, (char)0x88}) == 0);
 static_assert(__builtin_reduce_and((vector4short){(short)0x1111, (short)0x2222, (short)0x4444, (short)0x8888}) == 0);

llvmbot · 2024-12-18T06:49:57Z

@llvm/pr-subscribers-clang-codegen

Author: Farzon Lotfi (farzonl)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/120367.diff

8 Files Affected:

(modified) clang/include/clang/Basic/DiagnosticSemaKinds.td (+2-1)
(modified) clang/lib/AST/ByteCode/InterpBuiltin.cpp (+12)
(modified) clang/lib/CodeGen/CGBuiltin.cpp (+27-2)
(modified) clang/lib/Sema/SemaChecking.cpp (+23-3)
(modified) clang/test/AST/ByteCode/builtin-functions.cpp (+15)
(modified) clang/test/CodeGen/builtins-reduction-math.c (+21)
(modified) clang/test/Sema/builtins-reduction-math.c (+8-8)
(modified) clang/test/Sema/constant_builtins_vector.cpp (+12)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 9344b620779b84..0e227c5a3d8179 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12338,7 +12338,8 @@ def err_builtin_invalid_arg_type: Error <
   "a vector of integers|"
   "an unsigned integer|"
   "an 'int'|"
-  "a vector of floating points}1 (was %2)">;
+  "a vector of floating points|"
+  "a vector of integers or floating points}1 (was %2)">;
 
 def err_builtin_matrix_disabled: Error<
   "matrix types extension is disabled. Pass -fenable-matrix to enable it">;
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index d6b33c8aeeaac3..be34b4bb04e3c2 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 #include "../ExprConstShared.h"
 #include "Boolean.h"
+#include "ByteCode/Floating.h"
 #include "Compiler.h"
 #include "EvalEmitter.h"
 #include "Interp.h"
@@ -1754,6 +1755,17 @@ static bool interp__builtin_vector_reduce(InterpState &S, CodePtr OpPC,
   PrimType ElemT = *S.getContext().classify(ElemType);
   unsigned NumElems = Arg.getNumElems();
 
+  if (ElemType->isRealFloatingType()) {
+    if (ID != Builtin::BI__builtin_reduce_add &&
+        ID != Builtin::BI__builtin_reduce_mul)
+      llvm_unreachable("Only reduce_add and reduce_mul are supported for "
+                       "floating-point types.");
+    // Floating-point arithmetic is not valid for constant expression
+    // initialization. Returning false defers checks to integral constant
+    // expression validation, preventing a bad deref of Floating as an integer.
+    return false;
+  }
+
   INT_TYPE_SWITCH_NO_BOOL(ElemT, {
     T Result = Arg.atIndex(0).deref<T>();
     unsigned BitWidth = Result.bitWidth();
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4d4b7428abd505..12e3cb18bdb89d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4274,12 +4274,37 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         *this, E, GetIntrinsicID(E->getArg(0)->getType()), "rdx.min"));
   }
 
-  case Builtin::BI__builtin_reduce_add:
+  case Builtin::BI__builtin_reduce_add: {
+    // Note: vector_reduce_fadd takes two arguments a
+    // scalar start value and a vector. That would mean to
+    // correctly call it we would need emitBuiltinWithOneOverloadedType<2>
+    // To keep the  builtin sema behavior the same despite type we will
+    // popululate vector_reduce_fadd scalar value with a 0.
+    if (E->getArg(0)->getType()->hasFloatingRepresentation()) {
+      Value *X = EmitScalarExpr(E->getArg(0));
+      auto EltTy = X->getType()->getScalarType();
+      Value *Seed = ConstantFP::get(EltTy, 0);
+      return RValue::get(Builder.CreateIntrinsic(
+          /*ReturnType=*/EltTy, llvm::Intrinsic::vector_reduce_fadd,
+          ArrayRef<Value *>{Seed, X}, nullptr, "rdx.fadd"));
+    }
+    assert(E->getArg(0)->getType()->hasIntegerRepresentation());
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::vector_reduce_add, "rdx.add"));
-  case Builtin::BI__builtin_reduce_mul:
+  }
+  case Builtin::BI__builtin_reduce_mul: {
+    if (E->getArg(0)->getType()->hasFloatingRepresentation()) {
+      Value *X = EmitScalarExpr(E->getArg(0));
+      auto EltTy = X->getType()->getScalarType();
+      Value *Seed = ConstantFP::get(EltTy, 0);
+      return RValue::get(Builder.CreateIntrinsic(
+          /*ReturnType=*/EltTy, llvm::Intrinsic::vector_reduce_fmul,
+          ArrayRef<Value *>{Seed, X}, nullptr, "rdx.fmul"));
+    }
+    assert(E->getArg(0)->getType()->hasIntegerRepresentation());
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::vector_reduce_mul, "rdx.mul"));
+  }
   case Builtin::BI__builtin_reduce_xor:
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, llvm::Intrinsic::vector_reduce_xor, "rdx.xor"));
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index a248a6b53b0d06..a13c25eb2b6f6b 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2883,11 +2883,31 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     TheCall->setType(ElTy);
     break;
   }
+  case Builtin::BI__builtin_reduce_add:
+  case Builtin::BI__builtin_reduce_mul: {
+    if (PrepareBuiltinReduceMathOneArgCall(TheCall))
+      return ExprError();
+
+    const Expr *Arg = TheCall->getArg(0);
+    const auto *TyA = Arg->getType()->getAs<VectorType>();
+
+    QualType ElTy;
+    if (TyA)
+      ElTy = TyA->getElementType();
+    else if (Arg->getType()->isSizelessVectorType())
+      ElTy = Arg->getType()->getSizelessVectorEltType(Context);
+
+    if (ElTy.isNull()) {
+      Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
+          << 1 << /* vector of integers or floating points */ 10
+          << Arg->getType();
+      return ExprError();
+    }
+    TheCall->setType(ElTy);
+    break;
+  }
 
   // These builtins support vectors of integers only.
-  // TODO: ADD/MUL should support floating-point types.
-  case Builtin::BI__builtin_reduce_add:
-  case Builtin::BI__builtin_reduce_mul:
   case Builtin::BI__builtin_reduce_xor:
   case Builtin::BI__builtin_reduce_or:
   case Builtin::BI__builtin_reduce_and: {
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 5906cb970f06c4..1a3dbd5e37a709 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -1056,6 +1056,14 @@ namespace RecuceAdd {
   static_assert(__builtin_reduce_add((vector4uint){~0U, 0, 0, 1}) == 0);
   static_assert(__builtin_reduce_add((vector4ulong){~0ULL, 0, 0, 1}) == 0);
 
+  static_assert(__builtin_reduce_add((vector4float){}) == 0.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+  static_assert(__builtin_reduce_add((vector4float){1.1, 2.2, 3.3, 4.4}) == 11.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+  static_assert(__builtin_reduce_add((vector4double){100.1, 200.2, 300.3, 400.4}) == 1001.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+
+
 
 #ifdef __SIZEOF_INT128__
   typedef __int128 v4i128 __attribute__((__vector_size__(128 * 2)));
@@ -1091,6 +1099,13 @@ namespace ReduceMul {
       (~0U - 1));
 #endif
   static_assert(__builtin_reduce_mul((vector4ulong){~0ULL, 1, 1, 2}) == ~0ULL - 1);
+
+    static_assert(__builtin_reduce_mul((vector4float){}) == 0.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+  static_assert(__builtin_reduce_mul((vector4float){1.0, 2.0, 3.0, 1.0}) == 6.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
+  static_assert(__builtin_reduce_mul((vector4double){3.0, 4.0, 1.0, 1.0}) == 12.0);
+  // both-error@-1 {{static assertion expression is not an integral constant expression}}
 }
 
 namespace ReduceAnd {
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index e12fd729c84c0b..35f12ca710e3e3 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -4,6 +4,7 @@
 // RUN: %clang_cc1 -O1 -triple aarch64 -target-feature +sve  %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=SVE   %s
 
 typedef float float4 __attribute__((ext_vector_type(4)));
+typedef double double4 __attribute__((ext_vector_type(4)));
 typedef short int si8 __attribute__((ext_vector_type(8)));
 typedef unsigned int u4 __attribute__((ext_vector_type(4)));
 
@@ -61,6 +62,16 @@ void test_builtin_reduce_min(float4 vf1, si8 vi1, u4 vu1) {
   unsigned long long r5 = __builtin_reduce_min(cvi1);
 }
 
+void test_builtin_reduce_addf(float4 vf4, double4 vd4) {          
+  // CHECK:      [[VF4:%.+]] = load <4 x float>, ptr %vf4.addr, align 16
+  // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[VF4]])
+  float r2 = __builtin_reduce_add(vf4);
+
+  // CHECK:      [[VD4:%.+]] = load <4 x double>, ptr %vd4.addr, align 16
+  // CHECK-NEXT: call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[VD4]])
+  double r3 = __builtin_reduce_add(vd4);
+}
+
 void test_builtin_reduce_add(si8 vi1, u4 vu1) {
   // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
   // CHECK-NEXT: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VI1]])
@@ -83,6 +94,16 @@ void test_builtin_reduce_add(si8 vi1, u4 vu1) {
   unsigned long long r5 = __builtin_reduce_add(cvu1);
 }
 
+void test_builtin_reduce_mulf(float4 vf4, double4 vd4) {          
+  // CHECK:      [[VF4:%.+]] = load <4 x float>, ptr %vf4.addr, align 16
+  // CHECK-NEXT: call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> [[VF4]])
+  float r2 = __builtin_reduce_mul(vf4);
+
+  // CHECK:      [[VD4:%.+]] = load <4 x double>, ptr %vd4.addr, align 16
+  // CHECK-NEXT: call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> [[VD4]])
+  double r3 = __builtin_reduce_mul(vd4);
+}
+
 void test_builtin_reduce_mul(si8 vi1, u4 vu1) {
   // CHECK:      [[VI1:%.+]] = load <8 x i16>, ptr %vi1.addr, align 16
   // CHECK-NEXT: call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> [[VI1]])
diff --git a/clang/test/Sema/builtins-reduction-math.c b/clang/test/Sema/builtins-reduction-math.c
index 9b0d91bfd6e3d2..9e2dac7ebbe6f6 100644
--- a/clang/test/Sema/builtins-reduction-math.c
+++ b/clang/test/Sema/builtins-reduction-math.c
@@ -36,7 +36,7 @@ void test_builtin_reduce_min(int i, float4 v, int3 iv) {
   // expected-error@-1 {{1st argument must be a vector type (was 'int')}}
 }
 
-void test_builtin_reduce_add(int i, float4 v, int3 iv) {
+void test_builtin_reduce_add(int i, float f, int3 iv) {
   struct Foo s = __builtin_reduce_add(iv);
   // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'int'}}
 
@@ -47,13 +47,13 @@ void test_builtin_reduce_add(int i, float4 v, int3 iv) {
   // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
 
   i = __builtin_reduce_add(i);
-  // expected-error@-1 {{1st argument must be a vector of integers (was 'int')}}
+  // expected-error@-1 {{1st argument must be a vector of integers or floating points (was 'int')}}
 
-  i = __builtin_reduce_add(v);
-  // expected-error@-1 {{1st argument must be a vector of integers (was 'float4' (vector of 4 'float' values))}}
+  f = __builtin_reduce_add(f);
+  // expected-error@-1 {{1st argument must be a vector of integers or floating points (was 'float')}}
 }
 
-void test_builtin_reduce_mul(int i, float4 v, int3 iv) {
+void test_builtin_reduce_mul(int i, float f, int3 iv) {
   struct Foo s = __builtin_reduce_mul(iv);
   // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'int'}}
 
@@ -64,10 +64,10 @@ void test_builtin_reduce_mul(int i, float4 v, int3 iv) {
   // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
 
   i = __builtin_reduce_mul(i);
-  // expected-error@-1 {{1st argument must be a vector of integers (was 'int')}}
+  // expected-error@-1 {{1st argument must be a vector of integers or floating points (was 'int')}}
 
-  i = __builtin_reduce_mul(v);
-  // expected-error@-1 {{1st argument must be a vector of integers (was 'float4' (vector of 4 'float' values))}}
+  f = __builtin_reduce_mul(f);
+  // expected-error@-1 {{1st argument must be a vector of integers or floating points (was 'float')}}
 }
 
 void test_builtin_reduce_xor(int i, float4 v, int3 iv) {
diff --git a/clang/test/Sema/constant_builtins_vector.cpp b/clang/test/Sema/constant_builtins_vector.cpp
index b2f56e5a87ab1a..58a05ec2dd406a 100644
--- a/clang/test/Sema/constant_builtins_vector.cpp
+++ b/clang/test/Sema/constant_builtins_vector.cpp
@@ -746,6 +746,12 @@ constexpr long long reduceAddLong2 = __builtin_reduce_add((vector4long){(1LL <<
 static_assert(__builtin_reduce_add((vector4uint){~0U, 0, 0, 1}) == 0);
 static_assert(__builtin_reduce_add((vector4ulong){~0ULL, 0, 0, 1}) == 0);
 
+constexpr float reduceAddFloat = __builtin_reduce_add((vector4float){1.0, 2.0, 3.0, 4.0});
+// expected-error@-1 {{must be initialized by a constant expression}}
+
+constexpr double reduceAddDouble = __builtin_reduce_add((vector4double){-1.0, 2.0, -3.0, 4.0});
+// expected-error@-1 {{must be initialized by a constant expression}}
+
 static_assert(__builtin_reduce_mul((vector4char){}) == 0);
 static_assert(__builtin_reduce_mul((vector4char){1, 2, 3, 4}) == 24);
 static_assert(__builtin_reduce_mul((vector4short){1, 2, 30, 40}) == 2400);
@@ -766,6 +772,12 @@ constexpr long long reduceMulLong2 = __builtin_reduce_mul((vector4long){(1LL <<
 static_assert(__builtin_reduce_mul((vector4uint){~0U, 1, 1, 2}) == ~0U - 1);
 static_assert(__builtin_reduce_mul((vector4ulong){~0ULL, 1, 1, 2}) == ~0ULL - 1);
 
+constexpr float reduceMulFloat = __builtin_reduce_mul((vector4float){1.0, 2.0, 3.0, 1.0});
+// expected-error@-1 {{must be initialized by a constant expression}}
+
+constexpr double reduceMulDouble = __builtin_reduce_mul((vector4double){3.0, 4.0, 1.0, 1.0});
+// expected-error@-1 {{must be initialized by a constant expression}}
+
 static_assert(__builtin_reduce_and((vector4char){}) == 0);
 static_assert(__builtin_reduce_and((vector4char){(char)0x11, (char)0x22, (char)0x44, (char)0x88}) == 0);
 static_assert(__builtin_reduce_and((vector4short){(short)0x1111, (short)0x2222, (short)0x4444, (short)0x8888}) == 0);

farzonl · 2024-12-18T06:59:33Z

clang/lib/AST/ByteCode/InterpBuiltin.cpp

+    // Floating-point arithmetic is not valid for constant expression
+    // initialization. Returning false defers checks to integral constant
+    // expression validation, preventing a bad deref of Floating as an integer.
+    return false;


I wanted to do something like this:

if(ElemType->isRealFloatingType()) { FPOptions FPO = Call->getFPFeaturesInEffect(S.getASTContext().getLangOpts()); llvm::RoundingMode RM = getRoundingMode(FPO); Floating Result = Arg.atIndex(0).deref<Floating>(); APFloat currResult = Result.getAPFloat(); for (unsigned I = 1; I != NumElems; ++I) { Floating Elem = Arg.atIndex(I).deref<Floating>(); if (ID == Builtin::BI__builtin_reduce_add) { if(APFloat::opStatus::opOK != currResult.add(Elem.getAPFloat(),RM)) return false; } else if (ID == Builtin::BI__builtin_reduce_mul) { if(APFloat::opStatus::opOK != currResult.multiply(Elem.getAPFloat(),RM)) return false; } else llvm_unreachable("Only reduce_add and reduce_mul are supported for floating-point types."); } S.Stk.push<Floating>(Floating(currResult)); return true; }

However there are checks further for isIntegralPointer() that make any code here to compute the reduction not make sense.

clang/include/clang/Basic/DiagnosticSemaKinds.td

llvm-beanz · 2024-12-18T17:17:40Z

clang/lib/AST/ByteCode/InterpBuiltin.cpp

+  if (ElemType->isRealFloatingType()) {
+    if (ID != Builtin::BI__builtin_reduce_add &&
+        ID != Builtin::BI__builtin_reduce_mul)
+      llvm_unreachable("Only reduce_add and reduce_mul are supported for "


Is this actually unreachable? You have a mix of both real error handling code and an unreachable. Code after an unreachable (like the return) can be removed by a compiler, which would cause some bad behavior here if this ever got hit.

Context the float case was exposed by the changes in SemaChecking.cpp which allowed reduce_add and reduce_mul to operate on floating point vectors. The remaining reduce builtins Builtin::BI__builtin_reduce_xor, Builtin::BI__builtin_reduce_or, and Builtin::BI__builtin_reduce_and are not reachable here because the integer checks in SemaChecking.cpp still apply to them. So yes this branch should be unreachable for all non reduce_add and reduce_mul reduction cases.

@llvm-beanz I don't understand how this code could be removed by the compiler. The code after the unreachable is in a different block. The unreachable is gated by the builtin check.

farzonl · 2024-12-18T18:51:18Z

@RKSimon It looks like you started this work with:

If you have time could you take a look and let me know what you think?

clang/lib/CodeGen/CGBuiltin.cpp

farzonl · 2025-01-03T17:17:19Z

@Il-Capitano @RKSimon could I get another review of this PR? Thanks in advance!

RKSimon · 2025-01-03T17:35:58Z

@RKSimon It looks like you started this work with:
1. [a23291b](https://github.com/llvm/llvm-project/commit/a23291b7db48670f7c57adcfb45877c826a97f22)

2. [8a92c45](https://github.com/llvm/llvm-project/commit/8a92c45e07dc81c83ca3afda3971d98c512429d4)
If you have time could you take a look and let me know what you think?

fp reductions are a nightmare - every time I thought we were getting somewhere, something else fastmath related causes more headaches.

farzonl · 2025-01-03T17:41:01Z

fp reductions are a nightmare - every time I thought we were getting somewhere, something else fastmath related causes more headaches.

@RKSimon should I not proceed with this PR? I was really hoping to use this in HLSL as it makes implementing many of our runtime apis easy in the frontend.

RKSimon · 2025-01-03T18:38:08Z

fp reductions are a nightmare - every time I thought we were getting somewhere, something else fastmath related causes more headaches.

@RKSimon should I not proceed with this PR? I was really hoping to use this in HLSL as it makes implementing many of our runtime apis easy in the frontend.

I'd definitely suggest you reduce the scope - maybe always expand the reduction serially in the frontend and not create the llvm intrinsic at all? Or maybe an alternative __builtin_reduce_fastmath_fadd / fmul that makes it clear whats happening, and have them always emit the reassoc variants of the reductions? There was a lot of resistance to reductions that change behaviour depending on which fast math flags were enabled.

CC @andykaylor @jcranmer-intel

Il-Capitano · 2025-01-06T10:07:47Z

__builtin_reduce_add/mul is defined to do recursive even-odd pairwise reduction in Clang, and the @llvm.vector.reduce.fadd intrinsic doesn't do that.

I'm not sure if there's any benefit to this definition over a sequential reduction. The only thing I can think that could be affected is signed-integer overflow being UB, but that doesn't seem to be optimized for, or checked with UBSan: https://godbolt.org/z/TEsc86rf5.

Maybe, as RKSimon suggested, having a separate builtin for floating-point types could be better? Something like __builtin_reduce_fadd/fmul that does sequential reduction, and another one for unordered reduction?
This builtin could also take a start value as the first argument to better match the LLVM intrinsic.

Or maybe an alternative __builtin_reduce_fastmath_fadd / fmul that makes it clear whats happening, and have them always emit the reassoc variants of the reductions?

I think __builtin_reduce_fadd_unordered or something similar would be a better choice, since "fastmath" implies more restrictions/assumptions than reassoc (no nans, no signed zeros, etc.).

…reduce_multipy

farzonl · 2025-01-06T20:08:31Z

I'd definitely suggest you reduce the scope - maybe always expand the reduction serially in the frontend and not create the llvm intrinsic at all?

Between @Il-Capitano and @RKSimon comments I think it makes the most sense to do a sequential reduction in the frontend. Thats actually easier because we won't have to add SPIRV backend support for vec reduce.

…l replace with sequential reduction

Il-Capitano · 2025-01-06T20:58:09Z

Just to clarify: @llvm.vector.reduce.fadd does sequential reduction by default, so I don't see a point in doing that manually in the frontend. Your previous implementation had the same behaviour.

The inconsistency with the sequential approach is that Clang defines the __builtin_reduce_* operations to do recursive even-odd pairwise reduction (i.e. (v[0] + v[1]) + (v[2] + v[3]) instead of ((v[0] + v[1]) + v[2]) + v[3])), and since floating-poing addition is not associative, these two can have different results.

My suggestion was to not use the existing __builtin_reduce_add and mul builtins, but to define a new one that is defined to do sequential reduction, matching the behaviour of @llvm.vector.reduce.fadd, and one that is unordered, i.e. @llvm.vector.reduce.fadd with the reassoc fast-math flag set (in practice this will do the even-odd pairwise reduction, but there is a difference in generated code quality between doing that in the frontend, and the backend: https://godbolt.org/z/a4rd44Eza).

You can see the generated code difference of @llvm.vector.reduce.fadd with and without the reassoc flag here: https://godbolt.org/z/zeWjxrxo5.

llvmbot added clang Clang issues not falling into any other category clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:codegen labels Dec 18, 2024

farzonl commented Dec 18, 2024

View reviewed changes

llvm-beanz reviewed Dec 18, 2024

View reviewed changes

farzonl requested a review from RKSimon December 18, 2024 18:49

Il-Capitano reviewed Dec 25, 2024

View reviewed changes

clang/lib/CodeGen/CGBuiltin.cpp Outdated Show resolved Hide resolved

Il-Capitano reviewed Dec 25, 2024

View reviewed changes

clang/lib/CodeGen/CGBuiltin.cpp Outdated Show resolved Hide resolved

farzonl force-pushed the feature/float_reduce_arthimetic branch 2 times, most recently from 7850912 to bae1662 Compare December 27, 2024 10:49

RKSimon requested review from andykaylor and jcranmer-intel January 3, 2025 18:38

farzonl added 2 commits January 6, 2025 13:04

[Clang] Add float type support to __builtin_reduce_add and __builtin_…

e46ef65

…reduce_multipy

address pr feedback

f935ecd

remove Intrinsic::vector_reduce_fadd and Intrinsic::vector_reduce_fmu…

a063f63

…l replace with sequential reduction

farzonl force-pushed the feature/float_reduce_arthimetic branch from bae1662 to a063f63 Compare January 6, 2025 20:11

farzonl marked this pull request as draft January 10, 2025 19:38

farzonl closed this Jan 16, 2025

farzonl deleted the feature/float_reduce_arthimetic branch January 16, 2025 07:33

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Clang] Add float type support to __builtin_reduce_add and __builtin_reduce_multipy #120367

[Clang] Add float type support to __builtin_reduce_add and __builtin_reduce_multipy #120367

farzonl commented Dec 18, 2024 •

edited

Loading

llvmbot commented Dec 18, 2024

llvmbot commented Dec 18, 2024

farzonl Dec 18, 2024

llvm-beanz Dec 18, 2024

farzonl Dec 18, 2024 •

edited

Loading

farzonl Jan 3, 2025 •

edited

Loading

farzonl commented Dec 18, 2024

farzonl commented Jan 3, 2025

RKSimon commented Jan 3, 2025

farzonl commented Jan 3, 2025

RKSimon commented Jan 3, 2025

Il-Capitano commented Jan 6, 2025

farzonl commented Jan 6, 2025

Il-Capitano commented Jan 6, 2025

[Clang] Add float type support to __builtin_reduce_add and __builtin_reduce_multipy #120367

[Clang] Add float type support to __builtin_reduce_add and __builtin_reduce_multipy #120367

Conversation

farzonl commented Dec 18, 2024 • edited Loading

llvmbot commented Dec 18, 2024

llvmbot commented Dec 18, 2024

farzonl Dec 18, 2024

Choose a reason for hiding this comment

llvm-beanz Dec 18, 2024

Choose a reason for hiding this comment

farzonl Dec 18, 2024 • edited Loading

Choose a reason for hiding this comment

farzonl Jan 3, 2025 • edited Loading

Choose a reason for hiding this comment

farzonl commented Dec 18, 2024

farzonl commented Jan 3, 2025

RKSimon commented Jan 3, 2025

farzonl commented Jan 3, 2025

RKSimon commented Jan 3, 2025

Il-Capitano commented Jan 6, 2025

farzonl commented Jan 6, 2025

Il-Capitano commented Jan 6, 2025

farzonl commented Dec 18, 2024 •

edited

Loading

farzonl Dec 18, 2024 •

edited

Loading

farzonl Jan 3, 2025 •

edited

Loading