fix: Disable optimization to avoid potential errors

Some functions may be ignored or executed incorrectly if optimization is applied. A classic example is the test `test_mm_set_rounding_mode`: `_mm_round_ps` is computed at compile time, in which case the rounding-mode change is ignored.
DLTcollab · Jun 26, 2024 · e5de7cd · e5de7cd
1 parent 213bee6
commit e5de7cd
Showing 1 changed file with 16 additions and 10 deletions.
diff --git a/sse2neon.h b/sse2neon.h
@@ -106,6 +106,14 @@
 #pragma message("Macro name collisions may happen with unsupported compilers.")
 #endif
 
+#if defined(__GNUC__) && !defined(__clang__)
+#define FORCE_INLINE_NO_OPTIMIZE static inline __attribute__((optimize("O0")))
+#elif defined(__clang__)
+#define FORCE_INLINE_NO_OPTIMIZE static inline __attribute__((optnone))
+#else
+#define FORCE_INLINE_NO_OPTIMIZE FORCE_INLINE
+#endif
+
 #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10
 #warning "GCC versions earlier than 10 are not supported."
 #endif
@@ -579,8 +587,8 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
 FORCE_INLINE __m128 _mm_ceil_ps(__m128);
 FORCE_INLINE __m128d _mm_floor_pd(__m128d);
 FORCE_INLINE __m128 _mm_floor_ps(__m128);
-FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
-FORCE_INLINE __m128 _mm_round_ps(__m128, int);
+static inline __m128d _mm_round_pd(__m128d, int);
+static inline __m128 _mm_round_ps(__m128, int);
 // SSE4.2
 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
 
@@ -2162,7 +2170,7 @@ FORCE_INLINE int _mm_movemask_ps(__m128 a)
 // Multiply packed single-precision (32-bit) floating-point elements in a and b,
 // and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
+FORCE_INLINE_NO_OPTIMIZE __m128 _mm_mul_ps(__m128 a, __m128 b)
 {
     return vreinterpretq_m128_f32(
         vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
@@ -2433,8 +2441,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
 // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
 // _MM_ROUND_TOWARD_ZERO
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
-FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
-{
+FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) {
     union {
         fpcr_bitfield field;
 #if defined(__aarch64__) || defined(_M_ARM64)
@@ -3843,7 +3850,7 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
 // Convert packed double-precision (64-bit) floating-point elements in a to
 // packed 32-bit integers, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
-FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
+FORCE_INLINE_NO_OPTIMIZE __m128i _mm_cvtpd_epi32(__m128d a)
 {
 // vrnd32xq_f64 not supported on clang
 #if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
@@ -3862,8 +3869,7 @@ FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
 // Convert packed double-precision (64-bit) floating-point elements in a to
 // packed 32-bit integers, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
-FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
-{
+FORCE_INLINE_NO_OPTIMIZE __m64 _mm_cvtpd_pi32(__m128d a)  {
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
     double d0 = ((double *) &rnd)[0];
     double d1 = ((double *) &rnd)[1];
@@ -7421,7 +7427,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
 // the rounding parameter, and store the results as packed double-precision
 // floating-point elements in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
-FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
+FORCE_INLINE_NO_OPTIMIZE __m128d _mm_round_pd(__m128d a, int rounding)
 {
 #if defined(__aarch64__) || defined(_M_ARM64)
     switch (rounding) {
@@ -7490,7 +7496,7 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
 // the rounding parameter, and store the results as packed single-precision
 // floating-point elements in dst.
 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
-FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
+FORCE_INLINE_NO_OPTIMIZE __m128 _mm_round_ps(__m128 a, int rounding)
 {
 #if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)