Skip to content

Commit

Permalink
fix: Disable optimization to avoid potential errors
Browse files Browse the repository at this point in the history
Some functions may be ignored or executed incorrectly if optimization is
applied. A classic example is the test `test_mm_set_rounding_mode`.

_mm_round_ps is computed at compile time, in which case the change of
rounding mode is ignored.
  • Loading branch information
howjmay committed Jun 26, 2024
1 parent 213bee6 commit e5de7cd
Showing 1 changed file with 16 additions and 10 deletions.
26 changes: 16 additions & 10 deletions sse2neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,14 @@
#pragma message("Macro name collisions may happen with unsupported compilers.")
#endif

// FORCE_INLINE_NO_OPTIMIZE marks a function that the compiler must not
// optimize, so that its result is not computed at compile time (for example
// a rounding intrinsic whose result depends on the current rounding mode).
//   - clang: disable optimization with the optnone attribute.
//   - GCC:   compile the function at -O0 via the optimize attribute.
//   - other: no per-function switch is used; fall back to FORCE_INLINE.
#if defined(__clang__)
#define FORCE_INLINE_NO_OPTIMIZE static inline __attribute__((optnone))
#elif defined(__GNUC__)
#define FORCE_INLINE_NO_OPTIMIZE static inline __attribute__((optimize("O0")))
#else
#define FORCE_INLINE_NO_OPTIMIZE FORCE_INLINE
#endif

// Emit a compile-time warning when building with a real GCC (clang is
// excluded explicitly) whose major version is below 10.
#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10
#warning "GCC versions earlier than 10 are not supported."
#endif
Expand Down Expand Up @@ -579,8 +587,8 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
FORCE_INLINE __m128 _mm_ceil_ps(__m128);
FORCE_INLINE __m128d _mm_floor_pd(__m128d);
FORCE_INLINE __m128 _mm_floor_ps(__m128);
FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
FORCE_INLINE __m128 _mm_round_ps(__m128, int);
static inline __m128d _mm_round_pd(__m128d, int);
static inline __m128 _mm_round_ps(__m128, int);
// SSE4.2
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);

Expand Down Expand Up @@ -2162,7 +2170,7 @@ FORCE_INLINE int _mm_movemask_ps(__m128 a)
// Multiply packed single-precision (32-bit) floating-point elements in a and b,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
FORCE_INLINE_NO_OPTIMIZE __m128 _mm_mul_ps(__m128 a, __m128 b)
{
return vreinterpretq_m128_f32(
vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
Expand Down Expand Up @@ -2433,8 +2441,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
// _MM_ROUND_TOWARD_ZERO
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
{
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) {
union {
fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
Expand Down Expand Up @@ -3843,7 +3850,7 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
FORCE_INLINE_NO_OPTIMIZE __m128i _mm_cvtpd_epi32(__m128d a)
{
// vrnd32xq_f64 not supported on clang
#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
Expand All @@ -3862,8 +3869,7 @@ FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
{
FORCE_INLINE_NO_OPTIMIZE __m64 _mm_cvtpd_pi32(__m128d a) {
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
double d0 = ((double *) &rnd)[0];
double d1 = ((double *) &rnd)[1];
Expand Down Expand Up @@ -7421,7 +7427,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
// the rounding parameter, and store the results as packed double-precision
// floating-point elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
FORCE_INLINE_NO_OPTIMIZE __m128d _mm_round_pd(__m128d a, int rounding)
{
#if defined(__aarch64__) || defined(_M_ARM64)
switch (rounding) {
Expand Down Expand Up @@ -7490,7 +7496,7 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
// the rounding parameter, and store the results as packed single-precision
// floating-point elements in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
FORCE_INLINE_NO_OPTIMIZE __m128 _mm_round_ps(__m128 a, int rounding)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
defined(__ARM_FEATURE_DIRECTED_ROUNDING)
Expand Down

0 comments on commit e5de7cd

Please sign in to comment.