diff --git a/sse2neon.h b/sse2neon.h
index 7bbf4e1a..30d2b18b 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -106,6 +106,17 @@
 #pragma message("Macro name collisions may happen with unsupported compilers.")
 #endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma push_macro("FORCE_INLINE_OPTNONE")
+#define FORCE_INLINE_OPTNONE static inline __attribute__((optimize("O0")))
+#elif defined(__clang__)
+#pragma push_macro("FORCE_INLINE_OPTNONE")
+#define FORCE_INLINE_OPTNONE static inline __attribute__((optnone))
+#else
+#define FORCE_INLINE_OPTNONE FORCE_INLINE
+#endif
+
 #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10
 #warning "GCC versions earlier than 10 are not supported."
 #endif
 
@@ -579,8 +590,8 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
 FORCE_INLINE __m128 _mm_ceil_ps(__m128);
 FORCE_INLINE __m128d _mm_floor_pd(__m128d);
 FORCE_INLINE __m128 _mm_floor_ps(__m128);
-FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
-FORCE_INLINE __m128 _mm_round_ps(__m128, int);
+static inline __m128d _mm_round_pd(__m128d, int);
+static inline __m128 _mm_round_ps(__m128, int);
 
 // SSE4.2
 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
@@ -2162,7 +2173,7 @@ FORCE_INLINE int _mm_movemask_ps(__m128 a)
 // Multiply packed single-precision (32-bit) floating-point elements in a and b,
 // and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
+FORCE_INLINE_OPTNONE __m128 _mm_mul_ps(__m128 a, __m128 b)
 {
     return vreinterpretq_m128_f32(
         vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
@@ -3843,7 +3854,7 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
 // Convert packed double-precision (64-bit) floating-point elements in a to
 // packed 32-bit integers, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
-FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
+FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
 {
     // vrnd32xq_f64 not supported on clang
 #if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
@@ -3862,7 +3873,7 @@ FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
 // Convert packed double-precision (64-bit) floating-point elements in a to
 // packed 32-bit integers, and store the results in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
-FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
+FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a)
 {
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
     double d0 = ((double *) &rnd)[0];
@@ -7421,7 +7432,7 @@ FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
 // the rounding parameter, and store the results as packed double-precision
 // floating-point elements in dst.
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
-FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
+FORCE_INLINE_OPTNONE __m128d _mm_round_pd(__m128d a, int rounding)
 {
 #if defined(__aarch64__) || defined(_M_ARM64)
     switch (rounding) {
@@ -7490,7 +7501,7 @@ FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
 // the rounding parameter, and store the results as packed single-precision
 // floating-point elements in dst.
 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
-FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
+FORCE_INLINE_OPTNONE __m128 _mm_round_ps(__m128 a, int rounding)
 {
 #if (defined(__aarch64__) || defined(_M_ARM64)) || \
     defined(__ARM_FEATURE_DIRECTED_ROUNDING)
@@ -9280,6 +9291,7 @@ FORCE_INLINE uint64_t _rdtsc(void)
 #if defined(__GNUC__) || defined(__clang__)
 #pragma pop_macro("ALIGN_STRUCT")
 #pragma pop_macro("FORCE_INLINE")
+#pragma pop_macro("FORCE_INLINE_OPTNONE")
 #endif
 
 #if defined(__GNUC__) && !defined(__clang__)
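Usage sketch (not part of the patch). Assuming the GCC optimize("O0") and Clang optnone attributes behave as documented, FORCE_INLINE_OPTNONE keeps the annotated helpers from being optimized even when the including translation unit is built at -O2. The test program below is illustrative only: the file name, build command, and input values are hypothetical, while _mm_set_pd, _mm_cvtpd_epi32, and _mm_storeu_si128 are intrinsics sse2neon already provides.

/* round_check.c -- hypothetical smoke test for the OPTNONE-annotated paths.
 * Build (on an AArch64 host, assumed): gcc -O2 round_check.c -o round_check
 */
#include <stdint.h>
#include <stdio.h>
#include "sse2neon.h"

int main(void)
{
    /* Tie values (-1.5, 2.5) make the result depend on the current
     * rounding mode rather than on anything the compiler can fold. */
    __m128d v = _mm_set_pd(2.5, -1.5);

    /* _mm_cvtpd_epi32 is one of the functions the patch marks
     * FORCE_INLINE_OPTNONE, so this call is compiled without
     * optimization even in an -O2 build. */
    __m128i r = _mm_cvtpd_epi32(v);

    int32_t out[4];
    _mm_storeu_si128((__m128i *) out, r);
    printf("%d %d\n", out[0], out[1]);
    return 0;
}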