diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 4c7646414642..6d3a3d097003 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -123,13 +123,13 @@ * [ ] [`_mm512_cvtepu32lo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32lo_pd&expand=5236) * [ ] [`_mm512_cvtepu8_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu8_epi32&expand=5236) * [ ] [`_mm512_cvtepu8_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu8_epi64&expand=5236) - * [ ] [`_mm512_cvtpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_epi32&expand=5236) - * [ ] [`_mm512_cvtpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_epu32&expand=5236) + * [x] [`_mm512_cvtpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_epi32&expand=5236) + * [x] [`_mm512_cvtpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_epu32&expand=5236) * [ ] [`_mm512_cvtpd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ps&expand=5236) * [ ] [`_mm512_cvtpd_pslo`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_pslo&expand=5236) * [ ] [`_mm512_cvtph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_ps&expand=5236) - * [ ] [`_mm512_cvtps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_epi32&expand=5236) - * [ ] [`_mm512_cvtps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_epu32&expand=5236) + * [x] [`_mm512_cvtps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_epi32&expand=5236) + * [x] [`_mm512_cvtps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_epu32&expand=5236) * [ ] [`_mm512_cvtps_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_pd&expand=5236) * [ ] [`_mm512_cvtps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_ph&expand=5236) * [ ] [`_mm512_cvtpslo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpslo_pd&expand=5236) @@ -138,14 +138,14 @@ * [ ] [`_mm512_cvtsepi64_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi64_epi16&expand=5236) * [ ] [`_mm512_cvtsepi64_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi64_epi32&expand=5236) * [ ] [`_mm512_cvtsepi64_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi64_epi8&expand=5236) - * [ ] [`_mm512_cvtt_roundpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundpd_epi32&expand=5236) - * [ ] [`_mm512_cvtt_roundpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundpd_epu32&expand=5236) - * [ ] [`_mm512_cvtt_roundps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundps_epi32&expand=5236) - * [ ] [`_mm512_cvtt_roundps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundps_epu32&expand=5236) - * [ ] [`_mm512_cvttpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttpd_epi32&expand=5236) - * [ ] 
[`_mm512_cvttpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttpd_epu32&expand=5236) - * [ ] [`_mm512_cvttps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttps_epi32&expand=5236) - * [ ] [`_mm512_cvttps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttps_epu32&expand=5236) + * [x] [`_mm512_cvtt_roundpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundpd_epi32&expand=5236) + * [x] [`_mm512_cvtt_roundpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundpd_epu32&expand=5236) + * [x] [`_mm512_cvtt_roundps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundps_epi32&expand=5236) + * [x] [`_mm512_cvtt_roundps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundps_epu32&expand=5236) + * [x] [`_mm512_cvttpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttpd_epi32&expand=5236) + * [x] [`_mm512_cvttpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttpd_epu32&expand=5236) + * [x] [`_mm512_cvttps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttps_epi32&expand=5236) + * [x] [`_mm512_cvttps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttps_epu32&expand=5236) * [ ] [`_mm512_cvtusepi32_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi32_epi16&expand=5236) * [ ] [`_mm512_cvtusepi32_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi32_epi8&expand=5236) * [ ] [`_mm512_cvtusepi64_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi64_epi16&expand=5236) @@ -245,10 +245,10 @@ * [x] [`_mm512_loadu_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_pd&expand=5236) * [x] [`_mm512_loadu_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ps&expand=5236) * [ ] [`_mm512_loadu_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_si512&expand=5236) - * [ ] [`_mm512_mask2_permutex2var_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi32&expand=5236) - * [ ] [`_mm512_mask2_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi64&expand=5236) - * [ ] [`_mm512_mask2_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_pd&expand=5236) - * [ ] [`_mm512_mask2_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_ps&expand=5236) + * [x] [`_mm512_mask2_permutex2var_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi32&expand=5236) + * [x] [`_mm512_mask2_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi64&expand=5236) + * [x] [`_mm512_mask2_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_pd&expand=5236) + * [x] [`_mm512_mask2_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_ps&expand=5236) * [ ] 
[`_mm512_mask2int`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2int&expand=5236) * [x] [`_mm512_mask3_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pd&expand=5236) * [x] [`_mm512_mask3_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ps&expand=5236) @@ -394,8 +394,8 @@ * [ ] [`_mm512_mask_cvtepu32lo_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32lo_pd&expand=5236) * [ ] [`_mm512_mask_cvtepu8_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu8_epi32&expand=5236) * [ ] [`_mm512_mask_cvtepu8_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu8_epi64&expand=5236) - * [ ] [`_mm512_mask_cvtpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_epi32&expand=5236) - * [ ] [`_mm512_mask_cvtpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_epu32&expand=5236) + * [x] [`_mm512_mask_cvtpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_epi32&expand=5236) + * [x] [`_mm512_mask_cvtpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_epu32&expand=5236) * [ ] [`_mm512_mask_cvtpd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ps&expand=5236) * [ ] [`_mm512_mask_cvtpd_pslo`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_pslo&expand=5236) * [ ] [`_mm512_mask_cvtph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_ps&expand=5236) @@ -414,14 +414,14 @@ * [ ] [`_mm512_mask_cvtsepi64_storeu_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_storeu_epi16&expand=5236) * [ ] [`_mm512_mask_cvtsepi64_storeu_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_storeu_epi32&expand=5236) * [ ] [`_mm512_mask_cvtsepi64_storeu_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_storeu_epi8&expand=5236) - * [ ] [`_mm512_mask_cvtt_roundpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundpd_epi32&expand=5236) - * [ ] [`_mm512_mask_cvtt_roundpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundpd_epu32&expand=5236) - * [ ] [`_mm512_mask_cvtt_roundps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundps_epi32&expand=5236) - * [ ] [`_mm512_mask_cvtt_roundps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundps_epu32&expand=5236) - * [ ] [`_mm512_mask_cvttpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttpd_epi32&expand=5236) - * [ ] [`_mm512_mask_cvttpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttpd_epu32&expand=5236) - * [ ] [`_mm512_mask_cvttps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttps_epi32&expand=5236) - * [ ] [`_mm512_mask_cvttps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttps_epu32&expand=5236) + * [x] 
[`_mm512_mask_cvtt_roundpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundpd_epi32&expand=5236) + * [x] [`_mm512_mask_cvtt_roundpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundpd_epu32&expand=5236) + * [x] [`_mm512_mask_cvtt_roundps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundps_epi32&expand=5236) + * [x] [`_mm512_mask_cvtt_roundps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundps_epu32&expand=5236) + * [x] [`_mm512_mask_cvttpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttpd_epi32&expand=5236) + * [x] [`_mm512_mask_cvttpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttpd_epu32&expand=5236) + * [x] [`_mm512_mask_cvttps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttps_epi32&expand=5236) + * [x] [`_mm512_mask_cvttps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttps_epu32&expand=5236) * [ ] [`_mm512_mask_cvtusepi32_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi32_epi16&expand=5236) * [ ] [`_mm512_mask_cvtusepi32_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi32_epi8&expand=5236) * [ ] [`_mm512_mask_cvtusepi32_storeu_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi32_storeu_epi16&expand=5236) @@ -556,21 +556,21 @@ * [x] [`_mm512_mask_mullox_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mullox_epi64&expand=5236) * [x] [`_mm512_mask_or_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_or_epi32&expand=5236) * [x] [`_mm512_mask_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_or_epi64&expand=5236) - * [ ] [`_mm512_mask_permute_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permute_pd&expand=5236) - * [ ] [`_mm512_mask_permute_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permute_ps&expand=5236) - * [ ] [`_mm512_mask_permutevar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_epi32&expand=5236) - * [ ] [`_mm512_mask_permutevar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_pd&expand=5236) - * [ ] [`_mm512_mask_permutevar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_ps&expand=5236) - * [ ] [`_mm512_mask_permutex2var_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_epi32&expand=5236) - * [ ] [`_mm512_mask_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_epi64&expand=5236) - * [ ] [`_mm512_mask_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_pd&expand=5236) - * [ ] [`_mm512_mask_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_ps&expand=5236) - * [ ] [`_mm512_mask_permutex_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex_epi64&expand=5236) - * [ ] 
[`_mm512_mask_permutex_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex_pd&expand=5236) - * [ ] [`_mm512_mask_permutexvar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_epi32&expand=5236) - * [ ] [`_mm512_mask_permutexvar_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_epi64&expand=5236) - * [ ] [`_mm512_mask_permutexvar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_pd&expand=5236) - * [ ] [`_mm512_mask_permutexvar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_ps&expand=5236) + * [x] [`_mm512_mask_permute_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permute_pd&expand=5236) + * [x] [`_mm512_mask_permute_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permute_ps&expand=5236) + * [x] [`_mm512_mask_permutevar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_epi32&expand=5236) + * [x] [`_mm512_mask_permutevar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_pd&expand=5236) + * [x] [`_mm512_mask_permutevar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_ps&expand=5236) + * [x] [`_mm512_mask_permutex2var_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_epi32&expand=5236) + * [x] [`_mm512_mask_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_epi64&expand=5236) + * [x] [`_mm512_mask_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_pd&expand=5236) + * [x] [`_mm512_mask_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_ps&expand=5236) + * [x] [`_mm512_mask_permutex_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex_epi64&expand=5236) + * [x] [`_mm512_mask_permutex_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex_pd&expand=5236) + * [x] [`_mm512_mask_permutexvar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_epi32&expand=5236) + * [x] [`_mm512_mask_permutexvar_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_epi64&expand=5236) + * [x] [`_mm512_mask_permutexvar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_pd&expand=5236) + * [x] [`_mm512_mask_permutexvar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_ps&expand=5236) * [x] [`_mm512_mask_rcp14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp14_pd&expand=5236) * [x] [`_mm512_mask_rcp14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp14_ps&expand=5236) * [ ] [`_mm512_mask_reduce_add_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_epi32&expand=5236) @@ -615,15 +615,15 @@ * [ ] [`_mm512_mask_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ps&expand=5236) * [ ] 
[`_mm512_mask_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=5236) * [ ] [`_mm512_mask_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=5236) - * [ ] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236) + * [x] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236) * [x] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236) * [ ] [`_mm512_mask_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5236) * [ ] [`_mm512_mask_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32x4&expand=5236) * [ ] [`_mm512_mask_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f64x2&expand=5236) * [ ] [`_mm512_mask_shuffle_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i32x4&expand=5236) * [ ] [`_mm512_mask_shuffle_i64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i64x2&expand=5236) - * [ ] [`_mm512_mask_shuffle_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_pd&expand=5236) - * [ ] [`_mm512_mask_shuffle_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_ps&expand=5236) + * [x] [`_mm512_mask_shuffle_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_pd&expand=5236) + * [x] [`_mm512_mask_shuffle_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_ps&expand=5236) * [x] [`_mm512_mask_sll_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sll_epi32&expand=5236) * [x] [`_mm512_mask_sll_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sll_epi64&expand=5236) * [x] [`_mm512_mask_slli_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_slli_epi32&expand=5236) @@ -709,7 +709,7 @@ * [ ] [`_mm512_maskz_cvt_roundpd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ps&expand=5236) * [ ] [`_mm512_maskz_cvt_roundph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_ps&expand=5236) * [x] [`_mm512_maskz_cvt_roundps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_epi32&expand=5236) - * [ ] [`_mm512_maskz_cvt_roundps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_epu32&expand=5236) + * [x] [`_mm512_maskz_cvt_roundps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_epu32&expand=5236) * [ ] [`_mm512_maskz_cvt_roundps_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_pd&expand=5236) * [ ] [`_mm512_maskz_cvt_roundps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_ph&expand=5236) * [ ] 
[`_mm512_maskz_cvtepi16_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_epi32&expand=5236) @@ -731,12 +731,12 @@ * [ ] [`_mm512_maskz_cvtepu32_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ps&expand=5236) * [ ] [`_mm512_maskz_cvtepu8_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu8_epi32&expand=5236) * [ ] [`_mm512_maskz_cvtepu8_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu8_epi64&expand=5236) - * [ ] [`_mm512_maskz_cvtpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_epi32&expand=5236) - * [ ] [`_mm512_maskz_cvtpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_epu32&expand=5236) + * [x] [`_mm512_maskz_cvtpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_epi32&expand=5236) + * [x] [`_mm512_maskz_cvtpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_epu32&expand=5236) * [ ] [`_mm512_maskz_cvtpd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ps&expand=5236) * [ ] [`_mm512_maskz_cvtph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_ps&expand=5236) - * [ ] [`_mm512_maskz_cvtps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_epi32&expand=5236) - * [ ] [`_mm512_maskz_cvtps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_epu32&expand=5236) + * [x] [`_mm512_maskz_cvtps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_epi32&expand=5236) + * [x] [`_mm512_maskz_cvtps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_epu32&expand=5236) * [ ] [`_mm512_maskz_cvtps_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_pd&expand=5236) * [ ] [`_mm512_maskz_cvtps_ph`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_ph&expand=5236) * [ ] [`_mm512_maskz_cvtsepi32_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi32_epi16&expand=5236) @@ -744,14 +744,14 @@ * [ ] [`_mm512_maskz_cvtsepi64_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi64_epi16&expand=5236) * [ ] [`_mm512_maskz_cvtsepi64_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi64_epi32&expand=5236) * [ ] [`_mm512_maskz_cvtsepi64_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi64_epi8&expand=5236) - * [ ] [`_mm512_maskz_cvtt_roundpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundpd_epi32&expand=5236) - * [ ] [`_mm512_maskz_cvtt_roundpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundpd_epu32&expand=5236) - * [ ] [`_mm512_maskz_cvtt_roundps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundps_epi32&expand=5236) - * [ ] [`_mm512_maskz_cvtt_roundps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundps_epu32&expand=5236) - * [ ] 
[`_mm512_maskz_cvttpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttpd_epi32&expand=5236) - * [ ] [`_mm512_maskz_cvttpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttpd_epu32&expand=5236) - * [ ] [`_mm512_maskz_cvttps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttps_epi32&expand=5236) - * [ ] [`_mm512_maskz_cvttps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttps_epu32&expand=5236) + * [x] [`_mm512_maskz_cvtt_roundpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundpd_epi32&expand=5236) + * [x] [`_mm512_maskz_cvtt_roundpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundpd_epu32&expand=5236) + * [x] [`_mm512_maskz_cvtt_roundps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundps_epi32&expand=5236) + * [x] [`_mm512_maskz_cvtt_roundps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundps_epu32&expand=5236) + * [x] [`_mm512_maskz_cvttpd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttpd_epi32&expand=5236) + * [x] [`_mm512_maskz_cvttpd_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttpd_epu32&expand=5236) + * [x] [`_mm512_maskz_cvttps_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttps_epi32&expand=5236) + * [x] [`_mm512_maskz_cvttps_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttps_epu32&expand=5236) * [ ] [`_mm512_maskz_cvtusepi32_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtusepi32_epi16&expand=5236) * [ ] [`_mm512_maskz_cvtusepi32_epi8`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtusepi32_epi8&expand=5236) * [ ] [`_mm512_maskz_cvtusepi64_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtusepi64_epi16&expand=5236) @@ -853,20 +853,20 @@ * [x] [`_mm512_maskz_mullo_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mullo_epi32&expand=5236) * [x] [`_mm512_maskz_or_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_or_epi32&expand=5236) * [x] [`_mm512_maskz_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_or_epi64&expand=5236) - * [ ] [`_mm512_maskz_permute_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permute_pd&expand=5236) - * [ ] [`_mm512_maskz_permute_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permute_ps&expand=5236) - * [ ] [`_mm512_maskz_permutevar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutevar_pd&expand=5236) - * [ ] [`_mm512_maskz_permutevar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutevar_ps&expand=5236) - * [ ] [`_mm512_maskz_permutex2var_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_epi32&expand=5236) - * [ ] [`_mm512_maskz_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_epi64&expand=5236) - * [ ] 
[`_mm512_maskz_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_pd&expand=5236) - * [ ] [`_mm512_maskz_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_ps&expand=5236) - * [ ] [`_mm512_maskz_permutex_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex_epi64&expand=5236) - * [ ] [`_mm512_maskz_permutex_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex_pd&expand=5236) - * [ ] [`_mm512_maskz_permutexvar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_epi32&expand=5236) - * [ ] [`_mm512_maskz_permutexvar_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_epi64&expand=5236) - * [ ] [`_mm512_maskz_permutexvar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_pd&expand=5236) - * [ ] [`_mm512_maskz_permutexvar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_ps&expand=5236) + * [x] [`_mm512_maskz_permute_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permute_pd&expand=5236) + * [x] [`_mm512_maskz_permute_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permute_ps&expand=5236) + * [x] [`_mm512_maskz_permutevar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutevar_pd&expand=5236) + * [x] [`_mm512_maskz_permutevar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutevar_ps&expand=5236) + * [x] [`_mm512_maskz_permutex2var_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_epi32&expand=5236) + * [x] [`_mm512_maskz_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_epi64&expand=5236) + * [x] [`_mm512_maskz_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_pd&expand=5236) + * [x] [`_mm512_maskz_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_ps&expand=5236) + * [x] [`_mm512_maskz_permutex_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex_epi64&expand=5236) + * [x] [`_mm512_maskz_permutex_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex_pd&expand=5236) + * [x] [`_mm512_maskz_permutexvar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_epi32&expand=5236) + * [x] [`_mm512_maskz_permutexvar_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_epi64&expand=5236) + * [x] [`_mm512_maskz_permutexvar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_pd&expand=5236) + * [x] [`_mm512_maskz_permutexvar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_ps&expand=5236) * [x] [`_mm512_maskz_rcp14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp14_pd&expand=5236) * [x] [`_mm512_maskz_rcp14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp14_ps&expand=5236) * [x] 
[`_mm512_maskz_rol_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rol_epi32&expand=5236) @@ -881,8 +881,8 @@ * [ ] [`_mm512_maskz_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ps&expand=5236) * [ ] [`_mm512_maskz_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_pd&expand=5236) * [ ] [`_mm512_maskz_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=5236) - * [ ] [`_mm512_maskz_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_pd&expand=5236) - * [ ] [`_mm512_maskz_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_ps&expand=5236) + * [x] [`_mm512_maskz_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_pd&expand=5236) + * [x] [`_mm512_maskz_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_ps&expand=5236) * [ ] [`_mm512_maskz_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_pd&expand=5236) * [ ] [`_mm512_maskz_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ps&expand=5236) * [ ] [`_mm512_maskz_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_pd&expand=5236) @@ -894,8 +894,8 @@ * [ ] [`_mm512_maskz_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f64x2&expand=5236) * [ ] [`_mm512_maskz_shuffle_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i32x4&expand=5236) * [ ] [`_mm512_maskz_shuffle_i64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i64x2&expand=5236) - * [ ] [`_mm512_maskz_shuffle_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_pd&expand=5236) - * [ ] [`_mm512_maskz_shuffle_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_ps&expand=5236) + * [x] [`_mm512_maskz_shuffle_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_pd&expand=5236) + * [x] [`_mm512_maskz_shuffle_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_ps&expand=5236) * [x] [`_mm512_maskz_sll_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sll_epi32&expand=5236) * [x] [`_mm512_maskz_sll_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sll_epi64&expand=5236) * [x] [`_mm512_maskz_slli_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_slli_epi32&expand=5236) @@ -966,23 +966,23 @@ * [x] [`_mm512_or_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_or_epi32&expand=5236) * [x] [`_mm512_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_or_epi64&expand=5236) * [x] [`_mm512_or_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_or_si512&expand=5236) - * [ ] [`_mm512_permute_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permute_pd&expand=5236) - * [ ] 
[`_mm512_permute_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permute_ps&expand=5236) - * [ ] [`_mm512_permutevar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_epi32&expand=5236) - * [ ] [`_mm512_permutevar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_pd&expand=5236) - * [ ] [`_mm512_permutevar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_ps&expand=5236) - * [ ] [`_mm512_permutex2var_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_epi32&expand=5236) - * [ ] [`_mm512_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_epi64&expand=5236) - * [ ] [`_mm512_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_pd&expand=5236) - * [ ] [`_mm512_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ps&expand=5236) - * [ ] [`_mm512_permutex_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex_epi64&expand=5236) - * [ ] [`_mm512_permutex_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex_pd&expand=5236) - * [ ] [`_mm512_permutexvar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_epi32&expand=5236) - * [ ] [`_mm512_permutexvar_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_epi64&expand=5236) - * [ ] [`_mm512_permutexvar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_pd&expand=5236) - * [ ] [`_mm512_permutexvar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ps&expand=5236) - * [ ] [`_mm512_rcp14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp14_pd&expand=5236) - * [ ] [`_mm512_rcp14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp14_ps&expand=5236) + * [x] [`_mm512_permute_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permute_pd&expand=5236) + * [x] [`_mm512_permute_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permute_ps&expand=5236) + * [x] [`_mm512_permutevar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_epi32&expand=5236) + * [x] [`_mm512_permutevar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_pd&expand=5236) + * [x] [`_mm512_permutevar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_ps&expand=5236) + * [x] [`_mm512_permutex2var_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_epi32&expand=5236) + * [x] [`_mm512_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_epi64&expand=5236) + * [x] [`_mm512_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_pd&expand=5236) + * [x] [`_mm512_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ps&expand=5236) + * [x] [`_mm512_permutex_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex_epi64&expand=5236) + * [x] 
[`_mm512_permutex_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex_pd&expand=5236) + * [x] [`_mm512_permutexvar_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_epi32&expand=5236) + * [x] [`_mm512_permutexvar_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_epi64&expand=5236) + * [x] [`_mm512_permutexvar_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_pd&expand=5236) + * [x] [`_mm512_permutexvar_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ps&expand=5236) + * [x] [`_mm512_rcp14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp14_pd&expand=5236) + * [x] [`_mm512_rcp14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp14_ps&expand=5236) * [ ] [`_mm512_reduce_add_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi32&expand=5236) * [ ] [`_mm512_reduce_add_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi64&expand=5236) * [ ] [`_mm512_reduce_add_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_pd&expand=5236) @@ -1059,8 +1059,8 @@ * [ ] [`_mm512_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f64x2&expand=5236) * [ ] [`_mm512_shuffle_i32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i32x4&expand=5236) * [ ] [`_mm512_shuffle_i64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i64x2&expand=5236) - * [ ] [`_mm512_shuffle_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_pd&expand=5236) - * [ ] [`_mm512_shuffle_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_ps&expand=5236) + * [x] [`_mm512_shuffle_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_pd&expand=5236) + * [x] [`_mm512_shuffle_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_ps&expand=5236) * [x] [`_mm512_sll_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sll_epi32&expand=5236) * [x] [`_mm512_sll_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sll_epi64&expand=5236) * [x] [`_mm512_slli_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_slli_epi32&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 1ee9fcba6dfa..37d0ca0480b6 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -5442,378 +5442,814 @@ pub unsafe fn _mm512_maskz_cvt_roundps_pd(k: __mmask8, a: __m256, sae: i32) -> _ transmute(r) } -/// Returns vector of type `__m512d` with all elements set to zero. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_pd) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vxorps))] -pub unsafe fn _mm512_setzero_pd() -> __m512d { - // All-0 is a properly initialized __m512d - mem::zeroed() -} - -/// Returns vector of type `__m512d` with all elements set to zero. 
-/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_pd) +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtt_roundps_epi32&expand=1916) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vxorps))] -pub unsafe fn _mm512_setzero_ps() -> __m512 { - // All-0 is a properly initialized __m512 - mem::zeroed() +#[cfg_attr(test, assert_instr(vcvttps2dq, sae = 8))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm512_cvtt_roundps_epi32(a: __m512, sae: i32) -> __m512i { + macro_rules! call { + ($imm4:expr) => { + vcvttps2dq( + a.as_f32x16(), + _mm512_setzero_si512().as_i32x16(), + 0b11111111_11111111, + $imm4, + ) + }; + } + let r = constify_imm4_sae!(sae, call); + transmute(r) } -/// Returns vector of type `__m512i` with all elements set to zero. +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_si512) -#[inline] -#[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vxorps))] -pub unsafe fn _mm512_setzero_si512() -> __m512i { - // All-0 is a properly initialized __m512i - mem::zeroed() -} - -/// Sets packed 32-bit integers in `dst` with the supplied values in reverse -/// order. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtt_roundps_epi32&expand=1917) #[inline] #[target_feature(enable = "avx512f")] -pub unsafe fn _mm512_setr_epi32( - e15: i32, - e14: i32, - e13: i32, - e12: i32, - e11: i32, - e10: i32, - e9: i32, - e8: i32, - e7: i32, - e6: i32, - e5: i32, - e4: i32, - e3: i32, - e2: i32, - e1: i32, - e0: i32, +#[cfg_attr(test, assert_instr(vcvttps2dq, sae = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_mask_cvtt_roundps_epi32( + src: __m512i, + k: __mmask16, + a: __m512, + sae: i32, ) -> __m512i { - let r = i32x16( - e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, - ); + macro_rules! call { + ($imm4:expr) => { + vcvttps2dq(a.as_f32x16(), src.as_i32x16(), k, $imm4) + }; + } + let r = constify_imm4_sae!(sae, call); transmute(r) } -/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_pd) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtt_roundps_epi32&expand=1918) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] +#[cfg_attr(test, assert_instr(vcvttps2dq, sae = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm512_i32gather_pd(offsets: __m256i, slice: *const u8, scale: i32) -> __m512d { - let zero = _mm512_setzero_pd().as_f64x8(); - let neg_one = -1; - let slice = slice as *const i8; - let offsets = offsets.as_i32x8(); +pub unsafe fn _mm512_maskz_cvtt_roundps_epi32(k: __mmask16, a: __m512, sae: i32) -> __m512i { macro_rules! call { - ($imm8:expr) => { - vgatherdpd(zero, slice, offsets, neg_one, $imm8) + ($imm4:expr) => { + vcvttps2dq(a.as_f32x16(), _mm512_setzero_si512().as_i32x16(), k, $imm4) }; } - let r = constify_imm8_gather!(scale, call); + let r = constify_imm4_sae!(sae, call); transmute(r) } -/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_pd) +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtt_roundps_epu32&expand=1922) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_i32gather_pd( - src: __m512d, - mask: __mmask8, - offsets: __m256i, - slice: *const u8, - scale: i32, -) -> __m512d { - let src = src.as_f64x8(); - let slice = slice as *const i8; - let offsets = offsets.as_i32x8(); +#[cfg_attr(test, assert_instr(vcvttps2udq, sae = 8))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm512_cvtt_roundps_epu32(a: __m512, sae: i32) -> __m512i { macro_rules! call { - ($imm8:expr) => { - vgatherdpd(src, slice, offsets, mask as i8, $imm8) + ($imm4:expr) => { + vcvttps2udq( + a.as_f32x16(), + _mm512_setzero_si512().as_i32x16(), + 0b11111111_11111111, + $imm4, + ) }; } - let r = constify_imm8_gather!(scale, call); + let r = constify_imm4_sae!(sae, call); transmute(r) } -/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_pd) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtt_roundps_epu32&expand=1923) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_i64gather_pd(offsets: __m512i, slice: *const u8, scale: i32) -> __m512d { - let zero = _mm512_setzero_pd().as_f64x8(); - let neg_one = -1; - let slice = slice as *const i8; - let offsets = offsets.as_i64x8(); +#[cfg_attr(test, assert_instr(vcvttps2udq, sae = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_mask_cvtt_roundps_epu32( + src: __m512i, + k: __mmask16, + a: __m512, + sae: i32, +) -> __m512i { macro_rules! call { - ($imm8:expr) => { - vgatherqpd(zero, slice, offsets, neg_one, $imm8) + ($imm4:expr) => { + vcvttps2udq(a.as_f32x16(), src.as_i32x16(), k, $imm4) }; } - let r = constify_imm8_gather!(scale, call); + let r = constify_imm4_sae!(sae, call); transmute(r) } -/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_pd) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtt_roundps_epu32&expand=1924) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_i64gather_pd( - src: __m512d, - mask: __mmask8, - offsets: __m512i, - slice: *const u8, - scale: i32, -) -> __m512d { - let src = src.as_f64x8(); - let slice = slice as *const i8; - let offsets = offsets.as_i64x8(); +#[cfg_attr(test, assert_instr(vcvttps2udq, sae = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_maskz_cvtt_roundps_epu32(k: __mmask16, a: __m512, sae: i32) -> __m512i { macro_rules! call { - ($imm8:expr) => { - vgatherqpd(src, slice, offsets, mask as i8, $imm8) + ($imm4:expr) => { + vcvttps2udq(a.as_f32x16(), _mm512_setzero_si512().as_i32x16(), k, $imm4) }; } - let r = constify_imm8_gather!(scale, call); + let r = constify_imm4_sae!(sae, call); transmute(r) } -/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. -/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_ps) +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtt_roundpd_epi32&expand=1904)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
-#[rustc_args_required_const(2)]
-pub unsafe fn _mm512_i64gather_ps(offsets: __m512i, slice: *const u8, scale: i32) -> __m256 {
-    let zero = _mm256_setzero_ps().as_f32x8();
-    let neg_one = -1;
-    let slice = slice as *const i8;
-    let offsets = offsets.as_i64x8();
+#[cfg_attr(test, assert_instr(vcvttpd2dq, sae = 8))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm512_cvtt_roundpd_epi32(a: __m512d, sae: i32) -> __m256i {
     macro_rules! call {
-        ($imm8:expr) => {
-            vgatherqps(zero, slice, offsets, neg_one, $imm8)
+        ($imm4:expr) => {
+            vcvttpd2dq(
+                a.as_f64x8(),
+                _mm256_setzero_si256().as_i32x8(),
+                0b11111111,
+                $imm4,
+            )
         };
     }
-    let r = constify_imm8_gather!(scale, call);
+    let r = constify_imm4_sae!(sae, call);
     transmute(r)
 }
 
-/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices.
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtt_roundpd_epi32&expand=1905)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
-#[rustc_args_required_const(4)]
-pub unsafe fn _mm512_mask_i64gather_ps(
-    src: __m256,
-    mask: __mmask8,
-    offsets: __m512i,
-    slice: *const u8,
-    scale: i32,
-) -> __m256 {
-    let src = src.as_f32x8();
-    let slice = slice as *const i8;
-    let offsets = offsets.as_i64x8();
+#[cfg_attr(test, assert_instr(vcvttpd2dq, sae = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_mask_cvtt_roundpd_epi32(
+    src: __m256i,
+    k: __mmask8,
+    a: __m512d,
+    sae: i32,
+) -> __m256i {
     macro_rules! call {
-        ($imm8:expr) => {
-            vgatherqps(src, slice, offsets, mask as i8, $imm8)
+        ($imm4:expr) => {
+            vcvttpd2dq(a.as_f64x8(), src.as_i32x8(), k, $imm4)
         };
     }
-    let r = constify_imm8_gather!(scale, call);
+    let r = constify_imm4_sae!(sae, call);
     transmute(r)
 }
 
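A minimal usage sketch of the truncating pd -> epi32 conversions above (not part of this patch; the helper name is hypothetical and the intrinsics are assumed to be in scope as in this module). Truncation always rounds toward zero, and _MM_FROUND_NO_EXC only suppresses floating-point exception flags; it does not change the result.

// Hypothetical illustration, assuming avx512f is available at runtime.
#[target_feature(enable = "avx512f")]
unsafe fn cvtt_roundpd_demo(a: __m512d, src: __m256i) -> __m256i {
    // All eight f64 lanes truncate toward zero, e.g. -1.75 -> -1.
    let _all_lanes = _mm512_cvtt_roundpd_epi32(a, _MM_FROUND_NO_EXC);
    // Writemask variant: lanes 0..4 are converted; lanes 4..8 have
    // their mask bits clear and are copied unchanged from `src`.
    _mm512_mask_cvtt_roundpd_epi32(src, 0b0000_1111, a, _MM_FROUND_NO_EXC)
}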
-/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices.
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtt_roundpd_epi32&expand=1906)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
+#[cfg_attr(test, assert_instr(vcvttpd2dq, sae = 8))]
 #[rustc_args_required_const(2)]
-pub unsafe fn _mm512_i32gather_ps(offsets: __m512i, slice: *const u8, scale: i32) -> __m512 {
-    let zero = _mm512_setzero_ps().as_f32x16();
-    let neg_one = -1;
-    let slice = slice as *const i8;
-    let offsets = offsets.as_i32x16();
+pub unsafe fn _mm512_maskz_cvtt_roundpd_epi32(k: __mmask8, a: __m512d, sae: i32) -> __m256i {
     macro_rules! call {
-        ($imm8:expr) => {
-            vgatherdps(zero, slice, offsets, neg_one, $imm8)
+        ($imm4:expr) => {
+            vcvttpd2dq(a.as_f64x8(), _mm256_setzero_si256().as_i32x8(), k, $imm4)
         };
     }
-    let r = constify_imm8_gather!(scale, call);
+    let r = constify_imm4_sae!(sae, call);
     transmute(r)
 }
 
-/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_ps)
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtt_roundpd_epu32&expand=1910)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
-#[rustc_args_required_const(4)]
-pub unsafe fn _mm512_mask_i32gather_ps(
-    src: __m512,
-    mask: __mmask16,
-    offsets: __m512i,
-    slice: *const u8,
-    scale: i32,
-) -> __m512 {
-    let src = src.as_f32x16();
-    let slice = slice as *const i8;
-    let offsets = offsets.as_i32x16();
+#[cfg_attr(test, assert_instr(vcvttpd2udq, sae = 8))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm512_cvtt_roundpd_epu32(a: __m512d, sae: i32) -> __m256i {
     macro_rules! call {
-        ($imm8:expr) => {
-            vgatherdps(src, slice, offsets, mask as i16, $imm8)
+        ($imm4:expr) => {
+            vcvttpd2udq(
+                a.as_f64x8(),
+                _mm256_setzero_si256().as_i32x8(),
+                0b11111111,
+                $imm4,
+            )
         };
     }
-    let r = constify_imm8_gather!(scale, call);
+    let r = constify_imm4_sae!(sae, call);
     transmute(r)
 }
 
-/// Gather 32-bit integers from memory using 32-bit indices.
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi32) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtt_roundpd_epu32&expand=1911) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm512_i32gather_epi32(offsets: __m512i, slice: *const u8, scale: i32) -> __m512i { - let zero = _mm512_setzero_si512().as_i32x16(); - let neg_one = -1; - let slice = slice as *const i8; - let offsets = offsets.as_i32x16(); +#[cfg_attr(test, assert_instr(vcvttpd2udq, sae = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_mask_cvtt_roundpd_epu32( + src: __m256i, + k: __mmask8, + a: __m512d, + sae: i32, +) -> __m256i { macro_rules! call { - ($imm8:expr) => { - vpgatherdd(zero, slice, offsets, neg_one, $imm8) + ($imm4:expr) => { + vcvttpd2udq(a.as_f64x8(), src.as_i32x8(), k, $imm4) }; } - let r = constify_imm8_gather!(scale, call); + let r = constify_imm4_sae!(sae, call); transmute(r) } -/// Gather 32-bit integers from memory using 32-bit indices. +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_epi32) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvttps_epi32&expand=1984) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_i32gather_epi32( - src: __m512i, - mask: __mmask16, - offsets: __m512i, - slice: *const u8, - scale: i32, -) -> __m512i { - let src = src.as_i32x16(); - let mask = mask as i16; - let slice = slice as *const i8; - let offsets = offsets.as_i32x16(); +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub unsafe fn _mm512_cvttps_epi32(a: __m512) -> __m512i { + transmute(vcvttps2dq( + a.as_f32x16(), + _mm512_setzero_si512().as_i32x16(), + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvttps_epi32&expand=1985) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub unsafe fn _mm512_mask_cvttps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { + transmute(vcvttps2dq( + a.as_f32x16(), + src.as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvttps_epi32&expand=1986) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvttps2dq))] +pub unsafe fn _mm512_maskz_cvttps_epi32(k: __mmask16, a: __m512) -> __m512i { + transmute(vcvttps2dq( + a.as_f32x16(), + _mm512_setzero_si512().as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvttps_epu32&expand=2002) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub unsafe fn _mm512_cvttps_epu32(a: __m512) -> __m512i { + transmute(vcvttps2udq( + a.as_f32x16(), + _mm512_setzero_si512().as_i32x16(), + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvttps_epu32&expand=2003) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub unsafe fn _mm512_mask_cvttps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { + transmute(vcvttps2udq( + a.as_f32x16(), + src.as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvttps_epu32&expand=2004) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvttps2udq))] +pub unsafe fn _mm512_maskz_cvttps_epu32(k: __mmask16, a: __m512) -> __m512i { + transmute(vcvttps2udq( + a.as_f32x16(), + _mm512_setzero_si512().as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtt_roundpd_epu32&expand=1912) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvttpd2udq, sae = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_maskz_cvtt_roundpd_epu32(k: __mmask8, a: __m512d, sae: i32) -> __m256i { macro_rules! call { - ($imm8:expr) => { - vpgatherdd(src, slice, offsets, mask, $imm8) + ($imm4:expr) => { + vcvttpd2udq(a.as_f64x8(), _mm256_setzero_si256().as_i32x8(), k, $imm4) }; } - let r = constify_imm8!(scale, call); + let r = constify_imm4_sae!(sae, call); transmute(r) } -/// Gather 64-bit integers from memory using 32-bit indices.
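All of the truncating conversions above follow one pattern: the plain form passes an all-ones mask, the `mask_` form substitutes lanes from `src`, and the `maskz_` form zeroes them. A minimal usage sketch of the `cvttps` family (a hypothetical demo function; assumes a toolchain where these `avx512f` intrinsics are available through `core::arch::x86_64`):

```rust
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn cvtt_demo() -> (__m512i, __m512i) {
    let a = _mm512_set1_ps(1.9);
    // Truncation toward zero: every lane becomes 1.
    let t = _mm512_cvttps_epi32(a);
    // Zero-masking: only the low eight lanes are converted; the high
    // eight are zeroed because their mask bits are clear.
    let tz = _mm512_maskz_cvttps_epi32(0b00000000_11111111, a);
    (t, tz)
}
```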
+/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvttpd_epi32&expand=1947) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub unsafe fn _mm512_cvttpd_epi32(a: __m512d) -> __m256i { + transmute(vcvttpd2dq( + a.as_f64x8(), + _mm256_setzero_si256().as_i32x8(), + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvttpd_epi32&expand=1948) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub unsafe fn _mm512_mask_cvttpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + transmute(vcvttpd2dq( + a.as_f64x8(), + src.as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvttpd_epi32&expand=1949) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvttpd2dq))] +pub unsafe fn _mm512_maskz_cvttpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + transmute(vcvttpd2dq( + a.as_f64x8(), + _mm256_setzero_si256().as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvttpd_epu32&expand=1965) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub unsafe fn _mm512_cvttpd_epu32(a: __m512d) -> __m256i { + transmute(vcvttpd2udq( + a.as_f64x8(), + _mm256_setzero_si256().as_i32x8(), + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvttpd_epu32&expand=1966) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub unsafe fn _mm512_mask_cvttpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + transmute(vcvttpd2udq( + a.as_f64x8(), + src.as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvttpd_epu32&expand=1967) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vcvttpd2udq))] +pub unsafe fn _mm512_maskz_cvttpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + transmute(vcvttpd2udq( + a.as_f64x8(), + _mm256_setzero_si256().as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Returns vector of type `__m512d` with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vxorps))] +pub unsafe fn _mm512_setzero_pd() -> __m512d { + // All-0 is a properly initialized __m512d + mem::zeroed() +} + +/// Returns vector of type `__m512` with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vxorps))] +pub unsafe fn _mm512_setzero_ps() -> __m512 { + // All-0 is a properly initialized __m512 + mem::zeroed() +} + +/// Returns vector of type `__m512i` with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_si512) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vxorps))] +pub unsafe fn _mm512_setzero_si512() -> __m512i { + // All-0 is a properly initialized __m512i + mem::zeroed() +} + +/// Sets packed 32-bit integers in `dst` with the supplied values in reverse +/// order. +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_setr_epi32( + e15: i32, + e14: i32, + e13: i32, + e12: i32, + e11: i32, + e10: i32, + e9: i32, + e8: i32, + e7: i32, + e6: i32, + e5: i32, + e4: i32, + e3: i32, + e2: i32, + e1: i32, + e0: i32, +) -> __m512i { + let r = i32x16( + e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, + ); + transmute(r) +} + +/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices.
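The `_round` variants add a `sae` argument that must be a compile-time constant (hence `rustc_args_required_const`); passing `_MM_FROUND_NO_EXC`, which is the `sae = 8` seen in the `assert_instr` attributes above, suppresses floating-point exceptions during the truncation. A minimal sketch, assuming the same `core::arch::x86_64` setup as before:

```rust
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn sae_demo(a: __m512d) -> __m256i {
    // Truncate eight f64 lanes to eight i32 lanes, suppressing any
    // floating-point exceptions (e.g. from out-of-range inputs).
    _mm512_cvtt_roundpd_epi32(a, _MM_FROUND_NO_EXC)
}
```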
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] #[rustc_args_required_const(2)] -pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: i32) -> __m512i { - let zero = _mm512_setzero_si512().as_i64x8(); +pub unsafe fn _mm512_i32gather_pd(offsets: __m256i, slice: *const u8, scale: i32) -> __m512d { + let zero = _mm512_setzero_pd().as_f64x8(); let neg_one = -1; let slice = slice as *const i8; let offsets = offsets.as_i32x8(); macro_rules! call { ($imm8:expr) => { - vpgatherdq(zero, slice, offsets, neg_one, $imm8) + vgatherdpd(zero, slice, offsets, neg_one, $imm8) }; } let r = constify_imm8_gather!(scale, call); transmute(r) } -/// Gather 64-bit integers from memory using 32-bit indices. +/// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] +#[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] #[rustc_args_required_const(4)] -pub unsafe fn _mm512_mask_i32gather_epi64( - src: __m512i, +pub unsafe fn _mm512_mask_i32gather_pd( + src: __m512d, mask: __mmask8, offsets: __m256i, slice: *const u8, scale: i32, -) -> __m512i { - let src = src.as_i64x8(); - let mask = mask as i8; +) -> __m512d { + let src = src.as_f64x8(); let slice = slice as *const i8; let offsets = offsets.as_i32x8(); macro_rules! call { ($imm8:expr) => { - vpgatherdq(src, slice, offsets, mask, $imm8) + vgatherdpd(src, slice, offsets, mask as i8, $imm8) }; } let r = constify_imm8_gather!(scale, call); transmute(r) } -/// Gather 64-bit integers from memory using 64-bit indices. +/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] +#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] #[rustc_args_required_const(2)] -pub unsafe fn _mm512_i64gather_epi64(offsets: __m512i, slice: *const u8, scale: i32) -> __m512i { - let zero = _mm512_setzero_si512().as_i64x8(); +pub unsafe fn _mm512_i64gather_pd(offsets: __m512i, slice: *const u8, scale: i32) -> __m512d { + let zero = _mm512_setzero_pd().as_f64x8(); let neg_one = -1; let slice = slice as *const i8; let offsets = offsets.as_i64x8(); macro_rules! call { ($imm8:expr) => { - vpgatherqq(zero, slice, offsets, neg_one, $imm8) + vgatherqpd(zero, slice, offsets, neg_one, $imm8) }; } let r = constify_imm8_gather!(scale, call); transmute(r) } -/// Gather 64-bit integers from memory using 64-bit indices. +/// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_pd) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] +#[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i64gather_pd( + src: __m512d, + mask: __mmask8, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m512d { + let src = src.as_f64x8(); + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherqpd(src, slice, offsets, mask as i8, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_i64gather_ps(offsets: __m512i, slice: *const u8, scale: i32) -> __m256 { + let zero = _mm256_setzero_ps().as_f32x8(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherqps(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i64gather_ps( + src: __m256, + mask: __mmask8, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m256 { + let src = src.as_f32x8(); + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vgatherqps(src, slice, offsets, mask as i8, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_i32gather_ps(offsets: __m512i, slice: *const u8, scale: i32) -> __m512 { + let zero = _mm512_setzero_ps().as_f32x16(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vgatherdps(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 
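All the gathers take the base pointer as `*const u8` and scale each per-lane offset by the constant `scale`, so `scale` is normally the element size in bytes; the `mask_` forms keep lanes of `src` wherever the mask bit is clear. A sketch of gathering the even elements of an array (hypothetical helper; indices and scale are illustrative):

```rust
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn gather_even(data: &[f64; 16]) -> __m512d {
    // Offsets are element indices; scale = 8 because each f64 is 8 bytes.
    let idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
    _mm512_i32gather_pd(idx, data.as_ptr() as *const u8, 8)
}
```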
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_ps) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32gather_ps( + src: __m512, + mask: __mmask16, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m512 { + let src = src.as_f32x16(); + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vgatherdps(src, slice, offsets, mask as i16, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather 32-bit integers from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_i32gather_epi32(offsets: __m512i, slice: *const u8, scale: i32) -> __m512i { + let zero = _mm512_setzero_si512().as_i32x16(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vpgatherdd(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather 32-bit integers from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_epi32) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32gather_epi32( + src: __m512i, + mask: __mmask16, + offsets: __m512i, + slice: *const u8, + scale: i32, +) -> __m512i { + let src = src.as_i32x16(); + let mask = mask as i16; + let slice = slice as *const i8; + let offsets = offsets.as_i32x16(); + macro_rules! call { + ($imm8:expr) => { + vpgatherdd(src, slice, offsets, mask, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather 64-bit integers from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: i32) -> __m512i { + let zero = _mm512_setzero_si512().as_i64x8(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i32x8(); + macro_rules! call { + ($imm8:expr) => { + vpgatherdq(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather 64-bit integers from memory using 32-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_i32gather_epi64( + src: __m512i, + mask: __mmask8, + offsets: __m256i, + slice: *const u8, + scale: i32, +) -> __m512i { + let src = src.as_i64x8(); + let mask = mask as i8; + let slice = slice as *const i8; + let offsets = offsets.as_i32x8(); + macro_rules!
call { + ($imm8:expr) => { + vpgatherdq(src, slice, offsets, mask, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather 64-bit integers from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_i64gather_epi64(offsets: __m512i, slice: *const u8, scale: i32) -> __m512i { + let zero = _mm512_setzero_si512().as_i64x8(); + let neg_one = -1; + let slice = slice as *const i8; + let offsets = offsets.as_i64x8(); + macro_rules! call { + ($imm8:expr) => { + vpgatherqq(zero, slice, offsets, neg_one, $imm8) + }; + } + let r = constify_imm8_gather!(scale, call); + transmute(r) +} + +/// Gather 64-bit integers from memory using 64-bit indices. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] #[rustc_args_required_const(4)] pub unsafe fn _mm512_mask_i64gather_epi64( src: __m512i, @@ -7365,1835 +7801,3016 @@ pub unsafe fn _mm512_maskz_srlv_epi64(k: __mmask8, a: __m512i, count: __m512i) - transmute(simd_select_bitmask(k, shf, zero)) } -/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_epi32&expand=272) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permute_ps&expand=4170) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm512_and_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_and(a.as_i32x16(), b.as_i32x16())) +#[cfg_attr(test, assert_instr(vpermilps, imm8 = 1))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm512_permute_ps(a: __m512, imm8: i32) -> __m512 { + macro_rules! call { + ($imm8:expr) => { + vpermilps(a.as_f32x16(), _mm512_set1_epi32($imm8).as_i32x16()) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) } -/// Performs element-by-element bitwise AND between packed 32-bit integer elements of v2 and v3, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_and_epi32&expand=273) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permute_ps&expand=4168) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpandd))] -pub unsafe fn _mm512_mask_and_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let and = _mm512_and_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, and, src.as_i32x16())) +#[cfg_attr(test, assert_instr(vpermilps, imm8 = 1))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_mask_permute_ps(src: __m512, k: __mmask16, a: __m512, imm8: i32) -> __m512 { + macro_rules! call { + ($imm8:expr) => { + vpermilps(a.as_f32x16(), _mm512_set1_epi32($imm8).as_i32x16()) + }; + } + let permute = constify_imm8_sae!(imm8, call); + transmute(simd_select_bitmask(k, permute, src.as_f32x16())) } -/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_and_epi32&expand=274) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permute_ps&expand=4169) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpandd))] -pub unsafe fn _mm512_maskz_and_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let and = _mm512_and_epi32(a, b).as_i32x16(); - let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, and, zero)) +#[cfg_attr(test, assert_instr(vpermilps, imm8 = 1))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_maskz_permute_ps(k: __mmask16, a: __m512, imm8: i32) -> __m512 { + macro_rules! call { + ($imm8:expr) => { + vpermilps(a.as_f32x16(), _mm512_set1_epi32($imm8).as_i32x16()) + }; + } + let permute = constify_imm8_sae!(imm8, call); + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, permute, zero)) } -/// Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in a and b, and store the results in dst. +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_epi64&expand=279) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permute_pd&expand=4161) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm512_and_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_and(a.as_i64x8(), b.as_i64x8())) +#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 2))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm512_permute_pd(a: __m512d, imm8: i32) -> __m512d { + macro_rules! 
call { + ($imm8:expr) => { + vpermilpd(a.as_f64x8(), _mm512_set1_epi64($imm8).as_i64x8()) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) } -/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_and_epi64&expand=280) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permute_pd&expand=4159) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm512_mask_and_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let and = _mm512_and_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, and, src.as_i64x8())) +#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 2))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_mask_permute_pd(src: __m512d, k: __mmask8, a: __m512d, imm8: i32) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vpermilpd(a.as_f64x8(), _mm512_set1_epi64($imm8).as_i64x8()) + }; + } + let permute = constify_imm8_sae!(imm8, call); + transmute(simd_select_bitmask(k, permute, src.as_f64x8())) } -/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_and_Epi32&expand=274) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permute_pd&expand=4160) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm512_maskz_and_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let and = _mm512_and_epi64(a, b).as_i64x8(); - let zero = _mm512_setzero_si512().as_i64x8(); - transmute(simd_select_bitmask(k, and, zero)) +#[cfg_attr(test, assert_instr(vpermilpd, imm8 = 2))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_maskz_permute_pd(k: __mmask8, a: __m512d, imm8: i32) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vpermilpd(a.as_f64x8(), _mm512_set1_epi64($imm8).as_i64x8()) + }; + } + let permute = constify_imm8_sae!(imm8, call); + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, permute, zero)) } -/// Compute the bitwise AND of 512 bits (representing integer data) in a and b, and store the result in dst. +/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst. 
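Per Intel's encoding, `imm8` packs four 2-bit selectors that are applied identically in every 128-bit lane. A hedged sketch of the documented semantics (the control value and helper name are illustrative):

```rust
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn swap_halves(a: __m512) -> __m512 {
    // Per Intel's encoding the fields select elements 2, 3, 0, 1 (low field
    // first), so each 128-bit lane [a0, a1, a2, a3] becomes [a2, a3, a0, a1].
    _mm512_permute_ps(a, 0b01_00_11_10)
}
```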
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_si512&expand=302) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutex_epi64&expand=4208) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm512_and_si512(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_and(a.as_i32x16(), b.as_i32x16())) +#[cfg_attr(test, assert_instr(vbroadcast, imm8 = 0b11111111))] +//should be vpermq, but generates vpermpd; it generates vpermq with a mask. Changed to vbroadcast because of CI on Windows +#[rustc_args_required_const(1)] +pub unsafe fn _mm512_permutex_epi64(a: __m512i, imm8: i32) -> __m512i { + macro_rules! call { + ($imm8:expr) => { + vpermq(a.as_i64x8(), _mm512_set1_epi64($imm8).as_i64x8()) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) } -/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst. +/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_or_epi32&expand=4042) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutex_epi64&expand=4206) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vporq))] -pub unsafe fn _mm512_or_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_or(a.as_i32x16(), b.as_i32x16())) +#[cfg_attr(test, assert_instr(vpbroadcast, imm8 = 0b11111111))] //should be vpermq. Changed to vpbroadcast because of CI on Windows +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_mask_permutex_epi64( + src: __m512i, + k: __mmask8, + a: __m512i, + imm8: i32, +) -> __m512i { + macro_rules! call { + ($imm8:expr) => { + vpermq(a.as_i64x8(), _mm512_set1_epi64($imm8).as_i64x8()) + }; + } + let permute = constify_imm8_sae!(imm8, call); + transmute(simd_select_bitmask(k, permute, src.as_i64x8())) } -/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_or_epi32&expand=4040) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutex_epi64&expand=4207) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpord))] -pub unsafe fn _mm512_mask_or_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let or = _mm512_or_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, or, src.as_i32x16())) +#[cfg_attr(test, assert_instr(vpbroadcast, imm8 = 0b11111111))] //should be vpermq. Changed to vpbroadcast because of CI on Windows +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_maskz_permutex_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m512i { + macro_rules!
call { + ($imm8:expr) => { + vpermq(a.as_i64x8(), _mm512_set1_epi64($imm8).as_i64x8()) + }; + } + let permute = constify_imm8_sae!(imm8, call); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, permute, zero)) } -/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_or_epi32&expand=4041) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutex_pd&expand=4214) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpord))] -pub unsafe fn _mm512_maskz_or_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let or = _mm512_or_epi32(a, b).as_i32x16(); - let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, or, zero)) +#[cfg_attr(test, assert_instr(vbroadcast, imm8 = 0b11111111))] //should be vpermpd. Changed to vbroadcast because of CI on Windows +#[rustc_args_required_const(1)] +pub unsafe fn _mm512_permutex_pd(a: __m512d, imm8: i32) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vpermpd(a.as_f64x8(), _mm512_set1_epi64($imm8).as_i64x8()) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) } -/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the resut in dst. +/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_or_epi64&expand=4051) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutex_pd&expand=4212) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vporq))] -pub unsafe fn _mm512_or_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_or(a.as_i64x8(), b.as_i64x8())) +#[cfg_attr(test, assert_instr(vbroadcast, imm8 = 0b11111111))] //should be vpermpd. Changed to vbroadcast because of CI on Windows +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_mask_permutex_pd(src: __m512d, k: __mmask8, a: __m512d, imm8: i32) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vpermpd(a.as_f64x8(), _mm512_set1_epi64($imm8).as_i64x8()) + }; + } + let permute = constify_imm8_sae!(imm8, call); + transmute(simd_select_bitmask(k, permute, src.as_f64x8())) } -/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_or_epi64&expand=4049) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutex_pd&expand=4213) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vporq))] -pub unsafe fn _mm512_mask_or_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let or = _mm512_or_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, or, src.as_i64x8())) +#[cfg_attr(test, assert_instr(vbroadcast, imm8 = 0b11111111))] //should be vpermpd. Changed to vbroadcast because of CI on Windows +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_maskz_permutex_pd(k: __mmask8, a: __m512d, imm8: i32) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vpermpd(a.as_f64x8(), _mm512_set1_epi64($imm8).as_i64x8()) + }; + } + let permute = constify_imm8_sae!(imm8, call); + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, permute, zero)) } -/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_permutexvar_epi32, and it is recommended that you use that intrinsic name. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_or_epi64&expand=4050) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutevar_epi32&expand=4182) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vporq))] -pub unsafe fn _mm512_maskz_or_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let or = _mm512_or_epi64(a, b).as_i64x8(); - let zero = _mm512_setzero_si512().as_i64x8(); - transmute(simd_select_bitmask(k, or, zero)) +#[cfg_attr(test, assert_instr(vperm))] //should be vpermd, but generates vpermps; it generates vpermd with a mask +pub unsafe fn _mm512_permutevar_epi32(idx: __m512i, a: __m512i) -> __m512i { + transmute(vpermd(a.as_i32x16(), idx.as_i32x16())) } -/// Compute the bitwise OR of 512 bits (representing integer data) in a and b, and store the result in dst. +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_mask_permutexvar_epi32, and it is recommended that you use that intrinsic name.
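`permutex` works like `permute` but selects whole 64-bit elements within each 256-bit half. Per Intel's encoding, a sketch that reverses the four doubles of each half (control value and helper name are illustrative):

```rust
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn reverse_halves(a: __m512d) -> __m512d {
    // 0b00_01_10_11 selects elements 3, 2, 1, 0 within each 256-bit lane.
    _mm512_permutex_pd(a, 0b00_01_10_11)
}
```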
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_or_si512&expand=4072) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutevar_epi32&expand=4181) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vporq))] -pub unsafe fn _mm512_or_si512(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_or(a.as_i32x16(), b.as_i32x16())) +#[cfg_attr(test, assert_instr(vpermd))] +pub unsafe fn _mm512_mask_permutevar_epi32( + src: __m512i, + k: __mmask16, + idx: __m512i, + a: __m512i, +) -> __m512i { + let permute = _mm512_permutevar_epi32(idx, a).as_i32x16(); + transmute(simd_select_bitmask(k, permute, src.as_i32x16())) } -/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst. +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_xor_epi32&expand=6142) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutevar_ps&expand=4200) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpxorq))] -pub unsafe fn _mm512_xor_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_xor(a.as_i32x16(), b.as_i32x16())) +#[cfg_attr(test, assert_instr(vpermilps))] +pub unsafe fn _mm512_permutevar_ps(a: __m512, b: __m512i) -> __m512 { + transmute(vpermilps(a.as_f32x16(), b.as_i32x16())) } -/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_xor_epi32&expand=6140) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutevar_ps&expand=4198) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpxord))] -pub unsafe fn _mm512_mask_xor_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let xor = _mm512_xor_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, xor, src.as_i32x16())) +#[cfg_attr(test, assert_instr(vpermilps))] +pub unsafe fn _mm512_mask_permutevar_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512i, +) -> __m512 { + let permute = _mm512_permutevar_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, permute, src.as_f32x16())) } -/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_xor_epi32&expand=6141) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutevar_ps&expand=4199) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpxord))] -pub unsafe fn _mm512_maskz_xor_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let xor = _mm512_xor_epi32(a, b).as_i32x16(); - let zero = _mm512_setzero_si512().as_i32x16(); - transmute(simd_select_bitmask(k, xor, zero)) +#[cfg_attr(test, assert_instr(vpermilps))] +pub unsafe fn _mm512_maskz_permutevar_ps(k: __mmask16, a: __m512, b: __m512i) -> __m512 { + let permute = _mm512_permutevar_ps(a, b).as_f32x16(); + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, permute, zero)) } -/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst. +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_xor_epi64&expand=6151) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutevar_pd&expand=4191) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpxorq))] -pub unsafe fn _mm512_xor_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_xor(a.as_i64x8(), b.as_i64x8())) +#[cfg_attr(test, assert_instr(vpermilpd))] +pub unsafe fn _mm512_permutevar_pd(a: __m512d, b: __m512i) -> __m512d { + transmute(vpermilpd(a.as_f64x8(), b.as_i64x8())) } -/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_xor_epi64&expand=6149) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutevar_pd&expand=4189) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpxorq))] -pub unsafe fn _mm512_mask_xor_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let xor = _mm512_xor_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, xor, src.as_i64x8())) +#[cfg_attr(test, assert_instr(vpermilpd))] +pub unsafe fn _mm512_mask_permutevar_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512i, +) -> __m512d { + let permute = _mm512_permutevar_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, permute, src.as_f64x8())) } -/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
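The `permutevar` forms take the control as a vector instead of an immediate, but still shuffle only within each 128-bit lane, using the low two bits of every 32-bit control element. A sketch that rotates each lane left by one element (control vector is illustrative):

```rust
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn rotate_lanes(a: __m512) -> __m512 {
    // Per-element controls 1, 2, 3, 0 in every 128-bit lane.
    let b = _mm512_set_epi32(0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1);
    _mm512_permutevar_ps(a, b)
}
```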
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_xor_epi64&expand=6150) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutevar_pd&expand=4190) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpxorq))] -pub unsafe fn _mm512_maskz_xor_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let xor = _mm512_xor_epi64(a, b).as_i64x8(); - let zero = _mm512_setzero_si512().as_i64x8(); - transmute(simd_select_bitmask(k, xor, zero)) +#[cfg_attr(test, assert_instr(vpermilpd))] +pub unsafe fn _mm512_maskz_permutevar_pd(k: __mmask8, a: __m512d, b: __m512i) -> __m512d { + let permute = _mm512_permutevar_pd(a, b).as_f64x8(); + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, permute, zero)) } -/// Compute the bitwise XOR of 512 bits (representing integer data) in a and b, and store the result in dst. +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_xor_si512&expand=6172) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutexvar_epi32&expand=4301) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpxorq))] -pub unsafe fn _mm512_xor_si512(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_xor(a.as_i32x16(), b.as_i32x16())) +#[cfg_attr(test, assert_instr(vperm))] //should be vpermd, but generates vpermps; it generates vpermd with a mask +pub unsafe fn _mm512_permutexvar_epi32(idx: __m512i, a: __m512i) -> __m512i { + transmute(vpermd(a.as_i32x16(), idx.as_i32x16())) } -/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k. +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kand_mask16&expand=3212) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutexvar_epi32&expand=4299) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw -pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(kandw(a, b)) +#[cfg_attr(test, assert_instr(vpermd))] +pub unsafe fn _mm512_mask_permutexvar_epi32( + src: __m512i, + k: __mmask16, + idx: __m512i, + a: __m512i, +) -> __m512i { + let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16(); + transmute(simd_select_bitmask(k, permute, src.as_i32x16())) } -/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k. +/// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kand&expand=3210) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutexvar_epi32&expand=4300) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw -pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(kandw(a, b)) +#[cfg_attr(test, assert_instr(vpermd))] +pub unsafe fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m512i) -> __m512i { + let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, permute, zero)) } -/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k. +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kor_mask16&expand=3239) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutexvar_epi64&expand=4307) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw -pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(korw(a, b)) +#[cfg_attr(test, assert_instr(vperm))] //should be vpermq, but generates vpermpd; it generates vpermq with a mask +pub unsafe fn _mm512_permutexvar_epi64(idx: __m512i, a: __m512i) -> __m512i { + transmute(vpermq(a.as_i64x8(), idx.as_i64x8())) } -/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k. +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kor&expand=3237) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutexvar_epi64&expand=4305) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw -pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(korw(a, b)) +#[cfg_attr(test, assert_instr(vpermq))] +pub unsafe fn _mm512_mask_permutexvar_epi64( + src: __m512i, + k: __mmask8, + idx: __m512i, + a: __m512i, +) -> __m512i { + let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8(); + transmute(simd_select_bitmask(k, permute, src.as_i64x8())) } -/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k. +/// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
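Unlike `permutevar`, the `permutexvar` forms index across the full 512-bit register, so arbitrary cross-lane rearrangements are possible. A sketch that reverses all sixteen 32-bit elements (hypothetical helper):

```rust
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn reverse_epi32(a: __m512i) -> __m512i {
    // idx[i] = 15 - i, so element i of the result is element 15 - i of a.
    let idx = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    _mm512_permutexvar_epi32(idx, a)
}
```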
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kxor_mask16&expand=3291) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutexvar_epi64&expand=4306) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw -pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(kxorw(a, b)) +#[cfg_attr(test, assert_instr(vpermq))] +pub unsafe fn _mm512_maskz_permutexvar_epi64(k: __mmask8, idx: __m512i, a: __m512i) -> __m512i { + let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8(); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, permute, zero)) } -/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k. +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kxor&expand=3289) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutexvar_ps&expand=4328) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw -pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 { - transmute(kxorw(a, b)) +#[cfg_attr(test, assert_instr(vpermps))] +pub unsafe fn _mm512_permutexvar_ps(idx: __m512i, a: __m512) -> __m512 { + transmute(vpermps(a.as_f32x16(), idx.as_i32x16())) } -/// Sets packed 32-bit integers in `dst` with the supplied values. +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). /// -/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_ps) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutexvar_ps&expand=4326) #[inline] #[target_feature(enable = "avx512f")] -pub unsafe fn _mm512_set_ps( - e0: f32, - e1: f32, - e2: f32, - e3: f32, - e4: f32, - e5: f32, - e6: f32, - e7: f32, - e8: f32, - e9: f32, - e10: f32, - e11: f32, - e12: f32, - e13: f32, - e14: f32, - e15: f32, +#[cfg_attr(test, assert_instr(vpermps))] +pub unsafe fn _mm512_mask_permutexvar_ps( + src: __m512, + k: __mmask16, + idx: __m512i, + a: __m512, ) -> __m512 { - _mm512_setr_ps( - e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, - ) + let permute = _mm512_permutexvar_ps(idx, a).as_f32x16(); + transmute(simd_select_bitmask(k, permute, src.as_f32x16())) } -/// Sets packed 32-bit integers in `dst` with the supplied values in -/// reverse order. +/// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutexvar_ps&expand=4327)
 #[inline]
 #[target_feature(enable = "avx512f")]
-pub unsafe fn _mm512_setr_ps(
-    e0: f32,
-    e1: f32,
-    e2: f32,
-    e3: f32,
-    e4: f32,
-    e5: f32,
-    e6: f32,
-    e7: f32,
-    e8: f32,
-    e9: f32,
-    e10: f32,
-    e11: f32,
-    e12: f32,
-    e13: f32,
-    e14: f32,
-    e15: f32,
-) -> __m512 {
-    let r = f32x16::new(
-        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
-    );
-    transmute(r)
+#[cfg_attr(test, assert_instr(vpermps))]
+pub unsafe fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512) -> __m512 {
+    let permute = _mm512_permutexvar_ps(idx, a).as_f32x16();
+    let zero = _mm512_setzero_ps().as_f32x16();
+    transmute(simd_select_bitmask(k, permute, zero))
 }
-/// Broadcast 64-bit float `a` to all elements of `dst`.
+/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutexvar_pd&expand=4322)
 #[inline]
 #[target_feature(enable = "avx512f")]
-pub unsafe fn _mm512_set1_pd(a: f64) -> __m512d {
-    transmute(f64x8::splat(a))
+#[cfg_attr(test, assert_instr(vpermpd))]
+pub unsafe fn _mm512_permutexvar_pd(idx: __m512i, a: __m512d) -> __m512d {
+    transmute(vpermpd(a.as_f64x8(), idx.as_i64x8()))
 }
-/// Broadcast 32-bit float `a` to all elements of `dst`.
+/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutexvar_pd&expand=4320)
 #[inline]
 #[target_feature(enable = "avx512f")]
-pub unsafe fn _mm512_set1_ps(a: f32) -> __m512 {
-    transmute(f32x16::splat(a))
+#[cfg_attr(test, assert_instr(vpermpd))]
+pub unsafe fn _mm512_mask_permutexvar_pd(
+    src: __m512d,
+    k: __mmask8,
+    idx: __m512i,
+    a: __m512d,
+) -> __m512d {
+    let permute = _mm512_permutexvar_pd(idx, a).as_f64x8();
+    transmute(simd_select_bitmask(k, permute, src.as_f64x8()))
 }
-/// Sets packed 32-bit integers in `dst` with the supplied values.
+/// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
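// Editor's aside (illustration only, not part of the patch; assumes the
// _mm512_set_epi64 helper): a masked f64 lane rotation built from
// _mm512_mask_permutexvar_pd; lanes whose mask bit is clear keep src.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn rotate_left_merge(src: __m512d, a: __m512d) -> __m512d {
    // Lane i reads a[(i + 1) % 8].
    let idx = _mm512_set_epi64(0, 7, 6, 5, 4, 3, 2, 1);
    // Only lanes 4..=7 receive rotated values; lanes 0..=3 keep src.
    _mm512_mask_permutexvar_pd(src, 0b1111_0000, idx, a)
}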
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutexvar_pd&expand=4321)
 #[inline]
 #[target_feature(enable = "avx512f")]
-pub unsafe fn _mm512_set_epi32(
-    e15: i32,
-    e14: i32,
-    e13: i32,
-    e12: i32,
-    e11: i32,
-    e10: i32,
-    e9: i32,
-    e8: i32,
-    e7: i32,
-    e6: i32,
-    e5: i32,
-    e4: i32,
-    e3: i32,
-    e2: i32,
-    e1: i32,
-    e0: i32,
-) -> __m512i {
-    _mm512_setr_epi32(
-        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
-    )
+#[cfg_attr(test, assert_instr(vpermpd))]
+pub unsafe fn _mm512_maskz_permutexvar_pd(k: __mmask8, idx: __m512i, a: __m512d) -> __m512d {
+    let permute = _mm512_permutexvar_pd(idx, a).as_f64x8();
+    let zero = _mm512_setzero_pd().as_f64x8();
+    transmute(simd_select_bitmask(k, permute, zero))
 }
-/// Broadcast 32-bit integer `a` to all elements of `dst`.
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutex2var_epi32&expand=4238)
 #[inline]
 #[target_feature(enable = "avx512f")]
-pub unsafe fn _mm512_set1_epi32(a: i32) -> __m512i {
-    transmute(i32x16::splat(a))
+#[cfg_attr(test, assert_instr(vperm))] // vpermi2d or vpermt2d
+pub unsafe fn _mm512_permutex2var_epi32(a: __m512i, idx: __m512i, b: __m512i) -> __m512i {
+    transmute(vpermi2d(a.as_i32x16(), idx.as_i32x16(), b.as_i32x16()))
 }
-/// Broadcast 64-bit integer `a` to all elements of `dst`.
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutex2var_epi32&expand=4235)
 #[inline]
 #[target_feature(enable = "avx512f")]
-pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i {
-    transmute(i64x8::splat(a))
+#[cfg_attr(test, assert_instr(vpermt2d))]
+pub unsafe fn _mm512_mask_permutex2var_epi32(
+    a: __m512i,
+    k: __mmask16,
+    idx: __m512i,
+    b: __m512i,
+) -> __m512i {
+    let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16();
+    transmute(simd_select_bitmask(k, permute, a.as_i32x16()))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in a mask vector.
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
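// Editor's aside (illustration only, not part of the patch; assumes the
// _mm512_set_epi32 helper): in the permutex2var family, bits 3:0 of each
// 32-bit index select an element and bit 4 selects the source (0 => a,
// 1 => b), so indices 0..=15 read from a and 16..=31 read from b.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn interleave_low(a: __m512i, b: __m512i) -> __m512i {
    // Produces a[0], b[0], a[1], b[1], ..., a[7], b[7].
    let idx = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    _mm512_permutex2var_epi32(a, idx, b)
}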
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutex2var_epi32&expand=4237)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 {
-    _mm512_cmp_ps_mask(a, b, _CMP_LT_OS)
+#[cfg_attr(test, assert_instr(vperm))] // vpermi2d or vpermt2d
+pub unsafe fn _mm512_maskz_permutex2var_epi32(
+    k: __mmask16,
+    a: __m512i,
+    idx: __m512i,
+    b: __m512i,
+) -> __m512i {
+    let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, permute, zero))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2_permutex2var_epi32&expand=4236)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_mask_cmplt_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
-    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LT_OS)
+#[cfg_attr(test, assert_instr(vpermi2d))]
+pub unsafe fn _mm512_mask2_permutex2var_epi32(
+    a: __m512i,
+    idx: __m512i,
+    k: __mmask16,
+    b: __m512i,
+) -> __m512i {
+    let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16();
+    transmute(simd_select_bitmask(k, permute, idx.as_i32x16()))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector.
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnlt_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutex2var_epi64&expand=4250)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_cmpnlt_ps_mask(a: __m512, b: __m512) -> __mmask16 {
-    _mm512_cmp_ps_mask(a, b, _CMP_NLT_US)
+#[cfg_attr(test, assert_instr(vperm))] // vpermi2q or vpermt2q
+pub unsafe fn _mm512_permutex2var_epi64(a: __m512i, idx: __m512i, b: __m512i) -> __m512i {
+    transmute(vpermi2q(a.as_i64x8(), idx.as_i64x8(), b.as_i64x8()))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
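// Editor's aside (illustration only, not part of the patch; assumptions as
// above): unlike the mask/maskz forms, the mask2 form keeps the *index*
// vector's contents in lanes whose mask bit is clear, which is why the body
// above selects between the permuted lanes and idx rather than a or zero.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn mask2_demo(a: __m512i, b: __m512i) -> __m512i {
    let idx = _mm512_set1_epi32(16); // every lane would read b[0]
    // Lanes 0..=7 (mask bits set) become b[0]; lanes 8..=15 keep the literal 16.
    _mm512_mask2_permutex2var_epi32(a, idx, 0x00FF, b)
}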
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnlt_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutex2var_epi64&expand=4247)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_mask_cmpnlt_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
-    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLT_US)
+#[cfg_attr(test, assert_instr(vpermt2q))]
+pub unsafe fn _mm512_mask_permutex2var_epi64(
+    a: __m512i,
+    k: __mmask8,
+    idx: __m512i,
+    b: __m512i,
+) -> __m512i {
+    let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8();
+    transmute(simd_select_bitmask(k, permute, a.as_i64x8()))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector.
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutex2var_epi64&expand=4249)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 {
-    _mm512_cmp_ps_mask(a, b, _CMP_LE_OS)
+#[cfg_attr(test, assert_instr(vperm))] // vpermi2q or vpermt2q
+pub unsafe fn _mm512_maskz_permutex2var_epi64(
+    k: __mmask8,
+    a: __m512i,
+    idx: __m512i,
+    b: __m512i,
+) -> __m512i {
+    let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8();
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, permute, zero))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2_permutex2var_epi64&expand=4248)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_mask_cmple_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
-    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LE_OS)
+#[cfg_attr(test, assert_instr(vpermi2q))]
+pub unsafe fn _mm512_mask2_permutex2var_epi64(
+    a: __m512i,
+    idx: __m512i,
+    k: __mmask8,
+    b: __m512i,
+) -> __m512i {
+    let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8();
+    transmute(simd_select_bitmask(k, permute, idx.as_i64x8()))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector.
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnle_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutex2var_ps&expand=4286)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_cmpnle_ps_mask(a: __m512, b: __m512) -> __mmask16 {
-    _mm512_cmp_ps_mask(a, b, _CMP_NLE_US)
+#[cfg_attr(test, assert_instr(vperm))] // vpermi2ps or vpermt2ps
+pub unsafe fn _mm512_permutex2var_ps(a: __m512, idx: __m512i, b: __m512) -> __m512 {
+    transmute(vpermi2ps(a.as_f32x16(), idx.as_i32x16(), b.as_f32x16()))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnle_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutex2var_ps&expand=4283)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_mask_cmpnle_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
-    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLE_US)
+#[cfg_attr(test, assert_instr(vpermt2ps))]
+pub unsafe fn _mm512_mask_permutex2var_ps(
+    a: __m512,
+    k: __mmask16,
+    idx: __m512i,
+    b: __m512,
+) -> __m512 {
+    let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16();
+    transmute(simd_select_bitmask(k, permute, a.as_f32x16()))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in a mask vector.
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
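// Editor's aside (illustration only, not part of the patch; assumptions as
// above): permutex2var over f32 data is effectively a 32-entry table lookup
// spread across two registers, with bit 4 of each index choosing the half.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn lookup32(lo: __m512, hi: __m512, idx: __m512i) -> __m512 {
    // Each 32-bit index selects one of the 32 f32 values held in (lo, hi).
    _mm512_permutex2var_ps(lo, idx, hi)
}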
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutex2var_ps&expand=4285)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_cmpeq_ps_mask(a: __m512, b: __m512) -> __mmask16 {
-    _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ)
+#[cfg_attr(test, assert_instr(vperm))] // vpermi2ps or vpermt2ps
+pub unsafe fn _mm512_maskz_permutex2var_ps(
+    k: __mmask16,
+    a: __m512,
+    idx: __m512i,
+    b: __m512,
+) -> __m512 {
+    let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16();
+    let zero = _mm512_setzero_ps().as_f32x16();
+    transmute(simd_select_bitmask(k, permute, zero))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2_permutex2var_ps&expand=4284)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_mask_cmpeq_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
-    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_EQ_OQ)
+#[cfg_attr(test, assert_instr(vperm))] // should be vpermi2ps, but LLVM emits vpermt2ps
+pub unsafe fn _mm512_mask2_permutex2var_ps(
+    a: __m512,
+    idx: __m512i,
+    k: __mmask16,
+    b: __m512,
+) -> __m512 {
+    let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16();
+    // Per the doc comment (and Intel's pseudocode), unselected lanes take the
+    // index bits, not zero.
+    let idx = transmute::<_, f32x16>(idx);
+    transmute(simd_select_bitmask(k, permute, idx))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for inequality, and store the results in a mask vector.
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutex2var_pd&expand=4274)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 {
-    _mm512_cmp_ps_mask(a, b, _CMP_NEQ_UQ)
+#[cfg_attr(test, assert_instr(vperm))] // vpermi2pd or vpermt2pd
+pub unsafe fn _mm512_permutex2var_pd(a: __m512d, idx: __m512i, b: __m512d) -> __m512d {
+    transmute(vpermi2pd(a.as_f64x8(), idx.as_i64x8(), b.as_f64x8()))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for inequality, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
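// Editor's aside (illustration only, not part of the patch; assumptions as
// above, and relies on the idx-selecting fix applied to the mask2 body just
// above): for the floating-point mask2 variants, unselected lanes receive the
// raw index bits reinterpreted as floats, per Intel's `dst := idx` pseudocode.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn mask2_ps_demo(a: __m512, b: __m512) -> __m512 {
    let idx = _mm512_set1_epi32(1); // every selected lane reads a[1]
    // Lanes 8..=15 become a[1]; lanes 0..=7 hold f32::from_bits(1).
    _mm512_mask2_permutex2var_ps(a, idx, 0xFF00, b)
}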
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_ps_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutex2var_pd&expand=4271)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_mask_cmpneq_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
-    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_UQ)
+#[cfg_attr(test, assert_instr(vpermt2pd))]
+pub unsafe fn _mm512_mask_permutex2var_pd(
+    a: __m512d,
+    k: __mmask8,
+    idx: __m512i,
+    b: __m512d,
+) -> __m512d {
+    let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8();
+    transmute(simd_select_bitmask(k, permute, a.as_f64x8()))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op.
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_ps_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutex2var_pd&expand=4273)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(2)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0))]
-pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512, op: i32) -> __mmask16 {
-    let neg_one = -1;
-    macro_rules! call {
-        ($imm5:expr) => {
-            vcmpps(
-                a.as_f32x16(),
-                b.as_f32x16(),
-                $imm5,
-                neg_one,
-                _MM_FROUND_CUR_DIRECTION,
-            )
-        };
-    }
-    let r = constify_imm5!(op, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(vperm))] // vpermi2pd or vpermt2pd
+pub unsafe fn _mm512_maskz_permutex2var_pd(
+    k: __mmask8,
+    a: __m512d,
+    idx: __m512i,
+    b: __m512d,
+) -> __m512d {
+    let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8();
+    let zero = _mm512_setzero_pd().as_f64x8();
+    transmute(simd_select_bitmask(k, permute, zero))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op,
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_ps_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2_permutex2var_pd&expand=4272)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(3)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0))]
-pub unsafe fn _mm512_mask_cmp_ps_mask(m: __mmask16, a: __m512, b: __m512, op: i32) -> __mmask16 {
-    macro_rules! call {
-        ($imm5:expr) => {
-            vcmpps(
-                a.as_f32x16(),
-                b.as_f32x16(),
-                $imm5,
-                m as i16,
-                _MM_FROUND_CUR_DIRECTION,
-            )
-        };
-    }
-    let r = constify_imm5!(op, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(vperm))] // should be vpermi2pd, but LLVM emits vpermt2pd
+pub unsafe fn _mm512_mask2_permutex2var_pd(
+    a: __m512d,
+    idx: __m512i,
+    k: __mmask8,
+    b: __m512d,
+) -> __m512d {
+    let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8();
+    // Per the doc comment (and Intel's pseudocode), unselected lanes take the
+    // index bits, not zero.
+    let idx = transmute::<_, f64x8>(idx);
+    transmute(simd_select_bitmask(k, permute, idx))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op.
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_ps_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_ps&expand=5203)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(2, 3)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
-pub unsafe fn _mm512_cmp_round_ps_mask(a: __m512, b: __m512, op: i32, sae: i32) -> __mmask16 {
-    let neg_one = -1;
-    macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, neg_one, $imm4)
+#[cfg_attr(test, assert_instr(vshufps, imm8 = 0))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm512_shuffle_ps(a: __m512, b: __m512, imm8: i32) -> __m512 {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle4 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $e:expr,
+            $f:expr,
+            $g:expr,
+            $h:expr,
+            $i:expr,
+            $j:expr,
+            $k:expr,
+            $l:expr,
+            $m:expr,
+            $n:expr,
+            $o:expr,
+            $p:expr
+        ) => {
+            simd_shuffle16(
+                a,
+                b,
+                [
+                    $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
+                ],
+            );
        };
    }
-    let r = constify_imm5_sae!(op, sae, call);
-    transmute(r)
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
+            match (imm8 >> 6) & 0x3 {
+                0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28),
+                1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29),
+                2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30),
+                _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31),
+            }
+        };
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
+            match (imm8 >> 4) & 0x3 {
+                0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28),
+                1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29),
+                2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30),
+                _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31),
+            }
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $e:expr, $i: expr, $m: expr) => {
+            match (imm8 >> 2) & 0x3 {
+                0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12),
+                1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13),
+                2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14),
+                _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15),
+            }
+        };
+    }
+    match imm8 & 0x3 {
+        0 => shuffle1!(0, 4, 8, 12),
+        1 => shuffle1!(1, 5, 9, 13),
+        2 => shuffle1!(2, 6, 10, 14),
+        _ => shuffle1!(3, 7, 11, 15),
+    }
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op,
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_ps_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_ps&expand=5201)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(3, 4)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
-pub unsafe fn _mm512_mask_cmp_round_ps_mask(
-    m: __mmask16,
+#[cfg_attr(test, assert_instr(vshufps, imm8 = 0))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_mask_shuffle_ps(
+    src: __m512,
+    k: __mmask16,
     a: __m512,
     b: __m512,
-    op: i32,
-    sae: i32,
-) -> __mmask16 {
-    macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, m as i16, $imm4)
+    imm8: i32,
+) -> __m512 {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle4 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $e:expr,
+            $f:expr,
+            $g:expr,
+            $h:expr,
+            $i:expr,
+            $j:expr,
+            $k:expr,
+            $l:expr,
+            $m:expr,
+            $n:expr,
+            $o:expr,
+            $p:expr
+        ) => {
+            simd_shuffle16(
+                a,
+                b,
+                [
+                    $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
+                ],
+            );
        };
    }
-    let r = constify_imm5_sae!(op, sae, call);
-    transmute(r)
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
+            match (imm8 >> 6) & 0x3 {
+                0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28),
+                1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29),
+                2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30),
+                _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31),
+            }
+        };
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
+            match (imm8 >> 4) & 0x3 {
+                0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28),
+                1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29),
+                2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30),
+                _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31),
+            }
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $e:expr, $i: expr, $m: expr) => {
+            match (imm8 >> 2) & 0x3 {
+                0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12),
+                1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13),
+                2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14),
+                _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15),
+            }
+        };
+    }
+    let shuffle = match imm8 & 0x3 {
+        0 => shuffle1!(0, 4, 8, 12),
+        1 => shuffle1!(1, 5, 9, 13),
+        2 => shuffle1!(2, 6, 10, 14),
+        _ => shuffle1!(3, 7, 11, 15),
+    };
+
+    transmute(simd_select_bitmask(k, shuffle, src.as_f32x16()))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector.
+/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_ps_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_ps&expand=5202)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp, op = 0))]
-pub unsafe fn _mm512_cmpord_ps_mask(a: __m512, b: __m512) -> __mmask16 {
-    _mm512_cmp_ps_mask(a, b, _CMP_ORD_Q)
+#[cfg_attr(test, assert_instr(vshufps, imm8 = 0))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_maskz_shuffle_ps(k: __mmask16, a: __m512, b: __m512, imm8: i32) -> __m512 {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle4 {
+        (
+            $a:expr,
+            $b:expr,
+            $c:expr,
+            $d:expr,
+            $e:expr,
+            $f:expr,
+            $g:expr,
+            $h:expr,
+            $i:expr,
+            $j:expr,
+            $k:expr,
+            $l:expr,
+            $m:expr,
+            $n:expr,
+            $o:expr,
+            $p:expr
+        ) => {
+            simd_shuffle16(
+                a,
+                b,
+                [
+                    $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
+                ],
+            );
+        };
+    }
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
+            match (imm8 >> 6) & 0x3 {
+                0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28),
+                1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29),
+                2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30),
+                _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31),
+            }
+        };
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
+            match (imm8 >> 4) & 0x3 {
+                0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28),
+                1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29),
+                2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30),
+                _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31),
+            }
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr, $e:expr, $i: expr, $m: expr) => {
+            match (imm8 >> 2) & 0x3 {
+                0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12),
+                1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13),
+                2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14),
+                _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15),
+            }
+        };
+    }
+    let shuffle = match imm8 & 0x3 {
+        0 => shuffle1!(0, 4, 8, 12),
+        1 => shuffle1!(1, 5, 9, 13),
+        2 => shuffle1!(2, 6, 10, 14),
+        _ => shuffle1!(3, 7, 11, 15),
+    };
+
+    let zero = _mm512_setzero_ps().as_f32x16();
+    transmute(simd_select_bitmask(k, shuffle, zero))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector.
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_ps_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_pd&expand=5192)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp, op = 0))]
-pub unsafe fn _mm512_mask_cmpord_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
-    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_ORD_Q)
+#[cfg_attr(test, assert_instr(vshufpd, imm8 = 3))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm512_shuffle_pd(a: __m512d, b: __m512d, imm8: i32) -> __m512d {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle8 {
+        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => {
+            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]);
+        };
+    }
+    macro_rules! shuffle7 {
+        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr) => {
+            match (imm8 >> 7) & 0x1 {
+                0 => shuffle8!($a, $b, $c, $d, $e, $f, $g, 14),
+                _ => shuffle8!($a, $b, $c, $d, $e, $f, $g, 15),
+            }
+        };
+    }
+    macro_rules! shuffle6 {
+        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
+            match (imm8 >> 6) & 0x1 {
+                0 => shuffle7!($a, $b, $c, $d, $e, $f, 6),
+                _ => shuffle7!($a, $b, $c, $d, $e, $f, 7),
+            }
+        };
+    }
+    macro_rules! shuffle5 {
+        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr) => {
+            match (imm8 >> 5) & 0x1 {
+                0 => shuffle6!($a, $b, $c, $d, $e, 12),
+                _ => shuffle6!($a, $b, $c, $d, $e, 13),
+            }
+        };
+    }
+    macro_rules! shuffle4 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            match (imm8 >> 4) & 0x1 {
+                0 => shuffle5!($a, $b, $c, $d, 4),
+                _ => shuffle5!($a, $b, $c, $d, 5),
+            }
+        };
+    }
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr) => {
+            match (imm8 >> 3) & 0x1 {
+                0 => shuffle4!($a, $b, $c, 10),
+                _ => shuffle4!($a, $b, $c, 11),
+            }
+        };
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 2) & 0x1 {
+                0 => shuffle3!($a, $b, 2),
+                _ => shuffle3!($a, $b, 3),
+            }
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, 8),
+                _ => shuffle2!($a, 9),
+            }
+        };
+    }
+    match imm8 & 0x1 {
+        0 => shuffle1!(0),
+        _ => shuffle1!(1),
+    }
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector.
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
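// Editor's aside (illustration only, not part of the patch; assumptions as
// above): the imm8 macro ladders above only decode four 2-bit element
// selectors, applied identically to every 128-bit lane. With
// imm8 = 0b01_00_11_10 each four-element lane becomes [a2, a3, b0, b1].
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn shuffle_demo(a: __m512, b: __m512) -> __m512 {
    // The low two selectors index into a, the high two into b, per lane.
    _mm512_shuffle_ps(a, b, 0b01_00_11_10)
}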
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_ps_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_pd&expand=5190)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp, op = 0))]
-pub unsafe fn _mm512_cmpunord_ps_mask(a: __m512, b: __m512) -> __mmask16 {
-    _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q)
+#[cfg_attr(test, assert_instr(vshufpd, imm8 = 3))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_mask_shuffle_pd(
+    src: __m512d,
+    k: __mmask8,
+    a: __m512d,
+    b: __m512d,
+    imm8: i32,
+) -> __m512d {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle8 {
+        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => {
+            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]);
+        };
+    }
+    macro_rules! shuffle7 {
+        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr) => {
+            match (imm8 >> 7) & 0x1 {
+                0 => shuffle8!($a, $b, $c, $d, $e, $f, $g, 14),
+                _ => shuffle8!($a, $b, $c, $d, $e, $f, $g, 15),
+            }
+        };
+    }
+    macro_rules! shuffle6 {
+        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
+            match (imm8 >> 6) & 0x1 {
+                0 => shuffle7!($a, $b, $c, $d, $e, $f, 6),
+                _ => shuffle7!($a, $b, $c, $d, $e, $f, 7),
+            }
+        };
+    }
+    macro_rules! shuffle5 {
+        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr) => {
+            match (imm8 >> 5) & 0x1 {
+                0 => shuffle6!($a, $b, $c, $d, $e, 12),
+                _ => shuffle6!($a, $b, $c, $d, $e, 13),
+            }
+        };
+    }
+    macro_rules! shuffle4 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            match (imm8 >> 4) & 0x1 {
+                0 => shuffle5!($a, $b, $c, $d, 4),
+                _ => shuffle5!($a, $b, $c, $d, 5),
+            }
+        };
+    }
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr) => {
+            match (imm8 >> 3) & 0x1 {
+                0 => shuffle4!($a, $b, $c, 10),
+                _ => shuffle4!($a, $b, $c, 11),
+            }
+        };
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 2) & 0x1 {
+                0 => shuffle3!($a, $b, 2),
+                _ => shuffle3!($a, $b, 3),
+            }
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, 8),
+                _ => shuffle2!($a, 9),
+            }
+        };
+    }
+    let shuffle = match imm8 & 0x1 {
+        0 => shuffle1!(0),
+        _ => shuffle1!(1),
+    };
+
+    transmute(simd_select_bitmask(k, shuffle, src.as_f64x8()))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector.
+/// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_ps_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_pd&expand=5191)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp, op = 0))]
-pub unsafe fn _mm512_mask_cmpunord_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
-    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_UNORD_Q)
+#[cfg_attr(test, assert_instr(vshufpd, imm8 = 3))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_maskz_shuffle_pd(k: __mmask8, a: __m512d, b: __m512d, imm8: i32) -> __m512d {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle8 {
+        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => {
+            simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]);
+        };
+    }
+    macro_rules! shuffle7 {
+        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr) => {
+            match (imm8 >> 7) & 0x1 {
+                0 => shuffle8!($a, $b, $c, $d, $e, $f, $g, 14),
+                _ => shuffle8!($a, $b, $c, $d, $e, $f, $g, 15),
+            }
+        };
+    }
+    macro_rules! shuffle6 {
+        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
+            match (imm8 >> 6) & 0x1 {
+                0 => shuffle7!($a, $b, $c, $d, $e, $f, 6),
+                _ => shuffle7!($a, $b, $c, $d, $e, $f, 7),
+            }
+        };
+    }
+    macro_rules! shuffle5 {
+        ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr) => {
+            match (imm8 >> 5) & 0x1 {
+                0 => shuffle6!($a, $b, $c, $d, $e, 12),
+                _ => shuffle6!($a, $b, $c, $d, $e, 13),
+            }
+        };
+    }
+    macro_rules! shuffle4 {
+        ($a:expr, $b:expr, $c:expr, $d:expr) => {
+            match (imm8 >> 4) & 0x1 {
+                0 => shuffle5!($a, $b, $c, $d, 4),
+                _ => shuffle5!($a, $b, $c, $d, 5),
+            }
+        };
+    }
+    macro_rules! shuffle3 {
+        ($a:expr, $b:expr, $c:expr) => {
+            match (imm8 >> 3) & 0x1 {
+                0 => shuffle4!($a, $b, $c, 10),
+                _ => shuffle4!($a, $b, $c, 11),
+            }
+        };
+    }
+    macro_rules! shuffle2 {
+        ($a:expr, $b:expr) => {
+            match (imm8 >> 2) & 0x1 {
+                0 => shuffle3!($a, $b, 2),
+                _ => shuffle3!($a, $b, 3),
+            }
+        };
+    }
+    macro_rules! shuffle1 {
+        ($a:expr) => {
+            match (imm8 >> 1) & 0x1 {
+                0 => shuffle2!($a, 8),
+                _ => shuffle2!($a, 9),
+            }
+        };
+    }
+    let shuffle = match imm8 & 0x1 {
+        0 => shuffle1!(0),
+        _ => shuffle1!(1),
+    };
+
+    let zero = _mm512_setzero_pd().as_f64x8();
+    transmute(simd_select_bitmask(k, shuffle, zero))
 }
-/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in a mask vector.
+/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_epi32&expand=272)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_cmp_pd_mask(a, b, _CMP_LT_OS)
+#[cfg_attr(test, assert_instr(vpandq))] // should be vpandd, but LLVM generates vpandq
+pub unsafe fn _mm512_and_epi32(a: __m512i, b: __m512i) -> __m512i {
+    transmute(simd_and(a.as_i32x16(), b.as_i32x16()))
 }
-/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Performs element-by-element bitwise AND between packed 32-bit integer elements of v2 and v3, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
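// Editor's aside (illustration only, not part of the patch; assumptions as
// above): writemask semantics for the bitwise ops; lanes with a clear mask
// bit pass src through unchanged.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn and_low_half(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
    // Only the low eight 32-bit lanes are ANDed; the high eight keep src.
    _mm512_mask_and_epi32(src, 0x00FF, a, b)
}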
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_and_epi32&expand=273)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_mask_cmplt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LT_OS)
+#[cfg_attr(test, assert_instr(vpandd))]
+pub unsafe fn _mm512_mask_and_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let and = _mm512_and_epi32(a, b).as_i32x16();
+    transmute(simd_select_bitmask(k, and, src.as_i32x16()))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector.
+/// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnlt_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_and_epi32&expand=274)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_cmpnlt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_cmp_pd_mask(a, b, _CMP_NLT_US)
+#[cfg_attr(test, assert_instr(vpandd))]
+pub unsafe fn _mm512_maskz_and_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let and = _mm512_and_epi32(a, b).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, and, zero))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in a and b, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnlt_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_epi64&expand=279)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_mask_cmpnlt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLT_US)
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm512_and_epi64(a: __m512i, b: __m512i) -> __m512i {
+    transmute(simd_and(a.as_i64x8(), b.as_i64x8()))
 }
-/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector.
+/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_and_epi64&expand=280)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_cmp_pd_mask(a, b, _CMP_LE_OS)
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm512_mask_and_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    let and = _mm512_and_epi64(a, b).as_i64x8();
+    transmute(simd_select_bitmask(k, and, src.as_i64x8()))
 }
-/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_and_epi64&expand=281)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_mask_cmple_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LE_OS)
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm512_maskz_and_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    let and = _mm512_and_epi64(a, b).as_i64x8();
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, and, zero))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector.
+/// Compute the bitwise AND of 512 bits (representing integer data) in a and b, and store the result in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnle_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_si512&expand=302)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_cmpnle_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_cmp_pd_mask(a, b, _CMP_NLE_US)
+#[cfg_attr(test, assert_instr(vpandq))]
+pub unsafe fn _mm512_and_si512(a: __m512i, b: __m512i) -> __m512i {
+    transmute(simd_and(a.as_i32x16(), b.as_i32x16()))
 }
-/// Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst.
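// Editor's aside (illustration only, not part of the patch; assumptions as
// above): for the unmasked forms the lane width is irrelevant, so
// _mm512_and_epi32, _mm512_and_epi64 and _mm512_and_si512 all compute the
// same 512-bit AND; the width only matters once a per-lane mask is applied.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn any_and(a: __m512i, b: __m512i) -> __m512i {
    _mm512_and_si512(a, b) // bit-identical to _mm512_and_epi32(a, b)
}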
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnle_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_or_epi32&expand=4042)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_mask_cmpnle_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLE_US)
+#[cfg_attr(test, assert_instr(vporq))]
+pub unsafe fn _mm512_or_epi32(a: __m512i, b: __m512i) -> __m512i {
+    transmute(simd_or(a.as_i32x16(), b.as_i32x16()))
 }
-/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in a mask vector.
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_or_epi32&expand=4040)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_cmpeq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ)
+#[cfg_attr(test, assert_instr(vpord))]
+pub unsafe fn _mm512_mask_or_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let or = _mm512_or_epi32(a, b).as_i32x16();
+    transmute(simd_select_bitmask(k, or, src.as_i32x16()))
 }
-/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_or_epi32&expand=4041)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_mask_cmpeq_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_mask_cmp_pd_mask(m, a, b, _CMP_EQ_OQ)
+#[cfg_attr(test, assert_instr(vpord))]
+pub unsafe fn _mm512_maskz_or_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let or = _mm512_or_epi32(a, b).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, or, zero))
 }
-/// Compare packed double-precision (64-bit) floating-point elements in a and b for inequality, and store the results in a mask vector.
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_or_epi64&expand=4051)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_cmp_pd_mask(a, b, _CMP_NEQ_UQ)
+#[cfg_attr(test, assert_instr(vporq))]
+pub unsafe fn _mm512_or_epi64(a: __m512i, b: __m512i) -> __m512i {
+    transmute(simd_or(a.as_i64x8(), b.as_i64x8()))
 }
-/// Compare packed double-precision (64-bit) floating-point elements in a and b for inequality, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_pd_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_or_epi64&expand=4049)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp))]
-pub unsafe fn _mm512_mask_cmpneq_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_UQ)
+#[cfg_attr(test, assert_instr(vporq))]
+pub unsafe fn _mm512_mask_or_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    let or = _mm512_or_epi64(a, b).as_i64x8();
+    transmute(simd_select_bitmask(k, or, src.as_i64x8()))
 }
-/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op.
+/// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_pd_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_or_epi64&expand=4050)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(2)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0))]
-pub unsafe fn _mm512_cmp_pd_mask(a: __m512d, b: __m512d, op: i32) -> __mmask8 {
-    let neg_one = -1;
-    macro_rules! call {
-        ($imm5:expr) => {
-            vcmppd(
-                a.as_f64x8(),
-                b.as_f64x8(),
-                $imm5,
-                neg_one,
-                _MM_FROUND_CUR_DIRECTION,
-            )
-        };
-    }
-    let r = constify_imm5!(op, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(vporq))]
+pub unsafe fn _mm512_maskz_or_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    let or = _mm512_or_epi64(a, b).as_i64x8();
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, or, zero))
 }
-/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op,
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compute the bitwise OR of 512 bits (representing integer data) in a and b, and store the result in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_pd_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_or_si512&expand=4072)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(3)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0))]
-pub unsafe fn _mm512_mask_cmp_pd_mask(m: __mmask8, a: __m512d, b: __m512d, op: i32) -> __mmask8 {
-    macro_rules! call {
-        ($imm5:expr) => {
-            vcmppd(
-                a.as_f64x8(),
-                b.as_f64x8(),
-                $imm5,
-                m as i8,
-                _MM_FROUND_CUR_DIRECTION,
-            )
-        };
-    }
-    let r = constify_imm5!(op, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(vporq))]
+pub unsafe fn _mm512_or_si512(a: __m512i, b: __m512i) -> __m512i {
+    transmute(simd_or(a.as_i32x16(), b.as_i32x16()))
 }
-/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op.
+/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_pd_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_xor_epi32&expand=6142)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(2, 3)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
-pub unsafe fn _mm512_cmp_round_pd_mask(a: __m512d, b: __m512d, op: i32, sae: i32) -> __mmask8 {
-    let neg_one = -1;
-    macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, neg_one, $imm4)
-        };
-    }
-    let r = constify_imm5_sae!(op, sae, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(vpxorq))]
+pub unsafe fn _mm512_xor_epi32(a: __m512i, b: __m512i) -> __m512i {
+    transmute(simd_xor(a.as_i32x16(), b.as_i32x16()))
 }
-/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op,
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_pd_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_xor_epi32&expand=6140)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(3, 4)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
-pub unsafe fn _mm512_mask_cmp_round_pd_mask(
-    m: __mmask8,
-    a: __m512d,
-    b: __m512d,
-    op: i32,
-    sae: i32,
-) -> __mmask8 {
-    macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, m as i8, $imm4)
-        };
-    }
-    let r = constify_imm5_sae!(op, sae, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(vpxord))]
+pub unsafe fn _mm512_mask_xor_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let xor = _mm512_xor_epi32(a, b).as_i32x16();
+    transmute(simd_select_bitmask(k, xor, src.as_i32x16()))
 }
-/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector.
-/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector.
+/// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_pd_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_xor_epi32&expand=6141)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp, op = 0))]
-pub unsafe fn _mm512_cmpord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_cmp_pd_mask(a, b, _CMP_ORD_Q)
+#[cfg_attr(test, assert_instr(vpxord))]
+pub unsafe fn _mm512_maskz_xor_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    let xor = _mm512_xor_epi32(a, b).as_i32x16();
+    let zero = _mm512_setzero_si512().as_i32x16();
+    transmute(simd_select_bitmask(k, xor, zero))
 }
 
-/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector.
+/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_pd_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_xor_epi64&expand=6151)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp, op = 0))]
-pub unsafe fn _mm512_mask_cmpord_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_mask_cmp_pd_mask(m, a, b, _CMP_ORD_Q)
+#[cfg_attr(test, assert_instr(vpxorq))]
+pub unsafe fn _mm512_xor_epi64(a: __m512i, b: __m512i) -> __m512i {
+    transmute(simd_xor(a.as_i64x8(), b.as_i64x8()))
 }
 
-/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector.
+/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_pd_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_xor_epi64&expand=6149)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp, op = 0))]
-pub unsafe fn _mm512_cmpunord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q)
+#[cfg_attr(test, assert_instr(vpxorq))]
+pub unsafe fn _mm512_mask_xor_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    let xor = _mm512_xor_epi64(a, b).as_i64x8();
+    transmute(simd_select_bitmask(k, xor, src.as_i64x8()))
 }
 
-/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector.
+/// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_pd_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_xor_epi64&expand=6150)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vcmp, op = 0))]
-pub unsafe fn _mm512_mask_cmpunord_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
-    _mm512_mask_cmp_pd_mask(m, a, b, _CMP_UNORD_Q)
+#[cfg_attr(test, assert_instr(vpxorq))]
+pub unsafe fn _mm512_maskz_xor_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    let xor = _mm512_xor_epi64(a, b).as_i64x8();
+    let zero = _mm512_setzero_si512().as_i64x8();
+    transmute(simd_select_bitmask(k, xor, zero))
 }
 
-/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector.
+/// Compute the bitwise XOR of 512 bits (representing integer data) in a and b, and store the result in dst.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ss_mask&expand=5236,755,757)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_xor_si512&expand=6172)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(2)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
-pub unsafe fn _mm_cmp_ss_mask(a: __m128, b: __m128, op: i32) -> __mmask8 {
-    let neg_one = -1;
-    macro_rules! call {
-        ($imm5:expr) => {
-            vcmpss(a, b, $imm5, neg_one, _MM_FROUND_CUR_DIRECTION)
-        };
-    }
-    let r = constify_imm5!(op, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(vpxorq))]
+pub unsafe fn _mm512_xor_si512(a: __m512i, b: __m512i) -> __m512i {
+    transmute(simd_xor(a.as_i32x16(), b.as_i32x16()))
 }
 
-/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set).
+/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ss_mask&expand=5236,755,757)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kand_mask16&expand=3212)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(3)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
-pub unsafe fn _mm_mask_cmp_ss_mask(m: __mmask8, a: __m128, b: __m128, op: i32) -> __mmask8 {
-    macro_rules! call {
-        ($imm5:expr) => {
-            vcmpss(a, b, $imm5, m as i8, _MM_FROUND_CUR_DIRECTION)
-        };
-    }
-    let r = constify_imm5!(op, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw
+pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+    transmute(kandw(a, b))
 }
 
-/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector.
+/// Compute the bitwise AND of 16-bit masks a and b, and store the result in k.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_ss_mask&expand=5236,755,757)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kand&expand=3210)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(2, 3)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
-pub unsafe fn _mm_cmp_round_ss_mask(a: __m128, b: __m128, op: i32, sae: i32) -> __mmask8 {
-    let neg_one = -1;
-    macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmpss(a, b, $imm5, neg_one, $imm4)
-        };
-    }
-    let r = constify_imm5_sae!(op, sae, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw
+pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 {
+    transmute(kandw(a, b))
 }
 
-/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set).
+/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_ss_mask&expand=5236,755,757)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kor_mask16&expand=3239)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(3, 4)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
-pub unsafe fn _mm_mask_cmp_round_ss_mask(
-    m: __mmask8,
-    a: __m128,
-    b: __m128,
-    op: i32,
-    sae: i32,
-) -> __mmask8 {
-    macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmpss(a, b, $imm5, m as i8, $imm4)
-        };
-    }
-    let r = constify_imm5_sae!(op, sae, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
+pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+    transmute(korw(a, b))
 }
 
-/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector.
+/// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd_mask&expand=5236,755,757)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kor&expand=3237)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(2)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
-pub unsafe fn _mm_cmp_sd_mask(a: __m128d, b: __m128d, op: i32) -> __mmask8 {
-    let neg_one = -1;
-    macro_rules! call {
-        ($imm5:expr) => {
-            vcmpsd(a, b, $imm5, neg_one, _MM_FROUND_CUR_DIRECTION)
-        };
-    }
-    let r = constify_imm5!(op, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
+pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 {
+    transmute(korw(a, b))
 }
 
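Since a 16-bit mask is a plain integer, these k-register helpers behave exactly like scalar bitwise ops, which is also why the assert_instr annotations above accept plain and/or code. A sketch (not part of the patch; nightly/avx512f assumed, and _kxor_mask16 is defined just below):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn mask_ops_agree() {
    let a: __mmask16 = 0b1010_1010_1010_1010;
    let b: __mmask16 = 0b1111_0000_1111_0000;
    assert_eq!(_kand_mask16(a, b), a & b);
    assert_eq!(_kor_mask16(a, b), a | b);
    assert_eq!(_kxor_mask16(a, b), a ^ b);
}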
-/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set).
+/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sd_mask&expand=5236,755,757)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kxor_mask16&expand=3291)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(3)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
-pub unsafe fn _mm_mask_cmp_sd_mask(m: __mmask8, a: __m128d, b: __m128d, op: i32) -> __mmask8 {
-    macro_rules! call {
-        ($imm5:expr) => {
-            vcmpsd(a, b, $imm5, m as i8, _MM_FROUND_CUR_DIRECTION)
-        };
-    }
-    let r = constify_imm5!(op, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
+pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
+    transmute(kxorw(a, b))
 }
 
-/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector.
+/// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sd_mask&expand=5236,755,757)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kxor&expand=3289)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(2, 3)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
-pub unsafe fn _mm_cmp_round_sd_mask(a: __m128d, b: __m128d, op: i32, sae: i32) -> __mmask8 {
-    let neg_one = -1;
-    macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmpsd(a, b, $imm5, neg_one, $imm4)
-        };
-    }
-    let r = constify_imm5_sae!(op, sae, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
+pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 {
+    transmute(kxorw(a, b))
 }
 
-/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set).
+/// Sets packed single-precision (32-bit) floating-point elements in `dst` with the supplied values.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sd_mask&expand=5236,755,757)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(3, 4)]
-#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
-pub unsafe fn _mm_mask_cmp_round_sd_mask(
-    m: __mmask8,
-    a: __m128d,
-    b: __m128d,
-    op: i32,
-    sae: i32,
-) -> __mmask8 {
-    macro_rules! call {
-        ($imm5:expr, $imm4:expr) => {
-            vcmpsd(a, b, $imm5, m as i8, $imm4)
-        };
-    }
-    let r = constify_imm5_sae!(op, sae, call);
-    transmute(r)
+pub unsafe fn _mm512_set_ps(
+    e0: f32,
+    e1: f32,
+    e2: f32,
+    e3: f32,
+    e4: f32,
+    e5: f32,
+    e6: f32,
+    e7: f32,
+    e8: f32,
+    e9: f32,
+    e10: f32,
+    e11: f32,
+    e12: f32,
+    e13: f32,
+    e14: f32,
+    e15: f32,
+) -> __m512 {
+    _mm512_setr_ps(
+        e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0,
+    )
 }
 
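Note the argument order: _mm512_set_ps takes the highest lane first and simply forwards to _mm512_setr_ps (defined next) with the arguments reversed. A sketch (not part of the patch; nightly/avx512f assumed):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn set_vs_setr() {
    let v = _mm512_set_ps(
        15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
    );
    let lanes: [f32; 16] = std::mem::transmute(v);
    assert_eq!(lanes[0], 0.0); // the last argument lands in lane 0
    assert_eq!(lanes[15], 15.0); // the first argument lands in lane 15
}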
-/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector.
+/// Sets packed single-precision (32-bit) floating-point elements in `dst` with the supplied values in
+/// reverse order.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_setr_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
-    simd_bitmask::<u32x16, _>(simd_lt(a.as_u32x16(), b.as_u32x16()))
+pub unsafe fn _mm512_setr_ps(
+    e0: f32,
+    e1: f32,
+    e2: f32,
+    e3: f32,
+    e4: f32,
+    e5: f32,
+    e6: f32,
+    e7: f32,
+    e8: f32,
+    e9: f32,
+    e10: f32,
+    e11: f32,
+    e12: f32,
+    e13: f32,
+    e14: f32,
+    e15: f32,
+) -> __m512 {
+    let r = f32x16::new(
+        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
+    );
+    transmute(r)
 }
 
-/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epu32)
+/// Broadcast 64-bit float `a` to all elements of `dst`.
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmplt_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
-    _mm512_cmplt_epu32_mask(a, b) & m
+pub unsafe fn _mm512_set1_pd(a: f64) -> __m512d {
+    transmute(f64x8::splat(a))
 }
 
-/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in a mask vector.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epu32)
+/// Broadcast 32-bit float `a` to all elements of `dst`.
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
-    simd_bitmask::<u32x16, _>(simd_gt(a.as_u32x16(), b.as_u32x16()))
+pub unsafe fn _mm512_set1_ps(a: f32) -> __m512 {
+    transmute(f32x16::splat(a))
 }
 
-/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epu32)
+/// Sets packed 32-bit integers in `dst` with the supplied values.
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpgt_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
-    _mm512_cmpgt_epu32_mask(a, b) & m
+pub unsafe fn _mm512_set_epi32(
+    e15: i32,
+    e14: i32,
+    e13: i32,
+    e12: i32,
+    e11: i32,
+    e10: i32,
+    e9: i32,
+    e8: i32,
+    e7: i32,
+    e6: i32,
+    e5: i32,
+    e4: i32,
+    e3: i32,
+    e2: i32,
+    e1: i32,
+    e0: i32,
+) -> __m512i {
+    _mm512_setr_epi32(
+        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
+    )
 }
 
-/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector.
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epu32)
+/// Broadcast 32-bit integer `a` to all elements of `dst`.
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
-    simd_bitmask::<u32x16, _>(simd_le(a.as_u32x16(), b.as_u32x16()))
+pub unsafe fn _mm512_set1_epi32(a: i32) -> __m512i {
+    transmute(i32x16::splat(a))
 }
 
-/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
-///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epu32)
+/// Broadcast 64-bit integer `a` to all elements of `dst`.
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmple_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
-    _mm512_cmple_epu32_mask(a, b) & m
+pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i {
+    transmute(i64x8::splat(a))
 }
 
-/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector.
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epu32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
-    simd_bitmask::<u32x16, _>(simd_ge(a.as_u32x16(), b.as_u32x16()))
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask(a, b, _CMP_LT_OS)
 }
 
-/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in a mask vector k
 /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epu32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpge_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
-    _mm512_cmpge_epu32_mask(a, b) & m
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_mask_cmplt_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LT_OS)
 }
 
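Each lane comparison sets one bit of the returned mask, lane 0 in bit 0, so the mask can be inspected with ordinary integer operations. A sketch (not part of the patch; nightly/avx512f assumed):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn count_smaller(a: __m512, b: __m512) -> u32 {
    _mm512_cmplt_ps_mask(a, b).count_ones() // a 16-bit mask is a plain u16
}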
-/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in a mask vector.
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epu32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnlt_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
-    simd_bitmask::<u32x16, _>(simd_eq(a.as_u32x16(), b.as_u32x16()))
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_cmpnlt_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask(a, b, _CMP_NLT_US)
 }
 
-/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in a mask vector k
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in a mask vector k
 /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epu32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnlt_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpeq_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
-    _mm512_cmpeq_epu32_mask(a, b) & m
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_mask_cmpnlt_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLT_US)
 }
 
-/// Compare packed unsigned 32-bit integers in a and b for inequality, and store the results in a mask vector.
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epu32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
-    simd_bitmask::<u32x16, _>(simd_ne(a.as_u32x16(), b.as_u32x16()))
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask(a, b, _CMP_LE_OS)
 }
 
-/// Compare packed unsigned 32-bit integers in a and b for inequality, and store the results in a mask vector k
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector k
 /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epu32_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpneq_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
-    _mm512_cmpneq_epu32_mask(a, b) & m
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_mask_cmple_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LE_OS)
 }
 
-/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by op.
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu32_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnle_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(2)]
-#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
-pub unsafe fn _mm512_cmp_epu32_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask16 {
-    let neg_one = -1;
-    macro_rules! call {
-        ($imm3:expr) => {
-            vpcmpud(a.as_i32x16(), b.as_i32x16(), $imm3, neg_one)
-        };
-    }
-    let r = constify_imm3!(op, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_cmpnle_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask(a, b, _CMP_NLE_US)
 }
 
-/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by op,
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu32_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnle_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(3)]
-#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
-pub unsafe fn _mm512_mask_cmp_epu32_mask(
-    m: __mmask16,
-    a: __m512i,
-    b: __m512i,
-    op: _MM_CMPINT_ENUM,
-) -> __mmask16 {
-    macro_rules! call {
-        ($imm3:expr) => {
-            vpcmpud(a.as_i32x16(), b.as_i32x16(), $imm3, m as i16)
-        };
-    }
-    let r = constify_imm3!(op, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_mask_cmpnle_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLE_US)
 }
 
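The US predicates are unordered: they return true for a lane when either input is NaN, so cmpnlt is not the lane-wise negation of cmplt in the presence of NaNs. A sketch (not part of the patch; nightly/avx512f assumed):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn nan_lanes() {
    let a = _mm512_set1_ps(f32::NAN);
    let b = _mm512_set1_ps(1.0);
    assert_eq!(_mm512_cmplt_ps_mask(a, b), 0); // ordered: NaN compares false
    assert_eq!(_mm512_cmpnlt_ps_mask(a, b), 0xFFFF); // unordered: NaN lanes set
}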
-/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector.
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
-    simd_bitmask::<i32x16, _>(simd_lt(a.as_i32x16(), b.as_i32x16()))
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_cmpeq_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ)
 }
 
-/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector k
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in a mask vector k
 /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epi32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmplt_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
-    _mm512_cmplt_epi32_mask(a, b) & m
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_mask_cmpeq_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_EQ_OQ)
 }
 
-/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in a mask vector.
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for inequality, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epi32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_ps)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
-    simd_bitmask::<i32x16, _>(simd_gt(a.as_i32x16(), b.as_i32x16()))
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask(a, b, _CMP_NEQ_UQ)
 }
 
-/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in a mask vector k
+/// Compare packed single-precision (32-bit) floating-point elements in a and b for inequality, and store the results in a mask vector k
 /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epi32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_ps_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpgt_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
-    _mm512_cmpgt_epi32_mask(a, b) & m
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_mask_cmpneq_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_UQ)
 }
 
-/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector.
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epi32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_ps_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
-    simd_bitmask::<i32x16, _>(simd_le(a.as_i32x16(), b.as_i32x16()))
+#[rustc_args_required_const(2)]
+#[cfg_attr(test, assert_instr(vcmp, op = 0))]
+pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512, op: i32) -> __mmask16 {
+    let neg_one = -1;
+    macro_rules! call {
+        ($imm5:expr) => {
+            vcmpps(
+                a.as_f32x16(),
+                b.as_f32x16(),
+                $imm5,
+                neg_one,
+                _MM_FROUND_CUR_DIRECTION,
+            )
+        };
+    }
+    let r = constify_imm5!(op, call);
+    transmute(r)
 }
 
-/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op,
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epi32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_ps_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmple_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
-    _mm512_cmple_epi32_mask(a, b) & m
+#[rustc_args_required_const(3)]
+#[cfg_attr(test, assert_instr(vcmp, op = 0))]
+pub unsafe fn _mm512_mask_cmp_ps_mask(m: __mmask16, a: __m512, b: __m512, op: i32) -> __mmask16 {
+    macro_rules! call {
+        ($imm5:expr) => {
+            vcmpps(
+                a.as_f32x16(),
+                b.as_f32x16(),
+                $imm5,
+                m as i16,
+                _MM_FROUND_CUR_DIRECTION,
+            )
+        };
+    }
+    let r = constify_imm5!(op, call);
+    transmute(r)
 }
 
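All of the named ps predicates above are thin wrappers over _mm512_cmp_ps_mask, so any _CMP_* constant can be passed directly; note that op must be a compile-time constant (rustc_args_required_const). A sketch (not part of the patch; nightly/avx512f assumed):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn le_mask(a: __m512, b: __m512) -> __mmask16 {
    // Equivalent to _mm512_cmple_ps_mask(a, b).
    _mm512_cmp_ps_mask(a, b, _CMP_LE_OS)
}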
-/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector.
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epi32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_ps_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
-    simd_bitmask::<i32x16, _>(simd_ge(a.as_i32x16(), b.as_i32x16()))
+#[rustc_args_required_const(2, 3)]
+#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
+pub unsafe fn _mm512_cmp_round_ps_mask(a: __m512, b: __m512, op: i32, sae: i32) -> __mmask16 {
+    let neg_one = -1;
+    macro_rules! call {
+        ($imm5:expr, $imm4:expr) => {
+            vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, neg_one, $imm4)
+        };
+    }
+    let r = constify_imm5_sae!(op, sae, call);
+    transmute(r)
 }
 
-/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by op,
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epi32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_ps_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpge_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
-    _mm512_cmpge_epi32_mask(a, b) & m
+#[rustc_args_required_const(3, 4)]
+#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
+pub unsafe fn _mm512_mask_cmp_round_ps_mask(
+    m: __mmask16,
+    a: __m512,
+    b: __m512,
+    op: i32,
+    sae: i32,
+) -> __mmask16 {
+    macro_rules! call {
+        ($imm5:expr, $imm4:expr) => {
+            vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, m as i16, $imm4)
+        };
+    }
+    let r = constify_imm5_sae!(op, sae, call);
+    transmute(r)
 }
 
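The round variants add a sae parameter on top of op. A sketch of the intended use (not part of the patch; nightly/avx512f assumed, and the accepted sae constants are an assumption based on the sae = 4 default above, which is _MM_FROUND_CUR_DIRECTION):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn lt_mask_no_exceptions(a: __m512, b: __m512) -> __mmask16 {
    // _MM_FROUND_NO_EXC suppresses floating-point exceptions for the compare.
    _mm512_cmp_round_ps_mask(a, b, _CMP_LT_OS, _MM_FROUND_NO_EXC)
}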
-/// Compare packed signed 32-bit integers in a and b for equality, and store the results in a mask vector.
+/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epi32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_ps_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
-    simd_bitmask::<i32x16, _>(simd_eq(a.as_i32x16(), b.as_i32x16()))
+#[cfg_attr(test, assert_instr(vcmp, op = 0))]
+pub unsafe fn _mm512_cmpord_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask(a, b, _CMP_ORD_Q)
 }
 
-/// Compare packed signed 32-bit integers in a and b for equality, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epi32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_ps_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpeq_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
-    _mm512_cmpeq_epi32_mask(a, b) & m
+#[cfg_attr(test, assert_instr(vcmp, op = 0))]
+pub unsafe fn _mm512_mask_cmpord_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_ORD_Q)
 }
 
-/// Compare packed signed 32-bit integers in a and b for inequality, and store the results in a mask vector.
+/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epi32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_ps_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
-    simd_bitmask::<i32x16, _>(simd_ne(a.as_i32x16(), b.as_i32x16()))
+#[cfg_attr(test, assert_instr(vcmp, op = 0))]
+pub unsafe fn _mm512_cmpunord_ps_mask(a: __m512, b: __m512) -> __mmask16 {
+    _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q)
 }
 
-/// Compare packed signed 32-bit integers in a and b for inequality, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epi32)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_ps_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpneq_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
-    _mm512_cmpneq_epi32_mask(a, b) & m
+#[cfg_attr(test, assert_instr(vcmp, op = 0))]
+pub unsafe fn _mm512_mask_cmpunord_ps_mask(m: __mmask16, a: __m512, b: __m512) -> __mmask16 {
+    _mm512_mask_cmp_ps_mask(m, a, b, _CMP_UNORD_Q)
 }
 
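ord/unord give a direct NaN test: comparing a vector with itself is unordered exactly in its NaN lanes, and the two masks are complements. A sketch (not part of the patch; nightly/avx512f assumed):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn nan_mask(v: __m512) -> __mmask16 {
    let unord = _mm512_cmpunord_ps_mask(v, v); // bit set where v is NaN
    debug_assert_eq!(unord, !_mm512_cmpord_ps_mask(v, v));
    unord
}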
-/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by op.
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi32_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(2)]
-#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
-pub unsafe fn _mm512_cmp_epi32_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask16 {
-    let neg_one = -1;
-    macro_rules! call {
-        ($imm3:expr) => {
-            vpcmpd(a.as_i32x16(), b.as_i32x16(), $imm3, neg_one)
-        };
-    }
-    let r = constify_imm3!(op, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_cmp_pd_mask(a, b, _CMP_LT_OS)
 }
 
-/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by op,
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi32_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(3)]
-#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
-pub unsafe fn _mm512_mask_cmp_epi32_mask(
-    m: __mmask16,
-    a: __m512i,
-    b: __m512i,
-    op: _MM_CMPINT_ENUM,
-) -> __mmask16 {
-    macro_rules! call {
-        ($imm3:expr) => {
-            vpcmpd(a.as_i32x16(), b.as_i32x16(), $imm3, m as i16)
-        };
-    }
-    let r = constify_imm3!(op, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_mask_cmplt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LT_OS)
 }
 
-/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector.
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnlt_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmplt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
-    simd_bitmask::<__m512i, _>(simd_lt(a.as_u64x8(), b.as_u64x8()))
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_cmpnlt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_cmp_pd_mask(a, b, _CMP_NLT_US)
 }
 
-/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in a mask vector k
 /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epu64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnlt_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmplt_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
-    _mm512_cmplt_epu64_mask(a, b) & m
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_mask_cmpnlt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLT_US)
 }
 
-/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector.
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epu64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
-    simd_bitmask::<__m512i, _>(simd_gt(a.as_u64x8(), b.as_u64x8()))
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_cmp_pd_mask(a, b, _CMP_LE_OS)
 }
 
-/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector k
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in a mask vector k
 /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epu64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpgt_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
-    _mm512_cmpgt_epu64_mask(a, b) & m
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_mask_cmple_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LE_OS)
 }
 
-/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector.
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epu64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpnle_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
-    simd_bitmask::<__m512i, _>(simd_le(a.as_u64x8(), b.as_u64x8()))
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_cmpnle_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_cmp_pd_mask(a, b, _CMP_NLE_US)
 }
 
-/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in a mask vector k
 /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epu64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpnle_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmple_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
-    _mm512_cmple_epu64_mask(a, b) & m
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_mask_cmpnle_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLE_US)
 }
 
-/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector.
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epu64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
-    simd_bitmask::<__m512i, _>(simd_ge(a.as_u64x8(), b.as_u64x8()))
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_cmpeq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ)
 }
 
-/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in a mask vector k
 /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epu64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpge_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
-    _mm512_cmpge_epu64_mask(b, a) & m
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_mask_cmpeq_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_mask_cmp_pd_mask(m, a, b, _CMP_EQ_OQ)
 }
 
-/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector.
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for inequality, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epu64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_pd)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
-    simd_bitmask::<__m512i, _>(simd_eq(a.as_u64x8(), b.as_u64x8()))
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_cmp_pd_mask(a, b, _CMP_NEQ_UQ)
 }
 
-/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector k
+/// Compare packed double-precision (64-bit) floating-point elements in a and b for inequality, and store the results in a mask vector k
 /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epu64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_pd_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpeq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
-    _mm512_cmpeq_epu64_mask(a, b) & m
+#[cfg_attr(test, assert_instr(vcmp))]
+pub unsafe fn _mm512_mask_cmpneq_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_UQ)
 }
 
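The pd variants operate on 8 lanes and therefore return an 8-bit mask (a plain u8) rather than a 16-bit one. A sketch (not part of the patch; nightly/avx512f assumed):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn all_lanes_equal(a: __m512d, b: __m512d) -> bool {
    _mm512_cmpeq_pd_mask(a, b) == 0xFF // all 8 mask bits set
}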
-/// Compare packed unsigned 64-bit integers in a and b for inequality, and store the results in a mask vector.
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epu64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_pd_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
-    simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8()))
+#[rustc_args_required_const(2)]
+#[cfg_attr(test, assert_instr(vcmp, op = 0))]
+pub unsafe fn _mm512_cmp_pd_mask(a: __m512d, b: __m512d, op: i32) -> __mmask8 {
+    let neg_one = -1;
+    macro_rules! call {
+        ($imm5:expr) => {
+            vcmppd(
+                a.as_f64x8(),
+                b.as_f64x8(),
+                $imm5,
+                neg_one,
+                _MM_FROUND_CUR_DIRECTION,
+            )
+        };
+    }
+    let r = constify_imm5!(op, call);
+    transmute(r)
 }
 
-/// Compare packed unsigned 64-bit integers in a and b for inequality, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op,
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epu64_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_pd_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpneq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
-    _mm512_cmpneq_epu64_mask(a, b) & m
+#[rustc_args_required_const(3)]
+#[cfg_attr(test, assert_instr(vcmp, op = 0))]
+pub unsafe fn _mm512_mask_cmp_pd_mask(m: __mmask8, a: __m512d, b: __m512d, op: i32) -> __mmask8 {
+    macro_rules! call {
+        ($imm5:expr) => {
+            vcmppd(
+                a.as_f64x8(),
+                b.as_f64x8(),
+                $imm5,
+                m as i8,
+                _MM_FROUND_CUR_DIRECTION,
+            )
+        };
+    }
+    let r = constify_imm5!(op, call);
+    transmute(r)
 }
 
-/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by op.
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_pd_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(2)]
-#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
-pub unsafe fn _mm512_cmp_epu64_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask8 {
+#[rustc_args_required_const(2, 3)]
+#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
+pub unsafe fn _mm512_cmp_round_pd_mask(a: __m512d, b: __m512d, op: i32, sae: i32) -> __mmask8 {
     let neg_one = -1;
     macro_rules! call {
-        ($imm3:expr) => {
-            vpcmpuq(a.as_i64x8(), b.as_i64x8(), $imm3, neg_one)
+        ($imm5:expr, $imm4:expr) => {
+            vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, neg_one, $imm4)
         };
     }
-    let r = constify_imm3!(op, call);
+    let r = constify_imm5_sae!(op, sae, call);
     transmute(r)
 }
 
-/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by op,
+/// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by op,
 /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_round_pd_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(3)]
-#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
-pub unsafe fn _mm512_mask_cmp_epu64_mask(
+#[rustc_args_required_const(3, 4)]
+#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
+pub unsafe fn _mm512_mask_cmp_round_pd_mask(
     m: __mmask8,
-    a: __m512i,
-    b: __m512i,
-    op: _MM_CMPINT_ENUM,
+    a: __m512d,
+    b: __m512d,
+    op: i32,
+    sae: i32,
 ) -> __mmask8 {
     macro_rules! call {
-        ($imm3:expr) => {
-            vpcmpuq(a.as_i64x8(), b.as_i64x8(), $imm3, m as i8)
+        ($imm5:expr, $imm4:expr) => {
+            vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, m as i8, $imm4)
         };
     }
-    let r = constify_imm3!(op, call);
+    let r = constify_imm5_sae!(op, sae, call);
     transmute(r)
 }
 
-/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector.
+/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_pd_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
-    simd_bitmask::<__m512i, _>(simd_lt(a.as_i64x8(), b.as_i64x8()))
+#[cfg_attr(test, assert_instr(vcmp, op = 0))]
+pub unsafe fn _mm512_cmpord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
+    _mm512_cmp_pd_mask(a, b, _CMP_ORD_Q)
 }
 
-/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in a mask vector.
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpord_pd_mask) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmplt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmplt_epi64_mask(a, b) & m +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_mask_cmpord_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_ORD_Q) } -/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in a mask vector. +/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_pd_mask) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - simd_bitmask::<__m512i, _>(simd_gt(a.as_i64x8(), b.as_i64x8())) +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_cmpunord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q) } -/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in a mask vector k -/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in a mask vector. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpunord_pd_mask) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpgt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmpgt_epi64_mask(a, b) & m +#[cfg_attr(test, assert_instr(vcmp, op = 0))] +pub unsafe fn _mm512_mask_cmpunord_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + _mm512_mask_cmp_pd_mask(m, a, b, _CMP_UNORD_Q) } -/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector. +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ss_mask&expand=5236,755,757) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - simd_bitmask::<__m512i, _>(simd_le(a.as_i64x8(), b.as_i64x8())) +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_cmp_ss_mask(a: __m128, b: __m128, op: i32) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm5:expr) => { + vcmpss(a, b, $imm5, neg_one, _MM_FROUND_CUR_DIRECTION) + }; + } + let r = constify_imm5!(op, call); + transmute(r) } -/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k -/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set). /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ss_mask&expand=5236,755,757) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmple_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { - _mm512_cmple_epi64_mask(a, b) & m +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_mask_cmp_ss_mask(m: __mmask8, a: __m128, b: __m128, op: i32) -> __mmask8 { + macro_rules! call { + ($imm5:expr) => { + vcmpss(a, b, $imm5, m as i8, _MM_FROUND_CUR_DIRECTION) + }; + } + let r = constify_imm5!(op, call); + transmute(r) } -/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector. +/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epi64) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_ss_mask&expand=5236,755,757) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - simd_bitmask::<__m512i, _>(simd_ge(a.as_i64x8(), b.as_i64x8())) +#[rustc_args_required_const(2, 3)] +#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))] +pub unsafe fn _mm_cmp_round_ss_mask(a: __m128, b: __m128, op: i32, sae: i32) -> __mmask8 { + let neg_one = -1; + macro_rules! 
call {
+        ($imm5:expr, $imm4:expr) => {
+            vcmpss(a, b, $imm5, neg_one, $imm4)
+        };
+    }
+    let r = constify_imm5_sae!(op, sae, call);
+    transmute(r)
 }

-/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epi64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_ss_mask&expand=5236,755,757)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpge_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
-    _mm512_cmpge_epi64_mask(b, a) & m
+#[rustc_args_required_const(3, 4)]
+#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
+pub unsafe fn _mm_mask_cmp_round_ss_mask(
+    m: __mmask8,
+    a: __m128,
+    b: __m128,
+    op: i32,
+    sae: i32,
+) -> __mmask8 {
+    macro_rules! call {
+        ($imm5:expr, $imm4:expr) => {
+            vcmpss(a, b, $imm5, m as i8, $imm4)
+        };
+    }
+    let r = constify_imm5_sae!(op, sae, call);
+    transmute(r)
 }

-/// Compare packed signed 64-bit integers in a and b for equality, and store the results in a mask vector.
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epi64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd_mask&expand=5236,755,757)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_args_required_const(2)]
+#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
+pub unsafe fn _mm_cmp_sd_mask(a: __m128d, b: __m128d, op: i32) -> __mmask8 {
+    let neg_one = -1;
+    macro_rules! call {
+        ($imm5:expr) => {
+            vcmpsd(a, b, $imm5, neg_one, _MM_FROUND_CUR_DIRECTION)
+        };
+    }
+    let r = constify_imm5!(op, call);
+    transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sd_mask&expand=5236,755,757)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_args_required_const(3)]
+#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
+pub unsafe fn _mm_mask_cmp_sd_mask(m: __mmask8, a: __m128d, b: __m128d, op: i32) -> __mmask8 {
+    macro_rules! call {
+        ($imm5:expr) => {
+            vcmpsd(a, b, $imm5, m as i8, _MM_FROUND_CUR_DIRECTION)
+        };
+    }
+    let r = constify_imm5!(op, call);
+    transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sd_mask&expand=5236,755,757)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_args_required_const(2, 3)]
+#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
+pub unsafe fn _mm_cmp_round_sd_mask(a: __m128d, b: __m128d, op: i32, sae: i32) -> __mmask8 {
+    let neg_one = -1;
+    macro_rules! call {
+        ($imm5:expr, $imm4:expr) => {
+            vcmpsd(a, b, $imm5, neg_one, $imm4)
+        };
+    }
+    let r = constify_imm5_sae!(op, sae, call);
+    transmute(r)
+}
+
+/// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in a mask vector using zeromask m (the element is zeroed out when mask bit 0 is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sd_mask&expand=5236,755,757)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_args_required_const(3, 4)]
+#[cfg_attr(test, assert_instr(vcmp, op = 0, sae = 4))]
+pub unsafe fn _mm_mask_cmp_round_sd_mask(
+    m: __mmask8,
+    a: __m128d,
+    b: __m128d,
+    op: i32,
+    sae: i32,
+) -> __mmask8 {
+    macro_rules! call {
+        ($imm5:expr, $imm4:expr) => {
+            vcmpsd(a, b, $imm5, m as i8, $imm4)
+        };
+    }
+    let r = constify_imm5_sae!(op, sae, call);
+    transmute(r)
+}
+
+/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu32)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
-    simd_bitmask::<__m512i, _>(simd_eq(a.as_i64x8(), b.as_i64x8()))
+pub unsafe fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    simd_bitmask::<u32x16, _>(simd_lt(a.as_u32x16(), b.as_u32x16()))
 }

-/// Compare packed signed 64-bit integers in a and b for equality, and store the results in a mask vector k
+/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in a mask vector k
 /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epi64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epu32)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpeq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
-    _mm512_cmpeq_epi64_mask(a, b) & m
+pub unsafe fn _mm512_mask_cmplt_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_cmplt_epu32_mask(a, b) & m
 }

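(Editor's aside, not part of the patch.) The `vcmp`-backed intrinsics above take one of the `_CMP_*` predicate immediates instead of encoding a fixed comparison. A minimal usage sketch, assuming a nightly toolchain where these intrinsics are exported and reusing the `_CMP_LT_OQ` constant stdarch already defines for AVX:

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Counts the lanes where a[i] < b[i]; NaN lanes compare false under the
/// "ordered, quiet" predicate. Caller must ensure AVX-512F is available.
#[cfg(target_arch = "x86_64")]
unsafe fn lanes_lt(a: &[f64; 8], b: &[f64; 8]) -> u32 {
    let va = _mm512_loadu_pd(a.as_ptr());
    let vb = _mm512_loadu_pd(b.as_ptr());
    let m: __mmask8 = _mm512_cmp_pd_mask(va, vb, _CMP_LT_OQ);
    m.count_ones()
}
```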
-/// Compare packed signed 64-bit integers in a and b for inequality, and store the results in a mask vector.
+/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epi64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epu32)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
-    simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8()))
+pub unsafe fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    simd_bitmask::<u32x16, _>(simd_gt(a.as_u32x16(), b.as_u32x16()))
 }

-/// Compare packed signed 64-bit integers in a and b for inequality, and store the results in a mask vector k
+/// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in a mask vector k
 /// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epi64)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epu32)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmpneq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
-    _mm512_cmpneq_epi64_mask(a, b) & m
+pub unsafe fn _mm512_mask_cmpgt_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_cmpgt_epu32_mask(a, b) & m
 }

-/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by op.
+/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epu32)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(2)]
-#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
-pub unsafe fn _mm512_cmp_epi64_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask8 {
-    let neg_one = -1;
-    macro_rules! call {
-        ($imm3:expr) => {
-            vpcmpq(a.as_i64x8(), b.as_i64x8(), $imm3, neg_one)
-        };
-    }
-    let r = constify_imm3!(op, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    simd_bitmask::<u32x16, _>(simd_le(a.as_u32x16(), b.as_u32x16()))
 }

-/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by op,
-/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64_mask)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epu32)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[rustc_args_required_const(3)]
-#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
-pub unsafe fn _mm512_mask_cmp_epi64_mask(
-    m: __mmask8,
-    a: __m512i,
-    b: __m512i,
-    op: _MM_CMPINT_ENUM,
-) -> __mmask8 {
-    macro_rules! call {
-        ($imm3:expr) => {
-            vpcmpq(a.as_i64x8(), b.as_i64x8(), $imm3, m as i8)
-        };
-    }
-    let r = constify_imm3!(op, call);
-    transmute(r)
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmple_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_cmple_epu32_mask(a, b) & m
 }

-/// Returns vector of type `__m512d` with undefined elements.
+/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epu32)
 #[inline]
 #[target_feature(enable = "avx512f")]
-// This intrinsic has no corresponding instruction.
-pub unsafe fn _mm512_undefined_pd() -> __m512d {
-    _mm512_set1_pd(0.0)
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    simd_bitmask::<u32x16, _>(simd_ge(a.as_u32x16(), b.as_u32x16()))
 }

-/// Returns vector of type `__m512` with undefined elements.
+/// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epu32)
 #[inline]
 #[target_feature(enable = "avx512f")]
-// This intrinsic has no corresponding instruction.
-pub unsafe fn _mm512_undefined_ps() -> __m512 {
-    _mm512_set1_ps(0.0)
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpge_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_cmpge_epu32_mask(a, b) & m
 }

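(Editor's aside, not part of the patch.) With the four unsigned orderings in place, a per-lane range check is just the AND of two masks; a sketch, with `lo`/`hi` as hypothetical bounds broadcast via `_mm512_set1_epi32`:

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Mask of lanes with lo <= x[i] < hi, compared as unsigned 32-bit values.
/// Caller must ensure AVX-512F is available.
#[cfg(target_arch = "x86_64")]
unsafe fn in_range_epu32(x: __m512i, lo: u32, hi: u32) -> __mmask16 {
    let vlo = _mm512_set1_epi32(lo as i32);
    let vhi = _mm512_set1_epi32(hi as i32);
    // Both compares are unsigned, so values with the top bit set order correctly.
    _mm512_cmple_epu32_mask(vlo, x) & _mm512_cmplt_epu32_mask(x, vhi)
}
```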
-/// Loads 512-bits (composed of 8 packed double-precision (64-bit)
-/// floating-point elements) from memory into result.
-/// `mem_addr` does not need to be aligned on any particular boundary.
+/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epu32)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmovups))]
-pub unsafe fn _mm512_loadu_pd(mem_addr: *const f64) -> __m512d {
-    ptr::read_unaligned(mem_addr as *const __m512d)
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    simd_bitmask::<u32x16, _>(simd_eq(a.as_u32x16(), b.as_u32x16()))
 }

-/// Stores 512-bits (composed of 8 packed double-precision (64-bit)
-/// floating-point elements) from `a` into memory.
-/// `mem_addr` does not need to be aligned on any particular boundary.
+/// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epu32)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmovups))]
-pub unsafe fn _mm512_storeu_pd(mem_addr: *mut f64, a: __m512d) {
-    ptr::write_unaligned(mem_addr as *mut __m512d, a);
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpeq_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_cmpeq_epu32_mask(a, b) & m
 }

-/// Loads 512-bits (composed of 16 packed single-precision (32-bit)
-/// floating-point elements) from memory into result.
-/// `mem_addr` does not need to be aligned on any particular boundary.
+/// Compare packed unsigned 32-bit integers in a and b for inequality, and store the results in a mask vector.
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epu32)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmovups))]
-pub unsafe fn _mm512_loadu_ps(mem_addr: *const f32) -> __m512 {
-    ptr::read_unaligned(mem_addr as *const __m512)
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    simd_bitmask::<u32x16, _>(simd_ne(a.as_u32x16(), b.as_u32x16()))
 }

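(Editor's aside, not part of the patch.) A non-zero inequality mask means at least one lane differs, which gives a cheap whole-vector comparison:

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// True iff any 32-bit lane of `a` and `b` differs. Requires AVX-512F.
#[cfg(target_arch = "x86_64")]
unsafe fn any_lane_differs(a: __m512i, b: __m512i) -> bool {
    _mm512_cmpneq_epu32_mask(a, b) != 0
}
```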
-/// Stores 512-bits (composed of 16 packed single-precision (32-bit)
-/// floating-point elements) from `a` into memory.
-/// `mem_addr` does not need to be aligned on any particular boundary.
+/// Compare packed unsigned 32-bit integers in a and b for inequality, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ps)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epu32_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmovups))]
-#[stable(feature = "simd_x86", since = "1.27.0")]
-pub unsafe fn _mm512_storeu_ps(mem_addr: *mut f32, a: __m512) {
-    ptr::write_unaligned(mem_addr as *mut __m512, a);
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpneq_epu32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_cmpneq_epu32_mask(a, b) & m
 }

-/// Sets packed 64-bit integers in `dst` with the supplied values in
-/// reverse order.
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by op.
 ///
-/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu32_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-pub unsafe fn _mm512_setr_pd(
-    e0: f64,
-    e1: f64,
-    e2: f64,
-    e3: f64,
-    e4: f64,
-    e5: f64,
-    e6: f64,
-    e7: f64,
-) -> __m512d {
-    let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7);
+#[rustc_args_required_const(2)]
+#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
+pub unsafe fn _mm512_cmp_epu32_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask16 {
+    let neg_one = -1;
+    macro_rules! call {
+        ($imm3:expr) => {
+            vpcmpud(a.as_i32x16(), b.as_i32x16(), $imm3, neg_one)
+        };
+    }
+    let r = constify_imm3!(op, call);
     transmute(r)
 }

-/// Sets packed 64-bit integers in `dst` with the supplied values.
+/// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by op,
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
 ///
-/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_pd)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu32_mask)
 #[inline]
 #[target_feature(enable = "avx512f")]
-pub unsafe fn _mm512_set_pd(
-    e0: f64,
-    e1: f64,
+#[rustc_args_required_const(3)]
+#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
+pub unsafe fn _mm512_mask_cmp_epu32_mask(
+    m: __mmask16,
+    a: __m512i,
+    b: __m512i,
+    op: _MM_CMPINT_ENUM,
+) -> __mmask16 {
+    macro_rules! call {
+        ($imm3:expr) => {
+            vpcmpud(a.as_i32x16(), b.as_i32x16(), $imm3, m as i16)
+        };
+    }
+    let r = constify_imm3!(op, call);
+    transmute(r)
+}
+
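(Editor's aside, not part of the patch.) The `op`-based form subsumes the fixed-predicate helpers; with the `_MM_CMPINT_LT` constant (assumed here to be defined alongside `_MM_CMPINT_ENUM` in this module) it should produce exactly the `cmplt` mask:

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Sketch: the two spellings of unsigned less-than agree. Requires AVX-512F.
#[cfg(target_arch = "x86_64")]
unsafe fn lt_two_ways_agree(a: __m512i, b: __m512i) -> bool {
    _mm512_cmp_epu32_mask(a, b, _MM_CMPINT_LT) == _mm512_cmplt_epu32_mask(a, b)
}
```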
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    simd_bitmask::<i32x16, _>(simd_lt(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmplt_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_cmplt_epi32_mask(a, b) & m
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    simd_bitmask::<i32x16, _>(simd_gt(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpgt_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_cmpgt_epi32_mask(a, b) & m
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    simd_bitmask::<i32x16, _>(simd_le(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmple_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_cmple_epi32_mask(a, b) & m
+}
+
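(Editor's aside, not part of the patch.) The signed and unsigned families interpret the same bit pattern differently, which the masks make easy to see: `-1i32` re-read as unsigned is `u32::MAX`.

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Requires AVX-512F at runtime (e.g. run under #[simd_test]).
#[cfg(target_arch = "x86_64")]
unsafe fn signedness_demo() {
    let a = _mm512_set1_epi32(-1);
    let b = _mm512_set1_epi32(0);
    assert_eq!(_mm512_cmplt_epi32_mask(a, b), 0xFFFF); // -1 < 0 as signed
    assert_eq!(_mm512_cmplt_epu32_mask(a, b), 0x0000); // u32::MAX > 0 as unsigned
}
```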
+/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    simd_bitmask::<i32x16, _>(simd_ge(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpge_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_cmpge_epi32_mask(a, b) & m
+}
+
+/// Compare packed signed 32-bit integers in a and b for equality, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    simd_bitmask::<i32x16, _>(simd_eq(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for equality, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpeq_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_cmpeq_epi32_mask(a, b) & m
+}
+
+/// Compare packed signed 32-bit integers in a and b for inequality, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+    simd_bitmask::<i32x16, _>(simd_ne(a.as_i32x16(), b.as_i32x16()))
+}
+
+/// Compare packed signed 32-bit integers in a and b for inequality, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epi32)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpneq_epi32_mask(m: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+    _mm512_cmpneq_epi32_mask(a, b) & m
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by op.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi32_mask)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_args_required_const(2)]
+#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
+pub unsafe fn _mm512_cmp_epi32_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask16 {
+    let neg_one = -1;
+    macro_rules! call {
+        ($imm3:expr) => {
+            vpcmpd(a.as_i32x16(), b.as_i32x16(), $imm3, neg_one)
+        };
+    }
+    let r = constify_imm3!(op, call);
+    transmute(r)
+}
+
+/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by op,
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi32_mask)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_args_required_const(3)]
+#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
+pub unsafe fn _mm512_mask_cmp_epi32_mask(
+    m: __mmask16,
+    a: __m512i,
+    b: __m512i,
+    op: _MM_CMPINT_ENUM,
+) -> __mmask16 {
+    macro_rules! call {
+        ($imm3:expr) => {
+            vpcmpd(a.as_i32x16(), b.as_i32x16(), $imm3, m as i16)
+        };
+    }
+    let r = constify_imm3!(op, call);
+    transmute(r)
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epu64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmplt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+    simd_bitmask::<__m512i, _>(simd_lt(a.as_u64x8(), b.as_u64x8()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epu64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmplt_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+    _mm512_cmplt_epu64_mask(a, b) & m
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epu64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+    simd_bitmask::<__m512i, _>(simd_gt(a.as_u64x8(), b.as_u64x8()))
+}
+
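(Editor's aside, not part of the patch.) Because the `mask_*` forms fold a zeromask into the compare, chained conditions need no explicit `&` between intermediate masks; a sketch combining equality and signed less-than:

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Mask of lanes where a == b and c < d (signed). Requires AVX-512F.
#[cfg(target_arch = "x86_64")]
unsafe fn eq_and_lt(a: __m512i, b: __m512i, c: __m512i, d: __m512i) -> __mmask16 {
    let eq = _mm512_cmpeq_epi32_mask(a, b);
    // Lanes cleared in `eq` stay cleared in the result.
    _mm512_mask_cmplt_epi32_mask(eq, c, d)
}
```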
+/// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epu64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpgt_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+    _mm512_cmpgt_epu64_mask(a, b) & m
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epu64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+    simd_bitmask::<__m512i, _>(simd_le(a.as_u64x8(), b.as_u64x8()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epu64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmple_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+    _mm512_cmple_epu64_mask(a, b) & m
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epu64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+    simd_bitmask::<__m512i, _>(simd_ge(a.as_u64x8(), b.as_u64x8()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epu64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpge_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+    _mm512_cmpge_epu64_mask(a, b) & m
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epu64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+    simd_bitmask::<__m512i, _>(simd_eq(a.as_u64x8(), b.as_u64x8()))
+}
+
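(Editor's aside, not part of the patch.) With the argument order in `_mm512_mask_cmpge_epu64_mask` corrected above (it previously compared `(b, a)`), every `mask_*` compare satisfies `mask_cmp(m, a, b) == cmp(a, b) & m`; a spot check one could add to the tests:

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

/// Requires AVX-512F.
#[cfg(target_arch = "x86_64")]
unsafe fn mask_cmpge_is_masked_cmpge(m: __mmask8, a: __m512i, b: __m512i) -> bool {
    _mm512_mask_cmpge_epu64_mask(m, a, b) == (_mm512_cmpge_epu64_mask(a, b) & m)
}
```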
+/// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epu64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpeq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+    _mm512_cmpeq_epu64_mask(a, b) & m
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for inequality, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epu64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+    simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8()))
+}
+
+/// Compare packed unsigned 64-bit integers in a and b for inequality, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epu64_mask)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpneq_epu64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+    _mm512_cmpneq_epu64_mask(a, b) & m
+}
+
+/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by op.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64_mask)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_args_required_const(2)]
+#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
+pub unsafe fn _mm512_cmp_epu64_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask8 {
+    let neg_one = -1;
+    macro_rules! call {
+        ($imm3:expr) => {
+            vpcmpuq(a.as_i64x8(), b.as_i64x8(), $imm3, neg_one)
+        };
+    }
+    let r = constify_imm3!(op, call);
+    transmute(r)
+}
+
+/// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by op,
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epu64_mask)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[rustc_args_required_const(3)]
+#[cfg_attr(test, assert_instr(vpcmp, op = 0))]
+pub unsafe fn _mm512_mask_cmp_epu64_mask(
+    m: __mmask8,
+    a: __m512i,
+    b: __m512i,
+    op: _MM_CMPINT_ENUM,
+) -> __mmask8 {
+    macro_rules! call {
+        ($imm3:expr) => {
+            vpcmpuq(a.as_i64x8(), b.as_i64x8(), $imm3, m as i8)
+        };
+    }
+    let r = constify_imm3!(op, call);
+    transmute(r)
+}
+
+/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmplt_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + simd_bitmask::<__m512i, _>(simd_lt(a.as_i64x8(), b.as_i64x8())) +} + +/// Compare packed signed 64-bit integers in a and b for less-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmplt_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmplt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmplt_epi64_mask(a, b) & m +} + +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpgt_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + simd_bitmask::<__m512i, _>(simd_gt(a.as_i64x8(), b.as_i64x8())) +} + +/// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpgt_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmpgt_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmpgt_epi64_mask(a, b) & m +} + +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmple_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + simd_bitmask::<__m512i, _>(simd_le(a.as_i64x8(), b.as_i64x8())) +} + +/// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in a mask vector k +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmple_epi64) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpcmp))] +pub unsafe fn _mm512_mask_cmple_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + _mm512_cmple_epi64_mask(a, b) & m +} + +/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpge_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+    simd_bitmask::<__m512i, _>(simd_ge(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpge_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpge_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+    _mm512_cmpge_epi64_mask(a, b) & m
+}
+
+/// Compare packed signed 64-bit integers in a and b for equality, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpeq_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+    simd_bitmask::<__m512i, _>(simd_eq(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for equality, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpeq_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpeq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+    _mm512_cmpeq_epi64_mask(a, b) & m
+}
+
+/// Compare packed signed 64-bit integers in a and b for inequality, and store the results in a mask vector.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062&text=_mm512_cmpneq_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+    simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8()))
+}
+
+/// Compare packed signed 64-bit integers in a and b for inequality, and store the results in a mask vector k
+/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmpneq_epi64)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpcmp))]
+pub unsafe fn _mm512_mask_cmpneq_epi64_mask(m: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+    _mm512_cmpneq_epi64_mask(a, b) & m
+}
+
+/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by op.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(2)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_cmp_epi64_mask(a: __m512i, b: __m512i, op: _MM_CMPINT_ENUM) -> __mmask8 { + let neg_one = -1; + macro_rules! call { + ($imm3:expr) => { + vpcmpq(a.as_i64x8(), b.as_i64x8(), $imm3, neg_one) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + +/// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by op, +/// using zeromask m (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,1063&text=_mm512_mask_cmp_epi64_mask) +#[inline] +#[target_feature(enable = "avx512f")] +#[rustc_args_required_const(3)] +#[cfg_attr(test, assert_instr(vpcmp, op = 0))] +pub unsafe fn _mm512_mask_cmp_epi64_mask( + m: __mmask8, + a: __m512i, + b: __m512i, + op: _MM_CMPINT_ENUM, +) -> __mmask8 { + macro_rules! call { + ($imm3:expr) => { + vpcmpq(a.as_i64x8(), b.as_i64x8(), $imm3, m as i8) + }; + } + let r = constify_imm3!(op, call); + transmute(r) +} + +/// Returns vector of type `__m512d` with undefined elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_pd) +#[inline] +#[target_feature(enable = "avx512f")] +// This intrinsic has no corresponding instruction. +pub unsafe fn _mm512_undefined_pd() -> __m512d { + _mm512_set1_pd(0.0) +} + +/// Returns vector of type `__m512` with undefined elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ps) +#[inline] +#[target_feature(enable = "avx512f")] +// This intrinsic has no corresponding instruction. +pub unsafe fn _mm512_undefined_ps() -> __m512 { + _mm512_set1_ps(0.0) +} + +/// Loads 512-bits (composed of 8 packed double-precision (64-bit) +/// floating-point elements) from memory into result. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovups))] +pub unsafe fn _mm512_loadu_pd(mem_addr: *const f64) -> __m512d { + ptr::read_unaligned(mem_addr as *const __m512d) +} + +/// Stores 512-bits (composed of 8 packed double-precision (64-bit) +/// floating-point elements) from `a` into memory. +/// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_pd) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovups))] +pub unsafe fn _mm512_storeu_pd(mem_addr: *mut f64, a: __m512d) { + ptr::write_unaligned(mem_addr as *mut __m512d, a); +} + +/// Loads 512-bits (composed of 16 packed single-precision (32-bit) +/// floating-point elements) from memory into result. +/// `mem_addr` does not need to be aligned on any particular boundary. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))]
+pub unsafe fn _mm512_loadu_ps(mem_addr: *const f32) -> __m512 {
+    ptr::read_unaligned(mem_addr as *const __m512)
+}
+
+/// Stores 512-bits (composed of 16 packed single-precision (32-bit)
+/// floating-point elements) from `a` into memory.
+/// `mem_addr` does not need to be aligned on any particular boundary.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ps)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovups))]
+#[stable(feature = "simd_x86", since = "1.27.0")]
+pub unsafe fn _mm512_storeu_ps(mem_addr: *mut f32, a: __m512) {
+    ptr::write_unaligned(mem_addr as *mut __m512, a);
+}
+
+/// Sets packed double-precision (64-bit) floating-point elements in `dst` with the supplied values in
+/// reverse order.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_setr_pd(
+    e0: f64,
+    e1: f64,
+    e2: f64,
+    e3: f64,
+    e4: f64,
+    e5: f64,
+    e6: f64,
+    e7: f64,
+) -> __m512d {
+    let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7);
+    transmute(r)
+}
+
+/// Sets packed double-precision (64-bit) floating-point elements in `dst` with the supplied values.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_pd)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_set_pd(
+    e0: f64,
+    e1: f64,
     e2: f64,
     e3: f64,
     e4: f64,
@@ -9329,6 +10946,15 @@ extern "C" {
     #[link_name = "llvm.x86.avx512.mask.cvtps2pd.512"]
     fn vcvtps2pd(a: f32x8, src: f64x8, mask: u8, sae: i32) -> f64x8;
+    #[link_name = "llvm.x86.avx512.mask.cvttps2dq.512"]
+    fn vcvttps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16;
+    #[link_name = "llvm.x86.avx512.mask.cvttps2udq.512"]
+    fn vcvttps2udq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> u32x16;
+    #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.512"]
+    fn vcvttpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8;
+    #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.512"]
+    fn vcvttpd2udq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> u32x8;
+
     #[link_name = "llvm.x86.avx512.gather.dpd.512"]
     fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8;
     #[link_name = "llvm.x86.avx512.gather.dps.512"]
@@ -9440,6 +11066,29 @@ extern "C" {
     #[link_name = "llvm.x86.avx512.psrav.q.512"]
     fn vpsravq(a: i64x8, count: i64x8) -> i64x8;
+    #[link_name = "llvm.x86.avx512.vpermilvar.ps.512"]
+    fn vpermilps(a: f32x16, b: i32x16) -> f32x16;
+    #[link_name = "llvm.x86.avx512.vpermilvar.pd.512"]
+    fn vpermilpd(a: f64x8, b: i64x8) -> f64x8;
+
+    #[link_name = "llvm.x86.avx512.permvar.si.512"]
+    fn vpermd(a: i32x16, idx: i32x16) -> i32x16;
+    #[link_name = "llvm.x86.avx512.permvar.di.512"]
+    fn vpermq(a: i64x8, idx: i64x8) -> i64x8;
+    #[link_name = "llvm.x86.avx512.permvar.sf.512"]
+    fn vpermps(a: f32x16, idx: i32x16) -> f32x16;
+    #[link_name = "llvm.x86.avx512.permvar.df.512"]
+    fn vpermpd(a: f64x8, idx: i64x8) -> f64x8;
+
+    #[link_name = "llvm.x86.avx512.vpermi2var.d.512"]
+    fn vpermi2d(a: i32x16, idx: i32x16, b: i32x16) -> i32x16;
+    #[link_name = "llvm.x86.avx512.vpermi2var.q.512"]
+    fn vpermi2q(a: i64x8, idx: i64x8, b: i64x8) ->
i64x8; + #[link_name = "llvm.x86.avx512.vpermi2var.ps.512"] + fn vpermi2ps(a: f32x16, idx: i32x16, b: f32x16) -> f32x16; + #[link_name = "llvm.x86.avx512.vpermi2var.pd.512"] + fn vpermi2pd(a: f64x8, idx: i64x8, b: f64x8) -> f64x8; + #[link_name = "llvm.x86.avx512.kand.w"] fn kandw(ma: u16, mb: u16) -> u16; #[link_name = "llvm.x86.avx512.kor.w"] @@ -12742,140 +14391,280 @@ mod tests { let a = _mm512_setr_ps( 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvtps_epi32(0, a); + let r = _mm512_maskz_cvtps_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtps_epi32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtps_epu32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvtps_epu32(a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtps_epu32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvtps_epu32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtps_epu32(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtps_epu32() { + let a = _mm512_setr_ps( + 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvtps_epu32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvtps_epu32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + let r = _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = + _mm512_mask_cvt_roundps_epi32(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvt_roundps_epi32( + src, + 0b00000000_11111111, + a, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvt_roundps_epi32(0, a, _MM_FROUND_TO_NEAREST_INT | 
_MM_FROUND_NO_EXC); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvt_roundps_epi32( + 0b00000000_11111111, + a, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvt_roundps_epu32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); + assert_eq_m512i(r, e); + let r = _mm512_cvt_roundps_epu32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = + _mm512_mask_cvt_roundps_epu32(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvt_roundps_epu32( + src, + 0b00000000_11111111, + a, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvt_roundps_epu32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvt_roundps_epu32(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_cvt_roundps_epu32( + 0b00000000_11111111, + a, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_cvtt_roundps_epi32(a, _MM_FROUND_NO_EXC); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let src = _mm512_set1_epi32(0); + let r = _mm512_mask_cvtt_roundps_epi32(src, 0, a, _MM_FROUND_NO_EXC); + assert_eq_m512i(r, src); + let r = _mm512_mask_cvtt_roundps_epi32(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtt_roundps_epi32() { + let a = _mm512_setr_ps( + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + ); + let r = _mm512_maskz_cvtt_roundps_epi32(0, a, _MM_FROUND_NO_EXC); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtps_epi32(0b00000000_11111111, a); - let e = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm512_maskz_cvtt_roundps_epi32(0b00000000_11111111, a, _MM_FROUND_NO_EXC); + let e = 
_mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvtps_epu32() { + unsafe fn test_mm512_cvtt_roundps_epu32() { let a = _mm512_setr_ps( - 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvtps_epu32(a); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); + let r = _mm512_cvtt_roundps_epu32(a, _MM_FROUND_NO_EXC); + let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvtps_epu32() { + unsafe fn test_mm512_mask_cvtt_roundps_epu32() { let a = _mm512_setr_ps( - 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); let src = _mm512_set1_epi32(0); - let r = _mm512_mask_cvtps_epu32(src, 0, a); + let r = _mm512_mask_cvtt_roundps_epu32(src, 0, a, _MM_FROUND_NO_EXC); assert_eq_m512i(r, src); - let r = _mm512_mask_cvtps_epu32(src, 0b00000000_11111111, a); + let r = _mm512_mask_cvtt_roundps_epu32(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvtps_epu32() { + unsafe fn test_mm512_maskz_cvtt_roundps_epu32() { let a = _mm512_setr_ps( - 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, + 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvtps_epu32(0, a); + let r = _mm512_maskz_cvtt_roundps_epu32(0, a, _MM_FROUND_NO_EXC); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvtps_epu32(0b00000000_11111111, a); + let r = _mm512_maskz_cvtt_roundps_epu32(0b00000000_11111111, a, _MM_FROUND_NO_EXC); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundps_epi32() { + unsafe fn test_mm512_cvttps_epi32() { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m512i(r, e); - let r = _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_cvttps_epi32(a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundps_epi32() { + unsafe fn test_mm512_mask_cvttps_epi32() { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); let src = _mm512_set1_epi32(0); - let r = - _mm512_mask_cvt_roundps_epi32(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvttps_epi32(src, 0, a); assert_eq_m512i(r, src); - let r = _mm512_mask_cvt_roundps_epi32( - src, - 0b00000000_11111111, - a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, - ); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 
0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm512_mask_cvttps_epi32(src, 0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundps_epi32() { + unsafe fn test_mm512_maskz_cvttps_epi32() { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvt_roundps_epi32(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvttps_epi32(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvt_roundps_epi32( - 0b00000000_11111111, - a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, - ); - let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm512_maskz_cvttps_epi32(0b00000000_11111111, a); + let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_cvt_roundps_epu32() { + unsafe fn test_mm512_cvttps_epu32() { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_cvt_roundps_epu32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16); - assert_eq_m512i(r, e); - let r = _mm512_cvt_roundps_epu32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + let r = _mm512_cvttps_epu32(a); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_cvt_roundps_epu32() { + unsafe fn test_mm512_mask_cvttps_epu32() { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); let src = _mm512_set1_epi32(0); - let r = - _mm512_mask_cvt_roundps_epu32(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_mask_cvttps_epu32(src, 0, a); assert_eq_m512i(r, src); - let r = _mm512_mask_cvt_roundps_epu32( - src, - 0b00000000_11111111, - a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, - ); + let r = _mm512_mask_cvttps_epu32(src, 0b00000000_11111111, a); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_cvt_roundps_epu32() { + unsafe fn test_mm512_maskz_cvttps_epu32() { let a = _mm512_setr_ps( 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5, ); - let r = _mm512_maskz_cvt_roundps_epu32(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let r = _mm512_maskz_cvttps_epu32(0, a); assert_eq_m512i(r, _mm512_setzero_si512()); - let r = _mm512_maskz_cvt_roundps_epu32( - 0b00000000_11111111, - a, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, - ); + let r = _mm512_maskz_cvttps_epu32(0b00000000_11111111, a); let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0); assert_eq_m512i(r, e); } @@ -14196,134 +15985,621 @@ mod tests { let r = _mm512_mask_srl_epi32(a, 0, a, count); assert_eq_m512i(r, a); - let r = _mm512_mask_srl_epi32(a, 0b11111111_11111111, a, count); - let e = _mm512_set_epi32(1 << 29, 0, 0, 1 << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm512_mask_srl_epi32(a, 0b11111111_11111111, a, count); + let e = _mm512_set_epi32(1 << 29, 0, 0, 1 << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + 
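// Sketch: round-to-nearest-even vs. truncation on one vector; an illustrative
+    // extra check using only intrinsics introduced above, with a hypothetical
+    // test name and hand-picked half-way input values.
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_cvt_vs_cvtt_sketch() {
+        let a = _mm512_setr_ps(
+            -1.5, -1.5, -1.5, -1.5, -1.5, -1.5, -1.5, -1.5, 9.5, 9.5, 9.5, 9.5, 9.5, 9.5, 9.5,
+            9.5,
+        );
+        // Rounding to nearest-even sends -1.5 to -2 and 9.5 to 10 ...
+        let r = _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+        let e = _mm512_setr_epi32(-2, -2, -2, -2, -2, -2, -2, -2, 10, 10, 10, 10, 10, 10, 10, 10);
+        assert_eq_m512i(r, e);
+        // ... while truncation simply drops the fraction: -1.5 to -1, 9.5 to 9.
+        let r = _mm512_cvttps_epi32(a);
+        let e = _mm512_setr_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 9, 9, 9, 9, 9, 9, 9, 9);
+        assert_eq_m512i(r, e);
+    }
+
+    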
#[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srl_epi32() { + let a = _mm512_set_epi32( + 1 << 31, + 1 << 0, + 1 << 1, + 1 << 2, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1 << 31, + ); + let count = _mm_set_epi32(2, 0, 0, 2); + let r = _mm512_maskz_srl_epi32(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + + let r = _mm512_maskz_srl_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 29); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_sra_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + let count = _mm_set_epi32(1, 0, 0, 2); + let r = _mm512_sra_epi32(a, count); + let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_sra_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16); + let count = _mm_set_epi32(0, 0, 0, 2); + let r = _mm512_mask_sra_epi32(a, 0, a, count); + assert_eq_m512i(r, a); + + let r = _mm512_mask_sra_epi32(a, 0b11111111_11111111, a, count); + let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_sra_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14); + let count = _mm_set_epi32(2, 0, 0, 2); + let r = _mm512_maskz_sra_epi32(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + + let r = _mm512_maskz_sra_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srav_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + let r = _mm512_srav_epi32(a, count); + let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srav_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16); + let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + let r = _mm512_mask_srav_epi32(a, 0, a, count); + assert_eq_m512i(r, a); + + let r = _mm512_mask_srav_epi32(a, 0b11111111_11111111, a, count); + let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srav_epi32() { + let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14); + let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2); + let r = _mm512_maskz_srav_epi32(0, a, count); + assert_eq_m512i(r, _mm512_setzero_si512()); + + let r = _mm512_maskz_srav_epi32(0b00000000_11111111, a, count); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_srai_epi32() { + let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, -15); + let r = _mm512_srai_epi32(a, 2); + let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_srai_epi32() 
{ + let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15); + let r = _mm512_mask_srai_epi32(a, 0, a, 2); + assert_eq_m512i(r, a); + + let r = _mm512_mask_srai_epi32(a, 0b11111111_11111111, a, 2); + let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_srai_epi32() { + let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15); + let r = _mm512_maskz_srai_epi32(0, a, 2); + assert_eq_m512i(r, _mm512_setzero_si512()); + + let r = _mm512_maskz_srai_epi32(0b00000000_11111111, a, 2); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permute_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_permute_ps(a, 1); + let e = _mm512_set_ps( + 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permute_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_mask_permute_ps(a, 0b00000000_00000000, a, 1); + assert_eq_m512(r, a); + let r = _mm512_mask_permute_ps(a, 0b11111111_11111111, a, 1); + let e = _mm512_set_ps( + 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permute_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_maskz_permute_ps(0, a, 1); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_permute_ps(0b00000000_11111111, a, 1); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 10., 10., 10., 10., 14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutevar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_permutevar_epi32(idx, a); + let e = _mm512_set1_epi32(14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutevar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_permutevar_epi32(a, 0, idx, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutevar_epi32(a, 0b11111111_11111111, idx, a); + let e = _mm512_set1_epi32(14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutevar_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_permutevar_ps(a, b); + let e = _mm512_set_ps( + 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutevar_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_mask_permutevar_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_permutevar_ps(a, 0b11111111_11111111, a, b); + let e = _mm512_set_ps( + 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 
14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutevar_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let b = _mm512_set1_epi32(1); + let r = _mm512_maskz_permutevar_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_permutevar_ps(0b00000000_11111111, a, b); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 10., 10., 10., 10., 14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutexvar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_permutexvar_epi32(idx, a); + let e = _mm512_set1_epi32(14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutexvar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_mask_permutexvar_epi32(a, 0, idx, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutexvar_epi32(a, 0b11111111_11111111, idx, a); + let e = _mm512_set1_epi32(14); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutexvar_epi32() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let r = _mm512_maskz_permutexvar_epi32(0, idx, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutexvar_epi32(0b00000000_11111111, idx, a); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_srl_epi32() { - let a = _mm512_set_epi32( - 1 << 31, - 1 << 0, - 1 << 1, - 1 << 2, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1 << 31, + unsafe fn test_mm512_permutexvar_ps() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., ); - let count = _mm_set_epi32(2, 0, 0, 2); - let r = _mm512_maskz_srl_epi32(0, a, count); - assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_permutexvar_ps(idx, a); + let e = _mm512_set1_ps(14.); + assert_eq_m512(r, e); + } - let r = _mm512_maskz_srl_epi32(0b00000000_11111111, a, count); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 29); - assert_eq_m512i(r, e); + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutexvar_ps() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_mask_permutexvar_ps(a, 0, idx, a); + assert_eq_m512(r, a); + let r = _mm512_mask_permutexvar_ps(a, 0b11111111_11111111, idx, a); + let e = _mm512_set1_ps(14.); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_sra_epi32() { - let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); - let count = _mm_set_epi32(1, 0, 0, 2); - let r = _mm512_sra_epi32(a, count); - let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + unsafe fn test_mm512_maskz_permutexvar_ps() { + let idx = _mm512_set1_epi32(1); + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let r = _mm512_maskz_permutexvar_ps(0, idx, a); + assert_eq_m512(r, 
_mm512_setzero_ps()); + let r = _mm512_maskz_permutexvar_ps(0b00000000_11111111, idx, a); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 14., 14., 14., 14., 14., 14., 14., 14., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutex2var_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let idx = _mm512_set_epi32( + 1, + 1 << 4, + 2, + 1 << 4, + 3, + 1 << 4, + 4, + 1 << 4, + 5, + 1 << 4, + 6, + 1 << 4, + 7, + 1 << 4, + 8, + 1 << 4, + ); + let b = _mm512_set1_epi32(100); + let r = _mm512_permutex2var_epi32(a, idx, b); + let e = _mm512_set_epi32( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_sra_epi32() { - let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16); - let count = _mm_set_epi32(0, 0, 0, 2); - let r = _mm512_mask_sra_epi32(a, 0, a, count); + unsafe fn test_mm512_mask_permutex2var_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let idx = _mm512_set_epi32( + 1, + 1 << 4, + 2, + 1 << 4, + 3, + 1 << 4, + 4, + 1 << 4, + 5, + 1 << 4, + 6, + 1 << 4, + 7, + 1 << 4, + 8, + 1 << 4, + ); + let b = _mm512_set1_epi32(100); + let r = _mm512_mask_permutex2var_epi32(a, 0, idx, b); assert_eq_m512i(r, a); - - let r = _mm512_mask_sra_epi32(a, 0b11111111_11111111, a, count); - let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4); + let r = _mm512_mask_permutex2var_epi32(a, 0b11111111_11111111, idx, b); + let e = _mm512_set_epi32( + 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100, + ); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_sra_epi32() { - let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14); - let count = _mm_set_epi32(2, 0, 0, 2); - let r = _mm512_maskz_sra_epi32(0, a, count); + unsafe fn test_mm512_maskz_permutex2var_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let idx = _mm512_set_epi32( + 1, + 1 << 4, + 2, + 1 << 4, + 3, + 1 << 4, + 4, + 1 << 4, + 5, + 1 << 4, + 6, + 1 << 4, + 7, + 1 << 4, + 8, + 1 << 4, + ); + let b = _mm512_set1_epi32(100); + let r = _mm512_maskz_permutex2var_epi32(0, a, idx, b); assert_eq_m512i(r, _mm512_setzero_si512()); - - let r = _mm512_maskz_sra_epi32(0b00000000_11111111, a, count); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4); + let r = _mm512_maskz_permutex2var_epi32(0b00000000_11111111, a, idx, b); + let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 10, 100, 9, 100, 8, 100, 7, 100); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_srav_epi32() { - let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); - let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - let r = _mm512_srav_epi32(a, count); - let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); + unsafe fn test_mm512_mask2_permutex2var_epi32() { + let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let idx = _mm512_set_epi32( + 1000, + 1 << 4, + 2000, + 1 << 4, + 3000, + 1 << 4, + 4000, + 1 << 4, + 5, + 1 << 4, + 6, + 1 << 4, + 7, + 1 << 4, + 8, + 1 << 4, + ); + let b = _mm512_set1_epi32(100); + let r = _mm512_mask2_permutex2var_epi32(a, idx, 0, b); + assert_eq_m512i(r, idx); + let r = 
_mm512_mask2_permutex2var_epi32(a, idx, 0b00000000_11111111, b); + let e = _mm512_set_epi32( + 1000, + 1 << 4, + 2000, + 1 << 4, + 3000, + 1 << 4, + 4000, + 1 << 4, + 10, + 100, + 9, + 100, + 8, + 100, + 7, + 100, + ); assert_eq_m512i(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_srav_epi32() { - let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16); - let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1); - let r = _mm512_mask_srav_epi32(a, 0, a, count); - assert_eq_m512i(r, a); - - let r = _mm512_mask_srav_epi32(a, 0b11111111_11111111, a, count); - let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8); - assert_eq_m512i(r, e); + unsafe fn test_mm512_permutex2var_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let idx = _mm512_set_epi32( + 1, + 1 << 4, + 2, + 1 << 4, + 3, + 1 << 4, + 4, + 1 << 4, + 5, + 1 << 4, + 6, + 1 << 4, + 7, + 1 << 4, + 8, + 1 << 4, + ); + let b = _mm512_set1_ps(100.); + let r = _mm512_permutex2var_ps(a, idx, b); + let e = _mm512_set_ps( + 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_maskz_srav_epi32() { - let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14); - let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2); - let r = _mm512_maskz_srav_epi32(0, a, count); - assert_eq_m512i(r, _mm512_setzero_si512()); - - let r = _mm512_maskz_srav_epi32(0b00000000_11111111, a, count); - let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4); - assert_eq_m512i(r, e); + unsafe fn test_mm512_mask_permutex2var_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let idx = _mm512_set_epi32( + 1, + 1 << 4, + 2, + 1 << 4, + 3, + 1 << 4, + 4, + 1 << 4, + 5, + 1 << 4, + 6, + 1 << 4, + 7, + 1 << 4, + 8, + 1 << 4, + ); + let b = _mm512_set1_ps(100.); + let r = _mm512_mask_permutex2var_ps(a, 0, idx, b); + assert_eq_m512(r, a); + let r = _mm512_mask_permutex2var_ps(a, 0b11111111_11111111, idx, b); + let e = _mm512_set_ps( + 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_srai_epi32() { - let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, -15); - let r = _mm512_srai_epi32(a, 2); - let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4); - assert_eq_m512i(r, e); + unsafe fn test_mm512_maskz_permutex2var_ps() { + let a = _mm512_set_ps( + 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., + ); + let idx = _mm512_set_epi32( + 1, + 1 << 4, + 2, + 1 << 4, + 3, + 1 << 4, + 4, + 1 << 4, + 5, + 1 << 4, + 6, + 1 << 4, + 7, + 1 << 4, + 8, + 1 << 4, + ); + let b = _mm512_set1_ps(100.); + let r = _mm512_maskz_permutex2var_ps(0, a, idx, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_permutex2var_ps(0b00000000_11111111, a, idx, b); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 10., 100., 9., 100., 8., 100., 7., 100., + ); + assert_eq_m512(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_srai_epi32() { - let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15); - let r = _mm512_mask_srai_epi32(a, 0, a, 2); - assert_eq_m512i(r, a); + unsafe fn 
test_mm512_mask2_permutex2var_ps() {
+        let a = _mm512_set_ps(
+            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
+        );
+        let idx = _mm512_set_epi32(
+            1,
+            1 << 4,
+            2,
+            1 << 4,
+            3,
+            1 << 4,
+            4,
+            1 << 4,
+            5,
+            1 << 4,
+            6,
+            1 << 4,
+            7,
+            1 << 4,
+            8,
+            1 << 4,
+        );
+        let b = _mm512_set1_ps(100.);
+        // A zero writemask leaves the bits of `idx` in every lane, so the result
+        // is `idx` reinterpreted as floats rather than zero.
+        let r = _mm512_mask2_permutex2var_ps(a, idx, 0, b);
+        assert_eq_m512(r, _mm512_castsi512_ps(idx));
+        let r = _mm512_mask2_permutex2var_ps(a, idx, 0b11111111_11111111, b);
+        let e = _mm512_set_ps(
+            14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100.,
+        );
+        assert_eq_m512(r, e);
+    }
-        let r = _mm512_mask_srai_epi32(a, 0b11111111_11111111, a, 2);
-        let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4);
-        assert_eq_m512i(r, e);
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_shuffle_ps() {
+        let a = _mm512_setr_ps(
+            1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
+        );
+        let b = _mm512_setr_ps(
+            2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
+        );
+        let r = _mm512_shuffle_ps(a, b, 0x0F);
+        let e = _mm512_setr_ps(
+            8., 8., 2., 2., 16., 16., 10., 10., 8., 8., 2., 2., 16., 16., 10., 10.,
+        );
+        assert_eq_m512(r, e);
     }
 
     #[simd_test(enable = "avx512f")]
-    unsafe fn test_mm512_maskz_srai_epi32() {
-        let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15);
-        let r = _mm512_maskz_srai_epi32(0, a, 2);
-        assert_eq_m512i(r, _mm512_setzero_si512());
+    unsafe fn test_mm512_mask_shuffle_ps() {
+        let a = _mm512_setr_ps(
+            1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
+        );
+        let b = _mm512_setr_ps(
+            2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
+        );
+        let r = _mm512_mask_shuffle_ps(a, 0, a, b, 0x0F);
+        assert_eq_m512(r, a);
+        let r = _mm512_mask_shuffle_ps(a, 0b11111111_11111111, a, b, 0x0F);
+        let e = _mm512_setr_ps(
+            8., 8., 2., 2., 16., 16., 10., 10., 8., 8., 2., 2., 16., 16., 10., 10.,
+        );
+        assert_eq_m512(r, e);
+    }
-        let r = _mm512_maskz_srai_epi32(0b00000000_11111111, a, 2);
-        let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4);
-        assert_eq_m512i(r, e);
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_maskz_shuffle_ps() {
+        let a = _mm512_setr_ps(
+            1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
+        );
+        let b = _mm512_setr_ps(
+            2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
+        );
+        let r = _mm512_maskz_shuffle_ps(0, a, b, 0x0F);
+        assert_eq_m512(r, _mm512_setzero_ps());
+        let r = _mm512_maskz_shuffle_ps(0b00000000_11111111, a, b, 0x0F);
+        let e = _mm512_setr_ps(
+            8., 8., 2., 2., 16., 16., 10., 10., 0., 0., 0., 0., 0., 0., 0., 0.,
+        );
+        assert_eq_m512(r, e);
     }
 
     #[simd_test(enable = "avx512f")]
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index 1882ad671213..d3993a95d7e7 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -1089,6 +1089,122 @@ mod tests {
         assert_eq_m512d(r, e);
     }
 
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_cvtt_roundpd_epi32() {
+        let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+        let r = _mm512_cvtt_roundpd_epi32(a, _MM_FROUND_NO_EXC);
+        let e = _mm256_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_cvtt_roundpd_epi32() {
+        let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5);
+        let src = 
_mm256_set1_epi32(0); + let r = _mm512_mask_cvtt_roundpd_epi32(src, 0, a, _MM_FROUND_NO_EXC); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtt_roundpd_epi32(src, 0b00001111, a, _MM_FROUND_NO_EXC); + let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtt_roundpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvtt_roundpd_epi32(0, a, _MM_FROUND_NO_EXC); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtt_roundpd_epi32(0b00001111, a, _MM_FROUND_NO_EXC); + let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvtt_roundpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvtt_roundpd_epu32(a, _MM_FROUND_NO_EXC); + let e = _mm256_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvtt_roundpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm256_set1_epi32(0); + let r = _mm512_mask_cvtt_roundpd_epu32(src, 0, a, _MM_FROUND_NO_EXC); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvtt_roundpd_epu32(src, 0b00001111, a, _MM_FROUND_NO_EXC); + let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvtt_roundpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvtt_roundpd_epu32(0, a, _MM_FROUND_NO_EXC); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvtt_roundpd_epu32(0b00001111, a, _MM_FROUND_NO_EXC); + let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvttpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvttpd_epi32(a); + let e = _mm256_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvttpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm256_set1_epi32(0); + let r = _mm512_mask_cvttpd_epi32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvttpd_epi32(src, 0b00001111, a); + let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvttpd_epi32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvttpd_epi32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvttpd_epi32(0b00001111, a); + let e = _mm256_setr_epi32(0, -1, 2, -3, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_cvttpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_cvttpd_epu32(a); + let e = _mm256_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_cvttpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let src = _mm256_set1_epi32(0); + let r = _mm512_mask_cvttpd_epu32(src, 0, a); + assert_eq_m256i(r, src); + let r = _mm512_mask_cvttpd_epu32(src, 0b00001111, a); + let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); 
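+        // Negative inputs are out of range for the unsigned conversion, so those
+        // lanes saturate to 0xFFFF_FFFF, which `_mm256_setr_epi32` spells as -1.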
+ assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_cvttpd_epu32() { + let a = _mm512_setr_pd(0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5); + let r = _mm512_maskz_cvttpd_epu32(0, a); + assert_eq_m256i(r, _mm256_setzero_si256()); + let r = _mm512_maskz_cvttpd_epu32(0b00001111, a); + let e = _mm256_setr_epi32(0, -1, 2, -1, 0, 0, 0, 0); + assert_eq_m256i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_add_round_pd() { let a = _mm512_setr_pd(8., 9.5, 10., 11.5, 12., 13.5, 14., 0.000000000000000007); @@ -3746,6 +3862,332 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permute_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_permute_pd(a, 1); + let e = _mm512_set_pd(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permute_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_mask_permute_pd(a, 0, a, 1); + assert_eq_m512d(r, a); + let r = _mm512_mask_permute_pd(a, 0b11111111, a, 1); + let e = _mm512_set_pd(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permute_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_maskz_permute_pd(0, a, 1); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_permute_pd(0b00001111, a, 1); + let e = _mm512_set_pd(0., 0., 0., 0., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutex_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_permutex_epi64(a, 1); + let e = _mm512_set_epi64(6, 6, 6, 6, 6, 6, 6, 6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutex_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_mask_permutex_epi64(a, 0, a, 1); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutex_epi64(a, 0b11111111, a, 1); + let e = _mm512_set_epi64(6, 6, 6, 6, 6, 6, 6, 6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutex_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_maskz_permutex_epi64(0, a, 1); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutex_epi64(0b00001111, a, 1); + let e = _mm512_set_epi64(0, 0, 0, 0, 6, 6, 6, 6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutex_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_permutex_pd(a, 1); + let e = _mm512_set_pd(6., 6., 6., 6., 6., 6., 6., 6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutex_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_mask_permutex_pd(a, 0, a, 1); + assert_eq_m512d(r, a); + let r = _mm512_mask_permutex_pd(a, 0b11111111, a, 1); + let e = _mm512_set_pd(6., 6., 6., 6., 6., 6., 6., 6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutex_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_maskz_permutex_pd(0, a, 1); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_permutex_pd(0b00001111, a, 1); + let e = _mm512_set_pd(0., 0., 0., 0., 6., 6., 6., 6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = 
"avx512f")] + unsafe fn test_mm512_permutevar_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_set1_epi64(1); + let r = _mm512_permutevar_pd(a, b); + let e = _mm512_set_pd(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutevar_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_set1_epi64(1); + let r = _mm512_mask_permutevar_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_permutevar_pd(a, 0b11111111, a, b); + let e = _mm512_set_pd(1., 1., 3., 3., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutevar_pd() { + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let b = _mm512_set1_epi64(1); + let r = _mm512_maskz_permutevar_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_permutevar_pd(0b00001111, a, b); + let e = _mm512_set_pd(0., 0., 0., 0., 5., 5., 7., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutexvar_epi64() { + let idx = _mm512_set1_epi64(1); + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_permutexvar_epi64(idx, a); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutexvar_epi64() { + let idx = _mm512_set1_epi64(1); + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_mask_permutexvar_epi64(a, 0, idx, a); + assert_eq_m512i(r, a); + let r = _mm512_mask_permutexvar_epi64(a, 0b11111111, idx, a); + let e = _mm512_set1_epi64(6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutexvar_epi64() { + let idx = _mm512_set1_epi64(1); + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let r = _mm512_maskz_permutexvar_epi64(0, idx, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_permutexvar_epi64(0b00001111, idx, a); + let e = _mm512_set_epi64(0, 0, 0, 0, 6, 6, 6, 6); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutexvar_pd() { + let idx = _mm512_set1_epi64(1); + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_permutexvar_pd(idx, a); + let e = _mm512_set1_pd(6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_permutexvar_pd() { + let idx = _mm512_set1_epi64(1); + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_mask_permutexvar_pd(a, 0, idx, a); + assert_eq_m512d(r, a); + let r = _mm512_mask_permutexvar_pd(a, 0b11111111, idx, a); + let e = _mm512_set1_pd(6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_permutexvar_pd() { + let idx = _mm512_set1_epi64(1); + let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); + let r = _mm512_maskz_permutexvar_pd(0, idx, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_permutexvar_pd(0b00001111, idx, a); + let e = _mm512_set_pd(0., 0., 0., 0., 6., 6., 6., 6.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_permutex2var_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3); + let b = _mm512_set1_epi64(100); + let r = _mm512_permutex2var_epi64(a, idx, b); + let e = _mm512_set_epi64(6, 100, 5, 100, 4, 100, 3, 100); + 
assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_permutex2var_epi64() {
+        let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+        let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+        let b = _mm512_set1_epi64(100);
+        let r = _mm512_mask_permutex2var_epi64(a, 0, idx, b);
+        assert_eq_m512i(r, a);
+        let r = _mm512_mask_permutex2var_epi64(a, 0b11111111, idx, b);
+        let e = _mm512_set_epi64(6, 100, 5, 100, 4, 100, 3, 100);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_maskz_permutex2var_epi64() {
+        let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+        let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+        let b = _mm512_set1_epi64(100);
+        let r = _mm512_maskz_permutex2var_epi64(0, a, idx, b);
+        assert_eq_m512i(r, _mm512_setzero_si512());
+        let r = _mm512_maskz_permutex2var_epi64(0b00001111, a, idx, b);
+        let e = _mm512_set_epi64(0, 0, 0, 0, 4, 100, 3, 100);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask2_permutex2var_epi64() {
+        let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+        let idx = _mm512_set_epi64(1000, 1 << 3, 2000, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+        let b = _mm512_set1_epi64(100);
+        let r = _mm512_mask2_permutex2var_epi64(a, idx, 0, b);
+        assert_eq_m512i(r, idx);
+        let r = _mm512_mask2_permutex2var_epi64(a, idx, 0b00001111, b);
+        let e = _mm512_set_epi64(1000, 1 << 3, 2000, 1 << 3, 4, 100, 3, 100);
+        assert_eq_m512i(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_permutex2var_pd() {
+        let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+        let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+        let b = _mm512_set1_pd(100.);
+        let r = _mm512_permutex2var_pd(a, idx, b);
+        let e = _mm512_set_pd(6., 100., 5., 100., 4., 100., 3., 100.);
+        assert_eq_m512d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_permutex2var_pd() {
+        let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+        let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+        let b = _mm512_set1_pd(100.);
+        let r = _mm512_mask_permutex2var_pd(a, 0, idx, b);
+        assert_eq_m512d(r, a);
+        let r = _mm512_mask_permutex2var_pd(a, 0b11111111, idx, b);
+        let e = _mm512_set_pd(6., 100., 5., 100., 4., 100., 3., 100.);
+        assert_eq_m512d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_maskz_permutex2var_pd() {
+        let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+        let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+        let b = _mm512_set1_pd(100.);
+        let r = _mm512_maskz_permutex2var_pd(0, a, idx, b);
+        assert_eq_m512d(r, _mm512_setzero_pd());
+        let r = _mm512_maskz_permutex2var_pd(0b00001111, a, idx, b);
+        let e = _mm512_set_pd(0., 0., 0., 0., 4., 100., 3., 100.);
+        assert_eq_m512d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask2_permutex2var_pd() {
+        let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
+        let idx = _mm512_set_epi64(1, 1 << 3, 2, 1 << 3, 3, 1 << 3, 4, 1 << 3);
+        let b = _mm512_set1_pd(100.);
+        // A zero writemask leaves the bits of `idx` in every lane, so the result
+        // is `idx` reinterpreted as doubles rather than zero.
+        let r = _mm512_mask2_permutex2var_pd(a, idx, 0, b);
+        assert_eq_m512d(r, _mm512_castsi512_pd(idx));
+        let r = _mm512_mask2_permutex2var_pd(a, idx, 0b11111111, b);
+        let e = _mm512_set_pd(6., 100., 5., 100., 4., 100., 3., 100.);
+        assert_eq_m512d(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_shuffle_pd() {
+        let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.);
+        let b = _mm512_setr_pd(2., 3., 6., 7., 
2., 3., 6., 7.); + let r = _mm512_shuffle_pd( + a, + b, + 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3 | 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7, + ); + let e = _mm512_setr_pd(4., 3., 8., 7., 4., 3., 8., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_shuffle_pd() { + let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.); + let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.); + let r = _mm512_mask_shuffle_pd( + a, + 0, + a, + b, + 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3 | 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7, + ); + assert_eq_m512d(r, a); + let r = _mm512_mask_shuffle_pd( + a, + 0b11111111, + a, + b, + 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3 | 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7, + ); + let e = _mm512_setr_pd(4., 3., 8., 7., 4., 3., 8., 7.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_shuffle_pd() { + let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.); + let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.); + let r = _mm512_maskz_shuffle_pd( + 0, + a, + b, + 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3 | 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7, + ); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_shuffle_pd( + 0b00001111, + a, + b, + 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3 | 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7, + ); + let e = _mm512_setr_pd(4., 3., 8., 7., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_and_epi64() { let a = _mm512_set_epi64(1 << 0 | 1 << 15, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 2 | 1 << 3);
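+        // `a` sets bits only in the outermost lanes: element 7 carries bits 0 and
+        // 15, element 0 carries bits 1, 2 and 3; the AND below can keep only the
+        // bits that are present in both operands.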