From c93ae0e5327ff298434495f47d03ed39051360bc Mon Sep 17 00:00:00 2001 From: Sparrow Li Date: Wed, 10 Nov 2021 23:19:59 +0800 Subject: [PATCH] Add remaining instructions (#1250) * add vmmla vusmmla vsm4e vsm3 vrax1 vxar vsha512 vbcax veor3 neon instructions * update runtime feature detect * correct tests * add `vrnd32x` `vrnd64x` * add MISSING.md --- crates/core_arch/MISSING.md | 194 ++++++ .../core_arch/src/aarch64/neon/generated.rs | 602 ++++++++++++++++-- crates/core_arch/src/aarch64/neon/mod.rs | 140 ++++ crates/core_arch/src/arm_shared/neon/mod.rs | 86 +++ crates/core_arch/src/lib.rs | 3 +- crates/std_detect/src/detect/arch/arm.rs | 2 + crates/stdarch-gen/neon.spec | 188 +++++- crates/stdarch-gen/src/main.rs | 56 +- crates/stdarch-verify/tests/arm.rs | 26 + 9 files changed, 1197 insertions(+), 100 deletions(-) create mode 100644 crates/core_arch/MISSING.md diff --git a/crates/core_arch/MISSING.md b/crates/core_arch/MISSING.md new file mode 100644 index 0000000000000..99eb794a55e58 --- /dev/null +++ b/crates/core_arch/MISSING.md @@ -0,0 +1,194 @@ +## The following neon instructions are currently not implemented in stdarch + +### Can be implemented next: + +`vcls_u16` + +`vcls_u32` + +`vcls_u8` + +`vclsq_u16` + +`vclsq_u32` + +`vclsq_u8` + +`vcreate_s16` + +`vcreate_u16` + +`vpaddq_s64` + +`vpaddq_u64` + +`vreinterpretq_p128_f32` + +`vreinterpretq_p128_f64` + +`vreinterpretq_p128_p16` + +`vreinterpretq_p128_p8` + +`vreinterpretq_p128_s16` + +`vreinterpretq_p128_s32` + +`vreinterpretq_p128_s64` + +`vreinterpretq_p128_s8` + +`vreinterpretq_p128_u16` + +`vreinterpretq_p128_u32` + +`vreinterpretq_p128_u64` + +`vreinterpretq_p128_u8` + +`vslid_n_s64` + +`vslid_n_u64` + +`vsrid_n_s64` + +`vsrid_n_u64` + +### Not implemented on arm: + +`vcadd_rot270_f32` + +`vcadd_rot90_f32` + +`vcaddq_rot270_f32` + +`vcaddq_rot90_f32` + +`vdot_s32` + +`vdot_u32` + +`vdotq_s32` + +`vdotq_u32` + +`vdot_lane_s32` + +`vdot_lane_u32` + +`vdotq_lane_s32` + +`vdotq_lane_u32` + +`vcmla_f32` + 
+`vcmla_lane_f32` + +`vcmla_laneq_f32` + +`vcmla_rot180_f32` + +`vcmla_rot180_lane_f32` + +`vcmla_rot180_laneq_f32` + +`vcmla_rot270_f32` + +`vcmla_rot270_lane_f32` + +`vcmla_rot270_laneq_f32` + +`vcmla_rot90_f32` + +`vcmla_rot90_lane_f32` + +`vcmla_rot90_laneq_f32` + +`vcmlaq_f32` + +`vcmlaq_lane_f32` + +`vcmlaq_laneq_f32` + +`vcmlaq_rot180_f32` + +`vcmlaq_rot180_lane_f32` + +`vcmlaq_rot180_laneq_f32` + +`vcmlaq_rot270_f32` + +`vcmlaq_rot270_lane_f32` + +`vcmlaq_rot270_laneq_f32` + +`vcmlaq_rot90_f32` + +`vcmlaq_rot90_lane_f32` + +`vcmlaq_rot90_laneq_f32` + +### Not implemented in LLVM: + +`vrnd32x_f64` + +`vrnd32xq_f64` + +`vrnd32z_f64` + +`vrnd32zq_f64` + +`vrnd64x_f64` + +`vrnd64xq_f64` + +`vrnd64z_f64` + +`vrnd64zq_f64` + +### LLVM Select errors may occur: + +`vsudot_lane_s32` + +`vsudot_laneq_s32` + +`vsudotq_lane_s32` + +`vsudotq_laneq_s32` + +`vusdot_lane_s32` + +`vusdot_laneq_s32` + +`vusdot_s32` + +`vusdotq_lane_s32` + +`vusdotq_laneq_s32` + +`vusdotq_s32` + +`vqshlu_n_s16` + +`vqshlu_n_s32` + +`vqshlu_n_s64` + +`vqshlu_n_s8` + +`vqshlub_n_s8` + +`vqshlud_n_s64` + +`vqshluh_n_s16` + +`vqshluq_n_s16` + +`vqshluq_n_s32` + +`vqshluq_n_s64` + +`vqshluq_n_s8` + +`vqshlus_n_s32` + diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 194695c11c550..bdf6158bb2c1e 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -11,66 +11,106 @@ use stdarch_test::assert_instr; /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3s.v16i8")] + fn veor3q_s8_(a: int8x16_t, b: int8x16_t, 
c: int8x16_t) -> int8x16_t; + } + veor3q_s8_(a, b, c) } /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3s.v8i16")] + fn veor3q_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t; + } + veor3q_s16_(a, b, c) } /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3s.v4i32")] + fn veor3q_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t; + } + veor3q_s32_(a, b, c) } /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3s.v2i64")] + fn veor3q_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t; + } + veor3q_s64_(a, b, c) } /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { - simd_xor(simd_xor(a, b), c) + 
#[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3u.v16i8")] + fn veor3q_u8_(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t; + } + veor3q_u8_(a, b, c) } /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3u.v8i16")] + fn veor3q_u16_(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t; + } + veor3q_u16_(a, b, c) } /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3u.v4i32")] + fn veor3q_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + veor3q_u32_(a, b, c) } /// Three-way exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(eor3))] pub unsafe fn veor3q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { - simd_xor(simd_xor(a, b), c) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.eor3u.v2i64")] + fn veor3q_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t; + } + veor3q_u64_(a, b, c) } /// Absolute difference between the arguments of Floating @@ -7770,66 +7810,106 @@ pub unsafe fn 
vsubl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t { /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t { - simd_xor(a, vbicq_s8(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxs.v16i8")] + fn vbcaxq_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t; + } + vbcaxq_s8_(a, b, c) } /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t { - simd_xor(a, vbicq_s16(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxs.v8i16")] + fn vbcaxq_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t; + } + vbcaxq_s16_(a, b, c) } /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t { - simd_xor(a, vbicq_s32(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxs.v4i32")] + fn vbcaxq_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t; + } + vbcaxq_s32_(a, b, c) } /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_s64(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t { - simd_xor(a, 
vbicq_s64(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxs.v2i64")] + fn vbcaxq_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t; + } + vbcaxq_s64_(a, b, c) } /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t { - simd_xor(a, vbicq_u8(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxu.v16i8")] + fn vbcaxq_u8_(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t; + } + vbcaxq_u8_(a, b, c) } /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t { - simd_xor(a, vbicq_u16(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxu.v8i16")] + fn vbcaxq_u16_(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t; + } + vbcaxq_u16_(a, b, c) } /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { - simd_xor(a, vbicq_u32(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxu.v4i32")] + fn vbcaxq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + vbcaxq_u32_(a, b, c) } /// Bit clear and exclusive OR #[inline] -#[target_feature(enable = "neon")] 
-#[cfg_attr(test, assert_instr(nop))] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(bcax))] pub unsafe fn vbcaxq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { - simd_xor(a, vbicq_u64(b, c)) + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.bcaxu.v2i64")] + fn vbcaxq_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t; + } + vbcaxq_u64_(a, b, c) } /// Floating-point complex add @@ -11886,6 +11966,240 @@ pub unsafe fn vshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> ui simd_shuffle4!(a, vshrn_n_u64::(b), [0, 1, 2, 3]) } +/// SM3PARTW1 +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3partw1))] +pub unsafe fn vsm3partw1q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3partw1")] + fn vsm3partw1q_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + vsm3partw1q_u32_(a, b, c) +} + +/// SM3PARTW2 +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3partw2))] +pub unsafe fn vsm3partw2q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3partw2")] + fn vsm3partw2q_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + vsm3partw2q_u32_(a, b, c) +} + +/// SM3SS1 +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3ss1))] +pub unsafe fn vsm3ss1q_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3ss1")] + fn vsm3ss1q_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t; + } + vsm3ss1q_u32_(a, b, 
c) +} + +/// SM4 key +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm4ekey))] +pub unsafe fn vsm4ekeyq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm4ekey")] + fn vsm4ekeyq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + vsm4ekeyq_u32_(a, b) +} + +/// SM4 encode +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm4e))] +pub unsafe fn vsm4eq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm4e")] + fn vsm4eq_u32_(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; + } + vsm4eq_u32_(a, b) +} + +/// Rotate and exclusive OR +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(rax1))] +pub unsafe fn vrax1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.rax1")] + fn vrax1q_u64_(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; + } + vrax1q_u64_(a, b) +} + +/// SHA512 hash update part 1 +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(sha512h))] +pub unsafe fn vsha512hq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha512h")] + fn vsha512hq_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t; + } + vsha512hq_u64_(a, b, c) +} + +/// SHA512 hash update part 2 +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(sha512h2))] +pub unsafe fn vsha512h2q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = 
"aarch64", link_name = "llvm.aarch64.crypto.sha512h2")] + fn vsha512h2q_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t; + } + vsha512h2q_u64_(a, b, c) +} + +/// SHA512 schedule update 0 +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(sha512su0))] +pub unsafe fn vsha512su0q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha512su0")] + fn vsha512su0q_u64_(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; + } + vsha512su0q_u64_(a, b) +} + +/// SHA512 schedule update 1 +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(sha512su1))] +pub unsafe fn vsha512su1q_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sha512su1")] + fn vsha512su1q_u64_(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t; + } + vsha512su1q_u64_(a, b, c) +} + +/// Floating-point round to 32-bit integer, using current rounding mode +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint32x))] +pub unsafe fn vrnd32x_f32(a: float32x2_t) -> float32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint32x.v2f32")] + fn vrnd32x_f32_(a: float32x2_t) -> float32x2_t; + } + vrnd32x_f32_(a) +} + +/// Floating-point round to 32-bit integer, using current rounding mode +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint32x))] +pub unsafe fn vrnd32xq_f32(a: float32x4_t) -> float32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint32x.v4f32")] + fn vrnd32xq_f32_(a: float32x4_t) -> float32x4_t; + } + vrnd32xq_f32_(a) +} + +/// 
Floating-point round to 32-bit integer toward zero +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint32z))] +pub unsafe fn vrnd32z_f32(a: float32x2_t) -> float32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint32z.v2f32")] + fn vrnd32z_f32_(a: float32x2_t) -> float32x2_t; + } + vrnd32z_f32_(a) +} + +/// Floating-point round to 32-bit integer toward zero +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint32z))] +pub unsafe fn vrnd32zq_f32(a: float32x4_t) -> float32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint32z.v4f32")] + fn vrnd32zq_f32_(a: float32x4_t) -> float32x4_t; + } + vrnd32zq_f32_(a) +} + +/// Floating-point round to 64-bit integer, using current rounding mode +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint64x))] +pub unsafe fn vrnd64x_f32(a: float32x2_t) -> float32x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint64x.v2f32")] + fn vrnd64x_f32_(a: float32x2_t) -> float32x2_t; + } + vrnd64x_f32_(a) +} + +/// Floating-point round to 64-bit integer, using current rounding mode +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint64x))] +pub unsafe fn vrnd64xq_f32(a: float32x4_t) -> float32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint64x.v4f32")] + fn vrnd64xq_f32_(a: float32x4_t) -> float32x4_t; + } + vrnd64xq_f32_(a) +} + +/// Floating-point round to 64-bit integer toward zero +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint64z))] +pub unsafe fn vrnd64z_f32(a: float32x2_t) -> float32x2_t { + #[allow(improper_ctypes)] + 
extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint64z.v2f32")] + fn vrnd64z_f32_(a: float32x2_t) -> float32x2_t; + } + vrnd64z_f32_(a) +} + +/// Floating-point round to 64-bit integer toward zero +#[inline] +#[target_feature(enable = "neon,frintts")] +#[cfg_attr(test, assert_instr(frint64z))] +pub unsafe fn vrnd64zq_f32(a: float32x4_t) -> float32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.frint64z.v4f32")] + fn vrnd64zq_f32_(a: float32x4_t) -> float32x4_t; + } + vrnd64zq_f32_(a) +} + /// Transpose vectors #[inline] #[target_feature(enable = "neon")] @@ -13086,7 +13400,7 @@ mod test { use std::mem::transmute; use stdarch_test::simd_test; - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_s8() { let a: i8x16 = i8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); let b: i8x16 = i8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); @@ -13096,7 +13410,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_s16() { let a: i16x8 = i16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); let b: i16x8 = i16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); @@ -13106,7 +13420,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_s32() { let a: i32x4 = i32x4::new(0x00, 0x01, 0x02, 0x03); let b: i32x4 = i32x4::new(0x00, 0x00, 0x00, 0x00); @@ -13116,7 +13430,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_s64() { let a: i64x2 = i64x2::new(0x00, 0x01); let b: i64x2 = i64x2::new(0x00, 0x00); @@ -13126,7 +13440,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + 
#[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_u8() { let a: u8x16 = u8x16::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F); let b: u8x16 = u8x16::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); @@ -13136,7 +13450,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_u16() { let a: u16x8 = u16x8::new(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); let b: u16x8 = u16x8::new(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); @@ -13146,7 +13460,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_u32() { let a: u32x4 = u32x4::new(0x00, 0x01, 0x02, 0x03); let b: u32x4 = u32x4::new(0x00, 0x00, 0x00, 0x00); @@ -13156,7 +13470,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_veor3q_u64() { let a: u64x2 = u64x2::new(0x00, 0x01); let b: u64x2 = u64x2::new(0x00, 0x00); @@ -19212,7 +19526,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_s8() { let a: i8x16 = i8x16::new(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0); let b: i8x16 = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -19222,7 +19536,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_s16() { let a: i16x8 = i16x8::new(1, 0, 1, 0, 1, 0, 1, 0); let b: i16x8 = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); @@ -19232,7 +19546,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_s32() { let a: i32x4 = i32x4::new(1, 0, 1, 0); let b: i32x4 = i32x4::new(0, 1, 2, 3); @@ -19242,7 +19556,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + 
#[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_s64() { let a: i64x2 = i64x2::new(1, 0); let b: i64x2 = i64x2::new(0, 1); @@ -19252,7 +19566,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_u8() { let a: u8x16 = u8x16::new(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0); let b: u8x16 = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); @@ -19262,7 +19576,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_u16() { let a: u16x8 = u16x8::new(1, 0, 1, 0, 1, 0, 1, 0); let b: u16x8 = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); @@ -19272,7 +19586,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_u32() { let a: u32x4 = u32x4::new(1, 0, 1, 0); let b: u32x4 = u32x4::new(0, 1, 2, 3); @@ -19282,7 +19596,7 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] + #[simd_test(enable = "neon,sha3")] unsafe fn test_vbcaxq_u64() { let a: u64x2 = u64x2::new(1, 0); let b: u64x2 = u64x2::new(0, 1); @@ -22841,6 +23155,166 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm3partw1q_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let c: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(2147549312, 3221323968, 131329, 2684362752); + let r: u32x4 = transmute(vsm3partw1q_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm3partw2q_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let c: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(128, 256, 384, 1077977696); + let r: u32x4 = transmute(vsm3partw2q_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sm4")] + unsafe fn 
test_vsm3ss1q_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let c: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(0, 0, 0, 2098176); + let r: u32x4 = transmute(vsm3ss1q_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm4ekeyq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(1784948604, 136020997, 2940231695, 3789947679); + let r: u32x4 = transmute(vsm4ekeyq_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm4eq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(1093874472, 3616769504, 3878330411, 2765298765); + let r: u32x4 = transmute(vsm4eq_u32(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sha3")] + unsafe fn test_vrax1q_u64() { + let a: u64x2 = u64x2::new(1, 2); + let b: u64x2 = u64x2::new(3, 4); + let e: u64x2 = u64x2::new(7, 10); + let r: u64x2 = transmute(vrax1q_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sha3")] + unsafe fn test_vsha512hq_u64() { + let a: u64x2 = u64x2::new(1, 2); + let b: u64x2 = u64x2::new(3, 4); + let c: u64x2 = u64x2::new(5, 6); + let e: u64x2 = u64x2::new(11189044327219203, 7177611956453380); + let r: u64x2 = transmute(vsha512hq_u64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sha3")] + unsafe fn test_vsha512h2q_u64() { + let a: u64x2 = u64x2::new(1, 2); + let b: u64x2 = u64x2::new(3, 4); + let c: u64x2 = u64x2::new(5, 6); + let e: u64x2 = u64x2::new(5770237651009406214, 349133864969); + let r: u64x2 = transmute(vsha512h2q_u64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sha3")] + unsafe fn test_vsha512su0q_u64() { 
+ let a: u64x2 = u64x2::new(1, 2); + let b: u64x2 = u64x2::new(3, 4); + let e: u64x2 = u64x2::new(144115188075855874, 9439544818968559619); + let r: u64x2 = transmute(vsha512su0q_u64(transmute(a), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sha3")] + unsafe fn test_vsha512su1q_u64() { + let a: u64x2 = u64x2::new(1, 2); + let b: u64x2 = u64x2::new(3, 4); + let c: u64x2 = u64x2::new(5, 6); + let e: u64x2 = u64x2::new(105553116266526, 140737488355368); + let r: u64x2 = transmute(vsha512su1q_u64(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd32x_f32() { + let a: f32x2 = f32x2::new(1.1, 1.9); + let e: f32x2 = f32x2::new(1.0, 2.0); + let r: f32x2 = transmute(vrnd32x_f32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd32xq_f32() { + let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3); + let e: f32x4 = f32x4::new(1.0, 2.0, -2.0, -2.0); + let r: f32x4 = transmute(vrnd32xq_f32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd32z_f32() { + let a: f32x2 = f32x2::new(1.1, 1.9); + let e: f32x2 = f32x2::new(1.0, 1.0); + let r: f32x2 = transmute(vrnd32z_f32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd32zq_f32() { + let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3); + let e: f32x4 = f32x4::new(1.0, 1.0, -1.0, -2.0); + let r: f32x4 = transmute(vrnd32zq_f32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd64x_f32() { + let a: f32x2 = f32x2::new(1.1, 1.9); + let e: f32x2 = f32x2::new(1.0, 2.0); + let r: f32x2 = transmute(vrnd64x_f32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd64xq_f32() { + let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3); + let e: f32x4 = f32x4::new(1.0, 2.0, -2.0, -2.0); + let r: f32x4 = 
transmute(vrnd64xq_f32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd64z_f32() { + let a: f32x2 = f32x2::new(1.1, 1.9); + let e: f32x2 = f32x2::new(1.0, 1.0); + let r: f32x2 = transmute(vrnd64z_f32(transmute(a))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,frintts")] + unsafe fn test_vrnd64zq_f32() { + let a: f32x4 = f32x4::new(1.1, 1.9, -1.7, -2.3); + let e: f32x4 = f32x4::new(1.0, 1.0, -1.0, -2.0); + let r: f32x4 = transmute(vrnd64zq_f32(transmute(a))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vtrn1_s8() { let a: i8x8 = i8x8::new(0, 2, 4, 6, 8, 10, 12, 14); diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs index 1cc10dc15daa9..d23e43c435280 100644 --- a/crates/core_arch/src/aarch64/neon/mod.rs +++ b/crates/core_arch/src/aarch64/neon/mod.rs @@ -3208,6 +3208,97 @@ pub unsafe fn vsriq_n_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x transmute(vsriq_n_s64_(transmute(a), transmute(b), N)) } +/// SM3TT1A +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3tt1a, IMM2 = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsm3tt1aq_u32( + a: uint32x4_t, + b: uint32x4_t, + c: uint32x4_t, +) -> uint32x4_t { + static_assert_imm2!(IMM2); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3tt1a")] + fn vsm3tt1aq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, imm2: i64) -> uint32x4_t; + } + vsm3tt1aq_u32_(a, b, c, IMM2 as i64) +} + +/// SM3TT1B +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3tt1b, IMM2 = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsm3tt1bq_u32( + a: uint32x4_t, + b: uint32x4_t, + c: uint32x4_t, +) -> uint32x4_t { + static_assert_imm2!(IMM2); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = 
"llvm.aarch64.crypto.sm3tt1b")] + fn vsm3tt1bq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, imm2: i64) -> uint32x4_t; + } + vsm3tt1bq_u32_(a, b, c, IMM2 as i64) +} + +/// SM3TT2A +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3tt2a, IMM2 = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsm3tt2aq_u32( + a: uint32x4_t, + b: uint32x4_t, + c: uint32x4_t, +) -> uint32x4_t { + static_assert_imm2!(IMM2); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3tt2a")] + fn vsm3tt2aq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, imm2: i64) -> uint32x4_t; + } + vsm3tt2aq_u32_(a, b, c, IMM2 as i64) +} + +/// SM3TT2B +#[inline] +#[target_feature(enable = "neon,sm4")] +#[cfg_attr(test, assert_instr(sm3tt2b, IMM2 = 0))] +#[rustc_legacy_const_generics(3)] +pub unsafe fn vsm3tt2bq_u32( + a: uint32x4_t, + b: uint32x4_t, + c: uint32x4_t, +) -> uint32x4_t { + static_assert_imm2!(IMM2); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.sm3tt2b")] + fn vsm3tt2bq_u32_(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t, imm2: i64) -> uint32x4_t; + } + vsm3tt2bq_u32_(a, b, c, IMM2 as i64) +} + +/// Exclusive OR and rotate +#[inline] +#[target_feature(enable = "neon,sha3")] +#[cfg_attr(test, assert_instr(xar, IMM6 = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vxarq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t { + static_assert_imm6!(IMM6); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.crypto.xar")] + fn vxarq_u64_(a: uint64x2_t, b: uint64x2_t, n: i64) -> uint64x2_t; + } + vxarq_u64_(a, b, IMM6 as i64) +} + #[cfg(test)] mod tests { use crate::core_arch::aarch64::test_support::*; @@ -4866,6 +4957,55 @@ mod tests { assert_eq!(vals[1], 1.); assert_eq!(vals[2], 2.); } + + #[simd_test(enable = "neon,sm4")] + unsafe fn 
test_vsm3tt1aq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let c: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(2, 1536, 4, 16395); + let r: u32x4 = transmute(vsm3tt1aq_u32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm3tt1bq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let c: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(2, 1536, 4, 16392); + let r: u32x4 = transmute(vsm3tt1bq_u32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm3tt2aq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let c: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(2, 1572864, 4, 1447435); + let r: u32x4 = transmute(vsm3tt2aq_u32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sm4")] + unsafe fn test_vsm3tt2bq_u32() { + let a: u32x4 = u32x4::new(1, 2, 3, 4); + let b: u32x4 = u32x4::new(1, 2, 3, 4); + let c: u32x4 = u32x4::new(1, 2, 3, 4); + let e: u32x4 = u32x4::new(2, 1572864, 4, 1052680); + let r: u32x4 = transmute(vsm3tt2bq_u32::<0>(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,sha3")] + unsafe fn test_vxarq_u64() { + let a: u64x2 = u64x2::new(1, 2); + let b: u64x2 = u64x2::new(3, 4); + let e: u64x2 = u64x2::new(2, 6); + let r: u64x2 = transmute(vxarq_u64::<0>(transmute(a), transmute(b))); + assert_eq!(r, e); + } } #[cfg(test)] diff --git a/crates/core_arch/src/arm_shared/neon/mod.rs b/crates/core_arch/src/arm_shared/neon/mod.rs index 588c86537df9a..15c659ded9b0b 100644 --- a/crates/core_arch/src/arm_shared/neon/mod.rs +++ b/crates/core_arch/src/arm_shared/neon/mod.rs @@ -4806,6 +4806,63 @@ pub unsafe fn vpadalq_u32(a: uint64x2_t, b: 
uint32x4_t) -> uint64x2_t { } } +/// 8-bit integer matrix multiply-accumulate +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(smmla))] +pub unsafe fn vmmlaq_s32(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.smmla.v4i32.v16i8")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.smmla.v4i32.v16i8" + )] + fn vmmlaq_s32_(a: int32x4_t, b: int8x16_t, c: int8x16_t) -> int32x4_t; + } + vmmlaq_s32_(a, b, c) +} + +/// 8-bit integer matrix multiply-accumulate +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ummla))] +pub unsafe fn vmmlaq_u32(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.ummla.v4i32.v16i8")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.ummla.v4i32.v16i8" + )] + fn vmmlaq_u32_(a: uint32x4_t, b: uint8x16_t, c: uint8x16_t) -> uint32x4_t; + } + vmmlaq_u32_(a, b, c) +} + +/// Unsigned and signed 8-bit integer matrix multiply-accumulate +#[inline] +#[target_feature(enable = "neon,i8mm")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(usmmla))] +pub unsafe fn vusmmlaq_s32(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = 
"llvm.arm.neon.usmmla.v4i32.v16i8")] + #[cfg_attr( + target_arch = "aarch64", + link_name = "llvm.aarch64.neon.usmmla.v4i32.v16i8" + )] + fn vusmmlaq_s32_(a: int32x4_t, b: uint8x16_t, c: int8x16_t) -> int32x4_t; + } + vusmmlaq_s32_(a, b, c) +} + #[cfg(test)] mod tests { use super::*; @@ -10368,6 +10425,35 @@ mod tests { let e: u16x8 = transmute(vrev64q_p16(transmute(a))); assert_eq!(r, e); } + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vmmlaq_s32() { + let a: i32x4 = i32x4::new(1, 3, 4, 9); + let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16); + let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16); + let e: i32x4 = i32x4::new(1, 2, 3, 4); + let r: i32x4 = transmute(vmmlaq_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vmmlaq_u32() { + let a: u32x4 = u32x4::new(1, 3, 4, 9); + let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16); + let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16); + let e: u32x4 = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vmmlaq_u32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon,i8mm")] + unsafe fn test_vusmmlaq_s32() { + let a: i32x4 = i32x4::new(1, 3, 4, 9); + let b: i8x16 = i8x16::new(1, 21, 31, 14, 5, 6, 17, 8, 9, 13, 15, 12, 13, 19, 20, 16); + let c: i8x16 = i8x16::new(12, 22, 3, 4, 5, 56, 7, 8, 91, 10, 11, 15, 13, 14, 17, 16); + let e: i32x4 = i32x4::new(1, 2, 3, 4); + let r: i32x4 = transmute(vusmmlaq_s32(transmute(a), transmute(b), transmute(c))); + assert_eq!(r, e); + } } #[cfg(all(test, target_arch = "arm", target_endian = "little"))] diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs index 33b0627d612d4..d43192b3998e6 100644 --- a/crates/core_arch/src/lib.rs +++ b/crates/core_arch/src/lib.rs @@ -33,7 +33,8 @@ f16c_target_feature, 
allow_internal_unstable, decl_macro, - bench_black_box + bench_black_box, + asm_const )] #![cfg_attr(test, feature(test, abi_vectorcall))] #![deny(clippy::missing_inline_in_public_items)] diff --git a/crates/std_detect/src/detect/arch/arm.rs b/crates/std_detect/src/detect/arch/arm.rs index d96514c8449ab..9e7dda094feb8 100644 --- a/crates/std_detect/src/detect/arch/arm.rs +++ b/crates/std_detect/src/detect/arch/arm.rs @@ -22,4 +22,6 @@ features! { /// FEAT_AES (AES instructions) @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] sha2: "sha2"; /// FEAT_SHA1 & FEAT_SHA256 (SHA1 & SHA2-256 instructions) + @FEATURE: #[unstable(feature = "stdsimd", issue = "27731")] i8mm: "i8mm"; + /// FEAT_I8MM } diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 0dca59839bc52..933d2bc41f012 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -104,15 +104,16 @@ generate int*_t, uint*_t, int64x*_t, uint64x*_t /// Three-way exclusive OR name = veor3 -multi_fn = simd_xor, {simd_xor, a, b}, c a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 c = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F +target = sha3 -// llvm does not currently support `eor3` instructions -aarch64 = nop +aarch64 = eor3 +link-aarch64 = llvm.aarch64.crypto.eor3s._EXT_ generate int8x16_t, int16x8_t, int32x4_t, int64x2_t +link-aarch64 = llvm.aarch64.crypto.eor3u._EXT_ generate uint8x16_t, uint16x8_t, uint32x4_t, uint64x2_t //////////////////// @@ -4438,15 +4439,16 @@ generate uint32x4_t:uint32x4_t:uint64x2_t /// Bit clear and exclusive OR name = vbcax -multi_fn = simd_xor, a, {vbic-self-noext, b, c} a = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 b = 0, 1, 2, 
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 c = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 validate 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 +target = sha3 -// llvm does not currently support the `bcax` instruction -aarch64 = nop +aarch64 = bcax +link-aarch64 = llvm.aarch64.crypto.bcaxs._EXT_ generate int8x16_t, int16x8_t, int32x4_t, int64x2_t +link-aarch64 = llvm.aarch64.crypto.bcaxu._EXT_ generate uint8x16_t, uint16x8_t, uint32x4_t, uint64x2_t /// Floating-point complex add @@ -4805,24 +4807,6 @@ generate float32x2_t:f32, float64x2_t:f64 aarch64 = fminnmv generate float32x4_t:f32 -/// 8-bit integer matrix multiply-accumulate -name = vmmlaq -a = 1, 2, 3, 4 -b = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 -c = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 -validate 1, 2, 3, 4 -target = i8mm - -aarch64 = smmla -link-aarch64 = smmla._EXT_._EXT3_ -// the feature `i8mm` is not valid for some target -//generate int32x4_t:int8x16_t:int8x16_t:int32x4_t - -aarch64 = ummla -link-aarch64 = ummla._EXT_._EXT3_ -// the feature `i8mm` is not valid for some target -//generate uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t - /// Vector move name = vmovl_high no-q @@ -6862,6 +6846,162 @@ aarch64 = usra arm = vsra generate uint*_t, uint64x*_t +/// SM3PARTW1 +name = vsm3partw1 +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +c = 1, 2, 3, 4 +validate 2147549312, 3221323968, 131329, 2684362752 +target = sm4 + +aarch64 = sm3partw1 +link-aarch64 = llvm.aarch64.crypto.sm3partw1 +generate uint32x4_t + +/// SM3PARTW2 +name = vsm3partw2 +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +c = 1, 2, 3, 4 +validate 128, 256, 384, 1077977696 +target = sm4 + +aarch64 = sm3partw2 +link-aarch64 = llvm.aarch64.crypto.sm3partw2 +generate uint32x4_t + +/// SM3SS1 +name = vsm3ss1 +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +c = 1, 2, 3, 4 +validate 0, 0, 0, 2098176 +target = sm4 + +aarch64 = sm3ss1 +link-aarch64 = llvm.aarch64.crypto.sm3ss1 +generate uint32x4_t + +/// SM4 key +name = vsm4ekey +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 
+validate 1784948604, 136020997, 2940231695, 3789947679 +target = sm4 + +aarch64 = sm4ekey +link-aarch64 = llvm.aarch64.crypto.sm4ekey +generate uint32x4_t + +/// SM4 encode +name = vsm4e +a = 1, 2, 3, 4 +b = 1, 2, 3, 4 +validate 1093874472, 3616769504, 3878330411, 2765298765 +target = sm4 + +aarch64 = sm4e +link-aarch64 = llvm.aarch64.crypto.sm4e +generate uint32x4_t + +/// Rotate and exclusive OR +name = vrax1 +a = 1, 2 +b = 3, 4 +validate 7, 10 +target = sha3 + +aarch64 = rax1 +link-aarch64 = llvm.aarch64.crypto.rax1 +generate uint64x2_t + +/// SHA512 hash update part 1 +name = vsha512h +a = 1, 2 +b = 3, 4 +c = 5, 6 +validate 11189044327219203, 7177611956453380 +target = sha3 + +aarch64 = sha512h +link-aarch64 = llvm.aarch64.crypto.sha512h +generate uint64x2_t + +/// SHA512 hash update part 2 +name = vsha512h2 +a = 1, 2 +b = 3, 4 +c = 5, 6 +validate 5770237651009406214, 349133864969 +target = sha3 + +aarch64 = sha512h2 +link-aarch64 = llvm.aarch64.crypto.sha512h2 +generate uint64x2_t + +/// SHA512 schedule update 0 +name = vsha512su0 +a = 1, 2 +b = 3, 4 +validate 144115188075855874, 9439544818968559619 +target = sha3 + +aarch64 = sha512su0 +link-aarch64 = llvm.aarch64.crypto.sha512su0 +generate uint64x2_t + +/// SHA512 schedule update 1 +name = vsha512su1 +a = 1, 2 +b = 3, 4 +c = 5, 6 +validate 105553116266526, 140737488355368 +target = sha3 + +aarch64 = sha512su1 +link-aarch64 = llvm.aarch64.crypto.sha512su1 +generate uint64x2_t + +/// Floating-point round to 32-bit integer, using current rounding mode +name = vrnd32x +a = 1.1, 1.9, -1.7, -2.3 +validate 1.0, 2.0, -2.0, -2.0 +target = frintts + +aarch64 = frint32x +link-aarch64 = frint32x._EXT_ +generate float32x2_t, float32x4_t + +/// Floating-point round to 32-bit integer toward zero +name = vrnd32z +a = 1.1, 1.9, -1.7, -2.3 +validate 1.0, 1.0, -1.0, -2.0 +target = frintts + +aarch64 = frint32z +link-aarch64 = frint32z._EXT_ +generate float32x2_t, float32x4_t + +/// Floating-point round to 64-bit integer, 
using current rounding mode +name = vrnd64x +a = 1.1, 1.9, -1.7, -2.3 +validate 1.0, 2.0, -2.0, -2.0 +target = frintts + +aarch64 = frint64x +link-aarch64 = frint64x._EXT_ +generate float32x2_t, float32x4_t + +/// Floating-point round to 64-bit integer toward zero +name = vrnd64z +a = 1.1, 1.9, -1.7, -2.3 +validate 1.0, 1.0, -1.0, -2.0 +target = frintts + +aarch64 = frint64z +link-aarch64 = frint64z._EXT_ +generate float32x2_t, float32x4_t + /// Transpose elements name = vtrn multi_fn = simd_shuffle-in_len-!, a1:in_t, a, b, {transpose-1-in_len} diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs index a33933ad97670..4ef3cb091dcb4 100644 --- a/crates/stdarch-gen/src/main.rs +++ b/crates/stdarch-gen/src/main.rs @@ -464,7 +464,10 @@ enum TargetFeature { FCMA, Dotprod, I8MM, + SHA3, RDM, + SM4, + FTTS, } #[derive(Clone, Copy)] @@ -1068,7 +1071,10 @@ fn gen_aarch64( FCMA => "neon,fcma", Dotprod => "neon,dotprod", I8MM => "neon,i8mm", + SHA3 => "neon,sha3", RDM => "rdm", + SM4 => "neon,sm4", + FTTS => "neon,frintts", }; let current_fn = if let Some(current_fn) = current_fn.clone() { if link_aarch64.is_some() { @@ -1379,6 +1385,13 @@ fn gen_aarch64( fn_decl, call_params ); + let test_target = match target { + I8MM => "neon,i8mm", + SM4 => "neon,sm4", + SHA3 => "neon,sha3", + FTTS => "neon,frintts", + _ => "neon", + }; let test = match fn_type { Fntype::Normal => gen_test( &name, @@ -1388,6 +1401,7 @@ fn gen_aarch64( [type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])], type_len(out_t), para_num, + test_target, ), Fntype::Load => gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t)), Fntype::Store => gen_store_test(&name, in_t, &out_t, current_tests, type_len(in_t[1])), @@ -1575,12 +1589,13 @@ fn gen_test( len_in: [usize; 3], len_out: usize, para_num: i32, + target: &str, ) -> String { let mut test = format!( r#" - #[simd_test(enable = "neon")] + #[simd_test(enable = "{}")] unsafe fn test_{}() {{"#, - name, + target, name, ); for 
(a, b, c, n, e) in current_tests { let a: Vec = a.iter().take(len_in[0]).cloned().collect(); @@ -1777,7 +1792,10 @@ fn gen_arm( FCMA => "neon,fcma", Dotprod => "neon,dotprod", I8MM => "neon,i8mm", + SHA3 => "neon,sha3", RDM => "rdm", + SM4 => "neon,sm4", + FTTS => "neon,frintts", }; let current_target_arm = match target { Default => "v7", @@ -1787,8 +1805,11 @@ fn gen_arm( AES => "aes,v8", FCMA => "v8", // v8.3a Dotprod => "v8", // v8.2a - I8MM => "v8", // v8.6a + I8MM => "v8,i8mm", RDM => unreachable!(), + SM4 => unreachable!(), + SHA3 => unreachable!(), + FTTS => unreachable!(), }; let current_fn = if let Some(current_fn) = current_fn.clone() { if link_aarch64.is_some() || link_arm.is_some() { @@ -2364,6 +2385,13 @@ fn gen_arm( call, ) }; + let test_target = match target { + I8MM => "neon,i8mm", + SM4 => "neon,sm4", + SHA3 => "neon,sha3", + FTTS => "neon,frintts", + _ => "neon", + }; let test = match fn_type { Fntype::Normal => gen_test( &name, @@ -2373,6 +2401,7 @@ fn gen_arm( [type_len(in_t[0]), type_len(in_t[1]), type_len(in_t[2])], type_len(out_t), para_num, + test_target, ), Fntype::Load => gen_load_test(&name, in_t, &out_t, current_tests, type_len(out_t)), Fntype::Store => gen_store_test(&name, in_t, &out_t, current_tests, type_len(in_t[1])), @@ -3173,7 +3202,10 @@ mod test { "fcma" => FCMA, "dotprod" => Dotprod, "i8mm" => I8MM, + "sha3" => SHA3, "rdm" => RDM, + "sm4" => SM4, + "frintts" => FTTS, _ => Default, }, _ => Default, @@ -3278,20 +3310,22 @@ mod test { tests_aarch64.push('}'); tests_aarch64.push('\n'); - let arm_out_path: PathBuf = PathBuf::from(env::var("OUT_DIR").unwrap()) - .join("src") - .join("arm_shared") - .join("neon"); + let arm_out_path: PathBuf = + PathBuf::from(env::var("OUT_DIR").unwrap_or("crates/core_arch".to_string())) + .join("src") + .join("arm_shared") + .join("neon"); std::fs::create_dir_all(&arm_out_path)?; let mut file_arm = File::create(arm_out_path.join(ARM_OUT))?; file_arm.write_all(out_arm.as_bytes())?; 
file_arm.write_all(tests_arm.as_bytes())?; - let aarch64_out_path: PathBuf = PathBuf::from(env::var("OUT_DIR").unwrap()) - .join("src") - .join("aarch64") - .join("neon"); + let aarch64_out_path: PathBuf = + PathBuf::from(env::var("OUT_DIR").unwrap_or("crates/core_arch".to_string())) + .join("src") + .join("aarch64") + .join("neon"); std::fs::create_dir_all(&aarch64_out_path)?; let mut file_aarch = File::create(aarch64_out_path.join(AARCH64_OUT))?; diff --git a/crates/stdarch-verify/tests/arm.rs b/crates/stdarch-verify/tests/arm.rs index bd894e0baae1a..ce7039ce72aaa 100644 --- a/crates/stdarch-verify/tests/arm.rs +++ b/crates/stdarch-verify/tests/arm.rs @@ -559,6 +559,32 @@ fn verify_all_signatures() { "vaddq_p16", "vaddq_p64", "vaddq_p128", + "vsm4ekeyq_u32", + "vsm4eq_u32", + "vmmlaq_s32", + "vmmlaq_u32", + "vusmmlaq_s32", + "vsm3partw1q_u32", + "vsm3partw2q_u32", + "vsm3ss1q_u32", + "vsm3tt1aq_u32", + "vsm3tt1bq_u32", + "vsm3tt2aq_u32", + "vsm3tt2bq_u32", + "vrax1q_u64", + "vxarq_u64", + "vsha512hq_u64", + "vsha512h2q_u64", + "vsha512su0q_u64", + "vsha512su1q_u64", + "vrnd32x_f32", + "vrnd32xq_f32", + "vrnd32z_f32", + "vrnd32zq_f32", + "vrnd64x_f32", + "vrnd64xq_f32", + "vrnd64z_f32", + "vrnd64zq_f32", "__dbg", ]; let arm = match map.get(rust.name) {