From 636afae89d6af4abdeac64d1949ef8b892fc1bbf Mon Sep 17 00:00:00 2001
From: serge-sans-paille
Date: Sat, 27 Jan 2024 21:49:50 +0100
Subject: [PATCH] Add support for emulated arch

Fix #998
---
 include/xsimd/arch/xsimd_emulated.hpp         | 1483 +++++++++++++++++
 include/xsimd/arch/xsimd_isa.hpp              |    4 +
 .../xsimd/types/xsimd_emulated_register.hpp   |   70 +
 3 files changed, 1557 insertions(+)
 create mode 100644 include/xsimd/arch/xsimd_emulated.hpp
 create mode 100644 include/xsimd/types/xsimd_emulated_register.hpp

diff --git a/include/xsimd/arch/xsimd_emulated.hpp b/include/xsimd/arch/xsimd_emulated.hpp
new file mode 100644
index 000000000..cee43b239
--- /dev/null
+++ b/include/xsimd/arch/xsimd_emulated.hpp
@@ -0,0 +1,1483 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_EMULATED_HPP
+#define XSIMD_EMULATED_HPP
+
+#include
+#include
+#include
+
+#include "../arch/xsimd_scalar.hpp"
+
+#include "../types/xsimd_emulated_register.hpp"
+#include "../types/xsimd_utils.hpp"
+
+namespace xsimd
+{
+    template
+    struct batch_bool_constant;
+
+    template
+    inline batch bitwise_cast(batch const& x) noexcept;
+
+    template
+    struct batch_constant;
+
+    namespace kernel
+    {
+        using namespace types;
+
+        // fwd
+        template
+        inline batch insert(batch const& self, T val, index, requires_arch) noexcept;
+        template
+        inline batch shuffle(batch const& x, batch const& y, batch_constant, Indices...>, requires_arch) noexcept;
+
+        namespace detail
+        {
+            template
+            auto apply(F func, batch> const& b, Bs const&... bs) -> decltype(func(b.data[I], bs.data[I]...))
+            {
+                return func(b.data[I], bs.data[I]...);
+            }
+
+            template
+            auto apply(F func, ::xsimd::detail::index_sequence, batch> const& b, Bs const&... bs) -> std::array
+            {
+                return { apply(func, b, bs...)... };
+            }
+
+            template
+            auto apply(F func, batch> const& b, Bs const&... 
bs) -> std::array + { + return apply(func, ::xsimd::detail::make_index_sequence(), b, bs...); + } + } + + // abs + template + inline batch> abs(batch> const& self, requires_arch>) noexcept + { + static_assert(std::is_same>::value, "matching arch"); + return detail::apply([](T v) + { return xsimd::abs(v); }, + self); + } + + // add + template + inline batch> add(batch> const& self, batch> const& other, requires_arch>) noexcept + { + static_assert(std::is_same>::value, "matching arch"); + return detail::apply([](T v0, T v1) + { return xsimd::add(v0, v1); }, + self, other); + } + + // all + template + inline bool all(batch_bool> const& self, requires_arch>) noexcept + { + static_assert(std::is_same>::value, "matching arch"); + return std::all_of(self.data.begin(), self.data.end(), [](T v) + { return bool(v); }); + } + + // any + template + inline bool any(batch_bool> const& self, requires_arch>) noexcept + { + static_assert(std::is_same>::value, "matching arch"); + return std::any_of(self.data.begin(), self.data.end(), [](T v) + { return bool(v); }); + } + + // batch_bool_cast + template + inline batch_bool> batch_bool_cast(batch_bool> const& self, batch_bool> const&, requires_arch>) noexcept + { + static_assert(std::is_same>::value, "matching arch"); + return { self.data }; + } + + // bitwise_and + template + inline batch> bitwise_and(batch> const& self, batch> const& other, requires_arch>) noexcept + { + static_assert(std::is_same>::value, "matching arch"); + return detail::apply([](T v0, T v1) + { return xsimd::bitwise_and(v0, v1); }, + self, other); + } + + // bitwise_andnot + template + inline batch> bitwise_andnot(batch> const& self, batch> const& other, requires_arch>) noexcept + { + static_assert(std::is_same>::value, "matching arch"); + return detail::apply([](T v0, T v1) + { return xsimd::bitwise_andnot(v0, v1); }, + self, other); + } + +#if 0 + + // bitwise_lshift + template ::value, void>::type> + inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_and_si128(_mm_set1_epi8(0xFF << other), _mm_slli_epi32(self, other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_slli_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_slli_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_slli_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // bitwise_not + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); + } + template + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); + } + template ::value, void>::type> + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm_xor_si128(self, _mm_set1_epi32(-1)); + } + template ::value, void>::type> + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm_xor_si128(self, _mm_set1_epi32(-1)); + } + template + inline batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); + } + template + inline batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); + } + + // bitwise_or + template + inline batch 
bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_or_ps(self, other); + } + template + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_or_ps(self, other); + } + template ::value, void>::type> + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_or_si128(self, other); + } + template ::value, void>::type> + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_or_si128(self, other); + } + + template + inline batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_or_pd(self, other); + } + + template + inline batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_or_pd(self, other); + } + + // bitwise_rshift + template ::value, void>::type> + inline batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + __m128i sign_mask = _mm_set1_epi16((0xFF00 >> other) & 0x00FF); + __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self); + __m128i res = _mm_srai_epi16(self, other); + return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_srai_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_srai_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + // from https://github.com/samyvilar/vect/blob/master/vect_128.h + return _mm_or_si128( + _mm_srli_epi64(self, other), + _mm_slli_epi64( + _mm_srai_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32), + 64 - other)); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_and_si128(_mm_set1_epi8(0xFF >> other), _mm_srli_epi32(self, other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_srli_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_srli_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_srli_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } + + // bitwise_xor + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_xor_ps(self, other); + } + template + inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_ps(self, other); + } + template ::value, void>::type> + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_xor_si128(self, other); + } + template + inline batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_xor_pd(self, other); + } + template + inline batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_pd(self, other); + } + template ::value, void>::type> + inline batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_si128(self, other); + } + + // bitwise_cast + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, 
requires_arch) noexcept + { + return _mm_castsi128_ps(self); + } + template ::type>::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return batch(self.data); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castps_si128(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castsi128_pd(self); + } + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castps_pd(self); + } + template + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castpd_ps(self); + } + template ::value, void>::type> + inline batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castpd_si128(self); + } + + // broadcast + template + batch inline broadcast(float val, requires_arch) noexcept + { + return _mm_set1_ps(val); + } + template ::value, void>::type> + inline batch broadcast(T val, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_set1_epi8(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_set1_epi16(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_set1_epi32(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_set1_epi64x(val); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch broadcast(double val, requires_arch) noexcept + { + return _mm_set1_pd(val); + } + + // store_complex + namespace detail + { + // Override these methods in SSE-based archs, no need to override store_aligned / store_unaligned + // complex_low + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return _mm_unpacklo_ps(self.real(), self.imag()); + } + // complex_high + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return _mm_unpackhi_ps(self.real(), self.imag()); + } + template + inline batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return _mm_unpacklo_pd(self.real(), self.imag()); + } + template + inline batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return _mm_unpackhi_pd(self.real(), self.imag()); + } + } + + // decr_if + template ::value, void>::type> + inline batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return self + batch(mask.data); + } + + // div + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_div_ps(self, other); + } + template + inline batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_div_pd(self, other); + } + + // fast_cast + namespace detail + { + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_cvtepi32_ps(self); + } + + template + inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx + // adapted to sse2 + __m128i xH = _mm_srli_epi64(x, 32); + xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84 + __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 
0x0000); + __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52 + __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 + return _mm_add_pd(f, _mm_castsi128_pd(xL)); + } + + template + inline batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx + // adapted to sse2 + __m128i xH = _mm_srai_epi32(x, 16); + xH = _mm_and_si128(xH, _mm_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); + xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67 + __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); + __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52 + __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 + return _mm_add_pd(f, _mm_castsi128_pd(xL)); + } + + template + inline batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_cvttps_epi32(self); + } + } +#endif + + // eq + template + inline batch_bool> eq(batch> const& self, batch> const& other, requires_arch>) noexcept + { + static_assert(std::is_same>::value, "matching arch"); + return detail::apply([](T v0, T v1) + { return xsimd::eq(v0, v1); }, + self, other); + } +#if 0 + + // from_mask + template + inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint32_t lut[][4] = { + { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + }; + assert(!(mask & ~0xFul) && "inbound mask"); + return _mm_castsi128_ps(_mm_load_si128((const __m128i*)lut[mask])); + } + template + inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint64_t lut[][4] = { + { 0x0000000000000000ul, 0x0000000000000000ul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + }; + assert(!(mask & ~0x3ul) && "inbound mask"); + return _mm_castsi128_pd(_mm_load_si128((const __m128i*)lut[mask])); + } + template ::value, void>::type> + inline batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint64_t lut64[] = { + 0x0000000000000000, + 0x000000000000FFFF, + 0x00000000FFFF0000, + 0x00000000FFFFFFFF, + 0x0000FFFF00000000, + 0x0000FFFF0000FFFF, + 
0x0000FFFFFFFF0000, + 0x0000FFFFFFFFFFFF, + 0xFFFF000000000000, + 0xFFFF00000000FFFF, + 0xFFFF0000FFFF0000, + 0xFFFF0000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF0000FFFF, + 0xFFFFFFFFFFFF0000, + 0xFFFFFFFFFFFFFFFF, + }; + alignas(A::alignment()) static const uint32_t lut32[] = { + 0x00000000, + 0x000000FF, + 0x0000FF00, + 0x0000FFFF, + 0x00FF0000, + 0x00FF00FF, + 0x00FFFF00, + 0x00FFFFFF, + 0xFF000000, + 0xFF0000FF, + 0xFF00FF00, + 0xFF00FFFF, + 0xFFFF0000, + 0xFFFF00FF, + 0xFFFFFF00, + 0xFFFFFFFF, + }; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + assert(!(mask & ~0xFFFF) && "inbound mask"); + return _mm_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[mask >> 12]); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + assert(!(mask & ~0xFF) && "inbound mask"); + return _mm_set_epi64x(lut64[mask >> 4], lut64[mask & 0xF]); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_castps_si128(from_mask(batch_bool {}, mask, sse2 {})); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_castpd_si128(from_mask(batch_bool {}, mask, sse2 {})); + } + } + + // ge + template + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpge_ps(self, other); + } + template + inline batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpge_pd(self, other); + } + + // gt + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpgt_ps(self, other); + } + template ::value, void>::type> + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_cmpgt_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_cmpgt_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_cmpgt_epi32(self, other); + } + else + { + return gt(self, other, generic {}); + } + } + else + { + return gt(self, other, generic {}); + } + } + + template + inline batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpgt_pd(self, other); + } + + // haddp + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]); + __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]); + __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]); + tmp0 = _mm_add_ps(tmp0, tmp1); + tmp1 = _mm_unpacklo_ps(row[2], row[3]); + tmp1 = _mm_add_ps(tmp1, tmp2); + tmp2 = _mm_movehl_ps(tmp1, tmp0); + tmp0 = _mm_movelh_ps(tmp0, tmp1); + return _mm_add_ps(tmp0, tmp2); + } + template + inline batch haddp(batch const* row, requires_arch) noexcept + { + return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]), + _mm_unpackhi_pd(row[0], row[1])); + } + + // incr_if + template ::value, void>::type> + inline batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return self - batch(mask.data); + } + + // insert + template ::value, void>::type> + inline batch insert(batch const& self, T val, index pos, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_insert_epi16(self, val, I); + } + else + { + return insert(self, val, pos, generic {}); + } + } + + // isnan + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm_cmpunord_ps(self, self); + } + template + inline batch_bool isnan(batch const& self, requires_arch) noexcept + { + return 
_mm_cmpunord_pd(self, self); + } + + // load_aligned + template + inline batch load_aligned(float const* mem, convert, requires_arch) noexcept + { + return _mm_load_ps(mem); + } + template ::value, void>::type> + inline batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + return _mm_load_si128((__m128i const*)mem); + } + template + inline batch load_aligned(double const* mem, convert, requires_arch) noexcept + { + return _mm_load_pd(mem); + } + + // load_unaligned + template + inline batch load_unaligned(float const* mem, convert, requires_arch) noexcept + { + return _mm_loadu_ps(mem); + } + template ::value, void>::type> + inline batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return _mm_loadu_si128((__m128i const*)mem); + } + template + inline batch load_unaligned(double const* mem, convert, requires_arch) noexcept + { + return _mm_loadu_pd(mem); + } + + // load_complex + namespace detail + { + // Redefine these methods in the SSE-based archs if required + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) }; + } + template + inline batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) }; + } + } + + // le + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmple_ps(self, other); + } + template + inline batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmple_pd(self, other); + } + + // lt + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmplt_ps(self, other); + } + template ::value, void>::type> + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_cmplt_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_cmplt_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_cmplt_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + __m128i tmp1 = _mm_sub_epi64(self, other); + __m128i tmp2 = _mm_xor_si128(self, other); + __m128i tmp3 = _mm_andnot_si128(other, self); + __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); + __m128i tmp5 = _mm_or_si128(tmp3, tmp4); + __m128i tmp6 = _mm_srai_epi32(tmp5, 31); + return _mm_shuffle_epi32(tmp6, 0xF5); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_cmplt_epi8(_mm_xor_si128(self, _mm_set1_epi8(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi8(std::numeric_limits::lowest()))); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_cmplt_epi16(_mm_xor_si128(self, _mm_set1_epi16(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi16(std::numeric_limits::lowest()))); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_cmplt_epi32(_mm_xor_si128(self, _mm_set1_epi32(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi32(std::numeric_limits::lowest()))); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits::lowest())); + auto xother = 
_mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits::lowest())); + __m128i tmp1 = _mm_sub_epi64(xself, xother); + __m128i tmp2 = _mm_xor_si128(xself, xother); + __m128i tmp3 = _mm_andnot_si128(xother, xself); + __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); + __m128i tmp5 = _mm_or_si128(tmp3, tmp4); + __m128i tmp6 = _mm_srai_epi32(tmp5, 31); + return _mm_shuffle_epi32(tmp6, 0xF5); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } + + template + inline batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmplt_pd(self, other); + } + + /* compression table to turn 0b10 into 0b1, + * 0b100010 into 0b101 etc + */ + namespace detail + { + inline int mask_lut(int mask) + { + // clang-format off + static const int mask_lut[256] = { + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x4, 0x0, 0x5, 0x0, 0x0, 0x0, 0x0, 0x0, 0x6, 0x0, 0x7, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x8, 0x0, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0xA, 0x0, 0xB, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0xC, 0x0, 0xD, 0x0, 0x0, 0x0, 0x0, 0x0, 0xE, 0x0, 0xF, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + }; + // clang-format on + return mask_lut[mask & 0xAA]; + } + } + + // mask + template ::value, void>::type> + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_movemask_epi8(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + uint64_t mask8 = _mm_movemask_epi8(self); + return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_movemask_ps(_mm_castsi128_ps(self)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_movemask_pd(_mm_castsi128_pd(self)); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_ps(self); + } + + template + inline uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_pd(self); + } + + // max + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_max_ps(self, other); + } + template ::value, void>::type> + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self > other, self, other); + } + template + inline batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_max_pd(self, other); + } + + // 
min + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_min_ps(self, other); + } + template ::value, void>::type> + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self <= other, self, other); + } + template + inline batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_min_pd(self, other); + } + + // mul + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_mul_ps(self, other); + } + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_mul_pd(self, other); + } + + // mul + template + inline batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_mullo_epi16(self, other); + } + + // nearbyint_as_int + template + inline batch nearbyint_as_int(batch const& self, + requires_arch) noexcept + { + return _mm_cvtps_epi32(self); + } + + // neg + template ::value, void>::type> + inline batch neg(batch const& self, requires_arch) noexcept + { + return 0 - self; + } + template + inline batch neg(batch const& self, requires_arch) noexcept + { + return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); + } + + template + inline batch neg(batch const& self, requires_arch) noexcept + { + return _mm_xor_pd( + self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); + } + + // neq + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpneq_ps(self, other); + } + template ::value, void>::type> + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~(self == other); + } + template + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_ps(self, other); + } + template ::value, void>::type> + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(self.data), _mm_castsi128_ps(other.data))); + } + + template + inline batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpneq_pd(self, other); + } + template + inline batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_pd(self, other); + } + + // reciprocal + template + inline batch reciprocal(batch const& self, + kernel::requires_arch) + { + return _mm_rcp_ps(self); + } + + // reduce_add + template + inline float reduce_add(batch const& self, requires_arch) noexcept + { + __m128 tmp0 = _mm_add_ps(self, _mm_movehl_ps(self, self)); + __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1)); + return _mm_cvtss_f32(tmp1); + } + + // reduce_max + template ::type> + inline T reduce_max(batch const& self, requires_arch) noexcept + { + constexpr auto mask0 = detail::shuffle(2, 3, 0, 0); + batch step0 = _mm_shuffle_epi32(self, mask0); + batch acc0 = max(self, step0); + + constexpr auto mask1 = detail::shuffle(1, 0, 0, 0); + batch step1 = _mm_shuffle_epi32(acc0, mask1); + batch acc1 = max(acc0, step1); + + constexpr auto mask2 = detail::shuffle(1, 0, 0, 0); + batch step2 = _mm_shufflelo_epi16(acc1, mask2); + batch acc2 = max(acc1, step2); + if (sizeof(T) == 2) + return acc2.get(0); + batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); + batch acc3 = max(acc2, step3); + return acc3.get(0); + } + + 
// reduce_min + template ::type> + inline T reduce_min(batch const& self, requires_arch) noexcept + { + constexpr auto mask0 = detail::shuffle(2, 3, 0, 0); + batch step0 = _mm_shuffle_epi32(self, mask0); + batch acc0 = min(self, step0); + + constexpr auto mask1 = detail::shuffle(1, 0, 0, 0); + batch step1 = _mm_shuffle_epi32(acc0, mask1); + batch acc1 = min(acc0, step1); + + constexpr auto mask2 = detail::shuffle(1, 0, 0, 0); + batch step2 = _mm_shufflelo_epi16(acc1, mask2); + batch acc2 = min(acc1, step2); + if (sizeof(T) == 2) + return acc2.get(0); + batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); + batch acc3 = min(acc2, step3); + return acc3.get(0); + } + + template ::value, void>::type> + inline T reduce_add(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); + __m128i tmp2 = _mm_add_epi32(self, tmp1); + __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01); + __m128i tmp4 = _mm_add_epi32(tmp2, tmp3); + return _mm_cvtsi128_si32(tmp4); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); + __m128i tmp2 = _mm_add_epi64(self, tmp1); +#if defined(__x86_64__) + return _mm_cvtsi128_si64(tmp2); +#else + __m128i m; + _mm_storel_epi64(&m, tmp2); + int64_t i; + std::memcpy(&i, &m, sizeof(i)); + return i; +#endif + } + else + { + return hadd(self, generic {}); + } + } + template + inline double reduce_add(batch const& self, requires_arch) noexcept + { + return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self))); + } + + // rsqrt + template + inline batch rsqrt(batch const& val, requires_arch) noexcept + { + return _mm_rsqrt_ps(val); + } + template + inline batch rsqrt(batch const& val, requires_arch) noexcept + { + return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val))); + } + + // select + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br)); + } + + template ::value, void>::type> + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br)); + } + template ::value, void>::type> + inline batch select(batch_bool_constant, Values...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... 
}, true_br, false_br, sse2 {}); + } + template + inline batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br)); + } + + // shuffle + template + inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1, I2, I3> mask, requires_arch) noexcept + { + constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3); + // shuffle within lane + if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4) + return _mm_shuffle_ps(x, y, smask); + + // shuffle within opposite lane + if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4) + return _mm_shuffle_ps(y, x, smask); + return shuffle(x, y, mask, generic {}); + } + + template + inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1> mask, requires_arch) noexcept + { + constexpr uint32_t smask = detail::mod_shuffle(I0, I1); + // shuffle within lane + if (I0 < 2 && I1 >= 2) + return _mm_shuffle_pd(x, y, smask); + + // shuffle within opposite lane + if (I0 >= 2 && I1 < 2) + return _mm_shuffle_pd(y, x, smask); + return shuffle(x, y, mask, generic {}); + } + + // sqrt + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm_sqrt_ps(val); + } + template + inline batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm_sqrt_pd(val); + } + + // slide_left + template + inline batch slide_left(batch const& x, requires_arch) noexcept + { + return _mm_slli_si128(x, N); + } + + // slide_right + template + inline batch slide_right(batch const& x, requires_arch) noexcept + { + return _mm_srli_si128(x, N); + } + + // sadd + + template ::value, void>::type> + inline batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_adds_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_adds_epi16(self, other); + } + else + { + return sadd(self, other, generic {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_adds_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_adds_epu16(self, other); + } + else + { + return sadd(self, other, generic {}); + } + } + } +#endif + + // set + template + inline batch> set(batch> const&, requires_arch>, Values... values) noexcept + { + static_assert(std::is_same>::value, "consistent arch"); + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return { typename batch>::register_type { static_cast(values)... 
} }; + } + +#if 0 + + // ssub + + template ::value, void>::type> + inline batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_subs_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_subs_epi16(self, other); + } + else + { + return ssub(self, other, generic {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_subs_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_subs_epu16(self, other); + } + else + { + return ssub(self, other, generic {}); + } + } + } +#endif + // store_aligned + template + inline void store_aligned(T* mem, batch> const& self, requires_arch>) noexcept + { + static_assert(std::is_same>::value, "matching arch"); + std::copy(self.data.begin(), self.data.end(), mem); + } +#if 0 + + // store_unaligned + template + inline void store_unaligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm_storeu_ps(mem, self); + } + template ::value, void>::type> + inline void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm_storeu_si128((__m128i*)mem, self); + } + template ::value, void>::type> + inline void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm_storeu_si128((__m128i*)mem, self); + } + template + inline void store_unaligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm_storeu_pd(mem, self); + } + + // sub + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_sub_ps(self, other); + } + template ::value, void>::type> + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_sub_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_sub_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_sub_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_sub_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_sub_pd(self, other); + } + + // swizzle + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept + { + constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3); + return _mm_shuffle_ps(self, self, index); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1>, requires_arch) noexcept + { + constexpr uint32_t index = detail::shuffle(V0, V1); + return _mm_shuffle_pd(self, self, index); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1>, requires_arch) noexcept + { + constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1); + return _mm_shuffle_epi32(self, index); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept + { + constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3); + return _mm_shuffle_epi32(self, index); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, 
V2, V3> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); + } + + // zip_hi + template + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpackhi_ps(self, other); + } + template ::value, void>::type> + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_unpackhi_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_unpackhi_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_unpackhi_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_unpackhi_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpackhi_pd(self, other); + } + + // zip_lo + template + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpacklo_ps(self, other); + } + template ::value, void>::type> + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_unpacklo_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_unpacklo_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_unpacklo_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_unpacklo_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + inline batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpacklo_pd(self, other); + } +#endif + } +} + +#endif diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index 0edd77674..65948115c 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -16,6 +16,10 @@ #include "./xsimd_generic_fwd.hpp" +#if XSIMD_WITH_EMULATED +#include "./xsimd_emulated.hpp" +#endif + #if XSIMD_WITH_SSE2 #include "./xsimd_sse2.hpp" #endif diff --git a/include/xsimd/types/xsimd_emulated_register.hpp b/include/xsimd/types/xsimd_emulated_register.hpp new file mode 100644 index 000000000..80e0d5627 --- /dev/null +++ b/include/xsimd/types/xsimd_emulated_register.hpp @@ -0,0 +1,70 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
 *
+ ****************************************************************************/
+
+#ifndef XSIMD_EMULATED_REGISTER_HPP
+#define XSIMD_EMULATED_REGISTER_HPP
+
+#include "./xsimd_generic_arch.hpp"
+#include "./xsimd_register.hpp"
+
+namespace xsimd
+{
+    /**
+     * @ingroup architectures
+     *
+     * emulated instructions
+     */
+    template
+    struct emulated : generic
+    {
+        static constexpr bool supported() noexcept { return true; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr bool requires_alignment() noexcept { return false; }
+        static constexpr unsigned version() noexcept { return generic::version(0, 0, 0); }
+        static constexpr std::size_t alignment() noexcept { return 8; }
+        static constexpr char const* name() noexcept { return "emulated"; }
+    };
+
+    namespace types
+    {
+        template
+        struct simd_emulated_bool_register
+        {
+            using register_type = std::array;
+            register_type data;
+            simd_emulated_bool_register() = default;
+            simd_emulated_bool_register(register_type r) { data = r; }
+            operator register_type() const noexcept { return data; }
+        };
+        template
+        struct get_bool_simd_register>
+        {
+            using type = simd_emulated_bool_register;
+        };
+
+        template
+        struct simd_register>
+        {
+            using register_type = std::array;
+            register_type data;
+            inline operator register_type() const noexcept
+            {
+                return data;
+            }
+        };
+        template
+        struct has_simd_register> : std::true_type
+        {
+        };
+    }
+}
+
+#endif
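
Note (illustrative, not part of the patch): every emulated kernel in xsimd_emulated.hpp follows the same pattern. Batch data is stored in a plain std::array, and each operation is forwarded lane by lane through detail::apply, which expands a scalar functor over an index sequence. The sketch below shows that pattern in isolation; the names scalar_batch and apply_elementwise, the fixed lane count, and the use of std::index_sequence (the patch uses xsimd's own ::xsimd::detail::index_sequence) are placeholders chosen for this example, not identifiers from xsimd.

    #include <array>
    #include <cstddef>
    #include <utility>

    // Stand-in for the emulated register: one scalar value per lane.
    template <class T, std::size_t N>
    struct scalar_batch
    {
        std::array<T, N> data;
    };

    // Expand a scalar functor across all lanes, mirroring detail::apply in the patch.
    template <class F, class T, std::size_t N, std::size_t... I>
    std::array<T, N> apply_elementwise_impl(F func, scalar_batch<T, N> const& a,
                                            scalar_batch<T, N> const& b,
                                            std::index_sequence<I...>)
    {
        return { func(a.data[I], b.data[I])... };
    }

    template <class F, class T, std::size_t N>
    scalar_batch<T, N> apply_elementwise(F func, scalar_batch<T, N> const& a,
                                         scalar_batch<T, N> const& b)
    {
        return { apply_elementwise_impl(func, a, b, std::make_index_sequence<N>()) };
    }

    int main()
    {
        // An emulated "add" is just the scalar operation applied per lane.
        scalar_batch<float, 4> x { { 1.f, 2.f, 3.f, 4.f } };
        scalar_batch<float, 4> y { { 10.f, 20.f, 30.f, 40.f } };
        scalar_batch<float, 4> z = apply_elementwise([](float u, float v) { return u + v; }, x, y);
        return z.data[3] == 44.f ? 0 : 1;
    }

Because the kernels reduce to scalar loops over std::array, this arch needs no intrinsics at all, which is what makes it usable as a portable fallback on targets without SIMD support.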