diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 116ea7762..469cbcaf9 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -29,6 +29,11 @@ namespace xsimd
         template <class A, class T, size_t I>
         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
 
+        template <class A>
+        XSIMD_INLINE void transpose(batch<uint16_t, A>* matrix_begin, batch<uint16_t, A>* matrix_end, requires_arch<avx>) noexcept;
+        template <class A>
+        XSIMD_INLINE void transpose(batch<uint8_t, A>* matrix_begin, batch<uint8_t, A>* matrix_end, requires_arch<avx>) noexcept;
+
         namespace detail
         {
             XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
@@ -1676,6 +1681,78 @@ namespace xsimd
             return transpose(reinterpret_cast<batch<uint32_t, A>*>(matrix_begin), reinterpret_cast<batch<uint32_t, A>*>(matrix_end), A {});
         }
 
+        template <class A>
+        XSIMD_INLINE void transpose(batch<uint16_t, A>* matrix_begin, batch<uint16_t, A>* matrix_end, requires_arch<avx>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<uint16_t, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            batch<uint16_t, sse4_2> tmp_lo0[8];
+            for (int i = 0; i < 8; ++i)
+                tmp_lo0[i] = _mm256_castsi256_si128(matrix_begin[i]);
+            transpose(tmp_lo0 + 0, tmp_lo0 + 8, sse4_2 {});
+
+            batch<uint16_t, sse4_2> tmp_hi0[8];
+            for (int i = 0; i < 8; ++i)
+                tmp_hi0[i] = _mm256_castsi256_si128(matrix_begin[8 + i]);
+            transpose(tmp_hi0 + 0, tmp_hi0 + 8, sse4_2 {});
+
+            batch<uint16_t, sse4_2> tmp_lo1[8];
+            for (int i = 0; i < 8; ++i)
+                tmp_lo1[i] = _mm256_extractf128_si256(matrix_begin[i], 1);
+            transpose(tmp_lo1 + 0, tmp_lo1 + 8, sse4_2 {});
+
+            batch<uint16_t, sse4_2> tmp_hi1[8];
+            for (int i = 0; i < 8; ++i)
+                tmp_hi1[i] = _mm256_extractf128_si256(matrix_begin[8 + i], 1);
+            transpose(tmp_hi1 + 0, tmp_hi1 + 8, sse4_2 {});
+
+            for (int i = 0; i < 8; ++i)
+                matrix_begin[i] = detail::merge_sse(tmp_lo0[i], tmp_hi0[i]);
+            for (int i = 0; i < 8; ++i)
+                matrix_begin[i + 8] = detail::merge_sse(tmp_lo1[i], tmp_hi1[i]);
+        }
+        template <class A>
+        XSIMD_INLINE void transpose(batch<int16_t, A>* matrix_begin, batch<int16_t, A>* matrix_end, requires_arch<avx>) noexcept
+        {
+            return transpose(reinterpret_cast<batch<uint16_t, A>*>(matrix_begin), reinterpret_cast<batch<uint16_t, A>*>(matrix_end), A {});
+        }
+
+        template <class A>
+        XSIMD_INLINE void transpose(batch<uint8_t, A>* matrix_begin, batch<uint8_t, A>* matrix_end, requires_arch<avx>) noexcept
+        {
+            assert((matrix_end - matrix_begin == batch<uint8_t, A>::size) && "correctly sized matrix");
+            (void)matrix_end;
+            batch<uint8_t, sse4_2> tmp_lo0[16];
+            for (int i = 0; i < 16; ++i)
+                tmp_lo0[i] = _mm256_castsi256_si128(matrix_begin[i]);
+            transpose(tmp_lo0 + 0, tmp_lo0 + 16, sse4_2 {});
+
+            batch<uint8_t, sse4_2> tmp_hi0[16];
+            for (int i = 0; i < 16; ++i)
+                tmp_hi0[i] = _mm256_castsi256_si128(matrix_begin[16 + i]);
+            transpose(tmp_hi0 + 0, tmp_hi0 + 16, sse4_2 {});
+
+            batch<uint8_t, sse4_2> tmp_lo1[16];
+            for (int i = 0; i < 16; ++i)
+                tmp_lo1[i] = _mm256_extractf128_si256(matrix_begin[i], 1);
+            transpose(tmp_lo1 + 0, tmp_lo1 + 16, sse4_2 {});
+
+            batch<uint8_t, sse4_2> tmp_hi1[16];
+            for (int i = 0; i < 16; ++i)
+                tmp_hi1[i] = _mm256_extractf128_si256(matrix_begin[16 + i], 1);
+            transpose(tmp_hi1 + 0, tmp_hi1 + 16, sse4_2 {});
+
+            for (int i = 0; i < 16; ++i)
+                matrix_begin[i] = detail::merge_sse(tmp_lo0[i], tmp_hi0[i]);
+            for (int i = 0; i < 16; ++i)
+                matrix_begin[i + 16] = detail::merge_sse(tmp_lo1[i], tmp_hi1[i]);
+        }
+        template <class A>
+        XSIMD_INLINE void transpose(batch<int8_t, A>* matrix_begin, batch<int8_t, A>* matrix_end, requires_arch<avx>) noexcept
+        {
+            return transpose(reinterpret_cast<batch<uint8_t, A>*>(matrix_begin), reinterpret_cast<batch<uint8_t, A>*>(matrix_end), A {});
+        }
+
         // trunc
         template <class A>
         XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept
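The hunks above only add kernel-level overloads: each 256-bit matrix is split into its low and high 128-bit halves, the four resulting sub-matrices are transposed with the existing sse4_2 kernels, and detail::merge_sse stitches the halves back together. As a quick illustration, the sketch below is not part of the patch; it assumes the public xsimd::transpose(matrix_begin, matrix_end) entry point dispatches to these kernels and that the code is built with AVX enabled (e.g. -mavx). It transposes a 16x16 matrix of uint16_t held as sixteen batch<uint16_t, avx> rows.

#include <cstddef>
#include <cstdint>
#include <numeric>
#include <xsimd/xsimd.hpp>

int main()
{
    using batch_t = xsimd::batch<uint16_t, xsimd::avx>;
    constexpr std::size_t n = batch_t::size; // 16 lanes -> a 16x16 matrix

    // Row-major 16x16 matrix filled with 0..255; 32-byte rows stay AVX-aligned.
    alignas(32) uint16_t data[n * n];
    std::iota(data, data + n * n, uint16_t(0));

    // Load each row into one AVX batch.
    batch_t rows[n];
    for (std::size_t i = 0; i < n; ++i)
        rows[i] = batch_t::load_aligned(data + i * n);

    // In-place transpose of the rows; with an AVX target this should reach
    // the new 16-bit kernel added by this patch.
    xsimd::transpose(rows, rows + n);

    // Store the transposed rows back into the buffer.
    for (std::size_t i = 0; i < n; ++i)
        rows[i].store_aligned(data + i * n);
}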