diff --git a/src/libraries/Common/src/System/HexConverter.cs b/src/libraries/Common/src/System/HexConverter.cs index 7986ccb43a9b1f..81b56970be05a1 100644 --- a/src/libraries/Common/src/System/HexConverter.cs +++ b/src/libraries/Common/src/System/HexConverter.cs @@ -99,13 +99,8 @@ internal static (Vector128, Vector128) AsciiToHexVector128(Vector128 Vector128 lowNibbles = Vector128.UnpackLow(shiftedSrc, src); Vector128 highNibbles = Vector128.UnpackHigh(shiftedSrc, src); - return (ShuffleUnsafe(hexMap, lowNibbles & Vector128.Create((byte)0xF)), - ShuffleUnsafe(hexMap, highNibbles & Vector128.Create((byte)0xF))); - - // TODO: remove once https://github.com/dotnet/runtime/pull/80963 is merged - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static Vector128 ShuffleUnsafe(Vector128 value, Vector128 mask) - => Ssse3.IsSupported ? Ssse3.Shuffle(value, mask) : AdvSimd.Arm64.VectorTableLookup(value, mask); + return (Vector128.ShuffleUnsafe(hexMap, lowNibbles & Vector128.Create((byte)0xF)), + Vector128.ShuffleUnsafe(hexMap, highNibbles & Vector128.Create((byte)0xF))); } private static void EncodeToUtf16_Vector128(ReadOnlySpan bytes, Span chars, Casing casing) diff --git a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs index f4ebc942ac331a..cc239d1a5e981a 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs @@ -477,20 +477,17 @@ private static unsafe void Avx2Decode(ref byte* srcBytes, ref byte* destBytes, b destBytes = dest; } - // This can be replaced once https://github.com/dotnet/runtime/issues/63331 is implemented. 
[MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector128 SimdShuffle(Vector128 left, Vector128 right, Vector128 mask8F) { Debug.Assert((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian); - if (Ssse3.IsSupported) + if (AdvSimd.Arm64.IsSupported) { - return Ssse3.Shuffle(left, right); - } - else - { - return AdvSimd.Arm64.VectorTableLookup(left, Vector128.BitwiseAnd(right, mask8F)); + right &= mask8F; } + + return Vector128.ShuffleUnsafe(left, right); } [MethodImpl(MethodImplOptions.AggressiveInlining)] diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/Ordinal.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/Ordinal.cs index 1dd36f67306615..45007ac7d22b52 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/Ordinal.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/Ordinal.cs @@ -88,8 +88,8 @@ private static bool EqualsIgnoreCase_Vector128(ref char charA, ref char charB, i Vector128 vec2; do { - vec1 = Vector128.LoadUnsafe(ref Unsafe.As(ref charA), i); - vec2 = Vector128.LoadUnsafe(ref Unsafe.As(ref charB), i); + vec1 = Vector128.LoadUnsafe(ref charA, i); + vec2 = Vector128.LoadUnsafe(ref charB, i); if (!Utf16Utility.AllCharsInVector128AreAscii(vec1 | vec2)) { diff --git a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs index 87c258cee79832..207024d99962fb 100644 --- a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs +++ b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs @@ -860,10 +860,10 @@ private static Vector128 IndexOfAnyLookupCore(Vector128 source, Vect // The bitmapLookup represents a 8x16 table of bits, indicating whether a character is present in the needle. 
// Lookup the rows via the lower nibble and the column via the higher nibble. - Vector128 bitMask = Shuffle(bitmapLookup, lowNibbles); + Vector128 bitMask = Vector128.ShuffleUnsafe(bitmapLookup, lowNibbles); // For values above 127, the high nibble will be above 7. We construct the positions vector for the shuffle such that those values map to 0. - Vector128 bitPositions = Shuffle(Vector128.Create(0x8040201008040201, 0).AsByte(), highNibbles); + Vector128 bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201, 0).AsByte(), highNibbles); Vector128 result = bitMask & bitPositions; return result; @@ -909,10 +909,10 @@ private static Vector128 IndexOfAnyLookup(Vector128 source Vector128 lowNibbles = source & Vector128.Create((byte)0xF); Vector128 highNibbles = Vector128.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector128.Create((byte)0xF); - Vector128 row0 = Shuffle(bitmapLookup0, lowNibbles); - Vector128 row1 = Shuffle(bitmapLookup1, lowNibbles); + Vector128 row0 = Vector128.ShuffleUnsafe(bitmapLookup0, lowNibbles); + Vector128 row1 = Vector128.ShuffleUnsafe(bitmapLookup1, lowNibbles); - Vector128 bitmask = Shuffle(Vector128.Create(0x8040201008040201).AsByte(), highNibbles); + Vector128 bitmask = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), highNibbles); Vector128 mask = Vector128.GreaterThan(highNibbles.AsSByte(), Vector128.Create((sbyte)0x7)).AsByte(); Vector128 bitsets = Vector128.ConditionalSelect(mask, row1, row0); @@ -944,16 +944,6 @@ private static Vector256 IndexOfAnyLookup(Vector256 source return TNegator.NegateIfNeeded(result); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector128 Shuffle(Vector128 vector, Vector128 indices) - { - // We're not using Vector128.Shuffle as the caller already accounts for and relies on differences in behavior between platforms. - return - Ssse3.IsSupported ? Ssse3.Shuffle(vector, indices) : - AdvSimd.Arm64.IsSupported ? 
AdvSimd.Arm64.VectorTableLookup(vector, indices) : - PackedSimd.Swizzle(vector, indices); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe int ComputeFirstIndex(ref T searchSpace, ref T current, Vector128 result) where TNegator : struct, INegator diff --git a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs index d0a0a7821913e1..2b13c5b19ce7e3 100644 --- a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs +++ b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/ProbabilisticMap.cs @@ -1,9 +1,13 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Diagnostics; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.X86; #pragma warning disable IDE0060 // https://github.com/dotnet/roslyn-analyzers/issues/6228 @@ -23,8 +27,17 @@ namespace System.Buffers [StructLayout(LayoutKind.Sequential)] internal readonly struct ProbabilisticMap { - private const int IndexMask = 0x7; - private const int IndexShift = 0x3; + // The vectorized algorithm operates on bytes instead of uint32s. + // The index and shift are adjusted so that we represent the structure + // as "32 x uint8" instead of "8 x uint32". + // We use the vectorized implementation when we have access to Sse41 or Arm64 intrinsics. + private const uint VectorizedIndexMask = 31u; + private const int VectorizedIndexShift = 5; + + // If we don't support vectorization, use uint32 to speed up + // "IsCharBitSet" checks in scalar loops. 
+ private const uint PortableIndexMask = 7u; + private const int PortableIndexShift = 3; private readonly uint _e0, _e1, _e2, _e3, _e4, _e5, _e6, _e7; @@ -56,23 +69,116 @@ public ProbabilisticMap(ReadOnlySpan values) if (hasAscii) { // Common to search for ASCII symbols. Just set the high value once. - charMap |= 1u; + SetCharBit(ref charMap, 0); } } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void SetCharBit(ref uint charMap, byte value) => - Unsafe.Add(ref charMap, (uint)value & IndexMask) |= 1u << (value >> IndexShift); + private static void SetCharBit(ref uint charMap, byte value) + { + if (Sse41.IsSupported || AdvSimd.Arm64.IsSupported) + { + Unsafe.Add(ref Unsafe.As(ref charMap), value & VectorizedIndexMask) |= (byte)(1u << (value >> VectorizedIndexShift)); + } + else + { + Unsafe.Add(ref charMap, value & PortableIndexMask) |= 1u << (value >> PortableIndexShift); + } + } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsCharBitSet(ref uint charMap, byte value) => - (Unsafe.Add(ref charMap, (uint)value & IndexMask) & (1u << (value >> IndexShift))) != 0; + private static bool IsCharBitSet(ref uint charMap, byte value) => Sse41.IsSupported || AdvSimd.Arm64.IsSupported + ? 
(Unsafe.Add(ref Unsafe.As(ref charMap), value & VectorizedIndexMask) & (1u << (value >> VectorizedIndexShift))) != 0 + : (Unsafe.Add(ref charMap, value & PortableIndexMask) & (1u << (value >> PortableIndexShift))) != 0; [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static bool Contains(ref uint charMap, ReadOnlySpan values, int ch) => IsCharBitSet(ref charMap, (byte)ch) && IsCharBitSet(ref charMap, (byte)(ch >> 8)) && - values.Contains((char)ch); + Contains(values, (char)ch); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool Contains(ReadOnlySpan values, char ch) => + SpanHelpers.NonPackedContainsValueType( + ref Unsafe.As(ref MemoryMarshal.GetReference(values)), + (short)ch, + values.Length); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 ContainsMask32CharsAvx2(Vector256 charMapLower, Vector256 charMapUpper, ref char searchSpace) + { + Vector256 source0 = Vector256.LoadUnsafe(ref searchSpace); + Vector256 source1 = Vector256.LoadUnsafe(ref searchSpace, (nuint)Vector256.Count); + + Vector256 sourceLower = Avx2.PackUnsignedSaturate( + (source0 & Vector256.Create((ushort)255)).AsInt16(), + (source1 & Vector256.Create((ushort)255)).AsInt16()); + + Vector256 sourceUpper = Avx2.PackUnsignedSaturate( + (source0 >>> 8).AsInt16(), + (source1 >>> 8).AsInt16()); + + Vector256 resultLower = IsCharBitSetAvx2(charMapLower, charMapUpper, sourceLower); + Vector256 resultUpper = IsCharBitSetAvx2(charMapLower, charMapUpper, sourceUpper); + + return resultLower & resultUpper; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 IsCharBitSetAvx2(Vector256 charMapLower, Vector256 charMapUpper, Vector256 values) + { + // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564 + Vector256 highNibble = (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector256.Create((byte)15); + + Vector256 bitPositions = 
Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibble); + + Vector256 index = values & Vector256.Create((byte)VectorizedIndexMask); + Vector256 bitMaskLower = Avx2.Shuffle(charMapLower, index); + Vector256 bitMaskUpper = Avx2.Shuffle(charMapUpper, index - Vector256.Create((byte)16)); + Vector256 mask = Vector256.GreaterThan(index, Vector256.Create((byte)15)); + Vector256 bitMask = Vector256.ConditionalSelect(mask, bitMaskUpper, bitMaskLower); + + return ~Vector256.Equals(bitMask & bitPositions, Vector256.Zero); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 ContainsMask16Chars(Vector128 charMapLower, Vector128 charMapUpper, ref char searchSpace) + { + Vector128 source0 = Vector128.LoadUnsafe(ref searchSpace); + Vector128 source1 = Vector128.LoadUnsafe(ref searchSpace, (nuint)Vector128.Count); + + Vector128 sourceLower = Sse2.IsSupported + ? Sse2.PackUnsignedSaturate((source0 & Vector128.Create((ushort)255)).AsInt16(), (source1 & Vector128.Create((ushort)255)).AsInt16()) + : AdvSimd.Arm64.UnzipEven(source0.AsByte(), source1.AsByte()); + + Vector128 sourceUpper = Sse2.IsSupported + ? Sse2.PackUnsignedSaturate((source0 >>> 8).AsInt16(), (source1 >>> 8).AsInt16()) + : AdvSimd.Arm64.UnzipOdd(source0.AsByte(), source1.AsByte()); + + Vector128 resultLower = IsCharBitSet(charMapLower, charMapUpper, sourceLower); + Vector128 resultUpper = IsCharBitSet(charMapLower, charMapUpper, sourceUpper); + + return resultLower & resultUpper; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector128 IsCharBitSet(Vector128 charMapLower, Vector128 charMapUpper, Vector128 values) + { + // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564 + Vector128 highNibble = Sse2.IsSupported + ? 
(values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector128.Create((byte)15) + : values >>> VectorizedIndexShift; + + Vector128 bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), highNibble); + + Vector128 index = values & Vector128.Create((byte)VectorizedIndexMask); + Vector128 bitMaskLower = Vector128.ShuffleUnsafe(charMapLower, index); + Vector128 bitMaskUpper = Vector128.ShuffleUnsafe(charMapUpper, index - Vector128.Create((byte)16)); + Vector128 mask = Vector128.GreaterThan(index, Vector128.Create((byte)15)); + Vector128 bitMask = Vector128.ConditionalSelect(mask, bitMaskUpper, bitMaskLower); + + return ~Vector128.Equals(bitMask & bitPositions, Vector128.Zero); + } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool ShouldUseSimpleLoop(int searchSpaceLength, int valuesLength) @@ -115,7 +221,7 @@ private static int IndexOfAny(ref char searchSpace, int searchSpaceLen while (!Unsafe.AreSame(ref cur, ref searchSpaceEnd)) { char c = cur; - if (TNegator.NegateIfNeeded(valuesSpan.Contains(c))) + if (TNegator.NegateIfNeeded(Contains(valuesSpan, c))) { return (int)(Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char)); } @@ -147,7 +253,7 @@ private static int LastIndexOfAny(ref char searchSpace, int searchSpac for (int i = searchSpaceLength - 1; i >= 0; i--) { char c = Unsafe.Add(ref searchSpace, i); - if (TNegator.NegateIfNeeded(valuesSpan.Contains(c))) + if (TNegator.NegateIfNeeded(Contains(valuesSpan, c))) { return i; } @@ -198,6 +304,11 @@ private static int ProbabilisticLastIndexOfAny(ref char searchSpace, i internal static int IndexOfAny(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan values) where TNegator : struct, IndexOfAnyAsciiSearcher.INegator { + if ((Sse41.IsSupported || AdvSimd.Arm64.IsSupported) && typeof(TNegator) == typeof(IndexOfAnyAsciiSearcher.DontNegate) && searchSpaceLength >= 16) + { + return IndexOfAnyVectorized(ref charMap, ref searchSpace, 
searchSpaceLength, values); + } + ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength); ref char cur = ref searchSpace; @@ -206,7 +317,7 @@ internal static int IndexOfAny(ref uint charMap, ref char searchSpace, int ch = cur; if (TNegator.NegateIfNeeded(Contains(ref charMap, values, ch))) { - return (int)(Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char)); + return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char)); } cur = ref Unsafe.Add(ref cur, 1); @@ -230,5 +341,113 @@ internal static int LastIndexOfAny(ref uint charMap, ref char searchSp return -1; } + + private static int IndexOfAnyVectorized(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan values) + { + Debug.Assert(Sse41.IsSupported || AdvSimd.Arm64.IsSupported); + Debug.Assert(searchSpaceLength >= 16); + + ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength); + ref char cur = ref searchSpace; + + Vector128 charMapLower = Vector128.LoadUnsafe(ref Unsafe.As(ref charMap)); + Vector128 charMapUpper = Vector128.LoadUnsafe(ref Unsafe.As(ref charMap), (nuint)Vector128.Count); + + if (Avx2.IsSupported && searchSpaceLength >= 32) + { + Vector256 charMapLower256 = Vector256.Create(charMapLower, charMapLower); + Vector256 charMapUpper256 = Vector256.Create(charMapUpper, charMapUpper); + + ref char lastStartVectorAvx2 = ref Unsafe.Subtract(ref searchSpaceEnd, 32); + + while (true) + { + Vector256 result = ContainsMask32CharsAvx2(charMapLower256, charMapUpper256, ref cur); + + if (result != Vector256.Zero) + { + // Account for how ContainsMask32CharsAvx2 packed the source chars (Avx2.PackUnsignedSaturate). 
+ result = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte(); + + uint mask = result.ExtractMostSignificantBits(); + do + { + ref char candidatePos = ref Unsafe.Add(ref cur, BitOperations.TrailingZeroCount(mask)); + + if (Contains(values, candidatePos)) + { + return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref candidatePos) / sizeof(char)); + } + + mask = BitOperations.ResetLowestSetBit(mask); + } + while (mask != 0); + } + + cur = ref Unsafe.Add(ref cur, 32); + + if (Unsafe.IsAddressGreaterThan(ref cur, ref lastStartVectorAvx2)) + { + if (Unsafe.AreSame(ref cur, ref searchSpaceEnd)) + { + return -1; + } + + if (Unsafe.ByteOffset(ref cur, ref searchSpaceEnd) > 16 * sizeof(char)) + { + // If we have more than 16 characters left to process, we can + // adjust the current vector and do one last iteration of Avx2. + cur = ref lastStartVectorAvx2; + } + else + { + // Otherwise adjust the vector such that we'll only need to do a single + // iteration of ContainsMask16Chars below. + cur = ref Unsafe.Subtract(ref searchSpaceEnd, 16); + break; + } + } + } + } + + ref char lastStartVector = ref Unsafe.Subtract(ref searchSpaceEnd, 16); + + while (true) + { + Vector128 result = ContainsMask16Chars(charMapLower, charMapUpper, ref cur); + + if (result != Vector128.Zero) + { + uint mask = result.ExtractMostSignificantBits(); + do + { + ref char candidatePos = ref Unsafe.Add(ref cur, BitOperations.TrailingZeroCount(mask)); + + if (Contains(values, candidatePos)) + { + return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref candidatePos) / sizeof(char)); + } + + mask = BitOperations.ResetLowestSetBit(mask); + } + while (mask != 0); + } + + cur = ref Unsafe.Add(ref cur, 16); + + if (Unsafe.IsAddressGreaterThan(ref cur, ref lastStartVector)) + { + if (Unsafe.AreSame(ref cur, ref searchSpaceEnd)) + { + break; + } + + // Adjust the current vector and do one last iteration. 
+ cur = ref lastStartVector; + } + } + + return -1; + } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index 8eadbbb37980c4..a8853d950e1cd6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -6,6 +6,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics.Arm; +using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; namespace System.Runtime.Intrinsics @@ -1820,6 +1821,21 @@ public static Vector128 LoadUnsafe(ref T source, nuint elementOffset) return Unsafe.ReadUnaligned>(ref Unsafe.As(ref source)); } + /// Loads a vector from the given source and reinterprets it as <see cref="ushort" />. + /// The source from which the vector will be loaded. + /// The vector loaded from <paramref name="source" />. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector128 LoadUnsafe(ref char source) => + LoadUnsafe(ref Unsafe.As(ref source)); + + /// Loads a vector from the given source and element offset and reinterprets it as <see cref="ushort" />. + /// The source to which <paramref name="elementOffset" /> will be added before loading the vector. + /// The element offset from <paramref name="source" /> from which the vector will be loaded. + /// The vector loaded from <paramref name="source" /> plus <paramref name="elementOffset" />. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector128 LoadUnsafe(ref char source, nuint elementOffset) => + LoadUnsafe(ref Unsafe.As(ref source), elementOffset); + /// Computes the maximum of two vectors on a per-element basis. /// The type of the elements in the vector. /// The vector to compare with . @@ -2419,6 +2435,35 @@ public static Vector128 Shuffle(Vector128 vector, Vector128 return result; } + /// Creates a new vector by selecting values from an input vector using a set of indices. + /// Behavior is platform-dependent for out-of-range indices.
+ /// The input vector from which values are selected. + /// The per-element indices used to select a value from <paramref name="vector" />. + /// A new vector containing the values from <paramref name="vector" /> selected by the given <paramref name="indices" />. + /// Unlike Shuffle, this method delegates to the underlying hardware intrinsic without ensuring that <paramref name="indices" /> are normalized to [0, 15]. + /// On hardware with <see cref="Ssse3" /> support, indices are treated as modulo 16, and if the high bit is set, the result will be set to 0 for that element. + /// On hardware with <see cref="AdvSimd.Arm64" /> or <see cref="PackedSimd" /> support, this method behaves the same as Shuffle. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector128 ShuffleUnsafe(Vector128 vector, Vector128 indices) + { + if (Ssse3.IsSupported) + { + return Ssse3.Shuffle(vector, indices); + } + + if (AdvSimd.Arm64.IsSupported) + { + return AdvSimd.Arm64.VectorTableLookup(vector, indices); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.Swizzle(vector, indices); + } + + return Shuffle(vector, indices); + } + /// Creates a new vector by selecting values from an input vector using a set of indices. /// The input vector from which values are selected. /// The per-element indices used to select a value from <paramref name="vector" />. diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs index 212133baf28b33..340d20b0812327 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs @@ -1809,6 +1809,21 @@ public static Vector256 LoadUnsafe(ref T source, nuint elementOffset) return Unsafe.ReadUnaligned>(ref Unsafe.As(ref source)); } + /// Loads a vector from the given source and reinterprets it as <see cref="ushort" />. + /// The source from which the vector will be loaded. + /// The vector loaded from <paramref name="source" />.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector256 LoadUnsafe(ref char source) => + LoadUnsafe(ref Unsafe.As(ref source)); + + /// Loads a vector from the given source and element offset and reinterprets it as <see cref="ushort" />. + /// The source to which <paramref name="elementOffset" /> will be added before loading the vector. + /// The element offset from <paramref name="source" /> from which the vector will be loaded. + /// The vector loaded from <paramref name="source" /> plus <paramref name="elementOffset" />. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector256 LoadUnsafe(ref char source, nuint elementOffset) => + LoadUnsafe(ref Unsafe.As(ref source), elementOffset); + /// Computes the maximum of two vectors on a per-element basis. /// The type of the elements in the vector. /// The vector to compare with . diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs index e51370c7805383..f48659545b9d8a 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs @@ -68,7 +68,6 @@ ref Unsafe.As(ref Unsafe.Add(ref searchSpace, offset + 1)), // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285 SEARCH_TWO_CHARS: - ref ushort ushortSearchSpace = ref Unsafe.As(ref searchSpace); if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength - Vector256.Count >= 0) { // Find the last unique (which is not equal to ch1) character @@ -89,8 +88,8 @@ ref Unsafe.As(ref Unsafe.Add(ref searchSpace, offset + 1)), // Make sure we don't go out of bounds Debug.Assert(offset + ch1ch2Distance + Vector256.Count <= searchSpaceLength); - Vector256 cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance))); - Vector256 cmpCh1 = Vector256.Equals(ch1,
Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)offset)); + Vector256 cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance))); + Vector256 cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref searchSpace, (nuint)offset)); Vector256 cmpAnd = (cmpCh1 & cmpCh2).AsByte(); // Early out: cmpAnd is all zeros @@ -156,8 +155,8 @@ ref Unsafe.As(ref value), (nuint)(uint)valueLength * 2)) // Make sure we don't go out of bounds Debug.Assert(offset + ch1ch2Distance + Vector128.Count <= searchSpaceLength); - Vector128 cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance))); - Vector128 cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)offset)); + Vector128 cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance))); + Vector128 cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset)); Vector128 cmpAnd = (cmpCh1 & cmpCh2).AsByte(); // Early out: cmpAnd is all zeros @@ -254,7 +253,6 @@ ref Unsafe.As(ref Unsafe.Add(ref searchSpace, relativeIndex + 1)), // Based on http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd "Algorithm 1: Generic SIMD" by Wojciech Mula // Some details about the implementation can also be found in https://github.com/dotnet/runtime/pull/63285 SEARCH_TWO_CHARS: - ref ushort ushortSearchSpace = ref Unsafe.As(ref searchSpace); if (Vector256.IsHardwareAccelerated && searchSpaceMinusValueTailLength >= Vector256.Count) { offset = searchSpaceMinusValueTailLength - Vector256.Count; @@ -272,8 +270,8 @@ ref Unsafe.As(ref Unsafe.Add(ref searchSpace, relativeIndex + 1)), do { - Vector256 cmpCh1 = Vector256.Equals(ch1, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)offset)); - Vector256 cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance))); + Vector256 cmpCh1 = Vector256.Equals(ch1, 
Vector256.LoadUnsafe(ref searchSpace, (nuint)offset)); + Vector256 cmpCh2 = Vector256.Equals(ch2, Vector256.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance))); Vector256 cmpAnd = (cmpCh1 & cmpCh2).AsByte(); // Early out: cmpAnd is all zeros @@ -321,8 +319,8 @@ ref Unsafe.As(ref value), (nuint)(uint)valueLength * 2)) do { - Vector128 cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)offset)); - Vector128 cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)(offset + ch1ch2Distance))); + Vector128 cmpCh1 = Vector128.Equals(ch1, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset)); + Vector128 cmpCh2 = Vector128.Equals(ch2, Vector128.LoadUnsafe(ref searchSpace, (nuint)(offset + ch1ch2Distance))); Vector128 cmpAnd = (cmpCh1 & cmpCh2).AsByte(); // Early out: cmpAnd is all zeros diff --git a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs index c5433157967bc6..589c7d021e3f5d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/String.Manipulation.cs @@ -1916,7 +1916,7 @@ private static void MakeSeparatorListVectorized(ReadOnlySpan sourceSpan, r nuint offset = 0; nuint lengthToExamine = (uint)sourceSpan.Length; - ref ushort source = ref Unsafe.As(ref MemoryMarshal.GetReference(sourceSpan)); + ref char source = ref MemoryMarshal.GetReference(sourceSpan); Vector128 v1 = Vector128.Create((ushort)c); Vector128 v2 = Vector128.Create((ushort)c2); @@ -1947,7 +1947,7 @@ private static void MakeSeparatorListVectorized(ReadOnlySpan sourceSpan, r while (offset < lengthToExamine) { - char curr = (char)Unsafe.Add(ref source, offset); + char curr = Unsafe.Add(ref source, offset); if (curr == c || curr == c2 || curr == c3) { sepListBuilder.Append((int)offset);