Unify unroll limits in a single entry point #83274
Conversation
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak

Fixes #82529?
Ah, didn't see this one. Yeah, it does. It zeroes:

xor eax, eax
vxorps ymm0, ymm0, ymm0
vmovdqu ymmword ptr [rdx], ymm0
vmovdqu ymmword ptr [rdx+20H], ymm0
vmovdqu ymmword ptr [rdx+40H], ymm0
vmovdqu ymmword ptr [rdx+60H], ymm0
vmovdqu ymmword ptr [rdx+80H], ymm0
mov qword ptr [rdx+A0H], rax

but only with AVX or on arm64.
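For reference, a minimal repro sketch of code that produces this shape. The struct name and its 168-byte size are assumptions inferred from the store offsets above (5 x 32-byte ymm stores plus a final qword at [rdx+A0H]), not taken from the PR:

struct Block168
{
    // Hypothetical 21 * 8-byte fields = 168 bytes total.
    public long F00, F01, F02, F03, F04, F05, F06,
                F07, F08, F09, F10, F11, F12, F13,
                F14, F15, F16, F17, F18, F19, F20;
}

// Returning `default` makes the JIT zero the caller-provided return buffer
// (passed in rdx on x64), yielding codegen like the listing above.
static Block168 Zero() => default;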
Benchmarks:

Memset:

using System.Runtime.CompilerServices;
using BenchmarkDotNet.Attributes;

public unsafe class MemsetBenchmarks
{
{
private static readonly byte[] Data1 = new byte[1024];
[Benchmark] public void Memset8() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 8);
[Benchmark] public void Memset10() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 10);
[Benchmark] public void Memset14() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 14);
[Benchmark] public void Memset16() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 16);
[Benchmark] public void Memset17() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 17);
[Benchmark] public void Memset20() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 20);
[Benchmark] public void Memset32() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 32);
[Benchmark] public void Memset33() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 33);
[Benchmark] public void Memset40() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 40);
[Benchmark] public void Memset50() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 50);
[Benchmark] public void Memset64() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 64);
[Benchmark] public void Memset65() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 65);
[Benchmark] public void Memset80() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 80);
[Benchmark] public void Memset90() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 90);
[Benchmark] public void Memset110() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 110);
[Benchmark] public void Memset128() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 128);
[Benchmark] public void Memset129() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 129);
[Benchmark] public void Memset200() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 200);
[Benchmark] public void Memset256() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 256);
[Benchmark] public void Memset257() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 257);
[Benchmark] public void Memset300() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 300);
[Benchmark] public void Memset400() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 400);
[Benchmark] public void Memset512() => Unsafe.InitBlockUnaligned(ref Data1[0], 0, 512);
}

Memcpy:

public unsafe class MemcpyBenchmarks
{
{
private static readonly byte[] Data1 = new byte[1024];
private static readonly byte[] Data2 = new byte[1024];
[Benchmark] public void Memcpy8() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 8);
[Benchmark] public void Memcpy10() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 10);
[Benchmark] public void Memcpy14() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 14);
[Benchmark] public void Memcpy16() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 16);
[Benchmark] public void Memcpy17() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 17);
[Benchmark] public void Memcpy20() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 20);
[Benchmark] public void Memcpy32() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 32);
[Benchmark] public void Memcpy33() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 33);
[Benchmark] public void Memcpy40() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 40);
[Benchmark] public void Memcpy50() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 50);
[Benchmark] public void Memcpy64() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 64);
[Benchmark] public void Memcpy65() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 65);
[Benchmark] public void Memcpy80() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 80);
[Benchmark] public void Memcpy90() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 90);
[Benchmark] public void Memcpy110() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 110);
[Benchmark] public void Memcpy128() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 128);
[Benchmark] public void Memcpy129() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 129);
[Benchmark] public void Memcpy200() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 200);
[Benchmark] public void Memcpy256() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 256);
[Benchmark] public void Memcpy257() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 257);
[Benchmark] public void Memcpy300() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 300);
[Benchmark] public void Memcpy400() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 400);
[Benchmark] public void Memcpy512() => Unsafe.CopyBlockUnaligned(ref Data1[0], ref Data2[0], 512);
}

Verified on: Core i7 8700K, Core i9 9980HK; planning to test on Ryzen 7950X.
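To run these, a standard BenchmarkDotNet entry point works (the project setup itself is assumed):

using BenchmarkDotNet.Running;

public class Program
{
    public static void Main()
    {
        // Runs every [Benchmark] method in each class and prints a summary table.
        BenchmarkRunner.Run<MemsetBenchmarks>();
        BenchmarkRunner.Run<MemcpyBenchmarks>();
    }
}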
@dotnet/jit-contrib PTAL. Visible things this PR fixes:

Diffs are not too big outside of the coreclr_tests collection (around +2k-3k for libraries.pmi). To improve some of them I filed:
A typical size regression looks like this:

mov qword ptr [rbp-C8H], rdx
mov rdx, bword ptr [rbp+18H]
; byrRegs +[rdx]
- lea rcx, bword ptr [rbp-B8H]
- ; byrRegs +[rcx]
- mov r8d, 80
- call CORINFO_HELP_MEMCPY
- ; byrRegs -[rcx rdx]
+ vmovdqu ymm0, ymmword ptr[rdx]
+ vmovdqu ymmword ptr[rbp-B8H], ymm0
+ vmovdqu ymm0, ymmword ptr[rdx+20H]
+ vmovdqu ymmword ptr[rbp-98H], ymm0
+ vmovdqu xmm0, xmmword ptr [rdx+40H]
+ vmovdqu xmmword ptr [rbp-78H], xmm0
mov rdx, qword ptr [rbp-C0H]
+ ; byrRegs -[rdx]
mov r8, qword ptr [rbp-C8H]
lea r9, [rbp-B8H]
lea rcx, [rbp-40H]

which is 2x faster on all machines I tested. There are several cases where unrolling produces more compact code than the helper call. Diffs are mostly negative for ARM64.
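The diff above replaces an 80-byte CORINFO_HELP_MEMCPY call with two 32-byte ymm moves and one 16-byte xmm move. A hypothetical C# shape that produces this pattern (the struct name and field layout are illustrative, not from the PR):

// Illustrative 80-byte struct (10 * 8-byte fields).
struct Block80
{
    public long F0, F1, F2, F3, F4, F5, F6, F7, F8, F9;
}

// A by-value copy of a constant 80-byte struct: previously emitted as a
// CORINFO_HELP_MEMCPY call, now unrolled into 2 ymm moves + 1 xmm move.
static Block80 Copy(in Block80 src) => src;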
@tannergooding @dotnet/jit-contrib PTAL
The current limits were a bit odd, e.g. a hard limit of 128 bytes on x64 regardless of whether the machine supports AVX (where 32-byte moves need half as many instructions).
Also, the new limits overall match what native compilers do for memset/memcpy unrolling under -Os (size-aware optimization): https://godbolt.org/z/dW1qqaP9a
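To make the instruction-count point concrete, a sketch (the helper name and the 128-byte size are chosen for illustration): a constant 128-byte copy unrolls into 8 load/store pairs with 16-byte SSE vectors, but only 4 with 32-byte AVX vectors.

using System.Runtime.CompilerServices;

// A constant-size 128-byte copy. Unrolled with 16-byte (xmm) moves this is
// 8 load/store pairs; with 32-byte (ymm) moves it is 4, i.e. half as many.
static unsafe void Copy128(byte* dst, byte* src) =>
    Unsafe.CopyBlockUnaligned(dst, src, 128);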
Closes #82529