Use simd for small prolog zeroing (ia32/x64) #32442
benaadams wants to merge 4 commits into dotnet:master
Conversation
At R2R, this changes the prologue for G_M41002_IG01 from:

```asm
sub rsp, 96
mov rsi, rcx
lea rdi, [rsp+28H]
mov ecx, 12
xor rax, rax
rep stosd
mov rcx, rsi
mov qword ptr [rsp+58H], rdx
mov rbx, rcx
mov rsi, rdx
mov rdi, r8
;; bbWeight=1 PerfScore 31.50
```

to

```asm
sub rsp, 96
xorps xmm4, xmm4
movdqu xmmword ptr [rsp+28H], xmm4
movdqu xmmword ptr [rsp+38H], xmm4
movdqu xmmword ptr [rsp+48H], xmm4
mov qword ptr [rsp+58H], rdx
mov rbx, rcx
mov rsi, rdx
mov rdi, r8
;; bbWeight=1 PerfScore 8.33
```
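In C# terms, the replacement sequence amounts to the following — a minimal sketch using `Sse2` intrinsics to mirror the `xorps` + `movdqu` pattern (illustration only; the JIT emits these instructions directly in the prolog rather than calling anything like this):

```csharp
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static unsafe class ZeroingSketch
{
    // Zero 48 bytes with one register clear and three 16-byte stores,
    // mirroring the xorps xmm4, xmm4 + 3x movdqu sequence above.
    public static void ZeroBlock48(byte* p)
    {
        Vector128<byte> zero = Vector128<byte>.Zero; // xorps xmm4, xmm4
        Sse2.Store(p, zero);      // movdqu [p],    xmm4
        Sse2.Store(p + 16, zero); // movdqu [p+16], xmm4
        Sse2.Store(p + 32, zero); // movdqu [p+32], xmm4
    }
}
```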
At Jit time (when it can use longer vectors), from:

```asm
sub rsp, 96
mov rsi, rcx
lea rdi, [rsp+20H]
mov ecx, 16
xor rax, rax
rep stosd
mov rcx, rsi
mov rsi, rcx
;; bbWeight=1 PerfScore 32.00
```

to

```asm
sub rsp, 96
vzeroupper
vxorps ymm4, ymm4
vmovdqu ymmword ptr [rsp+20H], ymm4
vmovdqu ymmword ptr [rsp+40H], ymm4
mov rsi, rcx
;; bbWeight=1 PerfScore 8.83
```
```asm
sub rsp, 96
mov rsi, rcx
lea rdi, [rsp+20H]
mov ecx, 16
xor rax, rax
rep stosd
mov rcx, rsi
;; bbWeight=1 PerfScore 30.75
```

To

```asm
sub rsp, 96
xorps xmm4, xmm4
movdqu xmmword ptr [rsp+20H], xmm4
movdqu xmmword ptr [rsp+30H], xmm4
movdqu xmmword ptr [rsp+40H], xmm4
movdqu xmmword ptr [rsp+50H], xmm4
mov rsi, rdx
;; bbWeight=1 PerfScore 8.58
```

Though at Tier1 that would convert to a smaller form of

```asm
sub rsp, 96
vxorps ymm4, ymm4
movdqu ymmword ptr [rsp+20H], ymm4
movdqu ymmword ptr [rsp+40H], ymm4
mov rsi, rdx
```
Neither looks to be listed for a specific micro-architecture. Since the current cache line size is 64 bytes, and I believe we only guarantee 16-byte stack alignment, there would be a decent chance of crossing a cache-line boundary for 2x YMM writes.
Only pointer-sized alignment? e.g. 8 bytes on x64.

Still going to hit it with 4x XMM writes; it's only removing 1 chance in a 32-byte segment (by having 2x 16, so a split in the middle).

Since the issue happens at page boundaries and the stack is 8-byte aligned, the chance is ever so slightly lower at 2 in 4096 rather than 3 in 4096?
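The odds are easy to sanity-check by brute force. A small sketch (my own arithmetic, not from the thread) counting how many placements of a single store, at a given stack alignment, cross a 4096-byte page boundary:

```csharp
using System;

class PageCrossOdds
{
    static void Main()
    {
        const int page = 4096;
        const int storeSize = 32; // one ymm store
        foreach (int alignment in new[] { 8, 16, 32 })
        {
            int crossing = 0, total = 0;
            for (int start = 0; start < page; start += alignment)
            {
                total++;
                if (start + storeSize > page) crossing++; // write spills into the next page
            }
            Console.WriteLine($"align {alignment,2}: {crossing}/{total} placements cross a page");
        }
    }
}
```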
Chose 4x SIMD size due to the codegen size rather than the performance; only using …
No, I believe the stack (for `TYP_SIMD`) has some special support and will be 16-byte aligned (maybe @CarolEidt could confirm while I try to find the relevant code).

Ah, I see; makes sense.
Could we align to 32 bytes for 64-bit? (And then not use AVX on 32-bit; not sure if it's supported there?) That would also make …
There is some information on the stack frame alignment here: https://github.com/dotnet/runtime/blob/master/src/coreclr/src/jit/lclvars.cpp#L4327 and here: https://github.com/dotnet/runtime/blob/master/src/coreclr/src/jit/lclvars.cpp#L6256. IIRC, we only properly align (on the stack, and only for x64) …
5d056d2 to 89ae26b
ABI guarantees that the stack has 16-byte alignment for x64, both on Windows and SysV, outside of the prolog and epilog, for non-leaf methods. The local var area is likewise guaranteed by the jit to be 16-byte aligned; see e.g. …
So it should be possible to 32-byte align on x64 to avoid any page-split penalties in the …
Change with a larger struct:

```csharp
private readonly Memory<byte> _expected = new byte[5];

public bool ReadOnlySequenceEqual(ReadOnlySequence<byte> ros)
{
    var span = ros.FirstSpan;
    var expected = _expected.Span;

    return (span.Length == expected.Length) ?
        SequenceEqualFast(span, expected) :
        SequenceEqualSlow(ros, expected);
}

[MethodImpl(MethodImplOptions.NoInlining)]
public static bool SequenceEqualFast(ReadOnlySpan<byte> input, ReadOnlySpan<byte> expected) => true;

[MethodImpl(MethodImplOptions.NoInlining)]
public static bool SequenceEqualSlow(ReadOnlySequence<byte> input, ReadOnlySpan<byte> expected) => true;
```

From

```asm
sub rsp, 168
vzeroupper
mov rsi, rcx
lea rdi, [rsp+28H]
mov ecx, 32
xor rax, rax
rep stosd
mov rcx, rsi
mov rdi, rcx
mov rsi, rdx
;; bbWeight=1 PerfScore 36.25
```

To

```asm
sub rsp, 168
vzeroupper
vxorps ymm4, ymm4
vmovdqu ymmword ptr [rsp+28H], ymm4
vmovdqu ymmword ptr [rsp+48H], ymm4
vmovdqu ymmword ptr [rsp+68H], ymm4
vmovdqu ymmword ptr [rsp+88H], ymm4
mov rdi, rcx
mov rsi, rdx
;; bbWeight=1 PerfScore 14.08
```
We currently don't have enough guarantees -- we'd need to implement "dynamic alignment" or similar, which comes with its own set of tradeoffs.
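For readers unfamiliar with the term: "dynamic alignment" here means aligning the frame at run time rather than relying on a static guarantee. A minimal sketch of the pointer arithmetic involved (my illustration, not the JIT's code):

```csharp
static unsafe class DynamicAlignment
{
    // Over-allocate by (alignment - 1) bytes, then round the pointer up
    // to the next multiple of `alignment` (a power of two). The cost is
    // the wasted bytes plus keeping the original pointer around --
    // the "tradeoffs" mentioned above.
    public static byte* AlignUp(byte* p, uint alignment)
        => (byte*)(((ulong)p + alignment - 1) & ~(ulong)(alignment - 1));
}
```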
So on the one hand crossing a page should be rare-ish; but on the other hand, if you do cross a page and it's in your hot-path range then you will be continuously crossing a page as the stack unwinds and rewinds, which would then make it very common (i.e. once you are locked into crossing a page, it will happen frequently). Hmm... @tannergooding, as you highlight, that probably is quite a problem 😢 Some of the structs are pretty big (e.g. …).

Don't want to add a branch to align... Could start stack sizes >= 64 unconditionally with a … However that would over-zero the stack by 16 bytes when unaligned; would that be a problem? (e.g. are there stack markers to detect overruns which would get upset if they were zeroed?)
Talking of …

```asm
G_M25757_IG01:
push rbp
push r15
push r14
push r13
push r12
push rdi
push rsi
push rbx
sub rsp, 216
vzeroupper
lea rbp, [rsp+110H]
mov rsi, rcx
lea rdi, [rbp-D8H]
mov ecx, 40
xor rax, rax
rep stosd
mov rcx, rsi
mov qword ptr [rbp-E8H], rsp
mov gword ptr [rbp+10H], rcx
mov bword ptr [rbp+18H], rdx
mov bword ptr [rbp+28H], r9
mov esi, r8d
;; bbWeight=1 PerfScore 40.50

G_M25757_IG02:
mov rcx, bword ptr [rbp+18H]
mov rdx, gword ptr [rcx]
mov rcx, bword ptr [rbp+18H]
mov rdi, gword ptr [rcx+8]
mov rcx, bword ptr [rbp+18H]
mov ebx, dword ptr [rcx+16]
and ebx, 0xD1FFAB1E
mov rcx, bword ptr [rbp+18H]
mov r14d, dword ptr [rcx+20]
and r14d, 0xD1FFAB1E
cmp rdx, rdi
je SHORT G_M25757_IG10
;; bbWeight=1 PerfScore 13.75
```

Bunch of busy work with … With ymm:

```asm
sub rsp, 216
vzeroupper
lea rbp, [rsp+110H]
vxorps ymm4, ymm4
vmovdqu ymmword ptr [rbp-D8H], ymm4
vmovdqu ymmword ptr [rbp-B8H], ymm4
vmovdqu ymmword ptr [rbp-98H], ymm4
vmovdqu ymmword ptr [rbp-78H], ymm4
vmovdqu ymmword ptr [rbp-58H], ymm4
```

However only using xmm:

```asm
sub rsp, 216
vzeroupper
lea rbp, [rsp+110H]
vxorps xmm4, xmm4
vmovdqu xmmword ptr [rbp-D8H], xmm4
vmovdqu xmmword ptr [rbp-C8H], xmm4
vmovdqu xmmword ptr [rbp-B8H], xmm4
vmovdqu xmmword ptr [rbp-A8H], xmm4
vmovdqu xmmword ptr [rbp-98H], xmm4
vmovdqu xmmword ptr [rbp-88H], xmm4
vmovdqu xmmword ptr [rbp-78H], xmm4
vmovdqu xmmword ptr [rbp-68H], xmm4
vmovdqu xmmword ptr [rbp-58H], xmm4
vmovdqu xmmword ptr [rbp-48H], xmm4
```
Now produces:

```asm
sub rsp, 216
vzeroupper
lea rbp, [rsp+110H]
vxorps ymm4, ymm4
vmovdqu xmmword ptr [rbp-D8H], xmm4 ; Zero first 16 bytes
lea rax, [rbp-D8H]                  ; Get block start
and rax, 31                         ; Get offset for 32-byte alignment of block (is 16-byte aligned and first 16 are already zeroed)
add rax, -216                       ; Add back offset
add rax, rbp                        ; Add back stack pointer
vmovdqu ymmword ptr [rax], ymm4
vmovdqu ymmword ptr [rax+32], ymm4
vmovdqu ymmword ptr [rax+64], ymm4
vmovdqu ymmword ptr [rax+96], ymm4
vmovdqu ymmword ptr [rax+128], ymm4
```

@tannergooding would that work?
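The address arithmetic checks out on paper: assuming the block start is 16-byte aligned (as discussed above), `start + (start & 31)` is always 32-byte aligned, and the 0 or 16 bytes skipped are exactly what the leading xmm store already zeroed. A quick check (my own verification, not from the PR):

```csharp
using System;

class AlignmentMathCheck
{
    static void Main()
    {
        // Try a 32-byte-aligned and a 16-byte-but-not-32-byte-aligned block start.
        foreach (ulong blockStart in new ulong[] { 0x7FFF_0000, 0x7FFF_0010 })
        {
            ulong offset = blockStart & 31;       // and rax, 31
            ulong ymmStart = blockStart + offset; // add rax, -216 ; add rax, rbp (folded)
            Console.WriteLine($"start 0x{blockStart:X}: ymm stores begin at 0x{ymmStart:X}, " +
                              $"32-byte aligned = {ymmStart % 32 == 0}, bytes covered by xmm = {offset}");
        }
    }
}
```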
6856159 to 2f06da8
Probably don't want to use non-temporal stores? Not having visibility from other procs is fine (since it's the thread's stack); however, you likely want it in the cache as it's going to be immediately used?
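For context, the distinction being discussed, sketched with `Sse2` intrinsics (illustration only; the PR does not use non-temporal stores):

```csharp
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

static unsafe class StoreKinds
{
    // `p16` must be 16-byte aligned for the non-temporal form.
    public static void Example(byte* p16)
    {
        Vector128<byte> zero = Vector128<byte>.Zero;
        Sse2.Store(p16, zero);                   // normal store: goes through the cache
        Sse2.StoreAlignedNonTemporal(p16, zero); // movntdq: bypasses the cache, so an
                                                 // immediately following read must fetch
                                                 // the line back from memory
    }
}
```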
Maybe? I wonder if this might cause store buffer stalls.
Also 32-byte aligns the `rep stosd`:

```asm
lea rdi, [rbp-98H]
mov ecx, 26
xor rax, rax
rep stosd
```

To

```asm
lea rdi, [rbp-98H]
xorps xmm4, xmm4
movdqu xmmword ptr [rbp-98H], xmm4
and rdi, 31
mov ecx, 26
sub rcx, rdi
add rdi, -152
add rdi, rbp
xor rax, rax
rep stosd
```
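One thing to watch in the sequence above (and, I believe, what the later "bytes and dwords" comment is about): `rdi & 31` is a byte offset, while `ecx` counts dwords for `rep stosd`, so the count adjustment needs a divide by four. A hedged sketch of the intended arithmetic:

```csharp
using System;

class RepStosdCountCheck
{
    static void Main()
    {
        int totalDwords = 26;  // mov ecx, 26
        int byteOffset = 16;   // example value of (rdi & 31) after realignment
        // `sub rcx, rdi` above subtracts *bytes* from a *dword* count;
        // the adjustment should be in dwords:
        int adjustedDwords = totalDwords - byteOffset / 4;
        Console.WriteLine($"zero {adjustedDwords} dwords from the realigned pointer " +
                          $"(first {byteOffset} bytes already zeroed by the xmm store)");
    }
}
```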
Crossgen of corelib, which is non-AVX, is …
I would be ok to take this regression for this fix.
Haven't got it quite right...
Perhaps not everything is fully 16-byte aligned; not able to trigger this failure in a simple program yet. @AndyAyersMS, when you say: …

Is there a way to detect it's a leaf method? Then I can have more variable realignment but keep it to leaf methods (i.e. not assume it's 16-byte aligned).
Think I can use?

```cpp
bool is16byteAligned = (compiler->compLclFrameSize % 16) == 0;
```
For Windows x64, I think you can check if … You could also add explicit tracking in …
5392cf1 to 4f5a326
I've changed it to assume no alignment and to always fully align with 2x xmm at start for … It does make it regress further slightly.
Might wait to see if tests pass before breaking it down more 😅 |
Decided to do that 😄 #32538 |
Closing in preference of #32538
Ah... might be an issue between bytes and dwords.
Will do it in the other one.

rep stosd has low throughput if the memory is not 32-byte aligned; however, it has a compact code size.

Use faster-throughput SIMD instructions when the amount to zero in the prologue is low, moving back to rep stosd when it becomes too many instructions.
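A minimal sketch of that size cutoff, assuming the "4x SIMD size" threshold mentioned earlier in the thread (names are mine, not the JIT's):

```csharp
// Pick the prologue zeroing strategy for a block of `blockBytes`,
// where `simdBytes` is 16 (xmm) or 32 (ymm). Beyond ~4 stores,
// rep stosd wins on code size even though its throughput is lower.
static string ChooseZeroingStrategy(int blockBytes, int simdBytes)
{
    int cutoff = 4 * simdBytes;
    return blockBytes <= cutoff ? "simd stores" : "rep stosd";
}
```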
Also align `rep stosd` to 32 bytes when it is used.

Best checked with ignore whitespace (due to new `if/else` indenting).

Contributes to #8890
/cc @erozenfeld not entirely sure what I'm doing :)