Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Can Unsafe.As do-not-enreg[SVB] ld-addr-op be elided on inlining? #10029

Closed
benaadams opened this issue Mar 25, 2018 · 5 comments
Closed

Can Unsafe.As do-not-enreg[SVB] ld-addr-op be elided on inlining? #10029

benaadams opened this issue Mar 25, 2018 · 5 comments
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI optimization
Milestone

Comments

@benaadams
Copy link
Member

benaadams commented Mar 25, 2018

Using Unsafe.As(ref causes variable to be marked as do-not-enreg[SVB] ld-addr-op.

Can this be elided through inlining?

Example dotnet/corefx#25510 (comment)

public static Quaternion Normalize(Quaternion value)
{
    Vector4 q = Unsafe.As<Quaternion, Vector4>(ref value);
    q = Vector4.Normalize(q);
    return Unsafe.As<Vector4, Quaternion>(ref q);
}
public static Quaternion Normalize_New()
{
    Quaternion start = new Quaternion(8.5f, 9.4f, 1.2f, 1f);

    Quaternion c1 = Quaternion.Normalize(start);
    Quaternion c2 = Quaternion.Normalize(c1);
    Quaternion c3 = Quaternion.Normalize(c2);
    return Quaternion.Normalize(c3);
}
; Assembly listing for method Program:Normalize_New():struct
;
;  V00 RetBuf       [V00,T05] (  4,  4   )   byref  ->  rcx        
;* V01 loc0         [V01    ] (  0,  0   )  struct (16) zero-ref   
;  V02 tmp1         [V02,T06] (  2,  4   )  struct (16) [rsp+0xA8]   do-not-enreg[SB]
;  V03 tmp2         [V03,T07] (  2,  4   )  struct (16) [rsp+0x98]   do-not-enreg[SB]
;  V04 tmp3         [V04,T08] (  2,  4   )  struct (16) [rsp+0x88]   do-not-enreg[SB]
;  V05 tmp4         [V05    ] (  2,  4   )  struct (16) [rsp+0x78]   do-not-enreg[XSVB] addr-exposed ld-addr-op
;  V06 tmp5         [V06,T16] (  2,  2   )  simd16  ->  [rsp+0x60]   do-not-enreg[SB] ld-addr-op
;  V07 tmp6         [V07,T17] (  2,  2   )  simd16  ->  mm0        
;  V08 tmp7         [V08,T01] (  4,  8   )  simd16  ->  mm0         ld-addr-op
;* V09 tmp8         [V09    ] (  0,  0   )   float  ->  zero-ref   
;  V10 tmp9         [V10,T24] (  2,  2   )   float  ->  mm1        
;  V11 tmp10        [V11,T25] (  2,  2   )   float  ->  mm1        
;* V12 tmp11        [V12    ] (  0,  0   )  simd16  ->  zero-ref   
;  V13 tmp12        [V13,T09] (  2,  4   )  simd16  ->  mm1        
;  V14 tmp13        [V14,T10] (  2,  4   )  struct (16) [rsp+0x50]   do-not-enreg[SVB] ld-addr-op
;  V15 tmp14        [V15,T18] (  2,  2   )  simd16  ->  [rsp+0x40]   do-not-enreg[SB] ld-addr-op
;  V16 tmp15        [V16,T19] (  2,  2   )  simd16  ->  mm0        
;  V17 tmp16        [V17,T02] (  4,  8   )  simd16  ->  mm0         ld-addr-op
;* V18 tmp17        [V18    ] (  0,  0   )   float  ->  zero-ref   
;  V19 tmp18        [V19,T26] (  2,  2   )   float  ->  mm1        
;  V20 tmp19        [V20,T27] (  2,  2   )   float  ->  mm1        
;* V21 tmp20        [V21    ] (  0,  0   )  simd16  ->  zero-ref   
;  V22 tmp21        [V22,T11] (  2,  4   )  simd16  ->  mm1        
;  V23 tmp22        [V23,T12] (  2,  4   )  struct (16) [rsp+0x30]   do-not-enreg[SVB] ld-addr-op
;  V24 tmp23        [V24,T20] (  2,  2   )  simd16  ->  [rsp+0x20]   do-not-enreg[SB] ld-addr-op
;  V25 tmp24        [V25,T21] (  2,  2   )  simd16  ->  mm0        
;  V26 tmp25        [V26,T03] (  4,  8   )  simd16  ->  mm0         ld-addr-op
;* V27 tmp26        [V27    ] (  0,  0   )   float  ->  zero-ref   
;  V28 tmp27        [V28,T28] (  2,  2   )   float  ->  mm1        
;  V29 tmp28        [V29,T29] (  2,  2   )   float  ->  mm1        
;* V30 tmp29        [V30    ] (  0,  0   )  simd16  ->  zero-ref   
;  V31 tmp30        [V31,T13] (  2,  4   )  simd16  ->  mm1        
;  V32 tmp31        [V32,T14] (  2,  4   )  struct (16) [rsp+0x10]   do-not-enreg[SVB] ld-addr-op
;  V33 tmp32        [V33,T22] (  2,  2   )  simd16  ->  [rsp+0x00]   do-not-enreg[SB] ld-addr-op
;  V34 tmp33        [V34,T23] (  2,  2   )  simd16  ->  mm0        
;  V35 tmp34        [V35,T04] (  4,  8   )  simd16  ->  mm0         ld-addr-op
;* V36 tmp35        [V36    ] (  0,  0   )   float  ->  zero-ref   
;  V37 tmp36        [V37,T30] (  2,  2   )   float  ->  mm1        
;  V38 tmp37        [V38,T31] (  2,  2   )   float  ->  mm1        
;* V39 tmp38        [V39    ] (  0,  0   )  simd16  ->  zero-ref   
;  V40 tmp39        [V40,T15] (  2,  4   )  simd16  ->  mm1        
;  V41 tmp40        [V41,T32] (  2,  2   )   float  ->  mm0         V01.X(offs=0x00) P-INDEP
;  V42 tmp41        [V42,T33] (  2,  2   )   float  ->  mm1         V01.Y(offs=0x04) P-INDEP
;  V43 tmp42        [V43,T34] (  2,  2   )   float  ->  mm2         V01.Z(offs=0x08) P-INDEP
;  V44 tmp43        [V44,T35] (  2,  2   )   float  ->  mm3         V01.W(offs=0x0c) P-INDEP
;  V45 tmp44        [V45,T00] (  5, 10   )   byref  ->  rax         stack-byref
;# V46 OutArgs      [V46    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]  
;
; Lcl frame size = 184

G_M39231_IG01:
       4881ECB8000000       sub      rsp, 184
       C5F877               vzeroupper 

G_M39231_IG02:
       C4E17A10055D010000   vmovss   xmm0, dword ptr [reloc @RWD00]
       C4E17A100D58010000   vmovss   xmm1, dword ptr [reloc @RWD04]
       C4E17A101553010000   vmovss   xmm2, dword ptr [reloc @RWD08]
       C4E17A101D4E010000   vmovss   xmm3, dword ptr [reloc @RWD12]
       488D442478           lea      rax, bword ptr [rsp+78H]
       C4E17A1100           vmovss   dword ptr [rax], xmm0
       C4E17A114804         vmovss   dword ptr [rax+4], xmm1
       C4E17A115008         vmovss   dword ptr [rax+8], xmm2
       C4E17A11580C         vmovss   dword ptr [rax+12], xmm3
       C4E17910442478       vmovupd  xmm0, xmmword ptr [rsp+78H]
       C4E17828C8           vmovaps  xmm1, xmm0
       C4E37140C8F1         vdpps    xmm1, xmm0, 241
       C4E17251C9           vsqrtss  xmm1, xmm1
       C4E27918C9           vbroadcastss xmm1, xmm1
       C4E1785EC1           vdivps   xmm0, xmm1
       C4E17929442460       vmovapd  xmmword ptr [rsp+60H], xmm0  ; stack shuffle
       C4E17A6F442460       vmovdqu  xmm0, qword ptr [rsp+60H]    ; stack shuffle
       C4E17A7F8424A8000000 vmovdqu  qword ptr [rsp+A8H], xmm0    ; stack shuffle
       C4E17A6F8424A8000000 vmovdqu  xmm0, qword ptr [rsp+A8H]    ; stack shuffle
       C4E17A7F442450       vmovdqu  qword ptr [rsp+50H], xmm0    ; stack shuffle
       C4E17910442450       vmovupd  xmm0, xmmword ptr [rsp+50H]  ; stack shuffle
       C4E17828C8           vmovaps  xmm1, xmm0
       C4E37140C8F1         vdpps    xmm1, xmm0, 241
       C4E17251C9           vsqrtss  xmm1, xmm1
       C4E27918C9           vbroadcastss xmm1, xmm1
       C4E1785EC1           vdivps   xmm0, xmm1
       C4E17929442440       vmovapd  xmmword ptr [rsp+40H], xmm0  ; stack shuffle
       C4E17A6F442440       vmovdqu  xmm0, qword ptr [rsp+40H]    ; stack shuffle
       C4E17A7F842498000000 vmovdqu  qword ptr [rsp+98H], xmm0    ; stack shuffle
       C4E17A6F842498000000 vmovdqu  xmm0, qword ptr [rsp+98H]    ; stack shuffle
       C4E17A7F442430       vmovdqu  qword ptr [rsp+30H], xmm0    ; stack shuffle
       C4E17910442430       vmovupd  xmm0, xmmword ptr [rsp+30H]  ; stack shuffle
       C4E17828C8           vmovaps  xmm1, xmm0
       C4E37140C8F1         vdpps    xmm1, xmm0, 241
       C4E17251C9           vsqrtss  xmm1, xmm1
       C4E27918C9           vbroadcastss xmm1, xmm1
       C4E1785EC1           vdivps   xmm0, xmm1
       C4E17929442420       vmovapd  xmmword ptr [rsp+20H], xmm0  ; stack shuffle
       C4E17A6F442420       vmovdqu  xmm0, qword ptr [rsp+20H]    ; stack shuffle
       C4E17A7F842488000000 vmovdqu  qword ptr [rsp+88H], xmm0    ; stack shuffle
       C4E17A6F842488000000 vmovdqu  xmm0, qword ptr [rsp+88H]    ; stack shuffle
       C4E17A7F442410       vmovdqu  qword ptr [rsp+10H], xmm0    ; stack shuffle
       C4E17910442410       vmovupd  xmm0, xmmword ptr [rsp+10H]  ; stack shuffle
       C4E17828C8           vmovaps  xmm1, xmm0
       C4E37140C8F1         vdpps    xmm1, xmm0, 241
       C4E17251C9           vsqrtss  xmm1, xmm1
       C4E27918C9           vbroadcastss xmm1, xmm1
       C4E1785EC1           vdivps   xmm0, xmm1
       C4E179290424         vmovapd  xmmword ptr [rsp], xmm0
       C4E17A6F0424         vmovdqu  xmm0, qword ptr [rsp]
       C4E17A7F01           vmovdqu  qword ptr [rcx], xmm0
       488BC1               mov      rax, rcx

G_M39231_IG03:
       4881C4B8000000       add      rsp, 184
       C3                   ret      

; Total bytes of code 357, prolog size 10 for method Program:Normalize_New():struct

Or is there a better approach? (e.g. Unsafe.Read requires fixed and Unsafe.ReadUnaligned is a double ref also requires Unsafe.As<Quaternion, byte>)

/cc @CarolEidt @AndyAyersMS

category:cq
theme:structs
skill-level:expert
cost:medium
impact:medium

@AndyAyersMS
Copy link
Member

Took a quick look and am not really sure there is any easy way to improve on this. But I could be overlooking something.

The use of Unsafe.As effectively hides the fact that the fields of the structs are being accessed, and so the promotion heuristic does not fire:

Not promoting promotable struct local V02: #fields = 4, fieldAccessed = 0.

... etc ...

For structs with 3 or fewer fields the jit will promote even without seeing evidence of individual field accesses, but when there are 4 or more the field access evidence is needed.

Changing the limit from 3 to 4 doesn't materially improve things as the struct copies don't get properly optimized even when the structs get promoted. We are not really prepared to handle a simd type with independently nameable sub-parts. So possibly further changes in the copy heuristic are needed to realize that simd copies can be made efficiently via block copy instead of field by field. And there may still be other issues beyond this. But perhaps it is viable.

Or, possibly, there needs to be some higher-level logic in the jit that realizes that the Quaternion is isomorphic to a simd type and so can be treated as one over larger stretches of code so the conversions in and out of different isomorphic type forms don't manifest as actual copies. That sort of type propagation can be effective if the meat and potatoes operations on the more abstract type are all internally recast as simd. But if not (say only a few operations are converted) then this starts to verge into the territory of being able to automatically recognize simd operations in non-simd code, and that is not an easy road either.

Maybe Carol has some other ideas...

@benaadams
Copy link
Member Author

I was hoping to get the output looking more like what happens when you switch it for Vector4 where the stack isn't really used and everything stays in registers:

; Assembly listing for method Program:Normalize_Vector4():struct
; ...
; Lcl frame size = 0

G_M3011_IG01:
       C5F877               vzeroupper 

G_M3011_IG02:
       C4E17A1005CC000000   vmovss   xmm0, dword ptr [reloc @RWD00]
       C4E17A100DC7000000   vmovss   xmm1, dword ptr [reloc @RWD04]
       C4E17A1015C2000000   vmovss   xmm2, dword ptr [reloc @RWD08]
       C4E17A101DBD000000   vmovss   xmm3, dword ptr [reloc @RWD12]
       C4E15857E4           vxorps   xmm4, xmm4
       C4E15A10E3           vmovss   xmm4, xmm4, xmm3
       C4E15973FC04         vpslldq  xmm4, 4
       C4E15A10E2           vmovss   xmm4, xmm4, xmm2
       C4E15973FC04         vpslldq  xmm4, 4
       C4E15A10E1           vmovss   xmm4, xmm4, xmm1
       C4E15973FC04         vpslldq  xmm4, 4
       C4E15A10E0           vmovss   xmm4, xmm4, xmm0
       C4E17828C4           vmovaps  xmm0, xmm4
       C4E17828C8           vmovaps  xmm1, xmm0
       C4E37140C8F1         vdpps    xmm1, xmm0, 241
       C4E17251C9           vsqrtss  xmm1, xmm1
       C4E27918C9           vbroadcastss xmm1, xmm1
       C4E1785EC1           vdivps   xmm0, xmm1
       C4E17828C8           vmovaps  xmm1, xmm0
       C4E37140C8F1         vdpps    xmm1, xmm0, 241
       C4E17251C9           vsqrtss  xmm1, xmm1
       C4E27918C9           vbroadcastss xmm1, xmm1
       C4E1785EC1           vdivps   xmm0, xmm1
       C4E17828C8           vmovaps  xmm1, xmm0
       C4E37140C8F1         vdpps    xmm1, xmm0, 241
       C4E17251C9           vsqrtss  xmm1, xmm1
       C4E27918C9           vbroadcastss xmm1, xmm1
       C4E1785EC1           vdivps   xmm0, xmm1
       C4E17828C8           vmovaps  xmm1, xmm0
       C4E37140C8F1         vdpps    xmm1, xmm0, 241
       C4E17251C9           vsqrtss  xmm1, xmm1
       C4E27918C9           vbroadcastss xmm1, xmm1
       C4E1785EC1           vdivps   xmm0, xmm1
       C4E1791101           vmovupd  xmmword ptr [rcx], xmm0
       488BC1               mov      rax, rcx

G_M3011_IG03:
       C3                   ret      

; Total bytes of code 200, prolog size 3 for method Program:Normalize_Vector4():struct

@benaadams
Copy link
Member Author

Also in a pattern that worked for general structs rather than just the Quaternion one; for example rows of matrixes could also then use the approach

@mikedn
Copy link
Contributor

mikedn commented Mar 26, 2018

Apparently not reusing q produces slightly better code:

Vector4 q = Unsafe.As<Quaternion, Vector4>(ref value);
Vector4 r = Vector4.Normalize(q);
return Unsafe.As<Vector4, Quaternion>(ref r);

I haven't investigated to see why is that.

@msftgits msftgits transferred this issue from dotnet/coreclr Jan 31, 2020
@msftgits msftgits added this to the Future milestone Jan 31, 2020
@jakobbotsch
Copy link
Member

Codegen today is

; Assembly listing for method Program:Normalize_New():System.Numerics.Quaternion
; Emitting BLENDED_CODE for X64 with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; 0 inlinees with PGO data; 4 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
;  V00 RetBuf       [V00,T00] (  4,  4   )   byref  ->  rcx         single-def
;# V01 OutArgs      [V01    ] (  1,  1   )  struct ( 0) [rsp+00H]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V02 tmp1         [V02    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "NewObj constructor temp"
;* V03 tmp2         [V03    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
;* V04 tmp3         [V04    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
;* V05 tmp4         [V05    ] (  0,  0   )  simd16  ->  zero-ref    "spilled call-like call argument"
;* V06 tmp5         [V06    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inlining Arg"
;* V07 tmp6         [V07,T04] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline stloc first use temp"
;* V08 tmp7         [V08    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inlining Arg"
;  V09 tmp8         [V09,T01] (  4,  4   )  simd16  ->  mm0         ld-addr-op "Inline stloc first use temp"
;* V10 tmp9         [V10    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inlining Arg"
;  V11 tmp10        [V11,T02] (  4,  4   )  simd16  ->  mm0         ld-addr-op "Inline stloc first use temp"
;* V12 tmp11        [V12    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inlining Arg"
;  V13 tmp12        [V13,T03] (  4,  4   )  simd16  ->  mm0         ld-addr-op "Inline stloc first use temp"
;
; Lcl frame size = 0

G_M50836_IG01:  ;; offset=0000H
       vzeroupper
                                                ;; size=3 bbWeight=1 PerfScore 1.00
G_M50836_IG02:  ;; offset=0003H
       vmovups  xmm0, xmmword ptr [reloc @RWD00]
       vdpps    xmm0, xmm0, xmmword ptr [reloc @RWD00], -1
       vsqrtps  xmm0, xmm0
       vmovups  xmm1, xmmword ptr [reloc @RWD00]
       vdivps   xmm0, xmm1, xmm0
       vdpps    xmm1, xmm0, xmm0, -1
       vsqrtps  xmm1, xmm1
       vdivps   xmm0, xmm0, xmm1
       vdpps    xmm1, xmm0, xmm0, -1
       vsqrtps  xmm1, xmm1
       vdivps   xmm0, xmm0, xmm1
       vdpps    xmm1, xmm0, xmm0, -1
       vsqrtps  xmm1, xmm1
       vdivps   xmm0, xmm0, xmm1
       vmovups  xmmword ptr [rcx], xmm0
       mov      rax, rcx
                                                ;; size=83 bbWeight=1 PerfScore 140.25
G_M50836_IG03:  ;; offset=0056H
       ret
                                                ;; size=1 bbWeight=1 PerfScore 1.00
RWD00   dq      4116666641080000h, 3F8000003F99999Ah


; Total bytes of code 87, prolog size 3, PerfScore 150.95, instruction count 18, allocated bytes for code 87 (MethodHash=d479396b) for method Program:Normalize_New():System.Numerics.Quaternion
; ============================================================

@ghost ghost locked as resolved and limited conversation to collaborators Jul 17, 2023
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI optimization
Projects
None yet
Development

No branches or pull requests

5 participants