-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Closed
Labels
area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Milestone
Description
Consider the following code:
// Repro case 1: doubles x with a Vector256<int> addition, then calls the
// QueryPerformanceFrequency P/Invoke directly from this method and returns
// the doubled vector. The int returned by the P/Invoke is discarded.
public Vector256<int> M(Vector256<int> x, long* lpFrequency)
{
x += x;
// Direct P/Invoke call made from the same method that operates on Vector256<int>.
QueryPerformanceFrequency(lpFrequency);
return x;
}
// Repro case 2: same as M, except the P/Invoke is not called directly here;
// it is reached through the non-inlined wrapper N. The int returned by N is
// discarded.
public Vector256<int> M2(Vector256<int> x, long* lpFrequency)
{
x += x;
// Managed call to N, which in turn performs the P/Invoke.
N(lpFrequency);
return x;
}
// Wrapper that forwards lpFrequency to the QueryPerformanceFrequency P/Invoke
// and returns its int result. Marked NoInlining so that callers reach the
// P/Invoke through a real call to N; N itself uses no vector types.
[MethodImpl(MethodImplOptions.NoInlining)]
public static int N(long* lpFrequency)
{
return QueryPerformanceFrequency(lpFrequency);
}
[DllImport("kernel32", ExactSpelling = true)]
[SuppressGCTransition]
public static extern int QueryPerformanceFrequency(long* lpFrequency);

For M, because the call to QueryPerformanceFrequency is direct and because Vector256<int> is used directly in the function, we get a vzeroupper immediately before the P/Invoke is called:
; Method C:M(System.Runtime.Intrinsics.Vector256`1[int],ulong):System.Runtime.Intrinsics.Vector256`1[int]:this
G_M000_IG01: ;; offset=0000H
55 push rbp
57 push rdi
56 push rsi
4883EC20 sub rsp, 32
C5F877 vzeroupper
488D6C2430 lea rbp, [rsp+30H]
48895518 mov bword ptr [rbp+18H], rdx
498BF0 mov rsi, r8
G_M000_IG02: ;; offset=0016H
C5FC1006 vmovups ymm0, ymmword ptr[rsi]
C5FDFEC0 vpaddd ymm0, ymm0, ymm0
48897520 mov bword ptr [rbp+20H], rsi
C5FC1106 vmovups ymmword ptr[rsi], ymm0
498BC9 mov rcx, r9
48B880474922FE7F0000 mov rax, 0x7FFE22494780
G_M000_IG03: ;; offset=0033H
C5F877 vzeroupper
FFD0 call rax ; C:QueryPerformanceFrequency(ulong):int
488B7520 mov rsi, bword ptr [rbp+20H]
C5FC1006 vmovups ymm0, ymmword ptr[rsi]
488B7D18 mov rdi, bword ptr [rbp+18H]
C5FC1107 vmovups ymmword ptr[rdi], ymm0
833DB59EE15F00 cmp dword ptr [(reloc 0x7ffcd12fd8a4)], 0
750E jne SHORT G_M000_IG06
G_M000_IG04: ;; offset=0051H
488BC7 mov rax, rdi
G_M000_IG05: ;; offset=0054H
C5F877 vzeroupper
4883C420 add rsp, 32
5E pop rsi
5F pop rdi
5D pop rbp
C3 ret
G_M000_IG06: ;; offset=005FH
E85C79AA5F call CORINFO_HELP_POLL_GC
EBEB jmp SHORT G_M000_IG04
; Total bytes of code: 102

However, for M2, because the call to the P/Invoke is hidden behind the non-inlined method N, and because N does not itself use any instructions requiring YMM registers, we miss out on this and do not emit an additional vzeroupper before the native call:
; Method C:M2(System.Runtime.Intrinsics.Vector256`1[int],ulong):System.Runtime.Intrinsics.Vector256`1[int]:this
G_M000_IG01: ;; offset=0000H
57 push rdi
56 push rsi
4883EC28 sub rsp, 40
C5F877 vzeroupper
488BFA mov rdi, rdx
498BF0 mov rsi, r8
G_M000_IG02: ;; offset=000FH
C5FC1006 vmovups ymm0, ymmword ptr[rsi]
C5FDFEC0 vpaddd ymm0, ymm0, ymm0
C5FC1106 vmovups ymmword ptr[rsi], ymm0
498BC9 mov rcx, r9
FF15FCF92300 call [C:N(ulong):int]
C5FC1006 vmovups ymm0, ymmword ptr[rsi]
C5FC1107 vmovups ymmword ptr[rdi], ymm0
488BC7 mov rax, rdi
G_M000_IG03: ;; offset=002FH
C5F877 vzeroupper
4883C428 add rsp, 40
5E pop rsi
5F pop rdi
C3 ret
; Total bytes of code: 57
; Method C:N(ulong):int
G_M000_IG01: ;; offset=0000H
55 push rbp
56 push rsi
4883EC28 sub rsp, 40
488D6C2430 lea rbp, [rsp+30H]
488BF1 mov rsi, rcx
G_M000_IG02: ;; offset=000EH
833DEF9EE45F00 cmp dword ptr [(reloc 0x7ffcd12fd8a4)], 0
7517 jne SHORT G_M000_IG06
G_M000_IG03: ;; offset=0017H
488BCE mov rcx, rsi
48B880474922FE7F0000 mov rax, 0x7FFE22494780
G_M000_IG04: ;; offset=0024H
FFD0 call rax ; C:QueryPerformanceFrequency(ulong):int
90 nop
G_M000_IG05: ;; offset=0027H
4883C428 add rsp, 40
5E pop rsi
5D pop rbp
C3 ret
G_M000_IG06: ;; offset=002EH
E88D79AD5F call CORINFO_HELP_POLL_GC
EBE2 jmp SHORT G_M000_IG03
; Total bytes of code: 53

In M2 the upper halves of the YMM registers are therefore still dirty when the native function runs. If the P/Invoke being called uses any legacy-encoded SIMD instructions, it incurs a significant penalty (up to a 10x perf regression per SIMD instruction). Native code that uses SIMD instructions typically ends up with the "legacy encoding", since AVX is not part of the baseline instruction set.
Metadata
Metadata
Assignees
Labels
area-CodeGen-coreclr: CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI